fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "path" 22 "slices" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt/ctags" 29) 30 31var _ = log.Println 32 33// contentProvider is an abstraction to treat matches for names and 34// content with the same code. 35type contentProvider struct { 36 id *indexData 37 stats *Stats 38 39 // mutable 40 err error 41 idx uint32 42 _data []byte 43 _nl []uint32 44 _nlBuf []uint32 45 _sects []DocumentSection 46 _sectBuf []DocumentSection 47 fileSize uint32 48} 49 50// setDocument skips to the given document. 51func (p *contentProvider) setDocument(docID uint32) { 52 fileStart := p.id.boundaries[docID] 53 54 p.idx = docID 55 p.fileSize = p.id.boundaries[docID+1] - fileStart 56 57 p._nl = nil 58 p._sects = nil 59 p._data = nil 60} 61 62func (p *contentProvider) docSections() []DocumentSection { 63 if p._sects == nil { 64 var sz uint32 65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 66 p.stats.ContentBytesLoaded += int64(sz) 67 p._sectBuf = p._sects 68 } 69 return p._sects 70} 71 72func (p *contentProvider) newlines() newlines { 73 if p._nl == nil { 74 var sz uint32 75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 76 p._nlBuf = p._nl 77 p.stats.ContentBytesLoaded += int64(sz) 78 } 79 return newlines{locs: p._nl, fileSize: p.fileSize} 80} 81 82func (p *contentProvider) data(fileName bool) []byte { 83 if fileName { 84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 85 } 86 87 if p._data == nil { 88 p._data, p.err = p.id.readContents(p.idx) 89 p.stats.FilesLoaded++ 90 p.stats.ContentBytesLoaded += int64(len(p._data)) 91 } 92 return p._data 93} 94 95// Find offset in bytes (relative to corpus start) for an offset in 96// runes (relative to document start). If filename is set, the corpus 97// is the set of filenames, with the document being the name itself. 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 99 if p.id.metaData.PlainASCII { 100 return r 101 } 102 103 sample := p.id.runeOffsets 104 runeEnds := p.id.fileEndRunes 105 fileStartByte := p.id.boundaries[p.idx] 106 if filename { 107 sample = p.id.fileNameRuneOffsets 108 runeEnds = p.id.fileNameEndRunes 109 fileStartByte = p.id.fileNameIndex[p.idx] 110 } 111 112 absR := r 113 if p.idx > 0 { 114 absR += runeEnds[p.idx-1] 115 } 116 117 byteOff, left := sample.lookup(absR) 118 119 var data []byte 120 121 if filename { 122 data = p.id.fileNameContent[byteOff:] 123 } else { 124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 125 if p.err != nil { 126 return 0 127 } 128 } 129 for left > 0 { 130 _, sz := utf8.DecodeRune(data) 131 byteOff += uint32(sz) 132 data = data[sz:] 133 left-- 134 } 135 136 byteOff -= fileStartByte 137 return byteOff 138} 139 140// fillMatches converts the internal candidateMatch slice into our API's LineMatch. 141// It only ever returns content XOR filename matches, not both. If there are any 142// content matches, these are always returned, and we omit filename matches. 143// 144// Performance invariant: ms is sorted and non-overlapping. 145// 146// Note: the byte slices may be backed by mmapped data, so before being 147// returned by the API it needs to be copied. 148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 149 var filenameMatches []*candidateMatch 150 contentMatches := make([]*candidateMatch, 0, len(ms)) 151 152 for _, m := range ms { 153 if m.fileName { 154 filenameMatches = append(filenameMatches, m) 155 } else { 156 contentMatches = append(contentMatches, m) 157 } 158 } 159 160 // If there are any content matches, we only return these and skip filename matches. 161 if len(contentMatches) > 0 { 162 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 163 return p.fillContentMatches(contentMatches, numContextLines, language, debug) 164 } 165 166 // Otherwise, we return a single line containing the filematch match. 167 score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 168 res := LineMatch{ 169 Line: p.id.fileName(p.idx), 170 FileName: true, 171 Score: score, 172 DebugScore: debugScore, 173 } 174 175 for _, m := range ms { 176 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 177 LineOffset: int(m.byteOffset), 178 MatchLength: int(m.byteMatchSz), 179 Offset: m.byteOffset, 180 }) 181 } 182 183 return []LineMatch{res} 184 185} 186 187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 188// It only ever returns content XOR filename matches, not both. If there are any content 189// matches, these are always returned, and we omit filename matches. 190// 191// Performance invariant: ms is sorted and non-overlapping. 192// 193// Note: the byte slices may be backed by mmapped data, so before being 194// returned by the API it needs to be copied. 195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 196 var filenameMatches []*candidateMatch 197 contentMatches := make([]*candidateMatch, 0, len(ms)) 198 199 for _, m := range ms { 200 if m.fileName { 201 filenameMatches = append(filenameMatches, m) 202 } else { 203 contentMatches = append(contentMatches, m) 204 } 205 } 206 207 // If there are any content matches, we only return these and skip filename matches. 208 if len(contentMatches) > 0 { 209 return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) 210 } 211 212 // Otherwise, we return a single chunk representing the filename match. 213 score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) 214 fileName := p.id.fileName(p.idx) 215 ranges := make([]Range, 0, len(ms)) 216 for _, m := range ms { 217 ranges = append(ranges, Range{ 218 Start: Location{ 219 ByteOffset: m.byteOffset, 220 LineNumber: 1, 221 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 222 }, 223 End: Location{ 224 ByteOffset: m.byteOffset + m.byteMatchSz, 225 LineNumber: 1, 226 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 227 }, 228 }) 229 } 230 231 return []ChunkMatch{{ 232 Content: fileName, 233 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 234 Ranges: ranges, 235 FileName: true, 236 237 Score: score, 238 DebugScore: debugScore, 239 }} 240} 241 242func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 243 var result []LineMatch 244 for len(ms) > 0 { 245 m := ms[0] 246 num := p.newlines().atOffset(m.byteOffset) 247 lineStart := int(p.newlines().lineStart(num)) 248 nextLineStart := int(p.newlines().lineStart(num + 1)) 249 250 var lineCands []*candidateMatch 251 252 endMatch := m.byteOffset + m.byteMatchSz 253 254 for len(ms) > 0 { 255 m := ms[0] 256 if int(m.byteOffset) < nextLineStart { 257 endMatch = m.byteOffset + m.byteMatchSz 258 lineCands = append(lineCands, m) 259 ms = ms[1:] 260 } else { 261 break 262 } 263 } 264 265 if len(lineCands) == 0 { 266 log.Panicf( 267 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 268 p.id.fileName(p.idx), p.id.metaData, 269 num, lineStart, nextLineStart, 270 m.byteOffset) 271 } 272 273 data := p.data(false) 274 275 // Due to merging matches, we may have a match that 276 // crosses a line boundary. Prevent confusion by 277 // taking lines until we pass the last match 278 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) { 279 next := bytes.IndexByte(data[nextLineStart:], '\n') 280 if next == -1 { 281 nextLineStart = len(data) 282 } else { 283 // TODO(hanwen): test that checks "+1" part here. 284 nextLineStart += next + 1 285 } 286 } 287 288 finalMatch := LineMatch{ 289 LineStart: lineStart, 290 LineEnd: nextLineStart, 291 LineNumber: num, 292 } 293 finalMatch.Line = data[lineStart:nextLineStart] 294 295 if numContextLines > 0 { 296 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 297 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 298 } 299 300 score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 301 finalMatch.Score = score 302 finalMatch.DebugScore = debugScore 303 304 for i, m := range lineCands { 305 fragment := LineFragmentMatch{ 306 Offset: m.byteOffset, 307 LineOffset: int(m.byteOffset) - lineStart, 308 MatchLength: int(m.byteMatchSz), 309 } 310 if i < len(symbolInfo) && symbolInfo[i] != nil { 311 fragment.SymbolInfo = symbolInfo[i] 312 } 313 314 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 315 } 316 result = append(result, finalMatch) 317 } 318 return result 319} 320 321func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 322 newlines := p.newlines() 323 data := p.data(false) 324 325 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 326 // However, it depends on ms being sorted by byteOffset and non-overlapping. 327 // This invariant is true at the time of writing, but we conservatively 328 // enforce this. Note: chunkCandidates preserves the sorting so safe to 329 // transform now. 330 columnHelper := columnHelper{data: data} 331 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 332 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 333 sort.Sort((sortByOffsetSlice)(ms)) 334 } 335 336 chunks := chunkCandidates(ms, newlines, numContextLines) 337 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 338 for _, chunk := range chunks { 339 score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 340 341 ranges := make([]Range, 0, len(chunk.candidates)) 342 for _, cm := range chunk.candidates { 343 startOffset := cm.byteOffset 344 endOffset := cm.byteOffset + cm.byteMatchSz 345 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 346 347 ranges = append(ranges, Range{ 348 Start: Location{ 349 ByteOffset: startOffset, 350 LineNumber: uint32(startLine), 351 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset), 352 }, 353 End: Location{ 354 ByteOffset: endOffset, 355 LineNumber: uint32(endLine), 356 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset), 357 }, 358 }) 359 } 360 361 firstLineNumber := int(chunk.firstLine) - numContextLines 362 if firstLineNumber < 1 { 363 firstLineNumber = 1 364 } 365 firstLineStart := newlines.lineStart(firstLineNumber) 366 367 chunkMatches = append(chunkMatches, ChunkMatch{ 368 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 369 ContentStart: Location{ 370 ByteOffset: firstLineStart, 371 LineNumber: uint32(firstLineNumber), 372 Column: 1, 373 }, 374 FileName: false, 375 Ranges: ranges, 376 SymbolInfo: symbolInfo, 377 Score: score, 378 DebugScore: debugScore, 379 }) 380 } 381 return chunkMatches 382} 383 384type candidateChunk struct { 385 candidates []*candidateMatch 386 firstLine uint32 // 1-based, inclusive 387 lastLine uint32 // 1-based, inclusive 388 minOffset uint32 // 0-based, inclusive 389 maxOffset uint32 // 0-based, exclusive 390} 391 392// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 393// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 394// it to overlap with an adjacent chunk. 395// 396// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 397// output invariants: if you flatten candidates the input invariant is retained. 398func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 399 var chunks []candidateChunk 400 for _, m := range ms { 401 startOffset := m.byteOffset 402 endOffset := m.byteOffset + m.byteMatchSz 403 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 404 405 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 406 // If a new chunk created with the current candidateMatch would 407 // overlap with the previous chunk, instead add the candidateMatch 408 // to the last chunk and extend end of the last chunk. 409 last := &chunks[len(chunks)-1] 410 last.candidates = append(last.candidates, m) 411 if last.maxOffset < endOffset { 412 last.lastLine = uint32(lastLine) 413 last.maxOffset = uint32(endOffset) 414 } 415 } else { 416 chunks = append(chunks, candidateChunk{ 417 firstLine: uint32(firstLine), 418 lastLine: uint32(lastLine), 419 minOffset: startOffset, 420 maxOffset: endOffset, 421 candidates: []*candidateMatch{m}, 422 }) 423 } 424 } 425 return chunks 426} 427 428// columnHelper is a helper struct which caches the number of runes last 429// counted. If we naively use utf8.RuneCount for each match on a line, this 430// leads to an O(nm) algorithm where m is the number of matches and n is the 431// length of the line. Aassuming we our candidates are increasing in offset 432// makes this operation O(n) instead. 433type columnHelper struct { 434 data []byte 435 436 // 0 values for all these are valid values 437 lastLineOffset int 438 lastOffset uint32 439 lastRuneCount uint32 440} 441 442// get returns the line column for offset. offset is the byte offset of the 443// rune in data. lineOffset is the byte offset inside of data for the line 444// containing offset. 445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 446 var runeCount uint32 447 448 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 449 // Can count from last calculation 450 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 451 } else { 452 // Need to count from the beginning of line 453 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 454 } 455 456 c.lastLineOffset = lineOffset 457 c.lastOffset = offset 458 c.lastRuneCount = runeCount 459 460 return runeCount + 1 461} 462 463type newlines struct { 464 // locs is the sorted set of byte offsets of the newlines in the file 465 locs []uint32 466 467 // fileSize is just the number of bytes in the file. It is stored 468 // on this struct so we can safely know the length of the last line 469 // in the file since not all files end in a newline. 470 fileSize uint32 471} 472 473// atOffset returns the line containing the offset. If the offset lands on 474// the newline ending line M, we return M. 475func (nls newlines) atOffset(offset uint32) (lineNumber int) { 476 idx := sort.Search(len(nls.locs), func(n int) bool { 477 return nls.locs[n] >= offset 478 }) 479 return idx + 1 480} 481 482// lineStart returns the byte offset of the beginning of the given line. 483// lineNumber is 1-based. If lineNumber is out of range of the lines in the 484// file, the return value will be clamped to [0,fileSize]. 485func (nls newlines) lineStart(lineNumber int) uint32 { 486 // nls.locs[0] + 1 is the start of the 2nd line of data. 487 startIdx := lineNumber - 2 488 489 if startIdx < 0 { 490 return 0 491 } else if startIdx >= len(nls.locs) { 492 return nls.fileSize 493 } else { 494 return nls.locs[startIdx] + 1 495 } 496} 497 498// offsetRangeToLineRange returns range of lines that fully contains the given byte range. 499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset). 500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine]. 501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) { 502 startLine = nls.atOffset(startOffset) 503 endLine = nls.atOffset( 504 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow 505 ) 506 return startLine, endLine 507} 508 509// getLines returns a slice of data containing the lines [low, high). 510// low is 1-based and inclusive. high is 1-based and exclusive. 511func (nls newlines) getLines(data []byte, low, high int) []byte { 512 if low >= high { 513 return nil 514 } 515 516 return data[nls.lineStart(low):nls.lineStart(high)] 517} 518 519const ( 520 // Query-dependent scoring signals. All of these together are bounded at ~9000 521 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 522 scorePartialWordMatch = 50.0 523 scoreWordMatch = 500.0 524 scoreBase = 7000.0 525 scorePartialBase = 4000.0 526 scoreSymbol = 7000.0 527 scorePartialSymbol = 4000.0 528 scoreKindMatch = 100.0 529 scoreFactorAtomMatch = 400.0 530 531 // File-only scoring signals. For now these are also bounded ~9000 to give them 532 // equal weight with the query-dependent signals. 533 scoreFileRankFactor = 9000.0 534 scoreFileOrderFactor = 10.0 535 scoreRepoRankFactor = 20.0 536 537 // Used for ordering line and chunk matches within a file. 538 scoreLineOrderFactor = 1.0 539) 540 541// findMaxOverlappingSection returns the index of the section in secs that 542// overlaps the most with the area defined by off and sz, relative to the size 543// of the section. If no section overlaps, it returns 0, false. If multiple 544// sections overlap the same amount, the first one is returned. 545// 546// The implementation assumes that sections do not overlap and are sorted by 547// DocumentSection.Start. 548func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 549 start := off 550 end := off + sz 551 552 // Find the first section that might overlap 553 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) 554 555 if j == len(secs) || secs[j].Start >= end { 556 // No overlap. 557 return 0, false 558 } 559 560 relOverlap := func(j int) float64 { 561 secSize := secs[j].End - secs[j].Start 562 if secSize == 0 { 563 return 0 564 } 565 // This cannot overflow because we make sure there is overlap before calling relOverlap 566 overlap := min(secs[j].End, end) - max(secs[j].Start, start) 567 return float64(overlap) / float64(secSize) 568 } 569 570 ol1 := relOverlap(j) 571 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { 572 return uint32(j), ol1 > 0 573 } 574 575 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check 576 // if the second section overlaps more than the first one, because a third 577 // section can only overlap if the overlap with the second section is complete. 578 ol2 := relOverlap(j + 1) 579 if ol2 > ol1 { 580 return uint32(j + 1), ol2 > 0 581 } 582 583 return uint32(j), ol1 > 0 584} 585 586func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 587 if cm.fileName { 588 return DocumentSection{}, nil, false 589 } 590 591 secs := p.docSections() 592 593 secIdx, ok := cm.symbolIdx, cm.symbol 594 if !ok { 595 // Not from a symbol matchTree. Let's see if it overlaps with a symbol. 596 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 597 } 598 if !ok { 599 return DocumentSection{}, nil, false 600 } 601 602 sec := secs[secIdx] 603 604 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 605 // since some callsites do not need it stored, and that incurs an extra 606 // copy. 607 // 608 // 2024-01-08 we are refactoring this and the code path indicates this can 609 // fail, so callers need to handle nil symbol. However, it would be 610 // surprising that we have a matching section but not symbol data. 611 start := p.id.fileEndSymbol[p.idx] 612 si := p.id.symbols.data(start + secIdx) 613 614 return sec, si, true 615} 616 617func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) { 618 type debugScore struct { 619 what string 620 score float64 621 } 622 623 score := &debugScore{} 624 maxScore := &debugScore{} 625 626 addScore := func(what string, s float64) { 627 if s != 0 && debug { 628 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 629 } 630 score.score += s 631 } 632 633 filename := p.data(true) 634 var symbolInfo []*Symbol 635 636 for i, m := range ms { 637 data := p.data(m.fileName) 638 639 endOffset := m.byteOffset + m.byteMatchSz 640 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 641 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 642 643 score.score = 0 644 score.what = "" 645 646 if startBoundary && endBoundary { 647 addScore("WordMatch", scoreWordMatch) 648 } else if startBoundary || endBoundary { 649 addScore("PartialWordMatch", scorePartialWordMatch) 650 } 651 652 if m.fileName { 653 sep := bytes.LastIndexByte(data, '/') 654 startMatch := int(m.byteOffset) == sep+1 655 endMatch := endOffset == uint32(len(data)) 656 if startMatch && endMatch { 657 addScore("Base", scoreBase) 658 } else if startMatch || endMatch { 659 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 660 } else if sep < int(m.byteOffset) { 661 addScore("InnerBase", scorePartialBase) 662 } 663 } else if sec, si, ok := p.findSymbol(m); ok { 664 startMatch := sec.Start == m.byteOffset 665 endMatch := sec.End == endOffset 666 if startMatch && endMatch { 667 addScore("Symbol", scoreSymbol) 668 } else if startMatch || endMatch { 669 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 670 } else { 671 addScore("OverlapSymbol", scorePartialSymbol) 672 } 673 674 // Score based on symbol data 675 if si != nil { 676 symbolKind := ctags.ParseSymbolKind(si.Kind) 677 sym := sectionSlice(data, sec) 678 679 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 680 681 // This is from a symbol tree, so we need to store the symbol 682 // information. 683 if m.symbol { 684 if symbolInfo == nil { 685 symbolInfo = make([]*Symbol, len(ms)) 686 } 687 // findSymbols does not hydrate in Sym. So we need to store it. 688 si.Sym = string(sym) 689 symbolInfo[i] = si 690 } 691 } 692 } 693 694 // scoreWeight != 1 means it affects score 695 if !epsilonEqualsOne(m.scoreWeight) { 696 score.score = score.score * m.scoreWeight 697 if debug { 698 score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 699 } 700 } 701 702 if score.score > maxScore.score { 703 maxScore.score = score.score 704 maxScore.what = score.what 705 } 706 } 707 708 if debug { 709 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 710 } 711 712 return maxScore.score, maxScore.what, symbolInfo 713} 714 715// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 716// End such that it won't be out of range. 717func sectionSlice(data []byte, sec DocumentSection) []byte { 718 l := uint32(len(data)) 719 if sec.Start >= l { 720 return nil 721 } 722 if sec.End > l { 723 sec.End = l 724 } 725 return data[sec.Start:sec.End] 726} 727 728// scoreSymbolKind boosts a match based on the combination of language, symbol 729// and kind. The language string comes from go-enry, the symbol and kind from 730// ctags. 731func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 732 var factor float64 733 734 // Generic ranking which will be overriden by language specific ranking 735 switch kind { 736 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 737 factor = 8 738 case ctags.Class: 739 factor = 10 740 case ctags.Struct: 741 factor = 9.5 742 case ctags.Enum: 743 factor = 9 744 case ctags.Interface: 745 factor = 8 746 case ctags.Function, ctags.Method: 747 factor = 7 748 case ctags.Field: 749 factor = 5.5 750 case ctags.Constant: 751 factor = 5 752 case ctags.Variable: 753 factor = 4 754 default: 755 // For all other kinds, assign a low score by default. 756 factor = 1 757 } 758 759 switch language { 760 case "Java", "java": 761 switch kind { 762 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 763 // to "classes" instead of "c". We have to cover both cases to support existing 764 // indexes. 765 case ctags.Class: 766 factor = 10 767 case ctags.Enum: 768 factor = 9 769 case ctags.Interface: 770 factor = 8 771 case ctags.Method: 772 factor = 7 773 case ctags.Field: 774 factor = 6 775 case ctags.EnumConstant: 776 factor = 5 777 } 778 case "Kotlin", "kotlin": 779 switch kind { 780 case ctags.Class: 781 factor = 10 782 case ctags.Interface: 783 factor = 9 784 case ctags.Method: 785 factor = 8 786 case ctags.TypeAlias: 787 factor = 7 788 case ctags.Constant: 789 factor = 6 790 case ctags.Variable: 791 factor = 5 792 } 793 case "Go", "go": 794 switch kind { 795 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 796 // for each case a description of the fields in ctags in the comment 797 case ctags.Type: // interface struct talias 798 factor = 9 799 case ctags.Interface: // interfaces 800 factor = 10 801 case ctags.Struct: // structs 802 factor = 9 803 case ctags.TypeAlias: // type aliases 804 factor = 9 805 case ctags.MethodSpec: // interface method specification 806 factor = 8.5 807 case ctags.Method, ctags.Function: // functions 808 factor = 8 809 case ctags.Field: // struct fields 810 factor = 7 811 case ctags.Constant: // constants 812 factor = 6 813 case ctags.Variable: // variables 814 factor = 5 815 } 816 817 // Boost exported go symbols. Same implementation as token.IsExported 818 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 819 factor += 0.5 820 } 821 822 if bytes.HasSuffix(filename, []byte("_test.go")) { 823 factor *= 0.8 824 } 825 826 // Could also rank on: 827 // 828 // - anonMember struct anonymous members 829 // - packageName name for specifying imported package 830 // - receiver receivers 831 // - package packages 832 // - type types 833 // - unknown unknown 834 case "C++", "c++": 835 switch kind { 836 case ctags.Class: // classes 837 factor = 10 838 case ctags.Enum: // enumeration names 839 factor = 9 840 case ctags.Function: // function definitions 841 factor = 8 842 case ctags.Struct: // structure names 843 factor = 7 844 case ctags.Union: // union names 845 factor = 6 846 case ctags.TypeAlias: // typedefs 847 factor = 5 848 case ctags.Field: // class, struct, and union members 849 factor = 4 850 case ctags.Variable: // varialbe definitions 851 factor = 3 852 } 853 // Could also rank on: 854 // NAME DESCRIPTION 855 // macro macro definitions 856 // enumerator enumerators (values inside an enumeration) 857 // header included header files 858 // namespace namespaces 859 // variable variable definitions 860 case "Scala", "scala": 861 switch kind { 862 case ctags.Class: 863 factor = 10 864 case ctags.Interface: 865 factor = 9 866 case ctags.Object: 867 factor = 8 868 case ctags.Function: 869 factor = 7 870 case ctags.Type: 871 factor = 6 872 case ctags.Variable: 873 factor = 5 874 case ctags.Package: 875 factor = 4 876 } 877 case "Python", "python": 878 switch kind { 879 case ctags.Class: // classes 880 factor = 10 881 case ctags.Function, ctags.Method: // function definitions 882 factor = 8 883 case ctags.Field: // class, struct, and union members 884 factor = 4 885 case ctags.Variable: // variable definitions 886 factor = 3 887 case ctags.Local: // local variables 888 factor = 2 889 } 890 // Could also rank on: 891 // 892 // - namespace name referring a module defined in other file 893 // - module modules 894 // - unknown name referring a class/variable/function/module defined in other module 895 // - parameter function parameters 896 case "Ruby", "ruby": 897 switch kind { 898 case ctags.Class: 899 factor = 10 900 case ctags.Method: 901 factor = 9 902 case ctags.MethodAlias: 903 factor = 8 904 case ctags.Module: 905 factor = 7 906 case ctags.SingletonMethod: 907 factor = 6 908 case ctags.Constant: 909 factor = 5 910 case ctags.Accessor: 911 factor = 4 912 case ctags.Library: 913 factor = 3 914 } 915 case "PHP", "php": 916 switch kind { 917 case ctags.Class: 918 factor = 10 919 case ctags.Interface: 920 factor = 9 921 case ctags.Function: 922 factor = 8 923 case ctags.Trait: 924 factor = 7 925 case ctags.Define: 926 factor = 6 927 case ctags.Namespace: 928 factor = 5 929 case ctags.MethodAlias: 930 factor = 4 931 case ctags.Variable: 932 factor = 3 933 case ctags.Local: 934 factor = 3 935 } 936 case "GraphQL", "graphql": 937 switch kind { 938 case ctags.Type: 939 factor = 10 940 } 941 case "Markdown", "markdown": 942 // Headers are good signal in docs, but do not rank as highly as code. 943 switch kind { 944 case ctags.Chapter: // # 945 factor = 4 946 case ctags.Section: // ## 947 factor = 3 948 case ctags.Subsection: // ### 949 factor = 2 950 } 951 } 952 953 return factor * scoreKindMatch 954} 955 956type matchScoreSlice []LineMatch 957 958func (m matchScoreSlice) Len() int { return len(m) } 959func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 960func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 961 962type chunkMatchScoreSlice []ChunkMatch 963 964func (m chunkMatchScoreSlice) Len() int { return len(m) } 965func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 966func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 967 968type fileMatchesByScore []FileMatch 969 970func (m fileMatchesByScore) Len() int { return len(m) } 971func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 972func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 973 974func sortMatchesByScore(ms []LineMatch) { 975 sort.Sort(matchScoreSlice(ms)) 976} 977 978func sortChunkMatchesByScore(ms []ChunkMatch) { 979 sort.Sort(chunkMatchScoreSlice(ms)) 980} 981 982// SortFiles sorts files matches in the order we want to present results to 983// users. The order depends on the match score, which includes both 984// query-dependent signals like word overlap, and file-only signals like the 985// file ranks (if file ranks are enabled). 986// 987// We don't only use the scores, we will also boost some results to present 988// files with novel extensions. 989func SortFiles(ms []FileMatch) { 990 sort.Sort(fileMatchesByScore(ms)) 991 992 // Boost a file extension not in the top 3 to the third filematch. 993 boostNovelExtension(ms, 2, 0.9) 994} 995 996func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 997 if len(ms) <= boostOffset+1 { 998 return 999 } 1000 1001 top := ms[:boostOffset] 1002 candidates := ms[boostOffset:] 1003 1004 // Don't bother boosting something which is significantly different to the 1005 // result it replaces. 1006 minScoreForNovelty := candidates[0].Score * minScoreRatio 1007 1008 // We want to look for an ext that isn't in the top exts 1009 exts := make([]string, len(top)) 1010 for i := range top { 1011 exts[i] = path.Ext(top[i].FileName) 1012 } 1013 1014 for i := range candidates { 1015 // Do not assume sorted due to boostNovelExtension being called on subsets 1016 if candidates[i].Score < minScoreForNovelty { 1017 continue 1018 } 1019 1020 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 1021 continue 1022 } 1023 1024 // Found what we are looking for, now boost to front of candidates (which 1025 // is ms[boostOffset]) 1026 for ; i > 0; i-- { 1027 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 1028 } 1029 return 1030 } 1031}