fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "path"
22 "slices"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31var _ = log.Println
32
33// contentProvider is an abstraction to treat matches for names and
34// content with the same code.
35type contentProvider struct {
36 id *indexData
37 stats *Stats
38
39 // mutable
40 err error
41 idx uint32
42 _data []byte
43 _nl []uint32
44 _nlBuf []uint32
45 _sects []DocumentSection
46 _sectBuf []DocumentSection
47 fileSize uint32
48}
49
50// setDocument skips to the given document.
51func (p *contentProvider) setDocument(docID uint32) {
52 fileStart := p.id.boundaries[docID]
53
54 p.idx = docID
55 p.fileSize = p.id.boundaries[docID+1] - fileStart
56
57 p._nl = nil
58 p._sects = nil
59 p._data = nil
60}
61
62func (p *contentProvider) docSections() []DocumentSection {
63 if p._sects == nil {
64 var sz uint32
65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
66 p.stats.ContentBytesLoaded += int64(sz)
67 p._sectBuf = p._sects
68 }
69 return p._sects
70}
71
72func (p *contentProvider) newlines() newlines {
73 if p._nl == nil {
74 var sz uint32
75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
76 p._nlBuf = p._nl
77 p.stats.ContentBytesLoaded += int64(sz)
78 }
79 return newlines{locs: p._nl, fileSize: p.fileSize}
80}
81
82func (p *contentProvider) data(fileName bool) []byte {
83 if fileName {
84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
85 }
86
87 if p._data == nil {
88 p._data, p.err = p.id.readContents(p.idx)
89 p.stats.FilesLoaded++
90 p.stats.ContentBytesLoaded += int64(len(p._data))
91 }
92 return p._data
93}
94
95// Find offset in bytes (relative to corpus start) for an offset in
96// runes (relative to document start). If filename is set, the corpus
97// is the set of filenames, with the document being the name itself.
98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
99 if p.id.metaData.PlainASCII {
100 return r
101 }
102
103 sample := p.id.runeOffsets
104 runeEnds := p.id.fileEndRunes
105 fileStartByte := p.id.boundaries[p.idx]
106 if filename {
107 sample = p.id.fileNameRuneOffsets
108 runeEnds = p.id.fileNameEndRunes
109 fileStartByte = p.id.fileNameIndex[p.idx]
110 }
111
112 absR := r
113 if p.idx > 0 {
114 absR += runeEnds[p.idx-1]
115 }
116
117 byteOff, left := sample.lookup(absR)
118
119 var data []byte
120
121 if filename {
122 data = p.id.fileNameContent[byteOff:]
123 } else {
124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125 if p.err != nil {
126 return 0
127 }
128 }
129 for left > 0 {
130 _, sz := utf8.DecodeRune(data)
131 byteOff += uint32(sz)
132 data = data[sz:]
133 left--
134 }
135
136 byteOff -= fileStartByte
137 return byteOff
138}
139
140// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
141// It only ever returns content XOR filename matches, not both. If there are any
142// content matches, these are always returned, and we omit filename matches.
143//
144// Performance invariant: ms is sorted and non-overlapping.
145//
146// Note: the byte slices may be backed by mmapped data, so before being
147// returned by the API it needs to be copied.
148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
149 var filenameMatches []*candidateMatch
150 contentMatches := make([]*candidateMatch, 0, len(ms))
151
152 for _, m := range ms {
153 if m.fileName {
154 filenameMatches = append(filenameMatches, m)
155 } else {
156 contentMatches = append(contentMatches, m)
157 }
158 }
159
160 // If there are any content matches, we only return these and skip filename matches.
161 if len(contentMatches) > 0 {
162 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
163 return p.fillContentMatches(contentMatches, numContextLines, language, debug)
164 }
165
166 // Otherwise, we return a single line containing the filematch match.
167 bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug)
168 res := LineMatch{
169 Line: p.id.fileName(p.idx),
170 FileName: true,
171 Score: bestMatch.score,
172 DebugScore: bestMatch.debugScore,
173 }
174
175 for _, m := range ms {
176 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
177 LineOffset: int(m.byteOffset),
178 MatchLength: int(m.byteMatchSz),
179 Offset: m.byteOffset,
180 })
181 }
182
183 return []LineMatch{res}
184
185}
186
187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
188// It only ever returns content XOR filename matches, not both. If there are any content
189// matches, these are always returned, and we omit filename matches.
190//
191// Performance invariant: ms is sorted and non-overlapping.
192//
193// Note: the byte slices may be backed by mmapped data, so before being
194// returned by the API it needs to be copied.
195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
196 var filenameMatches []*candidateMatch
197 contentMatches := make([]*candidateMatch, 0, len(ms))
198
199 for _, m := range ms {
200 if m.fileName {
201 filenameMatches = append(filenameMatches, m)
202 } else {
203 contentMatches = append(contentMatches, m)
204 }
205 }
206
207 // If there are any content matches, we only return these and skip filename matches.
208 if len(contentMatches) > 0 {
209 return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
210 }
211
212 // Otherwise, we return a single chunk representing the filename match.
213 bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug)
214 fileName := p.id.fileName(p.idx)
215 ranges := make([]Range, 0, len(ms))
216 for _, m := range ms {
217 ranges = append(ranges, Range{
218 Start: Location{
219 ByteOffset: m.byteOffset,
220 LineNumber: 1,
221 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
222 },
223 End: Location{
224 ByteOffset: m.byteOffset + m.byteMatchSz,
225 LineNumber: 1,
226 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
227 },
228 })
229 }
230
231 return []ChunkMatch{{
232 Content: fileName,
233 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
234 Ranges: ranges,
235 FileName: true,
236 Score: bestMatch.score,
237 DebugScore: bestMatch.debugScore,
238 }}
239}
240
241func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
242 var result []LineMatch
243 for len(ms) > 0 {
244 m := ms[0]
245 num := p.newlines().atOffset(m.byteOffset)
246 lineStart := int(p.newlines().lineStart(num))
247 nextLineStart := int(p.newlines().lineStart(num + 1))
248
249 var lineCands []*candidateMatch
250
251 endMatch := m.byteOffset + m.byteMatchSz
252
253 for len(ms) > 0 {
254 m := ms[0]
255 if int(m.byteOffset) < nextLineStart {
256 endMatch = m.byteOffset + m.byteMatchSz
257 lineCands = append(lineCands, m)
258 ms = ms[1:]
259 } else {
260 break
261 }
262 }
263
264 if len(lineCands) == 0 {
265 log.Panicf(
266 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
267 p.id.fileName(p.idx), p.id.metaData,
268 num, lineStart, nextLineStart,
269 m.byteOffset)
270 }
271
272 data := p.data(false)
273
274 // Due to merging matches, we may have a match that
275 // crosses a line boundary. Prevent confusion by
276 // taking lines until we pass the last match
277 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
278 next := bytes.IndexByte(data[nextLineStart:], '\n')
279 if next == -1 {
280 nextLineStart = len(data)
281 } else {
282 // TODO(hanwen): test that checks "+1" part here.
283 nextLineStart += next + 1
284 }
285 }
286
287 finalMatch := LineMatch{
288 LineStart: lineStart,
289 LineEnd: nextLineStart,
290 LineNumber: num,
291 }
292 finalMatch.Line = data[lineStart:nextLineStart]
293
294 if numContextLines > 0 {
295 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
296 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
297 }
298
299 bestMatch, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
300 finalMatch.Score = bestMatch.score
301 finalMatch.DebugScore = bestMatch.debugScore
302
303 for i, m := range lineCands {
304 fragment := LineFragmentMatch{
305 Offset: m.byteOffset,
306 LineOffset: int(m.byteOffset) - lineStart,
307 MatchLength: int(m.byteMatchSz),
308 }
309 if i < len(symbolInfo) && symbolInfo[i] != nil {
310 fragment.SymbolInfo = symbolInfo[i]
311 }
312
313 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
314 }
315 result = append(result, finalMatch)
316 }
317 return result
318}
319
320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
321 newlines := p.newlines()
322 data := p.data(false)
323
324 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
325 // However, it depends on ms being sorted by byteOffset and non-overlapping.
326 // This invariant is true at the time of writing, but we conservatively
327 // enforce this. Note: chunkCandidates preserves the sorting so safe to
328 // transform now.
329 columnHelper := columnHelper{data: data}
330 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
331 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
332 sort.Sort((sortByOffsetSlice)(ms))
333 }
334
335 chunks := chunkCandidates(ms, newlines, numContextLines)
336 chunkMatches := make([]ChunkMatch, 0, len(chunks))
337 for _, chunk := range chunks {
338 bestMatch, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
339
340 ranges := make([]Range, 0, len(chunk.candidates))
341 for _, cm := range chunk.candidates {
342 startOffset := cm.byteOffset
343 endOffset := cm.byteOffset + cm.byteMatchSz
344 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
345
346 ranges = append(ranges, Range{
347 Start: Location{
348 ByteOffset: startOffset,
349 LineNumber: uint32(startLine),
350 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
351 },
352 End: Location{
353 ByteOffset: endOffset,
354 LineNumber: uint32(endLine),
355 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
356 },
357 })
358 }
359
360 firstLineNumber := int(chunk.firstLine) - numContextLines
361 if firstLineNumber < 1 {
362 firstLineNumber = 1
363 }
364 firstLineStart := newlines.lineStart(firstLineNumber)
365
366 bestLineMatch := 0
367 if bestMatch.match != nil {
368 bestLineMatch = newlines.atOffset(bestMatch.match.byteOffset)
369 if debug {
370 bestMatch.debugScore = fmt.Sprintf("%s, (line: %d)", bestMatch.debugScore, bestLineMatch)
371 }
372 }
373
374 chunkMatches = append(chunkMatches, ChunkMatch{
375 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
376 ContentStart: Location{
377 ByteOffset: firstLineStart,
378 LineNumber: uint32(firstLineNumber),
379 Column: 1,
380 },
381 FileName: false,
382 Ranges: ranges,
383 SymbolInfo: symbolInfo,
384 BestLineMatch: uint32(bestLineMatch),
385 Score: bestMatch.score,
386 DebugScore: bestMatch.debugScore,
387 })
388 }
389 return chunkMatches
390}
391
392type candidateChunk struct {
393 candidates []*candidateMatch
394 firstLine uint32 // 1-based, inclusive
395 lastLine uint32 // 1-based, inclusive
396 minOffset uint32 // 0-based, inclusive
397 maxOffset uint32 // 0-based, exclusive
398}
399
400// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
401// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
402// it to overlap with an adjacent chunk.
403//
404// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
405// output invariants: if you flatten candidates the input invariant is retained.
406func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
407 var chunks []candidateChunk
408 for _, m := range ms {
409 startOffset := m.byteOffset
410 endOffset := m.byteOffset + m.byteMatchSz
411 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
412
413 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
414 // If a new chunk created with the current candidateMatch would
415 // overlap with the previous chunk, instead add the candidateMatch
416 // to the last chunk and extend end of the last chunk.
417 last := &chunks[len(chunks)-1]
418 last.candidates = append(last.candidates, m)
419 if last.maxOffset < endOffset {
420 last.lastLine = uint32(lastLine)
421 last.maxOffset = uint32(endOffset)
422 }
423 } else {
424 chunks = append(chunks, candidateChunk{
425 firstLine: uint32(firstLine),
426 lastLine: uint32(lastLine),
427 minOffset: startOffset,
428 maxOffset: endOffset,
429 candidates: []*candidateMatch{m},
430 })
431 }
432 }
433 return chunks
434}
435
436// columnHelper is a helper struct which caches the number of runes last
437// counted. If we naively use utf8.RuneCount for each match on a line, this
438// leads to an O(nm) algorithm where m is the number of matches and n is the
439// length of the line. Aassuming we our candidates are increasing in offset
440// makes this operation O(n) instead.
441type columnHelper struct {
442 data []byte
443
444 // 0 values for all these are valid values
445 lastLineOffset int
446 lastOffset uint32
447 lastRuneCount uint32
448}
449
450// get returns the line column for offset. offset is the byte offset of the
451// rune in data. lineOffset is the byte offset inside of data for the line
452// containing offset.
453func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
454 var runeCount uint32
455
456 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
457 // Can count from last calculation
458 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
459 } else {
460 // Need to count from the beginning of line
461 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
462 }
463
464 c.lastLineOffset = lineOffset
465 c.lastOffset = offset
466 c.lastRuneCount = runeCount
467
468 return runeCount + 1
469}
470
471type newlines struct {
472 // locs is the sorted set of byte offsets of the newlines in the file
473 locs []uint32
474
475 // fileSize is just the number of bytes in the file. It is stored
476 // on this struct so we can safely know the length of the last line
477 // in the file since not all files end in a newline.
478 fileSize uint32
479}
480
481// atOffset returns the line containing the offset. If the offset lands on
482// the newline ending line M, we return M.
483func (nls newlines) atOffset(offset uint32) (lineNumber int) {
484 idx := sort.Search(len(nls.locs), func(n int) bool {
485 return nls.locs[n] >= offset
486 })
487 return idx + 1
488}
489
490// lineStart returns the byte offset of the beginning of the given line.
491// lineNumber is 1-based. If lineNumber is out of range of the lines in the
492// file, the return value will be clamped to [0,fileSize].
493func (nls newlines) lineStart(lineNumber int) uint32 {
494 // nls.locs[0] + 1 is the start of the 2nd line of data.
495 startIdx := lineNumber - 2
496
497 if startIdx < 0 {
498 return 0
499 } else if startIdx >= len(nls.locs) {
500 return nls.fileSize
501 } else {
502 return nls.locs[startIdx] + 1
503 }
504}
505
506// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
507// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
508// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
509func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
510 startLine = nls.atOffset(startOffset)
511 endLine = nls.atOffset(
512 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
513 )
514 return startLine, endLine
515}
516
517// getLines returns a slice of data containing the lines [low, high).
518// low is 1-based and inclusive. high is 1-based and exclusive.
519func (nls newlines) getLines(data []byte, low, high int) []byte {
520 if low >= high {
521 return nil
522 }
523
524 return data[nls.lineStart(low):nls.lineStart(high)]
525}
526
527const (
528 // Query-dependent scoring signals. All of these together are bounded at ~9000
529 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
530 scorePartialWordMatch = 50.0
531 scoreWordMatch = 500.0
532 scoreBase = 7000.0
533 scorePartialBase = 4000.0
534 scoreSymbol = 7000.0
535 scorePartialSymbol = 4000.0
536 scoreKindMatch = 100.0
537 scoreFactorAtomMatch = 400.0
538
539 // File-only scoring signals. For now these are also bounded ~9000 to give them
540 // equal weight with the query-dependent signals.
541 scoreFileRankFactor = 9000.0
542
543 // Used for ordering line and chunk matches within a file.
544 scoreLineOrderFactor = 1.0
545
546 // Used for tiebreakers. The scores are not combined with the main score, but
547 // are used to break ties between matches with the same score. The factors are
548 // chosen to separate the tiebreakers from the main score and from each other.
549 // If you make changes here, make sure to update indexData.scoreFile too.
550 scoreRepoRankFactor = 100.0
551 scoreFileOrderFactor = 10.0
552)
553
554// findMaxOverlappingSection returns the index of the section in secs that
555// overlaps the most with the area defined by off and sz, relative to the size
556// of the section. If no section overlaps, it returns 0, false. If multiple
557// sections overlap the same amount, the first one is returned.
558//
559// The implementation assumes that sections do not overlap and are sorted by
560// DocumentSection.Start.
561func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
562 start := off
563 end := off + sz
564
565 // Find the first section that might overlap
566 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
567
568 if j == len(secs) || secs[j].Start >= end {
569 // No overlap.
570 return 0, false
571 }
572
573 relOverlap := func(j int) float64 {
574 secSize := secs[j].End - secs[j].Start
575 if secSize == 0 {
576 return 0
577 }
578 // This cannot overflow because we make sure there is overlap before calling relOverlap
579 overlap := min(secs[j].End, end) - max(secs[j].Start, start)
580 return float64(overlap) / float64(secSize)
581 }
582
583 ol1 := relOverlap(j)
584 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
585 return uint32(j), ol1 > 0
586 }
587
588 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
589 // if the second section overlaps more than the first one, because a third
590 // section can only overlap if the overlap with the second section is complete.
591 ol2 := relOverlap(j + 1)
592 if ol2 > ol1 {
593 return uint32(j + 1), ol2 > 0
594 }
595
596 return uint32(j), ol1 > 0
597}
598
599func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
600 if cm.fileName {
601 return false
602 }
603
604 // Check if this candidate came from a symbol matchTree
605 if cm.symbol {
606 return true
607 }
608
609 // Check if it overlaps with a symbol.
610 secs := p.docSections()
611 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
612 return ok
613}
614
615func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
616 if cm.fileName {
617 return DocumentSection{}, nil, false
618 }
619
620 secs := p.docSections()
621
622 secIdx, ok := cm.symbolIdx, cm.symbol
623 if !ok {
624 // Not from a symbol matchTree. Let's see if it overlaps with a symbol.
625 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
626 }
627 if !ok {
628 return DocumentSection{}, nil, false
629 }
630
631 sec := secs[secIdx]
632
633 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
634 // since some callsites do not need it stored, and that incurs an extra
635 // copy.
636 //
637 // 2024-01-08 we are refactoring this and the code path indicates this can
638 // fail, so callers need to handle nil symbol. However, it would be
639 // surprising that we have a matching section but not symbol data.
640 start := p.id.fileEndSymbol[p.idx]
641 si := p.id.symbols.data(start + secIdx)
642
643 return sec, si, true
644}
645
646// calculateTermFrequency computes the term frequency for the file match.
647// Notes:
648// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
649// * Symbol matches also count more than content matches, to reward matches on symbol definitions.
650func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
651 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
652 // sensitivity and treat filenames and symbols the same as content.
653 termFreqs := map[string]int{}
654 for _, m := range cands {
655 term := string(m.substrLowered)
656 if m.fileName || p.matchesSymbol(m) {
657 termFreqs[term] += 5
658 } else {
659 termFreqs[term]++
660 }
661 }
662
663 for term := range termFreqs {
664 df[term] += 1
665 }
666 return termFreqs
667}
668
669// scoredMatch holds the score information for a candidate match.
670type scoredMatch struct {
671 score float64
672 debugScore string
673 match *candidateMatch
674}
675
676// candidateMatchScore scores all candidate matches and returns the best-scoring match plus its score information.
677// Invariant: there should be at least one input candidate, len(ms) > 0.
678func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (scoredMatch, []*Symbol) {
679 score := 0.0
680 what := ""
681
682 addScore := func(w string, s float64) {
683 if s != 0 && debug {
684 what += fmt.Sprintf("%s:%.2f, ", w, s)
685 }
686 score += s
687 }
688
689 filename := p.data(true)
690 var symbolInfo []*Symbol
691
692 var bestMatch scoredMatch
693 for i, m := range ms {
694 data := p.data(m.fileName)
695
696 endOffset := m.byteOffset + m.byteMatchSz
697 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
698 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
699
700 score = 0
701 what = ""
702
703 if startBoundary && endBoundary {
704 addScore("WordMatch", scoreWordMatch)
705 } else if startBoundary || endBoundary {
706 addScore("PartialWordMatch", scorePartialWordMatch)
707 }
708
709 if m.fileName {
710 sep := bytes.LastIndexByte(data, '/')
711 startMatch := int(m.byteOffset) == sep+1
712 endMatch := endOffset == uint32(len(data))
713 if startMatch && endMatch {
714 addScore("Base", scoreBase)
715 } else if startMatch || endMatch {
716 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
717 } else if sep < int(m.byteOffset) {
718 addScore("InnerBase", scorePartialBase)
719 }
720 } else if sec, si, ok := p.findSymbol(m); ok {
721 startMatch := sec.Start == m.byteOffset
722 endMatch := sec.End == endOffset
723 if startMatch && endMatch {
724 addScore("Symbol", scoreSymbol)
725 } else if startMatch || endMatch {
726 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
727 } else {
728 addScore("OverlapSymbol", scorePartialSymbol)
729 }
730
731 // Score based on symbol data
732 if si != nil {
733 symbolKind := ctags.ParseSymbolKind(si.Kind)
734 sym := sectionSlice(data, sec)
735
736 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
737
738 // This is from a symbol tree, so we need to store the symbol
739 // information.
740 if m.symbol {
741 if symbolInfo == nil {
742 symbolInfo = make([]*Symbol, len(ms))
743 }
744 // findSymbols does not hydrate in Sym. So we need to store it.
745 si.Sym = string(sym)
746 symbolInfo[i] = si
747 }
748 }
749 }
750
751 // scoreWeight != 1 means it affects score
752 if !epsilonEqualsOne(m.scoreWeight) {
753 score = score * m.scoreWeight
754 if debug {
755 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
756 }
757 }
758
759 if score > bestMatch.score {
760 bestMatch.score = score
761 bestMatch.debugScore = what
762 bestMatch.match = m
763 }
764 }
765
766 if debug {
767 bestMatch.debugScore = fmt.Sprintf("score:%.2f <- %s", bestMatch.score, strings.TrimSuffix(bestMatch.debugScore, ", "))
768 }
769
770 return bestMatch, symbolInfo
771}
772
773// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
774// End such that it won't be out of range.
775func sectionSlice(data []byte, sec DocumentSection) []byte {
776 l := uint32(len(data))
777 if sec.Start >= l {
778 return nil
779 }
780 if sec.End > l {
781 sec.End = l
782 }
783 return data[sec.Start:sec.End]
784}
785
786// scoreSymbolKind boosts a match based on the combination of language, symbol
787// and kind. The language string comes from go-enry, the symbol and kind from
788// ctags.
789func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
790 var factor float64
791
792 // Generic ranking which will be overriden by language specific ranking
793 switch kind {
794 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
795 factor = 8
796 case ctags.Class:
797 factor = 10
798 case ctags.Struct:
799 factor = 9.5
800 case ctags.Enum:
801 factor = 9
802 case ctags.Interface:
803 factor = 8
804 case ctags.Function, ctags.Method:
805 factor = 7
806 case ctags.Field:
807 factor = 5.5
808 case ctags.Constant:
809 factor = 5
810 case ctags.Variable:
811 factor = 4
812 default:
813 // For all other kinds, assign a low score by default.
814 factor = 1
815 }
816
817 switch language {
818 case "Java", "java":
819 switch kind {
820 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
821 // to "classes" instead of "c". We have to cover both cases to support existing
822 // indexes.
823 case ctags.Class:
824 factor = 10
825 case ctags.Enum:
826 factor = 9
827 case ctags.Interface:
828 factor = 8
829 case ctags.Method:
830 factor = 7
831 case ctags.Field:
832 factor = 6
833 case ctags.EnumConstant:
834 factor = 5
835 }
836 case "Kotlin", "kotlin":
837 switch kind {
838 case ctags.Class:
839 factor = 10
840 case ctags.Interface:
841 factor = 9
842 case ctags.Method:
843 factor = 8
844 case ctags.TypeAlias:
845 factor = 7
846 case ctags.Constant:
847 factor = 6
848 case ctags.Variable:
849 factor = 5
850 }
851 case "Go", "go":
852 switch kind {
853 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
854 // for each case a description of the fields in ctags in the comment
855 case ctags.Type: // interface struct talias
856 factor = 9
857 case ctags.Interface: // interfaces
858 factor = 10
859 case ctags.Struct: // structs
860 factor = 9
861 case ctags.TypeAlias: // type aliases
862 factor = 9
863 case ctags.MethodSpec: // interface method specification
864 factor = 8.5
865 case ctags.Method, ctags.Function: // functions
866 factor = 8
867 case ctags.Field: // struct fields
868 factor = 7
869 case ctags.Constant: // constants
870 factor = 6
871 case ctags.Variable: // variables
872 factor = 5
873 }
874
875 // Boost exported go symbols. Same implementation as token.IsExported
876 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
877 factor += 0.5
878 }
879
880 if bytes.HasSuffix(filename, []byte("_test.go")) {
881 factor *= 0.8
882 }
883
884 // Could also rank on:
885 //
886 // - anonMember struct anonymous members
887 // - packageName name for specifying imported package
888 // - receiver receivers
889 // - package packages
890 // - type types
891 // - unknown unknown
892 case "C++", "c++":
893 switch kind {
894 case ctags.Class: // classes
895 factor = 10
896 case ctags.Enum: // enumeration names
897 factor = 9
898 case ctags.Function: // function definitions
899 factor = 8
900 case ctags.Struct: // structure names
901 factor = 7
902 case ctags.Union: // union names
903 factor = 6
904 case ctags.TypeAlias: // typedefs
905 factor = 5
906 case ctags.Field: // class, struct, and union members
907 factor = 4
908 case ctags.Variable: // varialbe definitions
909 factor = 3
910 }
911 // Could also rank on:
912 // NAME DESCRIPTION
913 // macro macro definitions
914 // enumerator enumerators (values inside an enumeration)
915 // header included header files
916 // namespace namespaces
917 // variable variable definitions
918 case "Scala", "scala":
919 switch kind {
920 case ctags.Class:
921 factor = 10
922 case ctags.Interface:
923 factor = 9
924 case ctags.Object:
925 factor = 8
926 case ctags.Function:
927 factor = 7
928 case ctags.Type:
929 factor = 6
930 case ctags.Variable:
931 factor = 5
932 case ctags.Package:
933 factor = 4
934 }
935 case "Python", "python":
936 switch kind {
937 case ctags.Class: // classes
938 factor = 10
939 case ctags.Function, ctags.Method: // function definitions
940 factor = 8
941 case ctags.Field: // class, struct, and union members
942 factor = 4
943 case ctags.Variable: // variable definitions
944 factor = 3
945 case ctags.Local: // local variables
946 factor = 2
947 }
948 // Could also rank on:
949 //
950 // - namespace name referring a module defined in other file
951 // - module modules
952 // - unknown name referring a class/variable/function/module defined in other module
953 // - parameter function parameters
954 case "Ruby", "ruby":
955 switch kind {
956 case ctags.Class:
957 factor = 10
958 case ctags.Method:
959 factor = 9
960 case ctags.MethodAlias:
961 factor = 8
962 case ctags.Module:
963 factor = 7
964 case ctags.SingletonMethod:
965 factor = 6
966 case ctags.Constant:
967 factor = 5
968 case ctags.Accessor:
969 factor = 4
970 case ctags.Library:
971 factor = 3
972 }
973 case "PHP", "php":
974 switch kind {
975 case ctags.Class:
976 factor = 10
977 case ctags.Interface:
978 factor = 9
979 case ctags.Function:
980 factor = 8
981 case ctags.Trait:
982 factor = 7
983 case ctags.Define:
984 factor = 6
985 case ctags.Namespace:
986 factor = 5
987 case ctags.MethodAlias:
988 factor = 4
989 case ctags.Variable:
990 factor = 3
991 case ctags.Local:
992 factor = 3
993 }
994 case "GraphQL", "graphql":
995 switch kind {
996 case ctags.Type:
997 factor = 10
998 }
999 case "Markdown", "markdown":
1000 // Headers are good signal in docs, but do not rank as highly as code.
1001 switch kind {
1002 case ctags.Chapter: // #
1003 factor = 4
1004 case ctags.Section: // ##
1005 factor = 3
1006 case ctags.Subsection: // ###
1007 factor = 2
1008 }
1009 }
1010
1011 return factor * scoreKindMatch
1012}
1013
1014type matchScoreSlice []LineMatch
1015
1016func (m matchScoreSlice) Len() int { return len(m) }
1017func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1018func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1019
1020type chunkMatchScoreSlice []ChunkMatch
1021
1022func (m chunkMatchScoreSlice) Len() int { return len(m) }
1023func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1024func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1025
1026type fileMatchesByScore []FileMatch
1027
1028func (m fileMatchesByScore) Len() int { return len(m) }
1029func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1030func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
1031
1032func sortMatchesByScore(ms []LineMatch) {
1033 sort.Sort(matchScoreSlice(ms))
1034}
1035
1036func sortChunkMatchesByScore(ms []ChunkMatch) {
1037 sort.Sort(chunkMatchScoreSlice(ms))
1038}
1039
1040// SortFiles sorts files matches in the order we want to present results to
1041// users. The order depends on the match score, which includes both
1042// query-dependent signals like word overlap, and file-only signals like the
1043// file ranks (if file ranks are enabled).
1044//
1045// We don't only use the scores, we will also boost some results to present
1046// files with novel extensions.
1047func SortFiles(ms []FileMatch) {
1048 sort.Sort(fileMatchesByScore(ms))
1049
1050 // Boost a file extension not in the top 3 to the third filematch.
1051 boostNovelExtension(ms, 2, 0.9)
1052}
1053
1054func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1055 if len(ms) <= boostOffset+1 {
1056 return
1057 }
1058
1059 top := ms[:boostOffset]
1060 candidates := ms[boostOffset:]
1061
1062 // Don't bother boosting something which is significantly different to the
1063 // result it replaces.
1064 minScoreForNovelty := candidates[0].Score * minScoreRatio
1065
1066 // We want to look for an ext that isn't in the top exts
1067 exts := make([]string, len(top))
1068 for i := range top {
1069 exts[i] = path.Ext(top[i].FileName)
1070 }
1071
1072 for i := range candidates {
1073 // Do not assume sorted due to boostNovelExtension being called on subsets
1074 if candidates[i].Score < minScoreForNovelty {
1075 continue
1076 }
1077
1078 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1079 continue
1080 }
1081
1082 // Found what we are looking for, now boost to front of candidates (which
1083 // is ms[boostOffset])
1084 for ; i > 0; i-- {
1085 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1086 }
1087 return
1088 }
1089}