fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "path"
22 "slices"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31var _ = log.Println
32
33// contentProvider is an abstraction to treat matches for names and
34// content with the same code.
35type contentProvider struct {
36 id *indexData
37 stats *Stats
38
39 // mutable
40 err error
41 idx uint32
42 _data []byte
43 _nl []uint32
44 _nlBuf []uint32
45 _sects []DocumentSection
46 _sectBuf []DocumentSection
47 fileSize uint32
48}
49
50// setDocument skips to the given document.
51func (p *contentProvider) setDocument(docID uint32) {
52 fileStart := p.id.boundaries[docID]
53
54 p.idx = docID
55 p.fileSize = p.id.boundaries[docID+1] - fileStart
56
57 p._nl = nil
58 p._sects = nil
59 p._data = nil
60}
61
62func (p *contentProvider) docSections() []DocumentSection {
63 if p._sects == nil {
64 var sz uint32
65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
66 p.stats.ContentBytesLoaded += int64(sz)
67 p._sectBuf = p._sects
68 }
69 return p._sects
70}
71
72func (p *contentProvider) newlines() newlines {
73 if p._nl == nil {
74 var sz uint32
75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
76 p._nlBuf = p._nl
77 p.stats.ContentBytesLoaded += int64(sz)
78 }
79 return newlines{locs: p._nl, fileSize: p.fileSize}
80}
81
82func (p *contentProvider) data(fileName bool) []byte {
83 if fileName {
84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
85 }
86
87 if p._data == nil {
88 p._data, p.err = p.id.readContents(p.idx)
89 p.stats.FilesLoaded++
90 p.stats.ContentBytesLoaded += int64(len(p._data))
91 }
92 return p._data
93}
94
95// Find offset in bytes (relative to corpus start) for an offset in
96// runes (relative to document start). If filename is set, the corpus
97// is the set of filenames, with the document being the name itself.
98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
99 if p.id.metaData.PlainASCII {
100 return r
101 }
102
103 sample := p.id.runeOffsets
104 runeEnds := p.id.fileEndRunes
105 fileStartByte := p.id.boundaries[p.idx]
106 if filename {
107 sample = p.id.fileNameRuneOffsets
108 runeEnds = p.id.fileNameEndRunes
109 fileStartByte = p.id.fileNameIndex[p.idx]
110 }
111
112 absR := r
113 if p.idx > 0 {
114 absR += runeEnds[p.idx-1]
115 }
116
117 byteOff, left := sample.lookup(absR)
118
119 var data []byte
120
121 if filename {
122 data = p.id.fileNameContent[byteOff:]
123 } else {
124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125 if p.err != nil {
126 return 0
127 }
128 }
129 for left > 0 {
130 _, sz := utf8.DecodeRune(data)
131 byteOff += uint32(sz)
132 data = data[sz:]
133 left--
134 }
135
136 byteOff -= fileStartByte
137 return byteOff
138}
139
140// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
141// It only ever returns content XOR filename matches, not both. If there are any
142// content matches, these are always returned, and we omit filename matches.
143//
144// Performance invariant: ms is sorted and non-overlapping.
145//
146// Note: the byte slices may be backed by mmapped data, so before being
147// returned by the API it needs to be copied.
148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
149 var filenameMatches []*candidateMatch
150 contentMatches := make([]*candidateMatch, 0, len(ms))
151
152 for _, m := range ms {
153 if m.fileName {
154 filenameMatches = append(filenameMatches, m)
155 } else {
156 contentMatches = append(contentMatches, m)
157 }
158 }
159
160 // If there are any content matches, we only return these and skip filename matches.
161 if len(contentMatches) > 0 {
162 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
163 return p.fillContentMatches(contentMatches, numContextLines, language, debug)
164 }
165
166 // Otherwise, we return a single line containing the filematch match.
167 score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
168 res := LineMatch{
169 Line: p.id.fileName(p.idx),
170 FileName: true,
171 Score: score,
172 DebugScore: debugScore,
173 }
174
175 for _, m := range ms {
176 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
177 LineOffset: int(m.byteOffset),
178 MatchLength: int(m.byteMatchSz),
179 Offset: m.byteOffset,
180 })
181 }
182
183 return []LineMatch{res}
184
185}
186
187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
188// It only ever returns content XOR filename matches, not both. If there are any content
189// matches, these are always returned, and we omit filename matches.
190//
191// Performance invariant: ms is sorted and non-overlapping.
192//
193// Note: the byte slices may be backed by mmapped data, so before being
194// returned by the API it needs to be copied.
195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
196 var filenameMatches []*candidateMatch
197 contentMatches := make([]*candidateMatch, 0, len(ms))
198
199 for _, m := range ms {
200 if m.fileName {
201 filenameMatches = append(filenameMatches, m)
202 } else {
203 contentMatches = append(contentMatches, m)
204 }
205 }
206
207 // If there are any content matches, we only return these and skip filename matches.
208 if len(contentMatches) > 0 {
209 return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
210 }
211
212 // Otherwise, we return a single chunk representing the filename match.
213 score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
214 fileName := p.id.fileName(p.idx)
215 ranges := make([]Range, 0, len(ms))
216 for _, m := range ms {
217 ranges = append(ranges, Range{
218 Start: Location{
219 ByteOffset: m.byteOffset,
220 LineNumber: 1,
221 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
222 },
223 End: Location{
224 ByteOffset: m.byteOffset + m.byteMatchSz,
225 LineNumber: 1,
226 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
227 },
228 })
229 }
230
231 return []ChunkMatch{{
232 Content: fileName,
233 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
234 Ranges: ranges,
235 FileName: true,
236
237 Score: score,
238 DebugScore: debugScore,
239 }}
240}
241
242func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
243 var result []LineMatch
244 for len(ms) > 0 {
245 m := ms[0]
246 num := p.newlines().atOffset(m.byteOffset)
247 lineStart := int(p.newlines().lineStart(num))
248 nextLineStart := int(p.newlines().lineStart(num + 1))
249
250 var lineCands []*candidateMatch
251
252 endMatch := m.byteOffset + m.byteMatchSz
253
254 for len(ms) > 0 {
255 m := ms[0]
256 if int(m.byteOffset) < nextLineStart {
257 endMatch = m.byteOffset + m.byteMatchSz
258 lineCands = append(lineCands, m)
259 ms = ms[1:]
260 } else {
261 break
262 }
263 }
264
265 if len(lineCands) == 0 {
266 log.Panicf(
267 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
268 p.id.fileName(p.idx), p.id.metaData,
269 num, lineStart, nextLineStart,
270 m.byteOffset)
271 }
272
273 data := p.data(false)
274
275 // Due to merging matches, we may have a match that
276 // crosses a line boundary. Prevent confusion by
277 // taking lines until we pass the last match
278 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
279 next := bytes.IndexByte(data[nextLineStart:], '\n')
280 if next == -1 {
281 nextLineStart = len(data)
282 } else {
283 // TODO(hanwen): test that checks "+1" part here.
284 nextLineStart += next + 1
285 }
286 }
287
288 finalMatch := LineMatch{
289 LineStart: lineStart,
290 LineEnd: nextLineStart,
291 LineNumber: num,
292 }
293 finalMatch.Line = data[lineStart:nextLineStart]
294
295 if numContextLines > 0 {
296 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
297 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
298 }
299
300 score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
301 finalMatch.Score = score
302 finalMatch.DebugScore = debugScore
303
304 for i, m := range lineCands {
305 fragment := LineFragmentMatch{
306 Offset: m.byteOffset,
307 LineOffset: int(m.byteOffset) - lineStart,
308 MatchLength: int(m.byteMatchSz),
309 }
310 if i < len(symbolInfo) && symbolInfo[i] != nil {
311 fragment.SymbolInfo = symbolInfo[i]
312 }
313
314 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
315 }
316 result = append(result, finalMatch)
317 }
318 return result
319}
320
321func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
322 newlines := p.newlines()
323 data := p.data(false)
324
325 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
326 // However, it depends on ms being sorted by byteOffset and non-overlapping.
327 // This invariant is true at the time of writing, but we conservatively
328 // enforce this. Note: chunkCandidates preserves the sorting so safe to
329 // transform now.
330 columnHelper := columnHelper{data: data}
331 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
332 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
333 sort.Sort((sortByOffsetSlice)(ms))
334 }
335
336 chunks := chunkCandidates(ms, newlines, numContextLines)
337 chunkMatches := make([]ChunkMatch, 0, len(chunks))
338 for _, chunk := range chunks {
339 score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
340
341 ranges := make([]Range, 0, len(chunk.candidates))
342 for _, cm := range chunk.candidates {
343 startOffset := cm.byteOffset
344 endOffset := cm.byteOffset + cm.byteMatchSz
345 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
346
347 ranges = append(ranges, Range{
348 Start: Location{
349 ByteOffset: startOffset,
350 LineNumber: uint32(startLine),
351 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
352 },
353 End: Location{
354 ByteOffset: endOffset,
355 LineNumber: uint32(endLine),
356 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
357 },
358 })
359 }
360
361 firstLineNumber := int(chunk.firstLine) - numContextLines
362 if firstLineNumber < 1 {
363 firstLineNumber = 1
364 }
365 firstLineStart := newlines.lineStart(firstLineNumber)
366
367 chunkMatches = append(chunkMatches, ChunkMatch{
368 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
369 ContentStart: Location{
370 ByteOffset: firstLineStart,
371 LineNumber: uint32(firstLineNumber),
372 Column: 1,
373 },
374 FileName: false,
375 Ranges: ranges,
376 SymbolInfo: symbolInfo,
377 Score: score,
378 DebugScore: debugScore,
379 })
380 }
381 return chunkMatches
382}
383
384type candidateChunk struct {
385 candidates []*candidateMatch
386 firstLine uint32 // 1-based, inclusive
387 lastLine uint32 // 1-based, inclusive
388 minOffset uint32 // 0-based, inclusive
389 maxOffset uint32 // 0-based, exclusive
390}
391
392// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
393// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
394// it to overlap with an adjacent chunk.
395//
396// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
397// output invariants: if you flatten candidates the input invariant is retained.
398func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
399 var chunks []candidateChunk
400 for _, m := range ms {
401 startOffset := m.byteOffset
402 endOffset := m.byteOffset + m.byteMatchSz
403 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
404
405 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
406 // If a new chunk created with the current candidateMatch would
407 // overlap with the previous chunk, instead add the candidateMatch
408 // to the last chunk and extend end of the last chunk.
409 last := &chunks[len(chunks)-1]
410 last.candidates = append(last.candidates, m)
411 if last.maxOffset < endOffset {
412 last.lastLine = uint32(lastLine)
413 last.maxOffset = uint32(endOffset)
414 }
415 } else {
416 chunks = append(chunks, candidateChunk{
417 firstLine: uint32(firstLine),
418 lastLine: uint32(lastLine),
419 minOffset: startOffset,
420 maxOffset: endOffset,
421 candidates: []*candidateMatch{m},
422 })
423 }
424 }
425 return chunks
426}
427
428// columnHelper is a helper struct which caches the number of runes last
429// counted. If we naively use utf8.RuneCount for each match on a line, this
430// leads to an O(nm) algorithm where m is the number of matches and n is the
431// length of the line. Aassuming we our candidates are increasing in offset
432// makes this operation O(n) instead.
433type columnHelper struct {
434 data []byte
435
436 // 0 values for all these are valid values
437 lastLineOffset int
438 lastOffset uint32
439 lastRuneCount uint32
440}
441
442// get returns the line column for offset. offset is the byte offset of the
443// rune in data. lineOffset is the byte offset inside of data for the line
444// containing offset.
445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
446 var runeCount uint32
447
448 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
449 // Can count from last calculation
450 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
451 } else {
452 // Need to count from the beginning of line
453 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
454 }
455
456 c.lastLineOffset = lineOffset
457 c.lastOffset = offset
458 c.lastRuneCount = runeCount
459
460 return runeCount + 1
461}
462
463type newlines struct {
464 // locs is the sorted set of byte offsets of the newlines in the file
465 locs []uint32
466
467 // fileSize is just the number of bytes in the file. It is stored
468 // on this struct so we can safely know the length of the last line
469 // in the file since not all files end in a newline.
470 fileSize uint32
471}
472
473// atOffset returns the line containing the offset. If the offset lands on
474// the newline ending line M, we return M.
475func (nls newlines) atOffset(offset uint32) (lineNumber int) {
476 idx := sort.Search(len(nls.locs), func(n int) bool {
477 return nls.locs[n] >= offset
478 })
479 return idx + 1
480}
481
482// lineStart returns the byte offset of the beginning of the given line.
483// lineNumber is 1-based. If lineNumber is out of range of the lines in the
484// file, the return value will be clamped to [0,fileSize].
485func (nls newlines) lineStart(lineNumber int) uint32 {
486 // nls.locs[0] + 1 is the start of the 2nd line of data.
487 startIdx := lineNumber - 2
488
489 if startIdx < 0 {
490 return 0
491 } else if startIdx >= len(nls.locs) {
492 return nls.fileSize
493 } else {
494 return nls.locs[startIdx] + 1
495 }
496}
497
498// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
502 startLine = nls.atOffset(startOffset)
503 endLine = nls.atOffset(
504 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
505 )
506 return startLine, endLine
507}
508
509// getLines returns a slice of data containing the lines [low, high).
510// low is 1-based and inclusive. high is 1-based and exclusive.
511func (nls newlines) getLines(data []byte, low, high int) []byte {
512 if low >= high {
513 return nil
514 }
515
516 return data[nls.lineStart(low):nls.lineStart(high)]
517}
518
519const (
520 // Query-dependent scoring signals. All of these together are bounded at ~9000
521 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
522 scorePartialWordMatch = 50.0
523 scoreWordMatch = 500.0
524 scoreBase = 7000.0
525 scorePartialBase = 4000.0
526 scoreSymbol = 7000.0
527 scorePartialSymbol = 4000.0
528 scoreKindMatch = 100.0
529 scoreFactorAtomMatch = 400.0
530
531 // File-only scoring signals. For now these are also bounded ~9000 to give them
532 // equal weight with the query-dependent signals.
533 scoreFileRankFactor = 9000.0
534
535 // Used for ordering line and chunk matches within a file.
536 scoreLineOrderFactor = 1.0
537
538 // Used for tiebreakers. The scores are not combined with the main score, but
539 // are used to break ties between matches with the same score. The factors are
540 // chosen to separate the tiebreakers from the main score and from each other.
541 // If you make changes here, make sure to update indexData.scoreFile too.
542 scoreRepoRankFactor = 100.0
543 scoreFileOrderFactor = 10.0
544)
545
546// findMaxOverlappingSection returns the index of the section in secs that
547// overlaps the most with the area defined by off and sz, relative to the size
548// of the section. If no section overlaps, it returns 0, false. If multiple
549// sections overlap the same amount, the first one is returned.
550//
551// The implementation assumes that sections do not overlap and are sorted by
552// DocumentSection.Start.
553func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
554 start := off
555 end := off + sz
556
557 // Find the first section that might overlap
558 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
559
560 if j == len(secs) || secs[j].Start >= end {
561 // No overlap.
562 return 0, false
563 }
564
565 relOverlap := func(j int) float64 {
566 secSize := secs[j].End - secs[j].Start
567 if secSize == 0 {
568 return 0
569 }
570 // This cannot overflow because we make sure there is overlap before calling relOverlap
571 overlap := min(secs[j].End, end) - max(secs[j].Start, start)
572 return float64(overlap) / float64(secSize)
573 }
574
575 ol1 := relOverlap(j)
576 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
577 return uint32(j), ol1 > 0
578 }
579
580 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
581 // if the second section overlaps more than the first one, because a third
582 // section can only overlap if the overlap with the second section is complete.
583 ol2 := relOverlap(j + 1)
584 if ol2 > ol1 {
585 return uint32(j + 1), ol2 > 0
586 }
587
588 return uint32(j), ol1 > 0
589}
590
591func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
592 if cm.fileName {
593 return false
594 }
595
596 // Check if this candidate came from a symbol matchTree
597 if cm.symbol {
598 return true
599 }
600
601 // Check if it overlaps with a symbol.
602 secs := p.docSections()
603 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
604 return ok
605}
606
607func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
608 if cm.fileName {
609 return DocumentSection{}, nil, false
610 }
611
612 secs := p.docSections()
613
614 secIdx, ok := cm.symbolIdx, cm.symbol
615 if !ok {
616 // Not from a symbol matchTree. Let's see if it overlaps with a symbol.
617 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
618 }
619 if !ok {
620 return DocumentSection{}, nil, false
621 }
622
623 sec := secs[secIdx]
624
625 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
626 // since some callsites do not need it stored, and that incurs an extra
627 // copy.
628 //
629 // 2024-01-08 we are refactoring this and the code path indicates this can
630 // fail, so callers need to handle nil symbol. However, it would be
631 // surprising that we have a matching section but not symbol data.
632 start := p.id.fileEndSymbol[p.idx]
633 si := p.id.symbols.data(start + secIdx)
634
635 return sec, si, true
636}
637
638// calculateTermFrequency computes the term frequency for the file match.
639// Notes:
640// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
641// * Symbol matches also count more than content matches, to reward matches on symbol definitions.
642func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
643 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
644 // sensitivity and treat filenames and symbols the same as content.
645 termFreqs := map[string]int{}
646 for _, m := range cands {
647 term := string(m.substrLowered)
648 if m.fileName || p.matchesSymbol(m) {
649 termFreqs[term] += 5
650 } else {
651 termFreqs[term]++
652 }
653 }
654
655 for term := range termFreqs {
656 df[term] += 1
657 }
658 return termFreqs
659}
660
661func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
662 type debugScore struct {
663 what string
664 score float64
665 }
666
667 score := &debugScore{}
668 maxScore := &debugScore{}
669
670 addScore := func(what string, s float64) {
671 if s != 0 && debug {
672 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
673 }
674 score.score += s
675 }
676
677 filename := p.data(true)
678 var symbolInfo []*Symbol
679
680 for i, m := range ms {
681 data := p.data(m.fileName)
682
683 endOffset := m.byteOffset + m.byteMatchSz
684 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
685 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
686
687 score.score = 0
688 score.what = ""
689
690 if startBoundary && endBoundary {
691 addScore("WordMatch", scoreWordMatch)
692 } else if startBoundary || endBoundary {
693 addScore("PartialWordMatch", scorePartialWordMatch)
694 }
695
696 if m.fileName {
697 sep := bytes.LastIndexByte(data, '/')
698 startMatch := int(m.byteOffset) == sep+1
699 endMatch := endOffset == uint32(len(data))
700 if startMatch && endMatch {
701 addScore("Base", scoreBase)
702 } else if startMatch || endMatch {
703 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
704 } else if sep < int(m.byteOffset) {
705 addScore("InnerBase", scorePartialBase)
706 }
707 } else if sec, si, ok := p.findSymbol(m); ok {
708 startMatch := sec.Start == m.byteOffset
709 endMatch := sec.End == endOffset
710 if startMatch && endMatch {
711 addScore("Symbol", scoreSymbol)
712 } else if startMatch || endMatch {
713 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
714 } else {
715 addScore("OverlapSymbol", scorePartialSymbol)
716 }
717
718 // Score based on symbol data
719 if si != nil {
720 symbolKind := ctags.ParseSymbolKind(si.Kind)
721 sym := sectionSlice(data, sec)
722
723 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
724
725 // This is from a symbol tree, so we need to store the symbol
726 // information.
727 if m.symbol {
728 if symbolInfo == nil {
729 symbolInfo = make([]*Symbol, len(ms))
730 }
731 // findSymbols does not hydrate in Sym. So we need to store it.
732 si.Sym = string(sym)
733 symbolInfo[i] = si
734 }
735 }
736 }
737
738 // scoreWeight != 1 means it affects score
739 if !epsilonEqualsOne(m.scoreWeight) {
740 score.score = score.score * m.scoreWeight
741 if debug {
742 score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
743 }
744 }
745
746 if score.score > maxScore.score {
747 maxScore.score = score.score
748 maxScore.what = score.what
749 }
750 }
751
752 if debug {
753 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
754 }
755
756 return maxScore.score, maxScore.what, symbolInfo
757}
758
759// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
760// End such that it won't be out of range.
761func sectionSlice(data []byte, sec DocumentSection) []byte {
762 l := uint32(len(data))
763 if sec.Start >= l {
764 return nil
765 }
766 if sec.End > l {
767 sec.End = l
768 }
769 return data[sec.Start:sec.End]
770}
771
772// scoreSymbolKind boosts a match based on the combination of language, symbol
773// and kind. The language string comes from go-enry, the symbol and kind from
774// ctags.
775func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
776 var factor float64
777
778 // Generic ranking which will be overriden by language specific ranking
779 switch kind {
780 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
781 factor = 8
782 case ctags.Class:
783 factor = 10
784 case ctags.Struct:
785 factor = 9.5
786 case ctags.Enum:
787 factor = 9
788 case ctags.Interface:
789 factor = 8
790 case ctags.Function, ctags.Method:
791 factor = 7
792 case ctags.Field:
793 factor = 5.5
794 case ctags.Constant:
795 factor = 5
796 case ctags.Variable:
797 factor = 4
798 default:
799 // For all other kinds, assign a low score by default.
800 factor = 1
801 }
802
803 switch language {
804 case "Java", "java":
805 switch kind {
806 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
807 // to "classes" instead of "c". We have to cover both cases to support existing
808 // indexes.
809 case ctags.Class:
810 factor = 10
811 case ctags.Enum:
812 factor = 9
813 case ctags.Interface:
814 factor = 8
815 case ctags.Method:
816 factor = 7
817 case ctags.Field:
818 factor = 6
819 case ctags.EnumConstant:
820 factor = 5
821 }
822 case "Kotlin", "kotlin":
823 switch kind {
824 case ctags.Class:
825 factor = 10
826 case ctags.Interface:
827 factor = 9
828 case ctags.Method:
829 factor = 8
830 case ctags.TypeAlias:
831 factor = 7
832 case ctags.Constant:
833 factor = 6
834 case ctags.Variable:
835 factor = 5
836 }
837 case "Go", "go":
838 switch kind {
839 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
840 // for each case a description of the fields in ctags in the comment
841 case ctags.Type: // interface struct talias
842 factor = 9
843 case ctags.Interface: // interfaces
844 factor = 10
845 case ctags.Struct: // structs
846 factor = 9
847 case ctags.TypeAlias: // type aliases
848 factor = 9
849 case ctags.MethodSpec: // interface method specification
850 factor = 8.5
851 case ctags.Method, ctags.Function: // functions
852 factor = 8
853 case ctags.Field: // struct fields
854 factor = 7
855 case ctags.Constant: // constants
856 factor = 6
857 case ctags.Variable: // variables
858 factor = 5
859 }
860
861 // Boost exported go symbols. Same implementation as token.IsExported
862 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
863 factor += 0.5
864 }
865
866 if bytes.HasSuffix(filename, []byte("_test.go")) {
867 factor *= 0.8
868 }
869
870 // Could also rank on:
871 //
872 // - anonMember struct anonymous members
873 // - packageName name for specifying imported package
874 // - receiver receivers
875 // - package packages
876 // - type types
877 // - unknown unknown
878 case "C++", "c++":
879 switch kind {
880 case ctags.Class: // classes
881 factor = 10
882 case ctags.Enum: // enumeration names
883 factor = 9
884 case ctags.Function: // function definitions
885 factor = 8
886 case ctags.Struct: // structure names
887 factor = 7
888 case ctags.Union: // union names
889 factor = 6
890 case ctags.TypeAlias: // typedefs
891 factor = 5
892 case ctags.Field: // class, struct, and union members
893 factor = 4
894 case ctags.Variable: // varialbe definitions
895 factor = 3
896 }
897 // Could also rank on:
898 // NAME DESCRIPTION
899 // macro macro definitions
900 // enumerator enumerators (values inside an enumeration)
901 // header included header files
902 // namespace namespaces
903 // variable variable definitions
904 case "Scala", "scala":
905 switch kind {
906 case ctags.Class:
907 factor = 10
908 case ctags.Interface:
909 factor = 9
910 case ctags.Object:
911 factor = 8
912 case ctags.Function:
913 factor = 7
914 case ctags.Type:
915 factor = 6
916 case ctags.Variable:
917 factor = 5
918 case ctags.Package:
919 factor = 4
920 }
921 case "Python", "python":
922 switch kind {
923 case ctags.Class: // classes
924 factor = 10
925 case ctags.Function, ctags.Method: // function definitions
926 factor = 8
927 case ctags.Field: // class, struct, and union members
928 factor = 4
929 case ctags.Variable: // variable definitions
930 factor = 3
931 case ctags.Local: // local variables
932 factor = 2
933 }
934 // Could also rank on:
935 //
936 // - namespace name referring a module defined in other file
937 // - module modules
938 // - unknown name referring a class/variable/function/module defined in other module
939 // - parameter function parameters
940 case "Ruby", "ruby":
941 switch kind {
942 case ctags.Class:
943 factor = 10
944 case ctags.Method:
945 factor = 9
946 case ctags.MethodAlias:
947 factor = 8
948 case ctags.Module:
949 factor = 7
950 case ctags.SingletonMethod:
951 factor = 6
952 case ctags.Constant:
953 factor = 5
954 case ctags.Accessor:
955 factor = 4
956 case ctags.Library:
957 factor = 3
958 }
959 case "PHP", "php":
960 switch kind {
961 case ctags.Class:
962 factor = 10
963 case ctags.Interface:
964 factor = 9
965 case ctags.Function:
966 factor = 8
967 case ctags.Trait:
968 factor = 7
969 case ctags.Define:
970 factor = 6
971 case ctags.Namespace:
972 factor = 5
973 case ctags.MethodAlias:
974 factor = 4
975 case ctags.Variable:
976 factor = 3
977 case ctags.Local:
978 factor = 3
979 }
980 case "GraphQL", "graphql":
981 switch kind {
982 case ctags.Type:
983 factor = 10
984 }
985 case "Markdown", "markdown":
986 // Headers are good signal in docs, but do not rank as highly as code.
987 switch kind {
988 case ctags.Chapter: // #
989 factor = 4
990 case ctags.Section: // ##
991 factor = 3
992 case ctags.Subsection: // ###
993 factor = 2
994 }
995 }
996
997 return factor * scoreKindMatch
998}
999
1000type matchScoreSlice []LineMatch
1001
1002func (m matchScoreSlice) Len() int { return len(m) }
1003func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1004func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1005
1006type chunkMatchScoreSlice []ChunkMatch
1007
1008func (m chunkMatchScoreSlice) Len() int { return len(m) }
1009func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1010func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1011
1012type fileMatchesByScore []FileMatch
1013
1014func (m fileMatchesByScore) Len() int { return len(m) }
1015func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
1016func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
1017
1018func sortMatchesByScore(ms []LineMatch) {
1019 sort.Sort(matchScoreSlice(ms))
1020}
1021
1022func sortChunkMatchesByScore(ms []ChunkMatch) {
1023 sort.Sort(chunkMatchScoreSlice(ms))
1024}
1025
1026// SortFiles sorts files matches in the order we want to present results to
1027// users. The order depends on the match score, which includes both
1028// query-dependent signals like word overlap, and file-only signals like the
1029// file ranks (if file ranks are enabled).
1030//
1031// We don't only use the scores, we will also boost some results to present
1032// files with novel extensions.
1033func SortFiles(ms []FileMatch) {
1034 sort.Sort(fileMatchesByScore(ms))
1035
1036 // Boost a file extension not in the top 3 to the third filematch.
1037 boostNovelExtension(ms, 2, 0.9)
1038}
1039
1040func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1041 if len(ms) <= boostOffset+1 {
1042 return
1043 }
1044
1045 top := ms[:boostOffset]
1046 candidates := ms[boostOffset:]
1047
1048 // Don't bother boosting something which is significantly different to the
1049 // result it replaces.
1050 minScoreForNovelty := candidates[0].Score * minScoreRatio
1051
1052 // We want to look for an ext that isn't in the top exts
1053 exts := make([]string, len(top))
1054 for i := range top {
1055 exts[i] = path.Ext(top[i].FileName)
1056 }
1057
1058 for i := range candidates {
1059 // Do not assume sorted due to boostNovelExtension being called on subsets
1060 if candidates[i].Score < minScoreForNovelty {
1061 continue
1062 }
1063
1064 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1065 continue
1066 }
1067
1068 // Found what we are looking for, now boost to front of candidates (which
1069 // is ms[boostOffset])
1070 for ; i > 0; i-- {
1071 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1072 }
1073 return
1074 }
1075}