fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "path"
22 "slices"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt/ctags"
29)
30
31var _ = log.Println
32
33// contentProvider is an abstraction to treat matches for names and
34// content with the same code.
35type contentProvider struct {
36 id *indexData
37 stats *Stats
38
39 // mutable
40 err error
41 idx uint32
42 _data []byte
43 _nl []uint32
44 _nlBuf []uint32
45 _sects []DocumentSection
46 _sectBuf []DocumentSection
47 fileSize uint32
48}
49
50// setDocument skips to the given document.
51func (p *contentProvider) setDocument(docID uint32) {
52 fileStart := p.id.boundaries[docID]
53
54 p.idx = docID
55 p.fileSize = p.id.boundaries[docID+1] - fileStart
56
57 p._nl = nil
58 p._sects = nil
59 p._data = nil
60}
61
62func (p *contentProvider) docSections() []DocumentSection {
63 if p._sects == nil {
64 var sz uint32
65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
66 p.stats.ContentBytesLoaded += int64(sz)
67 p._sectBuf = p._sects
68 }
69 return p._sects
70}
71
72func (p *contentProvider) newlines() newlines {
73 if p._nl == nil {
74 var sz uint32
75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
76 p._nlBuf = p._nl
77 p.stats.ContentBytesLoaded += int64(sz)
78 }
79 return newlines{locs: p._nl, fileSize: p.fileSize}
80}
81
82func (p *contentProvider) data(fileName bool) []byte {
83 if fileName {
84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
85 }
86
87 if p._data == nil {
88 p._data, p.err = p.id.readContents(p.idx)
89 p.stats.FilesLoaded++
90 p.stats.ContentBytesLoaded += int64(len(p._data))
91 }
92 return p._data
93}
94
95// Find offset in bytes (relative to corpus start) for an offset in
96// runes (relative to document start). If filename is set, the corpus
97// is the set of filenames, with the document being the name itself.
98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
99 if p.id.metaData.PlainASCII {
100 return r
101 }
102
103 sample := p.id.runeOffsets
104 runeEnds := p.id.fileEndRunes
105 fileStartByte := p.id.boundaries[p.idx]
106 if filename {
107 sample = p.id.fileNameRuneOffsets
108 runeEnds = p.id.fileNameEndRunes
109 fileStartByte = p.id.fileNameIndex[p.idx]
110 }
111
112 absR := r
113 if p.idx > 0 {
114 absR += runeEnds[p.idx-1]
115 }
116
117 byteOff, left := sample.lookup(absR)
118
119 var data []byte
120
121 if filename {
122 data = p.id.fileNameContent[byteOff:]
123 } else {
124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125 if p.err != nil {
126 return 0
127 }
128 }
129 for left > 0 {
130 _, sz := utf8.DecodeRune(data)
131 byteOff += uint32(sz)
132 data = data[sz:]
133 left--
134 }
135
136 byteOff -= fileStartByte
137 return byteOff
138}
139
140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
141 var result []LineMatch
142 if ms[0].fileName {
143 score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
144
145 // There is only "line" in a filename.
146 res := LineMatch{
147 Line: p.id.fileName(p.idx),
148 FileName: true,
149
150 Score: score,
151 DebugScore: debugScore,
152 }
153
154 for _, m := range ms {
155 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
156 LineOffset: int(m.byteOffset),
157 MatchLength: int(m.byteMatchSz),
158 Offset: m.byteOffset,
159 })
160
161 result = []LineMatch{res}
162 }
163 } else {
164 ms = breakMatchesOnNewlines(ms, p.data(false))
165 result = p.fillContentMatches(ms, numContextLines, language, debug)
166 }
167
168 return result
169}
170
171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
172//
173// Performance invariant: ms is sorted and non-overlapping.
174//
175// Note: the byte slices may be backed by mmapped data, so before being
176// returned by the API it needs to be copied.
177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
178 var result []ChunkMatch
179 if ms[0].fileName {
180 // If the first match is a filename match, there will only be
181 // one match and the matched content will be the filename.
182
183 score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
184
185 fileName := p.id.fileName(p.idx)
186 ranges := make([]Range, 0, len(ms))
187 for _, m := range ms {
188 ranges = append(ranges, Range{
189 Start: Location{
190 ByteOffset: m.byteOffset,
191 LineNumber: 1,
192 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
193 },
194 End: Location{
195 ByteOffset: m.byteOffset + m.byteMatchSz,
196 LineNumber: 1,
197 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
198 },
199 })
200 }
201
202 result = []ChunkMatch{{
203 Content: fileName,
204 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
205 Ranges: ranges,
206 FileName: true,
207
208 Score: score,
209 DebugScore: debugScore,
210 }}
211 } else {
212 result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
213 }
214
215 return result
216}
217
218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
219 var result []LineMatch
220 for len(ms) > 0 {
221 m := ms[0]
222 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
223
224 var lineCands []*candidateMatch
225
226 endMatch := m.byteOffset + m.byteMatchSz
227
228 for len(ms) > 0 {
229 m := ms[0]
230 if int(m.byteOffset) <= lineEnd {
231 endMatch = m.byteOffset + m.byteMatchSz
232 lineCands = append(lineCands, m)
233 ms = ms[1:]
234 } else {
235 break
236 }
237 }
238
239 if len(lineCands) == 0 {
240 log.Panicf(
241 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
242 p.id.fileName(p.idx), p.id.metaData,
243 num, lineStart, lineEnd,
244 m.byteOffset)
245 }
246
247 data := p.data(false)
248
249 // Due to merging matches, we may have a match that
250 // crosses a line boundary. Prevent confusion by
251 // taking lines until we pass the last match
252 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
253 next := bytes.IndexByte(data[lineEnd+1:], '\n')
254 if next == -1 {
255 lineEnd = len(data)
256 } else {
257 // TODO(hanwen): test that checks "+1" part here.
258 lineEnd += next + 1
259 }
260 }
261
262 finalMatch := LineMatch{
263 LineStart: lineStart,
264 LineEnd: lineEnd,
265 LineNumber: num,
266 }
267 finalMatch.Line = data[lineStart:lineEnd]
268
269 if numContextLines > 0 {
270 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
272 }
273
274 score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
275 finalMatch.Score = score
276 finalMatch.DebugScore = debugScore
277
278 for i, m := range lineCands {
279 fragment := LineFragmentMatch{
280 Offset: m.byteOffset,
281 LineOffset: int(m.byteOffset) - lineStart,
282 MatchLength: int(m.byteMatchSz),
283 }
284 if i < len(symbolInfo) && symbolInfo[i] != nil {
285 fragment.SymbolInfo = symbolInfo[i]
286 }
287
288 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
289 }
290 result = append(result, finalMatch)
291 }
292 return result
293}
294
295func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
296 newlines := p.newlines()
297 data := p.data(false)
298
299 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
300 // However, it depends on ms being sorted by byteOffset and non-overlapping.
301 // This invariant is true at the time of writing, but we conservatively
302 // enforce this. Note: chunkCandidates preserves the sorting so safe to
303 // transform now.
304 columnHelper := columnHelper{data: data}
305 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
306 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
307 sort.Sort((sortByOffsetSlice)(ms))
308 }
309
310 chunks := chunkCandidates(ms, newlines, numContextLines)
311 chunkMatches := make([]ChunkMatch, 0, len(chunks))
312 for _, chunk := range chunks {
313 score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
314
315 ranges := make([]Range, 0, len(chunk.candidates))
316 for _, cm := range chunk.candidates {
317 startOffset := cm.byteOffset
318 endOffset := cm.byteOffset + cm.byteMatchSz
319 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
320 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
321
322 ranges = append(ranges, Range{
323 Start: Location{
324 ByteOffset: startOffset,
325 LineNumber: uint32(startLine),
326 Column: columnHelper.get(startLineOffset, startOffset),
327 },
328 End: Location{
329 ByteOffset: endOffset,
330 LineNumber: uint32(endLine),
331 Column: columnHelper.get(endLineOffset, endOffset),
332 },
333 })
334 }
335
336 firstLineNumber := int(chunk.firstLine) - numContextLines
337 if firstLineNumber < 1 {
338 firstLineNumber = 1
339 }
340 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
341
342 chunkMatches = append(chunkMatches, ChunkMatch{
343 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
344 ContentStart: Location{
345 ByteOffset: firstLineStart,
346 LineNumber: uint32(firstLineNumber),
347 Column: 1,
348 },
349 FileName: false,
350 Ranges: ranges,
351 SymbolInfo: symbolInfo,
352 Score: score,
353 DebugScore: debugScore,
354 })
355 }
356 return chunkMatches
357}
358
359type candidateChunk struct {
360 candidates []*candidateMatch
361 firstLine uint32 // 1-based, inclusive
362 lastLine uint32 // 1-based, inclusive
363 minOffset uint32 // 0-based, inclusive
364 maxOffset uint32 // 0-based, exclusive
365}
366
367// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
368// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
369// it to overlap with an adjacent chunk.
370//
371// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
372// output invariants: if you flatten candidates the input invariant is retained.
373func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
374 var chunks []candidateChunk
375 for _, m := range ms {
376 startOffset := m.byteOffset
377 endOffset := m.byteOffset + m.byteMatchSz
378 firstLine, _, _ := newlines.atOffset(startOffset)
379 lastLine, _, _ := newlines.atOffset(endOffset)
380
381 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
382 // If a new chunk created with the current candidateMatch would
383 // overlap with the previous chunk, instead add the candidateMatch
384 // to the last chunk and extend end of the last chunk.
385 last := &chunks[len(chunks)-1]
386 last.candidates = append(last.candidates, m)
387 if last.maxOffset < endOffset {
388 last.lastLine = uint32(lastLine)
389 last.maxOffset = uint32(endOffset)
390 }
391 } else {
392 chunks = append(chunks, candidateChunk{
393 firstLine: uint32(firstLine),
394 lastLine: uint32(lastLine),
395 minOffset: startOffset,
396 maxOffset: endOffset,
397 candidates: []*candidateMatch{m},
398 })
399 }
400 }
401 return chunks
402}
403
404// columnHelper is a helper struct which caches the number of runes last
405// counted. If we naively use utf8.RuneCount for each match on a line, this
406// leads to an O(nm) algorithm where m is the number of matches and n is the
407// length of the line. Aassuming we our candidates are increasing in offset
408// makes this operation O(n) instead.
409type columnHelper struct {
410 data []byte
411
412 // 0 values for all these are valid values
413 lastLineOffset int
414 lastOffset uint32
415 lastRuneCount uint32
416}
417
418// get returns the line column for offset. offset is the byte offset of the
419// rune in data. lineOffset is the byte offset inside of data for the line
420// containing offset.
421func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
422 var runeCount uint32
423
424 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
425 // Can count from last calculation
426 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
427 } else {
428 // Need to count from the beginning of line
429 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
430 }
431
432 c.lastLineOffset = lineOffset
433 c.lastOffset = offset
434 c.lastRuneCount = runeCount
435
436 return runeCount + 1
437}
438
439type newlines struct {
440 // locs is the sorted set of byte offsets of the newlines in the file
441 locs []uint32
442
443 // fileSize is just the number of bytes in the file. It is stored
444 // on this struct so we can safely know the length of the last line
445 // in the file since not all files end in a newline.
446 fileSize uint32
447}
448
449// atOffset returns the line containing the offset. If the offset lands on
450// the newline ending line M, we return M. The line is characterized
451// by its linenumber (base-1, byte index of line start, byte index of
452// line end). The line end is the index of a newline, or the filesize
453// (if matching the last line of the file.)
454func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
455 idx := sort.Search(len(nls.locs), func(n int) bool {
456 return nls.locs[n] >= offset
457 })
458
459 start, end := nls.lineBounds(idx + 1)
460 return idx + 1, int(start), int(end)
461}
462
463// lineBounds returns the byte offsets of the start and end of the 1-based
464// lineNumber. The end offset is exclusive and will not contain the line-ending
465// newline. If the line number is out of range of the lines in the file, start
466// and end will be clamped to [0,fileSize].
467func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
468 // nls.locs[0] + 1 is the start of the 2nd line of data.
469 startIdx := lineNumber - 2
470 endIdx := lineNumber - 1
471
472 if startIdx < 0 {
473 start = 0
474 } else if startIdx >= len(nls.locs) {
475 start = nls.fileSize
476 } else {
477 start = nls.locs[startIdx] + 1
478 }
479
480 if endIdx < 0 {
481 end = 0
482 } else if endIdx >= len(nls.locs) {
483 end = nls.fileSize
484 } else {
485 end = nls.locs[endIdx]
486 }
487
488 return start, end
489}
490
491// getLines returns a slice of data containing the lines [low, high).
492// low is 1-based and inclusive. high is 1-based and exclusive.
493func (nls newlines) getLines(data []byte, low, high int) []byte {
494 if low >= high {
495 return nil
496 }
497
498 lowStart, _ := nls.lineBounds(low)
499 _, highEnd := nls.lineBounds(high - 1)
500
501 // Drop any trailing newline. Editors do not treat a trailing newline as
502 // the start of a new line, so we should not either. lineBounds clamps to
503 // len(data) when an out-of-bounds line is requested.
504 //
505 // As an example, if we request lines 1-5 from a file with contents
506 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
507 // the three "lines" in the file, separated by newlines.
508 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
509 highEnd = highEnd - 1
510 lowStart = min(lowStart, highEnd)
511 }
512
513 return data[lowStart:highEnd]
514}
515
516const (
517 // Query-dependent scoring signals. All of these together are bounded at ~9000
518 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
519 scorePartialWordMatch = 50.0
520 scoreWordMatch = 500.0
521 scoreBase = 7000.0
522 scorePartialBase = 4000.0
523 scoreSymbol = 7000.0
524 scorePartialSymbol = 4000.0
525 scoreKindMatch = 100.0
526 scoreFactorAtomMatch = 400.0
527
528 // File-only scoring signals. For now these are also bounded ~9000 to give them
529 // equal weight with the query-dependent signals.
530 scoreFileRankFactor = 9000.0
531 scoreFileOrderFactor = 10.0
532 scoreRepoRankFactor = 20.0
533
534 // Used for ordering line and chunk matches within a file.
535 scoreLineOrderFactor = 1.0
536)
537
538// findSection checks whether a section defined by offset and size lies within
539// one of the sections in secs.
540func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
541 j := sort.Search(len(secs), func(i int) bool {
542 return secs[i].End >= off+sz
543 })
544
545 if j == len(secs) {
546 return 0, false
547 }
548
549 if secs[j].Start <= off && off+sz <= secs[j].End {
550 return uint32(j), true
551 }
552 return 0, false
553}
554
555// findMaxOverlappingSection returns the index of the section in secs that
556// overlaps the most with the area defined by off and sz, relative to the size
557// of the section. If no section overlaps, it returns 0, false. If multiple
558// sections overlap the same amount, the first one is returned.
559//
560// The implementation assumes that sections do not overlap and are sorted by
561// DocumentSection.Start.
562func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
563 start := off
564 end := off + sz
565
566 // Find the first section that might overlap
567 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
568
569 if j == len(secs) || secs[j].Start >= end {
570 // No overlap.
571 return 0, false
572 }
573
574 relOverlap := func(j int) float64 {
575 secSize := secs[j].End - secs[j].Start
576 if secSize == 0 {
577 return 0
578 }
579 // This cannot overflow because we make sure there is overlap before calling relOverlap
580 overlap := min(secs[j].End, end) - max(secs[j].Start, start)
581 return float64(overlap) / float64(secSize)
582 }
583
584 ol1 := relOverlap(j)
585 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
586 return uint32(j), ol1 > 0
587 }
588
589 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
590 // if the second section overlaps more than the first one, because a third
591 // section can only overlap if the overlap with the second section is complete.
592 ol2 := relOverlap(j + 1)
593 if ol2 > ol1 {
594 return uint32(j + 1), ol2 > 0
595 }
596
597 return uint32(j), ol1 > 0
598}
599
600func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
601 if cm.fileName {
602 return DocumentSection{}, nil, false
603 }
604
605 secs := p.docSections()
606
607 secIdx, ok := cm.symbolIdx, cm.symbol
608 if !ok {
609 // Not from a symbol matchTree. Let's see if it overlaps with a symbol.
610 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
611 }
612 if !ok {
613 return DocumentSection{}, nil, false
614 }
615
616 sec := secs[secIdx]
617
618 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
619 // since some callsites do not need it stored, and that incurs an extra
620 // copy.
621 //
622 // 2024-01-08 we are refactoring this and the code path indicates this can
623 // fail, so callers need to handle nil symbol. However, it would be
624 // surprising that we have a matching section but not symbol data.
625 start := p.id.fileEndSymbol[p.idx]
626 si := p.id.symbols.data(start + secIdx)
627
628 return sec, si, true
629}
630
631func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
632 type debugScore struct {
633 what string
634 score float64
635 }
636
637 score := &debugScore{}
638 maxScore := &debugScore{}
639
640 addScore := func(what string, s float64) {
641 if s != 0 && debug {
642 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
643 }
644 score.score += s
645 }
646
647 filename := p.data(true)
648 var symbolInfo []*Symbol
649
650 for i, m := range ms {
651 data := p.data(m.fileName)
652
653 endOffset := m.byteOffset + m.byteMatchSz
654 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
655 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
656
657 score.score = 0
658 score.what = ""
659
660 if startBoundary && endBoundary {
661 addScore("WordMatch", scoreWordMatch)
662 } else if startBoundary || endBoundary {
663 addScore("PartialWordMatch", scorePartialWordMatch)
664 }
665
666 if m.fileName {
667 sep := bytes.LastIndexByte(data, '/')
668 startMatch := int(m.byteOffset) == sep+1
669 endMatch := endOffset == uint32(len(data))
670 if startMatch && endMatch {
671 addScore("Base", scoreBase)
672 } else if startMatch || endMatch {
673 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
674 } else if sep < int(m.byteOffset) {
675 addScore("InnerBase", scorePartialBase)
676 }
677 } else if sec, si, ok := p.findSymbol(m); ok {
678 startMatch := sec.Start == m.byteOffset
679 endMatch := sec.End == endOffset
680 if startMatch && endMatch {
681 addScore("Symbol", scoreSymbol)
682 } else if startMatch || endMatch {
683 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
684 } else {
685 addScore("OverlapSymbol", scorePartialSymbol)
686 }
687
688 // Score based on symbol data
689 if si != nil {
690 symbolKind := ctags.ParseSymbolKind(si.Kind)
691 sym := sectionSlice(data, sec)
692
693 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
694
695 // This is from a symbol tree, so we need to store the symbol
696 // information.
697 if m.symbol {
698 if symbolInfo == nil {
699 symbolInfo = make([]*Symbol, len(ms))
700 }
701 // findSymbols does not hydrate in Sym. So we need to store it.
702 si.Sym = string(sym)
703 symbolInfo[i] = si
704 }
705 }
706 }
707
708 // scoreWeight != 1 means it affects score
709 if !epsilonEqualsOne(m.scoreWeight) {
710 score.score = score.score * m.scoreWeight
711 if debug {
712 score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
713 }
714 }
715
716 if score.score > maxScore.score {
717 maxScore.score = score.score
718 maxScore.what = score.what
719 }
720 }
721
722 if debug {
723 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
724 }
725
726 return maxScore.score, maxScore.what, symbolInfo
727}
728
729// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
730// End such that it won't be out of range.
731func sectionSlice(data []byte, sec DocumentSection) []byte {
732 l := uint32(len(data))
733 if sec.Start >= l {
734 return nil
735 }
736 if sec.End > l {
737 sec.End = l
738 }
739 return data[sec.Start:sec.End]
740}
741
742// scoreSymbolKind boosts a match based on the combination of language, symbol
743// and kind. The language string comes from go-enry, the symbol and kind from
744// ctags.
745func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
746 var factor float64
747
748 // Generic ranking which will be overriden by language specific ranking
749 switch kind {
750 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
751 factor = 8
752 case ctags.Class:
753 factor = 10
754 case ctags.Struct:
755 factor = 9.5
756 case ctags.Enum:
757 factor = 9
758 case ctags.Interface:
759 factor = 8
760 case ctags.Function, ctags.Method:
761 factor = 7
762 case ctags.Field:
763 factor = 5.5
764 case ctags.Constant:
765 factor = 5
766 case ctags.Variable:
767 factor = 4
768 default:
769 // For all other kinds, assign a low score by default.
770 factor = 1
771 }
772
773 switch language {
774 case "Java", "java":
775 switch kind {
776 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
777 // to "classes" instead of "c". We have to cover both cases to support existing
778 // indexes.
779 case ctags.Class:
780 factor = 10
781 case ctags.Enum:
782 factor = 9
783 case ctags.Interface:
784 factor = 8
785 case ctags.Method:
786 factor = 7
787 case ctags.Field:
788 factor = 6
789 case ctags.EnumConstant:
790 factor = 5
791 }
792 case "Kotlin", "kotlin":
793 switch kind {
794 case ctags.Class:
795 factor = 10
796 case ctags.Interface:
797 factor = 9
798 case ctags.Method:
799 factor = 8
800 case ctags.TypeAlias:
801 factor = 7
802 case ctags.Constant:
803 factor = 6
804 case ctags.Variable:
805 factor = 5
806 }
807 case "Go", "go":
808 switch kind {
809 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
810 // for each case a description of the fields in ctags in the comment
811 case ctags.Type: // interface struct talias
812 factor = 9
813 case ctags.Interface: // interfaces
814 factor = 10
815 case ctags.Struct: // structs
816 factor = 9
817 case ctags.TypeAlias: // type aliases
818 factor = 9
819 case ctags.MethodSpec: // interface method specification
820 factor = 8.5
821 case ctags.Method, ctags.Function: // functions
822 factor = 8
823 case ctags.Field: // struct fields
824 factor = 7
825 case ctags.Constant: // constants
826 factor = 6
827 case ctags.Variable: // variables
828 factor = 5
829 }
830
831 // Boost exported go symbols. Same implementation as token.IsExported
832 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
833 factor += 0.5
834 }
835
836 if bytes.HasSuffix(filename, []byte("_test.go")) {
837 factor *= 0.8
838 }
839
840 // Could also rank on:
841 //
842 // - anonMember struct anonymous members
843 // - packageName name for specifying imported package
844 // - receiver receivers
845 // - package packages
846 // - type types
847 // - unknown unknown
848 case "C++", "c++":
849 switch kind {
850 case ctags.Class: // classes
851 factor = 10
852 case ctags.Enum: // enumeration names
853 factor = 9
854 case ctags.Function: // function definitions
855 factor = 8
856 case ctags.Struct: // structure names
857 factor = 7
858 case ctags.Union: // union names
859 factor = 6
860 case ctags.TypeAlias: // typedefs
861 factor = 5
862 case ctags.Field: // class, struct, and union members
863 factor = 4
864 case ctags.Variable: // varialbe definitions
865 factor = 3
866 }
867 // Could also rank on:
868 // NAME DESCRIPTION
869 // macro macro definitions
870 // enumerator enumerators (values inside an enumeration)
871 // header included header files
872 // namespace namespaces
873 // variable variable definitions
874 case "Scala", "scala":
875 switch kind {
876 case ctags.Class:
877 factor = 10
878 case ctags.Interface:
879 factor = 9
880 case ctags.Object:
881 factor = 8
882 case ctags.Function:
883 factor = 7
884 case ctags.Type:
885 factor = 6
886 case ctags.Variable:
887 factor = 5
888 case ctags.Package:
889 factor = 4
890 }
891 case "Python", "python":
892 switch kind {
893 case ctags.Class: // classes
894 factor = 10
895 case ctags.Function, ctags.Method: // function definitions
896 factor = 8
897 case ctags.Field: // class, struct, and union members
898 factor = 4
899 case ctags.Variable: // variable definitions
900 factor = 3
901 case ctags.Local: // local variables
902 factor = 2
903 }
904 // Could also rank on:
905 //
906 // - namespace name referring a module defined in other file
907 // - module modules
908 // - unknown name referring a class/variable/function/module defined in other module
909 // - parameter function parameters
910 case "Ruby", "ruby":
911 switch kind {
912 case ctags.Class:
913 factor = 10
914 case ctags.Method:
915 factor = 9
916 case ctags.MethodAlias:
917 factor = 8
918 case ctags.Module:
919 factor = 7
920 case ctags.SingletonMethod:
921 factor = 6
922 case ctags.Constant:
923 factor = 5
924 case ctags.Accessor:
925 factor = 4
926 case ctags.Library:
927 factor = 3
928 }
929 case "PHP", "php":
930 switch kind {
931 case ctags.Class:
932 factor = 10
933 case ctags.Interface:
934 factor = 9
935 case ctags.Function:
936 factor = 8
937 case ctags.Trait:
938 factor = 7
939 case ctags.Define:
940 factor = 6
941 case ctags.Namespace:
942 factor = 5
943 case ctags.MethodAlias:
944 factor = 4
945 case ctags.Variable:
946 factor = 3
947 case ctags.Local:
948 factor = 3
949 }
950 case "GraphQL", "graphql":
951 switch kind {
952 case ctags.Type:
953 factor = 10
954 }
955 case "Markdown", "markdown":
956 // Headers are good signal in docs, but do not rank as highly as code.
957 switch kind {
958 case ctags.Chapter: // #
959 factor = 4
960 case ctags.Section: // ##
961 factor = 3
962 case ctags.Subsection: // ###
963 factor = 2
964 }
965 }
966
967 return factor * scoreKindMatch
968}
969
970type matchScoreSlice []LineMatch
971
972func (m matchScoreSlice) Len() int { return len(m) }
973func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
974func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
975
976type chunkMatchScoreSlice []ChunkMatch
977
978func (m chunkMatchScoreSlice) Len() int { return len(m) }
979func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
980func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
981
982type fileMatchesByScore []FileMatch
983
984func (m fileMatchesByScore) Len() int { return len(m) }
985func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
986func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
987
988func sortMatchesByScore(ms []LineMatch) {
989 sort.Sort(matchScoreSlice(ms))
990}
991
992func sortChunkMatchesByScore(ms []ChunkMatch) {
993 sort.Sort(chunkMatchScoreSlice(ms))
994}
995
996// SortFiles sorts files matches in the order we want to present results to
997// users. The order depends on the match score, which includes both
998// query-dependent signals like word overlap, and file-only signals like the
999// file ranks (if file ranks are enabled).
1000//
1001// We don't only use the scores, we will also boost some results to present
1002// files with novel extensions.
1003func SortFiles(ms []FileMatch) {
1004 sort.Sort(fileMatchesByScore(ms))
1005
1006 // Boost a file extension not in the top 3 to the third filematch.
1007 boostNovelExtension(ms, 2, 0.9)
1008}
1009
1010func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1011 if len(ms) <= boostOffset+1 {
1012 return
1013 }
1014
1015 top := ms[:boostOffset]
1016 candidates := ms[boostOffset:]
1017
1018 // Don't bother boosting something which is significantly different to the
1019 // result it replaces.
1020 minScoreForNovelty := candidates[0].Score * minScoreRatio
1021
1022 // We want to look for an ext that isn't in the top exts
1023 exts := make([]string, len(top))
1024 for i := range top {
1025 exts[i] = path.Ext(top[i].FileName)
1026 }
1027
1028 for i := range candidates {
1029 // Do not assume sorted due to boostNovelExtension being called on subsets
1030 if candidates[i].Score < minScoreForNovelty {
1031 continue
1032 }
1033
1034 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1035 continue
1036 }
1037
1038 // Found what we are looking for, now boost to front of candidates (which
1039 // is ms[boostOffset])
1040 for ; i > 0; i-- {
1041 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1042 }
1043 return
1044 }
1045}