fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "log"
20 "path"
21 "slices"
22 "sort"
23 "unicode"
24 "unicode/utf8"
25
26 "github.com/sourcegraph/zoekt"
27 "github.com/sourcegraph/zoekt/internal/ctags"
28)
29
30var _ = log.Println
31
32// contentProvider is an abstraction to treat matches for names and
33// content with the same code.
34type contentProvider struct {
35 id *indexData
36 stats *zoekt.Stats
37
38 // mutable
39 err error
40 idx uint32
41 _data []byte
42 _nl []uint32
43 _nlBuf []uint32
44 _sects []DocumentSection
45 _sectBuf []DocumentSection
46 fileSize uint32
47}
48
49// setDocument skips to the given document.
50func (p *contentProvider) setDocument(docID uint32) {
51 fileStart := p.id.boundaries[docID]
52
53 p.idx = docID
54 p.fileSize = p.id.boundaries[docID+1] - fileStart
55
56 p._nl = nil
57 p._sects = nil
58 p._data = nil
59}
60
61func (p *contentProvider) docSections() []DocumentSection {
62 if p._sects == nil {
63 var sz uint32
64 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
65 p.stats.ContentBytesLoaded += int64(sz)
66 p._sectBuf = p._sects
67 }
68 return p._sects
69}
70
71func (p *contentProvider) newlines() newlines {
72 if p._nl == nil {
73 var sz uint32
74 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
75 p._nlBuf = p._nl
76 p.stats.ContentBytesLoaded += int64(sz)
77 }
78 return newlines{locs: p._nl, fileSize: p.fileSize}
79}
80
81func (p *contentProvider) data(fileName bool) []byte {
82 if fileName {
83 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
84 }
85
86 if p._data == nil {
87 p._data, p.err = p.id.readContents(p.idx)
88 p.stats.FilesLoaded++
89 p.stats.ContentBytesLoaded += int64(len(p._data))
90 }
91 return p._data
92}
93
94// Find offset in bytes (relative to corpus start) for an offset in
95// runes (relative to document start). If filename is set, the corpus
96// is the set of filenames, with the document being the name itself.
97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
98 if p.id.metaData.PlainASCII {
99 return r
100 }
101
102 sample := p.id.runeOffsets
103 runeEnds := p.id.fileEndRunes
104 fileStartByte := p.id.boundaries[p.idx]
105 if filename {
106 sample = p.id.fileNameRuneOffsets
107 runeEnds = p.id.fileNameEndRunes
108 fileStartByte = p.id.fileNameIndex[p.idx]
109 }
110
111 absR := r
112 if p.idx > 0 {
113 absR += runeEnds[p.idx-1]
114 }
115
116 byteOff, left := sample.lookup(absR)
117
118 var data []byte
119
120 if filename {
121 data = p.id.fileNameContent[byteOff:]
122 } else {
123 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
124 if p.err != nil {
125 return 0
126 }
127 }
128 for left > 0 {
129 _, sz := utf8.DecodeRune(data)
130 byteOff += uint32(sz)
131 data = data[sz:]
132 left--
133 }
134
135 byteOff -= fileStartByte
136 return byteOff
137}
138
139// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
140// It only ever returns content XOR filename matches, not both. If there are any
141// content matches, these are always returned, and we omit filename matches.
142//
143// Performance invariant: ms is sorted and non-overlapping.
144//
145// Note: the byte slices may be backed by mmapped data, so before being
146// returned by the API it needs to be copied.
147func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
148 var filenameMatches []*candidateMatch
149 contentMatches := make([]*candidateMatch, 0, len(ms))
150
151 for _, m := range ms {
152 if m.fileName {
153 filenameMatches = append(filenameMatches, m)
154 } else {
155 contentMatches = append(contentMatches, m)
156 }
157 }
158
159 // If there are any content matches, we only return these and skip filename matches.
160 if len(contentMatches) > 0 {
161 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
162 return p.fillContentMatches(contentMatches, numContextLines, language, opts)
163 }
164
165 // Otherwise, we return a single line containing the filematch index.
166 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
167 res := zoekt.LineMatch{
168 Line: p.id.fileName(p.idx),
169 FileName: true,
170 Score: lineScore.score,
171 DebugScore: lineScore.debugScore,
172 }
173
174 for _, m := range ms {
175 res.LineFragments = append(res.LineFragments, zoekt.LineFragmentMatch{
176 LineOffset: int(m.byteOffset),
177 MatchLength: int(m.byteMatchSz),
178 Offset: m.byteOffset,
179 })
180 }
181
182 return []zoekt.LineMatch{res}
183
184}
185
186// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
187// It only ever returns content XOR filename matches, not both. If there are any content
188// matches, these are always returned, and we omit filename matches.
189//
190// Performance invariant: ms is sorted and non-overlapping.
191//
192// Note: the byte slices may be backed by mmapped data, so before being
193// returned by the API it needs to be copied.
194func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
195 var filenameMatches []*candidateMatch
196 contentMatches := make([]*candidateMatch, 0, len(ms))
197
198 for _, m := range ms {
199 if m.fileName {
200 filenameMatches = append(filenameMatches, m)
201 } else {
202 contentMatches = append(contentMatches, m)
203 }
204 }
205
206 // If there are any content matches, we only return these and skip filename matches.
207 if len(contentMatches) > 0 {
208 return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts)
209 }
210
211 // Otherwise, we return a single chunk representing the filename index.
212 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
213 fileName := p.id.fileName(p.idx)
214 ranges := make([]zoekt.Range, 0, len(ms))
215 for _, m := range ms {
216 ranges = append(ranges, zoekt.Range{
217 Start: zoekt.Location{
218 ByteOffset: m.byteOffset,
219 LineNumber: 1,
220 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
221 },
222 End: zoekt.Location{
223 ByteOffset: m.byteOffset + m.byteMatchSz,
224 LineNumber: 1,
225 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
226 },
227 })
228 }
229
230 return []zoekt.ChunkMatch{{
231 Content: fileName,
232 ContentStart: zoekt.Location{ByteOffset: 0, LineNumber: 1, Column: 1},
233 Ranges: ranges,
234 FileName: true,
235 Score: lineScore.score,
236 DebugScore: lineScore.debugScore,
237 }}
238}
239
240func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
241 var result []zoekt.LineMatch
242 for len(ms) > 0 {
243 m := ms[0]
244 num := p.newlines().atOffset(m.byteOffset)
245 lineStart := int(p.newlines().lineStart(num))
246 nextLineStart := int(p.newlines().lineStart(num + 1))
247
248 var lineCands []*candidateMatch
249
250 endMatch := m.byteOffset + m.byteMatchSz
251
252 for len(ms) > 0 {
253 m := ms[0]
254 if int(m.byteOffset) < nextLineStart {
255 endMatch = m.byteOffset + m.byteMatchSz
256 lineCands = append(lineCands, m)
257 ms = ms[1:]
258 } else {
259 break
260 }
261 }
262
263 if len(lineCands) == 0 {
264 log.Panicf(
265 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
266 p.id.fileName(p.idx), p.id.metaData,
267 num, lineStart, nextLineStart,
268 m.byteOffset)
269 }
270
271 data := p.data(false)
272
273 // Due to merging matches, we may have a match that
274 // crosses a line boundary. Prevent confusion by
275 // taking lines until we pass the last index
276 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
277 next := bytes.IndexByte(data[nextLineStart:], '\n')
278 if next == -1 {
279 nextLineStart = len(data)
280 } else {
281 // TODO(hanwen): test that checks "+1" part here.
282 nextLineStart += next + 1
283 }
284 }
285
286 finalMatch := zoekt.LineMatch{
287 LineStart: lineStart,
288 LineEnd: nextLineStart,
289 LineNumber: num,
290 }
291 finalMatch.Line = data[lineStart:nextLineStart]
292
293 if numContextLines > 0 {
294 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
295 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
296 }
297
298 lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts)
299 finalMatch.Score = lineScore.score
300 finalMatch.DebugScore = lineScore.debugScore
301
302 for i, m := range lineCands {
303 fragment := zoekt.LineFragmentMatch{
304 Offset: m.byteOffset,
305 LineOffset: int(m.byteOffset) - lineStart,
306 MatchLength: int(m.byteMatchSz),
307 }
308
309 if i < len(symbolInfo) && symbolInfo[i] != nil {
310 fragment.SymbolInfo = symbolInfo[i]
311 }
312
313 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
314 }
315 result = append(result, finalMatch)
316 }
317 return result
318}
319
320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
321 data := p.data(false)
322
323 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
324 // However, it depends on ms being sorted by byteOffset and non-overlapping.
325 // This invariant is true at the time of writing, but we conservatively
326 // enforce this. Note: chunkCandidates preserves the sorting so safe to
327 // transform now.
328 columnHelper := columnHelper{data: data}
329 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
330 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
331 sort.Sort((sortByOffsetSlice)(ms))
332 }
333
334 newlines := p.newlines()
335 chunks := chunkCandidates(ms, newlines, numContextLines)
336 chunkMatches := make([]zoekt.ChunkMatch, 0, len(chunks))
337 for _, chunk := range chunks {
338 ranges := make([]zoekt.Range, 0, len(chunk.candidates))
339 for _, cm := range chunk.candidates {
340 startOffset := cm.byteOffset
341 endOffset := cm.byteOffset + cm.byteMatchSz
342 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
343
344 ranges = append(ranges, zoekt.Range{
345 Start: zoekt.Location{
346 ByteOffset: startOffset,
347 LineNumber: uint32(startLine),
348 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
349 },
350 End: zoekt.Location{
351 ByteOffset: endOffset,
352 LineNumber: uint32(endLine),
353 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
354 },
355 })
356 }
357
358 firstLineNumber := int(chunk.firstLine) - numContextLines
359 if firstLineNumber < 1 {
360 firstLineNumber = 1
361 }
362 firstLineStart := newlines.lineStart(firstLineNumber)
363
364 chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts)
365 chunkMatches = append(chunkMatches, zoekt.ChunkMatch{
366 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
367 ContentStart: zoekt.Location{
368 ByteOffset: firstLineStart,
369 LineNumber: uint32(firstLineNumber),
370 Column: 1,
371 },
372 FileName: false,
373 Ranges: ranges,
374 SymbolInfo: symbolInfo,
375 BestLineMatch: uint32(chunkScore.bestLine),
376 Score: chunkScore.score,
377 DebugScore: chunkScore.debugScore,
378 })
379 }
380 return chunkMatches
381}
382
383type candidateChunk struct {
384 candidates []*candidateMatch
385 firstLine uint32 // 1-based, inclusive
386 lastLine uint32 // 1-based, inclusive
387 minOffset uint32 // 0-based, inclusive
388 maxOffset uint32 // 0-based, exclusive
389}
390
391// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
392// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
393// it to overlap with an adjacent chunk.
394//
395// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
396// output invariants: if you flatten candidates the input invariant is retained.
397func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
398 var chunks []candidateChunk
399
400 for _, m := range ms {
401 startOffset := m.byteOffset
402 endOffset := m.byteOffset + m.byteMatchSz
403 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
404
405 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
406 // If a new chunk created with the current candidateMatch would
407 // overlap with the previous chunk, instead add the candidateMatch
408 // to the last chunk and extend end of the last chunk.
409 last := &chunks[len(chunks)-1]
410 last.candidates = append(last.candidates, m)
411 if last.maxOffset < endOffset {
412 last.lastLine = uint32(lastLine)
413 last.maxOffset = uint32(endOffset)
414 }
415 } else {
416 chunks = append(chunks, candidateChunk{
417 firstLine: uint32(firstLine),
418 lastLine: uint32(lastLine),
419 minOffset: startOffset,
420 maxOffset: endOffset,
421 candidates: []*candidateMatch{m},
422 })
423 }
424 }
425 return chunks
426}
427
428// columnHelper is a helper struct which caches the number of runes last
429// counted. If we naively use utf8.RuneCount for each match on a line, this
430// leads to an O(nm) algorithm where m is the number of matches and n is the
431// length of the line. Aassuming we our candidates are increasing in offset
432// makes this operation O(n) instead.
433type columnHelper struct {
434 data []byte
435
436 // 0 values for all these are valid values
437 lastLineOffset int
438 lastOffset uint32
439 lastRuneCount uint32
440}
441
442// get returns the line column for offset. offset is the byte offset of the
443// rune in data. lineOffset is the byte offset inside of data for the line
444// containing offset.
445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
446 var runeCount uint32
447
448 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
449 // Can count from last calculation
450 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
451 } else {
452 // Need to count from the beginning of line
453 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
454 }
455
456 c.lastLineOffset = lineOffset
457 c.lastOffset = offset
458 c.lastRuneCount = runeCount
459
460 return runeCount + 1
461}
462
463type newlines struct {
464 // locs is the sorted set of byte offsets of the newlines in the file
465 locs []uint32
466
467 // fileSize is just the number of bytes in the file. It is stored
468 // on this struct so we can safely know the length of the last line
469 // in the file since not all files end in a newline.
470 fileSize uint32
471}
472
473// atOffset returns the line containing the offset. If the offset lands on
474// the newline ending line M, we return M.
475func (nls newlines) atOffset(offset uint32) (lineNumber int) {
476 idx := sort.Search(len(nls.locs), func(n int) bool {
477 return nls.locs[n] >= offset
478 })
479 return idx + 1
480}
481
482// lineStart returns the byte offset of the beginning of the given line.
483// lineNumber is 1-based. If lineNumber is out of range of the lines in the
484// file, the return value will be clamped to [0,fileSize].
485func (nls newlines) lineStart(lineNumber int) uint32 {
486 // nls.locs[0] + 1 is the start of the 2nd line of data.
487 startIdx := lineNumber - 2
488
489 if startIdx < 0 {
490 return 0
491 } else if startIdx >= len(nls.locs) {
492 return nls.fileSize
493 } else {
494 return nls.locs[startIdx] + 1
495 }
496}
497
498// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
502 startLine = nls.atOffset(startOffset)
503 endLine = nls.atOffset(
504 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
505 )
506 return startLine, endLine
507}
508
509// getLines returns a slice of data containing the lines [low, high).
510// low is 1-based and inclusive. high is 1-based and exclusive.
511func (nls newlines) getLines(data []byte, low, high int) []byte {
512 if low >= high {
513 return nil
514 }
515
516 return data[nls.lineStart(low):nls.lineStart(high)]
517}
518
519const (
520 // Query-dependent scoring signals. All of these together are bounded at ~9000
521 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
522 scorePartialWordMatch = 50.0
523 scoreWordMatch = 500.0
524 scoreBase = 7000.0
525 scorePartialBase = 4000.0
526 scoreSymbol = 7000.0
527 scorePartialSymbol = 4000.0
528 scoreKindMatch = 100.0
529 scoreFactorAtomMatch = 400.0
530
531 // Used for ordering line and chunk matches within a file.
532 scoreLineOrderFactor = 1.0
533
534 // Used for tiebreakers. The scores are not combined with the main score, but
535 // are used to break ties between matches with the same score. The factors are
536 // chosen to separate the tiebreakers from the main score and from each other.
537 // If you make changes here, make sure to update indexData.scoreFile too.
538 scoreRepoRankFactor = 100.0
539 scoreFileOrderFactor = 10.0
540)
541
542// findMaxOverlappingSection returns the index of the section in secs that
543// overlaps the most with the area defined by off and sz, relative to the size
544// of the section. If no section overlaps, it returns 0, false. If multiple
545// sections overlap the same amount, the first one is returned.
546//
547// The implementation assumes that sections do not overlap and are sorted by
548// DocumentSection.Start.
549func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
550 start := off
551 end := off + sz
552
553 // Find the first section that might overlap
554 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
555
556 if j == len(secs) || secs[j].Start >= end {
557 // No overlap.
558 return 0, false
559 }
560
561 relOverlap := func(j int) float64 {
562 secSize := secs[j].End - secs[j].Start
563 if secSize == 0 {
564 return 0
565 }
566 // This cannot overflow because we make sure there is overlap before calling relOverlap
567 overlap := min(secs[j].End, end) - max(secs[j].Start, start)
568 return float64(overlap) / float64(secSize)
569 }
570
571 ol1 := relOverlap(j)
572 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
573 return uint32(j), ol1 > 0
574 }
575
576 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
577 // if the second section overlaps more than the first one, because a third
578 // section can only overlap if the overlap with the second section is complete.
579 ol2 := relOverlap(j + 1)
580 if ol2 > ol1 {
581 return uint32(j + 1), ol2 > 0
582 }
583
584 return uint32(j), ol1 > 0
585}
586
587func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
588 if cm.fileName {
589 return false
590 }
591
592 // Check if this candidate came from a symbol matchTree
593 if cm.symbol {
594 return true
595 }
596
597 // Check if it overlaps with a symbol.
598 secs := p.docSections()
599 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
600 return ok
601}
602
603func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *zoekt.Symbol, bool) {
604 if cm.fileName {
605 return DocumentSection{}, nil, false
606 }
607
608 secs := p.docSections()
609
610 secIdx, ok := cm.symbolIdx, cm.symbol
611 if !ok {
612 // Not from a symbol matchTree. Let's see if it overlaps with a symbol.
613 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
614 }
615 if !ok {
616 return DocumentSection{}, nil, false
617 }
618
619 sec := secs[secIdx]
620
621 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
622 // since some callsites do not need it stored, and that incurs an extra
623 // copy.
624 //
625 // 2024-01-08 we are refactoring this and the code path indicates this can
626 // fail, so callers need to handle nil symbol. However, it would be
627 // surprising that we have a matching section but not symbol data.
628 start := p.id.fileEndSymbol[p.idx]
629 si := p.id.symbols.data(start + secIdx)
630
631 return sec, si, true
632}
633
634// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
635// End such that it won't be out of range.
636func sectionSlice(data []byte, sec DocumentSection) []byte {
637 l := uint32(len(data))
638 if sec.Start >= l {
639 return nil
640 }
641 if sec.End > l {
642 sec.End = l
643 }
644 return data[sec.Start:sec.End]
645}
646
647// scoreSymbolKind boosts a match based on the combination of language, symbol
648// and kind. The language string comes from go-enry, the symbol and kind from
649// ctags.
650func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
651 var factor float64
652
653 // Generic ranking which will be overriden by language specific ranking
654 switch kind {
655 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
656 factor = 8
657 case ctags.Class:
658 factor = 10
659 case ctags.Struct:
660 factor = 9.5
661 case ctags.Enum:
662 factor = 9
663 case ctags.Interface:
664 factor = 8
665 case ctags.Function, ctags.Method:
666 factor = 7
667 case ctags.Field:
668 factor = 5.5
669 case ctags.Constant:
670 factor = 5
671 case ctags.Variable:
672 factor = 4
673 default:
674 // For all other kinds, assign a low score by default.
675 factor = 1
676 }
677
678 switch language {
679 case "Java", "java":
680 switch kind {
681 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
682 // to "classes" instead of "c". We have to cover both cases to support existing
683 // indexes.
684 case ctags.Class:
685 factor = 10
686 case ctags.Enum:
687 factor = 9
688 case ctags.Interface:
689 factor = 8
690 case ctags.Method:
691 factor = 7
692 case ctags.Field:
693 factor = 6
694 case ctags.EnumConstant:
695 factor = 5
696 }
697 case "Kotlin", "kotlin":
698 switch kind {
699 case ctags.Class:
700 factor = 10
701 case ctags.Interface:
702 factor = 9
703 case ctags.Method:
704 factor = 8
705 case ctags.TypeAlias:
706 factor = 7
707 case ctags.Constant:
708 factor = 6
709 case ctags.Variable:
710 factor = 5
711 }
712 case "Go", "go":
713 switch kind {
714 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
715 // for each case a description of the fields in ctags in the comment
716 case ctags.Type: // interface struct talias
717 factor = 9
718 case ctags.Interface: // interfaces
719 factor = 10
720 case ctags.Struct: // structs
721 factor = 9
722 case ctags.TypeAlias: // type aliases
723 factor = 9
724 case ctags.MethodSpec: // interface method specification
725 factor = 8.5
726 case ctags.Method, ctags.Function: // functions
727 factor = 8
728 case ctags.Field: // struct fields
729 factor = 7
730 case ctags.Constant: // constants
731 factor = 6
732 case ctags.Variable: // variables
733 factor = 5
734 }
735
736 // Boost exported go symbols. Same implementation as token.IsExported
737 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
738 factor += 0.5
739 }
740
741 if bytes.HasSuffix(filename, []byte("_test.go")) {
742 factor *= 0.8
743 }
744
745 // Could also rank on:
746 //
747 // - anonMember struct anonymous members
748 // - packageName name for specifying imported package
749 // - receiver receivers
750 // - package packages
751 // - type types
752 // - unknown unknown
753 case "C++", "c++":
754 switch kind {
755 case ctags.Class: // classes
756 factor = 10
757 case ctags.Enum: // enumeration names
758 factor = 9
759 case ctags.Function: // function definitions
760 factor = 8
761 case ctags.Struct: // structure names
762 factor = 7
763 case ctags.Union: // union names
764 factor = 6
765 case ctags.TypeAlias: // typedefs
766 factor = 5
767 case ctags.Field: // class, struct, and union members
768 factor = 4
769 case ctags.Variable: // varialbe definitions
770 factor = 3
771 }
772 // Could also rank on:
773 // NAME DESCRIPTION
774 // macro macro definitions
775 // enumerator enumerators (values inside an enumeration)
776 // header included header files
777 // namespace namespaces
778 // variable variable definitions
779 case "Scala", "scala":
780 switch kind {
781 case ctags.Class:
782 factor = 10
783 case ctags.Interface:
784 factor = 9
785 case ctags.Object:
786 factor = 8
787 case ctags.Function:
788 factor = 7
789 case ctags.Type:
790 factor = 6
791 case ctags.Variable:
792 factor = 5
793 case ctags.Package:
794 factor = 4
795 }
796 case "Python", "python":
797 switch kind {
798 case ctags.Class: // classes
799 factor = 10
800 case ctags.Function, ctags.Method: // function definitions
801 factor = 8
802 case ctags.Field: // class, struct, and union members
803 factor = 4
804 case ctags.Variable: // variable definitions
805 factor = 3
806 case ctags.Local: // local variables
807 factor = 2
808 }
809 // Could also rank on:
810 //
811 // - namespace name referring a module defined in other file
812 // - module modules
813 // - unknown name referring a class/variable/function/module defined in other module
814 // - parameter function parameters
815 case "Ruby", "ruby":
816 switch kind {
817 case ctags.Class:
818 factor = 10
819 case ctags.Method:
820 factor = 9
821 case ctags.MethodAlias:
822 factor = 8
823 case ctags.Module:
824 factor = 7
825 case ctags.SingletonMethod:
826 factor = 6
827 case ctags.Constant:
828 factor = 5
829 case ctags.Accessor:
830 factor = 4
831 case ctags.Library:
832 factor = 3
833 }
834 case "PHP", "php":
835 switch kind {
836 case ctags.Class:
837 factor = 10
838 case ctags.Interface:
839 factor = 9
840 case ctags.Function:
841 factor = 8
842 case ctags.Trait:
843 factor = 7
844 case ctags.Define:
845 factor = 6
846 case ctags.Namespace:
847 factor = 5
848 case ctags.MethodAlias:
849 factor = 4
850 case ctags.Variable:
851 factor = 3
852 case ctags.Local:
853 factor = 3
854 }
855 case "GraphQL", "graphql":
856 switch kind {
857 case ctags.Type:
858 factor = 10
859 }
860 case "Markdown", "markdown":
861 // Headers are good signal in docs, but do not rank as highly as code.
862 switch kind {
863 case ctags.Chapter: // #
864 factor = 4
865 case ctags.Section: // ##
866 factor = 3
867 case ctags.Subsection: // ###
868 factor = 2
869 }
870 }
871
872 return factor * scoreKindMatch
873}
874
875type matchScoreSlice []zoekt.LineMatch
876
877func (m matchScoreSlice) Len() int { return len(m) }
878func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
879func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
880
881type chunkMatchScoreSlice []zoekt.ChunkMatch
882
883func (m chunkMatchScoreSlice) Len() int { return len(m) }
884func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
885func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
886
887type fileMatchesByScore []zoekt.FileMatch
888
889func (m fileMatchesByScore) Len() int { return len(m) }
890func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
891func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
892
893func sortMatchesByScore(ms []zoekt.LineMatch) {
894 sort.Sort(matchScoreSlice(ms))
895}
896
897func sortChunkMatchesByScore(ms []zoekt.ChunkMatch) {
898 sort.Sort(chunkMatchScoreSlice(ms))
899}
900
901// SortFiles sorts files matches in the order we want to present results to
902// users. The order depends on the match score, which includes both
903// query-dependent signals like word overlap, and file-only signals like the
904// file ranks (if file ranks are enabled).
905//
906// We don't only use the scores, we will also boost some results to present
907// files with novel extensions.
908func SortFiles(ms []zoekt.FileMatch) {
909 sort.Sort(fileMatchesByScore(ms))
910
911 // Boost a file extension not in the top 3 to the third filematch.
912 boostNovelExtension(ms, 2, 0.9)
913}
914
915func boostNovelExtension(ms []zoekt.FileMatch, boostOffset int, minScoreRatio float64) {
916 if len(ms) <= boostOffset+1 {
917 return
918 }
919
920 top := ms[:boostOffset]
921 candidates := ms[boostOffset:]
922
923 // Don't bother boosting something which is significantly different to the
924 // result it replaces.
925 minScoreForNovelty := candidates[0].Score * minScoreRatio
926
927 // We want to look for an ext that isn't in the top exts
928 exts := make([]string, len(top))
929 for i := range top {
930 exts[i] = path.Ext(top[i].FileName)
931 }
932
933 for i := range candidates {
934 // Do not assume sorted due to boostNovelExtension being called on subsets
935 if candidates[i].Score < minScoreForNovelty {
936 continue
937 }
938
939 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
940 continue
941 }
942
943 // Found what we are looking for, now boost to front of candidates (which
944 // is ms[boostOffset])
945 for ; i > 0; i-- {
946 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
947 }
948 return
949 }
950}