fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "log"
20 "path"
21 "slices"
22 "sort"
23 "unicode"
24 "unicode/utf8"
25
26 "github.com/sourcegraph/zoekt/ctags"
27)
28
29var _ = log.Println
30
31// contentProvider is an abstraction to treat matches for names and
32// content with the same code.
33type contentProvider struct {
34 id *indexData
35 stats *Stats
36
37 // mutable
38 err error
39 idx uint32
40 _data []byte
41 _nl []uint32
42 _nlBuf []uint32
43 _sects []DocumentSection
44 _sectBuf []DocumentSection
45 fileSize uint32
46}
47
48// setDocument skips to the given document.
49func (p *contentProvider) setDocument(docID uint32) {
50 fileStart := p.id.boundaries[docID]
51
52 p.idx = docID
53 p.fileSize = p.id.boundaries[docID+1] - fileStart
54
55 p._nl = nil
56 p._sects = nil
57 p._data = nil
58}
59
60func (p *contentProvider) docSections() []DocumentSection {
61 if p._sects == nil {
62 var sz uint32
63 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
64 p.stats.ContentBytesLoaded += int64(sz)
65 p._sectBuf = p._sects
66 }
67 return p._sects
68}
69
70func (p *contentProvider) newlines() newlines {
71 if p._nl == nil {
72 var sz uint32
73 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
74 p._nlBuf = p._nl
75 p.stats.ContentBytesLoaded += int64(sz)
76 }
77 return newlines{locs: p._nl, fileSize: p.fileSize}
78}
79
80func (p *contentProvider) data(fileName bool) []byte {
81 if fileName {
82 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
83 }
84
85 if p._data == nil {
86 p._data, p.err = p.id.readContents(p.idx)
87 p.stats.FilesLoaded++
88 p.stats.ContentBytesLoaded += int64(len(p._data))
89 }
90 return p._data
91}
92
93// Find offset in bytes (relative to corpus start) for an offset in
94// runes (relative to document start). If filename is set, the corpus
95// is the set of filenames, with the document being the name itself.
96func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
97 if p.id.metaData.PlainASCII {
98 return r
99 }
100
101 sample := p.id.runeOffsets
102 runeEnds := p.id.fileEndRunes
103 fileStartByte := p.id.boundaries[p.idx]
104 if filename {
105 sample = p.id.fileNameRuneOffsets
106 runeEnds = p.id.fileNameEndRunes
107 fileStartByte = p.id.fileNameIndex[p.idx]
108 }
109
110 absR := r
111 if p.idx > 0 {
112 absR += runeEnds[p.idx-1]
113 }
114
115 byteOff, left := sample.lookup(absR)
116
117 var data []byte
118
119 if filename {
120 data = p.id.fileNameContent[byteOff:]
121 } else {
122 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
123 if p.err != nil {
124 return 0
125 }
126 }
127 for left > 0 {
128 _, sz := utf8.DecodeRune(data)
129 byteOff += uint32(sz)
130 data = data[sz:]
131 left--
132 }
133
134 byteOff -= fileStartByte
135 return byteOff
136}
137
138// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
139// It only ever returns content XOR filename matches, not both. If there are any
140// content matches, these are always returned, and we omit filename matches.
141//
142// Performance invariant: ms is sorted and non-overlapping.
143//
144// Note: the byte slices may be backed by mmapped data, so before being
145// returned by the API it needs to be copied.
146func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch {
147 var filenameMatches []*candidateMatch
148 contentMatches := make([]*candidateMatch, 0, len(ms))
149
150 for _, m := range ms {
151 if m.fileName {
152 filenameMatches = append(filenameMatches, m)
153 } else {
154 contentMatches = append(contentMatches, m)
155 }
156 }
157
158 // If there are any content matches, we only return these and skip filename matches.
159 if len(contentMatches) > 0 {
160 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
161 return p.fillContentMatches(contentMatches, numContextLines, language, opts)
162 }
163
164 // Otherwise, we return a single line containing the filematch match.
165 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
166 res := LineMatch{
167 Line: p.id.fileName(p.idx),
168 FileName: true,
169 Score: lineScore.score,
170 DebugScore: lineScore.debugScore,
171 }
172
173 for _, m := range ms {
174 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
175 LineOffset: int(m.byteOffset),
176 MatchLength: int(m.byteMatchSz),
177 Offset: m.byteOffset,
178 })
179 }
180
181 return []LineMatch{res}
182
183}
184
185// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
186// It only ever returns content XOR filename matches, not both. If there are any content
187// matches, these are always returned, and we omit filename matches.
188//
189// Performance invariant: ms is sorted and non-overlapping.
190//
191// Note: the byte slices may be backed by mmapped data, so before being
192// returned by the API it needs to be copied.
193func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch {
194 var filenameMatches []*candidateMatch
195 contentMatches := make([]*candidateMatch, 0, len(ms))
196
197 for _, m := range ms {
198 if m.fileName {
199 filenameMatches = append(filenameMatches, m)
200 } else {
201 contentMatches = append(contentMatches, m)
202 }
203 }
204
205 // If there are any content matches, we only return these and skip filename matches.
206 if len(contentMatches) > 0 {
207 return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts)
208 }
209
210 // Otherwise, we return a single chunk representing the filename match.
211 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
212 fileName := p.id.fileName(p.idx)
213 ranges := make([]Range, 0, len(ms))
214 for _, m := range ms {
215 ranges = append(ranges, Range{
216 Start: Location{
217 ByteOffset: m.byteOffset,
218 LineNumber: 1,
219 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
220 },
221 End: Location{
222 ByteOffset: m.byteOffset + m.byteMatchSz,
223 LineNumber: 1,
224 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
225 },
226 })
227 }
228
229 return []ChunkMatch{{
230 Content: fileName,
231 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
232 Ranges: ranges,
233 FileName: true,
234 Score: lineScore.score,
235 DebugScore: lineScore.debugScore,
236 }}
237}
238
239func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch {
240 var result []LineMatch
241 for len(ms) > 0 {
242 m := ms[0]
243 num := p.newlines().atOffset(m.byteOffset)
244 lineStart := int(p.newlines().lineStart(num))
245 nextLineStart := int(p.newlines().lineStart(num + 1))
246
247 var lineCands []*candidateMatch
248
249 endMatch := m.byteOffset + m.byteMatchSz
250
251 for len(ms) > 0 {
252 m := ms[0]
253 if int(m.byteOffset) < nextLineStart {
254 endMatch = m.byteOffset + m.byteMatchSz
255 lineCands = append(lineCands, m)
256 ms = ms[1:]
257 } else {
258 break
259 }
260 }
261
262 if len(lineCands) == 0 {
263 log.Panicf(
264 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
265 p.id.fileName(p.idx), p.id.metaData,
266 num, lineStart, nextLineStart,
267 m.byteOffset)
268 }
269
270 data := p.data(false)
271
272 // Due to merging matches, we may have a match that
273 // crosses a line boundary. Prevent confusion by
274 // taking lines until we pass the last match
275 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
276 next := bytes.IndexByte(data[nextLineStart:], '\n')
277 if next == -1 {
278 nextLineStart = len(data)
279 } else {
280 // TODO(hanwen): test that checks "+1" part here.
281 nextLineStart += next + 1
282 }
283 }
284
285 finalMatch := LineMatch{
286 LineStart: lineStart,
287 LineEnd: nextLineStart,
288 LineNumber: num,
289 }
290 finalMatch.Line = data[lineStart:nextLineStart]
291
292 if numContextLines > 0 {
293 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
294 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
295 }
296
297 lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts)
298 finalMatch.Score = lineScore.score
299 finalMatch.DebugScore = lineScore.debugScore
300
301 for i, m := range lineCands {
302 fragment := LineFragmentMatch{
303 Offset: m.byteOffset,
304 LineOffset: int(m.byteOffset) - lineStart,
305 MatchLength: int(m.byteMatchSz),
306 }
307
308 if i < len(symbolInfo) && symbolInfo[i] != nil {
309 fragment.SymbolInfo = symbolInfo[i]
310 }
311
312 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
313 }
314 result = append(result, finalMatch)
315 }
316 return result
317}
318
319func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch {
320 data := p.data(false)
321
322 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
323 // However, it depends on ms being sorted by byteOffset and non-overlapping.
324 // This invariant is true at the time of writing, but we conservatively
325 // enforce this. Note: chunkCandidates preserves the sorting so safe to
326 // transform now.
327 columnHelper := columnHelper{data: data}
328 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
329 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
330 sort.Sort((sortByOffsetSlice)(ms))
331 }
332
333 newlines := p.newlines()
334 chunks := chunkCandidates(ms, newlines, numContextLines)
335 chunkMatches := make([]ChunkMatch, 0, len(chunks))
336 for _, chunk := range chunks {
337 ranges := make([]Range, 0, len(chunk.candidates))
338 for _, cm := range chunk.candidates {
339 startOffset := cm.byteOffset
340 endOffset := cm.byteOffset + cm.byteMatchSz
341 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
342
343 ranges = append(ranges, Range{
344 Start: Location{
345 ByteOffset: startOffset,
346 LineNumber: uint32(startLine),
347 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
348 },
349 End: Location{
350 ByteOffset: endOffset,
351 LineNumber: uint32(endLine),
352 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
353 },
354 })
355 }
356
357 firstLineNumber := int(chunk.firstLine) - numContextLines
358 if firstLineNumber < 1 {
359 firstLineNumber = 1
360 }
361 firstLineStart := newlines.lineStart(firstLineNumber)
362
363 chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts)
364 chunkMatches = append(chunkMatches, ChunkMatch{
365 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
366 ContentStart: Location{
367 ByteOffset: firstLineStart,
368 LineNumber: uint32(firstLineNumber),
369 Column: 1,
370 },
371 FileName: false,
372 Ranges: ranges,
373 SymbolInfo: symbolInfo,
374 BestLineMatch: uint32(chunkScore.bestLine),
375 Score: chunkScore.score,
376 DebugScore: chunkScore.debugScore,
377 })
378 }
379 return chunkMatches
380}
381
382type candidateChunk struct {
383 candidates []*candidateMatch
384 firstLine uint32 // 1-based, inclusive
385 lastLine uint32 // 1-based, inclusive
386 minOffset uint32 // 0-based, inclusive
387 maxOffset uint32 // 0-based, exclusive
388}
389
390// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
391// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
392// it to overlap with an adjacent chunk.
393//
394// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
395// output invariants: if you flatten candidates the input invariant is retained.
396func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
397 var chunks []candidateChunk
398
399 for _, m := range ms {
400 startOffset := m.byteOffset
401 endOffset := m.byteOffset + m.byteMatchSz
402 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
403
404 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
405 // If a new chunk created with the current candidateMatch would
406 // overlap with the previous chunk, instead add the candidateMatch
407 // to the last chunk and extend end of the last chunk.
408 last := &chunks[len(chunks)-1]
409 last.candidates = append(last.candidates, m)
410 if last.maxOffset < endOffset {
411 last.lastLine = uint32(lastLine)
412 last.maxOffset = uint32(endOffset)
413 }
414 } else {
415 chunks = append(chunks, candidateChunk{
416 firstLine: uint32(firstLine),
417 lastLine: uint32(lastLine),
418 minOffset: startOffset,
419 maxOffset: endOffset,
420 candidates: []*candidateMatch{m},
421 })
422 }
423 }
424 return chunks
425}
426
427// columnHelper is a helper struct which caches the number of runes last
428// counted. If we naively use utf8.RuneCount for each match on a line, this
429// leads to an O(nm) algorithm where m is the number of matches and n is the
430// length of the line. Aassuming we our candidates are increasing in offset
431// makes this operation O(n) instead.
432type columnHelper struct {
433 data []byte
434
435 // 0 values for all these are valid values
436 lastLineOffset int
437 lastOffset uint32
438 lastRuneCount uint32
439}
440
441// get returns the line column for offset. offset is the byte offset of the
442// rune in data. lineOffset is the byte offset inside of data for the line
443// containing offset.
444func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
445 var runeCount uint32
446
447 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
448 // Can count from last calculation
449 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
450 } else {
451 // Need to count from the beginning of line
452 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
453 }
454
455 c.lastLineOffset = lineOffset
456 c.lastOffset = offset
457 c.lastRuneCount = runeCount
458
459 return runeCount + 1
460}
461
462type newlines struct {
463 // locs is the sorted set of byte offsets of the newlines in the file
464 locs []uint32
465
466 // fileSize is just the number of bytes in the file. It is stored
467 // on this struct so we can safely know the length of the last line
468 // in the file since not all files end in a newline.
469 fileSize uint32
470}
471
472// atOffset returns the line containing the offset. If the offset lands on
473// the newline ending line M, we return M.
474func (nls newlines) atOffset(offset uint32) (lineNumber int) {
475 idx := sort.Search(len(nls.locs), func(n int) bool {
476 return nls.locs[n] >= offset
477 })
478 return idx + 1
479}
480
481// lineStart returns the byte offset of the beginning of the given line.
482// lineNumber is 1-based. If lineNumber is out of range of the lines in the
483// file, the return value will be clamped to [0,fileSize].
484func (nls newlines) lineStart(lineNumber int) uint32 {
485 // nls.locs[0] + 1 is the start of the 2nd line of data.
486 startIdx := lineNumber - 2
487
488 if startIdx < 0 {
489 return 0
490 } else if startIdx >= len(nls.locs) {
491 return nls.fileSize
492 } else {
493 return nls.locs[startIdx] + 1
494 }
495}
496
497// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
498// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
499// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
500func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
501 startLine = nls.atOffset(startOffset)
502 endLine = nls.atOffset(
503 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
504 )
505 return startLine, endLine
506}
507
508// getLines returns a slice of data containing the lines [low, high).
509// low is 1-based and inclusive. high is 1-based and exclusive.
510func (nls newlines) getLines(data []byte, low, high int) []byte {
511 if low >= high {
512 return nil
513 }
514
515 return data[nls.lineStart(low):nls.lineStart(high)]
516}
517
518const (
519 // Query-dependent scoring signals. All of these together are bounded at ~9000
520 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
521 scorePartialWordMatch = 50.0
522 scoreWordMatch = 500.0
523 scoreBase = 7000.0
524 scorePartialBase = 4000.0
525 scoreSymbol = 7000.0
526 scorePartialSymbol = 4000.0
527 scoreKindMatch = 100.0
528 scoreFactorAtomMatch = 400.0
529
530 // Used for ordering line and chunk matches within a file.
531 scoreLineOrderFactor = 1.0
532
533 // Used for tiebreakers. The scores are not combined with the main score, but
534 // are used to break ties between matches with the same score. The factors are
535 // chosen to separate the tiebreakers from the main score and from each other.
536 // If you make changes here, make sure to update indexData.scoreFile too.
537 scoreRepoRankFactor = 100.0
538 scoreFileOrderFactor = 10.0
539)
540
541// findMaxOverlappingSection returns the index of the section in secs that
542// overlaps the most with the area defined by off and sz, relative to the size
543// of the section. If no section overlaps, it returns 0, false. If multiple
544// sections overlap the same amount, the first one is returned.
545//
546// The implementation assumes that sections do not overlap and are sorted by
547// DocumentSection.Start.
548func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
549 start := off
550 end := off + sz
551
552 // Find the first section that might overlap
553 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
554
555 if j == len(secs) || secs[j].Start >= end {
556 // No overlap.
557 return 0, false
558 }
559
560 relOverlap := func(j int) float64 {
561 secSize := secs[j].End - secs[j].Start
562 if secSize == 0 {
563 return 0
564 }
565 // This cannot overflow because we make sure there is overlap before calling relOverlap
566 overlap := min(secs[j].End, end) - max(secs[j].Start, start)
567 return float64(overlap) / float64(secSize)
568 }
569
570 ol1 := relOverlap(j)
571 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
572 return uint32(j), ol1 > 0
573 }
574
575 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
576 // if the second section overlaps more than the first one, because a third
577 // section can only overlap if the overlap with the second section is complete.
578 ol2 := relOverlap(j + 1)
579 if ol2 > ol1 {
580 return uint32(j + 1), ol2 > 0
581 }
582
583 return uint32(j), ol1 > 0
584}
585
586func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
587 if cm.fileName {
588 return false
589 }
590
591 // Check if this candidate came from a symbol matchTree
592 if cm.symbol {
593 return true
594 }
595
596 // Check if it overlaps with a symbol.
597 secs := p.docSections()
598 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
599 return ok
600}
601
602func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
603 if cm.fileName {
604 return DocumentSection{}, nil, false
605 }
606
607 secs := p.docSections()
608
609 secIdx, ok := cm.symbolIdx, cm.symbol
610 if !ok {
611 // Not from a symbol matchTree. Let's see if it overlaps with a symbol.
612 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
613 }
614 if !ok {
615 return DocumentSection{}, nil, false
616 }
617
618 sec := secs[secIdx]
619
620 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
621 // since some callsites do not need it stored, and that incurs an extra
622 // copy.
623 //
624 // 2024-01-08 we are refactoring this and the code path indicates this can
625 // fail, so callers need to handle nil symbol. However, it would be
626 // surprising that we have a matching section but not symbol data.
627 start := p.id.fileEndSymbol[p.idx]
628 si := p.id.symbols.data(start + secIdx)
629
630 return sec, si, true
631}
632
633// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
634// End such that it won't be out of range.
635func sectionSlice(data []byte, sec DocumentSection) []byte {
636 l := uint32(len(data))
637 if sec.Start >= l {
638 return nil
639 }
640 if sec.End > l {
641 sec.End = l
642 }
643 return data[sec.Start:sec.End]
644}
645
646// scoreSymbolKind boosts a match based on the combination of language, symbol
647// and kind. The language string comes from go-enry, the symbol and kind from
648// ctags.
649func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
650 var factor float64
651
652 // Generic ranking which will be overriden by language specific ranking
653 switch kind {
654 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
655 factor = 8
656 case ctags.Class:
657 factor = 10
658 case ctags.Struct:
659 factor = 9.5
660 case ctags.Enum:
661 factor = 9
662 case ctags.Interface:
663 factor = 8
664 case ctags.Function, ctags.Method:
665 factor = 7
666 case ctags.Field:
667 factor = 5.5
668 case ctags.Constant:
669 factor = 5
670 case ctags.Variable:
671 factor = 4
672 default:
673 // For all other kinds, assign a low score by default.
674 factor = 1
675 }
676
677 switch language {
678 case "Java", "java":
679 switch kind {
680 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
681 // to "classes" instead of "c". We have to cover both cases to support existing
682 // indexes.
683 case ctags.Class:
684 factor = 10
685 case ctags.Enum:
686 factor = 9
687 case ctags.Interface:
688 factor = 8
689 case ctags.Method:
690 factor = 7
691 case ctags.Field:
692 factor = 6
693 case ctags.EnumConstant:
694 factor = 5
695 }
696 case "Kotlin", "kotlin":
697 switch kind {
698 case ctags.Class:
699 factor = 10
700 case ctags.Interface:
701 factor = 9
702 case ctags.Method:
703 factor = 8
704 case ctags.TypeAlias:
705 factor = 7
706 case ctags.Constant:
707 factor = 6
708 case ctags.Variable:
709 factor = 5
710 }
711 case "Go", "go":
712 switch kind {
713 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
714 // for each case a description of the fields in ctags in the comment
715 case ctags.Type: // interface struct talias
716 factor = 9
717 case ctags.Interface: // interfaces
718 factor = 10
719 case ctags.Struct: // structs
720 factor = 9
721 case ctags.TypeAlias: // type aliases
722 factor = 9
723 case ctags.MethodSpec: // interface method specification
724 factor = 8.5
725 case ctags.Method, ctags.Function: // functions
726 factor = 8
727 case ctags.Field: // struct fields
728 factor = 7
729 case ctags.Constant: // constants
730 factor = 6
731 case ctags.Variable: // variables
732 factor = 5
733 }
734
735 // Boost exported go symbols. Same implementation as token.IsExported
736 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
737 factor += 0.5
738 }
739
740 if bytes.HasSuffix(filename, []byte("_test.go")) {
741 factor *= 0.8
742 }
743
744 // Could also rank on:
745 //
746 // - anonMember struct anonymous members
747 // - packageName name for specifying imported package
748 // - receiver receivers
749 // - package packages
750 // - type types
751 // - unknown unknown
752 case "C++", "c++":
753 switch kind {
754 case ctags.Class: // classes
755 factor = 10
756 case ctags.Enum: // enumeration names
757 factor = 9
758 case ctags.Function: // function definitions
759 factor = 8
760 case ctags.Struct: // structure names
761 factor = 7
762 case ctags.Union: // union names
763 factor = 6
764 case ctags.TypeAlias: // typedefs
765 factor = 5
766 case ctags.Field: // class, struct, and union members
767 factor = 4
768 case ctags.Variable: // varialbe definitions
769 factor = 3
770 }
771 // Could also rank on:
772 // NAME DESCRIPTION
773 // macro macro definitions
774 // enumerator enumerators (values inside an enumeration)
775 // header included header files
776 // namespace namespaces
777 // variable variable definitions
778 case "Scala", "scala":
779 switch kind {
780 case ctags.Class:
781 factor = 10
782 case ctags.Interface:
783 factor = 9
784 case ctags.Object:
785 factor = 8
786 case ctags.Function:
787 factor = 7
788 case ctags.Type:
789 factor = 6
790 case ctags.Variable:
791 factor = 5
792 case ctags.Package:
793 factor = 4
794 }
795 case "Python", "python":
796 switch kind {
797 case ctags.Class: // classes
798 factor = 10
799 case ctags.Function, ctags.Method: // function definitions
800 factor = 8
801 case ctags.Field: // class, struct, and union members
802 factor = 4
803 case ctags.Variable: // variable definitions
804 factor = 3
805 case ctags.Local: // local variables
806 factor = 2
807 }
808 // Could also rank on:
809 //
810 // - namespace name referring a module defined in other file
811 // - module modules
812 // - unknown name referring a class/variable/function/module defined in other module
813 // - parameter function parameters
814 case "Ruby", "ruby":
815 switch kind {
816 case ctags.Class:
817 factor = 10
818 case ctags.Method:
819 factor = 9
820 case ctags.MethodAlias:
821 factor = 8
822 case ctags.Module:
823 factor = 7
824 case ctags.SingletonMethod:
825 factor = 6
826 case ctags.Constant:
827 factor = 5
828 case ctags.Accessor:
829 factor = 4
830 case ctags.Library:
831 factor = 3
832 }
833 case "PHP", "php":
834 switch kind {
835 case ctags.Class:
836 factor = 10
837 case ctags.Interface:
838 factor = 9
839 case ctags.Function:
840 factor = 8
841 case ctags.Trait:
842 factor = 7
843 case ctags.Define:
844 factor = 6
845 case ctags.Namespace:
846 factor = 5
847 case ctags.MethodAlias:
848 factor = 4
849 case ctags.Variable:
850 factor = 3
851 case ctags.Local:
852 factor = 3
853 }
854 case "GraphQL", "graphql":
855 switch kind {
856 case ctags.Type:
857 factor = 10
858 }
859 case "Markdown", "markdown":
860 // Headers are good signal in docs, but do not rank as highly as code.
861 switch kind {
862 case ctags.Chapter: // #
863 factor = 4
864 case ctags.Section: // ##
865 factor = 3
866 case ctags.Subsection: // ###
867 factor = 2
868 }
869 }
870
871 return factor * scoreKindMatch
872}
873
874type matchScoreSlice []LineMatch
875
876func (m matchScoreSlice) Len() int { return len(m) }
877func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
878func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
879
880type chunkMatchScoreSlice []ChunkMatch
881
882func (m chunkMatchScoreSlice) Len() int { return len(m) }
883func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
884func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
885
886type fileMatchesByScore []FileMatch
887
888func (m fileMatchesByScore) Len() int { return len(m) }
889func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
890func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
891
892func sortMatchesByScore(ms []LineMatch) {
893 sort.Sort(matchScoreSlice(ms))
894}
895
896func sortChunkMatchesByScore(ms []ChunkMatch) {
897 sort.Sort(chunkMatchScoreSlice(ms))
898}
899
900// SortFiles sorts files matches in the order we want to present results to
901// users. The order depends on the match score, which includes both
902// query-dependent signals like word overlap, and file-only signals like the
903// file ranks (if file ranks are enabled).
904//
905// We don't only use the scores, we will also boost some results to present
906// files with novel extensions.
907func SortFiles(ms []FileMatch) {
908 sort.Sort(fileMatchesByScore(ms))
909
910 // Boost a file extension not in the top 3 to the third filematch.
911 boostNovelExtension(ms, 2, 0.9)
912}
913
914func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
915 if len(ms) <= boostOffset+1 {
916 return
917 }
918
919 top := ms[:boostOffset]
920 candidates := ms[boostOffset:]
921
922 // Don't bother boosting something which is significantly different to the
923 // result it replaces.
924 minScoreForNovelty := candidates[0].Score * minScoreRatio
925
926 // We want to look for an ext that isn't in the top exts
927 exts := make([]string, len(top))
928 for i := range top {
929 exts[i] = path.Ext(top[i].FileName)
930 }
931
932 for i := range candidates {
933 // Do not assume sorted due to boostNovelExtension being called on subsets
934 if candidates[i].Score < minScoreForNovelty {
935 continue
936 }
937
938 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
939 continue
940 }
941
942 // Found what we are looking for, now boost to front of candidates (which
943 // is ms[boostOffset])
944 for ; i > 0; i-- {
945 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
946 }
947 return
948 }
949}