fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "os"
22 "path"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt/ctags"
29 "golang.org/x/exp/slices"
30)
31
32var _ = log.Println
33
34// contentProvider is an abstraction to treat matches for names and
35// content with the same code.
36type contentProvider struct {
37 id *indexData
38 stats *Stats
39
40 // mutable
41 err error
42 idx uint32
43 _data []byte
44 _nl []uint32
45 _nlBuf []uint32
46 _sects []DocumentSection
47 _sectBuf []DocumentSection
48 fileSize uint32
49}
50
51// setDocument skips to the given document.
52func (p *contentProvider) setDocument(docID uint32) {
53 fileStart := p.id.boundaries[docID]
54
55 p.idx = docID
56 p.fileSize = p.id.boundaries[docID+1] - fileStart
57
58 p._nl = nil
59 p._sects = nil
60 p._data = nil
61}
62
63func (p *contentProvider) docSections() []DocumentSection {
64 if p._sects == nil {
65 var sz uint32
66 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
67 p.stats.ContentBytesLoaded += int64(sz)
68 p._sectBuf = p._sects
69 }
70 return p._sects
71}
72
73func (p *contentProvider) newlines() newlines {
74 if p._nl == nil {
75 var sz uint32
76 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
77 p._nlBuf = p._nl
78 p.stats.ContentBytesLoaded += int64(sz)
79 }
80 return newlines{locs: p._nl, fileSize: p.fileSize}
81}
82
83func (p *contentProvider) data(fileName bool) []byte {
84 if fileName {
85 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
86 }
87
88 if p._data == nil {
89 p._data, p.err = p.id.readContents(p.idx)
90 p.stats.FilesLoaded++
91 p.stats.ContentBytesLoaded += int64(len(p._data))
92 }
93 return p._data
94}
95
96// Find offset in bytes (relative to corpus start) for an offset in
97// runes (relative to document start). If filename is set, the corpus
98// is the set of filenames, with the document being the name itself.
99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
100 if p.id.metaData.PlainASCII {
101 return r
102 }
103
104 sample := p.id.runeOffsets
105 runeEnds := p.id.fileEndRunes
106 fileStartByte := p.id.boundaries[p.idx]
107 if filename {
108 sample = p.id.fileNameRuneOffsets
109 runeEnds = p.id.fileNameEndRunes
110 fileStartByte = p.id.fileNameIndex[p.idx]
111 }
112
113 absR := r
114 if p.idx > 0 {
115 absR += runeEnds[p.idx-1]
116 }
117
118 byteOff, left := sample.lookup(absR)
119
120 var data []byte
121
122 if filename {
123 data = p.id.fileNameContent[byteOff:]
124 } else {
125 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
126 if p.err != nil {
127 return 0
128 }
129 }
130 for left > 0 {
131 _, sz := utf8.DecodeRune(data)
132 byteOff += uint32(sz)
133 data = data[sz:]
134 left--
135 }
136
137 byteOff -= fileStartByte
138 return byteOff
139}
140
141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
142 var result []LineMatch
143 if ms[0].fileName {
144 // There is only "line" in a filename.
145 res := LineMatch{
146 Line: p.id.fileName(p.idx),
147 FileName: true,
148 }
149
150 for _, m := range ms {
151 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
152 LineOffset: int(m.byteOffset),
153 MatchLength: int(m.byteMatchSz),
154 Offset: m.byteOffset,
155 })
156
157 result = []LineMatch{res}
158 }
159 } else {
160 ms = breakMatchesOnNewlines(ms, p.data(false))
161 result = p.fillContentMatches(ms, numContextLines)
162 }
163
164 sects := p.docSections()
165 for i, m := range result {
166 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
167 }
168
169 return result
170}
171
172func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
173 var result []ChunkMatch
174 if ms[0].fileName {
175 // If the first match is a filename match, there will only be
176 // one match and the matched content will be the filename.
177
178 fileName := p.id.fileName(p.idx)
179 ranges := make([]Range, 0, len(ms))
180 for _, m := range ms {
181 ranges = append(ranges, Range{
182 Start: Location{
183 ByteOffset: m.byteOffset,
184 LineNumber: 1,
185 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
186 },
187 End: Location{
188 ByteOffset: m.byteOffset + m.byteMatchSz,
189 LineNumber: 1,
190 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
191 },
192 })
193 }
194
195 result = []ChunkMatch{{
196 Content: fileName,
197 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
198 Ranges: ranges,
199 FileName: true,
200 }}
201 } else {
202 result = p.fillContentChunkMatches(ms, numContextLines)
203 }
204
205 sects := p.docSections()
206 for i, m := range result {
207 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
208 }
209
210 return result
211}
212
213func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
214 var result []LineMatch
215 for len(ms) > 0 {
216 m := ms[0]
217 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
218
219 var lineCands []*candidateMatch
220
221 endMatch := m.byteOffset + m.byteMatchSz
222
223 for len(ms) > 0 {
224 m := ms[0]
225 if int(m.byteOffset) <= lineEnd {
226 endMatch = m.byteOffset + m.byteMatchSz
227 lineCands = append(lineCands, m)
228 ms = ms[1:]
229 } else {
230 break
231 }
232 }
233
234 if len(lineCands) == 0 {
235 log.Panicf(
236 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
237 p.id.fileName(p.idx), p.id.metaData,
238 num, lineStart, lineEnd,
239 m.byteOffset)
240 }
241
242 data := p.data(false)
243
244 // Due to merging matches, we may have a match that
245 // crosses a line boundary. Prevent confusion by
246 // taking lines until we pass the last match
247 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
248 next := bytes.IndexByte(data[lineEnd+1:], '\n')
249 if next == -1 {
250 lineEnd = len(data)
251 } else {
252 // TODO(hanwen): test that checks "+1" part here.
253 lineEnd += next + 1
254 }
255 }
256
257 finalMatch := LineMatch{
258 LineStart: lineStart,
259 LineEnd: lineEnd,
260 LineNumber: num,
261 }
262 finalMatch.Line = data[lineStart:lineEnd]
263
264 if numContextLines > 0 {
265 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
266 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
267 }
268
269 for _, m := range lineCands {
270 fragment := LineFragmentMatch{
271 Offset: m.byteOffset,
272 LineOffset: int(m.byteOffset) - lineStart,
273 MatchLength: int(m.byteMatchSz),
274 }
275 if m.symbol {
276 start := p.id.fileEndSymbol[p.idx]
277 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
278 if fragment.SymbolInfo != nil {
279 sec := p.docSections()[m.symbolIdx]
280 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
281 }
282 }
283
284 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
285 }
286 result = append(result, finalMatch)
287 }
288 return result
289}
290
291func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
292 newlines := p.newlines()
293 chunks := chunkCandidates(ms, newlines, numContextLines)
294 data := p.data(false)
295 chunkMatches := make([]ChunkMatch, 0, len(chunks))
296 for _, chunk := range chunks {
297 ranges := make([]Range, 0, len(chunk.candidates))
298 var symbolInfo []*Symbol
299 for i, cm := range chunk.candidates {
300 startOffset := cm.byteOffset
301 endOffset := cm.byteOffset + cm.byteMatchSz
302 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
303 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
304
305 ranges = append(ranges, Range{
306 Start: Location{
307 ByteOffset: startOffset,
308 LineNumber: uint32(startLine),
309 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
310 },
311 End: Location{
312 ByteOffset: endOffset,
313 LineNumber: uint32(endLine),
314 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
315 },
316 })
317
318 if cm.symbol {
319 if symbolInfo == nil {
320 symbolInfo = make([]*Symbol, len(chunk.candidates))
321 }
322 start := p.id.fileEndSymbol[p.idx]
323 si := p.id.symbols.data(start + cm.symbolIdx)
324 if si != nil {
325 sec := p.docSections()[cm.symbolIdx]
326 si.Sym = string(data[sec.Start:sec.End])
327 }
328 symbolInfo[i] = si
329 }
330 }
331
332 firstLineNumber := int(chunk.firstLine) - numContextLines
333 if firstLineNumber < 1 {
334 firstLineNumber = 1
335 }
336 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
337
338 chunkMatches = append(chunkMatches, ChunkMatch{
339 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
340 ContentStart: Location{
341 ByteOffset: firstLineStart,
342 LineNumber: uint32(firstLineNumber),
343 Column: 1,
344 },
345 FileName: false,
346 Ranges: ranges,
347 SymbolInfo: symbolInfo,
348 })
349 }
350 return chunkMatches
351}
352
353type candidateChunk struct {
354 firstLine uint32 // 1-based, inclusive
355 lastLine uint32 // 1-based, inclusive
356 minOffset uint32 // 0-based, inclusive
357 maxOffset uint32 // 0-based, exclusive
358 candidates []*candidateMatch
359}
360
361// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
362// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
363// it to overlap with an adjacent chunk.
364func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
365 var chunks []candidateChunk
366 for _, m := range ms {
367 startOffset := m.byteOffset
368 endOffset := m.byteOffset + m.byteMatchSz
369 firstLine, _, _ := newlines.atOffset(startOffset)
370 lastLine, _, _ := newlines.atOffset(endOffset)
371
372 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
373 // If a new chunk created with the current candidateMatch would
374 // overlap with the previous chunk, instead add the candidateMatch
375 // to the last chunk and extend end of the last chunk.
376 last := &chunks[len(chunks)-1]
377 last.candidates = append(last.candidates, m)
378 if last.maxOffset < endOffset {
379 last.lastLine = uint32(lastLine)
380 last.maxOffset = uint32(endOffset)
381 }
382 } else {
383 chunks = append(chunks, candidateChunk{
384 firstLine: uint32(firstLine),
385 lastLine: uint32(lastLine),
386 minOffset: startOffset,
387 maxOffset: endOffset,
388 candidates: []*candidateMatch{m},
389 })
390 }
391 }
392 return chunks
393}
394
395type newlines struct {
396 // locs is the sorted set of byte offsets of the newlines in the file
397 locs []uint32
398
399 // fileSize is just the number of bytes in the file. It is stored
400 // on this struct so we can safely know the length of the last line
401 // in the file since not all files end in a newline.
402 fileSize uint32
403}
404
405// atOffset returns the line containing the offset. If the offset lands on
406// the newline ending line M, we return M. The line is characterized
407// by its linenumber (base-1, byte index of line start, byte index of
408// line end). The line end is the index of a newline, or the filesize
409// (if matching the last line of the file.)
410func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
411 idx := sort.Search(len(nls.locs), func(n int) bool {
412 return nls.locs[n] >= offset
413 })
414
415 start, end := nls.lineBounds(idx + 1)
416 return idx + 1, int(start), int(end)
417}
418
419// lineBounds returns the byte offsets of the start and end of the 1-based
420// lineNumber. The end offset is exclusive and will not contain the line-ending
421// newline. If the line number is out of range of the lines in the file, start
422// and end will be clamped to [0,fileSize].
423func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
424 // nls.locs[0] + 1 is the start of the 2nd line of data.
425 startIdx := lineNumber - 2
426 endIdx := lineNumber - 1
427
428 if startIdx < 0 {
429 start = 0
430 } else if startIdx >= len(nls.locs) {
431 start = nls.fileSize
432 } else {
433 start = nls.locs[startIdx] + 1
434 }
435
436 if endIdx < 0 {
437 end = 0
438 } else if endIdx >= len(nls.locs) {
439 end = nls.fileSize
440 } else {
441 end = nls.locs[endIdx]
442 }
443
444 return start, end
445}
446
447// getLines returns a slice of data containing the lines [low, high).
448// low is 1-based and inclusive. high is 1-based and exclusive.
449func (nls newlines) getLines(data []byte, low, high int) []byte {
450 if low >= high {
451 return nil
452 }
453
454 lowStart, _ := nls.lineBounds(low)
455 _, highEnd := nls.lineBounds(high - 1)
456
457 return data[lowStart:highEnd]
458}
459
460const (
461 // Query-dependent scoring signals. All of these together are bounded at ~9000
462 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
463 scorePartialWordMatch = 50.0
464 scoreWordMatch = 500.0
465 scoreBase = 7000.0
466 scorePartialBase = 4000.0
467 scoreSymbol = 7000.0
468 scorePartialSymbol = 4000.0
469 scoreKindMatch = 100.0
470 scoreFactorAtomMatch = 400.0
471
472 // File-only scoring signals. For now these are also bounded ~9000 to give them
473 // equal weight with the query-dependent signals.
474 scoreFileRankFactor = 9000.0
475 scoreFileOrderFactor = 10.0
476 scoreRepoRankFactor = 20.0
477
478 // Used for ordering line and chunk matches within a file.
479 scoreLineOrderFactor = 1.0
480)
481
482// findSection checks whether a section defined by offset and size lies within
483// one of the sections in secs.
484func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
485 j := sort.Search(len(secs), func(i int) bool {
486 return secs[i].End >= off+sz
487 })
488
489 if j == len(secs) {
490 return 0, false
491 }
492
493 if secs[j].Start <= off && off+sz <= secs[j].End {
494 return j, true
495 }
496 return 0, false
497}
498
499func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
500 type debugScore struct {
501 score float64
502 what string
503 }
504
505 score := &debugScore{}
506 maxScore := &debugScore{}
507
508 addScore := func(what string, s float64) {
509 if s != 0 && debug {
510 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
511 }
512 score.score += s
513 }
514
515 data := p.data(m.FileName)
516 filename := p.data(true)
517
518 for i, r := range m.Ranges {
519 // calculate the start and end offset relative to the start of the content
520 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
521 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
522
523 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
524 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
525
526 score.score = 0
527 score.what = ""
528
529 if startBoundary && endBoundary {
530 addScore("WordMatch", scoreWordMatch)
531 } else if startBoundary || endBoundary {
532 addScore("PartialWordMatch", scorePartialWordMatch)
533 }
534
535 if m.FileName {
536 sep := bytes.LastIndexByte(m.Content, '/')
537 startMatch := relStartOffset == sep+1
538 endMatch := relEndOffset == len(m.Content)
539 if startMatch && endMatch {
540 addScore("Base", scoreBase)
541 } else if startMatch || endMatch {
542 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
543 } else if sep < relStartOffset {
544 addScore("InnerBase", scorePartialBase)
545 }
546 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
547 sec := secs[secIdx]
548 startMatch := sec.Start == uint32(r.Start.ByteOffset)
549 endMatch := sec.End == uint32(r.End.ByteOffset)
550 if startMatch && endMatch {
551 addScore("Symbol", scoreSymbol)
552 } else if startMatch || endMatch {
553 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
554 } else {
555 addScore("InnerSymbol", scorePartialSymbol)
556 }
557
558 var si *Symbol
559 if m.SymbolInfo != nil {
560 si = m.SymbolInfo[i]
561 }
562 if si == nil {
563 // for non-symbol queries, we need to hydrate in SymbolInfo.
564 start := p.id.fileEndSymbol[p.idx]
565 si = p.id.symbols.data(start + uint32(secIdx))
566 }
567 if si != nil {
568 symbolKind := ctags.ParseSymbolKind(si.Kind)
569 sym := sectionSlice(data, sec)
570 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
571 }
572 }
573
574 if score.score > maxScore.score {
575 maxScore.score = score.score
576 maxScore.what = score.what
577 }
578 }
579
580 if debug {
581 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
582 }
583
584 return maxScore.score, maxScore.what
585}
586
587func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
588 type debugScore struct {
589 score float64
590 what string
591 }
592
593 score := &debugScore{}
594 maxScore := &debugScore{}
595
596 addScore := func(what string, s float64) {
597 if s != 0 && debug {
598 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
599 }
600 score.score += s
601 }
602
603 data := p.data(m.FileName)
604 filename := p.data(true)
605
606 for _, f := range m.LineFragments {
607 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
608
609 end := int(f.LineOffset) + f.MatchLength
610 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
611
612 score.score = 0
613 score.what = ""
614
615 if startBoundary && endBoundary {
616 addScore("WordMatch", scoreWordMatch)
617 } else if startBoundary || endBoundary {
618 addScore("PartialWordMatch", scorePartialWordMatch)
619 }
620
621 if m.FileName {
622 sep := bytes.LastIndexByte(m.Line, '/')
623 startMatch := sep+1 == f.LineOffset
624 endMatch := len(m.Line) == f.LineOffset+f.MatchLength
625 if startMatch && endMatch {
626 addScore("Base", scoreBase)
627 } else if startMatch || endMatch {
628 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
629 } else if sep < f.LineOffset {
630 addScore("InnerBase", scorePartialBase)
631 }
632 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
633 sec := secs[secIdx]
634 startMatch := sec.Start == f.Offset
635 endMatch := sec.End == f.Offset+uint32(f.MatchLength)
636 if startMatch && endMatch {
637 addScore("Symbol", scoreSymbol)
638 } else if startMatch || endMatch {
639 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
640 } else {
641 addScore("InnerSymbol", scorePartialSymbol)
642 }
643
644 si := f.SymbolInfo
645 if si == nil {
646 // for non-symbol queries, we need to hydrate in SymbolInfo.
647 start := p.id.fileEndSymbol[p.idx]
648 si = p.id.symbols.data(start + uint32(secIdx))
649 }
650 if si != nil {
651 // the LineFragment may not be on a symbol, then si will be nil.
652 symbolKind := ctags.ParseSymbolKind(si.Kind)
653 sym := sectionSlice(data, sec)
654 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
655 }
656 }
657
658 if score.score > maxScore.score {
659 maxScore.score = score.score
660 maxScore.what = score.what
661 }
662 }
663
664 if debug {
665 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
666 }
667
668 return maxScore.score, maxScore.what
669}
670
671// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
672// End such that it won't be out of range.
673func sectionSlice(data []byte, sec DocumentSection) []byte {
674 l := uint32(len(data))
675 if sec.Start >= l {
676 return nil
677 }
678 if sec.End > l {
679 sec.End = l
680 }
681 return data[sec.Start:sec.End]
682}
683
684
685// scoreSymbolKind boosts a match based on the combination of language, symbol
686// and kind. The language string comes from go-enry, the symbol and kind from
687// ctags.
688func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
689 var factor float64
690
691 // Generic ranking which will be overriden by language specific ranking
692 switch kind {
693 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
694 factor = 8
695 case ctags.Class:
696 factor = 10
697 case ctags.Struct:
698 factor = 9.5
699 case ctags.Enum:
700 factor = 9
701 case ctags.Interface:
702 factor = 8
703 case ctags.Function, ctags.Method:
704 factor = 7
705 case ctags.Field:
706 factor = 5.5
707 case ctags.Constant:
708 factor = 5
709 case ctags.Variable:
710 factor = 4
711 default:
712 // For all other kinds, assign a low score by default.
713 factor = 1
714 }
715
716 switch language {
717 case "Java", "java":
718 switch kind {
719 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
720 // to "classes" instead of "c". We have to cover both cases to support existing
721 // indexes.
722 case ctags.Class:
723 factor = 10
724 case ctags.Enum:
725 factor = 9
726 case ctags.Interface:
727 factor = 8
728 case ctags.Method:
729 factor = 7
730 case ctags.Field:
731 factor = 6
732 case ctags.EnumConstant:
733 factor = 5
734 }
735 case "Kotlin", "kotlin":
736 switch kind {
737 case ctags.Class:
738 factor = 10
739 case ctags.Interface:
740 factor = 9
741 case ctags.Method:
742 factor = 8
743 case ctags.TypeAlias:
744 factor = 7
745 case ctags.Constant:
746 factor = 6
747 case ctags.Variable:
748 factor = 5
749 }
750 case "Go", "go":
751 switch kind {
752 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
753 // for each case a description of the fields in ctags in the comment
754 case ctags.Type: // interface struct talias
755 factor = 9
756 case ctags.Interface: // interfaces
757 factor = 10
758 case ctags.Struct: // structs
759 factor = 9
760 case ctags.TypeAlias: // type aliases
761 factor = 9
762 case ctags.MethodSpec: // interface method specification
763 factor = 8.5
764 case ctags.Method, ctags.Function: // functions
765 factor = 8
766 case ctags.Field: // struct fields
767 factor = 7
768 case ctags.Constant: // constants
769 factor = 6
770 case ctags.Variable: // variables
771 factor = 5
772 }
773
774 // Boost exported go symbols. Same implementation as token.IsExported
775 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
776 factor += 0.5
777 }
778
779 if bytes.HasSuffix(filename, []byte("_test.go")) {
780 factor *= 0.8
781 }
782
783 // Could also rank on:
784 //
785 // - anonMember struct anonymous members
786 // - packageName name for specifying imported package
787 // - receiver receivers
788 // - package packages
789 // - type types
790 // - unknown unknown
791 case "C++", "c++":
792 switch kind {
793 case ctags.Class: // classes
794 factor = 10
795 case ctags.Enum: // enumeration names
796 factor = 9
797 case ctags.Function: // function definitions
798 factor = 8
799 case ctags.Struct: // structure names
800 factor = 7
801 case ctags.Union: // union names
802 factor = 6
803 case ctags.TypeAlias: // typedefs
804 factor = 5
805 case ctags.Field: // class, struct, and union members
806 factor = 4
807 case ctags.Variable: // varialbe definitions
808 factor = 3
809 }
810 // Could also rank on:
811 // NAME DESCRIPTION
812 // macro macro definitions
813 // enumerator enumerators (values inside an enumeration)
814 // header included header files
815 // namespace namespaces
816 // variable variable definitions
817 case "Scala", "scala":
818 switch kind {
819 case ctags.Class:
820 factor = 10
821 case ctags.Interface:
822 factor = 9
823 case ctags.Object:
824 factor = 8
825 case ctags.Function:
826 factor = 7
827 case ctags.Type:
828 factor = 6
829 case ctags.Variable:
830 factor = 5
831 case ctags.Package:
832 factor = 4
833 }
834 case "Python", "python":
835 switch kind {
836 case ctags.Class: // classes
837 factor = 10
838 case ctags.Function, ctags.Method: // function definitions
839 factor = 8
840 case ctags.Field: // class, struct, and union members
841 factor = 4
842 case ctags.Variable: // variable definitions
843 factor = 3
844 case ctags.Local: // local variables
845 factor = 2
846 }
847 // Could also rank on:
848 //
849 // - namespace name referring a module defined in other file
850 // - module modules
851 // - unknown name referring a class/variable/function/module defined in other module
852 // - parameter function parameters
853 case "Ruby", "ruby":
854 switch kind {
855 case ctags.Class:
856 factor = 10
857 case ctags.Method:
858 factor = 9
859 case ctags.MethodAlias:
860 factor = 8
861 case ctags.Module:
862 factor = 7
863 case ctags.SingletonMethod:
864 factor = 6
865 case ctags.Constant:
866 factor = 5
867 case ctags.Accessor:
868 factor = 4
869 case ctags.Library:
870 factor = 3
871 }
872 case "PHP", "php":
873 switch kind {
874 case ctags.Class:
875 factor = 10
876 case ctags.Interface:
877 factor = 9
878 case ctags.Function:
879 factor = 8
880 case ctags.Trait:
881 factor = 7
882 case ctags.Define:
883 factor = 6
884 case ctags.Namespace:
885 factor = 5
886 case ctags.MethodAlias:
887 factor = 4
888 case ctags.Variable:
889 factor = 3
890 case ctags.Local:
891 factor = 3
892 }
893 case "GraphQL", "graphql":
894 switch kind {
895 case ctags.Type:
896 factor = 10
897 }
898 case "Markdown", "markdown":
899 // Headers are good signal in docs, but do not rank as highly as code.
900 switch kind {
901 case ctags.Chapter: // #
902 factor = 4
903 case ctags.Section: // ##
904 factor = 3
905 case ctags.Subsection: // ###
906 factor = 2
907 }
908 }
909
910 return factor * scoreKindMatch
911}
912
913type matchScoreSlice []LineMatch
914
915func (m matchScoreSlice) Len() int { return len(m) }
916func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
917func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
918
919type chunkMatchScoreSlice []ChunkMatch
920
921func (m chunkMatchScoreSlice) Len() int { return len(m) }
922func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
923func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
924
925type fileMatchesByScore []FileMatch
926
927func (m fileMatchesByScore) Len() int { return len(m) }
928func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
929func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
930
931func sortMatchesByScore(ms []LineMatch) {
932 sort.Sort(matchScoreSlice(ms))
933}
934
935func sortChunkMatchesByScore(ms []ChunkMatch) {
936 sort.Sort(chunkMatchScoreSlice(ms))
937}
938
939var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
940
941// SortFiles sorts files matches in the order we want to present results to
942// users. The order depends on the match score, which includes both
943// query-dependent signals like word overlap, and file-only signals like the
944// file ranks (if file ranks are enabled).
945//
946// We don't only use the scores, we will also boost some results to present
947// files with novel extensions.
948func SortFiles(ms []FileMatch) {
949 sort.Sort(fileMatchesByScore(ms))
950
951 if doNovelty {
952 // Experimentally boost something into the third filematch
953 boostNovelExtension(ms, 2, 0.9)
954 }
955}
956
957func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
958 if len(ms) <= boostOffset+1 {
959 return
960 }
961
962 top := ms[:boostOffset]
963 candidates := ms[boostOffset:]
964
965 // Don't bother boosting something which is significantly different to the
966 // result it replaces.
967 minScoreForNovelty := candidates[0].Score * minScoreRatio
968
969 // We want to look for an ext that isn't in the top exts
970 exts := make([]string, len(top))
971 for i := range top {
972 exts[i] = path.Ext(top[i].FileName)
973 }
974
975 for i := range candidates {
976 // Do not assume sorted due to boostNovelExtension being called on subsets
977 if candidates[i].Score < minScoreForNovelty {
978 continue
979 }
980
981 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
982 continue
983 }
984
985 // Found what we are looking for, now boost to front of candidates (which
986 // is ms[boostOffset])
987 for ; i > 0; i-- {
988 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
989 }
990 return
991 }
992}