fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "os"
22 "path"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "golang.org/x/exp/slices"
29)
30
31var _ = log.Println
32
33// contentProvider is an abstraction to treat matches for names and
34// content with the same code.
35type contentProvider struct {
36 id *indexData
37 stats *Stats
38
39 // mutable
40 err error
41 idx uint32
42 _data []byte
43 _nl []uint32
44 _nlBuf []uint32
45 _sects []DocumentSection
46 _sectBuf []DocumentSection
47 fileSize uint32
48}
49
50// setDocument skips to the given document.
51func (p *contentProvider) setDocument(docID uint32) {
52 fileStart := p.id.boundaries[docID]
53
54 p.idx = docID
55 p.fileSize = p.id.boundaries[docID+1] - fileStart
56
57 p._nl = nil
58 p._sects = nil
59 p._data = nil
60}
61
62func (p *contentProvider) docSections() []DocumentSection {
63 if p._sects == nil {
64 var sz uint32
65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
66 p.stats.ContentBytesLoaded += int64(sz)
67 p._sectBuf = p._sects
68 }
69 return p._sects
70}
71
72func (p *contentProvider) newlines() newlines {
73 if p._nl == nil {
74 var sz uint32
75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
76 p._nlBuf = p._nl
77 p.stats.ContentBytesLoaded += int64(sz)
78 }
79 return newlines{locs: p._nl, fileSize: p.fileSize}
80}
81
82func (p *contentProvider) data(fileName bool) []byte {
83 if fileName {
84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
85 }
86
87 if p._data == nil {
88 p._data, p.err = p.id.readContents(p.idx)
89 p.stats.FilesLoaded++
90 p.stats.ContentBytesLoaded += int64(len(p._data))
91 }
92 return p._data
93}
94
95// Find offset in bytes (relative to corpus start) for an offset in
96// runes (relative to document start). If filename is set, the corpus
97// is the set of filenames, with the document being the name itself.
98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
99 if p.id.metaData.PlainASCII {
100 return r
101 }
102
103 sample := p.id.runeOffsets
104 runeEnds := p.id.fileEndRunes
105 fileStartByte := p.id.boundaries[p.idx]
106 if filename {
107 sample = p.id.fileNameRuneOffsets
108 runeEnds = p.id.fileNameEndRunes
109 fileStartByte = p.id.fileNameIndex[p.idx]
110 }
111
112 absR := r
113 if p.idx > 0 {
114 absR += runeEnds[p.idx-1]
115 }
116
117 byteOff, left := sample.lookup(absR)
118
119 var data []byte
120
121 if filename {
122 data = p.id.fileNameContent[byteOff:]
123 } else {
124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125 if p.err != nil {
126 return 0
127 }
128 }
129 for left > 0 {
130 _, sz := utf8.DecodeRune(data)
131 byteOff += uint32(sz)
132 data = data[sz:]
133 left--
134 }
135
136 byteOff -= fileStartByte
137 return byteOff
138}
139
140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
141 var result []LineMatch
142 if ms[0].fileName {
143 // There is only "line" in a filename.
144 res := LineMatch{
145 Line: p.id.fileName(p.idx),
146 FileName: true,
147 }
148
149 for _, m := range ms {
150 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
151 LineOffset: int(m.byteOffset),
152 MatchLength: int(m.byteMatchSz),
153 Offset: m.byteOffset,
154 })
155
156 result = []LineMatch{res}
157 }
158 } else {
159 ms = breakMatchesOnNewlines(ms, p.data(false))
160 result = p.fillContentMatches(ms, numContextLines)
161 }
162
163 sects := p.docSections()
164 for i, m := range result {
165 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
166 }
167
168 return result
169}
170
171func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
172 var result []ChunkMatch
173 if ms[0].fileName {
174 // If the first match is a filename match, there will only be
175 // one match and the matched content will be the filename.
176
177 fileName := p.id.fileName(p.idx)
178 ranges := make([]Range, 0, len(ms))
179 for _, m := range ms {
180 ranges = append(ranges, Range{
181 Start: Location{
182 ByteOffset: m.byteOffset,
183 LineNumber: 1,
184 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
185 },
186 End: Location{
187 ByteOffset: m.byteOffset + m.byteMatchSz,
188 LineNumber: 1,
189 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
190 },
191 })
192 }
193
194 result = []ChunkMatch{{
195 Content: fileName,
196 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
197 Ranges: ranges,
198 FileName: true,
199 }}
200 } else {
201 result = p.fillContentChunkMatches(ms, numContextLines)
202 }
203
204 sects := p.docSections()
205 for i, m := range result {
206 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
207 }
208
209 return result
210}
211
212func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
213 var result []LineMatch
214 for len(ms) > 0 {
215 m := ms[0]
216 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
217
218 var lineCands []*candidateMatch
219
220 endMatch := m.byteOffset + m.byteMatchSz
221
222 for len(ms) > 0 {
223 m := ms[0]
224 if int(m.byteOffset) <= lineEnd {
225 endMatch = m.byteOffset + m.byteMatchSz
226 lineCands = append(lineCands, m)
227 ms = ms[1:]
228 } else {
229 break
230 }
231 }
232
233 if len(lineCands) == 0 {
234 log.Panicf(
235 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
236 p.id.fileName(p.idx), p.id.metaData,
237 num, lineStart, lineEnd,
238 m.byteOffset)
239 }
240
241 data := p.data(false)
242
243 // Due to merging matches, we may have a match that
244 // crosses a line boundary. Prevent confusion by
245 // taking lines until we pass the last match
246 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
247 next := bytes.IndexByte(data[lineEnd+1:], '\n')
248 if next == -1 {
249 lineEnd = len(data)
250 } else {
251 // TODO(hanwen): test that checks "+1" part here.
252 lineEnd += next + 1
253 }
254 }
255
256 finalMatch := LineMatch{
257 LineStart: lineStart,
258 LineEnd: lineEnd,
259 LineNumber: num,
260 }
261 finalMatch.Line = data[lineStart:lineEnd]
262
263 if numContextLines > 0 {
264 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
265 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
266 }
267
268 for _, m := range lineCands {
269 fragment := LineFragmentMatch{
270 Offset: m.byteOffset,
271 LineOffset: int(m.byteOffset) - lineStart,
272 MatchLength: int(m.byteMatchSz),
273 }
274 if m.symbol {
275 start := p.id.fileEndSymbol[p.idx]
276 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
277 if fragment.SymbolInfo != nil {
278 sec := p.docSections()[m.symbolIdx]
279 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
280 }
281 }
282
283 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
284 }
285 result = append(result, finalMatch)
286 }
287 return result
288}
289
290func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
291 newlines := p.newlines()
292 chunks := chunkCandidates(ms, newlines, numContextLines)
293 data := p.data(false)
294 chunkMatches := make([]ChunkMatch, 0, len(chunks))
295 for _, chunk := range chunks {
296 ranges := make([]Range, 0, len(chunk.candidates))
297 var symbolInfo []*Symbol
298 for i, cm := range chunk.candidates {
299 startOffset := cm.byteOffset
300 endOffset := cm.byteOffset + cm.byteMatchSz
301 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
302 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
303
304 ranges = append(ranges, Range{
305 Start: Location{
306 ByteOffset: startOffset,
307 LineNumber: uint32(startLine),
308 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
309 },
310 End: Location{
311 ByteOffset: endOffset,
312 LineNumber: uint32(endLine),
313 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
314 },
315 })
316
317 if cm.symbol {
318 if symbolInfo == nil {
319 symbolInfo = make([]*Symbol, len(chunk.candidates))
320 }
321 start := p.id.fileEndSymbol[p.idx]
322 si := p.id.symbols.data(start + cm.symbolIdx)
323 if si != nil {
324 sec := p.docSections()[cm.symbolIdx]
325 si.Sym = string(data[sec.Start:sec.End])
326 }
327 symbolInfo[i] = si
328 }
329 }
330
331 firstLineNumber := int(chunk.firstLine) - numContextLines
332 if firstLineNumber < 1 {
333 firstLineNumber = 1
334 }
335 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
336
337 chunkMatches = append(chunkMatches, ChunkMatch{
338 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
339 ContentStart: Location{
340 ByteOffset: firstLineStart,
341 LineNumber: uint32(firstLineNumber),
342 Column: 1,
343 },
344 FileName: false,
345 Ranges: ranges,
346 SymbolInfo: symbolInfo,
347 })
348 }
349 return chunkMatches
350}
351
352type candidateChunk struct {
353 firstLine uint32 // 1-based, inclusive
354 lastLine uint32 // 1-based, inclusive
355 minOffset uint32 // 0-based, inclusive
356 maxOffset uint32 // 0-based, exclusive
357 candidates []*candidateMatch
358}
359
360// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
361// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
362// it to overlap with an adjacent chunk.
363func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
364 var chunks []candidateChunk
365 for _, m := range ms {
366 startOffset := m.byteOffset
367 endOffset := m.byteOffset + m.byteMatchSz
368 firstLine, _, _ := newlines.atOffset(startOffset)
369 lastLine, _, _ := newlines.atOffset(endOffset)
370
371 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
372 // If a new chunk created with the current candidateMatch would
373 // overlap with the previous chunk, instead add the candidateMatch
374 // to the last chunk and extend end of the last chunk.
375 last := &chunks[len(chunks)-1]
376 last.candidates = append(last.candidates, m)
377 if last.maxOffset < endOffset {
378 last.lastLine = uint32(lastLine)
379 last.maxOffset = uint32(endOffset)
380 }
381 } else {
382 chunks = append(chunks, candidateChunk{
383 firstLine: uint32(firstLine),
384 lastLine: uint32(lastLine),
385 minOffset: startOffset,
386 maxOffset: endOffset,
387 candidates: []*candidateMatch{m},
388 })
389 }
390 }
391 return chunks
392}
393
394type newlines struct {
395 // locs is the sorted set of byte offsets of the newlines in the file
396 locs []uint32
397
398 // fileSize is just the number of bytes in the file. It is stored
399 // on this struct so we can safely know the length of the last line
400 // in the file since not all files end in a newline.
401 fileSize uint32
402}
403
404// atOffset returns the line containing the offset. If the offset lands on
405// the newline ending line M, we return M. The line is characterized
406// by its linenumber (base-1, byte index of line start, byte index of
407// line end). The line end is the index of a newline, or the filesize
408// (if matching the last line of the file.)
409func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
410 idx := sort.Search(len(nls.locs), func(n int) bool {
411 return nls.locs[n] >= offset
412 })
413
414 start, end := nls.lineBounds(idx + 1)
415 return idx + 1, int(start), int(end)
416}
417
418// lineBounds returns the byte offsets of the start and end of the 1-based
419// lineNumber. The end offset is exclusive and will not contain the line-ending
420// newline. If the line number is out of range of the lines in the file, start
421// and end will be clamped to [0,fileSize].
422func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
423 // nls.locs[0] + 1 is the start of the 2nd line of data.
424 startIdx := lineNumber - 2
425 endIdx := lineNumber - 1
426
427 if startIdx < 0 {
428 start = 0
429 } else if startIdx >= len(nls.locs) {
430 start = nls.fileSize
431 } else {
432 start = nls.locs[startIdx] + 1
433 }
434
435 if endIdx < 0 {
436 end = 0
437 } else if endIdx >= len(nls.locs) {
438 end = nls.fileSize
439 } else {
440 end = nls.locs[endIdx]
441 }
442
443 return start, end
444}
445
446// getLines returns a slice of data containing the lines [low, high).
447// low is 1-based and inclusive. high is 1-based and exclusive.
448func (nls newlines) getLines(data []byte, low, high int) []byte {
449 if low >= high {
450 return nil
451 }
452
453 lowStart, _ := nls.lineBounds(low)
454 _, highEnd := nls.lineBounds(high - 1)
455
456 return data[lowStart:highEnd]
457}
458
459const (
460 // Query-dependent scoring signals. All of these together are bounded at ~9000
461 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
462 scorePartialWordMatch = 50.0
463 scoreWordMatch = 500.0
464 scoreBase = 7000.0
465 scorePartialBase = 4000.0
466 scoreSymbol = 7000.0
467 scorePartialSymbol = 4000.0
468 scoreKindMatch = 100.0
469 scoreFactorAtomMatch = 400.0
470
471 // File-only scoring signals. For now these are also bounded ~9000 to give them
472 // equal weight with the query-dependent signals.
473 scoreFileRankFactor = 9000.0
474 scoreFileOrderFactor = 10.0
475 scoreRepoRankFactor = 20.0
476
477 // Used for ordering line and chunk matches within a file.
478 scoreLineOrderFactor = 1.0
479)
480
481// findSection checks whether a section defined by offset and size lies within
482// one of the sections in secs.
483func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
484 j := sort.Search(len(secs), func(i int) bool {
485 return secs[i].End >= off+sz
486 })
487
488 if j == len(secs) {
489 return 0, false
490 }
491
492 if secs[j].Start <= off && off+sz <= secs[j].End {
493 return j, true
494 }
495 return 0, false
496}
497
498func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
499 type debugScore struct {
500 score float64
501 what string
502 }
503
504 score := &debugScore{}
505 maxScore := &debugScore{}
506
507 addScore := func(what string, s float64) {
508 if s != 0 && debug {
509 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
510 }
511 score.score += s
512 }
513
514 data := p.data(m.FileName)
515 filename := p.data(true)
516
517 for i, r := range m.Ranges {
518 // calculate the start and end offset relative to the start of the content
519 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
520 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
521
522 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
523 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
524
525 score.score = 0
526 score.what = ""
527
528 if startBoundary && endBoundary {
529 addScore("WordMatch", scoreWordMatch)
530 } else if startBoundary || endBoundary {
531 addScore("PartialWordMatch", scorePartialWordMatch)
532 }
533
534 if m.FileName {
535 sep := bytes.LastIndexByte(m.Content, '/')
536 startMatch := relStartOffset == sep+1
537 endMatch := relEndOffset == len(m.Content)
538 if startMatch && endMatch {
539 addScore("Base", scoreBase)
540 } else if startMatch || endMatch {
541 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
542 } else if sep < relStartOffset {
543 addScore("InnerBase", scorePartialBase)
544 }
545 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
546 sec := secs[secIdx]
547 startMatch := sec.Start == uint32(r.Start.ByteOffset)
548 endMatch := sec.End == uint32(r.End.ByteOffset)
549 if startMatch && endMatch {
550 addScore("Symbol", scoreSymbol)
551 } else if startMatch || endMatch {
552 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
553 } else {
554 addScore("InnerSymbol", scorePartialSymbol)
555 }
556
557 var si *Symbol
558 if m.SymbolInfo != nil {
559 si = m.SymbolInfo[i]
560 }
561 if si == nil {
562 // for non-symbol queries, we need to hydrate in SymbolInfo.
563 start := p.id.fileEndSymbol[p.idx]
564 si = p.id.symbols.data(start + uint32(secIdx))
565 }
566 if si != nil {
567 sym := sectionSlice(data, sec)
568 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind))
569 }
570 }
571
572 if score.score > maxScore.score {
573 maxScore.score = score.score
574 maxScore.what = score.what
575 }
576 }
577
578 if debug {
579 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
580 }
581
582 return maxScore.score, maxScore.what
583}
584
585func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
586 type debugScore struct {
587 score float64
588 what string
589 }
590
591 score := &debugScore{}
592 maxScore := &debugScore{}
593
594 addScore := func(what string, s float64) {
595 if s != 0 && debug {
596 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
597 }
598 score.score += s
599 }
600
601 data := p.data(m.FileName)
602 filename := p.data(true)
603
604 for _, f := range m.LineFragments {
605 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
606
607 end := int(f.LineOffset) + f.MatchLength
608 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
609
610 score.score = 0
611 score.what = ""
612
613 if startBoundary && endBoundary {
614 addScore("WordMatch", scoreWordMatch)
615 } else if startBoundary || endBoundary {
616 addScore("PartialWordMatch", scorePartialWordMatch)
617 }
618
619 if m.FileName {
620 sep := bytes.LastIndexByte(m.Line, '/')
621 startMatch := sep+1 == f.LineOffset
622 endMatch := len(m.Line) == f.LineOffset+f.MatchLength
623 if startMatch && endMatch {
624 addScore("Base", scoreBase)
625 } else if startMatch || endMatch {
626 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
627 } else if sep < f.LineOffset {
628 addScore("InnerBase", scorePartialBase)
629 }
630 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
631 sec := secs[secIdx]
632 startMatch := sec.Start == f.Offset
633 endMatch := sec.End == f.Offset+uint32(f.MatchLength)
634 if startMatch && endMatch {
635 addScore("Symbol", scoreSymbol)
636 } else if startMatch || endMatch {
637 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
638 } else {
639 addScore("InnerSymbol", scorePartialSymbol)
640 }
641
642 si := f.SymbolInfo
643 if si == nil {
644 // for non-symbol queries, we need to hydrate in SymbolInfo.
645 start := p.id.fileEndSymbol[p.idx]
646 si = p.id.symbols.data(start + uint32(secIdx))
647 }
648 if si != nil {
649 // the LineFragment may not be on a symbol, then si will be nil.
650 sym := sectionSlice(data, sec)
651 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind))
652 }
653 }
654
655 if score.score > maxScore.score {
656 maxScore.score = score.score
657 maxScore.what = score.what
658 }
659 }
660
661 if debug {
662 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
663 }
664
665 return maxScore.score, maxScore.what
666}
667
668// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
669// End such that it won't be out of range.
670func sectionSlice(data []byte, sec DocumentSection) []byte {
671 l := uint32(len(data))
672 if sec.Start >= l {
673 return nil
674 }
675 if sec.End > l {
676 sec.End = l
677 }
678 return data[sec.Start:sec.End]
679}
680
681// scoreSymbolKind boosts a match based on the combination of language, symbol
682// and kind. The language string comes from go-enry, the symbol and kind from
683// ctags.
684func scoreSymbolKind(language string, filename []byte, sym []byte, kind string) float64 {
685 var factor float64
686
687 // Generic ranking which will be overriden by language specific ranking
688 switch kind {
689 case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
690 factor = 8
691 case "class":
692 factor = 10
693 case "struct":
694 factor = 9.5
695 case "enum":
696 factor = 9
697 case "interface":
698 factor = 8
699 case "function", "func", "method":
700 factor = 7
701 case "member", "field":
702 factor = 5.5
703 case "constant", "const":
704 factor = 5
705 case "var", "variable":
706 factor = 4
707
708 default:
709 // No idea what it is, but its something regarded as a symbol
710 factor = 1
711 }
712
713 // Refer to universal-ctags --list-kinds-full=<language> to learn about which
714 // kinds are detected for which language.
715 //
716 // Note that go-ctags uses universal-ctags's interactive mode and thus returns
717 // the full name for "kind" and not the one-letter abbreviation.
718 switch language {
719 case "Java", "java":
720 switch kind {
721 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
722 // to "classes" instead of "c". We have to cover both cases to support existing
723 // indexes.
724 case "class", "classes":
725 factor = 10
726 case "enum":
727 factor = 9
728 case "interface":
729 factor = 8
730 case "method":
731 factor = 7
732 case "field":
733 factor = 6
734 case "enumConstant":
735 factor = 5
736 }
737 case "Kotlin", "kotlin":
738 switch kind {
739 case "class":
740 factor = 10
741 case "interface":
742 factor = 9
743 case "method":
744 factor = 8
745 case "typealias":
746 factor = 7
747 case "constant":
748 factor = 6
749 case "variable":
750 factor = 5
751 }
752 case "Go", "go":
753 switch kind {
754 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
755 // for each case a description of the fields in ctags in the comment
756 case "type": // interface struct talias
757 factor = 9
758 case "method", "function": // methodSpec func
759 factor = 8
760 case "variable": // var member
761 factor = 7
762 case "constant": // const
763 factor = 6
764
765 case "interface": // interfaces
766 factor = 10
767 case "struct": // structs
768 factor = 9
769 case "talias": // type aliases
770 factor = 9
771 case "methodSpec": // interface method specification
772 factor = 8.5
773 case "func": // functions
774 factor = 8
775 case "member": // struct members
776 factor = 7
777 case "const": // constants
778 factor = 6
779 case "var": // variables
780 factor = 5
781 }
782
783 // Boost exported go symbols. Same implementation as token.IsExported
784 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
785 factor += 0.5
786 }
787
788 if bytes.HasSuffix(filename, []byte("_test.go")) {
789 factor *= 0.8
790 }
791
792 // Could also rank on:
793 //
794 // - anonMember struct anonymous members
795 // - packageName name for specifying imported package
796 // - receiver receivers
797 // - package packages
798 // - type types
799 // - unknown unknown
800 case "C++", "c++":
801 switch kind {
802 case "class": // classes
803 factor = 10
804 case "enum": // enumeration names
805 factor = 9
806 case "function": // function definitions
807 factor = 8
808 case "struct": // structure names
809 factor = 7
810 case "union": // union names
811 factor = 6
812 case "typdef": // typedefs
813 factor = 5
814 case "member": // class, struct, and union members
815 factor = 4
816 case "variable": // varialbe definitions
817 factor = 3
818 }
819 // Could also rank on:
820 // NAME DESCRIPTION
821 // macro macro definitions
822 // enumerator enumerators (values inside an enumeration)
823 // header included header files
824 // namespace namespaces
825 // variable variable definitions
826 case "Scala", "scala":
827 switch kind {
828 case "class":
829 factor = 10
830 case "interface":
831 factor = 9
832 case "object":
833 factor = 8
834 case "method":
835 factor = 7
836 case "type":
837 factor = 6
838 case "variable":
839 factor = 5
840 case "package":
841 factor = 4
842 }
843 case "Python", "python":
844 switch kind {
845 case "class": // classes
846 factor = 10
847 case "function": // function definitions
848 factor = 8
849 case "member": // class, struct, and union members
850 factor = 4
851 case "variable": // variable definitions
852 factor = 3
853 case "local": // local variables
854 factor = 2
855 }
856 // Could also rank on:
857 //
858 // - namespace name referring a module defined in other file
859 // - module modules
860 // - unknown name referring a class/variable/function/module defined in other module
861 // - parameter function parameters
862 case "Ruby", "ruby":
863 switch kind {
864 case "class":
865 factor = 10
866 case "method":
867 factor = 9
868 case "alias":
869 factor = 8
870 case "module":
871 factor = 7
872 case "singletonMethod":
873 factor = 6
874 case "constant":
875 factor = 5
876 case "accessor":
877 factor = 4
878 case "library":
879 factor = 3
880 }
881 case "PHP", "php":
882 switch kind {
883 case "class":
884 factor = 10
885 case "interface":
886 factor = 9
887 case "function":
888 factor = 8
889 case "trait":
890 factor = 7
891 case "define":
892 factor = 6
893 case "namespace":
894 factor = 5
895 case "alias":
896 factor = 4
897 case "variable":
898 factor = 3
899 case "local":
900 factor = 3
901 }
902 case "GraphQL", "graphql":
903 switch kind {
904 case "type":
905 factor = 10
906 }
907 case "Markdown", "markdown":
908 // Headers are good signal in docs, but do not rank as highly as code.
909 switch kind {
910 case "chapter": // #
911 factor = 4
912 case "section": // ##
913 factor = 3
914 case "subsection": // ###
915 factor = 2
916 }
917 }
918
919 return factor * scoreKindMatch
920}
921
922type matchScoreSlice []LineMatch
923
924func (m matchScoreSlice) Len() int { return len(m) }
925func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
926func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
927
928type chunkMatchScoreSlice []ChunkMatch
929
930func (m chunkMatchScoreSlice) Len() int { return len(m) }
931func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
932func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
933
934type fileMatchesByScore []FileMatch
935
936func (m fileMatchesByScore) Len() int { return len(m) }
937func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
938func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
939
940func sortMatchesByScore(ms []LineMatch) {
941 sort.Sort(matchScoreSlice(ms))
942}
943
944func sortChunkMatchesByScore(ms []ChunkMatch) {
945 sort.Sort(chunkMatchScoreSlice(ms))
946}
947
948var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
949
950// SortFiles sorts files matches in the order we want to present results to
951// users. The order depends on the match score, which includes both
952// query-dependent signals like word overlap, and file-only signals like the
953// file ranks (if file ranks are enabled).
954//
955// We don't only use the scores, we will also boost some results to present
956// files with novel extensions.
957func SortFiles(ms []FileMatch) {
958 sort.Sort(fileMatchesByScore(ms))
959
960 if doNovelty {
961 // Experimentally boost something into the third filematch
962 boostNovelExtension(ms, 2, 0.9)
963 }
964}
965
966func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
967 if len(ms) <= boostOffset+1 {
968 return
969 }
970
971 top := ms[:boostOffset]
972 candidates := ms[boostOffset:]
973
974 // Don't bother boosting something which is significantly different to the
975 // result it replaces.
976 minScoreForNovelty := candidates[0].Score * minScoreRatio
977
978 // We want to look for an ext that isn't in the top exts
979 exts := make([]string, len(top))
980 for i := range top {
981 exts[i] = path.Ext(top[i].FileName)
982 }
983
984 for i := range candidates {
985 // Do not assume sorted due to boostNovelExtension being called on subsets
986 if candidates[i].Score < minScoreForNovelty {
987 continue
988 }
989
990 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
991 continue
992 }
993
994 // Found what we are looking for, now boost to front of candidates (which
995 // is ms[boostOffset])
996 for ; i > 0; i-- {
997 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
998 }
999 return
1000 }
1001}