fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "os"
22 "path"
23 "sort"
24 "strings"
25 "unicode/utf8"
26
27 "golang.org/x/exp/slices"
28)
29
30var _ = log.Println
31
32// contentProvider is an abstraction to treat matches for names and
33// content with the same code.
34type contentProvider struct {
35 id *indexData
36 stats *Stats
37
38 // mutable
39 err error
40 idx uint32
41 _data []byte
42 _nl []uint32
43 _nlBuf []uint32
44 _sects []DocumentSection
45 _sectBuf []DocumentSection
46 fileSize uint32
47}
48
49// setDocument skips to the given document.
50func (p *contentProvider) setDocument(docID uint32) {
51 fileStart := p.id.boundaries[docID]
52
53 p.idx = docID
54 p.fileSize = p.id.boundaries[docID+1] - fileStart
55
56 p._nl = nil
57 p._sects = nil
58 p._data = nil
59}
60
61func (p *contentProvider) docSections() []DocumentSection {
62 if p._sects == nil {
63 var sz uint32
64 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
65 p.stats.ContentBytesLoaded += int64(sz)
66 p._sectBuf = p._sects
67 }
68 return p._sects
69}
70
71func (p *contentProvider) newlines() newlines {
72 if p._nl == nil {
73 var sz uint32
74 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
75 p._nlBuf = p._nl
76 p.stats.ContentBytesLoaded += int64(sz)
77 }
78 return newlines{locs: p._nl, fileSize: p.fileSize}
79}
80
81func (p *contentProvider) data(fileName bool) []byte {
82 if fileName {
83 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
84 }
85
86 if p._data == nil {
87 p._data, p.err = p.id.readContents(p.idx)
88 p.stats.FilesLoaded++
89 p.stats.ContentBytesLoaded += int64(len(p._data))
90 }
91 return p._data
92}
93
94// Find offset in bytes (relative to corpus start) for an offset in
95// runes (relative to document start). If filename is set, the corpus
96// is the set of filenames, with the document being the name itself.
97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
98 if p.id.metaData.PlainASCII {
99 return r
100 }
101
102 sample := p.id.runeOffsets
103 runeEnds := p.id.fileEndRunes
104 fileStartByte := p.id.boundaries[p.idx]
105 if filename {
106 sample = p.id.fileNameRuneOffsets
107 runeEnds = p.id.fileNameEndRunes
108 fileStartByte = p.id.fileNameIndex[p.idx]
109 }
110
111 absR := r
112 if p.idx > 0 {
113 absR += runeEnds[p.idx-1]
114 }
115
116 byteOff, left := sample.lookup(absR)
117
118 var data []byte
119
120 if filename {
121 data = p.id.fileNameContent[byteOff:]
122 } else {
123 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
124 if p.err != nil {
125 return 0
126 }
127 }
128 for left > 0 {
129 _, sz := utf8.DecodeRune(data)
130 byteOff += uint32(sz)
131 data = data[sz:]
132 left--
133 }
134
135 byteOff -= fileStartByte
136 return byteOff
137}
138
139func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
140 var result []LineMatch
141 if ms[0].fileName {
142 // There is only "line" in a filename.
143 res := LineMatch{
144 Line: p.id.fileName(p.idx),
145 FileName: true,
146 }
147
148 for _, m := range ms {
149 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
150 LineOffset: int(m.byteOffset),
151 MatchLength: int(m.byteMatchSz),
152 Offset: m.byteOffset,
153 })
154
155 result = []LineMatch{res}
156 }
157 } else {
158 ms = breakMatchesOnNewlines(ms, p.data(false))
159 result = p.fillContentMatches(ms, numContextLines)
160 }
161
162 sects := p.docSections()
163 for i, m := range result {
164 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
165 }
166
167 return result
168}
169
170func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
171 var result []ChunkMatch
172 if ms[0].fileName {
173 // If the first match is a filename match, there will only be
174 // one match and the matched content will be the filename.
175
176 fileName := p.id.fileName(p.idx)
177 ranges := make([]Range, 0, len(ms))
178 for _, m := range ms {
179 ranges = append(ranges, Range{
180 Start: Location{
181 ByteOffset: m.byteOffset,
182 LineNumber: 1,
183 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
184 },
185 End: Location{
186 ByteOffset: m.byteOffset + m.byteMatchSz,
187 LineNumber: 1,
188 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
189 },
190 })
191 }
192
193 result = []ChunkMatch{{
194 Content: fileName,
195 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
196 Ranges: ranges,
197 FileName: true,
198 }}
199 } else {
200 result = p.fillContentChunkMatches(ms, numContextLines)
201 }
202
203 sects := p.docSections()
204 for i, m := range result {
205 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
206 }
207
208 return result
209}
210
211func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
212 var result []LineMatch
213 for len(ms) > 0 {
214 m := ms[0]
215 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
216
217 var lineCands []*candidateMatch
218
219 endMatch := m.byteOffset + m.byteMatchSz
220
221 for len(ms) > 0 {
222 m := ms[0]
223 if int(m.byteOffset) <= lineEnd {
224 endMatch = m.byteOffset + m.byteMatchSz
225 lineCands = append(lineCands, m)
226 ms = ms[1:]
227 } else {
228 break
229 }
230 }
231
232 if len(lineCands) == 0 {
233 log.Panicf(
234 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
235 p.id.fileName(p.idx), p.id.metaData,
236 num, lineStart, lineEnd,
237 m.byteOffset)
238 }
239
240 data := p.data(false)
241
242 // Due to merging matches, we may have a match that
243 // crosses a line boundary. Prevent confusion by
244 // taking lines until we pass the last match
245 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
246 next := bytes.IndexByte(data[lineEnd+1:], '\n')
247 if next == -1 {
248 lineEnd = len(data)
249 } else {
250 // TODO(hanwen): test that checks "+1" part here.
251 lineEnd += next + 1
252 }
253 }
254
255 finalMatch := LineMatch{
256 LineStart: lineStart,
257 LineEnd: lineEnd,
258 LineNumber: num,
259 }
260 finalMatch.Line = data[lineStart:lineEnd]
261
262 if numContextLines > 0 {
263 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
264 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
265 }
266
267 for _, m := range lineCands {
268 fragment := LineFragmentMatch{
269 Offset: m.byteOffset,
270 LineOffset: int(m.byteOffset) - lineStart,
271 MatchLength: int(m.byteMatchSz),
272 }
273 if m.symbol {
274 start := p.id.fileEndSymbol[p.idx]
275 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
276 if fragment.SymbolInfo != nil {
277 sec := p.docSections()[m.symbolIdx]
278 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
279 }
280 }
281
282 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
283 }
284 result = append(result, finalMatch)
285 }
286 return result
287}
288
289func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
290 newlines := p.newlines()
291 chunks := chunkCandidates(ms, newlines, numContextLines)
292 data := p.data(false)
293 chunkMatches := make([]ChunkMatch, 0, len(chunks))
294 for _, chunk := range chunks {
295 ranges := make([]Range, 0, len(chunk.candidates))
296 var symbolInfo []*Symbol
297 for i, cm := range chunk.candidates {
298 startOffset := cm.byteOffset
299 endOffset := cm.byteOffset + cm.byteMatchSz
300 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
301 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
302
303 ranges = append(ranges, Range{
304 Start: Location{
305 ByteOffset: startOffset,
306 LineNumber: uint32(startLine),
307 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
308 },
309 End: Location{
310 ByteOffset: endOffset,
311 LineNumber: uint32(endLine),
312 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
313 },
314 })
315
316 if cm.symbol {
317 if symbolInfo == nil {
318 symbolInfo = make([]*Symbol, len(chunk.candidates))
319 }
320 start := p.id.fileEndSymbol[p.idx]
321 si := p.id.symbols.data(start + cm.symbolIdx)
322 if si != nil {
323 sec := p.docSections()[cm.symbolIdx]
324 si.Sym = string(data[sec.Start:sec.End])
325 }
326 symbolInfo[i] = si
327 }
328 }
329
330 firstLineNumber := int(chunk.firstLine) - numContextLines
331 if firstLineNumber < 1 {
332 firstLineNumber = 1
333 }
334 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
335
336 chunkMatches = append(chunkMatches, ChunkMatch{
337 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
338 ContentStart: Location{
339 ByteOffset: firstLineStart,
340 LineNumber: uint32(firstLineNumber),
341 Column: 1,
342 },
343 FileName: false,
344 Ranges: ranges,
345 SymbolInfo: symbolInfo,
346 })
347 }
348 return chunkMatches
349}
350
351type candidateChunk struct {
352 firstLine uint32 // 1-based, inclusive
353 lastLine uint32 // 1-based, inclusive
354 minOffset uint32 // 0-based, inclusive
355 maxOffset uint32 // 0-based, exclusive
356 candidates []*candidateMatch
357}
358
359// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
360// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
361// it to overlap with an adjacent chunk.
362func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
363 var chunks []candidateChunk
364 for _, m := range ms {
365 startOffset := m.byteOffset
366 endOffset := m.byteOffset + m.byteMatchSz
367 firstLine, _, _ := newlines.atOffset(startOffset)
368 lastLine, _, _ := newlines.atOffset(endOffset)
369
370 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
371 // If a new chunk created with the current candidateMatch would
372 // overlap with the previous chunk, instead add the candidateMatch
373 // to the last chunk and extend end of the last chunk.
374 last := &chunks[len(chunks)-1]
375 last.candidates = append(last.candidates, m)
376 if last.maxOffset < endOffset {
377 last.lastLine = uint32(lastLine)
378 last.maxOffset = uint32(endOffset)
379 }
380 } else {
381 chunks = append(chunks, candidateChunk{
382 firstLine: uint32(firstLine),
383 lastLine: uint32(lastLine),
384 minOffset: startOffset,
385 maxOffset: endOffset,
386 candidates: []*candidateMatch{m},
387 })
388 }
389 }
390 return chunks
391}
392
393type newlines struct {
394 // locs is the sorted set of byte offsets of the newlines in the file
395 locs []uint32
396
397 // fileSize is just the number of bytes in the file. It is stored
398 // on this struct so we can safely know the length of the last line
399 // in the file since not all files end in a newline.
400 fileSize uint32
401}
402
403// atOffset returns the line containing the offset. If the offset lands on
404// the newline ending line M, we return M. The line is characterized
405// by its linenumber (base-1, byte index of line start, byte index of
406// line end). The line end is the index of a newline, or the filesize
407// (if matching the last line of the file.)
408func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
409 idx := sort.Search(len(nls.locs), func(n int) bool {
410 return nls.locs[n] >= offset
411 })
412
413 start, end := nls.lineBounds(idx + 1)
414 return idx + 1, int(start), int(end)
415}
416
417// lineBounds returns the byte offsets of the start and end of the 1-based
418// lineNumber. The end offset is exclusive and will not contain the line-ending
419// newline. If the line number is out of range of the lines in the file, start
420// and end will be clamped to [0,fileSize].
421func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
422 // nls.locs[0] + 1 is the start of the 2nd line of data.
423 startIdx := lineNumber - 2
424 endIdx := lineNumber - 1
425
426 if startIdx < 0 {
427 start = 0
428 } else if startIdx >= len(nls.locs) {
429 start = nls.fileSize
430 } else {
431 start = nls.locs[startIdx] + 1
432 }
433
434 if endIdx < 0 {
435 end = 0
436 } else if endIdx >= len(nls.locs) {
437 end = nls.fileSize
438 } else {
439 end = nls.locs[endIdx]
440 }
441
442 return start, end
443}
444
445// getLines returns a slice of data containing the lines [low, high).
446// low is 1-based and inclusive. high is 1-based and exclusive.
447func (nls newlines) getLines(data []byte, low, high int) []byte {
448 if low >= high {
449 return nil
450 }
451
452 lowStart, _ := nls.lineBounds(low)
453 _, highEnd := nls.lineBounds(high - 1)
454
455 return data[lowStart:highEnd]
456}
457
458const (
459 // Query-dependent scoring signals. All of these together are bounded at ~9000
460 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
461 scorePartialWordMatch = 50.0
462 scoreWordMatch = 500.0
463 scoreBase = 7000.0
464 scorePartialBase = 4000.0
465 scoreSymbol = 7000.0
466 scorePartialSymbol = 4000.0
467 scoreKindMatch = 100.0
468 scoreFactorAtomMatch = 400.0
469
470 // File-only scoring signals. For now these are also bounded ~9000 to give them
471 // equal weight with the query-dependent signals.
472 scoreFileRankFactor = 9000.0
473 scoreFileOrderFactor = 10.0
474 scoreRepoRankFactor = 20.0
475
476 // Used for ordering line and chunk matches within a file.
477 scoreLineOrderFactor = 1.0
478)
479
480// findSection checks whether a section defined by offset and size lies within
481// one of the sections in secs.
482func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
483 j := sort.Search(len(secs), func(i int) bool {
484 return secs[i].End >= off+sz
485 })
486
487 if j == len(secs) {
488 return 0, false
489 }
490
491 if secs[j].Start <= off && off+sz <= secs[j].End {
492 return j, true
493 }
494 return 0, false
495}
496
497func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
498 type debugScore struct {
499 score float64
500 what string
501 }
502
503 score := &debugScore{}
504 maxScore := &debugScore{}
505
506 addScore := func(what string, s float64) {
507 if s != 0 && debug {
508 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
509 }
510 score.score += s
511 }
512
513 for i, r := range m.Ranges {
514 // calculate the start and end offset relative to the start of the content
515 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
516 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
517
518 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
519 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
520
521 score.score = 0
522 score.what = ""
523
524 if startBoundary && endBoundary {
525 addScore("WordMatch", scoreWordMatch)
526 } else if startBoundary || endBoundary {
527 addScore("PartialWordMatch", scorePartialWordMatch)
528 }
529
530 if m.FileName {
531 sep := bytes.LastIndexByte(m.Content, '/')
532 startMatch := relStartOffset == sep+1
533 endMatch := relEndOffset == len(m.Content)
534 if startMatch && endMatch {
535 addScore("Base", scoreBase)
536 } else if startMatch || endMatch {
537 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
538 } else if sep < relStartOffset {
539 addScore("InnerBase", scorePartialBase)
540 }
541 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
542 sec := secs[secIdx]
543 startMatch := sec.Start == uint32(r.Start.ByteOffset)
544 endMatch := sec.End == uint32(r.End.ByteOffset)
545 if startMatch && endMatch {
546 addScore("Symbol", scoreSymbol)
547 } else if startMatch || endMatch {
548 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
549 } else {
550 addScore("InnerSymbol", scorePartialSymbol)
551 }
552
553 var si *Symbol
554 if m.SymbolInfo != nil {
555 si = m.SymbolInfo[i]
556 }
557 if si == nil {
558 // for non-symbol queries, we need to hydrate in SymbolInfo.
559 start := p.id.fileEndSymbol[p.idx]
560 si = p.id.symbols.data(start + uint32(secIdx))
561 }
562 if si != nil {
563 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind))
564 }
565 }
566
567 if score.score > maxScore.score {
568 maxScore.score = score.score
569 maxScore.what = score.what
570 }
571 }
572
573 if debug {
574 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
575 }
576
577 return maxScore.score, maxScore.what
578}
579
580func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
581 type debugScore struct {
582 score float64
583 what string
584 }
585
586 score := &debugScore{}
587 maxScore := &debugScore{}
588
589 addScore := func(what string, s float64) {
590 if s != 0 && debug {
591 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
592 }
593 score.score += s
594 }
595
596 for _, f := range m.LineFragments {
597 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
598
599 end := int(f.LineOffset) + f.MatchLength
600 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
601
602 score.score = 0
603 score.what = ""
604
605 if startBoundary && endBoundary {
606 addScore("WordMatch", scoreWordMatch)
607 } else if startBoundary || endBoundary {
608 addScore("PartialWordMatch", scorePartialWordMatch)
609 }
610
611 if m.FileName {
612 sep := bytes.LastIndexByte(m.Line, '/')
613 startMatch := sep+1 == f.LineOffset
614 endMatch := len(m.Line) == f.LineOffset+f.MatchLength
615 if startMatch && endMatch {
616 addScore("Base", scoreBase)
617 } else if startMatch || endMatch {
618 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
619 } else if sep < f.LineOffset {
620 addScore("InnerBase", scorePartialBase)
621 }
622 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
623 sec := secs[secIdx]
624 startMatch := sec.Start == f.Offset
625 endMatch := sec.End == f.Offset+uint32(f.MatchLength)
626 if startMatch && endMatch {
627 addScore("Symbol", scoreSymbol)
628 } else if startMatch || endMatch {
629 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
630 } else {
631 addScore("InnerSymbol", scorePartialSymbol)
632 }
633
634 si := f.SymbolInfo
635 if si == nil {
636 // for non-symbol queries, we need to hydrate in SymbolInfo.
637 start := p.id.fileEndSymbol[p.idx]
638 si = p.id.symbols.data(start + uint32(secIdx))
639 }
640 if si != nil {
641 // the LineFragment may not be on a symbol, then si will be nil.
642 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind))
643 }
644 }
645
646 if score.score > maxScore.score {
647 maxScore.score = score.score
648 maxScore.what = score.what
649 }
650 }
651
652 if debug {
653 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
654 }
655
656 return maxScore.score, maxScore.what
657}
658
659// scoreKind boosts a match based on the combination of language and kind. The
660// language string comes from go-enry, the kind string from ctags.
661func scoreKind(language string, kind string) float64 {
662 var factor float64
663
664 // Generic ranking which will be overriden by language specific ranking
665 switch kind {
666 case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
667 factor = 8
668 case "class":
669 factor = 10
670 case "struct":
671 factor = 9.5
672 case "enum":
673 factor = 9
674 case "interface":
675 factor = 8
676 case "function", "func", "method":
677 factor = 7
678 case "member", "field":
679 factor = 5.5
680 case "constant", "const":
681 factor = 5
682 case "var", "variable":
683 factor = 4
684
685 default:
686 // No idea what it is, but its something regarded as a symbol
687 factor = 1
688 }
689
690 // Refer to universal-ctags --list-kinds-full=<language> to learn about which
691 // kinds are detected for which language.
692 //
693 // Note that go-ctags uses universal-ctags's interactive mode and thus returns
694 // the full name for "kind" and not the one-letter abbreviation.
695 switch language {
696 case "Java", "java":
697 switch kind {
698 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
699 // to "classes" instead of "c". We have to cover both cases to support existing
700 // indexes.
701 case "class", "classes":
702 factor = 10
703 case "enum":
704 factor = 9
705 case "interface":
706 factor = 8
707 case "method":
708 factor = 7
709 case "field":
710 factor = 6
711 case "enumConstant":
712 factor = 5
713 }
714 case "Kotlin", "kotlin":
715 switch kind {
716 case "class":
717 factor = 10
718 case "interface":
719 factor = 9
720 case "method":
721 factor = 8
722 case "typealias":
723 factor = 7
724 case "constant":
725 factor = 6
726 case "variable":
727 factor = 5
728 }
729 case "Go", "go":
730 switch kind {
731 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
732 // for each case a description of the fields in ctags in the comment
733 case "type": // interface struct talias
734 factor = 10
735 case "method", "function": // methodSpec
736 factor = 8
737 case "variable": // var member
738 factor = 7
739 case "constant": // const
740 factor = 6
741
742 case "interface": // interfaces
743 factor = 10
744 case "struct": // structs
745 factor = 9
746 case "talias": // type aliases
747 factor = 9
748 case "methodSpec": // interface method specification
749 factor = 8.5
750 case "func": // functions
751 factor = 8
752 case "member": // struct members
753 factor = 7
754 case "const": // constants
755 factor = 6
756 case "var": // variables
757 factor = 5
758 }
759 // Could also rank on:
760 //
761 // - anonMember struct anonymous members
762 // - packageName name for specifying imported package
763 // - receiver receivers
764 // - package packages
765 // - type types
766 // - unknown unknown
767 case "C++", "c++":
768 switch kind {
769 case "class": // classes
770 factor = 10
771 case "enum": // enumeration names
772 factor = 9
773 case "function": // function definitions
774 factor = 8
775 case "struct": // structure names
776 factor = 7
777 case "union": // union names
778 factor = 6
779 case "typdef": // typedefs
780 factor = 5
781 case "member": // class, struct, and union members
782 factor = 4
783 case "variable": // varialbe definitions
784 factor = 3
785 }
786 // Could also rank on:
787 // NAME DESCRIPTION
788 // macro macro definitions
789 // enumerator enumerators (values inside an enumeration)
790 // header included header files
791 // namespace namespaces
792 // variable variable definitions
793 case "Scala", "scala":
794 switch kind {
795 case "class":
796 factor = 10
797 case "interface":
798 factor = 9
799 case "object":
800 factor = 8
801 case "method":
802 factor = 7
803 case "type":
804 factor = 6
805 case "variable":
806 factor = 5
807 case "package":
808 factor = 4
809 }
810 case "Python", "python":
811 switch kind {
812 case "class": // classes
813 factor = 10
814 case "function": // function definitions
815 factor = 8
816 case "member": // class, struct, and union members
817 factor = 4
818 case "variable": // variable definitions
819 factor = 3
820 case "local": // local variables
821 factor = 2
822 }
823 // Could also rank on:
824 //
825 // - namespace name referring a module defined in other file
826 // - module modules
827 // - unknown name referring a class/variable/function/module defined in other module
828 // - parameter function parameters
829 case "Ruby", "ruby":
830 switch kind {
831 case "class":
832 factor = 10
833 case "method":
834 factor = 9
835 case "alias":
836 factor = 8
837 case "module":
838 factor = 7
839 case "singletonMethod":
840 factor = 6
841 case "constant":
842 factor = 5
843 case "accessor":
844 factor = 4
845 case "library":
846 factor = 3
847 }
848 case "PHP", "php":
849 switch kind {
850 case "class":
851 factor = 10
852 case "interface":
853 factor = 9
854 case "function":
855 factor = 8
856 case "trait":
857 factor = 7
858 case "define":
859 factor = 6
860 case "namespace":
861 factor = 5
862 case "alias":
863 factor = 4
864 case "variable":
865 factor = 3
866 case "local":
867 factor = 3
868 }
869 case "GraphQL", "graphql":
870 switch kind {
871 case "type":
872 factor = 10
873 }
874 case "Markdown", "markdown":
875 // Headers are good signal in docs, but do not rank as highly as code.
876 switch kind {
877 case "chapter": // #
878 factor = 4
879 case "section": // ##
880 factor = 3
881 case "subsection": // ###
882 factor = 2
883 }
884 }
885
886 return factor * scoreKindMatch
887}
888
889type matchScoreSlice []LineMatch
890
891func (m matchScoreSlice) Len() int { return len(m) }
892func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
893func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
894
895type chunkMatchScoreSlice []ChunkMatch
896
897func (m chunkMatchScoreSlice) Len() int { return len(m) }
898func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
899func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
900
901type fileMatchesByScore []FileMatch
902
903func (m fileMatchesByScore) Len() int { return len(m) }
904func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
905func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
906
907func sortMatchesByScore(ms []LineMatch) {
908 sort.Sort(matchScoreSlice(ms))
909}
910
911func sortChunkMatchesByScore(ms []ChunkMatch) {
912 sort.Sort(chunkMatchScoreSlice(ms))
913}
914
915var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
916
917// SortFiles sorts files matches in the order we want to present results to
918// users. The order depends on the match score, which includes both
919// query-dependent signals like word overlap, and file-only signals like the
920// file ranks (if file ranks are enabled).
921//
922// We don't only use the scores, we will also boost some results to present
923// files with novel extensions.
924func SortFiles(ms []FileMatch) {
925 sort.Sort(fileMatchesByScore(ms))
926
927 if doNovelty {
928 // Experimentally boost something into the third filematch
929 boostNovelExtension(ms, 2, 0.9)
930 }
931}
932
933func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
934 if len(ms) <= boostOffset+1 {
935 return
936 }
937
938 top := ms[:boostOffset]
939 candidates := ms[boostOffset:]
940
941 // Don't bother boosting something which is significantly different to the
942 // result it replaces.
943 minScoreForNovelty := candidates[0].Score * minScoreRatio
944
945 // We want to look for an ext that isn't in the top exts
946 exts := make([]string, len(top))
947 for i := range top {
948 exts[i] = path.Ext(top[i].FileName)
949 }
950
951 for i := range candidates {
952 // Do not assume sorted due to boostNovelExtension being called on subsets
953 if candidates[i].Score < minScoreForNovelty {
954 continue
955 }
956
957 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
958 continue
959 }
960
961 // Found what we are looking for, now boost to front of candidates (which
962 // is ms[boostOffset])
963 for ; i > 0; i-- {
964 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
965 }
966 return
967 }
968}