fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "os"
22 "path"
23 "sort"
24 "strings"
25 "unicode"
26 "unicode/utf8"
27
28 "github.com/sourcegraph/zoekt/ctags"
29 "golang.org/x/exp/slices"
30)
31
32var _ = log.Println
33
34// contentProvider is an abstraction to treat matches for names and
35// content with the same code.
36type contentProvider struct {
37 id *indexData
38 stats *Stats
39
40 // mutable
41 err error
42 idx uint32
43 _data []byte
44 _nl []uint32
45 _nlBuf []uint32
46 _sects []DocumentSection
47 _sectBuf []DocumentSection
48 fileSize uint32
49}
50
51// setDocument skips to the given document.
52func (p *contentProvider) setDocument(docID uint32) {
53 fileStart := p.id.boundaries[docID]
54
55 p.idx = docID
56 p.fileSize = p.id.boundaries[docID+1] - fileStart
57
58 p._nl = nil
59 p._sects = nil
60 p._data = nil
61}
62
63func (p *contentProvider) docSections() []DocumentSection {
64 if p._sects == nil {
65 var sz uint32
66 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
67 p.stats.ContentBytesLoaded += int64(sz)
68 p._sectBuf = p._sects
69 }
70 return p._sects
71}
72
73func (p *contentProvider) newlines() newlines {
74 if p._nl == nil {
75 var sz uint32
76 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
77 p._nlBuf = p._nl
78 p.stats.ContentBytesLoaded += int64(sz)
79 }
80 return newlines{locs: p._nl, fileSize: p.fileSize}
81}
82
83func (p *contentProvider) data(fileName bool) []byte {
84 if fileName {
85 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
86 }
87
88 if p._data == nil {
89 p._data, p.err = p.id.readContents(p.idx)
90 p.stats.FilesLoaded++
91 p.stats.ContentBytesLoaded += int64(len(p._data))
92 }
93 return p._data
94}
95
96// Find offset in bytes (relative to corpus start) for an offset in
97// runes (relative to document start). If filename is set, the corpus
98// is the set of filenames, with the document being the name itself.
99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
100 if p.id.metaData.PlainASCII {
101 return r
102 }
103
104 sample := p.id.runeOffsets
105 runeEnds := p.id.fileEndRunes
106 fileStartByte := p.id.boundaries[p.idx]
107 if filename {
108 sample = p.id.fileNameRuneOffsets
109 runeEnds = p.id.fileNameEndRunes
110 fileStartByte = p.id.fileNameIndex[p.idx]
111 }
112
113 absR := r
114 if p.idx > 0 {
115 absR += runeEnds[p.idx-1]
116 }
117
118 byteOff, left := sample.lookup(absR)
119
120 var data []byte
121
122 if filename {
123 data = p.id.fileNameContent[byteOff:]
124 } else {
125 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
126 if p.err != nil {
127 return 0
128 }
129 }
130 for left > 0 {
131 _, sz := utf8.DecodeRune(data)
132 byteOff += uint32(sz)
133 data = data[sz:]
134 left--
135 }
136
137 byteOff -= fileStartByte
138 return byteOff
139}
140
141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
142 var result []LineMatch
143 if ms[0].fileName {
144 // There is only "line" in a filename.
145 res := LineMatch{
146 Line: p.id.fileName(p.idx),
147 FileName: true,
148 }
149
150 for _, m := range ms {
151 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
152 LineOffset: int(m.byteOffset),
153 MatchLength: int(m.byteMatchSz),
154 Offset: m.byteOffset,
155 })
156
157 result = []LineMatch{res}
158 }
159 } else {
160 ms = breakMatchesOnNewlines(ms, p.data(false))
161 result = p.fillContentMatches(ms, numContextLines)
162 }
163
164 sects := p.docSections()
165 for i, m := range result {
166 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
167 }
168
169 return result
170}
171
172// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
173//
174// Performance invariant: ms is sorted and non-overlapping.
175//
176// Note: the byte slices may be backed by mmapped data, so before being
177// returned by the API it needs to be copied.
178func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
179 var result []ChunkMatch
180 if ms[0].fileName {
181 // If the first match is a filename match, there will only be
182 // one match and the matched content will be the filename.
183
184 fileName := p.id.fileName(p.idx)
185 ranges := make([]Range, 0, len(ms))
186 for _, m := range ms {
187 ranges = append(ranges, Range{
188 Start: Location{
189 ByteOffset: m.byteOffset,
190 LineNumber: 1,
191 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
192 },
193 End: Location{
194 ByteOffset: m.byteOffset + m.byteMatchSz,
195 LineNumber: 1,
196 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
197 },
198 })
199 }
200
201 result = []ChunkMatch{{
202 Content: fileName,
203 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
204 Ranges: ranges,
205 FileName: true,
206 }}
207 } else {
208 result = p.fillContentChunkMatches(ms, numContextLines)
209 }
210
211 sects := p.docSections()
212 for i, m := range result {
213 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
214 }
215
216 return result
217}
218
219func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
220 var result []LineMatch
221 for len(ms) > 0 {
222 m := ms[0]
223 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
224
225 var lineCands []*candidateMatch
226
227 endMatch := m.byteOffset + m.byteMatchSz
228
229 for len(ms) > 0 {
230 m := ms[0]
231 if int(m.byteOffset) <= lineEnd {
232 endMatch = m.byteOffset + m.byteMatchSz
233 lineCands = append(lineCands, m)
234 ms = ms[1:]
235 } else {
236 break
237 }
238 }
239
240 if len(lineCands) == 0 {
241 log.Panicf(
242 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
243 p.id.fileName(p.idx), p.id.metaData,
244 num, lineStart, lineEnd,
245 m.byteOffset)
246 }
247
248 data := p.data(false)
249
250 // Due to merging matches, we may have a match that
251 // crosses a line boundary. Prevent confusion by
252 // taking lines until we pass the last match
253 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
254 next := bytes.IndexByte(data[lineEnd+1:], '\n')
255 if next == -1 {
256 lineEnd = len(data)
257 } else {
258 // TODO(hanwen): test that checks "+1" part here.
259 lineEnd += next + 1
260 }
261 }
262
263 finalMatch := LineMatch{
264 LineStart: lineStart,
265 LineEnd: lineEnd,
266 LineNumber: num,
267 }
268 finalMatch.Line = data[lineStart:lineEnd]
269
270 if numContextLines > 0 {
271 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
272 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
273 }
274
275 for _, m := range lineCands {
276 fragment := LineFragmentMatch{
277 Offset: m.byteOffset,
278 LineOffset: int(m.byteOffset) - lineStart,
279 MatchLength: int(m.byteMatchSz),
280 }
281 if m.symbol {
282 start := p.id.fileEndSymbol[p.idx]
283 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
284 if fragment.SymbolInfo != nil {
285 sec := p.docSections()[m.symbolIdx]
286 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
287 }
288 }
289
290 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
291 }
292 result = append(result, finalMatch)
293 }
294 return result
295}
296
297func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
298 newlines := p.newlines()
299 data := p.data(false)
300
301 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
302 // However, it depends on ms being sorted by byteOffset and non-overlapping.
303 // This invariant is true at the time of writing, but we conservatively
304 // enforce this. Note: chunkCandidates preserves the sorting so safe to
305 // transform now.
306 columnHelper := columnHelper{data: data}
307 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
308 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
309 sort.Sort((sortByOffsetSlice)(ms))
310 }
311
312 chunks := chunkCandidates(ms, newlines, numContextLines)
313 chunkMatches := make([]ChunkMatch, 0, len(chunks))
314 for _, chunk := range chunks {
315 ranges := make([]Range, 0, len(chunk.candidates))
316 var symbolInfo []*Symbol
317 for i, cm := range chunk.candidates {
318 startOffset := cm.byteOffset
319 endOffset := cm.byteOffset + cm.byteMatchSz
320 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
321 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
322
323 ranges = append(ranges, Range{
324 Start: Location{
325 ByteOffset: startOffset,
326 LineNumber: uint32(startLine),
327 Column: columnHelper.get(startLineOffset, startOffset),
328 },
329 End: Location{
330 ByteOffset: endOffset,
331 LineNumber: uint32(endLine),
332 Column: columnHelper.get(endLineOffset, endOffset),
333 },
334 })
335
336 if cm.symbol {
337 if symbolInfo == nil {
338 symbolInfo = make([]*Symbol, len(chunk.candidates))
339 }
340 start := p.id.fileEndSymbol[p.idx]
341 si := p.id.symbols.data(start + cm.symbolIdx)
342 if si != nil {
343 sec := p.docSections()[cm.symbolIdx]
344 si.Sym = string(data[sec.Start:sec.End])
345 }
346 symbolInfo[i] = si
347 }
348 }
349
350 firstLineNumber := int(chunk.firstLine) - numContextLines
351 if firstLineNumber < 1 {
352 firstLineNumber = 1
353 }
354 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
355
356 chunkMatches = append(chunkMatches, ChunkMatch{
357 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
358 ContentStart: Location{
359 ByteOffset: firstLineStart,
360 LineNumber: uint32(firstLineNumber),
361 Column: 1,
362 },
363 FileName: false,
364 Ranges: ranges,
365 SymbolInfo: symbolInfo,
366 })
367 }
368 return chunkMatches
369}
370
371type candidateChunk struct {
372 firstLine uint32 // 1-based, inclusive
373 lastLine uint32 // 1-based, inclusive
374 minOffset uint32 // 0-based, inclusive
375 maxOffset uint32 // 0-based, exclusive
376 candidates []*candidateMatch
377}
378
379// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
380// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
381// it to overlap with an adjacent chunk.
382//
383// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
384// output invariants: if you flatten candidates the input invariant is retained.
385func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
386 var chunks []candidateChunk
387 for _, m := range ms {
388 startOffset := m.byteOffset
389 endOffset := m.byteOffset + m.byteMatchSz
390 firstLine, _, _ := newlines.atOffset(startOffset)
391 lastLine, _, _ := newlines.atOffset(endOffset)
392
393 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
394 // If a new chunk created with the current candidateMatch would
395 // overlap with the previous chunk, instead add the candidateMatch
396 // to the last chunk and extend end of the last chunk.
397 last := &chunks[len(chunks)-1]
398 last.candidates = append(last.candidates, m)
399 if last.maxOffset < endOffset {
400 last.lastLine = uint32(lastLine)
401 last.maxOffset = uint32(endOffset)
402 }
403 } else {
404 chunks = append(chunks, candidateChunk{
405 firstLine: uint32(firstLine),
406 lastLine: uint32(lastLine),
407 minOffset: startOffset,
408 maxOffset: endOffset,
409 candidates: []*candidateMatch{m},
410 })
411 }
412 }
413 return chunks
414}
415
416// columnHelper is a helper struct which caches the number of runes last
417// counted. If we naively use utf8.RuneCount for each match on a line, this
418// leads to an O(nm) algorithm where m is the number of matches and n is the
419// length of the line. Aassuming we our candidates are increasing in offset
420// makes this operation O(n) instead.
421type columnHelper struct {
422 data []byte
423
424 // 0 values for all these are valid values
425 lastLineOffset int
426 lastOffset uint32
427 lastRuneCount uint32
428}
429
430// get returns the line column for offset. offset is the byte offset of the
431// rune in data. lineOffset is the byte offset inside of data for the line
432// containing offset.
433func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
434 var runeCount uint32
435
436 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
437 // Can count from last calculation
438 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
439 } else {
440 // Need to count from the beginning of line
441 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
442 }
443
444 c.lastLineOffset = lineOffset
445 c.lastOffset = offset
446 c.lastRuneCount = runeCount
447
448 return runeCount + 1
449}
450
451type newlines struct {
452 // locs is the sorted set of byte offsets of the newlines in the file
453 locs []uint32
454
455 // fileSize is just the number of bytes in the file. It is stored
456 // on this struct so we can safely know the length of the last line
457 // in the file since not all files end in a newline.
458 fileSize uint32
459}
460
461// atOffset returns the line containing the offset. If the offset lands on
462// the newline ending line M, we return M. The line is characterized
463// by its linenumber (base-1, byte index of line start, byte index of
464// line end). The line end is the index of a newline, or the filesize
465// (if matching the last line of the file.)
466func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
467 idx := sort.Search(len(nls.locs), func(n int) bool {
468 return nls.locs[n] >= offset
469 })
470
471 start, end := nls.lineBounds(idx + 1)
472 return idx + 1, int(start), int(end)
473}
474
475// lineBounds returns the byte offsets of the start and end of the 1-based
476// lineNumber. The end offset is exclusive and will not contain the line-ending
477// newline. If the line number is out of range of the lines in the file, start
478// and end will be clamped to [0,fileSize].
479func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
480 // nls.locs[0] + 1 is the start of the 2nd line of data.
481 startIdx := lineNumber - 2
482 endIdx := lineNumber - 1
483
484 if startIdx < 0 {
485 start = 0
486 } else if startIdx >= len(nls.locs) {
487 start = nls.fileSize
488 } else {
489 start = nls.locs[startIdx] + 1
490 }
491
492 if endIdx < 0 {
493 end = 0
494 } else if endIdx >= len(nls.locs) {
495 end = nls.fileSize
496 } else {
497 end = nls.locs[endIdx]
498 }
499
500 return start, end
501}
502
503// getLines returns a slice of data containing the lines [low, high).
504// low is 1-based and inclusive. high is 1-based and exclusive.
505func (nls newlines) getLines(data []byte, low, high int) []byte {
506 if low >= high {
507 return nil
508 }
509
510 lowStart, _ := nls.lineBounds(low)
511 _, highEnd := nls.lineBounds(high - 1)
512
513 // Drop any trailing newline. Editors do not treat a trailing newline as
514 // the start of a new line, so we should not either. lineBounds clamps to
515 // len(data) when an out-of-bounds line is requested.
516 //
517 // As an example, if we request lines 1-5 from a file with contents
518 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
519 // the three "lines" in the file, separated by newlines.
520 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
521 highEnd = highEnd - 1
522 lowStart = min(lowStart, highEnd)
523 }
524
525 return data[lowStart:highEnd]
526}
527
528const (
529 // Query-dependent scoring signals. All of these together are bounded at ~9000
530 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
531 scorePartialWordMatch = 50.0
532 scoreWordMatch = 500.0
533 scoreBase = 7000.0
534 scorePartialBase = 4000.0
535 scoreSymbol = 7000.0
536 scorePartialSymbol = 4000.0
537 scoreKindMatch = 100.0
538 scoreFactorAtomMatch = 400.0
539
540 // File-only scoring signals. For now these are also bounded ~9000 to give them
541 // equal weight with the query-dependent signals.
542 scoreFileRankFactor = 9000.0
543 scoreFileOrderFactor = 10.0
544 scoreRepoRankFactor = 20.0
545
546 // Used for ordering line and chunk matches within a file.
547 scoreLineOrderFactor = 1.0
548)
549
550// findSection checks whether a section defined by offset and size lies within
551// one of the sections in secs.
552func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
553 j := sort.Search(len(secs), func(i int) bool {
554 return secs[i].End >= off+sz
555 })
556
557 if j == len(secs) {
558 return 0, false
559 }
560
561 if secs[j].Start <= off && off+sz <= secs[j].End {
562 return j, true
563 }
564 return 0, false
565}
566
567func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
568 type debugScore struct {
569 score float64
570 what string
571 }
572
573 score := &debugScore{}
574 maxScore := &debugScore{}
575
576 addScore := func(what string, s float64) {
577 if s != 0 && debug {
578 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
579 }
580 score.score += s
581 }
582
583 data := p.data(m.FileName)
584 filename := p.data(true)
585
586 for i, r := range m.Ranges {
587 // calculate the start and end offset relative to the start of the content
588 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
589 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
590
591 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
592 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
593
594 score.score = 0
595 score.what = ""
596
597 if startBoundary && endBoundary {
598 addScore("WordMatch", scoreWordMatch)
599 } else if startBoundary || endBoundary {
600 addScore("PartialWordMatch", scorePartialWordMatch)
601 }
602
603 if m.FileName {
604 sep := bytes.LastIndexByte(m.Content, '/')
605 startMatch := relStartOffset == sep+1
606 endMatch := relEndOffset == len(m.Content)
607 if startMatch && endMatch {
608 addScore("Base", scoreBase)
609 } else if startMatch || endMatch {
610 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
611 } else if sep < relStartOffset {
612 addScore("InnerBase", scorePartialBase)
613 }
614 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
615 sec := secs[secIdx]
616 startMatch := sec.Start == uint32(r.Start.ByteOffset)
617 endMatch := sec.End == uint32(r.End.ByteOffset)
618 if startMatch && endMatch {
619 addScore("Symbol", scoreSymbol)
620 } else if startMatch || endMatch {
621 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
622 } else {
623 addScore("InnerSymbol", scorePartialSymbol)
624 }
625
626 var si *Symbol
627 if m.SymbolInfo != nil {
628 si = m.SymbolInfo[i]
629 }
630 if si == nil {
631 // for non-symbol queries, we need to hydrate in SymbolInfo.
632 start := p.id.fileEndSymbol[p.idx]
633 si = p.id.symbols.data(start + uint32(secIdx))
634 }
635 if si != nil {
636 symbolKind := ctags.ParseSymbolKind(si.Kind)
637 sym := sectionSlice(data, sec)
638 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
639 }
640 }
641
642 if score.score > maxScore.score {
643 maxScore.score = score.score
644 maxScore.what = score.what
645 }
646 }
647
648 if debug {
649 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
650 }
651
652 return maxScore.score, maxScore.what
653}
654
655func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
656 type debugScore struct {
657 score float64
658 what string
659 }
660
661 score := &debugScore{}
662 maxScore := &debugScore{}
663
664 addScore := func(what string, s float64) {
665 if s != 0 && debug {
666 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
667 }
668 score.score += s
669 }
670
671 data := p.data(m.FileName)
672 filename := p.data(true)
673
674 for _, f := range m.LineFragments {
675 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
676
677 end := int(f.LineOffset) + f.MatchLength
678 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
679
680 score.score = 0
681 score.what = ""
682
683 if startBoundary && endBoundary {
684 addScore("WordMatch", scoreWordMatch)
685 } else if startBoundary || endBoundary {
686 addScore("PartialWordMatch", scorePartialWordMatch)
687 }
688
689 if m.FileName {
690 sep := bytes.LastIndexByte(m.Line, '/')
691 startMatch := sep+1 == f.LineOffset
692 endMatch := len(m.Line) == f.LineOffset+f.MatchLength
693 if startMatch && endMatch {
694 addScore("Base", scoreBase)
695 } else if startMatch || endMatch {
696 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
697 } else if sep < f.LineOffset {
698 addScore("InnerBase", scorePartialBase)
699 }
700 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
701 sec := secs[secIdx]
702 startMatch := sec.Start == f.Offset
703 endMatch := sec.End == f.Offset+uint32(f.MatchLength)
704 if startMatch && endMatch {
705 addScore("Symbol", scoreSymbol)
706 } else if startMatch || endMatch {
707 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
708 } else {
709 addScore("InnerSymbol", scorePartialSymbol)
710 }
711
712 si := f.SymbolInfo
713 if si == nil {
714 // for non-symbol queries, we need to hydrate in SymbolInfo.
715 start := p.id.fileEndSymbol[p.idx]
716 si = p.id.symbols.data(start + uint32(secIdx))
717 }
718 if si != nil {
719 // the LineFragment may not be on a symbol, then si will be nil.
720 symbolKind := ctags.ParseSymbolKind(si.Kind)
721 sym := sectionSlice(data, sec)
722 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
723 }
724 }
725
726 if score.score > maxScore.score {
727 maxScore.score = score.score
728 maxScore.what = score.what
729 }
730 }
731
732 if debug {
733 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
734 }
735
736 return maxScore.score, maxScore.what
737}
738
739// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
740// End such that it won't be out of range.
741func sectionSlice(data []byte, sec DocumentSection) []byte {
742 l := uint32(len(data))
743 if sec.Start >= l {
744 return nil
745 }
746 if sec.End > l {
747 sec.End = l
748 }
749 return data[sec.Start:sec.End]
750}
751
752// scoreSymbolKind boosts a match based on the combination of language, symbol
753// and kind. The language string comes from go-enry, the symbol and kind from
754// ctags.
755func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
756 var factor float64
757
758 // Generic ranking which will be overriden by language specific ranking
759 switch kind {
760 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
761 factor = 8
762 case ctags.Class:
763 factor = 10
764 case ctags.Struct:
765 factor = 9.5
766 case ctags.Enum:
767 factor = 9
768 case ctags.Interface:
769 factor = 8
770 case ctags.Function, ctags.Method:
771 factor = 7
772 case ctags.Field:
773 factor = 5.5
774 case ctags.Constant:
775 factor = 5
776 case ctags.Variable:
777 factor = 4
778 default:
779 // For all other kinds, assign a low score by default.
780 factor = 1
781 }
782
783 switch language {
784 case "Java", "java":
785 switch kind {
786 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
787 // to "classes" instead of "c". We have to cover both cases to support existing
788 // indexes.
789 case ctags.Class:
790 factor = 10
791 case ctags.Enum:
792 factor = 9
793 case ctags.Interface:
794 factor = 8
795 case ctags.Method:
796 factor = 7
797 case ctags.Field:
798 factor = 6
799 case ctags.EnumConstant:
800 factor = 5
801 }
802 case "Kotlin", "kotlin":
803 switch kind {
804 case ctags.Class:
805 factor = 10
806 case ctags.Interface:
807 factor = 9
808 case ctags.Method:
809 factor = 8
810 case ctags.TypeAlias:
811 factor = 7
812 case ctags.Constant:
813 factor = 6
814 case ctags.Variable:
815 factor = 5
816 }
817 case "Go", "go":
818 switch kind {
819 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
820 // for each case a description of the fields in ctags in the comment
821 case ctags.Type: // interface struct talias
822 factor = 9
823 case ctags.Interface: // interfaces
824 factor = 10
825 case ctags.Struct: // structs
826 factor = 9
827 case ctags.TypeAlias: // type aliases
828 factor = 9
829 case ctags.MethodSpec: // interface method specification
830 factor = 8.5
831 case ctags.Method, ctags.Function: // functions
832 factor = 8
833 case ctags.Field: // struct fields
834 factor = 7
835 case ctags.Constant: // constants
836 factor = 6
837 case ctags.Variable: // variables
838 factor = 5
839 }
840
841 // Boost exported go symbols. Same implementation as token.IsExported
842 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
843 factor += 0.5
844 }
845
846 if bytes.HasSuffix(filename, []byte("_test.go")) {
847 factor *= 0.8
848 }
849
850 // Could also rank on:
851 //
852 // - anonMember struct anonymous members
853 // - packageName name for specifying imported package
854 // - receiver receivers
855 // - package packages
856 // - type types
857 // - unknown unknown
858 case "C++", "c++":
859 switch kind {
860 case ctags.Class: // classes
861 factor = 10
862 case ctags.Enum: // enumeration names
863 factor = 9
864 case ctags.Function: // function definitions
865 factor = 8
866 case ctags.Struct: // structure names
867 factor = 7
868 case ctags.Union: // union names
869 factor = 6
870 case ctags.TypeAlias: // typedefs
871 factor = 5
872 case ctags.Field: // class, struct, and union members
873 factor = 4
874 case ctags.Variable: // varialbe definitions
875 factor = 3
876 }
877 // Could also rank on:
878 // NAME DESCRIPTION
879 // macro macro definitions
880 // enumerator enumerators (values inside an enumeration)
881 // header included header files
882 // namespace namespaces
883 // variable variable definitions
884 case "Scala", "scala":
885 switch kind {
886 case ctags.Class:
887 factor = 10
888 case ctags.Interface:
889 factor = 9
890 case ctags.Object:
891 factor = 8
892 case ctags.Function:
893 factor = 7
894 case ctags.Type:
895 factor = 6
896 case ctags.Variable:
897 factor = 5
898 case ctags.Package:
899 factor = 4
900 }
901 case "Python", "python":
902 switch kind {
903 case ctags.Class: // classes
904 factor = 10
905 case ctags.Function, ctags.Method: // function definitions
906 factor = 8
907 case ctags.Field: // class, struct, and union members
908 factor = 4
909 case ctags.Variable: // variable definitions
910 factor = 3
911 case ctags.Local: // local variables
912 factor = 2
913 }
914 // Could also rank on:
915 //
916 // - namespace name referring a module defined in other file
917 // - module modules
918 // - unknown name referring a class/variable/function/module defined in other module
919 // - parameter function parameters
920 case "Ruby", "ruby":
921 switch kind {
922 case ctags.Class:
923 factor = 10
924 case ctags.Method:
925 factor = 9
926 case ctags.MethodAlias:
927 factor = 8
928 case ctags.Module:
929 factor = 7
930 case ctags.SingletonMethod:
931 factor = 6
932 case ctags.Constant:
933 factor = 5
934 case ctags.Accessor:
935 factor = 4
936 case ctags.Library:
937 factor = 3
938 }
939 case "PHP", "php":
940 switch kind {
941 case ctags.Class:
942 factor = 10
943 case ctags.Interface:
944 factor = 9
945 case ctags.Function:
946 factor = 8
947 case ctags.Trait:
948 factor = 7
949 case ctags.Define:
950 factor = 6
951 case ctags.Namespace:
952 factor = 5
953 case ctags.MethodAlias:
954 factor = 4
955 case ctags.Variable:
956 factor = 3
957 case ctags.Local:
958 factor = 3
959 }
960 case "GraphQL", "graphql":
961 switch kind {
962 case ctags.Type:
963 factor = 10
964 }
965 case "Markdown", "markdown":
966 // Headers are good signal in docs, but do not rank as highly as code.
967 switch kind {
968 case ctags.Chapter: // #
969 factor = 4
970 case ctags.Section: // ##
971 factor = 3
972 case ctags.Subsection: // ###
973 factor = 2
974 }
975 }
976
977 return factor * scoreKindMatch
978}
979
980type matchScoreSlice []LineMatch
981
982func (m matchScoreSlice) Len() int { return len(m) }
983func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
984func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
985
986type chunkMatchScoreSlice []ChunkMatch
987
988func (m chunkMatchScoreSlice) Len() int { return len(m) }
989func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
990func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
991
992type fileMatchesByScore []FileMatch
993
994func (m fileMatchesByScore) Len() int { return len(m) }
995func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
996func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
997
998func sortMatchesByScore(ms []LineMatch) {
999 sort.Sort(matchScoreSlice(ms))
1000}
1001
1002func sortChunkMatchesByScore(ms []ChunkMatch) {
1003 sort.Sort(chunkMatchScoreSlice(ms))
1004}
1005
1006var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
1007
1008// SortFiles sorts files matches in the order we want to present results to
1009// users. The order depends on the match score, which includes both
1010// query-dependent signals like word overlap, and file-only signals like the
1011// file ranks (if file ranks are enabled).
1012//
1013// We don't only use the scores, we will also boost some results to present
1014// files with novel extensions.
1015func SortFiles(ms []FileMatch) {
1016 sort.Sort(fileMatchesByScore(ms))
1017
1018 if doNovelty {
1019 // Experimentally boost something into the third filematch
1020 boostNovelExtension(ms, 2, 0.9)
1021 }
1022}
1023
1024func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1025 if len(ms) <= boostOffset+1 {
1026 return
1027 }
1028
1029 top := ms[:boostOffset]
1030 candidates := ms[boostOffset:]
1031
1032 // Don't bother boosting something which is significantly different to the
1033 // result it replaces.
1034 minScoreForNovelty := candidates[0].Score * minScoreRatio
1035
1036 // We want to look for an ext that isn't in the top exts
1037 exts := make([]string, len(top))
1038 for i := range top {
1039 exts[i] = path.Ext(top[i].FileName)
1040 }
1041
1042 for i := range candidates {
1043 // Do not assume sorted due to boostNovelExtension being called on subsets
1044 if candidates[i].Score < minScoreForNovelty {
1045 continue
1046 }
1047
1048 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1049 continue
1050 }
1051
1052 // Found what we are looking for, now boost to front of candidates (which
1053 // is ms[boostOffset])
1054 for ; i > 0; i-- {
1055 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1056 }
1057 return
1058 }
1059}