fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20 "log"
21 "path"
22 "sort"
23 "strings"
24 "unicode"
25 "unicode/utf8"
26
27 "github.com/sourcegraph/zoekt/ctags"
28 "golang.org/x/exp/slices"
29)
30
31var _ = log.Println
32
33// contentProvider is an abstraction to treat matches for names and
34// content with the same code.
35type contentProvider struct {
36 id *indexData
37 stats *Stats
38
39 // mutable
40 err error
41 idx uint32
42 _data []byte
43 _nl []uint32
44 _nlBuf []uint32
45 _sects []DocumentSection
46 _sectBuf []DocumentSection
47 fileSize uint32
48}
49
50// setDocument skips to the given document.
51func (p *contentProvider) setDocument(docID uint32) {
52 fileStart := p.id.boundaries[docID]
53
54 p.idx = docID
55 p.fileSize = p.id.boundaries[docID+1] - fileStart
56
57 p._nl = nil
58 p._sects = nil
59 p._data = nil
60}
61
62func (p *contentProvider) docSections() []DocumentSection {
63 if p._sects == nil {
64 var sz uint32
65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
66 p.stats.ContentBytesLoaded += int64(sz)
67 p._sectBuf = p._sects
68 }
69 return p._sects
70}
71
72func (p *contentProvider) newlines() newlines {
73 if p._nl == nil {
74 var sz uint32
75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
76 p._nlBuf = p._nl
77 p.stats.ContentBytesLoaded += int64(sz)
78 }
79 return newlines{locs: p._nl, fileSize: p.fileSize}
80}
81
82func (p *contentProvider) data(fileName bool) []byte {
83 if fileName {
84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
85 }
86
87 if p._data == nil {
88 p._data, p.err = p.id.readContents(p.idx)
89 p.stats.FilesLoaded++
90 p.stats.ContentBytesLoaded += int64(len(p._data))
91 }
92 return p._data
93}
94
95// Find offset in bytes (relative to corpus start) for an offset in
96// runes (relative to document start). If filename is set, the corpus
97// is the set of filenames, with the document being the name itself.
98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
99 if p.id.metaData.PlainASCII {
100 return r
101 }
102
103 sample := p.id.runeOffsets
104 runeEnds := p.id.fileEndRunes
105 fileStartByte := p.id.boundaries[p.idx]
106 if filename {
107 sample = p.id.fileNameRuneOffsets
108 runeEnds = p.id.fileNameEndRunes
109 fileStartByte = p.id.fileNameIndex[p.idx]
110 }
111
112 absR := r
113 if p.idx > 0 {
114 absR += runeEnds[p.idx-1]
115 }
116
117 byteOff, left := sample.lookup(absR)
118
119 var data []byte
120
121 if filename {
122 data = p.id.fileNameContent[byteOff:]
123 } else {
124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125 if p.err != nil {
126 return 0
127 }
128 }
129 for left > 0 {
130 _, sz := utf8.DecodeRune(data)
131 byteOff += uint32(sz)
132 data = data[sz:]
133 left--
134 }
135
136 byteOff -= fileStartByte
137 return byteOff
138}
139
140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
141 var result []LineMatch
142 if ms[0].fileName {
143 // There is only "line" in a filename.
144 res := LineMatch{
145 Line: p.id.fileName(p.idx),
146 FileName: true,
147 }
148
149 for _, m := range ms {
150 res.LineFragments = append(res.LineFragments, LineFragmentMatch{
151 LineOffset: int(m.byteOffset),
152 MatchLength: int(m.byteMatchSz),
153 Offset: m.byteOffset,
154 })
155
156 result = []LineMatch{res}
157 }
158 } else {
159 ms = breakMatchesOnNewlines(ms, p.data(false))
160 result = p.fillContentMatches(ms, numContextLines)
161 }
162
163 sects := p.docSections()
164 for i, m := range result {
165 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
166 }
167
168 return result
169}
170
171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
172//
173// Performance invariant: ms is sorted and non-overlapping.
174//
175// Note: the byte slices may be backed by mmapped data, so before being
176// returned by the API it needs to be copied.
177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
178 var result []ChunkMatch
179 if ms[0].fileName {
180 // If the first match is a filename match, there will only be
181 // one match and the matched content will be the filename.
182
183 fileName := p.id.fileName(p.idx)
184 ranges := make([]Range, 0, len(ms))
185 for _, m := range ms {
186 ranges = append(ranges, Range{
187 Start: Location{
188 ByteOffset: m.byteOffset,
189 LineNumber: 1,
190 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
191 },
192 End: Location{
193 ByteOffset: m.byteOffset + m.byteMatchSz,
194 LineNumber: 1,
195 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
196 },
197 })
198 }
199
200 result = []ChunkMatch{{
201 Content: fileName,
202 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
203 Ranges: ranges,
204 FileName: true,
205 }}
206 } else {
207 result = p.fillContentChunkMatches(ms, numContextLines)
208 }
209
210 sects := p.docSections()
211 for i, m := range result {
212 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
213 }
214
215 return result
216}
217
218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
219 var result []LineMatch
220 for len(ms) > 0 {
221 m := ms[0]
222 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
223
224 var lineCands []*candidateMatch
225
226 endMatch := m.byteOffset + m.byteMatchSz
227
228 for len(ms) > 0 {
229 m := ms[0]
230 if int(m.byteOffset) <= lineEnd {
231 endMatch = m.byteOffset + m.byteMatchSz
232 lineCands = append(lineCands, m)
233 ms = ms[1:]
234 } else {
235 break
236 }
237 }
238
239 if len(lineCands) == 0 {
240 log.Panicf(
241 "%s %v infinite loop: num %d start,end %d,%d, offset %d",
242 p.id.fileName(p.idx), p.id.metaData,
243 num, lineStart, lineEnd,
244 m.byteOffset)
245 }
246
247 data := p.data(false)
248
249 // Due to merging matches, we may have a match that
250 // crosses a line boundary. Prevent confusion by
251 // taking lines until we pass the last match
252 for lineEnd < len(data) && endMatch > uint32(lineEnd) {
253 next := bytes.IndexByte(data[lineEnd+1:], '\n')
254 if next == -1 {
255 lineEnd = len(data)
256 } else {
257 // TODO(hanwen): test that checks "+1" part here.
258 lineEnd += next + 1
259 }
260 }
261
262 finalMatch := LineMatch{
263 LineStart: lineStart,
264 LineEnd: lineEnd,
265 LineNumber: num,
266 }
267 finalMatch.Line = data[lineStart:lineEnd]
268
269 if numContextLines > 0 {
270 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
272 }
273
274 for _, m := range lineCands {
275 fragment := LineFragmentMatch{
276 Offset: m.byteOffset,
277 LineOffset: int(m.byteOffset) - lineStart,
278 MatchLength: int(m.byteMatchSz),
279 }
280 if m.symbol {
281 start := p.id.fileEndSymbol[p.idx]
282 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
283 if fragment.SymbolInfo != nil {
284 sec := p.docSections()[m.symbolIdx]
285 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
286 }
287 }
288
289 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
290 }
291 result = append(result, finalMatch)
292 }
293 return result
294}
295
296func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
297 newlines := p.newlines()
298 data := p.data(false)
299
300 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
301 // However, it depends on ms being sorted by byteOffset and non-overlapping.
302 // This invariant is true at the time of writing, but we conservatively
303 // enforce this. Note: chunkCandidates preserves the sorting so safe to
304 // transform now.
305 columnHelper := columnHelper{data: data}
306 if !sort.IsSorted((sortByOffsetSlice)(ms)) {
307 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
308 sort.Sort((sortByOffsetSlice)(ms))
309 }
310
311 chunks := chunkCandidates(ms, newlines, numContextLines)
312 chunkMatches := make([]ChunkMatch, 0, len(chunks))
313 for _, chunk := range chunks {
314 ranges := make([]Range, 0, len(chunk.candidates))
315 var symbolInfo []*Symbol
316 for i, cm := range chunk.candidates {
317 startOffset := cm.byteOffset
318 endOffset := cm.byteOffset + cm.byteMatchSz
319 startLine, startLineOffset, _ := newlines.atOffset(startOffset)
320 endLine, endLineOffset, _ := newlines.atOffset(endOffset)
321
322 ranges = append(ranges, Range{
323 Start: Location{
324 ByteOffset: startOffset,
325 LineNumber: uint32(startLine),
326 Column: columnHelper.get(startLineOffset, startOffset),
327 },
328 End: Location{
329 ByteOffset: endOffset,
330 LineNumber: uint32(endLine),
331 Column: columnHelper.get(endLineOffset, endOffset),
332 },
333 })
334
335 if cm.symbol {
336 if symbolInfo == nil {
337 symbolInfo = make([]*Symbol, len(chunk.candidates))
338 }
339 start := p.id.fileEndSymbol[p.idx]
340 si := p.id.symbols.data(start + cm.symbolIdx)
341 if si != nil {
342 sec := p.docSections()[cm.symbolIdx]
343 si.Sym = string(data[sec.Start:sec.End])
344 }
345 symbolInfo[i] = si
346 }
347 }
348
349 firstLineNumber := int(chunk.firstLine) - numContextLines
350 if firstLineNumber < 1 {
351 firstLineNumber = 1
352 }
353 firstLineStart, _ := newlines.lineBounds(firstLineNumber)
354
355 chunkMatches = append(chunkMatches, ChunkMatch{
356 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
357 ContentStart: Location{
358 ByteOffset: firstLineStart,
359 LineNumber: uint32(firstLineNumber),
360 Column: 1,
361 },
362 FileName: false,
363 Ranges: ranges,
364 SymbolInfo: symbolInfo,
365 })
366 }
367 return chunkMatches
368}
369
370type candidateChunk struct {
371 candidates []*candidateMatch
372 firstLine uint32 // 1-based, inclusive
373 lastLine uint32 // 1-based, inclusive
374 minOffset uint32 // 0-based, inclusive
375 maxOffset uint32 // 0-based, exclusive
376}
377
378// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
379// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
380// it to overlap with an adjacent chunk.
381//
382// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
383// output invariants: if you flatten candidates the input invariant is retained.
384func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
385 var chunks []candidateChunk
386 for _, m := range ms {
387 startOffset := m.byteOffset
388 endOffset := m.byteOffset + m.byteMatchSz
389 firstLine, _, _ := newlines.atOffset(startOffset)
390 lastLine, _, _ := newlines.atOffset(endOffset)
391
392 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
393 // If a new chunk created with the current candidateMatch would
394 // overlap with the previous chunk, instead add the candidateMatch
395 // to the last chunk and extend end of the last chunk.
396 last := &chunks[len(chunks)-1]
397 last.candidates = append(last.candidates, m)
398 if last.maxOffset < endOffset {
399 last.lastLine = uint32(lastLine)
400 last.maxOffset = uint32(endOffset)
401 }
402 } else {
403 chunks = append(chunks, candidateChunk{
404 firstLine: uint32(firstLine),
405 lastLine: uint32(lastLine),
406 minOffset: startOffset,
407 maxOffset: endOffset,
408 candidates: []*candidateMatch{m},
409 })
410 }
411 }
412 return chunks
413}
414
415// columnHelper is a helper struct which caches the number of runes last
416// counted. If we naively use utf8.RuneCount for each match on a line, this
417// leads to an O(nm) algorithm where m is the number of matches and n is the
418// length of the line. Aassuming we our candidates are increasing in offset
419// makes this operation O(n) instead.
420type columnHelper struct {
421 data []byte
422
423 // 0 values for all these are valid values
424 lastLineOffset int
425 lastOffset uint32
426 lastRuneCount uint32
427}
428
429// get returns the line column for offset. offset is the byte offset of the
430// rune in data. lineOffset is the byte offset inside of data for the line
431// containing offset.
432func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
433 var runeCount uint32
434
435 if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
436 // Can count from last calculation
437 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
438 } else {
439 // Need to count from the beginning of line
440 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
441 }
442
443 c.lastLineOffset = lineOffset
444 c.lastOffset = offset
445 c.lastRuneCount = runeCount
446
447 return runeCount + 1
448}
449
450type newlines struct {
451 // locs is the sorted set of byte offsets of the newlines in the file
452 locs []uint32
453
454 // fileSize is just the number of bytes in the file. It is stored
455 // on this struct so we can safely know the length of the last line
456 // in the file since not all files end in a newline.
457 fileSize uint32
458}
459
460// atOffset returns the line containing the offset. If the offset lands on
461// the newline ending line M, we return M. The line is characterized
462// by its linenumber (base-1, byte index of line start, byte index of
463// line end). The line end is the index of a newline, or the filesize
464// (if matching the last line of the file.)
465func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
466 idx := sort.Search(len(nls.locs), func(n int) bool {
467 return nls.locs[n] >= offset
468 })
469
470 start, end := nls.lineBounds(idx + 1)
471 return idx + 1, int(start), int(end)
472}
473
474// lineBounds returns the byte offsets of the start and end of the 1-based
475// lineNumber. The end offset is exclusive and will not contain the line-ending
476// newline. If the line number is out of range of the lines in the file, start
477// and end will be clamped to [0,fileSize].
478func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
479 // nls.locs[0] + 1 is the start of the 2nd line of data.
480 startIdx := lineNumber - 2
481 endIdx := lineNumber - 1
482
483 if startIdx < 0 {
484 start = 0
485 } else if startIdx >= len(nls.locs) {
486 start = nls.fileSize
487 } else {
488 start = nls.locs[startIdx] + 1
489 }
490
491 if endIdx < 0 {
492 end = 0
493 } else if endIdx >= len(nls.locs) {
494 end = nls.fileSize
495 } else {
496 end = nls.locs[endIdx]
497 }
498
499 return start, end
500}
501
502// getLines returns a slice of data containing the lines [low, high).
503// low is 1-based and inclusive. high is 1-based and exclusive.
504func (nls newlines) getLines(data []byte, low, high int) []byte {
505 if low >= high {
506 return nil
507 }
508
509 lowStart, _ := nls.lineBounds(low)
510 _, highEnd := nls.lineBounds(high - 1)
511
512 // Drop any trailing newline. Editors do not treat a trailing newline as
513 // the start of a new line, so we should not either. lineBounds clamps to
514 // len(data) when an out-of-bounds line is requested.
515 //
516 // As an example, if we request lines 1-5 from a file with contents
517 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
518 // the three "lines" in the file, separated by newlines.
519 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
520 highEnd = highEnd - 1
521 lowStart = min(lowStart, highEnd)
522 }
523
524 return data[lowStart:highEnd]
525}
526
527const (
528 // Query-dependent scoring signals. All of these together are bounded at ~9000
529 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
530 scorePartialWordMatch = 50.0
531 scoreWordMatch = 500.0
532 scoreBase = 7000.0
533 scorePartialBase = 4000.0
534 scoreSymbol = 7000.0
535 scorePartialSymbol = 4000.0
536 scoreKindMatch = 100.0
537 scoreFactorAtomMatch = 400.0
538
539 // File-only scoring signals. For now these are also bounded ~9000 to give them
540 // equal weight with the query-dependent signals.
541 scoreFileRankFactor = 9000.0
542 scoreFileOrderFactor = 10.0
543 scoreRepoRankFactor = 20.0
544
545 // Used for ordering line and chunk matches within a file.
546 scoreLineOrderFactor = 1.0
547)
548
549// findSection checks whether a section defined by offset and size lies within
550// one of the sections in secs.
551func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
552 j := sort.Search(len(secs), func(i int) bool {
553 return secs[i].End >= off+sz
554 })
555
556 if j == len(secs) {
557 return 0, false
558 }
559
560 if secs[j].Start <= off && off+sz <= secs[j].End {
561 return j, true
562 }
563 return 0, false
564}
565
566func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
567 type debugScore struct {
568 what string
569 score float64
570 }
571
572 score := &debugScore{}
573 maxScore := &debugScore{}
574
575 addScore := func(what string, s float64) {
576 if s != 0 && debug {
577 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
578 }
579 score.score += s
580 }
581
582 data := p.data(m.FileName)
583 filename := p.data(true)
584
585 for i, r := range m.Ranges {
586 // calculate the start and end offset relative to the start of the content
587 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
588 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
589
590 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
591 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
592
593 score.score = 0
594 score.what = ""
595
596 if startBoundary && endBoundary {
597 addScore("WordMatch", scoreWordMatch)
598 } else if startBoundary || endBoundary {
599 addScore("PartialWordMatch", scorePartialWordMatch)
600 }
601
602 if m.FileName {
603 sep := bytes.LastIndexByte(m.Content, '/')
604 startMatch := relStartOffset == sep+1
605 endMatch := relEndOffset == len(m.Content)
606 if startMatch && endMatch {
607 addScore("Base", scoreBase)
608 } else if startMatch || endMatch {
609 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
610 } else if sep < relStartOffset {
611 addScore("InnerBase", scorePartialBase)
612 }
613 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
614 sec := secs[secIdx]
615 startMatch := sec.Start == uint32(r.Start.ByteOffset)
616 endMatch := sec.End == uint32(r.End.ByteOffset)
617 if startMatch && endMatch {
618 addScore("Symbol", scoreSymbol)
619 } else if startMatch || endMatch {
620 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
621 } else {
622 addScore("InnerSymbol", scorePartialSymbol)
623 }
624
625 var si *Symbol
626 if m.SymbolInfo != nil {
627 si = m.SymbolInfo[i]
628 }
629 if si == nil {
630 // for non-symbol queries, we need to hydrate in SymbolInfo.
631 start := p.id.fileEndSymbol[p.idx]
632 si = p.id.symbols.data(start + uint32(secIdx))
633 }
634 if si != nil {
635 symbolKind := ctags.ParseSymbolKind(si.Kind)
636 sym := sectionSlice(data, sec)
637 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
638 }
639 }
640
641 if score.score > maxScore.score {
642 maxScore.score = score.score
643 maxScore.what = score.what
644 }
645 }
646
647 if debug {
648 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
649 }
650
651 return maxScore.score, maxScore.what
652}
653
654func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
655 type debugScore struct {
656 what string
657 score float64
658 }
659
660 score := &debugScore{}
661 maxScore := &debugScore{}
662
663 addScore := func(what string, s float64) {
664 if s != 0 && debug {
665 score.what += fmt.Sprintf("%s:%.2f, ", what, s)
666 }
667 score.score += s
668 }
669
670 data := p.data(m.FileName)
671 filename := p.data(true)
672
673 for _, f := range m.LineFragments {
674 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
675
676 end := int(f.LineOffset) + f.MatchLength
677 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
678
679 score.score = 0
680 score.what = ""
681
682 if startBoundary && endBoundary {
683 addScore("WordMatch", scoreWordMatch)
684 } else if startBoundary || endBoundary {
685 addScore("PartialWordMatch", scorePartialWordMatch)
686 }
687
688 if m.FileName {
689 sep := bytes.LastIndexByte(m.Line, '/')
690 startMatch := sep+1 == f.LineOffset
691 endMatch := len(m.Line) == f.LineOffset+f.MatchLength
692 if startMatch && endMatch {
693 addScore("Base", scoreBase)
694 } else if startMatch || endMatch {
695 addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
696 } else if sep < f.LineOffset {
697 addScore("InnerBase", scorePartialBase)
698 }
699 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
700 sec := secs[secIdx]
701 startMatch := sec.Start == f.Offset
702 endMatch := sec.End == f.Offset+uint32(f.MatchLength)
703 if startMatch && endMatch {
704 addScore("Symbol", scoreSymbol)
705 } else if startMatch || endMatch {
706 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
707 } else {
708 addScore("InnerSymbol", scorePartialSymbol)
709 }
710
711 si := f.SymbolInfo
712 if si == nil {
713 // for non-symbol queries, we need to hydrate in SymbolInfo.
714 start := p.id.fileEndSymbol[p.idx]
715 si = p.id.symbols.data(start + uint32(secIdx))
716 }
717 if si != nil {
718 // the LineFragment may not be on a symbol, then si will be nil.
719 symbolKind := ctags.ParseSymbolKind(si.Kind)
720 sym := sectionSlice(data, sec)
721 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
722 }
723 }
724
725 if score.score > maxScore.score {
726 maxScore.score = score.score
727 maxScore.what = score.what
728 }
729 }
730
731 if debug {
732 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
733 }
734
735 return maxScore.score, maxScore.what
736}
737
738// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
739// End such that it won't be out of range.
740func sectionSlice(data []byte, sec DocumentSection) []byte {
741 l := uint32(len(data))
742 if sec.Start >= l {
743 return nil
744 }
745 if sec.End > l {
746 sec.End = l
747 }
748 return data[sec.Start:sec.End]
749}
750
751// scoreSymbolKind boosts a match based on the combination of language, symbol
752// and kind. The language string comes from go-enry, the symbol and kind from
753// ctags.
754func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
755 var factor float64
756
757 // Generic ranking which will be overriden by language specific ranking
758 switch kind {
759 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
760 factor = 8
761 case ctags.Class:
762 factor = 10
763 case ctags.Struct:
764 factor = 9.5
765 case ctags.Enum:
766 factor = 9
767 case ctags.Interface:
768 factor = 8
769 case ctags.Function, ctags.Method:
770 factor = 7
771 case ctags.Field:
772 factor = 5.5
773 case ctags.Constant:
774 factor = 5
775 case ctags.Variable:
776 factor = 4
777 default:
778 // For all other kinds, assign a low score by default.
779 factor = 1
780 }
781
782 switch language {
783 case "Java", "java":
784 switch kind {
785 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
786 // to "classes" instead of "c". We have to cover both cases to support existing
787 // indexes.
788 case ctags.Class:
789 factor = 10
790 case ctags.Enum:
791 factor = 9
792 case ctags.Interface:
793 factor = 8
794 case ctags.Method:
795 factor = 7
796 case ctags.Field:
797 factor = 6
798 case ctags.EnumConstant:
799 factor = 5
800 }
801 case "Kotlin", "kotlin":
802 switch kind {
803 case ctags.Class:
804 factor = 10
805 case ctags.Interface:
806 factor = 9
807 case ctags.Method:
808 factor = 8
809 case ctags.TypeAlias:
810 factor = 7
811 case ctags.Constant:
812 factor = 6
813 case ctags.Variable:
814 factor = 5
815 }
816 case "Go", "go":
817 switch kind {
818 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
819 // for each case a description of the fields in ctags in the comment
820 case ctags.Type: // interface struct talias
821 factor = 9
822 case ctags.Interface: // interfaces
823 factor = 10
824 case ctags.Struct: // structs
825 factor = 9
826 case ctags.TypeAlias: // type aliases
827 factor = 9
828 case ctags.MethodSpec: // interface method specification
829 factor = 8.5
830 case ctags.Method, ctags.Function: // functions
831 factor = 8
832 case ctags.Field: // struct fields
833 factor = 7
834 case ctags.Constant: // constants
835 factor = 6
836 case ctags.Variable: // variables
837 factor = 5
838 }
839
840 // Boost exported go symbols. Same implementation as token.IsExported
841 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
842 factor += 0.5
843 }
844
845 if bytes.HasSuffix(filename, []byte("_test.go")) {
846 factor *= 0.8
847 }
848
849 // Could also rank on:
850 //
851 // - anonMember struct anonymous members
852 // - packageName name for specifying imported package
853 // - receiver receivers
854 // - package packages
855 // - type types
856 // - unknown unknown
857 case "C++", "c++":
858 switch kind {
859 case ctags.Class: // classes
860 factor = 10
861 case ctags.Enum: // enumeration names
862 factor = 9
863 case ctags.Function: // function definitions
864 factor = 8
865 case ctags.Struct: // structure names
866 factor = 7
867 case ctags.Union: // union names
868 factor = 6
869 case ctags.TypeAlias: // typedefs
870 factor = 5
871 case ctags.Field: // class, struct, and union members
872 factor = 4
873 case ctags.Variable: // varialbe definitions
874 factor = 3
875 }
876 // Could also rank on:
877 // NAME DESCRIPTION
878 // macro macro definitions
879 // enumerator enumerators (values inside an enumeration)
880 // header included header files
881 // namespace namespaces
882 // variable variable definitions
883 case "Scala", "scala":
884 switch kind {
885 case ctags.Class:
886 factor = 10
887 case ctags.Interface:
888 factor = 9
889 case ctags.Object:
890 factor = 8
891 case ctags.Function:
892 factor = 7
893 case ctags.Type:
894 factor = 6
895 case ctags.Variable:
896 factor = 5
897 case ctags.Package:
898 factor = 4
899 }
900 case "Python", "python":
901 switch kind {
902 case ctags.Class: // classes
903 factor = 10
904 case ctags.Function, ctags.Method: // function definitions
905 factor = 8
906 case ctags.Field: // class, struct, and union members
907 factor = 4
908 case ctags.Variable: // variable definitions
909 factor = 3
910 case ctags.Local: // local variables
911 factor = 2
912 }
913 // Could also rank on:
914 //
915 // - namespace name referring a module defined in other file
916 // - module modules
917 // - unknown name referring a class/variable/function/module defined in other module
918 // - parameter function parameters
919 case "Ruby", "ruby":
920 switch kind {
921 case ctags.Class:
922 factor = 10
923 case ctags.Method:
924 factor = 9
925 case ctags.MethodAlias:
926 factor = 8
927 case ctags.Module:
928 factor = 7
929 case ctags.SingletonMethod:
930 factor = 6
931 case ctags.Constant:
932 factor = 5
933 case ctags.Accessor:
934 factor = 4
935 case ctags.Library:
936 factor = 3
937 }
938 case "PHP", "php":
939 switch kind {
940 case ctags.Class:
941 factor = 10
942 case ctags.Interface:
943 factor = 9
944 case ctags.Function:
945 factor = 8
946 case ctags.Trait:
947 factor = 7
948 case ctags.Define:
949 factor = 6
950 case ctags.Namespace:
951 factor = 5
952 case ctags.MethodAlias:
953 factor = 4
954 case ctags.Variable:
955 factor = 3
956 case ctags.Local:
957 factor = 3
958 }
959 case "GraphQL", "graphql":
960 switch kind {
961 case ctags.Type:
962 factor = 10
963 }
964 case "Markdown", "markdown":
965 // Headers are good signal in docs, but do not rank as highly as code.
966 switch kind {
967 case ctags.Chapter: // #
968 factor = 4
969 case ctags.Section: // ##
970 factor = 3
971 case ctags.Subsection: // ###
972 factor = 2
973 }
974 }
975
976 return factor * scoreKindMatch
977}
978
979type matchScoreSlice []LineMatch
980
981func (m matchScoreSlice) Len() int { return len(m) }
982func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
983func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
984
985type chunkMatchScoreSlice []ChunkMatch
986
987func (m chunkMatchScoreSlice) Len() int { return len(m) }
988func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
989func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
990
991type fileMatchesByScore []FileMatch
992
993func (m fileMatchesByScore) Len() int { return len(m) }
994func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
995func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
996
997func sortMatchesByScore(ms []LineMatch) {
998 sort.Sort(matchScoreSlice(ms))
999}
1000
1001func sortChunkMatchesByScore(ms []ChunkMatch) {
1002 sort.Sort(chunkMatchScoreSlice(ms))
1003}
1004
1005// SortFiles sorts files matches in the order we want to present results to
1006// users. The order depends on the match score, which includes both
1007// query-dependent signals like word overlap, and file-only signals like the
1008// file ranks (if file ranks are enabled).
1009//
1010// We don't only use the scores, we will also boost some results to present
1011// files with novel extensions.
1012func SortFiles(ms []FileMatch) {
1013 sort.Sort(fileMatchesByScore(ms))
1014
1015 // Boost a file extension not in the top 3 to the third filematch.
1016 boostNovelExtension(ms, 2, 0.9)
1017}
1018
1019func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1020 if len(ms) <= boostOffset+1 {
1021 return
1022 }
1023
1024 top := ms[:boostOffset]
1025 candidates := ms[boostOffset:]
1026
1027 // Don't bother boosting something which is significantly different to the
1028 // result it replaces.
1029 minScoreForNovelty := candidates[0].Score * minScoreRatio
1030
1031 // We want to look for an ext that isn't in the top exts
1032 exts := make([]string, len(top))
1033 for i := range top {
1034 exts[i] = path.Ext(top[i].FileName)
1035 }
1036
1037 for i := range candidates {
1038 // Do not assume sorted due to boostNovelExtension being called on subsets
1039 if candidates[i].Score < minScoreForNovelty {
1040 continue
1041 }
1042
1043 if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1044 continue
1045 }
1046
1047 // Found what we are looking for, now boost to front of candidates (which
1048 // is ms[boostOffset])
1049 for ; i > 0; i-- {
1050 candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1051 }
1052 return
1053 }
1054}