contentprovider.go at b08be72e4cdbd7b53a0f06750c9f50bf25df8a8f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at b08be72e4cdbd7b53a0f06750c9f50bf25df8a8f 28 kB View raw
Keegan Carruthers-Smith score: remove unused contentProvider.matchScore (#725) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"log"
 21	"path"
 22	"sort"
 23	"strings"
 24	"unicode"
 25	"unicode/utf8"
 26
 27	"github.com/sourcegraph/zoekt/ctags"
 28	"golang.org/x/exp/slices"
 29)
 30
 31var _ = log.Println
 32
 33// contentProvider is an abstraction to treat matches for names and
 34// content with the same code.
 35type contentProvider struct {
 36	id    *indexData
 37	stats *Stats
 38
 39	// mutable
 40	err      error
 41	idx      uint32
 42	_data    []byte
 43	_nl      []uint32
 44	_nlBuf   []uint32
 45	_sects   []DocumentSection
 46	_sectBuf []DocumentSection
 47	fileSize uint32
 48}
 49
 50// setDocument skips to the given document.
 51func (p *contentProvider) setDocument(docID uint32) {
 52	fileStart := p.id.boundaries[docID]
 53
 54	p.idx = docID
 55	p.fileSize = p.id.boundaries[docID+1] - fileStart
 56
 57	p._nl = nil
 58	p._sects = nil
 59	p._data = nil
 60}
 61
 62func (p *contentProvider) docSections() []DocumentSection {
 63	if p._sects == nil {
 64		var sz uint32
 65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 66		p.stats.ContentBytesLoaded += int64(sz)
 67		p._sectBuf = p._sects
 68	}
 69	return p._sects
 70}
 71
 72func (p *contentProvider) newlines() newlines {
 73	if p._nl == nil {
 74		var sz uint32
 75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 76		p._nlBuf = p._nl
 77		p.stats.ContentBytesLoaded += int64(sz)
 78	}
 79	return newlines{locs: p._nl, fileSize: p.fileSize}
 80}
 81
 82func (p *contentProvider) data(fileName bool) []byte {
 83	if fileName {
 84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 85	}
 86
 87	if p._data == nil {
 88		p._data, p.err = p.id.readContents(p.idx)
 89		p.stats.FilesLoaded++
 90		p.stats.ContentBytesLoaded += int64(len(p._data))
 91	}
 92	return p._data
 93}
 94
 95// Find offset in bytes (relative to corpus start) for an offset in
 96// runes (relative to document start). If filename is set, the corpus
 97// is the set of filenames, with the document being the name itself.
 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 99	if p.id.metaData.PlainASCII {
100		return r
101	}
102
103	sample := p.id.runeOffsets
104	runeEnds := p.id.fileEndRunes
105	fileStartByte := p.id.boundaries[p.idx]
106	if filename {
107		sample = p.id.fileNameRuneOffsets
108		runeEnds = p.id.fileNameEndRunes
109		fileStartByte = p.id.fileNameIndex[p.idx]
110	}
111
112	absR := r
113	if p.idx > 0 {
114		absR += runeEnds[p.idx-1]
115	}
116
117	byteOff, left := sample.lookup(absR)
118
119	var data []byte
120
121	if filename {
122		data = p.id.fileNameContent[byteOff:]
123	} else {
124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
125		if p.err != nil {
126			return 0
127		}
128	}
129	for left > 0 {
130		_, sz := utf8.DecodeRune(data)
131		byteOff += uint32(sz)
132		data = data[sz:]
133		left--
134	}
135
136	byteOff -= fileStartByte
137	return byteOff
138}
139
140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
141	var result []LineMatch
142	if ms[0].fileName {
143		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
144
145		// There is only "line" in a filename.
146		res := LineMatch{
147			Line:     p.id.fileName(p.idx),
148			FileName: true,
149
150			Score:      score,
151			DebugScore: debugScore,
152		}
153
154		for _, m := range ms {
155			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
156				LineOffset:  int(m.byteOffset),
157				MatchLength: int(m.byteMatchSz),
158				Offset:      m.byteOffset,
159			})
160
161			result = []LineMatch{res}
162		}
163	} else {
164		ms = breakMatchesOnNewlines(ms, p.data(false))
165		result = p.fillContentMatches(ms, numContextLines, language, debug)
166	}
167
168	return result
169}
170
171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
172//
173// Performance invariant: ms is sorted and non-overlapping.
174//
175// Note: the byte slices may be backed by mmapped data, so before being
176// returned by the API it needs to be copied.
177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
178	var result []ChunkMatch
179	if ms[0].fileName {
180		// If the first match is a filename match, there will only be
181		// one match and the matched content will be the filename.
182
183		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
184
185		fileName := p.id.fileName(p.idx)
186		ranges := make([]Range, 0, len(ms))
187		for _, m := range ms {
188			ranges = append(ranges, Range{
189				Start: Location{
190					ByteOffset: m.byteOffset,
191					LineNumber: 1,
192					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
193				},
194				End: Location{
195					ByteOffset: m.byteOffset + m.byteMatchSz,
196					LineNumber: 1,
197					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
198				},
199			})
200		}
201
202		result = []ChunkMatch{{
203			Content:      fileName,
204			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
205			Ranges:       ranges,
206			FileName:     true,
207
208			Score:      score,
209			DebugScore: debugScore,
210		}}
211	} else {
212		result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
213	}
214
215	return result
216}
217
218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
219	var result []LineMatch
220	for len(ms) > 0 {
221		m := ms[0]
222		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
223
224		var lineCands []*candidateMatch
225
226		endMatch := m.byteOffset + m.byteMatchSz
227
228		for len(ms) > 0 {
229			m := ms[0]
230			if int(m.byteOffset) <= lineEnd {
231				endMatch = m.byteOffset + m.byteMatchSz
232				lineCands = append(lineCands, m)
233				ms = ms[1:]
234			} else {
235				break
236			}
237		}
238
239		if len(lineCands) == 0 {
240			log.Panicf(
241				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
242				p.id.fileName(p.idx), p.id.metaData,
243				num, lineStart, lineEnd,
244				m.byteOffset)
245		}
246
247		data := p.data(false)
248
249		// Due to merging matches, we may have a match that
250		// crosses a line boundary. Prevent confusion by
251		// taking lines until we pass the last match
252		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
253			next := bytes.IndexByte(data[lineEnd+1:], '\n')
254			if next == -1 {
255				lineEnd = len(data)
256			} else {
257				// TODO(hanwen): test that checks "+1" part here.
258				lineEnd += next + 1
259			}
260		}
261
262		finalMatch := LineMatch{
263			LineStart:  lineStart,
264			LineEnd:    lineEnd,
265			LineNumber: num,
266		}
267		finalMatch.Line = data[lineStart:lineEnd]
268
269		if numContextLines > 0 {
270			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
271			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
272		}
273
274		score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
275		finalMatch.Score = score
276		finalMatch.DebugScore = debugScore
277
278		for i, m := range lineCands {
279			fragment := LineFragmentMatch{
280				Offset:      m.byteOffset,
281				LineOffset:  int(m.byteOffset) - lineStart,
282				MatchLength: int(m.byteMatchSz),
283			}
284			if i < len(symbolInfo) && symbolInfo[i] != nil {
285				fragment.SymbolInfo = symbolInfo[i]
286			}
287
288			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
289		}
290		result = append(result, finalMatch)
291	}
292	return result
293}
294
295func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
296	newlines := p.newlines()
297	data := p.data(false)
298
299	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
300	// However, it depends on ms being sorted by byteOffset and non-overlapping.
301	// This invariant is true at the time of writing, but we conservatively
302	// enforce this. Note: chunkCandidates preserves the sorting so safe to
303	// transform now.
304	columnHelper := columnHelper{data: data}
305	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
306		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
307		sort.Sort((sortByOffsetSlice)(ms))
308	}
309
310	chunks := chunkCandidates(ms, newlines, numContextLines)
311	chunkMatches := make([]ChunkMatch, 0, len(chunks))
312	for _, chunk := range chunks {
313		score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
314
315		ranges := make([]Range, 0, len(chunk.candidates))
316		for _, cm := range chunk.candidates {
317			startOffset := cm.byteOffset
318			endOffset := cm.byteOffset + cm.byteMatchSz
319			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
320			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
321
322			ranges = append(ranges, Range{
323				Start: Location{
324					ByteOffset: startOffset,
325					LineNumber: uint32(startLine),
326					Column:     columnHelper.get(startLineOffset, startOffset),
327				},
328				End: Location{
329					ByteOffset: endOffset,
330					LineNumber: uint32(endLine),
331					Column:     columnHelper.get(endLineOffset, endOffset),
332				},
333			})
334		}
335
336		firstLineNumber := int(chunk.firstLine) - numContextLines
337		if firstLineNumber < 1 {
338			firstLineNumber = 1
339		}
340		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
341
342		chunkMatches = append(chunkMatches, ChunkMatch{
343			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
344			ContentStart: Location{
345				ByteOffset: firstLineStart,
346				LineNumber: uint32(firstLineNumber),
347				Column:     1,
348			},
349			FileName:   false,
350			Ranges:     ranges,
351			SymbolInfo: symbolInfo,
352			Score:      score,
353			DebugScore: debugScore,
354		})
355	}
356	return chunkMatches
357}
358
359type candidateChunk struct {
360	candidates []*candidateMatch
361	firstLine  uint32 // 1-based, inclusive
362	lastLine   uint32 // 1-based, inclusive
363	minOffset  uint32 // 0-based, inclusive
364	maxOffset  uint32 // 0-based, exclusive
365}
366
367// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
368// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
369// it to overlap with an adjacent chunk.
370//
371// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
372// output invariants: if you flatten candidates the input invariant is retained.
373func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
374	var chunks []candidateChunk
375	for _, m := range ms {
376		startOffset := m.byteOffset
377		endOffset := m.byteOffset + m.byteMatchSz
378		firstLine, _, _ := newlines.atOffset(startOffset)
379		lastLine, _, _ := newlines.atOffset(endOffset)
380
381		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
382			// If a new chunk created with the current candidateMatch would
383			// overlap with the previous chunk, instead add the candidateMatch
384			// to the last chunk and extend end of the last chunk.
385			last := &chunks[len(chunks)-1]
386			last.candidates = append(last.candidates, m)
387			if last.maxOffset < endOffset {
388				last.lastLine = uint32(lastLine)
389				last.maxOffset = uint32(endOffset)
390			}
391		} else {
392			chunks = append(chunks, candidateChunk{
393				firstLine:  uint32(firstLine),
394				lastLine:   uint32(lastLine),
395				minOffset:  startOffset,
396				maxOffset:  endOffset,
397				candidates: []*candidateMatch{m},
398			})
399		}
400	}
401	return chunks
402}
403
// columnHelper computes 1-based rune columns while remembering the last
// position it counted up to. Calling utf8.RuneCount from the start of the
// line for every match is O(nm) for m matches on a line of length n.
// Assuming the requested offsets are increasing, continuing from the
// remembered position makes the whole sequence O(n).
type columnHelper struct {
	data []byte

	// State of the previous get call. The zero value of each field is a
	// valid starting state.
	lastLineOffset int
	lastOffset     uint32
	lastRuneCount  uint32
}

// get returns the 1-based column of the rune at byte offset `offset` in
// data. lineOffset is the byte offset in data at which the line containing
// offset starts.
func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
	var (
		pending []byte // bytes whose runes still have to be counted
		counted uint32 // runes already accounted for
	)

	switch {
	case lineOffset == c.lastLineOffset && offset >= c.lastOffset:
		// Same line and not moving backwards: resume from the last result.
		pending = c.data[c.lastOffset:offset]
		counted = c.lastRuneCount
	default:
		// Different line, or moving backwards: count from the line start.
		pending = c.data[lineOffset:offset]
	}

	runes := counted + uint32(utf8.RuneCount(pending))

	c.lastLineOffset, c.lastOffset, c.lastRuneCount = lineOffset, offset, runes

	return runes + 1
}
438
// newlines describes where the line breaks of a single file are.
type newlines struct {
	// locs holds the byte offset of every newline in the file, in
	// ascending order.
	locs []uint32

	// fileSize is the number of bytes in the file. It supplies the end of
	// the last line, since not every file ends in a newline.
	fileSize uint32
}

// atOffset returns the line containing offset. An offset that lands on the
// newline terminating line M belongs to line M. The line is described by its
// 1-based number, the byte index where it starts and the byte index where it
// ends; the end is the index of its newline, or the file size for the last
// line of the file.
func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
	// The number of newlines strictly before offset is the 0-based line.
	preceding := sort.Search(len(nls.locs), func(i int) bool {
		return offset <= nls.locs[i]
	})
	lineNumber = preceding + 1

	start, end := nls.lineBounds(lineNumber)
	return lineNumber, int(start), int(end)
}

// lineBounds returns the byte offsets at which the 1-based lineNumber starts
// and ends. The end is exclusive and does not include the line's newline.
// Line numbers outside the file are clamped, giving offsets in
// [0, fileSize].
func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
	count := len(nls.locs)

	// A line starts one byte after the newline that ends the previous line;
	// nls.locs[0]+1 is therefore the start of line 2.
	switch prev := lineNumber - 2; {
	case prev < 0:
		start = 0
	case prev >= count:
		start = nls.fileSize
	default:
		start = nls.locs[prev] + 1
	}

	// A line ends at its own newline.
	switch own := lineNumber - 1; {
	case own < 0:
		end = 0
	case own >= count:
		end = nls.fileSize
	default:
		end = nls.locs[own]
	}

	return start, end
}

// getLines returns the part of data holding lines [low, high), where both
// line numbers are 1-based, low is inclusive and high is exclusive. It
// returns nil when the range is empty.
func (nls newlines) getLines(data []byte, low, high int) []byte {
	if low >= high {
		return nil
	}

	from, _ := nls.lineBounds(low)
	_, to := nls.lineBounds(high - 1)

	// A trailing newline does not open another line (editors do not show one
	// either), so it is trimmed when the range runs to the end of data.
	// lineBounds clamps to the file size for out-of-range lines.
	//
	// Example: asking for lines 1-5 of "one\ntwo\nthree\n" returns
	// "one\ntwo\nthree", the three lines separated by newlines.
	if to == uint32(len(data)) && len(data) > 0 && data[len(data)-1] == '\n' {
		to--
		if from > to {
			from = to
		}
	}

	return data[from:to]
}
515
const (
	// Query-dependent scoring signals. Taken together they are bounded at
	// roughly 9000
	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).

	// Match touches a byte-class boundary on one side only.
	scorePartialWordMatch = 50.0
	// Match is delimited by byte-class boundaries on both sides.
	scoreWordMatch = 500.0
	// File-name match covering the whole base name.
	scoreBase = 7000.0
	// File-name match lying inside the base name.
	scorePartialBase = 4000.0
	// Content match covering a whole symbol section.
	scoreSymbol = 7000.0
	// Content match lying inside a symbol section.
	scorePartialSymbol = 4000.0
	// Multiplier applied to the per-kind factor of a matched symbol.
	scoreKindMatch       = 100.0
	scoreFactorAtomMatch = 400.0

	// File-only scoring signals. For now these are also bounded at roughly
	// 9000 so they weigh the same as the query-dependent signals.
	scoreFileRankFactor  = 9000.0
	scoreFileOrderFactor = 10.0
	scoreRepoRankFactor  = 20.0

	// Used for ordering line and chunk matches within a file.
	scoreLineOrderFactor = 1.0
)
537
538// findSection checks whether a section defined by offset and size lies within
539// one of the sections in secs.
540func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
541	j := sort.Search(len(secs), func(i int) bool {
542		return secs[i].End >= off+sz
543	})
544
545	if j == len(secs) {
546		return 0, false
547	}
548
549	if secs[j].Start <= off && off+sz <= secs[j].End {
550		return uint32(j), true
551	}
552	return 0, false
553}
554
555func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
556	if cm.fileName {
557		return DocumentSection{}, nil, false
558	}
559
560	secs := p.docSections()
561
562	secIdx, ok := cm.symbolIdx, cm.symbol
563	if !ok {
564		// Not from a symbol matchtree. Lets see if it intersects with a symbol.
565		secIdx, ok = findSection(secs, cm.byteOffset, cm.byteMatchSz)
566	}
567	if !ok {
568		return DocumentSection{}, nil, false
569	}
570
571	sec := secs[secIdx]
572
573	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
574	// since some callsites do not need it stored, and that incurs an extra
575	// copy.
576	//
577	// 2024-01-08 we are refactoring this and the code path indicates this can
578	// fail, so callers need to handle nil symbol. However, it would be
579	// surprising that we have a matching section but not symbol data.
580	start := p.id.fileEndSymbol[p.idx]
581	si := p.id.symbols.data(start + secIdx)
582
583	return sec, si, true
584}
585
586func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
587	type debugScore struct {
588		what  string
589		score float64
590	}
591
592	score := &debugScore{}
593	maxScore := &debugScore{}
594
595	addScore := func(what string, s float64) {
596		if s != 0 && debug {
597			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
598		}
599		score.score += s
600	}
601
602	filename := p.data(true)
603	var symbolInfo []*Symbol
604
605	for i, m := range ms {
606		data := p.data(m.fileName)
607
608		endOffset := m.byteOffset + m.byteMatchSz
609		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
610		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
611
612		score.score = 0
613		score.what = ""
614
615		if startBoundary && endBoundary {
616			addScore("WordMatch", scoreWordMatch)
617		} else if startBoundary || endBoundary {
618			addScore("PartialWordMatch", scorePartialWordMatch)
619		}
620
621		if m.fileName {
622			sep := bytes.LastIndexByte(data, '/')
623			startMatch := int(m.byteOffset) == sep+1
624			endMatch := endOffset == uint32(len(data))
625			if startMatch && endMatch {
626				addScore("Base", scoreBase)
627			} else if startMatch || endMatch {
628				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
629			} else if sep < int(m.byteOffset) {
630				addScore("InnerBase", scorePartialBase)
631			}
632		} else if sec, si, ok := p.findSymbol(m); ok {
633			startMatch := sec.Start == m.byteOffset
634			endMatch := sec.End == endOffset
635			if startMatch && endMatch {
636				addScore("Symbol", scoreSymbol)
637			} else if startMatch || endMatch {
638				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
639			} else {
640				addScore("InnerSymbol", scorePartialSymbol)
641			}
642
643			// Score based on symbol data
644			if si != nil {
645				symbolKind := ctags.ParseSymbolKind(si.Kind)
646				sym := sectionSlice(data, sec)
647
648				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
649
650				// This is from a symbol tree, so we need to store the symbol
651				// information.
652				if m.symbol {
653					if symbolInfo == nil {
654						symbolInfo = make([]*Symbol, len(ms))
655					}
656					// findSymbols does not hydrate in Sym. So we need to store it.
657					si.Sym = string(sym)
658					symbolInfo[i] = si
659				}
660			}
661		}
662
663		if score.score > maxScore.score {
664			maxScore.score = score.score
665			maxScore.what = score.what
666		}
667	}
668
669	if debug {
670		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
671	}
672
673	return maxScore.score, maxScore.what, symbolInfo
674}
675
676// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
677// End such that it won't be out of range.
678func sectionSlice(data []byte, sec DocumentSection) []byte {
679	l := uint32(len(data))
680	if sec.Start >= l {
681		return nil
682	}
683	if sec.End > l {
684		sec.End = l
685	}
686	return data[sec.Start:sec.End]
687}
688
689// scoreSymbolKind boosts a match based on the combination of language, symbol
690// and kind. The language string comes from go-enry, the symbol and kind from
691// ctags.
692func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
693	var factor float64
694
695	// Generic ranking which will be overriden by language specific ranking
696	switch kind {
697	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
698		factor = 8
699	case ctags.Class:
700		factor = 10
701	case ctags.Struct:
702		factor = 9.5
703	case ctags.Enum:
704		factor = 9
705	case ctags.Interface:
706		factor = 8
707	case ctags.Function, ctags.Method:
708		factor = 7
709	case ctags.Field:
710		factor = 5.5
711	case ctags.Constant:
712		factor = 5
713	case ctags.Variable:
714		factor = 4
715	default:
716		// For all other kinds, assign a low score by default.
717		factor = 1
718	}
719
720	switch language {
721	case "Java", "java":
722		switch kind {
723		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
724		// to "classes" instead of "c". We have to cover both cases to support existing
725		// indexes.
726		case ctags.Class:
727			factor = 10
728		case ctags.Enum:
729			factor = 9
730		case ctags.Interface:
731			factor = 8
732		case ctags.Method:
733			factor = 7
734		case ctags.Field:
735			factor = 6
736		case ctags.EnumConstant:
737			factor = 5
738		}
739	case "Kotlin", "kotlin":
740		switch kind {
741		case ctags.Class:
742			factor = 10
743		case ctags.Interface:
744			factor = 9
745		case ctags.Method:
746			factor = 8
747		case ctags.TypeAlias:
748			factor = 7
749		case ctags.Constant:
750			factor = 6
751		case ctags.Variable:
752			factor = 5
753		}
754	case "Go", "go":
755		switch kind {
756		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
757		// for each case a description of the fields in ctags in the comment
758		case ctags.Type: // interface struct talias
759			factor = 9
760		case ctags.Interface: // interfaces
761			factor = 10
762		case ctags.Struct: // structs
763			factor = 9
764		case ctags.TypeAlias: // type aliases
765			factor = 9
766		case ctags.MethodSpec: // interface method specification
767			factor = 8.5
768		case ctags.Method, ctags.Function: // functions
769			factor = 8
770		case ctags.Field: // struct fields
771			factor = 7
772		case ctags.Constant: // constants
773			factor = 6
774		case ctags.Variable: // variables
775			factor = 5
776		}
777
778		// Boost exported go symbols. Same implementation as token.IsExported
779		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
780			factor += 0.5
781		}
782
783		if bytes.HasSuffix(filename, []byte("_test.go")) {
784			factor *= 0.8
785		}
786
787		// Could also rank on:
788		//
789		//   - anonMember  struct anonymous members
790		//   - packageName name for specifying imported package
791		//   - receiver    receivers
792		//   - package     packages
793		//   - type        types
794		//   - unknown     unknown
795	case "C++", "c++":
796		switch kind {
797		case ctags.Class: // classes
798			factor = 10
799		case ctags.Enum: // enumeration names
800			factor = 9
801		case ctags.Function: // function definitions
802			factor = 8
803		case ctags.Struct: // structure names
804			factor = 7
805		case ctags.Union: // union names
806			factor = 6
807		case ctags.TypeAlias: // typedefs
808			factor = 5
809		case ctags.Field: // class, struct, and union members
810			factor = 4
811		case ctags.Variable: // varialbe definitions
812			factor = 3
813		}
814	// Could also rank on:
815	// NAME        DESCRIPTION
816	// macro       macro definitions
817	// enumerator  enumerators (values inside an enumeration)
818	// header      included header files
819	// namespace   namespaces
820	// variable    variable definitions
821	case "Scala", "scala":
822		switch kind {
823		case ctags.Class:
824			factor = 10
825		case ctags.Interface:
826			factor = 9
827		case ctags.Object:
828			factor = 8
829		case ctags.Function:
830			factor = 7
831		case ctags.Type:
832			factor = 6
833		case ctags.Variable:
834			factor = 5
835		case ctags.Package:
836			factor = 4
837		}
838	case "Python", "python":
839		switch kind {
840		case ctags.Class: // classes
841			factor = 10
842		case ctags.Function, ctags.Method: // function definitions
843			factor = 8
844		case ctags.Field: // class, struct, and union members
845			factor = 4
846		case ctags.Variable: // variable definitions
847			factor = 3
848		case ctags.Local: // local variables
849			factor = 2
850		}
851		// Could also rank on:
852		//
853		//   - namespace name referring a module defined in other file
854		//   - module    modules
855		//   - unknown   name referring a class/variable/function/module defined in other module
856		//   - parameter function parameters
857	case "Ruby", "ruby":
858		switch kind {
859		case ctags.Class:
860			factor = 10
861		case ctags.Method:
862			factor = 9
863		case ctags.MethodAlias:
864			factor = 8
865		case ctags.Module:
866			factor = 7
867		case ctags.SingletonMethod:
868			factor = 6
869		case ctags.Constant:
870			factor = 5
871		case ctags.Accessor:
872			factor = 4
873		case ctags.Library:
874			factor = 3
875		}
876	case "PHP", "php":
877		switch kind {
878		case ctags.Class:
879			factor = 10
880		case ctags.Interface:
881			factor = 9
882		case ctags.Function:
883			factor = 8
884		case ctags.Trait:
885			factor = 7
886		case ctags.Define:
887			factor = 6
888		case ctags.Namespace:
889			factor = 5
890		case ctags.MethodAlias:
891			factor = 4
892		case ctags.Variable:
893			factor = 3
894		case ctags.Local:
895			factor = 3
896		}
897	case "GraphQL", "graphql":
898		switch kind {
899		case ctags.Type:
900			factor = 10
901		}
902	case "Markdown", "markdown":
903		// Headers are good signal in docs, but do not rank as highly as code.
904		switch kind {
905		case ctags.Chapter: // #
906			factor = 4
907		case ctags.Section: // ##
908			factor = 3
909		case ctags.Subsection: // ###
910			factor = 2
911		}
912	}
913
914	return factor * scoreKindMatch
915}
916
917type matchScoreSlice []LineMatch
918
919func (m matchScoreSlice) Len() int           { return len(m) }
920func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
921func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
922
923type chunkMatchScoreSlice []ChunkMatch
924
925func (m chunkMatchScoreSlice) Len() int           { return len(m) }
926func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
927func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
928
929type fileMatchesByScore []FileMatch
930
931func (m fileMatchesByScore) Len() int           { return len(m) }
932func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
933func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
934
935func sortMatchesByScore(ms []LineMatch) {
936	sort.Sort(matchScoreSlice(ms))
937}
938
939func sortChunkMatchesByScore(ms []ChunkMatch) {
940	sort.Sort(chunkMatchScoreSlice(ms))
941}
942
943// SortFiles sorts files matches in the order we want to present results to
944// users. The order depends on the match score, which includes both
945// query-dependent signals like word overlap, and file-only signals like the
946// file ranks (if file ranks are enabled).
947//
948// We don't only use the scores, we will also boost some results to present
949// files with novel extensions.
950func SortFiles(ms []FileMatch) {
951	sort.Sort(fileMatchesByScore(ms))
952
953	// Boost a file extension not in the top 3 to the third filematch.
954	boostNovelExtension(ms, 2, 0.9)
955}
956
957func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
958	if len(ms) <= boostOffset+1 {
959		return
960	}
961
962	top := ms[:boostOffset]
963	candidates := ms[boostOffset:]
964
965	// Don't bother boosting something which is significantly different to the
966	// result it replaces.
967	minScoreForNovelty := candidates[0].Score * minScoreRatio
968
969	// We want to look for an ext that isn't in the top exts
970	exts := make([]string, len(top))
971	for i := range top {
972		exts[i] = path.Ext(top[i].FileName)
973	}
974
975	for i := range candidates {
976		// Do not assume sorted due to boostNovelExtension being called on subsets
977		if candidates[i].Score < minScoreForNovelty {
978			continue
979		}
980
981		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
982			continue
983		}
984
985		// Found what we are looking for, now boost to front of candidates (which
986		// is ms[boostOffset])
987		for ; i > 0; i-- {
988			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
989		}
990		return
991	}
992}
Configure Feed

Configure Feed