contentprovider.go at 68752812bea9d10e2a5db284773d59dda6f4df60 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 68752812bea9d10e2a5db284773d59dda6f4df60 28 kB View raw
Julie Tibshirani Support BM25 scoring for chunk matches (#889) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"log"
 20	"path"
 21	"slices"
 22	"sort"
 23	"unicode"
 24	"unicode/utf8"
 25
 26	"github.com/sourcegraph/zoekt/ctags"
 27)
 28
 29var _ = log.Println
 30
 31// contentProvider is an abstraction to treat matches for names and
 32// content with the same code.
 33type contentProvider struct {
 34	id    *indexData
 35	stats *Stats
 36
 37	// mutable
 38	err      error
 39	idx      uint32
 40	_data    []byte
 41	_nl      []uint32
 42	_nlBuf   []uint32
 43	_sects   []DocumentSection
 44	_sectBuf []DocumentSection
 45	fileSize uint32
 46}
 47
 48// setDocument skips to the given document.
 49func (p *contentProvider) setDocument(docID uint32) {
 50	fileStart := p.id.boundaries[docID]
 51
 52	p.idx = docID
 53	p.fileSize = p.id.boundaries[docID+1] - fileStart
 54
 55	p._nl = nil
 56	p._sects = nil
 57	p._data = nil
 58}
 59
 60func (p *contentProvider) docSections() []DocumentSection {
 61	if p._sects == nil {
 62		var sz uint32
 63		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 64		p.stats.ContentBytesLoaded += int64(sz)
 65		p._sectBuf = p._sects
 66	}
 67	return p._sects
 68}
 69
 70func (p *contentProvider) newlines() newlines {
 71	if p._nl == nil {
 72		var sz uint32
 73		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 74		p._nlBuf = p._nl
 75		p.stats.ContentBytesLoaded += int64(sz)
 76	}
 77	return newlines{locs: p._nl, fileSize: p.fileSize}
 78}
 79
 80func (p *contentProvider) data(fileName bool) []byte {
 81	if fileName {
 82		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 83	}
 84
 85	if p._data == nil {
 86		p._data, p.err = p.id.readContents(p.idx)
 87		p.stats.FilesLoaded++
 88		p.stats.ContentBytesLoaded += int64(len(p._data))
 89	}
 90	return p._data
 91}
 92
 93// Find offset in bytes (relative to corpus start) for an offset in
 94// runes (relative to document start). If filename is set, the corpus
 95// is the set of filenames, with the document being the name itself.
 96func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 97	if p.id.metaData.PlainASCII {
 98		return r
 99	}
100
101	sample := p.id.runeOffsets
102	runeEnds := p.id.fileEndRunes
103	fileStartByte := p.id.boundaries[p.idx]
104	if filename {
105		sample = p.id.fileNameRuneOffsets
106		runeEnds = p.id.fileNameEndRunes
107		fileStartByte = p.id.fileNameIndex[p.idx]
108	}
109
110	absR := r
111	if p.idx > 0 {
112		absR += runeEnds[p.idx-1]
113	}
114
115	byteOff, left := sample.lookup(absR)
116
117	var data []byte
118
119	if filename {
120		data = p.id.fileNameContent[byteOff:]
121	} else {
122		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
123		if p.err != nil {
124			return 0
125		}
126	}
127	for left > 0 {
128		_, sz := utf8.DecodeRune(data)
129		byteOff += uint32(sz)
130		data = data[sz:]
131		left--
132	}
133
134	byteOff -= fileStartByte
135	return byteOff
136}
137
138// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
139// It only ever returns content XOR filename matches, not both. If there are any
140// content matches, these are always returned, and we omit filename matches.
141//
142// Performance invariant: ms is sorted and non-overlapping.
143//
144// Note: the byte slices may be backed by mmapped data, so before being
145// returned by the API it needs to be copied.
146func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch {
147	var filenameMatches []*candidateMatch
148	contentMatches := make([]*candidateMatch, 0, len(ms))
149
150	for _, m := range ms {
151		if m.fileName {
152			filenameMatches = append(filenameMatches, m)
153		} else {
154			contentMatches = append(contentMatches, m)
155		}
156	}
157
158	// If there are any content matches, we only return these and skip filename matches.
159	if len(contentMatches) > 0 {
160		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
161		return p.fillContentMatches(contentMatches, numContextLines, language, opts)
162	}
163
164	// Otherwise, we return a single line containing the filematch match.
165	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
166	res := LineMatch{
167		Line:       p.id.fileName(p.idx),
168		FileName:   true,
169		Score:      lineScore.score,
170		DebugScore: lineScore.debugScore,
171	}
172
173	for _, m := range ms {
174		res.LineFragments = append(res.LineFragments, LineFragmentMatch{
175			LineOffset:  int(m.byteOffset),
176			MatchLength: int(m.byteMatchSz),
177			Offset:      m.byteOffset,
178		})
179	}
180
181	return []LineMatch{res}
182
183}
184
185// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
186// It only ever returns content XOR filename matches, not both. If there are any content
187// matches, these are always returned, and we omit filename matches.
188//
189// Performance invariant: ms is sorted and non-overlapping.
190//
191// Note: the byte slices may be backed by mmapped data, so before being
192// returned by the API it needs to be copied.
193func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch {
194	var filenameMatches []*candidateMatch
195	contentMatches := make([]*candidateMatch, 0, len(ms))
196
197	for _, m := range ms {
198		if m.fileName {
199			filenameMatches = append(filenameMatches, m)
200		} else {
201			contentMatches = append(contentMatches, m)
202		}
203	}
204
205	// If there are any content matches, we only return these and skip filename matches.
206	if len(contentMatches) > 0 {
207		return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts)
208	}
209
210	// Otherwise, we return a single chunk representing the filename match.
211	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
212	fileName := p.id.fileName(p.idx)
213	ranges := make([]Range, 0, len(ms))
214	for _, m := range ms {
215		ranges = append(ranges, Range{
216			Start: Location{
217				ByteOffset: m.byteOffset,
218				LineNumber: 1,
219				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
220			},
221			End: Location{
222				ByteOffset: m.byteOffset + m.byteMatchSz,
223				LineNumber: 1,
224				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
225			},
226		})
227	}
228
229	return []ChunkMatch{{
230		Content:      fileName,
231		ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
232		Ranges:       ranges,
233		FileName:     true,
234		Score:        lineScore.score,
235		DebugScore:   lineScore.debugScore,
236	}}
237}
238
239func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch {
240	var result []LineMatch
241	for len(ms) > 0 {
242		m := ms[0]
243		num := p.newlines().atOffset(m.byteOffset)
244		lineStart := int(p.newlines().lineStart(num))
245		nextLineStart := int(p.newlines().lineStart(num + 1))
246
247		var lineCands []*candidateMatch
248
249		endMatch := m.byteOffset + m.byteMatchSz
250
251		for len(ms) > 0 {
252			m := ms[0]
253			if int(m.byteOffset) < nextLineStart {
254				endMatch = m.byteOffset + m.byteMatchSz
255				lineCands = append(lineCands, m)
256				ms = ms[1:]
257			} else {
258				break
259			}
260		}
261
262		if len(lineCands) == 0 {
263			log.Panicf(
264				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
265				p.id.fileName(p.idx), p.id.metaData,
266				num, lineStart, nextLineStart,
267				m.byteOffset)
268		}
269
270		data := p.data(false)
271
272		// Due to merging matches, we may have a match that
273		// crosses a line boundary. Prevent confusion by
274		// taking lines until we pass the last match
275		for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
276			next := bytes.IndexByte(data[nextLineStart:], '\n')
277			if next == -1 {
278				nextLineStart = len(data)
279			} else {
280				// TODO(hanwen): test that checks "+1" part here.
281				nextLineStart += next + 1
282			}
283		}
284
285		finalMatch := LineMatch{
286			LineStart:  lineStart,
287			LineEnd:    nextLineStart,
288			LineNumber: num,
289		}
290		finalMatch.Line = data[lineStart:nextLineStart]
291
292		if numContextLines > 0 {
293			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
294			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
295		}
296
297		lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts)
298		finalMatch.Score = lineScore.score
299		finalMatch.DebugScore = lineScore.debugScore
300
301		for i, m := range lineCands {
302			fragment := LineFragmentMatch{
303				Offset:      m.byteOffset,
304				LineOffset:  int(m.byteOffset) - lineStart,
305				MatchLength: int(m.byteMatchSz),
306			}
307
308			if i < len(symbolInfo) && symbolInfo[i] != nil {
309				fragment.SymbolInfo = symbolInfo[i]
310			}
311
312			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
313		}
314		result = append(result, finalMatch)
315	}
316	return result
317}
318
319func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch {
320	data := p.data(false)
321
322	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
323	// However, it depends on ms being sorted by byteOffset and non-overlapping.
324	// This invariant is true at the time of writing, but we conservatively
325	// enforce this. Note: chunkCandidates preserves the sorting so safe to
326	// transform now.
327	columnHelper := columnHelper{data: data}
328	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
329		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
330		sort.Sort((sortByOffsetSlice)(ms))
331	}
332
333	newlines := p.newlines()
334	chunks := chunkCandidates(ms, newlines, numContextLines)
335	chunkMatches := make([]ChunkMatch, 0, len(chunks))
336	for _, chunk := range chunks {
337		ranges := make([]Range, 0, len(chunk.candidates))
338		for _, cm := range chunk.candidates {
339			startOffset := cm.byteOffset
340			endOffset := cm.byteOffset + cm.byteMatchSz
341			startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
342
343			ranges = append(ranges, Range{
344				Start: Location{
345					ByteOffset: startOffset,
346					LineNumber: uint32(startLine),
347					Column:     columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
348				},
349				End: Location{
350					ByteOffset: endOffset,
351					LineNumber: uint32(endLine),
352					Column:     columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
353				},
354			})
355		}
356
357		firstLineNumber := int(chunk.firstLine) - numContextLines
358		if firstLineNumber < 1 {
359			firstLineNumber = 1
360		}
361		firstLineStart := newlines.lineStart(firstLineNumber)
362
363		chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts)
364		chunkMatches = append(chunkMatches, ChunkMatch{
365			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
366			ContentStart: Location{
367				ByteOffset: firstLineStart,
368				LineNumber: uint32(firstLineNumber),
369				Column:     1,
370			},
371			FileName:      false,
372			Ranges:        ranges,
373			SymbolInfo:    symbolInfo,
374			BestLineMatch: uint32(chunkScore.bestLine),
375			Score:         chunkScore.score,
376			DebugScore:    chunkScore.debugScore,
377		})
378	}
379	return chunkMatches
380}
381
382type candidateChunk struct {
383	candidates []*candidateMatch
384	firstLine  uint32 // 1-based, inclusive
385	lastLine   uint32 // 1-based, inclusive
386	minOffset  uint32 // 0-based, inclusive
387	maxOffset  uint32 // 0-based, exclusive
388}
389
390// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
391// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
392// it to overlap with an adjacent chunk.
393//
394// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
395// output invariants: if you flatten candidates the input invariant is retained.
396func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
397	var chunks []candidateChunk
398
399	for _, m := range ms {
400		startOffset := m.byteOffset
401		endOffset := m.byteOffset + m.byteMatchSz
402		firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
403
404		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
405			// If a new chunk created with the current candidateMatch would
406			// overlap with the previous chunk, instead add the candidateMatch
407			// to the last chunk and extend end of the last chunk.
408			last := &chunks[len(chunks)-1]
409			last.candidates = append(last.candidates, m)
410			if last.maxOffset < endOffset {
411				last.lastLine = uint32(lastLine)
412				last.maxOffset = uint32(endOffset)
413			}
414		} else {
415			chunks = append(chunks, candidateChunk{
416				firstLine:  uint32(firstLine),
417				lastLine:   uint32(lastLine),
418				minOffset:  startOffset,
419				maxOffset:  endOffset,
420				candidates: []*candidateMatch{m},
421			})
422		}
423	}
424	return chunks
425}
426
427// columnHelper is a helper struct which caches the number of runes last
428// counted. If we naively use utf8.RuneCount for each match on a line, this
429// leads to an O(nm) algorithm where m is the number of matches and n is the
430// length of the line. Aassuming we our candidates are increasing in offset
431// makes this operation O(n) instead.
432type columnHelper struct {
433	data []byte
434
435	// 0 values for all these are valid values
436	lastLineOffset int
437	lastOffset     uint32
438	lastRuneCount  uint32
439}
440
441// get returns the line column for offset. offset is the byte offset of the
442// rune in data. lineOffset is the byte offset inside of data for the line
443// containing offset.
444func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
445	var runeCount uint32
446
447	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
448		// Can count from last calculation
449		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
450	} else {
451		// Need to count from the beginning of line
452		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
453	}
454
455	c.lastLineOffset = lineOffset
456	c.lastOffset = offset
457	c.lastRuneCount = runeCount
458
459	return runeCount + 1
460}
461
462type newlines struct {
463	// locs is the sorted set of byte offsets of the newlines in the file
464	locs []uint32
465
466	// fileSize is just the number of bytes in the file. It is stored
467	// on this struct so we can safely know the length of the last line
468	// in the file since not all files end in a newline.
469	fileSize uint32
470}
471
472// atOffset returns the line containing the offset. If the offset lands on
473// the newline ending line M, we return M.
474func (nls newlines) atOffset(offset uint32) (lineNumber int) {
475	idx := sort.Search(len(nls.locs), func(n int) bool {
476		return nls.locs[n] >= offset
477	})
478	return idx + 1
479}
480
481// lineStart returns the byte offset of the beginning of the given line.
482// lineNumber is 1-based. If lineNumber is out of range of the lines in the
483// file, the return value will be clamped to [0,fileSize].
484func (nls newlines) lineStart(lineNumber int) uint32 {
485	// nls.locs[0] + 1 is the start of the 2nd line of data.
486	startIdx := lineNumber - 2
487
488	if startIdx < 0 {
489		return 0
490	} else if startIdx >= len(nls.locs) {
491		return nls.fileSize
492	} else {
493		return nls.locs[startIdx] + 1
494	}
495}
496
497// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
498// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
499// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
500func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
501	startLine = nls.atOffset(startOffset)
502	endLine = nls.atOffset(
503		max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
504	)
505	return startLine, endLine
506}
507
508// getLines returns a slice of data containing the lines [low, high).
509// low is 1-based and inclusive. high is 1-based and exclusive.
510func (nls newlines) getLines(data []byte, low, high int) []byte {
511	if low >= high {
512		return nil
513	}
514
515	return data[nls.lineStart(low):nls.lineStart(high)]
516}
517
518const (
519	// Query-dependent scoring signals. All of these together are bounded at ~9000
520	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
521	scorePartialWordMatch = 50.0
522	scoreWordMatch        = 500.0
523	scoreBase             = 7000.0
524	scorePartialBase      = 4000.0
525	scoreSymbol           = 7000.0
526	scorePartialSymbol    = 4000.0
527	scoreKindMatch        = 100.0
528	scoreFactorAtomMatch  = 400.0
529
530	// Used for ordering line and chunk matches within a file.
531	scoreLineOrderFactor = 1.0
532
533	// Used for tiebreakers. The scores are not combined with the main score, but
534	// are used to break ties between matches with the same score. The factors are
535	// chosen to separate the tiebreakers from the main score and from each other.
536	// If you make changes here, make sure to update indexData.scoreFile too.
537	scoreRepoRankFactor  = 100.0
538	scoreFileOrderFactor = 10.0
539)
540
541// findMaxOverlappingSection returns the index of the section in secs that
542// overlaps the most with the area defined by off and sz, relative to the size
543// of the section. If no section overlaps, it returns 0, false. If multiple
544// sections overlap the same amount, the first one is returned.
545//
546// The implementation assumes that sections do not overlap and are sorted by
547// DocumentSection.Start.
548func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
549	start := off
550	end := off + sz
551
552	// Find the first section that might overlap
553	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
554
555	if j == len(secs) || secs[j].Start >= end {
556		// No overlap.
557		return 0, false
558	}
559
560	relOverlap := func(j int) float64 {
561		secSize := secs[j].End - secs[j].Start
562		if secSize == 0 {
563			return 0
564		}
565		// This cannot overflow because we make sure there is overlap before calling relOverlap
566		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
567		return float64(overlap) / float64(secSize)
568	}
569
570	ol1 := relOverlap(j)
571	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
572		return uint32(j), ol1 > 0
573	}
574
575	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
576	// if the second section overlaps more than the first one, because a third
577	// section can only overlap if the overlap with the second section is complete.
578	ol2 := relOverlap(j + 1)
579	if ol2 > ol1 {
580		return uint32(j + 1), ol2 > 0
581	}
582
583	return uint32(j), ol1 > 0
584}
585
586func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
587	if cm.fileName {
588		return false
589	}
590
591	// Check if this candidate came from a symbol matchTree
592	if cm.symbol {
593		return true
594	}
595
596	// Check if it overlaps with a symbol.
597	secs := p.docSections()
598	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
599	return ok
600}
601
602func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
603	if cm.fileName {
604		return DocumentSection{}, nil, false
605	}
606
607	secs := p.docSections()
608
609	secIdx, ok := cm.symbolIdx, cm.symbol
610	if !ok {
611		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
612		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
613	}
614	if !ok {
615		return DocumentSection{}, nil, false
616	}
617
618	sec := secs[secIdx]
619
620	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
621	// since some callsites do not need it stored, and that incurs an extra
622	// copy.
623	//
624	// 2024-01-08 we are refactoring this and the code path indicates this can
625	// fail, so callers need to handle nil symbol. However, it would be
626	// surprising that we have a matching section but not symbol data.
627	start := p.id.fileEndSymbol[p.idx]
628	si := p.id.symbols.data(start + secIdx)
629
630	return sec, si, true
631}
632
633// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
634// End such that it won't be out of range.
635func sectionSlice(data []byte, sec DocumentSection) []byte {
636	l := uint32(len(data))
637	if sec.Start >= l {
638		return nil
639	}
640	if sec.End > l {
641		sec.End = l
642	}
643	return data[sec.Start:sec.End]
644}
645
646// scoreSymbolKind boosts a match based on the combination of language, symbol
647// and kind. The language string comes from go-enry, the symbol and kind from
648// ctags.
649func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
650	var factor float64
651
652	// Generic ranking which will be overriden by language specific ranking
653	switch kind {
654	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
655		factor = 8
656	case ctags.Class:
657		factor = 10
658	case ctags.Struct:
659		factor = 9.5
660	case ctags.Enum:
661		factor = 9
662	case ctags.Interface:
663		factor = 8
664	case ctags.Function, ctags.Method:
665		factor = 7
666	case ctags.Field:
667		factor = 5.5
668	case ctags.Constant:
669		factor = 5
670	case ctags.Variable:
671		factor = 4
672	default:
673		// For all other kinds, assign a low score by default.
674		factor = 1
675	}
676
677	switch language {
678	case "Java", "java":
679		switch kind {
680		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
681		// to "classes" instead of "c". We have to cover both cases to support existing
682		// indexes.
683		case ctags.Class:
684			factor = 10
685		case ctags.Enum:
686			factor = 9
687		case ctags.Interface:
688			factor = 8
689		case ctags.Method:
690			factor = 7
691		case ctags.Field:
692			factor = 6
693		case ctags.EnumConstant:
694			factor = 5
695		}
696	case "Kotlin", "kotlin":
697		switch kind {
698		case ctags.Class:
699			factor = 10
700		case ctags.Interface:
701			factor = 9
702		case ctags.Method:
703			factor = 8
704		case ctags.TypeAlias:
705			factor = 7
706		case ctags.Constant:
707			factor = 6
708		case ctags.Variable:
709			factor = 5
710		}
711	case "Go", "go":
712		switch kind {
713		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
714		// for each case a description of the fields in ctags in the comment
715		case ctags.Type: // interface struct talias
716			factor = 9
717		case ctags.Interface: // interfaces
718			factor = 10
719		case ctags.Struct: // structs
720			factor = 9
721		case ctags.TypeAlias: // type aliases
722			factor = 9
723		case ctags.MethodSpec: // interface method specification
724			factor = 8.5
725		case ctags.Method, ctags.Function: // functions
726			factor = 8
727		case ctags.Field: // struct fields
728			factor = 7
729		case ctags.Constant: // constants
730			factor = 6
731		case ctags.Variable: // variables
732			factor = 5
733		}
734
735		// Boost exported go symbols. Same implementation as token.IsExported
736		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
737			factor += 0.5
738		}
739
740		if bytes.HasSuffix(filename, []byte("_test.go")) {
741			factor *= 0.8
742		}
743
744		// Could also rank on:
745		//
746		//   - anonMember  struct anonymous members
747		//   - packageName name for specifying imported package
748		//   - receiver    receivers
749		//   - package     packages
750		//   - type        types
751		//   - unknown     unknown
752	case "C++", "c++":
753		switch kind {
754		case ctags.Class: // classes
755			factor = 10
756		case ctags.Enum: // enumeration names
757			factor = 9
758		case ctags.Function: // function definitions
759			factor = 8
760		case ctags.Struct: // structure names
761			factor = 7
762		case ctags.Union: // union names
763			factor = 6
764		case ctags.TypeAlias: // typedefs
765			factor = 5
766		case ctags.Field: // class, struct, and union members
767			factor = 4
768		case ctags.Variable: // varialbe definitions
769			factor = 3
770		}
771	// Could also rank on:
772	// NAME        DESCRIPTION
773	// macro       macro definitions
774	// enumerator  enumerators (values inside an enumeration)
775	// header      included header files
776	// namespace   namespaces
777	// variable    variable definitions
778	case "Scala", "scala":
779		switch kind {
780		case ctags.Class:
781			factor = 10
782		case ctags.Interface:
783			factor = 9
784		case ctags.Object:
785			factor = 8
786		case ctags.Function:
787			factor = 7
788		case ctags.Type:
789			factor = 6
790		case ctags.Variable:
791			factor = 5
792		case ctags.Package:
793			factor = 4
794		}
795	case "Python", "python":
796		switch kind {
797		case ctags.Class: // classes
798			factor = 10
799		case ctags.Function, ctags.Method: // function definitions
800			factor = 8
801		case ctags.Field: // class, struct, and union members
802			factor = 4
803		case ctags.Variable: // variable definitions
804			factor = 3
805		case ctags.Local: // local variables
806			factor = 2
807		}
808		// Could also rank on:
809		//
810		//   - namespace name referring a module defined in other file
811		//   - module    modules
812		//   - unknown   name referring a class/variable/function/module defined in other module
813		//   - parameter function parameters
814	case "Ruby", "ruby":
815		switch kind {
816		case ctags.Class:
817			factor = 10
818		case ctags.Method:
819			factor = 9
820		case ctags.MethodAlias:
821			factor = 8
822		case ctags.Module:
823			factor = 7
824		case ctags.SingletonMethod:
825			factor = 6
826		case ctags.Constant:
827			factor = 5
828		case ctags.Accessor:
829			factor = 4
830		case ctags.Library:
831			factor = 3
832		}
833	case "PHP", "php":
834		switch kind {
835		case ctags.Class:
836			factor = 10
837		case ctags.Interface:
838			factor = 9
839		case ctags.Function:
840			factor = 8
841		case ctags.Trait:
842			factor = 7
843		case ctags.Define:
844			factor = 6
845		case ctags.Namespace:
846			factor = 5
847		case ctags.MethodAlias:
848			factor = 4
849		case ctags.Variable:
850			factor = 3
851		case ctags.Local:
852			factor = 3
853		}
854	case "GraphQL", "graphql":
855		switch kind {
856		case ctags.Type:
857			factor = 10
858		}
859	case "Markdown", "markdown":
860		// Headers are good signal in docs, but do not rank as highly as code.
861		switch kind {
862		case ctags.Chapter: // #
863			factor = 4
864		case ctags.Section: // ##
865			factor = 3
866		case ctags.Subsection: // ###
867			factor = 2
868		}
869	}
870
871	return factor * scoreKindMatch
872}
873
874type matchScoreSlice []LineMatch
875
876func (m matchScoreSlice) Len() int           { return len(m) }
877func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
878func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
879
880type chunkMatchScoreSlice []ChunkMatch
881
882func (m chunkMatchScoreSlice) Len() int           { return len(m) }
883func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
884func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
885
886type fileMatchesByScore []FileMatch
887
888func (m fileMatchesByScore) Len() int           { return len(m) }
889func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
890func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
891
892func sortMatchesByScore(ms []LineMatch) {
893	sort.Sort(matchScoreSlice(ms))
894}
895
896func sortChunkMatchesByScore(ms []ChunkMatch) {
897	sort.Sort(chunkMatchScoreSlice(ms))
898}
899
900// SortFiles sorts files matches in the order we want to present results to
901// users. The order depends on the match score, which includes both
902// query-dependent signals like word overlap, and file-only signals like the
903// file ranks (if file ranks are enabled).
904//
905// We don't only use the scores, we will also boost some results to present
906// files with novel extensions.
907func SortFiles(ms []FileMatch) {
908	sort.Sort(fileMatchesByScore(ms))
909
910	// Boost a file extension not in the top 3 to the third filematch.
911	boostNovelExtension(ms, 2, 0.9)
912}
913
914func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
915	if len(ms) <= boostOffset+1 {
916		return
917	}
918
919	top := ms[:boostOffset]
920	candidates := ms[boostOffset:]
921
922	// Don't bother boosting something which is significantly different to the
923	// result it replaces.
924	minScoreForNovelty := candidates[0].Score * minScoreRatio
925
926	// We want to look for an ext that isn't in the top exts
927	exts := make([]string, len(top))
928	for i := range top {
929		exts[i] = path.Ext(top[i].FileName)
930	}
931
932	for i := range candidates {
933		// Do not assume sorted due to boostNovelExtension being called on subsets
934		if candidates[i].Score < minScoreForNovelty {
935			continue
936		}
937
938		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
939			continue
940		}
941
942		// Found what we are looking for, now boost to front of candidates (which
943		// is ms[boostOffset])
944		for ; i > 0; i-- {
945			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
946		}
947		return
948	}
949}
Configure Feed

Configure Feed