index/contentprovider.go at 70647bad55921645285ce3776d8c29928eb205bc · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / contentprovider.go
at 70647bad55921645285ce3776d8c29928eb205bc 28 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"log"
 20	"path"
 21	"slices"
 22	"sort"
 23	"unicode"
 24	"unicode/utf8"
 25
 26	"github.com/sourcegraph/zoekt"
 27	"github.com/sourcegraph/zoekt/internal/ctags"
 28)
 29
 30var _ = log.Println
 31
 32// contentProvider is an abstraction to treat matches for names and
 33// content with the same code.
 34type contentProvider struct {
 35	id    *indexData
 36	stats *zoekt.Stats
 37
 38	// mutable
 39	err      error
 40	idx      uint32
 41	_data    []byte
 42	_nl      []uint32
 43	_nlBuf   []uint32
 44	_sects   []DocumentSection
 45	_sectBuf []DocumentSection
 46	fileSize uint32
 47}
 48
 49// setDocument skips to the given document.
 50func (p *contentProvider) setDocument(docID uint32) {
 51	fileStart := p.id.boundaries[docID]
 52
 53	p.idx = docID
 54	p.fileSize = p.id.boundaries[docID+1] - fileStart
 55
 56	p._nl = nil
 57	p._sects = nil
 58	p._data = nil
 59}
 60
 61func (p *contentProvider) docSections() []DocumentSection {
 62	if p._sects == nil {
 63		var sz uint32
 64		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 65		p.stats.ContentBytesLoaded += int64(sz)
 66		p._sectBuf = p._sects
 67	}
 68	return p._sects
 69}
 70
 71func (p *contentProvider) newlines() newlines {
 72	if p._nl == nil {
 73		var sz uint32
 74		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 75		p._nlBuf = p._nl
 76		p.stats.ContentBytesLoaded += int64(sz)
 77	}
 78	return newlines{locs: p._nl, fileSize: p.fileSize}
 79}
 80
 81func (p *contentProvider) data(fileName bool) []byte {
 82	if fileName {
 83		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 84	}
 85
 86	if p._data == nil {
 87		p._data, p.err = p.id.readContents(p.idx)
 88		p.stats.FilesLoaded++
 89		p.stats.ContentBytesLoaded += int64(len(p._data))
 90	}
 91	return p._data
 92}
 93
 94// Find offset in bytes (relative to corpus start) for an offset in
 95// runes (relative to document start). If filename is set, the corpus
 96// is the set of filenames, with the document being the name itself.
 97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 98	if p.id.metaData.PlainASCII {
 99		return r
100	}
101
102	sample := p.id.runeOffsets
103	runeEnds := p.id.fileEndRunes
104	fileStartByte := p.id.boundaries[p.idx]
105	if filename {
106		sample = p.id.fileNameRuneOffsets
107		runeEnds = p.id.fileNameEndRunes
108		fileStartByte = p.id.fileNameIndex[p.idx]
109	}
110
111	absR := r
112	if p.idx > 0 {
113		absR += runeEnds[p.idx-1]
114	}
115
116	byteOff, left := sample.lookup(absR)
117
118	var data []byte
119
120	if filename {
121		data = p.id.fileNameContent[byteOff:]
122	} else {
123		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
124		if p.err != nil {
125			return 0
126		}
127	}
128	for left > 0 {
129		_, sz := utf8.DecodeRune(data)
130		byteOff += uint32(sz)
131		data = data[sz:]
132		left--
133	}
134
135	byteOff -= fileStartByte
136	return byteOff
137}
138
139// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
140// It only ever returns content XOR filename matches, not both. If there are any
141// content matches, these are always returned, and we omit filename matches.
142//
143// Performance invariant: ms is sorted and non-overlapping.
144//
145// Note: the byte slices may be backed by mmapped data, so before being
146// returned by the API it needs to be copied.
147func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
148	var filenameMatches []*candidateMatch
149	contentMatches := make([]*candidateMatch, 0, len(ms))
150
151	for _, m := range ms {
152		if m.fileName {
153			filenameMatches = append(filenameMatches, m)
154		} else {
155			contentMatches = append(contentMatches, m)
156		}
157	}
158
159	// If there are any content matches, we only return these and skip filename matches.
160	if len(contentMatches) > 0 {
161		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
162		return p.fillContentMatches(contentMatches, numContextLines, language, opts)
163	}
164
165	// Otherwise, we return a single line containing the filematch index.
166	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
167	res := zoekt.LineMatch{
168		Line:       p.id.fileName(p.idx),
169		FileName:   true,
170		Score:      lineScore.score,
171		DebugScore: lineScore.debugScore,
172	}
173
174	for _, m := range ms {
175		res.LineFragments = append(res.LineFragments, zoekt.LineFragmentMatch{
176			LineOffset:  int(m.byteOffset),
177			MatchLength: int(m.byteMatchSz),
178			Offset:      m.byteOffset,
179		})
180	}
181
182	return []zoekt.LineMatch{res}
183
184}
185
186// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
187// It only ever returns content XOR filename matches, not both. If there are any content
188// matches, these are always returned, and we omit filename matches.
189//
190// Performance invariant: ms is sorted and non-overlapping.
191//
192// Note: the byte slices may be backed by mmapped data, so before being
193// returned by the API it needs to be copied.
194func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
195	var filenameMatches []*candidateMatch
196	contentMatches := make([]*candidateMatch, 0, len(ms))
197
198	for _, m := range ms {
199		if m.fileName {
200			filenameMatches = append(filenameMatches, m)
201		} else {
202			contentMatches = append(contentMatches, m)
203		}
204	}
205
206	// If there are any content matches, we only return these and skip filename matches.
207	if len(contentMatches) > 0 {
208		return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts)
209	}
210
211	// Otherwise, we return a single chunk representing the filename index.
212	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
213	fileName := p.id.fileName(p.idx)
214	ranges := make([]zoekt.Range, 0, len(ms))
215	for _, m := range ms {
216		ranges = append(ranges, zoekt.Range{
217			Start: zoekt.Location{
218				ByteOffset: m.byteOffset,
219				LineNumber: 1,
220				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
221			},
222			End: zoekt.Location{
223				ByteOffset: m.byteOffset + m.byteMatchSz,
224				LineNumber: 1,
225				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
226			},
227		})
228	}
229
230	return []zoekt.ChunkMatch{{
231		Content:      fileName,
232		ContentStart: zoekt.Location{ByteOffset: 0, LineNumber: 1, Column: 1},
233		Ranges:       ranges,
234		FileName:     true,
235		Score:        lineScore.score,
236		DebugScore:   lineScore.debugScore,
237	}}
238}
239
240func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
241	var result []zoekt.LineMatch
242	for len(ms) > 0 {
243		m := ms[0]
244		num := p.newlines().atOffset(m.byteOffset)
245		lineStart := int(p.newlines().lineStart(num))
246		nextLineStart := int(p.newlines().lineStart(num + 1))
247
248		var lineCands []*candidateMatch
249
250		endMatch := m.byteOffset + m.byteMatchSz
251
252		for len(ms) > 0 {
253			m := ms[0]
254			if int(m.byteOffset) < nextLineStart {
255				endMatch = m.byteOffset + m.byteMatchSz
256				lineCands = append(lineCands, m)
257				ms = ms[1:]
258			} else {
259				break
260			}
261		}
262
263		if len(lineCands) == 0 {
264			log.Panicf(
265				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
266				p.id.fileName(p.idx), p.id.metaData,
267				num, lineStart, nextLineStart,
268				m.byteOffset)
269		}
270
271		data := p.data(false)
272
273		// Due to merging matches, we may have a match that
274		// crosses a line boundary. Prevent confusion by
275		// taking lines until we pass the last index
276		for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
277			next := bytes.IndexByte(data[nextLineStart:], '\n')
278			if next == -1 {
279				nextLineStart = len(data)
280			} else {
281				// TODO(hanwen): test that checks "+1" part here.
282				nextLineStart += next + 1
283			}
284		}
285
286		finalMatch := zoekt.LineMatch{
287			LineStart:  lineStart,
288			LineEnd:    nextLineStart,
289			LineNumber: num,
290		}
291		finalMatch.Line = data[lineStart:nextLineStart]
292
293		if numContextLines > 0 {
294			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
295			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
296		}
297
298		lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts)
299		finalMatch.Score = lineScore.score
300		finalMatch.DebugScore = lineScore.debugScore
301
302		for i, m := range lineCands {
303			fragment := zoekt.LineFragmentMatch{
304				Offset:      m.byteOffset,
305				LineOffset:  int(m.byteOffset) - lineStart,
306				MatchLength: int(m.byteMatchSz),
307			}
308
309			if i < len(symbolInfo) && symbolInfo[i] != nil {
310				fragment.SymbolInfo = symbolInfo[i]
311			}
312
313			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
314		}
315		result = append(result, finalMatch)
316	}
317	return result
318}
319
320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
321	data := p.data(false)
322
323	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
324	// However, it depends on ms being sorted by byteOffset and non-overlapping.
325	// This invariant is true at the time of writing, but we conservatively
326	// enforce this. Note: chunkCandidates preserves the sorting so safe to
327	// transform now.
328	columnHelper := columnHelper{data: data}
329	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
330		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
331		sort.Sort((sortByOffsetSlice)(ms))
332	}
333
334	newlines := p.newlines()
335	chunks := chunkCandidates(ms, newlines, numContextLines)
336	chunkMatches := make([]zoekt.ChunkMatch, 0, len(chunks))
337	for _, chunk := range chunks {
338		ranges := make([]zoekt.Range, 0, len(chunk.candidates))
339		for _, cm := range chunk.candidates {
340			startOffset := cm.byteOffset
341			endOffset := cm.byteOffset + cm.byteMatchSz
342			startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
343
344			ranges = append(ranges, zoekt.Range{
345				Start: zoekt.Location{
346					ByteOffset: startOffset,
347					LineNumber: uint32(startLine),
348					Column:     columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
349				},
350				End: zoekt.Location{
351					ByteOffset: endOffset,
352					LineNumber: uint32(endLine),
353					Column:     columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
354				},
355			})
356		}
357
358		firstLineNumber := int(chunk.firstLine) - numContextLines
359		if firstLineNumber < 1 {
360			firstLineNumber = 1
361		}
362		firstLineStart := newlines.lineStart(firstLineNumber)
363
364		chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts)
365		chunkMatches = append(chunkMatches, zoekt.ChunkMatch{
366			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
367			ContentStart: zoekt.Location{
368				ByteOffset: firstLineStart,
369				LineNumber: uint32(firstLineNumber),
370				Column:     1,
371			},
372			FileName:      false,
373			Ranges:        ranges,
374			SymbolInfo:    symbolInfo,
375			BestLineMatch: uint32(chunkScore.bestLine),
376			Score:         chunkScore.score,
377			DebugScore:    chunkScore.debugScore,
378		})
379	}
380	return chunkMatches
381}
382
383type candidateChunk struct {
384	candidates []*candidateMatch
385	firstLine  uint32 // 1-based, inclusive
386	lastLine   uint32 // 1-based, inclusive
387	minOffset  uint32 // 0-based, inclusive
388	maxOffset  uint32 // 0-based, exclusive
389}
390
391// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
392// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
393// it to overlap with an adjacent chunk.
394//
395// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
396// output invariants: if you flatten candidates the input invariant is retained.
397func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
398	var chunks []candidateChunk
399
400	for _, m := range ms {
401		startOffset := m.byteOffset
402		endOffset := m.byteOffset + m.byteMatchSz
403		firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
404
405		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
406			// If a new chunk created with the current candidateMatch would
407			// overlap with the previous chunk, instead add the candidateMatch
408			// to the last chunk and extend end of the last chunk.
409			last := &chunks[len(chunks)-1]
410			last.candidates = append(last.candidates, m)
411			if last.maxOffset < endOffset {
412				last.lastLine = uint32(lastLine)
413				last.maxOffset = uint32(endOffset)
414			}
415		} else {
416			chunks = append(chunks, candidateChunk{
417				firstLine:  uint32(firstLine),
418				lastLine:   uint32(lastLine),
419				minOffset:  startOffset,
420				maxOffset:  endOffset,
421				candidates: []*candidateMatch{m},
422			})
423		}
424	}
425	return chunks
426}
427
428// columnHelper is a helper struct which caches the number of runes last
429// counted. If we naively use utf8.RuneCount for each match on a line, this
430// leads to an O(nm) algorithm where m is the number of matches and n is the
431// length of the line. Aassuming we our candidates are increasing in offset
432// makes this operation O(n) instead.
433type columnHelper struct {
434	data []byte
435
436	// 0 values for all these are valid values
437	lastLineOffset int
438	lastOffset     uint32
439	lastRuneCount  uint32
440}
441
442// get returns the line column for offset. offset is the byte offset of the
443// rune in data. lineOffset is the byte offset inside of data for the line
444// containing offset.
445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
446	var runeCount uint32
447
448	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
449		// Can count from last calculation
450		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
451	} else {
452		// Need to count from the beginning of line
453		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
454	}
455
456	c.lastLineOffset = lineOffset
457	c.lastOffset = offset
458	c.lastRuneCount = runeCount
459
460	return runeCount + 1
461}
462
463type newlines struct {
464	// locs is the sorted set of byte offsets of the newlines in the file
465	locs []uint32
466
467	// fileSize is just the number of bytes in the file. It is stored
468	// on this struct so we can safely know the length of the last line
469	// in the file since not all files end in a newline.
470	fileSize uint32
471}
472
473// atOffset returns the line containing the offset. If the offset lands on
474// the newline ending line M, we return M.
475func (nls newlines) atOffset(offset uint32) (lineNumber int) {
476	idx := sort.Search(len(nls.locs), func(n int) bool {
477		return nls.locs[n] >= offset
478	})
479	return idx + 1
480}
481
482// lineStart returns the byte offset of the beginning of the given line.
483// lineNumber is 1-based. If lineNumber is out of range of the lines in the
484// file, the return value will be clamped to [0,fileSize].
485func (nls newlines) lineStart(lineNumber int) uint32 {
486	// nls.locs[0] + 1 is the start of the 2nd line of data.
487	startIdx := lineNumber - 2
488
489	if startIdx < 0 {
490		return 0
491	} else if startIdx >= len(nls.locs) {
492		return nls.fileSize
493	} else {
494		return nls.locs[startIdx] + 1
495	}
496}
497
498// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
502	startLine = nls.atOffset(startOffset)
503	endLine = nls.atOffset(
504		max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
505	)
506	return startLine, endLine
507}
508
509// getLines returns a slice of data containing the lines [low, high).
510// low is 1-based and inclusive. high is 1-based and exclusive.
511func (nls newlines) getLines(data []byte, low, high int) []byte {
512	if low >= high {
513		return nil
514	}
515
516	return data[nls.lineStart(low):nls.lineStart(high)]
517}
518
519const (
520	// Query-dependent scoring signals. All of these together are bounded at ~9000
521	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
522	scorePartialWordMatch = 50.0
523	scoreWordMatch        = 500.0
524	scoreBase             = 7000.0
525	scorePartialBase      = 4000.0
526	scoreSymbol           = 7000.0
527	scorePartialSymbol    = 4000.0
528	scoreKindMatch        = 100.0
529	scoreFactorAtomMatch  = 400.0
530
531	// Used for ordering line and chunk matches within a file.
532	scoreLineOrderFactor = 1.0
533
534	// Used for tiebreakers. The scores are not combined with the main score, but
535	// are used to break ties between matches with the same score. The factors are
536	// chosen to separate the tiebreakers from the main score and from each other.
537	// If you make changes here, make sure to update indexData.scoreFile too.
538	scoreRepoRankFactor  = 100.0
539	scoreFileOrderFactor = 10.0
540)
541
542// findMaxOverlappingSection returns the index of the section in secs that
543// overlaps the most with the area defined by off and sz, relative to the size
544// of the section. If no section overlaps, it returns 0, false. If multiple
545// sections overlap the same amount, the first one is returned.
546//
547// The implementation assumes that sections do not overlap and are sorted by
548// DocumentSection.Start.
549func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
550	start := off
551	end := off + sz
552
553	// Find the first section that might overlap
554	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
555
556	if j == len(secs) || secs[j].Start >= end {
557		// No overlap.
558		return 0, false
559	}
560
561	relOverlap := func(j int) float64 {
562		secSize := secs[j].End - secs[j].Start
563		if secSize == 0 {
564			return 0
565		}
566		// This cannot overflow because we make sure there is overlap before calling relOverlap
567		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
568		return float64(overlap) / float64(secSize)
569	}
570
571	ol1 := relOverlap(j)
572	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
573		return uint32(j), ol1 > 0
574	}
575
576	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
577	// if the second section overlaps more than the first one, because a third
578	// section can only overlap if the overlap with the second section is complete.
579	ol2 := relOverlap(j + 1)
580	if ol2 > ol1 {
581		return uint32(j + 1), ol2 > 0
582	}
583
584	return uint32(j), ol1 > 0
585}
586
587func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
588	if cm.fileName {
589		return false
590	}
591
592	// Check if this candidate came from a symbol matchTree
593	if cm.symbol {
594		return true
595	}
596
597	// Check if it overlaps with a symbol.
598	secs := p.docSections()
599	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
600	return ok
601}
602
603func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *zoekt.Symbol, bool) {
604	if cm.fileName {
605		return DocumentSection{}, nil, false
606	}
607
608	secs := p.docSections()
609
610	secIdx, ok := cm.symbolIdx, cm.symbol
611	if !ok {
612		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
613		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
614	}
615	if !ok {
616		return DocumentSection{}, nil, false
617	}
618
619	sec := secs[secIdx]
620
621	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
622	// since some callsites do not need it stored, and that incurs an extra
623	// copy.
624	//
625	// 2024-01-08 we are refactoring this and the code path indicates this can
626	// fail, so callers need to handle nil symbol. However, it would be
627	// surprising that we have a matching section but not symbol data.
628	start := p.id.fileEndSymbol[p.idx]
629	si := p.id.symbols.data(start + secIdx)
630
631	return sec, si, true
632}
633
634// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
635// End such that it won't be out of range.
636func sectionSlice(data []byte, sec DocumentSection) []byte {
637	l := uint32(len(data))
638	if sec.Start >= l {
639		return nil
640	}
641	if sec.End > l {
642		sec.End = l
643	}
644	return data[sec.Start:sec.End]
645}
646
647// scoreSymbolKind boosts a match based on the combination of language, symbol
648// and kind. The language string comes from go-enry, the symbol and kind from
649// ctags.
650func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
651	var factor float64
652
653	// Generic ranking which will be overriden by language specific ranking
654	switch kind {
655	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
656		factor = 8
657	case ctags.Class:
658		factor = 10
659	case ctags.Struct:
660		factor = 9.5
661	case ctags.Enum:
662		factor = 9
663	case ctags.Interface:
664		factor = 8
665	case ctags.Function, ctags.Method:
666		factor = 7
667	case ctags.Field:
668		factor = 5.5
669	case ctags.Constant:
670		factor = 5
671	case ctags.Variable:
672		factor = 4
673	default:
674		// For all other kinds, assign a low score by default.
675		factor = 1
676	}
677
678	switch language {
679	case "Java", "java":
680		switch kind {
681		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
682		// to "classes" instead of "c". We have to cover both cases to support existing
683		// indexes.
684		case ctags.Class:
685			factor = 10
686		case ctags.Enum:
687			factor = 9
688		case ctags.Interface:
689			factor = 8
690		case ctags.Method:
691			factor = 7
692		case ctags.Field:
693			factor = 6
694		case ctags.EnumConstant:
695			factor = 5
696		}
697	case "Kotlin", "kotlin":
698		switch kind {
699		case ctags.Class:
700			factor = 10
701		case ctags.Interface:
702			factor = 9
703		case ctags.Method:
704			factor = 8
705		case ctags.TypeAlias:
706			factor = 7
707		case ctags.Constant:
708			factor = 6
709		case ctags.Variable:
710			factor = 5
711		}
712	case "Go", "go":
713		switch kind {
714		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
715		// for each case a description of the fields in ctags in the comment
716		case ctags.Type: // interface struct talias
717			factor = 9
718		case ctags.Interface: // interfaces
719			factor = 10
720		case ctags.Struct: // structs
721			factor = 9
722		case ctags.TypeAlias: // type aliases
723			factor = 9
724		case ctags.MethodSpec: // interface method specification
725			factor = 8.5
726		case ctags.Method, ctags.Function: // functions
727			factor = 8
728		case ctags.Field: // struct fields
729			factor = 7
730		case ctags.Constant: // constants
731			factor = 6
732		case ctags.Variable: // variables
733			factor = 5
734		}
735
736		// Boost exported go symbols. Same implementation as token.IsExported
737		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
738			factor += 0.5
739		}
740
741		if bytes.HasSuffix(filename, []byte("_test.go")) {
742			factor *= 0.8
743		}
744
745		// Could also rank on:
746		//
747		//   - anonMember  struct anonymous members
748		//   - packageName name for specifying imported package
749		//   - receiver    receivers
750		//   - package     packages
751		//   - type        types
752		//   - unknown     unknown
753	case "C++", "c++":
754		switch kind {
755		case ctags.Class: // classes
756			factor = 10
757		case ctags.Enum: // enumeration names
758			factor = 9
759		case ctags.Function: // function definitions
760			factor = 8
761		case ctags.Struct: // structure names
762			factor = 7
763		case ctags.Union: // union names
764			factor = 6
765		case ctags.TypeAlias: // typedefs
766			factor = 5
767		case ctags.Field: // class, struct, and union members
768			factor = 4
769		case ctags.Variable: // varialbe definitions
770			factor = 3
771		}
772	// Could also rank on:
773	// NAME        DESCRIPTION
774	// macro       macro definitions
775	// enumerator  enumerators (values inside an enumeration)
776	// header      included header files
777	// namespace   namespaces
778	// variable    variable definitions
779	case "Scala", "scala":
780		switch kind {
781		case ctags.Class:
782			factor = 10
783		case ctags.Interface:
784			factor = 9
785		case ctags.Object:
786			factor = 8
787		case ctags.Function:
788			factor = 7
789		case ctags.Type:
790			factor = 6
791		case ctags.Variable:
792			factor = 5
793		case ctags.Package:
794			factor = 4
795		}
796	case "Python", "python":
797		switch kind {
798		case ctags.Class: // classes
799			factor = 10
800		case ctags.Function, ctags.Method: // function definitions
801			factor = 8
802		case ctags.Field: // class, struct, and union members
803			factor = 4
804		case ctags.Variable: // variable definitions
805			factor = 3
806		case ctags.Local: // local variables
807			factor = 2
808		}
809		// Could also rank on:
810		//
811		//   - namespace name referring a module defined in other file
812		//   - module    modules
813		//   - unknown   name referring a class/variable/function/module defined in other module
814		//   - parameter function parameters
815	case "Ruby", "ruby":
816		switch kind {
817		case ctags.Class:
818			factor = 10
819		case ctags.Method:
820			factor = 9
821		case ctags.MethodAlias:
822			factor = 8
823		case ctags.Module:
824			factor = 7
825		case ctags.SingletonMethod:
826			factor = 6
827		case ctags.Constant:
828			factor = 5
829		case ctags.Accessor:
830			factor = 4
831		case ctags.Library:
832			factor = 3
833		}
834	case "PHP", "php":
835		switch kind {
836		case ctags.Class:
837			factor = 10
838		case ctags.Interface:
839			factor = 9
840		case ctags.Function:
841			factor = 8
842		case ctags.Trait:
843			factor = 7
844		case ctags.Define:
845			factor = 6
846		case ctags.Namespace:
847			factor = 5
848		case ctags.MethodAlias:
849			factor = 4
850		case ctags.Variable:
851			factor = 3
852		case ctags.Local:
853			factor = 3
854		}
855	case "GraphQL", "graphql":
856		switch kind {
857		case ctags.Type:
858			factor = 10
859		}
860	case "Markdown", "markdown":
861		// Headers are good signal in docs, but do not rank as highly as code.
862		switch kind {
863		case ctags.Chapter: // #
864			factor = 4
865		case ctags.Section: // ##
866			factor = 3
867		case ctags.Subsection: // ###
868			factor = 2
869		}
870	}
871
872	return factor * scoreKindMatch
873}
874
875type matchScoreSlice []zoekt.LineMatch
876
877func (m matchScoreSlice) Len() int           { return len(m) }
878func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
879func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
880
881type chunkMatchScoreSlice []zoekt.ChunkMatch
882
883func (m chunkMatchScoreSlice) Len() int           { return len(m) }
884func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
885func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
886
887type fileMatchesByScore []zoekt.FileMatch
888
889func (m fileMatchesByScore) Len() int           { return len(m) }
890func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
891func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
892
893func sortMatchesByScore(ms []zoekt.LineMatch) {
894	sort.Sort(matchScoreSlice(ms))
895}
896
897func sortChunkMatchesByScore(ms []zoekt.ChunkMatch) {
898	sort.Sort(chunkMatchScoreSlice(ms))
899}
900
901// SortFiles sorts files matches in the order we want to present results to
902// users. The order depends on the match score, which includes both
903// query-dependent signals like word overlap, and file-only signals like the
904// file ranks (if file ranks are enabled).
905//
906// We don't only use the scores, we will also boost some results to present
907// files with novel extensions.
908func SortFiles(ms []zoekt.FileMatch) {
909	sort.Sort(fileMatchesByScore(ms))
910
911	// Boost a file extension not in the top 3 to the third filematch.
912	boostNovelExtension(ms, 2, 0.9)
913}
914
915func boostNovelExtension(ms []zoekt.FileMatch, boostOffset int, minScoreRatio float64) {
916	if len(ms) <= boostOffset+1 {
917		return
918	}
919
920	top := ms[:boostOffset]
921	candidates := ms[boostOffset:]
922
923	// Don't bother boosting something which is significantly different to the
924	// result it replaces.
925	minScoreForNovelty := candidates[0].Score * minScoreRatio
926
927	// We want to look for an ext that isn't in the top exts
928	exts := make([]string, len(top))
929	for i := range top {
930		exts[i] = path.Ext(top[i].FileName)
931	}
932
933	for i := range candidates {
934		// Do not assume sorted due to boostNovelExtension being called on subsets
935		if candidates[i].Score < minScoreForNovelty {
936			continue
937		}
938
939		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
940			continue
941		}
942
943		// Found what we are looking for, now boost to front of candidates (which
944		// is ms[boostOffset])
945		for ; i > 0; i-- {
946			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
947		}
948		return
949	}
950}
Configure Feed

Configure Feed