index/contentprovider.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / contentprovider.go
at main 28 kB View raw
Keegan Carruthers-Smith all: run modernize across codebase (#919) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"log"
 20	"path"
 21	"slices"
 22	"sort"
 23	"unicode"
 24	"unicode/utf8"
 25
 26	"github.com/sourcegraph/zoekt"
 27	"github.com/sourcegraph/zoekt/internal/ctags"
 28)
 29
 30var _ = log.Println
 31
// contentProvider is an abstraction to treat matches for names and
// content with the same code.
//
// A provider is positioned on one document at a time with setDocument. The
// underscore-prefixed fields are per-document caches that the accessor
// methods (data, newlines, docSections) fill on first use.
type contentProvider struct {
	// id is the index data that documents are read from.
	id    *indexData
	// stats accumulates counters (FilesLoaded, ContentBytesLoaded) as the
	// accessors load data.
	stats *zoekt.Stats

	// mutable

	// err holds the error result of the most recent read performed through
	// id; every lazy load overwrites it.
	err      error
	// idx is the ID of the current document.
	idx      uint32
	// _data caches the content of the current document; nil until loaded.
	_data    []byte
	// _nl caches the newline offsets of the current document; nil until
	// loaded.
	_nl      []uint32
	// _nlBuf is the slice from the previous newline read. It is handed back
	// to readNewlines on the next read, presumably for reuse as scratch
	// space.
	_nlBuf   []uint32
	// _sects caches the sections of the current document; nil until loaded.
	_sects   []DocumentSection
	// _sectBuf plays the same role for readDocSections as _nlBuf does for
	// readNewlines.
	_sectBuf []DocumentSection
	// fileSize is the size in bytes of the current document, derived from
	// the index boundaries in setDocument.
	fileSize uint32
}
 48
 49// setDocument skips to the given document.
 50func (p *contentProvider) setDocument(docID uint32) {
 51	fileStart := p.id.boundaries[docID]
 52
 53	p.idx = docID
 54	p.fileSize = p.id.boundaries[docID+1] - fileStart
 55
 56	p._nl = nil
 57	p._sects = nil
 58	p._data = nil
 59}
 60
 61func (p *contentProvider) docSections() []DocumentSection {
 62	if p._sects == nil {
 63		var sz uint32
 64		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 65		p.stats.ContentBytesLoaded += int64(sz)
 66		p._sectBuf = p._sects
 67	}
 68	return p._sects
 69}
 70
 71func (p *contentProvider) newlines() newlines {
 72	if p._nl == nil {
 73		var sz uint32
 74		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 75		p._nlBuf = p._nl
 76		p.stats.ContentBytesLoaded += int64(sz)
 77	}
 78	return newlines{locs: p._nl, fileSize: p.fileSize}
 79}
 80
 81func (p *contentProvider) data(fileName bool) []byte {
 82	if fileName {
 83		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 84	}
 85
 86	if p._data == nil {
 87		p._data, p.err = p.id.readContents(p.idx)
 88		p.stats.FilesLoaded++
 89		p.stats.ContentBytesLoaded += int64(len(p._data))
 90	}
 91	return p._data
 92}
 93
 94// Find offset in bytes (relative to corpus start) for an offset in
 95// runes (relative to document start). If filename is set, the corpus
 96// is the set of filenames, with the document being the name itself.
 97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 98	if p.id.metaData.PlainASCII {
 99		return r
100	}
101
102	sample := p.id.runeOffsets
103	runeEnds := p.id.fileEndRunes
104	fileStartByte := p.id.boundaries[p.idx]
105	if filename {
106		sample = p.id.fileNameRuneOffsets
107		runeEnds = p.id.fileNameEndRunes
108		fileStartByte = p.id.fileNameIndex[p.idx]
109	}
110
111	absR := r
112	if p.idx > 0 {
113		absR += runeEnds[p.idx-1]
114	}
115
116	byteOff, left := sample.lookup(absR)
117
118	var data []byte
119
120	if filename {
121		data = p.id.fileNameContent[byteOff:]
122	} else {
123		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
124		if p.err != nil {
125			return 0
126		}
127	}
128	for left > 0 {
129		_, sz := utf8.DecodeRune(data)
130		byteOff += uint32(sz)
131		data = data[sz:]
132		left--
133	}
134
135	byteOff -= fileStartByte
136	return byteOff
137}
138
139// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
140// It only ever returns content XOR filename matches, not both. If there are any
141// content matches, these are always returned, and we omit filename matches.
142//
143// Performance invariant: ms is sorted and non-overlapping.
144//
145// Note: the byte slices may be backed by mmapped data, so before being
146// returned by the API it needs to be copied.
147func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
148	var filenameMatches []*candidateMatch
149	contentMatches := make([]*candidateMatch, 0, len(ms))
150
151	for _, m := range ms {
152		if m.fileName {
153			filenameMatches = append(filenameMatches, m)
154		} else {
155			contentMatches = append(contentMatches, m)
156		}
157	}
158
159	// If there are any content matches, we only return these and skip filename matches.
160	if len(contentMatches) > 0 {
161		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
162		return p.fillContentMatches(contentMatches, numContextLines, language, opts)
163	}
164
165	// Otherwise, we return a single line containing the filematch index.
166	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
167	res := zoekt.LineMatch{
168		Line:       p.id.fileName(p.idx),
169		FileName:   true,
170		Score:      lineScore.score,
171		DebugScore: lineScore.debugScore,
172	}
173
174	for _, m := range ms {
175		res.LineFragments = append(res.LineFragments, zoekt.LineFragmentMatch{
176			LineOffset:  int(m.byteOffset),
177			MatchLength: int(m.byteMatchSz),
178			Offset:      m.byteOffset,
179		})
180	}
181
182	return []zoekt.LineMatch{res}
183
184}
185
186// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
187// It only ever returns content XOR filename matches, not both. If there are any content
188// matches, these are always returned, and we omit filename matches.
189//
190// Performance invariant: ms is sorted and non-overlapping.
191//
192// Note: the byte slices may be backed by mmapped data, so before being
193// returned by the API it needs to be copied.
194func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
195	var filenameMatches []*candidateMatch
196	contentMatches := make([]*candidateMatch, 0, len(ms))
197
198	for _, m := range ms {
199		if m.fileName {
200			filenameMatches = append(filenameMatches, m)
201		} else {
202			contentMatches = append(contentMatches, m)
203		}
204	}
205
206	// If there are any content matches, we only return these and skip filename matches.
207	if len(contentMatches) > 0 {
208		return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts)
209	}
210
211	// Otherwise, we return a single chunk representing the filename index.
212	lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts)
213	fileName := p.id.fileName(p.idx)
214	ranges := make([]zoekt.Range, 0, len(ms))
215	for _, m := range ms {
216		ranges = append(ranges, zoekt.Range{
217			Start: zoekt.Location{
218				ByteOffset: m.byteOffset,
219				LineNumber: 1,
220				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
221			},
222			End: zoekt.Location{
223				ByteOffset: m.byteOffset + m.byteMatchSz,
224				LineNumber: 1,
225				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
226			},
227		})
228	}
229
230	return []zoekt.ChunkMatch{{
231		Content:      fileName,
232		ContentStart: zoekt.Location{ByteOffset: 0, LineNumber: 1, Column: 1},
233		Ranges:       ranges,
234		FileName:     true,
235		Score:        lineScore.score,
236		DebugScore:   lineScore.debugScore,
237	}}
238}
239
// fillContentMatches converts content candidate matches into LineMatch
// values, producing one LineMatch per group of candidates that start on the
// same line. ms is consumed front to back, so it is expected to be ordered
// by byteOffset.
//
// When numContextLines > 0 each LineMatch also carries up to that many lines
// before and after the match line. Offsets in the result are byte offsets
// into the document, and the returned slices alias the document data.
func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch {
	var result []zoekt.LineMatch
	for len(ms) > 0 {
		// The first remaining candidate determines the line handled in this
		// iteration.
		m := ms[0]
		num := p.newlines().atOffset(m.byteOffset)
		lineStart := int(p.newlines().lineStart(num))
		nextLineStart := int(p.newlines().lineStart(num + 1))

		var lineCands []*candidateMatch

		endMatch := m.byteOffset + m.byteMatchSz

		// Take every candidate that starts before the next line begins.
		// endMatch ends up as the end of the last candidate taken.
		for len(ms) > 0 {
			m := ms[0]
			if int(m.byteOffset) < nextLineStart {
				endMatch = m.byteOffset + m.byteMatchSz
				lineCands = append(lineCands, m)
				ms = ms[1:]
			} else {
				break
			}
		}

		// If nothing was taken, ms did not shrink and the outer loop would
		// never finish. Panic with enough detail to debug instead.
		if len(lineCands) == 0 {
			log.Panicf(
				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
				p.id.fileName(p.idx), p.id.metaData,
				num, lineStart, nextLineStart,
				m.byteOffset)
		}

		data := p.data(false)

		// Due to merging matches, we may have a match that
		// crosses a line boundary. Prevent confusion by
		// taking lines until we pass the last index
		for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
			next := bytes.IndexByte(data[nextLineStart:], '\n')
			if next == -1 {
				// No further newline: the match runs into the last line.
				nextLineStart = len(data)
			} else {
				// TODO(hanwen): test that checks "+1" part here.
				nextLineStart += next + 1
			}
		}

		finalMatch := zoekt.LineMatch{
			LineStart:  lineStart,
			LineEnd:    nextLineStart,
			LineNumber: num,
		}
		finalMatch.Line = data[lineStart:nextLineStart]

		if numContextLines > 0 {
			// getLines takes a half-open line range, so these are the
			// numContextLines lines before num and after num.
			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
		}

		lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts)
		finalMatch.Score = lineScore.score
		finalMatch.DebugScore = lineScore.debugScore

		for i, m := range lineCands {
			fragment := zoekt.LineFragmentMatch{
				Offset:      m.byteOffset,
				LineOffset:  int(m.byteOffset) - lineStart,
				MatchLength: int(m.byteMatchSz),
			}

			// symbolInfo is indexed like lineCands, but may be shorter or
			// hold nil entries.
			if i < len(symbolInfo) && symbolInfo[i] != nil {
				fragment.SymbolInfo = symbolInfo[i]
			}

			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
		}
		result = append(result, finalMatch)
	}
	return result
}
319
320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch {
321	data := p.data(false)
322
323	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
324	// However, it depends on ms being sorted by byteOffset and non-overlapping.
325	// This invariant is true at the time of writing, but we conservatively
326	// enforce this. Note: chunkCandidates preserves the sorting so safe to
327	// transform now.
328	columnHelper := columnHelper{data: data}
329	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
330		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
331		sort.Sort((sortByOffsetSlice)(ms))
332	}
333
334	newlines := p.newlines()
335	chunks := chunkCandidates(ms, newlines, numContextLines)
336	chunkMatches := make([]zoekt.ChunkMatch, 0, len(chunks))
337	for _, chunk := range chunks {
338		ranges := make([]zoekt.Range, 0, len(chunk.candidates))
339		for _, cm := range chunk.candidates {
340			startOffset := cm.byteOffset
341			endOffset := cm.byteOffset + cm.byteMatchSz
342			startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
343
344			ranges = append(ranges, zoekt.Range{
345				Start: zoekt.Location{
346					ByteOffset: startOffset,
347					LineNumber: uint32(startLine),
348					Column:     columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
349				},
350				End: zoekt.Location{
351					ByteOffset: endOffset,
352					LineNumber: uint32(endLine),
353					Column:     columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
354				},
355			})
356		}
357
358		firstLineNumber := max(int(chunk.firstLine)-numContextLines, 1)
359		firstLineStart := newlines.lineStart(firstLineNumber)
360
361		chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts)
362		chunkMatches = append(chunkMatches, zoekt.ChunkMatch{
363			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
364			ContentStart: zoekt.Location{
365				ByteOffset: firstLineStart,
366				LineNumber: uint32(firstLineNumber),
367				Column:     1,
368			},
369			FileName:      false,
370			Ranges:        ranges,
371			SymbolInfo:    symbolInfo,
372			BestLineMatch: uint32(chunkScore.bestLine),
373			Score:         chunkScore.score,
374			DebugScore:    chunkScore.debugScore,
375		})
376	}
377	return chunkMatches
378}
379
// candidateChunk is a group of candidate matches that are reported together
// as one chunk, along with the line range and byte range the group covers.
// Values are built by chunkCandidates.
type candidateChunk struct {
	candidates []*candidateMatch
	firstLine  uint32 // 1-based, inclusive
	lastLine   uint32 // 1-based, inclusive
	minOffset  uint32 // 0-based, inclusive
	maxOffset  uint32 // 0-based, exclusive
}
387
388// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
389// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
390// it to overlap with an adjacent chunk.
391//
392// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
393// output invariants: if you flatten candidates the input invariant is retained.
394func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
395	var chunks []candidateChunk
396
397	for _, m := range ms {
398		startOffset := m.byteOffset
399		endOffset := m.byteOffset + m.byteMatchSz
400		firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
401
402		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
403			// If a new chunk created with the current candidateMatch would
404			// overlap with the previous chunk, instead add the candidateMatch
405			// to the last chunk and extend end of the last chunk.
406			last := &chunks[len(chunks)-1]
407			last.candidates = append(last.candidates, m)
408			if last.maxOffset < endOffset {
409				last.lastLine = uint32(lastLine)
410				last.maxOffset = uint32(endOffset)
411			}
412		} else {
413			chunks = append(chunks, candidateChunk{
414				firstLine:  uint32(firstLine),
415				lastLine:   uint32(lastLine),
416				minOffset:  startOffset,
417				maxOffset:  endOffset,
418				candidates: []*candidateMatch{m},
419			})
420		}
421	}
422	return chunks
423}
424
// columnHelper computes rune columns for byte offsets while remembering the
// result of the previous computation. Calling utf8.RuneCount from the start
// of the line for every match is O(nm), where m is the number of matches and
// n is the length of the line. Assuming candidates arrive in increasing
// offset order, continuing from the previous result makes it O(n) instead.
type columnHelper struct {
	data []byte

	// The zero value of each of these fields is a valid starting state.
	lastLineOffset int
	lastOffset     uint32
	lastRuneCount  uint32
}

// get returns the 1-based column, counted in runes, of the rune at byte
// offset offset in data. lineOffset is the byte offset in data at which the
// line containing offset starts.
func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
	var (
		pending []byte // bytes whose runes still need counting
		counted uint32 // runes already accounted for
	)
	if lineOffset == c.lastLineOffset && c.lastOffset <= offset {
		// Same line and not moving backwards: continue from the last answer.
		pending, counted = c.data[c.lastOffset:offset], c.lastRuneCount
	} else {
		// Different line, or an earlier offset: count from the line start.
		pending = c.data[lineOffset:offset]
	}

	total := counted + uint32(utf8.RuneCount(pending))
	c.lastLineOffset, c.lastOffset, c.lastRuneCount = lineOffset, offset, total

	return total + 1
}
459
// newlines maps between byte offsets and line numbers of a single file.
type newlines struct {
	// locs is the sorted set of byte offsets of the newlines in the file
	locs []uint32

	// fileSize is the number of bytes in the file. Not every file ends in a
	// newline, so it is kept here to know where the last line ends.
	fileSize uint32
}

// atOffset returns the 1-based number of the line containing offset. An
// offset that lands on the newline terminating line M belongs to line M.
func (nls newlines) atOffset(offset uint32) (lineNumber int) {
	// The number of newlines strictly before offset is the number of
	// complete lines preceding it.
	linesBefore := sort.Search(len(nls.locs), func(i int) bool {
		return offset <= nls.locs[i]
	})
	return linesBefore + 1
}

// lineStart returns the byte offset at which the given line begins.
// lineNumber is 1-based. For a lineNumber outside the file the result is
// clamped to [0,fileSize].
func (nls newlines) lineStart(lineNumber int) uint32 {
	// Line N (N >= 2) starts right after newline number N-2, e.g.
	// nls.locs[0] + 1 is the start of the 2nd line.
	switch prev := lineNumber - 2; {
	case prev < 0:
		return 0
	case prev >= len(nls.locs):
		return nls.fileSize
	default:
		return nls.locs[prev] + 1
	}
}

// offsetRangeToLineRange returns the range of lines that fully contains the given byte range.
// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
	// lastByte is the final byte inside the range. It never falls before
	// startOffset, and the endOffset > 0 check keeps endOffset-1 from
	// wrapping around.
	lastByte := startOffset
	if endOffset > 0 && endOffset-1 > startOffset {
		lastByte = endOffset - 1
	}
	return nls.atOffset(startOffset), nls.atOffset(lastByte)
}

// getLines returns the part of data that holds the lines [low, high).
// low is 1-based and inclusive. high is 1-based and exclusive.
func (nls newlines) getLines(data []byte, low, high int) []byte {
	if high <= low {
		return nil
	}

	from, to := nls.lineStart(low), nls.lineStart(high)
	return data[from:to]
}
515
const (
	// Query-dependent scoring signals. All of these together are bounded at ~9000
	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
	// With the values below that sum is 500 + 7000 + 100*10 + 400 = 8900.
	// The "* 10" reflects the factor that scoreSymbolKind multiplies
	// scoreKindMatch by.
	scorePartialWordMatch = 50.0
	scoreWordMatch        = 500.0
	scoreBase             = 7000.0
	scorePartialBase      = 4000.0
	scoreSymbol           = 7000.0
	scorePartialSymbol    = 4000.0
	scoreKindMatch        = 100.0
	scoreFactorAtomMatch  = 400.0

	// Used for ordering line and chunk matches within a file.
	scoreLineOrderFactor = 1.0

	// Used for tiebreakers. The scores are not combined with the main score, but
	// are used to break ties between matches with the same score. The factors are
	// chosen to separate the tiebreakers from the main score and from each other.
	// If you make changes here, make sure to update indexData.scoreFile too.
	scoreRepoRankFactor  = 100.0
	scoreFileOrderFactor = 10.0
)
538
539// findMaxOverlappingSection returns the index of the section in secs that
540// overlaps the most with the area defined by off and sz, relative to the size
541// of the section. If no section overlaps, it returns 0, false. If multiple
542// sections overlap the same amount, the first one is returned.
543//
544// The implementation assumes that sections do not overlap and are sorted by
545// DocumentSection.Start.
546func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
547	start := off
548	end := off + sz
549
550	// Find the first section that might overlap
551	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
552
553	if j == len(secs) || secs[j].Start >= end {
554		// No overlap.
555		return 0, false
556	}
557
558	relOverlap := func(j int) float64 {
559		secSize := secs[j].End - secs[j].Start
560		if secSize == 0 {
561			return 0
562		}
563		// This cannot overflow because we make sure there is overlap before calling relOverlap
564		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
565		return float64(overlap) / float64(secSize)
566	}
567
568	ol1 := relOverlap(j)
569	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
570		return uint32(j), ol1 > 0
571	}
572
573	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
574	// if the second section overlaps more than the first one, because a third
575	// section can only overlap if the overlap with the second section is complete.
576	ol2 := relOverlap(j + 1)
577	if ol2 > ol1 {
578		return uint32(j + 1), ol2 > 0
579	}
580
581	return uint32(j), ol1 > 0
582}
583
584func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
585	if cm.fileName {
586		return false
587	}
588
589	// Check if this candidate came from a symbol matchTree
590	if cm.symbol {
591		return true
592	}
593
594	// Check if it overlaps with a symbol.
595	secs := p.docSections()
596	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
597	return ok
598}
599
600func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *zoekt.Symbol, bool) {
601	if cm.fileName {
602		return DocumentSection{}, nil, false
603	}
604
605	secs := p.docSections()
606
607	secIdx, ok := cm.symbolIdx, cm.symbol
608	if !ok {
609		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
610		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
611	}
612	if !ok {
613		return DocumentSection{}, nil, false
614	}
615
616	sec := secs[secIdx]
617
618	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
619	// since some callsites do not need it stored, and that incurs an extra
620	// copy.
621	//
622	// 2024-01-08 we are refactoring this and the code path indicates this can
623	// fail, so callers need to handle nil symbol. However, it would be
624	// surprising that we have a matching section but not symbol data.
625	start := p.id.fileEndSymbol[p.idx]
626	si := p.id.symbols.data(start + secIdx)
627
628	return sec, si, true
629}
630
631// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
632// End such that it won't be out of range.
633func sectionSlice(data []byte, sec DocumentSection) []byte {
634	l := uint32(len(data))
635	if sec.Start >= l {
636		return nil
637	}
638	if sec.End > l {
639		sec.End = l
640	}
641	return data[sec.Start:sec.End]
642}
643
// scoreSymbolKind boosts a match based on the combination of language, symbol
// and kind. The language string comes from go-enry, the symbol and kind from
// ctags.
//
// A generic factor is chosen from kind first. If language is one of the
// languages handled below and lists the kind, its factor replaces the generic
// one. For Go the factor is additionally raised by 0.5 when sym starts with an
// upper-case rune and scaled by 0.8 when filename ends in "_test.go". The
// return value is the final factor multiplied by scoreKindMatch.
func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
	var factor float64

	// Generic ranking which will be overridden by language specific ranking
	switch kind {
	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
		factor = 8
	case ctags.Class:
		factor = 10
	case ctags.Struct:
		factor = 9.5
	case ctags.Enum:
		factor = 9
	case ctags.Interface:
		factor = 8
	case ctags.Function, ctags.Method:
		factor = 7
	case ctags.Field:
		factor = 5.5
	case ctags.Constant:
		factor = 5
	case ctags.Variable:
		factor = 4
	default:
		// For all other kinds, assign a low score by default.
		factor = 1
	}

	// Language specific ranking. A kind that a language does not list keeps
	// its generic factor from above.
	switch language {
	case "Java", "java":
		switch kind {
		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
		// to "classes" instead of "c". We have to cover both cases to support existing
		// indexes.
		case ctags.Class:
			factor = 10
		case ctags.Enum:
			factor = 9
		case ctags.Interface:
			factor = 8
		case ctags.Method:
			factor = 7
		case ctags.Field:
			factor = 6
		case ctags.EnumConstant:
			factor = 5
		}
	case "Kotlin", "kotlin":
		switch kind {
		case ctags.Class:
			factor = 10
		case ctags.Interface:
			factor = 9
		case ctags.Method:
			factor = 8
		case ctags.TypeAlias:
			factor = 7
		case ctags.Constant:
			factor = 6
		case ctags.Variable:
			factor = 5
		}
	case "Go", "go":
		switch kind {
		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
		// for each case a description of the fields in ctags in the comment
		case ctags.Type: // interface struct talias
			factor = 9
		case ctags.Interface: // interfaces
			factor = 10
		case ctags.Struct: // structs
			factor = 9
		case ctags.TypeAlias: // type aliases
			factor = 9
		case ctags.MethodSpec: // interface method specification
			factor = 8.5
		case ctags.Method, ctags.Function: // functions
			factor = 8
		case ctags.Field: // struct fields
			factor = 7
		case ctags.Constant: // constants
			factor = 6
		case ctags.Variable: // variables
			factor = 5
		}

		// Boost exported go symbols. Same implementation as token.IsExported
		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
			factor += 0.5
		}

		// Symbols in test files are ranked lower than the same symbol
		// elsewhere.
		if bytes.HasSuffix(filename, []byte("_test.go")) {
			factor *= 0.8
		}

		// Could also rank on:
		//
		//   - anonMember  struct anonymous members
		//   - packageName name for specifying imported package
		//   - receiver    receivers
		//   - package     packages
		//   - type        types
		//   - unknown     unknown
	case "C++", "c++":
		switch kind {
		case ctags.Class: // classes
			factor = 10
		case ctags.Enum: // enumeration names
			factor = 9
		case ctags.Function: // function definitions
			factor = 8
		case ctags.Struct: // structure names
			factor = 7
		case ctags.Union: // union names
			factor = 6
		case ctags.TypeAlias: // typedefs
			factor = 5
		case ctags.Field: // class, struct, and union members
			factor = 4
		case ctags.Variable: // variable definitions
			factor = 3
		}
	// Could also rank on:
	// NAME        DESCRIPTION
	// macro       macro definitions
	// enumerator  enumerators (values inside an enumeration)
	// header      included header files
	// namespace   namespaces
	// variable    variable definitions
	case "Scala", "scala":
		switch kind {
		case ctags.Class:
			factor = 10
		case ctags.Interface:
			factor = 9
		case ctags.Object:
			factor = 8
		case ctags.Function:
			factor = 7
		case ctags.Type:
			factor = 6
		case ctags.Variable:
			factor = 5
		case ctags.Package:
			factor = 4
		}
	case "Python", "python":
		switch kind {
		case ctags.Class: // classes
			factor = 10
		case ctags.Function, ctags.Method: // function definitions
			factor = 8
		case ctags.Field: // class, struct, and union members
			factor = 4
		case ctags.Variable: // variable definitions
			factor = 3
		case ctags.Local: // local variables
			factor = 2
		}
		// Could also rank on:
		//
		//   - namespace name referring a module defined in other file
		//   - module    modules
		//   - unknown   name referring a class/variable/function/module defined in other module
		//   - parameter function parameters
	case "Ruby", "ruby":
		switch kind {
		case ctags.Class:
			factor = 10
		case ctags.Method:
			factor = 9
		case ctags.MethodAlias:
			factor = 8
		case ctags.Module:
			factor = 7
		case ctags.SingletonMethod:
			factor = 6
		case ctags.Constant:
			factor = 5
		case ctags.Accessor:
			factor = 4
		case ctags.Library:
			factor = 3
		}
	case "PHP", "php":
		switch kind {
		case ctags.Class:
			factor = 10
		case ctags.Interface:
			factor = 9
		case ctags.Function:
			factor = 8
		case ctags.Trait:
			factor = 7
		case ctags.Define:
			factor = 6
		case ctags.Namespace:
			factor = 5
		case ctags.MethodAlias:
			factor = 4
		case ctags.Variable:
			factor = 3
		case ctags.Local:
			factor = 3
		}
	case "GraphQL", "graphql":
		switch kind {
		case ctags.Type:
			factor = 10
		}
	case "Markdown", "markdown":
		// Headers are good signal in docs, but do not rank as highly as code.
		switch kind {
		case ctags.Chapter: // #
			factor = 4
		case ctags.Section: // ##
			factor = 3
		case ctags.Subsection: // ###
			factor = 2
		}
	}

	return factor * scoreKindMatch
}
871
872type matchScoreSlice []zoekt.LineMatch
873
874func (m matchScoreSlice) Len() int           { return len(m) }
875func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
876func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
877
878type chunkMatchScoreSlice []zoekt.ChunkMatch
879
880func (m chunkMatchScoreSlice) Len() int           { return len(m) }
881func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
882func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
883
884type fileMatchesByScore []zoekt.FileMatch
885
886func (m fileMatchesByScore) Len() int           { return len(m) }
887func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
888func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
889
890func sortMatchesByScore(ms []zoekt.LineMatch) {
891	sort.Sort(matchScoreSlice(ms))
892}
893
894func sortChunkMatchesByScore(ms []zoekt.ChunkMatch) {
895	sort.Sort(chunkMatchScoreSlice(ms))
896}
897
898// SortFiles sorts files matches in the order we want to present results to
899// users. The order depends on the match score, which includes both
900// query-dependent signals like word overlap, and file-only signals like the
901// file ranks (if file ranks are enabled).
902//
903// We don't only use the scores, we will also boost some results to present
904// files with novel extensions.
905func SortFiles(ms []zoekt.FileMatch) {
906	sort.Sort(fileMatchesByScore(ms))
907
908	// Boost a file extension not in the top 3 to the third filematch.
909	boostNovelExtension(ms, 2, 0.9)
910}
911
912func boostNovelExtension(ms []zoekt.FileMatch, boostOffset int, minScoreRatio float64) {
913	if len(ms) <= boostOffset+1 {
914		return
915	}
916
917	top := ms[:boostOffset]
918	candidates := ms[boostOffset:]
919
920	// Don't bother boosting something which is significantly different to the
921	// result it replaces.
922	minScoreForNovelty := candidates[0].Score * minScoreRatio
923
924	// We want to look for an ext that isn't in the top exts
925	exts := make([]string, len(top))
926	for i := range top {
927		exts[i] = path.Ext(top[i].FileName)
928	}
929
930	for i := range candidates {
931		// Do not assume sorted due to boostNovelExtension being called on subsets
932		if candidates[i].Score < minScoreForNovelty {
933			continue
934		}
935
936		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
937			continue
938		}
939
940		// Found what we are looking for, now boost to front of candidates (which
941		// is ms[boostOffset])
942		for ; i > 0; i-- {
943			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
944		}
945		return
946	}
947}
Configure Feed

Configure Feed