contentprovider.go at 1a3dddce84a769a32b83f98debe60ed02988f8d0 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 1a3dddce84a769a32b83f98debe60ed02988f8d0 27 kB View raw
Keegan Carruthers-Smith score: experimental extension novelty in sorting (#665) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"log"
 21	"os"
 22	"path"
 23	"sort"
 24	"strings"
 25	"unicode/utf8"
 26
 27	"golang.org/x/exp/slices"
 28)
 29
 30var _ = log.Println
 31
 32// contentProvider is an abstraction to treat matches for names and
 33// content with the same code.
 34type contentProvider struct {
 35	id    *indexData
 36	stats *Stats
 37
 38	// mutable
 39	err      error
 40	idx      uint32
 41	_data    []byte
 42	_nl      []uint32
 43	_nlBuf   []uint32
 44	_sects   []DocumentSection
 45	_sectBuf []DocumentSection
 46	fileSize uint32
 47}
 48
 49// setDocument skips to the given document.
 50func (p *contentProvider) setDocument(docID uint32) {
 51	fileStart := p.id.boundaries[docID]
 52
 53	p.idx = docID
 54	p.fileSize = p.id.boundaries[docID+1] - fileStart
 55
 56	p._nl = nil
 57	p._sects = nil
 58	p._data = nil
 59}
 60
 61func (p *contentProvider) docSections() []DocumentSection {
 62	if p._sects == nil {
 63		var sz uint32
 64		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 65		p.stats.ContentBytesLoaded += int64(sz)
 66		p._sectBuf = p._sects
 67	}
 68	return p._sects
 69}
 70
 71func (p *contentProvider) newlines() newlines {
 72	if p._nl == nil {
 73		var sz uint32
 74		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 75		p._nlBuf = p._nl
 76		p.stats.ContentBytesLoaded += int64(sz)
 77	}
 78	return newlines{locs: p._nl, fileSize: p.fileSize}
 79}
 80
 81func (p *contentProvider) data(fileName bool) []byte {
 82	if fileName {
 83		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 84	}
 85
 86	if p._data == nil {
 87		p._data, p.err = p.id.readContents(p.idx)
 88		p.stats.FilesLoaded++
 89		p.stats.ContentBytesLoaded += int64(len(p._data))
 90	}
 91	return p._data
 92}
 93
 94// Find offset in bytes (relative to corpus start) for an offset in
 95// runes (relative to document start). If filename is set, the corpus
 96// is the set of filenames, with the document being the name itself.
 97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 98	if p.id.metaData.PlainASCII {
 99		return r
100	}
101
102	sample := p.id.runeOffsets
103	runeEnds := p.id.fileEndRunes
104	fileStartByte := p.id.boundaries[p.idx]
105	if filename {
106		sample = p.id.fileNameRuneOffsets
107		runeEnds = p.id.fileNameEndRunes
108		fileStartByte = p.id.fileNameIndex[p.idx]
109	}
110
111	absR := r
112	if p.idx > 0 {
113		absR += runeEnds[p.idx-1]
114	}
115
116	byteOff, left := sample.lookup(absR)
117
118	var data []byte
119
120	if filename {
121		data = p.id.fileNameContent[byteOff:]
122	} else {
123		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
124		if p.err != nil {
125			return 0
126		}
127	}
128	for left > 0 {
129		_, sz := utf8.DecodeRune(data)
130		byteOff += uint32(sz)
131		data = data[sz:]
132		left--
133	}
134
135	byteOff -= fileStartByte
136	return byteOff
137}
138
139func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
140	var result []LineMatch
141	if ms[0].fileName {
142		// There is only "line" in a filename.
143		res := LineMatch{
144			Line:     p.id.fileName(p.idx),
145			FileName: true,
146		}
147
148		for _, m := range ms {
149			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
150				LineOffset:  int(m.byteOffset),
151				MatchLength: int(m.byteMatchSz),
152				Offset:      m.byteOffset,
153			})
154
155			result = []LineMatch{res}
156		}
157	} else {
158		ms = breakMatchesOnNewlines(ms, p.data(false))
159		result = p.fillContentMatches(ms, numContextLines)
160	}
161
162	sects := p.docSections()
163	for i, m := range result {
164		result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
165	}
166
167	return result
168}
169
170func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
171	var result []ChunkMatch
172	if ms[0].fileName {
173		// If the first match is a filename match, there will only be
174		// one match and the matched content will be the filename.
175
176		fileName := p.id.fileName(p.idx)
177		ranges := make([]Range, 0, len(ms))
178		for _, m := range ms {
179			ranges = append(ranges, Range{
180				Start: Location{
181					ByteOffset: m.byteOffset,
182					LineNumber: 1,
183					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
184				},
185				End: Location{
186					ByteOffset: m.byteOffset + m.byteMatchSz,
187					LineNumber: 1,
188					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
189				},
190			})
191		}
192
193		result = []ChunkMatch{{
194			Content:      fileName,
195			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
196			Ranges:       ranges,
197			FileName:     true,
198		}}
199	} else {
200		result = p.fillContentChunkMatches(ms, numContextLines)
201	}
202
203	sects := p.docSections()
204	for i, m := range result {
205		result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
206	}
207
208	return result
209}
210
211func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
212	var result []LineMatch
213	for len(ms) > 0 {
214		m := ms[0]
215		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
216
217		var lineCands []*candidateMatch
218
219		endMatch := m.byteOffset + m.byteMatchSz
220
221		for len(ms) > 0 {
222			m := ms[0]
223			if int(m.byteOffset) <= lineEnd {
224				endMatch = m.byteOffset + m.byteMatchSz
225				lineCands = append(lineCands, m)
226				ms = ms[1:]
227			} else {
228				break
229			}
230		}
231
232		if len(lineCands) == 0 {
233			log.Panicf(
234				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
235				p.id.fileName(p.idx), p.id.metaData,
236				num, lineStart, lineEnd,
237				m.byteOffset)
238		}
239
240		data := p.data(false)
241
242		// Due to merging matches, we may have a match that
243		// crosses a line boundary. Prevent confusion by
244		// taking lines until we pass the last match
245		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
246			next := bytes.IndexByte(data[lineEnd+1:], '\n')
247			if next == -1 {
248				lineEnd = len(data)
249			} else {
250				// TODO(hanwen): test that checks "+1" part here.
251				lineEnd += next + 1
252			}
253		}
254
255		finalMatch := LineMatch{
256			LineStart:  lineStart,
257			LineEnd:    lineEnd,
258			LineNumber: num,
259		}
260		finalMatch.Line = data[lineStart:lineEnd]
261
262		if numContextLines > 0 {
263			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
264			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
265		}
266
267		for _, m := range lineCands {
268			fragment := LineFragmentMatch{
269				Offset:      m.byteOffset,
270				LineOffset:  int(m.byteOffset) - lineStart,
271				MatchLength: int(m.byteMatchSz),
272			}
273			if m.symbol {
274				start := p.id.fileEndSymbol[p.idx]
275				fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
276				if fragment.SymbolInfo != nil {
277					sec := p.docSections()[m.symbolIdx]
278					fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
279				}
280			}
281
282			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
283		}
284		result = append(result, finalMatch)
285	}
286	return result
287}
288
289func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
290	newlines := p.newlines()
291	chunks := chunkCandidates(ms, newlines, numContextLines)
292	data := p.data(false)
293	chunkMatches := make([]ChunkMatch, 0, len(chunks))
294	for _, chunk := range chunks {
295		ranges := make([]Range, 0, len(chunk.candidates))
296		var symbolInfo []*Symbol
297		for i, cm := range chunk.candidates {
298			startOffset := cm.byteOffset
299			endOffset := cm.byteOffset + cm.byteMatchSz
300			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
301			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
302
303			ranges = append(ranges, Range{
304				Start: Location{
305					ByteOffset: startOffset,
306					LineNumber: uint32(startLine),
307					Column:     uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
308				},
309				End: Location{
310					ByteOffset: endOffset,
311					LineNumber: uint32(endLine),
312					Column:     uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
313				},
314			})
315
316			if cm.symbol {
317				if symbolInfo == nil {
318					symbolInfo = make([]*Symbol, len(chunk.candidates))
319				}
320				start := p.id.fileEndSymbol[p.idx]
321				si := p.id.symbols.data(start + cm.symbolIdx)
322				if si != nil {
323					sec := p.docSections()[cm.symbolIdx]
324					si.Sym = string(data[sec.Start:sec.End])
325				}
326				symbolInfo[i] = si
327			}
328		}
329
330		firstLineNumber := int(chunk.firstLine) - numContextLines
331		if firstLineNumber < 1 {
332			firstLineNumber = 1
333		}
334		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
335
336		chunkMatches = append(chunkMatches, ChunkMatch{
337			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
338			ContentStart: Location{
339				ByteOffset: firstLineStart,
340				LineNumber: uint32(firstLineNumber),
341				Column:     1,
342			},
343			FileName:   false,
344			Ranges:     ranges,
345			SymbolInfo: symbolInfo,
346		})
347	}
348	return chunkMatches
349}
350
351type candidateChunk struct {
352	firstLine  uint32 // 1-based, inclusive
353	lastLine   uint32 // 1-based, inclusive
354	minOffset  uint32 // 0-based, inclusive
355	maxOffset  uint32 // 0-based, exclusive
356	candidates []*candidateMatch
357}
358
359// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
360// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
361// it to overlap with an adjacent chunk.
362func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
363	var chunks []candidateChunk
364	for _, m := range ms {
365		startOffset := m.byteOffset
366		endOffset := m.byteOffset + m.byteMatchSz
367		firstLine, _, _ := newlines.atOffset(startOffset)
368		lastLine, _, _ := newlines.atOffset(endOffset)
369
370		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
371			// If a new chunk created with the current candidateMatch would
372			// overlap with the previous chunk, instead add the candidateMatch
373			// to the last chunk and extend end of the last chunk.
374			last := &chunks[len(chunks)-1]
375			last.candidates = append(last.candidates, m)
376			if last.maxOffset < endOffset {
377				last.lastLine = uint32(lastLine)
378				last.maxOffset = uint32(endOffset)
379			}
380		} else {
381			chunks = append(chunks, candidateChunk{
382				firstLine:  uint32(firstLine),
383				lastLine:   uint32(lastLine),
384				minOffset:  startOffset,
385				maxOffset:  endOffset,
386				candidates: []*candidateMatch{m},
387			})
388		}
389	}
390	return chunks
391}
392
393type newlines struct {
394	// locs is the sorted set of byte offsets of the newlines in the file
395	locs []uint32
396
397	// fileSize is just the number of bytes in the file. It is stored
398	// on this struct so we can safely know the length of the last line
399	// in the file since not all files end in a newline.
400	fileSize uint32
401}
402
403// atOffset returns the line containing the offset. If the offset lands on
404// the newline ending line M, we return M.  The line is characterized
405// by its linenumber (base-1, byte index of line start, byte index of
406// line end). The line end is the index of a newline, or the filesize
407// (if matching the last line of the file.)
408func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
409	idx := sort.Search(len(nls.locs), func(n int) bool {
410		return nls.locs[n] >= offset
411	})
412
413	start, end := nls.lineBounds(idx + 1)
414	return idx + 1, int(start), int(end)
415}
416
417// lineBounds returns the byte offsets of the start and end of the 1-based
418// lineNumber. The end offset is exclusive and will not contain the line-ending
419// newline. If the line number is out of range of the lines in the file, start
420// and end will be clamped to [0,fileSize].
421func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
422	// nls.locs[0] + 1 is the start of the 2nd line of data.
423	startIdx := lineNumber - 2
424	endIdx := lineNumber - 1
425
426	if startIdx < 0 {
427		start = 0
428	} else if startIdx >= len(nls.locs) {
429		start = nls.fileSize
430	} else {
431		start = nls.locs[startIdx] + 1
432	}
433
434	if endIdx < 0 {
435		end = 0
436	} else if endIdx >= len(nls.locs) {
437		end = nls.fileSize
438	} else {
439		end = nls.locs[endIdx]
440	}
441
442	return start, end
443}
444
445// getLines returns a slice of data containing the lines [low, high).
446// low is 1-based and inclusive. high is 1-based and exclusive.
447func (nls newlines) getLines(data []byte, low, high int) []byte {
448	if low >= high {
449		return nil
450	}
451
452	lowStart, _ := nls.lineBounds(low)
453	_, highEnd := nls.lineBounds(high - 1)
454
455	return data[lowStart:highEnd]
456}
457
458const (
459	// Query-dependent scoring signals. All of these together are bounded at ~9000
460	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
461	scorePartialWordMatch = 50.0
462	scoreWordMatch        = 500.0
463	scoreBase             = 7000.0
464	scorePartialBase      = 4000.0
465	scoreSymbol           = 7000.0
466	scorePartialSymbol    = 4000.0
467	scoreKindMatch        = 100.0
468	scoreFactorAtomMatch  = 400.0
469
470	// File-only scoring signals. For now these are also bounded ~9000 to give them
471	// equal weight with the query-dependent signals.
472	scoreFileRankFactor  = 9000.0
473	scoreFileOrderFactor = 10.0
474	scoreRepoRankFactor  = 20.0
475
476	// Used for ordering line and chunk matches within a file.
477	scoreLineOrderFactor = 1.0
478)
479
480// findSection checks whether a section defined by offset and size lies within
481// one of the sections in secs.
482func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
483	j := sort.Search(len(secs), func(i int) bool {
484		return secs[i].End >= off+sz
485	})
486
487	if j == len(secs) {
488		return 0, false
489	}
490
491	if secs[j].Start <= off && off+sz <= secs[j].End {
492		return j, true
493	}
494	return 0, false
495}
496
497func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
498	type debugScore struct {
499		score float64
500		what  string
501	}
502
503	score := &debugScore{}
504	maxScore := &debugScore{}
505
506	addScore := func(what string, s float64) {
507		if s != 0 && debug {
508			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
509		}
510		score.score += s
511	}
512
513	for i, r := range m.Ranges {
514		// calculate the start and end offset relative to the start of the content
515		relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
516		relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
517
518		startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
519		endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
520
521		score.score = 0
522		score.what = ""
523
524		if startBoundary && endBoundary {
525			addScore("WordMatch", scoreWordMatch)
526		} else if startBoundary || endBoundary {
527			addScore("PartialWordMatch", scorePartialWordMatch)
528		}
529
530		if m.FileName {
531			sep := bytes.LastIndexByte(m.Content, '/')
532			startMatch := relStartOffset == sep+1
533			endMatch := relEndOffset == len(m.Content)
534			if startMatch && endMatch {
535				addScore("Base", scoreBase)
536			} else if startMatch || endMatch {
537				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
538			} else if sep < relStartOffset {
539				addScore("InnerBase", scorePartialBase)
540			}
541		} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
542			sec := secs[secIdx]
543			startMatch := sec.Start == uint32(r.Start.ByteOffset)
544			endMatch := sec.End == uint32(r.End.ByteOffset)
545			if startMatch && endMatch {
546				addScore("Symbol", scoreSymbol)
547			} else if startMatch || endMatch {
548				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
549			} else {
550				addScore("InnerSymbol", scorePartialSymbol)
551			}
552
553			var si *Symbol
554			if m.SymbolInfo != nil {
555				si = m.SymbolInfo[i]
556			}
557			if si == nil {
558				// for non-symbol queries, we need to hydrate in SymbolInfo.
559				start := p.id.fileEndSymbol[p.idx]
560				si = p.id.symbols.data(start + uint32(secIdx))
561			}
562			if si != nil {
563				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind))
564			}
565		}
566
567		if score.score > maxScore.score {
568			maxScore.score = score.score
569			maxScore.what = score.what
570		}
571	}
572
573	if debug {
574		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
575	}
576
577	return maxScore.score, maxScore.what
578}
579
580func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
581	type debugScore struct {
582		score float64
583		what  string
584	}
585
586	score := &debugScore{}
587	maxScore := &debugScore{}
588
589	addScore := func(what string, s float64) {
590		if s != 0 && debug {
591			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
592		}
593		score.score += s
594	}
595
596	for _, f := range m.LineFragments {
597		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
598
599		end := int(f.LineOffset) + f.MatchLength
600		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
601
602		score.score = 0
603		score.what = ""
604
605		if startBoundary && endBoundary {
606			addScore("WordMatch", scoreWordMatch)
607		} else if startBoundary || endBoundary {
608			addScore("PartialWordMatch", scorePartialWordMatch)
609		}
610
611		if m.FileName {
612			sep := bytes.LastIndexByte(m.Line, '/')
613			startMatch := sep+1 == f.LineOffset
614			endMatch := len(m.Line) == f.LineOffset+f.MatchLength
615			if startMatch && endMatch {
616				addScore("Base", scoreBase)
617			} else if startMatch || endMatch {
618				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
619			} else if sep < f.LineOffset {
620				addScore("InnerBase", scorePartialBase)
621			}
622		} else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
623			sec := secs[secIdx]
624			startMatch := sec.Start == f.Offset
625			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
626			if startMatch && endMatch {
627				addScore("Symbol", scoreSymbol)
628			} else if startMatch || endMatch {
629				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
630			} else {
631				addScore("InnerSymbol", scorePartialSymbol)
632			}
633
634			si := f.SymbolInfo
635			if si == nil {
636				// for non-symbol queries, we need to hydrate in SymbolInfo.
637				start := p.id.fileEndSymbol[p.idx]
638				si = p.id.symbols.data(start + uint32(secIdx))
639			}
640			if si != nil {
641				// the LineFragment may not be on a symbol, then si will be nil.
642				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind))
643			}
644		}
645
646		if score.score > maxScore.score {
647			maxScore.score = score.score
648			maxScore.what = score.what
649		}
650	}
651
652	if debug {
653		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
654	}
655
656	return maxScore.score, maxScore.what
657}
658
659// scoreKind boosts a match based on the combination of language and kind. The
660// language string comes from go-enry, the kind string from ctags.
661func scoreKind(language string, kind string) float64 {
662	var factor float64
663
664	// Generic ranking which will be overriden by language specific ranking
665	switch kind {
666	case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
667		factor = 8
668	case "class":
669		factor = 10
670	case "struct":
671		factor = 9.5
672	case "enum":
673		factor = 9
674	case "interface":
675		factor = 8
676	case "function", "func", "method":
677		factor = 7
678	case "member", "field":
679		factor = 5.5
680	case "constant", "const":
681		factor = 5
682	case "var", "variable":
683		factor = 4
684
685	default:
686		// No idea what it is, but its something regarded as a symbol
687		factor = 1
688	}
689
690	// Refer to universal-ctags --list-kinds-full=<language> to learn about which
691	// kinds are detected for which language.
692	//
693	// Note that go-ctags uses universal-ctags's interactive mode and thus returns
694	// the full name for "kind" and not the one-letter abbreviation.
695	switch language {
696	case "Java", "java":
697		switch kind {
698		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
699		// to "classes" instead of "c". We have to cover both cases to support existing
700		// indexes.
701		case "class", "classes":
702			factor = 10
703		case "enum":
704			factor = 9
705		case "interface":
706			factor = 8
707		case "method":
708			factor = 7
709		case "field":
710			factor = 6
711		case "enumConstant":
712			factor = 5
713		}
714	case "Kotlin", "kotlin":
715		switch kind {
716		case "class":
717			factor = 10
718		case "interface":
719			factor = 9
720		case "method":
721			factor = 8
722		case "typealias":
723			factor = 7
724		case "constant":
725			factor = 6
726		case "variable":
727			factor = 5
728		}
729	case "Go", "go":
730		switch kind {
731		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
732		// for each case a description of the fields in ctags in the comment
733		case "type": // interface struct talias
734			factor = 10
735		case "method", "function": // methodSpec
736			factor = 8
737		case "variable": // var member
738			factor = 7
739		case "constant": // const
740			factor = 6
741
742		case "interface": // interfaces
743			factor = 10
744		case "struct": // structs
745			factor = 9
746		case "talias": // type aliases
747			factor = 9
748		case "methodSpec": // interface method specification
749			factor = 8.5
750		case "func": // functions
751			factor = 8
752		case "member": // struct members
753			factor = 7
754		case "const": // constants
755			factor = 6
756		case "var": // variables
757			factor = 5
758		}
759		// Could also rank on:
760		//
761		//   - anonMember  struct anonymous members
762		//   - packageName name for specifying imported package
763		//   - receiver    receivers
764		//   - package     packages
765		//   - type        types
766		//   - unknown     unknown
767	case "C++", "c++":
768		switch kind {
769		case "class": // classes
770			factor = 10
771		case "enum": // enumeration names
772			factor = 9
773		case "function": // function definitions
774			factor = 8
775		case "struct": // structure names
776			factor = 7
777		case "union": // union names
778			factor = 6
779		case "typdef": // typedefs
780			factor = 5
781		case "member": // class, struct, and union members
782			factor = 4
783		case "variable": // varialbe definitions
784			factor = 3
785		}
786	// Could also rank on:
787	// NAME        DESCRIPTION
788	// macro       macro definitions
789	// enumerator  enumerators (values inside an enumeration)
790	// header      included header files
791	// namespace   namespaces
792	// variable    variable definitions
793	case "Scala", "scala":
794		switch kind {
795		case "class":
796			factor = 10
797		case "interface":
798			factor = 9
799		case "object":
800			factor = 8
801		case "method":
802			factor = 7
803		case "type":
804			factor = 6
805		case "variable":
806			factor = 5
807		case "package":
808			factor = 4
809		}
810	case "Python", "python":
811		switch kind {
812		case "class": // classes
813			factor = 10
814		case "function": // function definitions
815			factor = 8
816		case "member": // class, struct, and union members
817			factor = 4
818		case "variable": // variable definitions
819			factor = 3
820		case "local": // local variables
821			factor = 2
822		}
823		// Could also rank on:
824		//
825		//   - namespace name referring a module defined in other file
826		//   - module    modules
827		//   - unknown   name referring a class/variable/function/module defined in other module
828		//   - parameter function parameters
829	case "Ruby", "ruby":
830		switch kind {
831		case "class":
832			factor = 10
833		case "method":
834			factor = 9
835		case "alias":
836			factor = 8
837		case "module":
838			factor = 7
839		case "singletonMethod":
840			factor = 6
841		case "constant":
842			factor = 5
843		case "accessor":
844			factor = 4
845		case "library":
846			factor = 3
847		}
848	case "PHP", "php":
849		switch kind {
850		case "class":
851			factor = 10
852		case "interface":
853			factor = 9
854		case "function":
855			factor = 8
856		case "trait":
857			factor = 7
858		case "define":
859			factor = 6
860		case "namespace":
861			factor = 5
862		case "alias":
863			factor = 4
864		case "variable":
865			factor = 3
866		case "local":
867			factor = 3
868		}
869	case "GraphQL", "graphql":
870		switch kind {
871		case "type":
872			factor = 10
873		}
874	case "Markdown", "markdown":
875		// Headers are good signal in docs, but do not rank as highly as code.
876		switch kind {
877		case "chapter": // #
878			factor = 4
879		case "section": // ##
880			factor = 3
881		case "subsection": // ###
882			factor = 2
883		}
884	}
885
886	return factor * scoreKindMatch
887}
888
889type matchScoreSlice []LineMatch
890
891func (m matchScoreSlice) Len() int           { return len(m) }
892func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
893func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
894
895type chunkMatchScoreSlice []ChunkMatch
896
897func (m chunkMatchScoreSlice) Len() int           { return len(m) }
898func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
899func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
900
901type fileMatchesByScore []FileMatch
902
903func (m fileMatchesByScore) Len() int           { return len(m) }
904func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
905func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
906
907func sortMatchesByScore(ms []LineMatch) {
908	sort.Sort(matchScoreSlice(ms))
909}
910
911func sortChunkMatchesByScore(ms []ChunkMatch) {
912	sort.Sort(chunkMatchScoreSlice(ms))
913}
914
915var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
916
917// SortFiles sorts files matches in the order we want to present results to
918// users. The order depends on the match score, which includes both
919// query-dependent signals like word overlap, and file-only signals like the
920// file ranks (if file ranks are enabled).
921//
922// We don't only use the scores, we will also boost some results to present
923// files with novel extensions.
924func SortFiles(ms []FileMatch) {
925	sort.Sort(fileMatchesByScore(ms))
926
927	if doNovelty {
928		// Experimentally boost something into the third filematch
929		boostNovelExtension(ms, 2, 0.9)
930	}
931}
932
933func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
934	if len(ms) <= boostOffset+1 {
935		return
936	}
937
938	top := ms[:boostOffset]
939	candidates := ms[boostOffset:]
940
941	// Don't bother boosting something which is significantly different to the
942	// result it replaces.
943	minScoreForNovelty := candidates[0].Score * minScoreRatio
944
945	// We want to look for an ext that isn't in the top exts
946	exts := make([]string, len(top))
947	for i := range top {
948		exts[i] = path.Ext(top[i].FileName)
949	}
950
951	for i := range candidates {
952		// Do not assume sorted due to boostNovelExtension being called on subsets
953		if candidates[i].Score < minScoreForNovelty {
954			continue
955		}
956
957		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
958			continue
959		}
960
961		// Found what we are looking for, now boost to front of candidates (which
962		// is ms[boostOffset])
963		for ; i > 0; i-- {
964			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
965		}
966		return
967	}
968}
Configure Feed

Configure Feed