contentprovider.go at e068116194eadd9ef4d619fc53371289bf317d58 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at e068116194eadd9ef4d619fc53371289bf317d58 28 kB View raw
Julie Tibshirani Scoring: test against local scip-ctags (#677) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"log"
 21	"os"
 22	"path"
 23	"sort"
 24	"strings"
 25	"unicode"
 26	"unicode/utf8"
 27
 28	"github.com/sourcegraph/zoekt/ctags"
 29	"golang.org/x/exp/slices"
 30)
 31
 32var _ = log.Println
 33
 34// contentProvider is an abstraction to treat matches for names and
 35// content with the same code.
 36type contentProvider struct {
 37	id    *indexData
 38	stats *Stats
 39
 40	// mutable
 41	err      error
 42	idx      uint32
 43	_data    []byte
 44	_nl      []uint32
 45	_nlBuf   []uint32
 46	_sects   []DocumentSection
 47	_sectBuf []DocumentSection
 48	fileSize uint32
 49}
 50
 51// setDocument skips to the given document.
 52func (p *contentProvider) setDocument(docID uint32) {
 53	fileStart := p.id.boundaries[docID]
 54
 55	p.idx = docID
 56	p.fileSize = p.id.boundaries[docID+1] - fileStart
 57
 58	p._nl = nil
 59	p._sects = nil
 60	p._data = nil
 61}
 62
 63func (p *contentProvider) docSections() []DocumentSection {
 64	if p._sects == nil {
 65		var sz uint32
 66		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 67		p.stats.ContentBytesLoaded += int64(sz)
 68		p._sectBuf = p._sects
 69	}
 70	return p._sects
 71}
 72
 73func (p *contentProvider) newlines() newlines {
 74	if p._nl == nil {
 75		var sz uint32
 76		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 77		p._nlBuf = p._nl
 78		p.stats.ContentBytesLoaded += int64(sz)
 79	}
 80	return newlines{locs: p._nl, fileSize: p.fileSize}
 81}
 82
 83func (p *contentProvider) data(fileName bool) []byte {
 84	if fileName {
 85		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 86	}
 87
 88	if p._data == nil {
 89		p._data, p.err = p.id.readContents(p.idx)
 90		p.stats.FilesLoaded++
 91		p.stats.ContentBytesLoaded += int64(len(p._data))
 92	}
 93	return p._data
 94}
 95
 96// Find offset in bytes (relative to corpus start) for an offset in
 97// runes (relative to document start). If filename is set, the corpus
 98// is the set of filenames, with the document being the name itself.
 99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
100	if p.id.metaData.PlainASCII {
101		return r
102	}
103
104	sample := p.id.runeOffsets
105	runeEnds := p.id.fileEndRunes
106	fileStartByte := p.id.boundaries[p.idx]
107	if filename {
108		sample = p.id.fileNameRuneOffsets
109		runeEnds = p.id.fileNameEndRunes
110		fileStartByte = p.id.fileNameIndex[p.idx]
111	}
112
113	absR := r
114	if p.idx > 0 {
115		absR += runeEnds[p.idx-1]
116	}
117
118	byteOff, left := sample.lookup(absR)
119
120	var data []byte
121
122	if filename {
123		data = p.id.fileNameContent[byteOff:]
124	} else {
125		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
126		if p.err != nil {
127			return 0
128		}
129	}
130	for left > 0 {
131		_, sz := utf8.DecodeRune(data)
132		byteOff += uint32(sz)
133		data = data[sz:]
134		left--
135	}
136
137	byteOff -= fileStartByte
138	return byteOff
139}
140
141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
142	var result []LineMatch
143	if ms[0].fileName {
144		// There is only "line" in a filename.
145		res := LineMatch{
146			Line:     p.id.fileName(p.idx),
147			FileName: true,
148		}
149
150		for _, m := range ms {
151			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
152				LineOffset:  int(m.byteOffset),
153				MatchLength: int(m.byteMatchSz),
154				Offset:      m.byteOffset,
155			})
156
157			result = []LineMatch{res}
158		}
159	} else {
160		ms = breakMatchesOnNewlines(ms, p.data(false))
161		result = p.fillContentMatches(ms, numContextLines)
162	}
163
164	sects := p.docSections()
165	for i, m := range result {
166		result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
167	}
168
169	return result
170}
171
172func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
173	var result []ChunkMatch
174	if ms[0].fileName {
175		// If the first match is a filename match, there will only be
176		// one match and the matched content will be the filename.
177
178		fileName := p.id.fileName(p.idx)
179		ranges := make([]Range, 0, len(ms))
180		for _, m := range ms {
181			ranges = append(ranges, Range{
182				Start: Location{
183					ByteOffset: m.byteOffset,
184					LineNumber: 1,
185					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
186				},
187				End: Location{
188					ByteOffset: m.byteOffset + m.byteMatchSz,
189					LineNumber: 1,
190					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
191				},
192			})
193		}
194
195		result = []ChunkMatch{{
196			Content:      fileName,
197			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
198			Ranges:       ranges,
199			FileName:     true,
200		}}
201	} else {
202		result = p.fillContentChunkMatches(ms, numContextLines)
203	}
204
205	sects := p.docSections()
206	for i, m := range result {
207		result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
208	}
209
210	return result
211}
212
213func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
214	var result []LineMatch
215	for len(ms) > 0 {
216		m := ms[0]
217		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
218
219		var lineCands []*candidateMatch
220
221		endMatch := m.byteOffset + m.byteMatchSz
222
223		for len(ms) > 0 {
224			m := ms[0]
225			if int(m.byteOffset) <= lineEnd {
226				endMatch = m.byteOffset + m.byteMatchSz
227				lineCands = append(lineCands, m)
228				ms = ms[1:]
229			} else {
230				break
231			}
232		}
233
234		if len(lineCands) == 0 {
235			log.Panicf(
236				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
237				p.id.fileName(p.idx), p.id.metaData,
238				num, lineStart, lineEnd,
239				m.byteOffset)
240		}
241
242		data := p.data(false)
243
244		// Due to merging matches, we may have a match that
245		// crosses a line boundary. Prevent confusion by
246		// taking lines until we pass the last match
247		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
248			next := bytes.IndexByte(data[lineEnd+1:], '\n')
249			if next == -1 {
250				lineEnd = len(data)
251			} else {
252				// TODO(hanwen): test that checks "+1" part here.
253				lineEnd += next + 1
254			}
255		}
256
257		finalMatch := LineMatch{
258			LineStart:  lineStart,
259			LineEnd:    lineEnd,
260			LineNumber: num,
261		}
262		finalMatch.Line = data[lineStart:lineEnd]
263
264		if numContextLines > 0 {
265			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
266			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
267		}
268
269		for _, m := range lineCands {
270			fragment := LineFragmentMatch{
271				Offset:      m.byteOffset,
272				LineOffset:  int(m.byteOffset) - lineStart,
273				MatchLength: int(m.byteMatchSz),
274			}
275			if m.symbol {
276				start := p.id.fileEndSymbol[p.idx]
277				fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
278				if fragment.SymbolInfo != nil {
279					sec := p.docSections()[m.symbolIdx]
280					fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
281				}
282			}
283
284			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
285		}
286		result = append(result, finalMatch)
287	}
288	return result
289}
290
291func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
292	newlines := p.newlines()
293	chunks := chunkCandidates(ms, newlines, numContextLines)
294	data := p.data(false)
295	chunkMatches := make([]ChunkMatch, 0, len(chunks))
296	for _, chunk := range chunks {
297		ranges := make([]Range, 0, len(chunk.candidates))
298		var symbolInfo []*Symbol
299		for i, cm := range chunk.candidates {
300			startOffset := cm.byteOffset
301			endOffset := cm.byteOffset + cm.byteMatchSz
302			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
303			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
304
305			ranges = append(ranges, Range{
306				Start: Location{
307					ByteOffset: startOffset,
308					LineNumber: uint32(startLine),
309					Column:     uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
310				},
311				End: Location{
312					ByteOffset: endOffset,
313					LineNumber: uint32(endLine),
314					Column:     uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
315				},
316			})
317
318			if cm.symbol {
319				if symbolInfo == nil {
320					symbolInfo = make([]*Symbol, len(chunk.candidates))
321				}
322				start := p.id.fileEndSymbol[p.idx]
323				si := p.id.symbols.data(start + cm.symbolIdx)
324				if si != nil {
325					sec := p.docSections()[cm.symbolIdx]
326					si.Sym = string(data[sec.Start:sec.End])
327				}
328				symbolInfo[i] = si
329			}
330		}
331
332		firstLineNumber := int(chunk.firstLine) - numContextLines
333		if firstLineNumber < 1 {
334			firstLineNumber = 1
335		}
336		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
337
338		chunkMatches = append(chunkMatches, ChunkMatch{
339			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
340			ContentStart: Location{
341				ByteOffset: firstLineStart,
342				LineNumber: uint32(firstLineNumber),
343				Column:     1,
344			},
345			FileName:   false,
346			Ranges:     ranges,
347			SymbolInfo: symbolInfo,
348		})
349	}
350	return chunkMatches
351}
352
353type candidateChunk struct {
354	firstLine  uint32 // 1-based, inclusive
355	lastLine   uint32 // 1-based, inclusive
356	minOffset  uint32 // 0-based, inclusive
357	maxOffset  uint32 // 0-based, exclusive
358	candidates []*candidateMatch
359}
360
361// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
362// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
363// it to overlap with an adjacent chunk.
364func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
365	var chunks []candidateChunk
366	for _, m := range ms {
367		startOffset := m.byteOffset
368		endOffset := m.byteOffset + m.byteMatchSz
369		firstLine, _, _ := newlines.atOffset(startOffset)
370		lastLine, _, _ := newlines.atOffset(endOffset)
371
372		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
373			// If a new chunk created with the current candidateMatch would
374			// overlap with the previous chunk, instead add the candidateMatch
375			// to the last chunk and extend end of the last chunk.
376			last := &chunks[len(chunks)-1]
377			last.candidates = append(last.candidates, m)
378			if last.maxOffset < endOffset {
379				last.lastLine = uint32(lastLine)
380				last.maxOffset = uint32(endOffset)
381			}
382		} else {
383			chunks = append(chunks, candidateChunk{
384				firstLine:  uint32(firstLine),
385				lastLine:   uint32(lastLine),
386				minOffset:  startOffset,
387				maxOffset:  endOffset,
388				candidates: []*candidateMatch{m},
389			})
390		}
391	}
392	return chunks
393}
394
395type newlines struct {
396	// locs is the sorted set of byte offsets of the newlines in the file
397	locs []uint32
398
399	// fileSize is just the number of bytes in the file. It is stored
400	// on this struct so we can safely know the length of the last line
401	// in the file since not all files end in a newline.
402	fileSize uint32
403}
404
405// atOffset returns the line containing the offset. If the offset lands on
406// the newline ending line M, we return M.  The line is characterized
407// by its linenumber (base-1, byte index of line start, byte index of
408// line end). The line end is the index of a newline, or the filesize
409// (if matching the last line of the file.)
410func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
411	idx := sort.Search(len(nls.locs), func(n int) bool {
412		return nls.locs[n] >= offset
413	})
414
415	start, end := nls.lineBounds(idx + 1)
416	return idx + 1, int(start), int(end)
417}
418
419// lineBounds returns the byte offsets of the start and end of the 1-based
420// lineNumber. The end offset is exclusive and will not contain the line-ending
421// newline. If the line number is out of range of the lines in the file, start
422// and end will be clamped to [0,fileSize].
423func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
424	// nls.locs[0] + 1 is the start of the 2nd line of data.
425	startIdx := lineNumber - 2
426	endIdx := lineNumber - 1
427
428	if startIdx < 0 {
429		start = 0
430	} else if startIdx >= len(nls.locs) {
431		start = nls.fileSize
432	} else {
433		start = nls.locs[startIdx] + 1
434	}
435
436	if endIdx < 0 {
437		end = 0
438	} else if endIdx >= len(nls.locs) {
439		end = nls.fileSize
440	} else {
441		end = nls.locs[endIdx]
442	}
443
444	return start, end
445}
446
447// getLines returns a slice of data containing the lines [low, high).
448// low is 1-based and inclusive. high is 1-based and exclusive.
449func (nls newlines) getLines(data []byte, low, high int) []byte {
450	if low >= high {
451		return nil
452	}
453
454	lowStart, _ := nls.lineBounds(low)
455	_, highEnd := nls.lineBounds(high - 1)
456
457	return data[lowStart:highEnd]
458}
459
460const (
461	// Query-dependent scoring signals. All of these together are bounded at ~9000
462	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
463	scorePartialWordMatch = 50.0
464	scoreWordMatch        = 500.0
465	scoreBase             = 7000.0
466	scorePartialBase      = 4000.0
467	scoreSymbol           = 7000.0
468	scorePartialSymbol    = 4000.0
469	scoreKindMatch        = 100.0
470	scoreFactorAtomMatch  = 400.0
471
472	// File-only scoring signals. For now these are also bounded ~9000 to give them
473	// equal weight with the query-dependent signals.
474	scoreFileRankFactor  = 9000.0
475	scoreFileOrderFactor = 10.0
476	scoreRepoRankFactor  = 20.0
477
478	// Used for ordering line and chunk matches within a file.
479	scoreLineOrderFactor = 1.0
480)
481
482// findSection checks whether a section defined by offset and size lies within
483// one of the sections in secs.
484func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
485	j := sort.Search(len(secs), func(i int) bool {
486		return secs[i].End >= off+sz
487	})
488
489	if j == len(secs) {
490		return 0, false
491	}
492
493	if secs[j].Start <= off && off+sz <= secs[j].End {
494		return j, true
495	}
496	return 0, false
497}
498
499func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
500	type debugScore struct {
501		score float64
502		what  string
503	}
504
505	score := &debugScore{}
506	maxScore := &debugScore{}
507
508	addScore := func(what string, s float64) {
509		if s != 0 && debug {
510			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
511		}
512		score.score += s
513	}
514
515	data := p.data(m.FileName)
516	filename := p.data(true)
517
518	for i, r := range m.Ranges {
519		// calculate the start and end offset relative to the start of the content
520		relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
521		relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
522
523		startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
524		endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
525
526		score.score = 0
527		score.what = ""
528
529		if startBoundary && endBoundary {
530			addScore("WordMatch", scoreWordMatch)
531		} else if startBoundary || endBoundary {
532			addScore("PartialWordMatch", scorePartialWordMatch)
533		}
534
535		if m.FileName {
536			sep := bytes.LastIndexByte(m.Content, '/')
537			startMatch := relStartOffset == sep+1
538			endMatch := relEndOffset == len(m.Content)
539			if startMatch && endMatch {
540				addScore("Base", scoreBase)
541			} else if startMatch || endMatch {
542				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
543			} else if sep < relStartOffset {
544				addScore("InnerBase", scorePartialBase)
545			}
546		} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
547			sec := secs[secIdx]
548			startMatch := sec.Start == uint32(r.Start.ByteOffset)
549			endMatch := sec.End == uint32(r.End.ByteOffset)
550			if startMatch && endMatch {
551				addScore("Symbol", scoreSymbol)
552			} else if startMatch || endMatch {
553				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
554			} else {
555				addScore("InnerSymbol", scorePartialSymbol)
556			}
557
558			var si *Symbol
559			if m.SymbolInfo != nil {
560				si = m.SymbolInfo[i]
561			}
562			if si == nil {
563				// for non-symbol queries, we need to hydrate in SymbolInfo.
564				start := p.id.fileEndSymbol[p.idx]
565				si = p.id.symbols.data(start + uint32(secIdx))
566			}
567			if si != nil {
568				symbolKind := ctags.ParseSymbolKind(si.Kind)
569				sym := sectionSlice(data, sec)
570				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
571			}
572		}
573
574		if score.score > maxScore.score {
575			maxScore.score = score.score
576			maxScore.what = score.what
577		}
578	}
579
580	if debug {
581		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
582	}
583
584	return maxScore.score, maxScore.what
585}
586
587func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
588	type debugScore struct {
589		score float64
590		what  string
591	}
592
593	score := &debugScore{}
594	maxScore := &debugScore{}
595
596	addScore := func(what string, s float64) {
597		if s != 0 && debug {
598			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
599		}
600		score.score += s
601	}
602
603	data := p.data(m.FileName)
604	filename := p.data(true)
605
606	for _, f := range m.LineFragments {
607		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
608
609		end := int(f.LineOffset) + f.MatchLength
610		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
611
612		score.score = 0
613		score.what = ""
614
615		if startBoundary && endBoundary {
616			addScore("WordMatch", scoreWordMatch)
617		} else if startBoundary || endBoundary {
618			addScore("PartialWordMatch", scorePartialWordMatch)
619		}
620
621		if m.FileName {
622			sep := bytes.LastIndexByte(m.Line, '/')
623			startMatch := sep+1 == f.LineOffset
624			endMatch := len(m.Line) == f.LineOffset+f.MatchLength
625			if startMatch && endMatch {
626				addScore("Base", scoreBase)
627			} else if startMatch || endMatch {
628				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
629			} else if sep < f.LineOffset {
630				addScore("InnerBase", scorePartialBase)
631			}
632		} else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
633			sec := secs[secIdx]
634			startMatch := sec.Start == f.Offset
635			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
636			if startMatch && endMatch {
637				addScore("Symbol", scoreSymbol)
638			} else if startMatch || endMatch {
639				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
640			} else {
641				addScore("InnerSymbol", scorePartialSymbol)
642			}
643
644			si := f.SymbolInfo
645			if si == nil {
646				// for non-symbol queries, we need to hydrate in SymbolInfo.
647				start := p.id.fileEndSymbol[p.idx]
648				si = p.id.symbols.data(start + uint32(secIdx))
649			}
650			if si != nil {
651				// the LineFragment may not be on a symbol, then si will be nil.
652				symbolKind := ctags.ParseSymbolKind(si.Kind)
653				sym := sectionSlice(data, sec)
654				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
655			}
656		}
657
658		if score.score > maxScore.score {
659			maxScore.score = score.score
660			maxScore.what = score.what
661		}
662	}
663
664	if debug {
665		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
666	}
667
668	return maxScore.score, maxScore.what
669}
670
671// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
672// End such that it won't be out of range.
673func sectionSlice(data []byte, sec DocumentSection) []byte {
674	l := uint32(len(data))
675	if sec.Start >= l {
676		return nil
677	}
678	if sec.End > l {
679		sec.End = l
680	}
681	return data[sec.Start:sec.End]
682}
683
684
685// scoreSymbolKind boosts a match based on the combination of language, symbol
686// and kind. The language string comes from go-enry, the symbol and kind from
687// ctags.
688func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
689	var factor float64
690
691	// Generic ranking which will be overriden by language specific ranking
692	switch kind {
693	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
694		factor = 8
695	case ctags.Class:
696		factor = 10
697	case ctags.Struct:
698		factor = 9.5
699	case ctags.Enum:
700		factor = 9
701	case ctags.Interface:
702		factor = 8
703	case ctags.Function, ctags.Method:
704		factor = 7
705	case ctags.Field:
706		factor = 5.5
707	case ctags.Constant:
708		factor = 5
709	case ctags.Variable:
710		factor = 4
711	default:
712		// For all other kinds, assign a low score by default.
713		factor = 1
714	}
715
716	switch language {
717	case "Java", "java":
718		switch kind {
719		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
720		// to "classes" instead of "c". We have to cover both cases to support existing
721		// indexes.
722		case ctags.Class:
723			factor = 10
724		case ctags.Enum:
725			factor = 9
726		case ctags.Interface:
727			factor = 8
728		case ctags.Method:
729			factor = 7
730		case ctags.Field:
731			factor = 6
732		case ctags.EnumConstant:
733			factor = 5
734		}
735	case "Kotlin", "kotlin":
736		switch kind {
737		case ctags.Class:
738			factor = 10
739		case ctags.Interface:
740			factor = 9
741		case ctags.Method:
742			factor = 8
743		case ctags.TypeAlias:
744			factor = 7
745		case ctags.Constant:
746			factor = 6
747		case ctags.Variable:
748			factor = 5
749		}
750	case "Go", "go":
751		switch kind {
752		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
753		// for each case a description of the fields in ctags in the comment
754		case ctags.Type: // interface struct talias
755			factor = 9
756		case ctags.Interface: // interfaces
757			factor = 10
758		case ctags.Struct: // structs
759			factor = 9
760		case ctags.TypeAlias: // type aliases
761			factor = 9
762		case ctags.MethodSpec: // interface method specification
763			factor = 8.5
764		case ctags.Method, ctags.Function: // functions
765			factor = 8
766		case ctags.Field: // struct fields
767			factor = 7
768		case ctags.Constant: // constants
769			factor = 6
770		case ctags.Variable: // variables
771			factor = 5
772		}
773
774		// Boost exported go symbols. Same implementation as token.IsExported
775		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
776			factor += 0.5
777		}
778
779		if bytes.HasSuffix(filename, []byte("_test.go")) {
780			factor *= 0.8
781		}
782
783		// Could also rank on:
784		//
785		//   - anonMember  struct anonymous members
786		//   - packageName name for specifying imported package
787		//   - receiver    receivers
788		//   - package     packages
789		//   - type        types
790		//   - unknown     unknown
791	case "C++", "c++":
792		switch kind {
793		case ctags.Class: // classes
794			factor = 10
795		case ctags.Enum: // enumeration names
796			factor = 9
797		case ctags.Function: // function definitions
798			factor = 8
799		case ctags.Struct: // structure names
800			factor = 7
801		case ctags.Union: // union names
802			factor = 6
803		case ctags.TypeAlias: // typedefs
804			factor = 5
805		case ctags.Field: // class, struct, and union members
806			factor = 4
807		case ctags.Variable: // varialbe definitions
808			factor = 3
809		}
810	// Could also rank on:
811	// NAME        DESCRIPTION
812	// macro       macro definitions
813	// enumerator  enumerators (values inside an enumeration)
814	// header      included header files
815	// namespace   namespaces
816	// variable    variable definitions
817	case "Scala", "scala":
818		switch kind {
819		case ctags.Class:
820			factor = 10
821		case ctags.Interface:
822			factor = 9
823		case ctags.Object:
824			factor = 8
825		case ctags.Function:
826			factor = 7
827		case ctags.Type:
828			factor = 6
829		case ctags.Variable:
830			factor = 5
831		case ctags.Package:
832			factor = 4
833		}
834	case "Python", "python":
835		switch kind {
836		case ctags.Class: // classes
837			factor = 10
838		case ctags.Function, ctags.Method: // function definitions
839			factor = 8
840		case ctags.Field: // class, struct, and union members
841			factor = 4
842		case ctags.Variable: // variable definitions
843			factor = 3
844		case ctags.Local: // local variables
845			factor = 2
846		}
847		// Could also rank on:
848		//
849		//   - namespace name referring a module defined in other file
850		//   - module    modules
851		//   - unknown   name referring a class/variable/function/module defined in other module
852		//   - parameter function parameters
853	case "Ruby", "ruby":
854		switch kind {
855		case ctags.Class:
856			factor = 10
857		case ctags.Method:
858			factor = 9
859		case ctags.MethodAlias:
860			factor = 8
861		case ctags.Module:
862			factor = 7
863		case ctags.SingletonMethod:
864			factor = 6
865		case ctags.Constant:
866			factor = 5
867		case ctags.Accessor:
868			factor = 4
869		case ctags.Library:
870			factor = 3
871		}
872	case "PHP", "php":
873		switch kind {
874		case ctags.Class:
875			factor = 10
876		case ctags.Interface:
877			factor = 9
878		case ctags.Function:
879			factor = 8
880		case ctags.Trait:
881			factor = 7
882		case ctags.Define:
883			factor = 6
884		case ctags.Namespace:
885			factor = 5
886		case ctags.MethodAlias:
887			factor = 4
888		case ctags.Variable:
889			factor = 3
890		case ctags.Local:
891			factor = 3
892		}
893	case "GraphQL", "graphql":
894		switch kind {
895		case ctags.Type:
896			factor = 10
897		}
898	case "Markdown", "markdown":
899		// Headers are good signal in docs, but do not rank as highly as code.
900		switch kind {
901		case ctags.Chapter: // #
902			factor = 4
903		case ctags.Section: // ##
904			factor = 3
905		case ctags.Subsection: // ###
906			factor = 2
907		}
908	}
909
910	return factor * scoreKindMatch
911}
912
913type matchScoreSlice []LineMatch
914
915func (m matchScoreSlice) Len() int           { return len(m) }
916func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
917func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
918
919type chunkMatchScoreSlice []ChunkMatch
920
921func (m chunkMatchScoreSlice) Len() int           { return len(m) }
922func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
923func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
924
925type fileMatchesByScore []FileMatch
926
927func (m fileMatchesByScore) Len() int           { return len(m) }
928func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
929func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
930
931func sortMatchesByScore(ms []LineMatch) {
932	sort.Sort(matchScoreSlice(ms))
933}
934
935func sortChunkMatchesByScore(ms []ChunkMatch) {
936	sort.Sort(chunkMatchScoreSlice(ms))
937}
938
939var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
940
941// SortFiles sorts files matches in the order we want to present results to
942// users. The order depends on the match score, which includes both
943// query-dependent signals like word overlap, and file-only signals like the
944// file ranks (if file ranks are enabled).
945//
946// We don't only use the scores, we will also boost some results to present
947// files with novel extensions.
948func SortFiles(ms []FileMatch) {
949	sort.Sort(fileMatchesByScore(ms))
950
951	if doNovelty {
952		// Experimentally boost something into the third filematch
953		boostNovelExtension(ms, 2, 0.9)
954	}
955}
956
957func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
958	if len(ms) <= boostOffset+1 {
959		return
960	}
961
962	top := ms[:boostOffset]
963	candidates := ms[boostOffset:]
964
965	// Don't bother boosting something which is significantly different to the
966	// result it replaces.
967	minScoreForNovelty := candidates[0].Score * minScoreRatio
968
969	// We want to look for an ext that isn't in the top exts
970	exts := make([]string, len(top))
971	for i := range top {
972		exts[i] = path.Ext(top[i].FileName)
973	}
974
975	for i := range candidates {
976		// Do not assume sorted due to boostNovelExtension being called on subsets
977		if candidates[i].Score < minScoreForNovelty {
978			continue
979		}
980
981		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
982			continue
983		}
984
985		// Found what we are looking for, now boost to front of candidates (which
986		// is ms[boostOffset])
987		for ; i > 0; i-- {
988			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
989		}
990		return
991	}
992}
Configure Feed

Configure Feed