contentprovider.go at 4be2929fd03415e63831a6e02878dd67b44041fc · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 4be2929fd03415e63831a6e02878dd67b44041fc 30 kB View raw
Keegan Carruthers-Smith score: remove ZOEKT_NOVELTY_DISABLE (#718) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"path"
  22	"sort"
  23	"strings"
  24	"unicode"
  25	"unicode/utf8"
  26
  27	"github.com/sourcegraph/zoekt/ctags"
  28	"golang.org/x/exp/slices"
  29)
  30
  31var _ = log.Println
  32
  33// contentProvider is an abstraction to treat matches for names and
  34// content with the same code.
  35type contentProvider struct {
  36	id    *indexData
  37	stats *Stats
  38
  39	// mutable
  40	err      error
  41	idx      uint32
  42	_data    []byte
  43	_nl      []uint32
  44	_nlBuf   []uint32
  45	_sects   []DocumentSection
  46	_sectBuf []DocumentSection
  47	fileSize uint32
  48}
  49
  50// setDocument skips to the given document.
  51func (p *contentProvider) setDocument(docID uint32) {
  52	fileStart := p.id.boundaries[docID]
  53
  54	p.idx = docID
  55	p.fileSize = p.id.boundaries[docID+1] - fileStart
  56
  57	p._nl = nil
  58	p._sects = nil
  59	p._data = nil
  60}
  61
  62func (p *contentProvider) docSections() []DocumentSection {
  63	if p._sects == nil {
  64		var sz uint32
  65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  66		p.stats.ContentBytesLoaded += int64(sz)
  67		p._sectBuf = p._sects
  68	}
  69	return p._sects
  70}
  71
  72func (p *contentProvider) newlines() newlines {
  73	if p._nl == nil {
  74		var sz uint32
  75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  76		p._nlBuf = p._nl
  77		p.stats.ContentBytesLoaded += int64(sz)
  78	}
  79	return newlines{locs: p._nl, fileSize: p.fileSize}
  80}
  81
  82func (p *contentProvider) data(fileName bool) []byte {
  83	if fileName {
  84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  85	}
  86
  87	if p._data == nil {
  88		p._data, p.err = p.id.readContents(p.idx)
  89		p.stats.FilesLoaded++
  90		p.stats.ContentBytesLoaded += int64(len(p._data))
  91	}
  92	return p._data
  93}
  94
  95// Find offset in bytes (relative to corpus start) for an offset in
  96// runes (relative to document start). If filename is set, the corpus
  97// is the set of filenames, with the document being the name itself.
  98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
  99	if p.id.metaData.PlainASCII {
 100		return r
 101	}
 102
 103	sample := p.id.runeOffsets
 104	runeEnds := p.id.fileEndRunes
 105	fileStartByte := p.id.boundaries[p.idx]
 106	if filename {
 107		sample = p.id.fileNameRuneOffsets
 108		runeEnds = p.id.fileNameEndRunes
 109		fileStartByte = p.id.fileNameIndex[p.idx]
 110	}
 111
 112	absR := r
 113	if p.idx > 0 {
 114		absR += runeEnds[p.idx-1]
 115	}
 116
 117	byteOff, left := sample.lookup(absR)
 118
 119	var data []byte
 120
 121	if filename {
 122		data = p.id.fileNameContent[byteOff:]
 123	} else {
 124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 125		if p.err != nil {
 126			return 0
 127		}
 128	}
 129	for left > 0 {
 130		_, sz := utf8.DecodeRune(data)
 131		byteOff += uint32(sz)
 132		data = data[sz:]
 133		left--
 134	}
 135
 136	byteOff -= fileStartByte
 137	return byteOff
 138}
 139
 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 141	var result []LineMatch
 142	if ms[0].fileName {
 143		// There is only "line" in a filename.
 144		res := LineMatch{
 145			Line:     p.id.fileName(p.idx),
 146			FileName: true,
 147		}
 148
 149		for _, m := range ms {
 150			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 151				LineOffset:  int(m.byteOffset),
 152				MatchLength: int(m.byteMatchSz),
 153				Offset:      m.byteOffset,
 154			})
 155
 156			result = []LineMatch{res}
 157		}
 158	} else {
 159		ms = breakMatchesOnNewlines(ms, p.data(false))
 160		result = p.fillContentMatches(ms, numContextLines)
 161	}
 162
 163	sects := p.docSections()
 164	for i, m := range result {
 165		result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
 166	}
 167
 168	return result
 169}
 170
 171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
 172//
 173// Performance invariant: ms is sorted and non-overlapping.
 174//
 175// Note: the byte slices may be backed by mmapped data, so before being
 176// returned by the API it needs to be copied.
 177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 178	var result []ChunkMatch
 179	if ms[0].fileName {
 180		// If the first match is a filename match, there will only be
 181		// one match and the matched content will be the filename.
 182
 183		fileName := p.id.fileName(p.idx)
 184		ranges := make([]Range, 0, len(ms))
 185		for _, m := range ms {
 186			ranges = append(ranges, Range{
 187				Start: Location{
 188					ByteOffset: m.byteOffset,
 189					LineNumber: 1,
 190					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 191				},
 192				End: Location{
 193					ByteOffset: m.byteOffset + m.byteMatchSz,
 194					LineNumber: 1,
 195					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 196				},
 197			})
 198		}
 199
 200		result = []ChunkMatch{{
 201			Content:      fileName,
 202			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 203			Ranges:       ranges,
 204			FileName:     true,
 205		}}
 206	} else {
 207		result = p.fillContentChunkMatches(ms, numContextLines)
 208	}
 209
 210	sects := p.docSections()
 211	for i, m := range result {
 212		result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
 213	}
 214
 215	return result
 216}
 217
 218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
 219	var result []LineMatch
 220	for len(ms) > 0 {
 221		m := ms[0]
 222		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
 223
 224		var lineCands []*candidateMatch
 225
 226		endMatch := m.byteOffset + m.byteMatchSz
 227
 228		for len(ms) > 0 {
 229			m := ms[0]
 230			if int(m.byteOffset) <= lineEnd {
 231				endMatch = m.byteOffset + m.byteMatchSz
 232				lineCands = append(lineCands, m)
 233				ms = ms[1:]
 234			} else {
 235				break
 236			}
 237		}
 238
 239		if len(lineCands) == 0 {
 240			log.Panicf(
 241				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 242				p.id.fileName(p.idx), p.id.metaData,
 243				num, lineStart, lineEnd,
 244				m.byteOffset)
 245		}
 246
 247		data := p.data(false)
 248
 249		// Due to merging matches, we may have a match that
 250		// crosses a line boundary. Prevent confusion by
 251		// taking lines until we pass the last match
 252		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
 253			next := bytes.IndexByte(data[lineEnd+1:], '\n')
 254			if next == -1 {
 255				lineEnd = len(data)
 256			} else {
 257				// TODO(hanwen): test that checks "+1" part here.
 258				lineEnd += next + 1
 259			}
 260		}
 261
 262		finalMatch := LineMatch{
 263			LineStart:  lineStart,
 264			LineEnd:    lineEnd,
 265			LineNumber: num,
 266		}
 267		finalMatch.Line = data[lineStart:lineEnd]
 268
 269		if numContextLines > 0 {
 270			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 271			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 272		}
 273
 274		for _, m := range lineCands {
 275			fragment := LineFragmentMatch{
 276				Offset:      m.byteOffset,
 277				LineOffset:  int(m.byteOffset) - lineStart,
 278				MatchLength: int(m.byteMatchSz),
 279			}
 280			if m.symbol {
 281				start := p.id.fileEndSymbol[p.idx]
 282				fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
 283				if fragment.SymbolInfo != nil {
 284					sec := p.docSections()[m.symbolIdx]
 285					fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
 286				}
 287			}
 288
 289			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 290		}
 291		result = append(result, finalMatch)
 292	}
 293	return result
 294}
 295
 296func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
 297	newlines := p.newlines()
 298	data := p.data(false)
 299
 300	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
 301	// However, it depends on ms being sorted by byteOffset and non-overlapping.
 302	// This invariant is true at the time of writing, but we conservatively
 303	// enforce this. Note: chunkCandidates preserves the sorting so safe to
 304	// transform now.
 305	columnHelper := columnHelper{data: data}
 306	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
 307		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
 308		sort.Sort((sortByOffsetSlice)(ms))
 309	}
 310
 311	chunks := chunkCandidates(ms, newlines, numContextLines)
 312	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 313	for _, chunk := range chunks {
 314		ranges := make([]Range, 0, len(chunk.candidates))
 315		var symbolInfo []*Symbol
 316		for i, cm := range chunk.candidates {
 317			startOffset := cm.byteOffset
 318			endOffset := cm.byteOffset + cm.byteMatchSz
 319			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
 320			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
 321
 322			ranges = append(ranges, Range{
 323				Start: Location{
 324					ByteOffset: startOffset,
 325					LineNumber: uint32(startLine),
 326					Column:     columnHelper.get(startLineOffset, startOffset),
 327				},
 328				End: Location{
 329					ByteOffset: endOffset,
 330					LineNumber: uint32(endLine),
 331					Column:     columnHelper.get(endLineOffset, endOffset),
 332				},
 333			})
 334
 335			if cm.symbol {
 336				if symbolInfo == nil {
 337					symbolInfo = make([]*Symbol, len(chunk.candidates))
 338				}
 339				start := p.id.fileEndSymbol[p.idx]
 340				si := p.id.symbols.data(start + cm.symbolIdx)
 341				if si != nil {
 342					sec := p.docSections()[cm.symbolIdx]
 343					si.Sym = string(data[sec.Start:sec.End])
 344				}
 345				symbolInfo[i] = si
 346			}
 347		}
 348
 349		firstLineNumber := int(chunk.firstLine) - numContextLines
 350		if firstLineNumber < 1 {
 351			firstLineNumber = 1
 352		}
 353		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
 354
 355		chunkMatches = append(chunkMatches, ChunkMatch{
 356			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 357			ContentStart: Location{
 358				ByteOffset: firstLineStart,
 359				LineNumber: uint32(firstLineNumber),
 360				Column:     1,
 361			},
 362			FileName:   false,
 363			Ranges:     ranges,
 364			SymbolInfo: symbolInfo,
 365		})
 366	}
 367	return chunkMatches
 368}
 369
 370type candidateChunk struct {
 371	candidates []*candidateMatch
 372	firstLine  uint32 // 1-based, inclusive
 373	lastLine   uint32 // 1-based, inclusive
 374	minOffset  uint32 // 0-based, inclusive
 375	maxOffset  uint32 // 0-based, exclusive
 376}
 377
 378// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 379// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 380// it to overlap with an adjacent chunk.
 381//
 382// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
 383// output invariants: if you flatten candidates the input invariant is retained.
 384func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 385	var chunks []candidateChunk
 386	for _, m := range ms {
 387		startOffset := m.byteOffset
 388		endOffset := m.byteOffset + m.byteMatchSz
 389		firstLine, _, _ := newlines.atOffset(startOffset)
 390		lastLine, _, _ := newlines.atOffset(endOffset)
 391
 392		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 393			// If a new chunk created with the current candidateMatch would
 394			// overlap with the previous chunk, instead add the candidateMatch
 395			// to the last chunk and extend end of the last chunk.
 396			last := &chunks[len(chunks)-1]
 397			last.candidates = append(last.candidates, m)
 398			if last.maxOffset < endOffset {
 399				last.lastLine = uint32(lastLine)
 400				last.maxOffset = uint32(endOffset)
 401			}
 402		} else {
 403			chunks = append(chunks, candidateChunk{
 404				firstLine:  uint32(firstLine),
 405				lastLine:   uint32(lastLine),
 406				minOffset:  startOffset,
 407				maxOffset:  endOffset,
 408				candidates: []*candidateMatch{m},
 409			})
 410		}
 411	}
 412	return chunks
 413}
 414
 415// columnHelper is a helper struct which caches the number of runes last
 416// counted. If we naively use utf8.RuneCount for each match on a line, this
 417// leads to an O(nm) algorithm where m is the number of matches and n is the
 418// length of the line. Aassuming we our candidates are increasing in offset
 419// makes this operation O(n) instead.
 420type columnHelper struct {
 421	data []byte
 422
 423	// 0 values for all these are valid values
 424	lastLineOffset int
 425	lastOffset     uint32
 426	lastRuneCount  uint32
 427}
 428
 429// get returns the line column for offset. offset is the byte offset of the
 430// rune in data. lineOffset is the byte offset inside of data for the line
 431// containing offset.
 432func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
 433	var runeCount uint32
 434
 435	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
 436		// Can count from last calculation
 437		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
 438	} else {
 439		// Need to count from the beginning of line
 440		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
 441	}
 442
 443	c.lastLineOffset = lineOffset
 444	c.lastOffset = offset
 445	c.lastRuneCount = runeCount
 446
 447	return runeCount + 1
 448}
 449
 450type newlines struct {
 451	// locs is the sorted set of byte offsets of the newlines in the file
 452	locs []uint32
 453
 454	// fileSize is just the number of bytes in the file. It is stored
 455	// on this struct so we can safely know the length of the last line
 456	// in the file since not all files end in a newline.
 457	fileSize uint32
 458}
 459
 460// atOffset returns the line containing the offset. If the offset lands on
 461// the newline ending line M, we return M.  The line is characterized
 462// by its linenumber (base-1, byte index of line start, byte index of
 463// line end). The line end is the index of a newline, or the filesize
 464// (if matching the last line of the file.)
 465func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
 466	idx := sort.Search(len(nls.locs), func(n int) bool {
 467		return nls.locs[n] >= offset
 468	})
 469
 470	start, end := nls.lineBounds(idx + 1)
 471	return idx + 1, int(start), int(end)
 472}
 473
 474// lineBounds returns the byte offsets of the start and end of the 1-based
 475// lineNumber. The end offset is exclusive and will not contain the line-ending
 476// newline. If the line number is out of range of the lines in the file, start
 477// and end will be clamped to [0,fileSize].
 478func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
 479	// nls.locs[0] + 1 is the start of the 2nd line of data.
 480	startIdx := lineNumber - 2
 481	endIdx := lineNumber - 1
 482
 483	if startIdx < 0 {
 484		start = 0
 485	} else if startIdx >= len(nls.locs) {
 486		start = nls.fileSize
 487	} else {
 488		start = nls.locs[startIdx] + 1
 489	}
 490
 491	if endIdx < 0 {
 492		end = 0
 493	} else if endIdx >= len(nls.locs) {
 494		end = nls.fileSize
 495	} else {
 496		end = nls.locs[endIdx]
 497	}
 498
 499	return start, end
 500}
 501
 502// getLines returns a slice of data containing the lines [low, high).
 503// low is 1-based and inclusive. high is 1-based and exclusive.
 504func (nls newlines) getLines(data []byte, low, high int) []byte {
 505	if low >= high {
 506		return nil
 507	}
 508
 509	lowStart, _ := nls.lineBounds(low)
 510	_, highEnd := nls.lineBounds(high - 1)
 511
 512	// Drop any trailing newline. Editors do not treat a trailing newline as
 513	// the start of a new line, so we should not either. lineBounds clamps to
 514	// len(data) when an out-of-bounds line is requested.
 515	//
 516	// As an example, if we request lines 1-5 from a file with contents
 517	// `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
 518	// the three "lines" in the file, separated by newlines.
 519	if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
 520		highEnd = highEnd - 1
 521		lowStart = min(lowStart, highEnd)
 522	}
 523
 524	return data[lowStart:highEnd]
 525}
 526
 527const (
 528	// Query-dependent scoring signals. All of these together are bounded at ~9000
 529	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 530	scorePartialWordMatch = 50.0
 531	scoreWordMatch        = 500.0
 532	scoreBase             = 7000.0
 533	scorePartialBase      = 4000.0
 534	scoreSymbol           = 7000.0
 535	scorePartialSymbol    = 4000.0
 536	scoreKindMatch        = 100.0
 537	scoreFactorAtomMatch  = 400.0
 538
 539	// File-only scoring signals. For now these are also bounded ~9000 to give them
 540	// equal weight with the query-dependent signals.
 541	scoreFileRankFactor  = 9000.0
 542	scoreFileOrderFactor = 10.0
 543	scoreRepoRankFactor  = 20.0
 544
 545	// Used for ordering line and chunk matches within a file.
 546	scoreLineOrderFactor = 1.0
 547)
 548
 549// findSection checks whether a section defined by offset and size lies within
 550// one of the sections in secs.
 551func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
 552	j := sort.Search(len(secs), func(i int) bool {
 553		return secs[i].End >= off+sz
 554	})
 555
 556	if j == len(secs) {
 557		return 0, false
 558	}
 559
 560	if secs[j].Start <= off && off+sz <= secs[j].End {
 561		return j, true
 562	}
 563	return 0, false
 564}
 565
 566func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
 567	type debugScore struct {
 568		what  string
 569		score float64
 570	}
 571
 572	score := &debugScore{}
 573	maxScore := &debugScore{}
 574
 575	addScore := func(what string, s float64) {
 576		if s != 0 && debug {
 577			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 578		}
 579		score.score += s
 580	}
 581
 582	data := p.data(m.FileName)
 583	filename := p.data(true)
 584
 585	for i, r := range m.Ranges {
 586		// calculate the start and end offset relative to the start of the content
 587		relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
 588		relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
 589
 590		startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
 591		endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
 592
 593		score.score = 0
 594		score.what = ""
 595
 596		if startBoundary && endBoundary {
 597			addScore("WordMatch", scoreWordMatch)
 598		} else if startBoundary || endBoundary {
 599			addScore("PartialWordMatch", scorePartialWordMatch)
 600		}
 601
 602		if m.FileName {
 603			sep := bytes.LastIndexByte(m.Content, '/')
 604			startMatch := relStartOffset == sep+1
 605			endMatch := relEndOffset == len(m.Content)
 606			if startMatch && endMatch {
 607				addScore("Base", scoreBase)
 608			} else if startMatch || endMatch {
 609				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 610			} else if sep < relStartOffset {
 611				addScore("InnerBase", scorePartialBase)
 612			}
 613		} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
 614			sec := secs[secIdx]
 615			startMatch := sec.Start == uint32(r.Start.ByteOffset)
 616			endMatch := sec.End == uint32(r.End.ByteOffset)
 617			if startMatch && endMatch {
 618				addScore("Symbol", scoreSymbol)
 619			} else if startMatch || endMatch {
 620				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 621			} else {
 622				addScore("InnerSymbol", scorePartialSymbol)
 623			}
 624
 625			var si *Symbol
 626			if m.SymbolInfo != nil {
 627				si = m.SymbolInfo[i]
 628			}
 629			if si == nil {
 630				// for non-symbol queries, we need to hydrate in SymbolInfo.
 631				start := p.id.fileEndSymbol[p.idx]
 632				si = p.id.symbols.data(start + uint32(secIdx))
 633			}
 634			if si != nil {
 635				symbolKind := ctags.ParseSymbolKind(si.Kind)
 636				sym := sectionSlice(data, sec)
 637				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 638			}
 639		}
 640
 641		if score.score > maxScore.score {
 642			maxScore.score = score.score
 643			maxScore.what = score.what
 644		}
 645	}
 646
 647	if debug {
 648		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 649	}
 650
 651	return maxScore.score, maxScore.what
 652}
 653
 654func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
 655	type debugScore struct {
 656		what  string
 657		score float64
 658	}
 659
 660	score := &debugScore{}
 661	maxScore := &debugScore{}
 662
 663	addScore := func(what string, s float64) {
 664		if s != 0 && debug {
 665			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 666		}
 667		score.score += s
 668	}
 669
 670	data := p.data(m.FileName)
 671	filename := p.data(true)
 672
 673	for _, f := range m.LineFragments {
 674		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
 675
 676		end := int(f.LineOffset) + f.MatchLength
 677		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
 678
 679		score.score = 0
 680		score.what = ""
 681
 682		if startBoundary && endBoundary {
 683			addScore("WordMatch", scoreWordMatch)
 684		} else if startBoundary || endBoundary {
 685			addScore("PartialWordMatch", scorePartialWordMatch)
 686		}
 687
 688		if m.FileName {
 689			sep := bytes.LastIndexByte(m.Line, '/')
 690			startMatch := sep+1 == f.LineOffset
 691			endMatch := len(m.Line) == f.LineOffset+f.MatchLength
 692			if startMatch && endMatch {
 693				addScore("Base", scoreBase)
 694			} else if startMatch || endMatch {
 695				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 696			} else if sep < f.LineOffset {
 697				addScore("InnerBase", scorePartialBase)
 698			}
 699		} else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
 700			sec := secs[secIdx]
 701			startMatch := sec.Start == f.Offset
 702			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
 703			if startMatch && endMatch {
 704				addScore("Symbol", scoreSymbol)
 705			} else if startMatch || endMatch {
 706				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 707			} else {
 708				addScore("InnerSymbol", scorePartialSymbol)
 709			}
 710
 711			si := f.SymbolInfo
 712			if si == nil {
 713				// for non-symbol queries, we need to hydrate in SymbolInfo.
 714				start := p.id.fileEndSymbol[p.idx]
 715				si = p.id.symbols.data(start + uint32(secIdx))
 716			}
 717			if si != nil {
 718				// the LineFragment may not be on a symbol, then si will be nil.
 719				symbolKind := ctags.ParseSymbolKind(si.Kind)
 720				sym := sectionSlice(data, sec)
 721				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 722			}
 723		}
 724
 725		if score.score > maxScore.score {
 726			maxScore.score = score.score
 727			maxScore.what = score.what
 728		}
 729	}
 730
 731	if debug {
 732		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 733	}
 734
 735	return maxScore.score, maxScore.what
 736}
 737
 738// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 739// End such that it won't be out of range.
 740func sectionSlice(data []byte, sec DocumentSection) []byte {
 741	l := uint32(len(data))
 742	if sec.Start >= l {
 743		return nil
 744	}
 745	if sec.End > l {
 746		sec.End = l
 747	}
 748	return data[sec.Start:sec.End]
 749}
 750
 751// scoreSymbolKind boosts a match based on the combination of language, symbol
 752// and kind. The language string comes from go-enry, the symbol and kind from
 753// ctags.
 754func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
 755	var factor float64
 756
 757	// Generic ranking which will be overriden by language specific ranking
 758	switch kind {
 759	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 760		factor = 8
 761	case ctags.Class:
 762		factor = 10
 763	case ctags.Struct:
 764		factor = 9.5
 765	case ctags.Enum:
 766		factor = 9
 767	case ctags.Interface:
 768		factor = 8
 769	case ctags.Function, ctags.Method:
 770		factor = 7
 771	case ctags.Field:
 772		factor = 5.5
 773	case ctags.Constant:
 774		factor = 5
 775	case ctags.Variable:
 776		factor = 4
 777	default:
 778		// For all other kinds, assign a low score by default.
 779		factor = 1
 780	}
 781
 782	switch language {
 783	case "Java", "java":
 784		switch kind {
 785		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 786		// to "classes" instead of "c". We have to cover both cases to support existing
 787		// indexes.
 788		case ctags.Class:
 789			factor = 10
 790		case ctags.Enum:
 791			factor = 9
 792		case ctags.Interface:
 793			factor = 8
 794		case ctags.Method:
 795			factor = 7
 796		case ctags.Field:
 797			factor = 6
 798		case ctags.EnumConstant:
 799			factor = 5
 800		}
 801	case "Kotlin", "kotlin":
 802		switch kind {
 803		case ctags.Class:
 804			factor = 10
 805		case ctags.Interface:
 806			factor = 9
 807		case ctags.Method:
 808			factor = 8
 809		case ctags.TypeAlias:
 810			factor = 7
 811		case ctags.Constant:
 812			factor = 6
 813		case ctags.Variable:
 814			factor = 5
 815		}
 816	case "Go", "go":
 817		switch kind {
 818		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 819		// for each case a description of the fields in ctags in the comment
 820		case ctags.Type: // interface struct talias
 821			factor = 9
 822		case ctags.Interface: // interfaces
 823			factor = 10
 824		case ctags.Struct: // structs
 825			factor = 9
 826		case ctags.TypeAlias: // type aliases
 827			factor = 9
 828		case ctags.MethodSpec: // interface method specification
 829			factor = 8.5
 830		case ctags.Method, ctags.Function: // functions
 831			factor = 8
 832		case ctags.Field: // struct fields
 833			factor = 7
 834		case ctags.Constant: // constants
 835			factor = 6
 836		case ctags.Variable: // variables
 837			factor = 5
 838		}
 839
 840		// Boost exported go symbols. Same implementation as token.IsExported
 841		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 842			factor += 0.5
 843		}
 844
 845		if bytes.HasSuffix(filename, []byte("_test.go")) {
 846			factor *= 0.8
 847		}
 848
 849		// Could also rank on:
 850		//
 851		//   - anonMember  struct anonymous members
 852		//   - packageName name for specifying imported package
 853		//   - receiver    receivers
 854		//   - package     packages
 855		//   - type        types
 856		//   - unknown     unknown
 857	case "C++", "c++":
 858		switch kind {
 859		case ctags.Class: // classes
 860			factor = 10
 861		case ctags.Enum: // enumeration names
 862			factor = 9
 863		case ctags.Function: // function definitions
 864			factor = 8
 865		case ctags.Struct: // structure names
 866			factor = 7
 867		case ctags.Union: // union names
 868			factor = 6
 869		case ctags.TypeAlias: // typedefs
 870			factor = 5
 871		case ctags.Field: // class, struct, and union members
 872			factor = 4
 873		case ctags.Variable: // varialbe definitions
 874			factor = 3
 875		}
 876	// Could also rank on:
 877	// NAME        DESCRIPTION
 878	// macro       macro definitions
 879	// enumerator  enumerators (values inside an enumeration)
 880	// header      included header files
 881	// namespace   namespaces
 882	// variable    variable definitions
 883	case "Scala", "scala":
 884		switch kind {
 885		case ctags.Class:
 886			factor = 10
 887		case ctags.Interface:
 888			factor = 9
 889		case ctags.Object:
 890			factor = 8
 891		case ctags.Function:
 892			factor = 7
 893		case ctags.Type:
 894			factor = 6
 895		case ctags.Variable:
 896			factor = 5
 897		case ctags.Package:
 898			factor = 4
 899		}
 900	case "Python", "python":
 901		switch kind {
 902		case ctags.Class: // classes
 903			factor = 10
 904		case ctags.Function, ctags.Method: // function definitions
 905			factor = 8
 906		case ctags.Field: // class, struct, and union members
 907			factor = 4
 908		case ctags.Variable: // variable definitions
 909			factor = 3
 910		case ctags.Local: // local variables
 911			factor = 2
 912		}
 913		// Could also rank on:
 914		//
 915		//   - namespace name referring a module defined in other file
 916		//   - module    modules
 917		//   - unknown   name referring a class/variable/function/module defined in other module
 918		//   - parameter function parameters
 919	case "Ruby", "ruby":
 920		switch kind {
 921		case ctags.Class:
 922			factor = 10
 923		case ctags.Method:
 924			factor = 9
 925		case ctags.MethodAlias:
 926			factor = 8
 927		case ctags.Module:
 928			factor = 7
 929		case ctags.SingletonMethod:
 930			factor = 6
 931		case ctags.Constant:
 932			factor = 5
 933		case ctags.Accessor:
 934			factor = 4
 935		case ctags.Library:
 936			factor = 3
 937		}
 938	case "PHP", "php":
 939		switch kind {
 940		case ctags.Class:
 941			factor = 10
 942		case ctags.Interface:
 943			factor = 9
 944		case ctags.Function:
 945			factor = 8
 946		case ctags.Trait:
 947			factor = 7
 948		case ctags.Define:
 949			factor = 6
 950		case ctags.Namespace:
 951			factor = 5
 952		case ctags.MethodAlias:
 953			factor = 4
 954		case ctags.Variable:
 955			factor = 3
 956		case ctags.Local:
 957			factor = 3
 958		}
 959	case "GraphQL", "graphql":
 960		switch kind {
 961		case ctags.Type:
 962			factor = 10
 963		}
 964	case "Markdown", "markdown":
 965		// Headers are good signal in docs, but do not rank as highly as code.
 966		switch kind {
 967		case ctags.Chapter: // #
 968			factor = 4
 969		case ctags.Section: // ##
 970			factor = 3
 971		case ctags.Subsection: // ###
 972			factor = 2
 973		}
 974	}
 975
 976	return factor * scoreKindMatch
 977}
 978
 979type matchScoreSlice []LineMatch
 980
 981func (m matchScoreSlice) Len() int           { return len(m) }
 982func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 983func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 984
 985type chunkMatchScoreSlice []ChunkMatch
 986
 987func (m chunkMatchScoreSlice) Len() int           { return len(m) }
 988func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 989func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 990
 991type fileMatchesByScore []FileMatch
 992
 993func (m fileMatchesByScore) Len() int           { return len(m) }
 994func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 995func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
 996
 997func sortMatchesByScore(ms []LineMatch) {
 998	sort.Sort(matchScoreSlice(ms))
 999}
1000
1001func sortChunkMatchesByScore(ms []ChunkMatch) {
1002	sort.Sort(chunkMatchScoreSlice(ms))
1003}
1004
1005// SortFiles sorts files matches in the order we want to present results to
1006// users. The order depends on the match score, which includes both
1007// query-dependent signals like word overlap, and file-only signals like the
1008// file ranks (if file ranks are enabled).
1009//
1010// We don't only use the scores, we will also boost some results to present
1011// files with novel extensions.
1012func SortFiles(ms []FileMatch) {
1013	sort.Sort(fileMatchesByScore(ms))
1014
1015	// Boost a file extension not in the top 3 to the third filematch.
1016	boostNovelExtension(ms, 2, 0.9)
1017}
1018
1019func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1020	if len(ms) <= boostOffset+1 {
1021		return
1022	}
1023
1024	top := ms[:boostOffset]
1025	candidates := ms[boostOffset:]
1026
1027	// Don't bother boosting something which is significantly different to the
1028	// result it replaces.
1029	minScoreForNovelty := candidates[0].Score * minScoreRatio
1030
1031	// We want to look for an ext that isn't in the top exts
1032	exts := make([]string, len(top))
1033	for i := range top {
1034		exts[i] = path.Ext(top[i].FileName)
1035	}
1036
1037	for i := range candidates {
1038		// Do not assume sorted due to boostNovelExtension being called on subsets
1039		if candidates[i].Score < minScoreForNovelty {
1040			continue
1041		}
1042
1043		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1044			continue
1045		}
1046
1047		// Found what we are looking for, now boost to front of candidates (which
1048		// is ms[boostOffset])
1049		for ; i > 0; i-- {
1050			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1051		}
1052		return
1053	}
1054}
Configure Feed

Configure Feed