contentprovider.go at 687cafc8f702e6c0efa0b562b35c0eee619a88d8 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 687cafc8f702e6c0efa0b562b35c0eee619a88d8 32 kB View raw
Julie Tibshirani Boost symbol matches in BM25 (#876) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"path"
  22	"slices"
  23	"sort"
  24	"strings"
  25	"unicode"
  26	"unicode/utf8"
  27
  28	"github.com/sourcegraph/zoekt/ctags"
  29)
  30
  31var _ = log.Println
  32
  33// contentProvider is an abstraction to treat matches for names and
  34// content with the same code.
  35type contentProvider struct {
  36	id    *indexData
  37	stats *Stats
  38
  39	// mutable
  40	err      error
  41	idx      uint32
  42	_data    []byte
  43	_nl      []uint32
  44	_nlBuf   []uint32
  45	_sects   []DocumentSection
  46	_sectBuf []DocumentSection
  47	fileSize uint32
  48}
  49
  50// setDocument skips to the given document.
  51func (p *contentProvider) setDocument(docID uint32) {
  52	fileStart := p.id.boundaries[docID]
  53
  54	p.idx = docID
  55	p.fileSize = p.id.boundaries[docID+1] - fileStart
  56
  57	p._nl = nil
  58	p._sects = nil
  59	p._data = nil
  60}
  61
  62func (p *contentProvider) docSections() []DocumentSection {
  63	if p._sects == nil {
  64		var sz uint32
  65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  66		p.stats.ContentBytesLoaded += int64(sz)
  67		p._sectBuf = p._sects
  68	}
  69	return p._sects
  70}
  71
  72func (p *contentProvider) newlines() newlines {
  73	if p._nl == nil {
  74		var sz uint32
  75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  76		p._nlBuf = p._nl
  77		p.stats.ContentBytesLoaded += int64(sz)
  78	}
  79	return newlines{locs: p._nl, fileSize: p.fileSize}
  80}
  81
  82func (p *contentProvider) data(fileName bool) []byte {
  83	if fileName {
  84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  85	}
  86
  87	if p._data == nil {
  88		p._data, p.err = p.id.readContents(p.idx)
  89		p.stats.FilesLoaded++
  90		p.stats.ContentBytesLoaded += int64(len(p._data))
  91	}
  92	return p._data
  93}
  94
  95// Find offset in bytes (relative to corpus start) for an offset in
  96// runes (relative to document start). If filename is set, the corpus
  97// is the set of filenames, with the document being the name itself.
  98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
  99	if p.id.metaData.PlainASCII {
 100		return r
 101	}
 102
 103	sample := p.id.runeOffsets
 104	runeEnds := p.id.fileEndRunes
 105	fileStartByte := p.id.boundaries[p.idx]
 106	if filename {
 107		sample = p.id.fileNameRuneOffsets
 108		runeEnds = p.id.fileNameEndRunes
 109		fileStartByte = p.id.fileNameIndex[p.idx]
 110	}
 111
 112	absR := r
 113	if p.idx > 0 {
 114		absR += runeEnds[p.idx-1]
 115	}
 116
 117	byteOff, left := sample.lookup(absR)
 118
 119	var data []byte
 120
 121	if filename {
 122		data = p.id.fileNameContent[byteOff:]
 123	} else {
 124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 125		if p.err != nil {
 126			return 0
 127		}
 128	}
 129	for left > 0 {
 130		_, sz := utf8.DecodeRune(data)
 131		byteOff += uint32(sz)
 132		data = data[sz:]
 133		left--
 134	}
 135
 136	byteOff -= fileStartByte
 137	return byteOff
 138}
 139
 140// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
 141// It only ever returns content XOR filename matches, not both. If there are any
 142// content matches, these are always returned, and we omit filename matches.
 143//
 144// Performance invariant: ms is sorted and non-overlapping.
 145//
 146// Note: the byte slices may be backed by mmapped data, so before being
 147// returned by the API it needs to be copied.
 148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 149	var filenameMatches []*candidateMatch
 150	contentMatches := make([]*candidateMatch, 0, len(ms))
 151
 152	for _, m := range ms {
 153		if m.fileName {
 154			filenameMatches = append(filenameMatches, m)
 155		} else {
 156			contentMatches = append(contentMatches, m)
 157		}
 158	}
 159
 160	// If there are any content matches, we only return these and skip filename matches.
 161	if len(contentMatches) > 0 {
 162		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
 163		return p.fillContentMatches(contentMatches, numContextLines, language, debug)
 164	}
 165
 166	// Otherwise, we return a single line containing the filematch match.
 167	score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
 168	res := LineMatch{
 169		Line:       p.id.fileName(p.idx),
 170		FileName:   true,
 171		Score:      score,
 172		DebugScore: debugScore,
 173	}
 174
 175	for _, m := range ms {
 176		res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 177			LineOffset:  int(m.byteOffset),
 178			MatchLength: int(m.byteMatchSz),
 179			Offset:      m.byteOffset,
 180		})
 181	}
 182
 183	return []LineMatch{res}
 184
 185}
 186
 187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
 188// It only ever returns content XOR filename matches, not both. If there are any content
 189// matches, these are always returned, and we omit filename matches.
 190//
 191// Performance invariant: ms is sorted and non-overlapping.
 192//
 193// Note: the byte slices may be backed by mmapped data, so before being
 194// returned by the API it needs to be copied.
 195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 196	var filenameMatches []*candidateMatch
 197	contentMatches := make([]*candidateMatch, 0, len(ms))
 198
 199	for _, m := range ms {
 200		if m.fileName {
 201			filenameMatches = append(filenameMatches, m)
 202		} else {
 203			contentMatches = append(contentMatches, m)
 204		}
 205	}
 206
 207	// If there are any content matches, we only return these and skip filename matches.
 208	if len(contentMatches) > 0 {
 209		return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
 210	}
 211
 212	// Otherwise, we return a single chunk representing the filename match.
 213	score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
 214	fileName := p.id.fileName(p.idx)
 215	ranges := make([]Range, 0, len(ms))
 216	for _, m := range ms {
 217		ranges = append(ranges, Range{
 218			Start: Location{
 219				ByteOffset: m.byteOffset,
 220				LineNumber: 1,
 221				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 222			},
 223			End: Location{
 224				ByteOffset: m.byteOffset + m.byteMatchSz,
 225				LineNumber: 1,
 226				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 227			},
 228		})
 229	}
 230
 231	return []ChunkMatch{{
 232		Content:      fileName,
 233		ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 234		Ranges:       ranges,
 235		FileName:     true,
 236
 237		Score:      score,
 238		DebugScore: debugScore,
 239	}}
 240}
 241
 242func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 243	var result []LineMatch
 244	for len(ms) > 0 {
 245		m := ms[0]
 246		num := p.newlines().atOffset(m.byteOffset)
 247		lineStart := int(p.newlines().lineStart(num))
 248		nextLineStart := int(p.newlines().lineStart(num + 1))
 249
 250		var lineCands []*candidateMatch
 251
 252		endMatch := m.byteOffset + m.byteMatchSz
 253
 254		for len(ms) > 0 {
 255			m := ms[0]
 256			if int(m.byteOffset) < nextLineStart {
 257				endMatch = m.byteOffset + m.byteMatchSz
 258				lineCands = append(lineCands, m)
 259				ms = ms[1:]
 260			} else {
 261				break
 262			}
 263		}
 264
 265		if len(lineCands) == 0 {
 266			log.Panicf(
 267				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 268				p.id.fileName(p.idx), p.id.metaData,
 269				num, lineStart, nextLineStart,
 270				m.byteOffset)
 271		}
 272
 273		data := p.data(false)
 274
 275		// Due to merging matches, we may have a match that
 276		// crosses a line boundary. Prevent confusion by
 277		// taking lines until we pass the last match
 278		for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
 279			next := bytes.IndexByte(data[nextLineStart:], '\n')
 280			if next == -1 {
 281				nextLineStart = len(data)
 282			} else {
 283				// TODO(hanwen): test that checks "+1" part here.
 284				nextLineStart += next + 1
 285			}
 286		}
 287
 288		finalMatch := LineMatch{
 289			LineStart:  lineStart,
 290			LineEnd:    nextLineStart,
 291			LineNumber: num,
 292		}
 293		finalMatch.Line = data[lineStart:nextLineStart]
 294
 295		if numContextLines > 0 {
 296			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 297			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 298		}
 299
 300		score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
 301		finalMatch.Score = score
 302		finalMatch.DebugScore = debugScore
 303
 304		for i, m := range lineCands {
 305			fragment := LineFragmentMatch{
 306				Offset:      m.byteOffset,
 307				LineOffset:  int(m.byteOffset) - lineStart,
 308				MatchLength: int(m.byteMatchSz),
 309			}
 310			if i < len(symbolInfo) && symbolInfo[i] != nil {
 311				fragment.SymbolInfo = symbolInfo[i]
 312			}
 313
 314			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 315		}
 316		result = append(result, finalMatch)
 317	}
 318	return result
 319}
 320
 321func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 322	newlines := p.newlines()
 323	data := p.data(false)
 324
 325	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
 326	// However, it depends on ms being sorted by byteOffset and non-overlapping.
 327	// This invariant is true at the time of writing, but we conservatively
 328	// enforce this. Note: chunkCandidates preserves the sorting so safe to
 329	// transform now.
 330	columnHelper := columnHelper{data: data}
 331	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
 332		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
 333		sort.Sort((sortByOffsetSlice)(ms))
 334	}
 335
 336	chunks := chunkCandidates(ms, newlines, numContextLines)
 337	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 338	for _, chunk := range chunks {
 339		score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
 340
 341		ranges := make([]Range, 0, len(chunk.candidates))
 342		for _, cm := range chunk.candidates {
 343			startOffset := cm.byteOffset
 344			endOffset := cm.byteOffset + cm.byteMatchSz
 345			startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
 346
 347			ranges = append(ranges, Range{
 348				Start: Location{
 349					ByteOffset: startOffset,
 350					LineNumber: uint32(startLine),
 351					Column:     columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
 352				},
 353				End: Location{
 354					ByteOffset: endOffset,
 355					LineNumber: uint32(endLine),
 356					Column:     columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
 357				},
 358			})
 359		}
 360
 361		firstLineNumber := int(chunk.firstLine) - numContextLines
 362		if firstLineNumber < 1 {
 363			firstLineNumber = 1
 364		}
 365		firstLineStart := newlines.lineStart(firstLineNumber)
 366
 367		chunkMatches = append(chunkMatches, ChunkMatch{
 368			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 369			ContentStart: Location{
 370				ByteOffset: firstLineStart,
 371				LineNumber: uint32(firstLineNumber),
 372				Column:     1,
 373			},
 374			FileName:   false,
 375			Ranges:     ranges,
 376			SymbolInfo: symbolInfo,
 377			Score:      score,
 378			DebugScore: debugScore,
 379		})
 380	}
 381	return chunkMatches
 382}
 383
 384type candidateChunk struct {
 385	candidates []*candidateMatch
 386	firstLine  uint32 // 1-based, inclusive
 387	lastLine   uint32 // 1-based, inclusive
 388	minOffset  uint32 // 0-based, inclusive
 389	maxOffset  uint32 // 0-based, exclusive
 390}
 391
 392// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 393// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 394// it to overlap with an adjacent chunk.
 395//
 396// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
 397// output invariants: if you flatten candidates the input invariant is retained.
 398func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 399	var chunks []candidateChunk
 400	for _, m := range ms {
 401		startOffset := m.byteOffset
 402		endOffset := m.byteOffset + m.byteMatchSz
 403		firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
 404
 405		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 406			// If a new chunk created with the current candidateMatch would
 407			// overlap with the previous chunk, instead add the candidateMatch
 408			// to the last chunk and extend end of the last chunk.
 409			last := &chunks[len(chunks)-1]
 410			last.candidates = append(last.candidates, m)
 411			if last.maxOffset < endOffset {
 412				last.lastLine = uint32(lastLine)
 413				last.maxOffset = uint32(endOffset)
 414			}
 415		} else {
 416			chunks = append(chunks, candidateChunk{
 417				firstLine:  uint32(firstLine),
 418				lastLine:   uint32(lastLine),
 419				minOffset:  startOffset,
 420				maxOffset:  endOffset,
 421				candidates: []*candidateMatch{m},
 422			})
 423		}
 424	}
 425	return chunks
 426}
 427
 428// columnHelper is a helper struct which caches the number of runes last
 429// counted. If we naively use utf8.RuneCount for each match on a line, this
 430// leads to an O(nm) algorithm where m is the number of matches and n is the
 431// length of the line. Aassuming we our candidates are increasing in offset
 432// makes this operation O(n) instead.
 433type columnHelper struct {
 434	data []byte
 435
 436	// 0 values for all these are valid values
 437	lastLineOffset int
 438	lastOffset     uint32
 439	lastRuneCount  uint32
 440}
 441
 442// get returns the line column for offset. offset is the byte offset of the
 443// rune in data. lineOffset is the byte offset inside of data for the line
 444// containing offset.
 445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
 446	var runeCount uint32
 447
 448	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
 449		// Can count from last calculation
 450		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
 451	} else {
 452		// Need to count from the beginning of line
 453		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
 454	}
 455
 456	c.lastLineOffset = lineOffset
 457	c.lastOffset = offset
 458	c.lastRuneCount = runeCount
 459
 460	return runeCount + 1
 461}
 462
 463type newlines struct {
 464	// locs is the sorted set of byte offsets of the newlines in the file
 465	locs []uint32
 466
 467	// fileSize is just the number of bytes in the file. It is stored
 468	// on this struct so we can safely know the length of the last line
 469	// in the file since not all files end in a newline.
 470	fileSize uint32
 471}
 472
 473// atOffset returns the line containing the offset. If the offset lands on
 474// the newline ending line M, we return M.
 475func (nls newlines) atOffset(offset uint32) (lineNumber int) {
 476	idx := sort.Search(len(nls.locs), func(n int) bool {
 477		return nls.locs[n] >= offset
 478	})
 479	return idx + 1
 480}
 481
 482// lineStart returns the byte offset of the beginning of the given line.
 483// lineNumber is 1-based. If lineNumber is out of range of the lines in the
 484// file, the return value will be clamped to [0,fileSize].
 485func (nls newlines) lineStart(lineNumber int) uint32 {
 486	// nls.locs[0] + 1 is the start of the 2nd line of data.
 487	startIdx := lineNumber - 2
 488
 489	if startIdx < 0 {
 490		return 0
 491	} else if startIdx >= len(nls.locs) {
 492		return nls.fileSize
 493	} else {
 494		return nls.locs[startIdx] + 1
 495	}
 496}
 497
 498// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
 499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
 500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
 501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
 502	startLine = nls.atOffset(startOffset)
 503	endLine = nls.atOffset(
 504		max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
 505	)
 506	return startLine, endLine
 507}
 508
 509// getLines returns a slice of data containing the lines [low, high).
 510// low is 1-based and inclusive. high is 1-based and exclusive.
 511func (nls newlines) getLines(data []byte, low, high int) []byte {
 512	if low >= high {
 513		return nil
 514	}
 515
 516	return data[nls.lineStart(low):nls.lineStart(high)]
 517}
 518
 519const (
 520	// Query-dependent scoring signals. All of these together are bounded at ~9000
 521	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 522	scorePartialWordMatch = 50.0
 523	scoreWordMatch        = 500.0
 524	scoreBase             = 7000.0
 525	scorePartialBase      = 4000.0
 526	scoreSymbol           = 7000.0
 527	scorePartialSymbol    = 4000.0
 528	scoreKindMatch        = 100.0
 529	scoreFactorAtomMatch  = 400.0
 530
 531	// File-only scoring signals. For now these are also bounded ~9000 to give them
 532	// equal weight with the query-dependent signals.
 533	scoreFileRankFactor = 9000.0
 534
 535	// Used for ordering line and chunk matches within a file.
 536	scoreLineOrderFactor = 1.0
 537
 538	// Used for tiebreakers. The scores are not combined with the main score, but
 539	// are used to break ties between matches with the same score. The factors are
 540	// chosen to separate the tiebreakers from the main score and from each other.
 541	// If you make changes here, make sure to update indexData.scoreFile too.
 542	scoreRepoRankFactor  = 100.0
 543	scoreFileOrderFactor = 10.0
 544)
 545
 546// findMaxOverlappingSection returns the index of the section in secs that
 547// overlaps the most with the area defined by off and sz, relative to the size
 548// of the section. If no section overlaps, it returns 0, false. If multiple
 549// sections overlap the same amount, the first one is returned.
 550//
 551// The implementation assumes that sections do not overlap and are sorted by
 552// DocumentSection.Start.
 553func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
 554	start := off
 555	end := off + sz
 556
 557	// Find the first section that might overlap
 558	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
 559
 560	if j == len(secs) || secs[j].Start >= end {
 561		// No overlap.
 562		return 0, false
 563	}
 564
 565	relOverlap := func(j int) float64 {
 566		secSize := secs[j].End - secs[j].Start
 567		if secSize == 0 {
 568			return 0
 569		}
 570		// This cannot overflow because we make sure there is overlap before calling relOverlap
 571		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
 572		return float64(overlap) / float64(secSize)
 573	}
 574
 575	ol1 := relOverlap(j)
 576	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
 577		return uint32(j), ol1 > 0
 578	}
 579
 580	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
 581	// if the second section overlaps more than the first one, because a third
 582	// section can only overlap if the overlap with the second section is complete.
 583	ol2 := relOverlap(j + 1)
 584	if ol2 > ol1 {
 585		return uint32(j + 1), ol2 > 0
 586	}
 587
 588	return uint32(j), ol1 > 0
 589}
 590
 591func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
 592	if cm.fileName {
 593		return false
 594	}
 595
 596	// Check if this candidate came from a symbol matchTree
 597	if cm.symbol {
 598		return true
 599	}
 600
 601	// Check if it overlaps with a symbol.
 602	secs := p.docSections()
 603	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
 604	return ok
 605}
 606
 607func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
 608	if cm.fileName {
 609		return DocumentSection{}, nil, false
 610	}
 611
 612	secs := p.docSections()
 613
 614	secIdx, ok := cm.symbolIdx, cm.symbol
 615	if !ok {
 616		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
 617		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
 618	}
 619	if !ok {
 620		return DocumentSection{}, nil, false
 621	}
 622
 623	sec := secs[secIdx]
 624
 625	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
 626	// since some callsites do not need it stored, and that incurs an extra
 627	// copy.
 628	//
 629	// 2024-01-08 we are refactoring this and the code path indicates this can
 630	// fail, so callers need to handle nil symbol. However, it would be
 631	// surprising that we have a matching section but not symbol data.
 632	start := p.id.fileEndSymbol[p.idx]
 633	si := p.id.symbols.data(start + secIdx)
 634
 635	return sec, si, true
 636}
 637
 638// calculateTermFrequency computes the term frequency for the file match.
 639// Notes:
 640// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
 641// * Symbol matches also count more than content matches, to reward matches on symbol definitions.
 642func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
 643	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
 644	// sensitivity and treat filenames and symbols the same as content.
 645	termFreqs := map[string]int{}
 646	for _, m := range cands {
 647		term := string(m.substrLowered)
 648		if m.fileName || p.matchesSymbol(m) {
 649			termFreqs[term] += 5
 650		} else {
 651			termFreqs[term]++
 652		}
 653	}
 654
 655	for term := range termFreqs {
 656		df[term] += 1
 657	}
 658	return termFreqs
 659}
 660
 661func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
 662	type debugScore struct {
 663		what  string
 664		score float64
 665	}
 666
 667	score := &debugScore{}
 668	maxScore := &debugScore{}
 669
 670	addScore := func(what string, s float64) {
 671		if s != 0 && debug {
 672			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 673		}
 674		score.score += s
 675	}
 676
 677	filename := p.data(true)
 678	var symbolInfo []*Symbol
 679
 680	for i, m := range ms {
 681		data := p.data(m.fileName)
 682
 683		endOffset := m.byteOffset + m.byteMatchSz
 684		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
 685		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
 686
 687		score.score = 0
 688		score.what = ""
 689
 690		if startBoundary && endBoundary {
 691			addScore("WordMatch", scoreWordMatch)
 692		} else if startBoundary || endBoundary {
 693			addScore("PartialWordMatch", scorePartialWordMatch)
 694		}
 695
 696		if m.fileName {
 697			sep := bytes.LastIndexByte(data, '/')
 698			startMatch := int(m.byteOffset) == sep+1
 699			endMatch := endOffset == uint32(len(data))
 700			if startMatch && endMatch {
 701				addScore("Base", scoreBase)
 702			} else if startMatch || endMatch {
 703				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 704			} else if sep < int(m.byteOffset) {
 705				addScore("InnerBase", scorePartialBase)
 706			}
 707		} else if sec, si, ok := p.findSymbol(m); ok {
 708			startMatch := sec.Start == m.byteOffset
 709			endMatch := sec.End == endOffset
 710			if startMatch && endMatch {
 711				addScore("Symbol", scoreSymbol)
 712			} else if startMatch || endMatch {
 713				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 714			} else {
 715				addScore("OverlapSymbol", scorePartialSymbol)
 716			}
 717
 718			// Score based on symbol data
 719			if si != nil {
 720				symbolKind := ctags.ParseSymbolKind(si.Kind)
 721				sym := sectionSlice(data, sec)
 722
 723				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 724
 725				// This is from a symbol tree, so we need to store the symbol
 726				// information.
 727				if m.symbol {
 728					if symbolInfo == nil {
 729						symbolInfo = make([]*Symbol, len(ms))
 730					}
 731					// findSymbols does not hydrate in Sym. So we need to store it.
 732					si.Sym = string(sym)
 733					symbolInfo[i] = si
 734				}
 735			}
 736		}
 737
 738		// scoreWeight != 1 means it affects score
 739		if !epsilonEqualsOne(m.scoreWeight) {
 740			score.score = score.score * m.scoreWeight
 741			if debug {
 742				score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
 743			}
 744		}
 745
 746		if score.score > maxScore.score {
 747			maxScore.score = score.score
 748			maxScore.what = score.what
 749		}
 750	}
 751
 752	if debug {
 753		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 754	}
 755
 756	return maxScore.score, maxScore.what, symbolInfo
 757}
 758
 759// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 760// End such that it won't be out of range.
 761func sectionSlice(data []byte, sec DocumentSection) []byte {
 762	l := uint32(len(data))
 763	if sec.Start >= l {
 764		return nil
 765	}
 766	if sec.End > l {
 767		sec.End = l
 768	}
 769	return data[sec.Start:sec.End]
 770}
 771
 772// scoreSymbolKind boosts a match based on the combination of language, symbol
 773// and kind. The language string comes from go-enry, the symbol and kind from
 774// ctags.
 775func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
 776	var factor float64
 777
 778	// Generic ranking which will be overriden by language specific ranking
 779	switch kind {
 780	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 781		factor = 8
 782	case ctags.Class:
 783		factor = 10
 784	case ctags.Struct:
 785		factor = 9.5
 786	case ctags.Enum:
 787		factor = 9
 788	case ctags.Interface:
 789		factor = 8
 790	case ctags.Function, ctags.Method:
 791		factor = 7
 792	case ctags.Field:
 793		factor = 5.5
 794	case ctags.Constant:
 795		factor = 5
 796	case ctags.Variable:
 797		factor = 4
 798	default:
 799		// For all other kinds, assign a low score by default.
 800		factor = 1
 801	}
 802
 803	switch language {
 804	case "Java", "java":
 805		switch kind {
 806		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 807		// to "classes" instead of "c". We have to cover both cases to support existing
 808		// indexes.
 809		case ctags.Class:
 810			factor = 10
 811		case ctags.Enum:
 812			factor = 9
 813		case ctags.Interface:
 814			factor = 8
 815		case ctags.Method:
 816			factor = 7
 817		case ctags.Field:
 818			factor = 6
 819		case ctags.EnumConstant:
 820			factor = 5
 821		}
 822	case "Kotlin", "kotlin":
 823		switch kind {
 824		case ctags.Class:
 825			factor = 10
 826		case ctags.Interface:
 827			factor = 9
 828		case ctags.Method:
 829			factor = 8
 830		case ctags.TypeAlias:
 831			factor = 7
 832		case ctags.Constant:
 833			factor = 6
 834		case ctags.Variable:
 835			factor = 5
 836		}
 837	case "Go", "go":
 838		switch kind {
 839		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 840		// for each case a description of the fields in ctags in the comment
 841		case ctags.Type: // interface struct talias
 842			factor = 9
 843		case ctags.Interface: // interfaces
 844			factor = 10
 845		case ctags.Struct: // structs
 846			factor = 9
 847		case ctags.TypeAlias: // type aliases
 848			factor = 9
 849		case ctags.MethodSpec: // interface method specification
 850			factor = 8.5
 851		case ctags.Method, ctags.Function: // functions
 852			factor = 8
 853		case ctags.Field: // struct fields
 854			factor = 7
 855		case ctags.Constant: // constants
 856			factor = 6
 857		case ctags.Variable: // variables
 858			factor = 5
 859		}
 860
 861		// Boost exported go symbols. Same implementation as token.IsExported
 862		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 863			factor += 0.5
 864		}
 865
 866		if bytes.HasSuffix(filename, []byte("_test.go")) {
 867			factor *= 0.8
 868		}
 869
 870		// Could also rank on:
 871		//
 872		//   - anonMember  struct anonymous members
 873		//   - packageName name for specifying imported package
 874		//   - receiver    receivers
 875		//   - package     packages
 876		//   - type        types
 877		//   - unknown     unknown
 878	case "C++", "c++":
 879		switch kind {
 880		case ctags.Class: // classes
 881			factor = 10
 882		case ctags.Enum: // enumeration names
 883			factor = 9
 884		case ctags.Function: // function definitions
 885			factor = 8
 886		case ctags.Struct: // structure names
 887			factor = 7
 888		case ctags.Union: // union names
 889			factor = 6
 890		case ctags.TypeAlias: // typedefs
 891			factor = 5
 892		case ctags.Field: // class, struct, and union members
 893			factor = 4
 894		case ctags.Variable: // varialbe definitions
 895			factor = 3
 896		}
 897	// Could also rank on:
 898	// NAME        DESCRIPTION
 899	// macro       macro definitions
 900	// enumerator  enumerators (values inside an enumeration)
 901	// header      included header files
 902	// namespace   namespaces
 903	// variable    variable definitions
 904	case "Scala", "scala":
 905		switch kind {
 906		case ctags.Class:
 907			factor = 10
 908		case ctags.Interface:
 909			factor = 9
 910		case ctags.Object:
 911			factor = 8
 912		case ctags.Function:
 913			factor = 7
 914		case ctags.Type:
 915			factor = 6
 916		case ctags.Variable:
 917			factor = 5
 918		case ctags.Package:
 919			factor = 4
 920		}
 921	case "Python", "python":
 922		switch kind {
 923		case ctags.Class: // classes
 924			factor = 10
 925		case ctags.Function, ctags.Method: // function definitions
 926			factor = 8
 927		case ctags.Field: // class, struct, and union members
 928			factor = 4
 929		case ctags.Variable: // variable definitions
 930			factor = 3
 931		case ctags.Local: // local variables
 932			factor = 2
 933		}
 934		// Could also rank on:
 935		//
 936		//   - namespace name referring a module defined in other file
 937		//   - module    modules
 938		//   - unknown   name referring a class/variable/function/module defined in other module
 939		//   - parameter function parameters
 940	case "Ruby", "ruby":
 941		switch kind {
 942		case ctags.Class:
 943			factor = 10
 944		case ctags.Method:
 945			factor = 9
 946		case ctags.MethodAlias:
 947			factor = 8
 948		case ctags.Module:
 949			factor = 7
 950		case ctags.SingletonMethod:
 951			factor = 6
 952		case ctags.Constant:
 953			factor = 5
 954		case ctags.Accessor:
 955			factor = 4
 956		case ctags.Library:
 957			factor = 3
 958		}
 959	case "PHP", "php":
 960		switch kind {
 961		case ctags.Class:
 962			factor = 10
 963		case ctags.Interface:
 964			factor = 9
 965		case ctags.Function:
 966			factor = 8
 967		case ctags.Trait:
 968			factor = 7
 969		case ctags.Define:
 970			factor = 6
 971		case ctags.Namespace:
 972			factor = 5
 973		case ctags.MethodAlias:
 974			factor = 4
 975		case ctags.Variable:
 976			factor = 3
 977		case ctags.Local:
 978			factor = 3
 979		}
 980	case "GraphQL", "graphql":
 981		switch kind {
 982		case ctags.Type:
 983			factor = 10
 984		}
 985	case "Markdown", "markdown":
 986		// Headers are good signal in docs, but do not rank as highly as code.
 987		switch kind {
 988		case ctags.Chapter: // #
 989			factor = 4
 990		case ctags.Section: // ##
 991			factor = 3
 992		case ctags.Subsection: // ###
 993			factor = 2
 994		}
 995	}
 996
 997	return factor * scoreKindMatch
 998}
 999
1000type matchScoreSlice []LineMatch
1001
1002func (m matchScoreSlice) Len() int           { return len(m) }
1003func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1004func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1005
1006type chunkMatchScoreSlice []ChunkMatch
1007
1008func (m chunkMatchScoreSlice) Len() int           { return len(m) }
1009func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1010func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1011
1012type fileMatchesByScore []FileMatch
1013
1014func (m fileMatchesByScore) Len() int           { return len(m) }
1015func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1016func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
1017
1018func sortMatchesByScore(ms []LineMatch) {
1019	sort.Sort(matchScoreSlice(ms))
1020}
1021
1022func sortChunkMatchesByScore(ms []ChunkMatch) {
1023	sort.Sort(chunkMatchScoreSlice(ms))
1024}
1025
1026// SortFiles sorts files matches in the order we want to present results to
1027// users. The order depends on the match score, which includes both
1028// query-dependent signals like word overlap, and file-only signals like the
1029// file ranks (if file ranks are enabled).
1030//
1031// We don't only use the scores, we will also boost some results to present
1032// files with novel extensions.
1033func SortFiles(ms []FileMatch) {
1034	sort.Sort(fileMatchesByScore(ms))
1035
1036	// Boost a file extension not in the top 3 to the third filematch.
1037	boostNovelExtension(ms, 2, 0.9)
1038}
1039
1040func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1041	if len(ms) <= boostOffset+1 {
1042		return
1043	}
1044
1045	top := ms[:boostOffset]
1046	candidates := ms[boostOffset:]
1047
1048	// Don't bother boosting something which is significantly different to the
1049	// result it replaces.
1050	minScoreForNovelty := candidates[0].Score * minScoreRatio
1051
1052	// We want to look for an ext that isn't in the top exts
1053	exts := make([]string, len(top))
1054	for i := range top {
1055		exts[i] = path.Ext(top[i].FileName)
1056	}
1057
1058	for i := range candidates {
1059		// Do not assume sorted due to boostNovelExtension being called on subsets
1060		if candidates[i].Score < minScoreForNovelty {
1061			continue
1062		}
1063
1064		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1065			continue
1066		}
1067
1068		// Found what we are looking for, now boost to front of candidates (which
1069		// is ms[boostOffset])
1070		for ; i > 0; i-- {
1071			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1072		}
1073		return
1074	}
1075}
Configure Feed

Configure Feed