contentprovider.go at b4a5b7cee93f51cd1c9392da69ac5ae2f61cb05d · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at b4a5b7cee93f51cd1c9392da69ac5ae2f61cb05d 32 kB View raw
Julie Tibshirani Add ChunkMatch.BestLineMatch to return the best-scoring line (#884) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"path"
  22	"slices"
  23	"sort"
  24	"strings"
  25	"unicode"
  26	"unicode/utf8"
  27
  28	"github.com/sourcegraph/zoekt/ctags"
  29)
  30
  31var _ = log.Println
  32
  33// contentProvider is an abstraction to treat matches for names and
  34// content with the same code.
  35type contentProvider struct {
  36	id    *indexData
  37	stats *Stats
  38
  39	// mutable
  40	err      error
  41	idx      uint32
  42	_data    []byte
  43	_nl      []uint32
  44	_nlBuf   []uint32
  45	_sects   []DocumentSection
  46	_sectBuf []DocumentSection
  47	fileSize uint32
  48}
  49
  50// setDocument skips to the given document.
  51func (p *contentProvider) setDocument(docID uint32) {
  52	fileStart := p.id.boundaries[docID]
  53
  54	p.idx = docID
  55	p.fileSize = p.id.boundaries[docID+1] - fileStart
  56
  57	p._nl = nil
  58	p._sects = nil
  59	p._data = nil
  60}
  61
  62func (p *contentProvider) docSections() []DocumentSection {
  63	if p._sects == nil {
  64		var sz uint32
  65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  66		p.stats.ContentBytesLoaded += int64(sz)
  67		p._sectBuf = p._sects
  68	}
  69	return p._sects
  70}
  71
  72func (p *contentProvider) newlines() newlines {
  73	if p._nl == nil {
  74		var sz uint32
  75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  76		p._nlBuf = p._nl
  77		p.stats.ContentBytesLoaded += int64(sz)
  78	}
  79	return newlines{locs: p._nl, fileSize: p.fileSize}
  80}
  81
  82func (p *contentProvider) data(fileName bool) []byte {
  83	if fileName {
  84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  85	}
  86
  87	if p._data == nil {
  88		p._data, p.err = p.id.readContents(p.idx)
  89		p.stats.FilesLoaded++
  90		p.stats.ContentBytesLoaded += int64(len(p._data))
  91	}
  92	return p._data
  93}
  94
  95// Find offset in bytes (relative to corpus start) for an offset in
  96// runes (relative to document start). If filename is set, the corpus
  97// is the set of filenames, with the document being the name itself.
  98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
  99	if p.id.metaData.PlainASCII {
 100		return r
 101	}
 102
 103	sample := p.id.runeOffsets
 104	runeEnds := p.id.fileEndRunes
 105	fileStartByte := p.id.boundaries[p.idx]
 106	if filename {
 107		sample = p.id.fileNameRuneOffsets
 108		runeEnds = p.id.fileNameEndRunes
 109		fileStartByte = p.id.fileNameIndex[p.idx]
 110	}
 111
 112	absR := r
 113	if p.idx > 0 {
 114		absR += runeEnds[p.idx-1]
 115	}
 116
 117	byteOff, left := sample.lookup(absR)
 118
 119	var data []byte
 120
 121	if filename {
 122		data = p.id.fileNameContent[byteOff:]
 123	} else {
 124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 125		if p.err != nil {
 126			return 0
 127		}
 128	}
 129	for left > 0 {
 130		_, sz := utf8.DecodeRune(data)
 131		byteOff += uint32(sz)
 132		data = data[sz:]
 133		left--
 134	}
 135
 136	byteOff -= fileStartByte
 137	return byteOff
 138}
 139
 140// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
 141// It only ever returns content XOR filename matches, not both. If there are any
 142// content matches, these are always returned, and we omit filename matches.
 143//
 144// Performance invariant: ms is sorted and non-overlapping.
 145//
 146// Note: the byte slices may be backed by mmapped data, so before being
 147// returned by the API it needs to be copied.
 148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 149	var filenameMatches []*candidateMatch
 150	contentMatches := make([]*candidateMatch, 0, len(ms))
 151
 152	for _, m := range ms {
 153		if m.fileName {
 154			filenameMatches = append(filenameMatches, m)
 155		} else {
 156			contentMatches = append(contentMatches, m)
 157		}
 158	}
 159
 160	// If there are any content matches, we only return these and skip filename matches.
 161	if len(contentMatches) > 0 {
 162		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
 163		return p.fillContentMatches(contentMatches, numContextLines, language, debug)
 164	}
 165
 166	// Otherwise, we return a single line containing the filematch match.
 167	bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug)
 168	res := LineMatch{
 169		Line:       p.id.fileName(p.idx),
 170		FileName:   true,
 171		Score:      bestMatch.score,
 172		DebugScore: bestMatch.debugScore,
 173	}
 174
 175	for _, m := range ms {
 176		res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 177			LineOffset:  int(m.byteOffset),
 178			MatchLength: int(m.byteMatchSz),
 179			Offset:      m.byteOffset,
 180		})
 181	}
 182
 183	return []LineMatch{res}
 184
 185}
 186
 187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
 188// It only ever returns content XOR filename matches, not both. If there are any content
 189// matches, these are always returned, and we omit filename matches.
 190//
 191// Performance invariant: ms is sorted and non-overlapping.
 192//
 193// Note: the byte slices may be backed by mmapped data, so before being
 194// returned by the API it needs to be copied.
 195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 196	var filenameMatches []*candidateMatch
 197	contentMatches := make([]*candidateMatch, 0, len(ms))
 198
 199	for _, m := range ms {
 200		if m.fileName {
 201			filenameMatches = append(filenameMatches, m)
 202		} else {
 203			contentMatches = append(contentMatches, m)
 204		}
 205	}
 206
 207	// If there are any content matches, we only return these and skip filename matches.
 208	if len(contentMatches) > 0 {
 209		return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
 210	}
 211
 212	// Otherwise, we return a single chunk representing the filename match.
 213	bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug)
 214	fileName := p.id.fileName(p.idx)
 215	ranges := make([]Range, 0, len(ms))
 216	for _, m := range ms {
 217		ranges = append(ranges, Range{
 218			Start: Location{
 219				ByteOffset: m.byteOffset,
 220				LineNumber: 1,
 221				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 222			},
 223			End: Location{
 224				ByteOffset: m.byteOffset + m.byteMatchSz,
 225				LineNumber: 1,
 226				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 227			},
 228		})
 229	}
 230
 231	return []ChunkMatch{{
 232		Content:      fileName,
 233		ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 234		Ranges:       ranges,
 235		FileName:     true,
 236		Score:        bestMatch.score,
 237		DebugScore:   bestMatch.debugScore,
 238	}}
 239}
 240
 241func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 242	var result []LineMatch
 243	for len(ms) > 0 {
 244		m := ms[0]
 245		num := p.newlines().atOffset(m.byteOffset)
 246		lineStart := int(p.newlines().lineStart(num))
 247		nextLineStart := int(p.newlines().lineStart(num + 1))
 248
 249		var lineCands []*candidateMatch
 250
 251		endMatch := m.byteOffset + m.byteMatchSz
 252
 253		for len(ms) > 0 {
 254			m := ms[0]
 255			if int(m.byteOffset) < nextLineStart {
 256				endMatch = m.byteOffset + m.byteMatchSz
 257				lineCands = append(lineCands, m)
 258				ms = ms[1:]
 259			} else {
 260				break
 261			}
 262		}
 263
 264		if len(lineCands) == 0 {
 265			log.Panicf(
 266				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 267				p.id.fileName(p.idx), p.id.metaData,
 268				num, lineStart, nextLineStart,
 269				m.byteOffset)
 270		}
 271
 272		data := p.data(false)
 273
 274		// Due to merging matches, we may have a match that
 275		// crosses a line boundary. Prevent confusion by
 276		// taking lines until we pass the last match
 277		for nextLineStart < len(data) && endMatch > uint32(nextLineStart) {
 278			next := bytes.IndexByte(data[nextLineStart:], '\n')
 279			if next == -1 {
 280				nextLineStart = len(data)
 281			} else {
 282				// TODO(hanwen): test that checks "+1" part here.
 283				nextLineStart += next + 1
 284			}
 285		}
 286
 287		finalMatch := LineMatch{
 288			LineStart:  lineStart,
 289			LineEnd:    nextLineStart,
 290			LineNumber: num,
 291		}
 292		finalMatch.Line = data[lineStart:nextLineStart]
 293
 294		if numContextLines > 0 {
 295			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 296			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 297		}
 298
 299		bestMatch, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
 300		finalMatch.Score = bestMatch.score
 301		finalMatch.DebugScore = bestMatch.debugScore
 302
 303		for i, m := range lineCands {
 304			fragment := LineFragmentMatch{
 305				Offset:      m.byteOffset,
 306				LineOffset:  int(m.byteOffset) - lineStart,
 307				MatchLength: int(m.byteMatchSz),
 308			}
 309			if i < len(symbolInfo) && symbolInfo[i] != nil {
 310				fragment.SymbolInfo = symbolInfo[i]
 311			}
 312
 313			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 314		}
 315		result = append(result, finalMatch)
 316	}
 317	return result
 318}
 319
 320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 321	newlines := p.newlines()
 322	data := p.data(false)
 323
 324	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
 325	// However, it depends on ms being sorted by byteOffset and non-overlapping.
 326	// This invariant is true at the time of writing, but we conservatively
 327	// enforce this. Note: chunkCandidates preserves the sorting so safe to
 328	// transform now.
 329	columnHelper := columnHelper{data: data}
 330	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
 331		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
 332		sort.Sort((sortByOffsetSlice)(ms))
 333	}
 334
 335	chunks := chunkCandidates(ms, newlines, numContextLines)
 336	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 337	for _, chunk := range chunks {
 338		bestMatch, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
 339
 340		ranges := make([]Range, 0, len(chunk.candidates))
 341		for _, cm := range chunk.candidates {
 342			startOffset := cm.byteOffset
 343			endOffset := cm.byteOffset + cm.byteMatchSz
 344			startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
 345
 346			ranges = append(ranges, Range{
 347				Start: Location{
 348					ByteOffset: startOffset,
 349					LineNumber: uint32(startLine),
 350					Column:     columnHelper.get(int(newlines.lineStart(startLine)), startOffset),
 351				},
 352				End: Location{
 353					ByteOffset: endOffset,
 354					LineNumber: uint32(endLine),
 355					Column:     columnHelper.get(int(newlines.lineStart(endLine)), endOffset),
 356				},
 357			})
 358		}
 359
 360		firstLineNumber := int(chunk.firstLine) - numContextLines
 361		if firstLineNumber < 1 {
 362			firstLineNumber = 1
 363		}
 364		firstLineStart := newlines.lineStart(firstLineNumber)
 365
 366		bestLineMatch := 0
 367		if bestMatch.match != nil {
 368			bestLineMatch = newlines.atOffset(bestMatch.match.byteOffset)
 369			if debug {
 370				bestMatch.debugScore = fmt.Sprintf("%s, (line: %d)", bestMatch.debugScore, bestLineMatch)
 371			}
 372		}
 373
 374		chunkMatches = append(chunkMatches, ChunkMatch{
 375			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 376			ContentStart: Location{
 377				ByteOffset: firstLineStart,
 378				LineNumber: uint32(firstLineNumber),
 379				Column:     1,
 380			},
 381			FileName:      false,
 382			Ranges:        ranges,
 383			SymbolInfo:    symbolInfo,
 384			BestLineMatch: uint32(bestLineMatch),
 385			Score:         bestMatch.score,
 386			DebugScore:    bestMatch.debugScore,
 387		})
 388	}
 389	return chunkMatches
 390}
 391
 392type candidateChunk struct {
 393	candidates []*candidateMatch
 394	firstLine  uint32 // 1-based, inclusive
 395	lastLine   uint32 // 1-based, inclusive
 396	minOffset  uint32 // 0-based, inclusive
 397	maxOffset  uint32 // 0-based, exclusive
 398}
 399
 400// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 401// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 402// it to overlap with an adjacent chunk.
 403//
 404// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
 405// output invariants: if you flatten candidates the input invariant is retained.
 406func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 407	var chunks []candidateChunk
 408	for _, m := range ms {
 409		startOffset := m.byteOffset
 410		endOffset := m.byteOffset + m.byteMatchSz
 411		firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset)
 412
 413		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 414			// If a new chunk created with the current candidateMatch would
 415			// overlap with the previous chunk, instead add the candidateMatch
 416			// to the last chunk and extend end of the last chunk.
 417			last := &chunks[len(chunks)-1]
 418			last.candidates = append(last.candidates, m)
 419			if last.maxOffset < endOffset {
 420				last.lastLine = uint32(lastLine)
 421				last.maxOffset = uint32(endOffset)
 422			}
 423		} else {
 424			chunks = append(chunks, candidateChunk{
 425				firstLine:  uint32(firstLine),
 426				lastLine:   uint32(lastLine),
 427				minOffset:  startOffset,
 428				maxOffset:  endOffset,
 429				candidates: []*candidateMatch{m},
 430			})
 431		}
 432	}
 433	return chunks
 434}
 435
 436// columnHelper is a helper struct which caches the number of runes last
 437// counted. If we naively use utf8.RuneCount for each match on a line, this
 438// leads to an O(nm) algorithm where m is the number of matches and n is the
 439// length of the line. Aassuming we our candidates are increasing in offset
 440// makes this operation O(n) instead.
 441type columnHelper struct {
 442	data []byte
 443
 444	// 0 values for all these are valid values
 445	lastLineOffset int
 446	lastOffset     uint32
 447	lastRuneCount  uint32
 448}
 449
 450// get returns the line column for offset. offset is the byte offset of the
 451// rune in data. lineOffset is the byte offset inside of data for the line
 452// containing offset.
 453func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
 454	var runeCount uint32
 455
 456	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
 457		// Can count from last calculation
 458		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
 459	} else {
 460		// Need to count from the beginning of line
 461		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
 462	}
 463
 464	c.lastLineOffset = lineOffset
 465	c.lastOffset = offset
 466	c.lastRuneCount = runeCount
 467
 468	return runeCount + 1
 469}
 470
 471type newlines struct {
 472	// locs is the sorted set of byte offsets of the newlines in the file
 473	locs []uint32
 474
 475	// fileSize is just the number of bytes in the file. It is stored
 476	// on this struct so we can safely know the length of the last line
 477	// in the file since not all files end in a newline.
 478	fileSize uint32
 479}
 480
 481// atOffset returns the line containing the offset. If the offset lands on
 482// the newline ending line M, we return M.
 483func (nls newlines) atOffset(offset uint32) (lineNumber int) {
 484	idx := sort.Search(len(nls.locs), func(n int) bool {
 485		return nls.locs[n] >= offset
 486	})
 487	return idx + 1
 488}
 489
 490// lineStart returns the byte offset of the beginning of the given line.
 491// lineNumber is 1-based. If lineNumber is out of range of the lines in the
 492// file, the return value will be clamped to [0,fileSize].
 493func (nls newlines) lineStart(lineNumber int) uint32 {
 494	// nls.locs[0] + 1 is the start of the 2nd line of data.
 495	startIdx := lineNumber - 2
 496
 497	if startIdx < 0 {
 498		return 0
 499	} else if startIdx >= len(nls.locs) {
 500		return nls.fileSize
 501	} else {
 502		return nls.locs[startIdx] + 1
 503	}
 504}
 505
 506// offsetRangeToLineRange returns range of lines that fully contains the given byte range.
 507// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset).
 508// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine].
 509func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) {
 510	startLine = nls.atOffset(startOffset)
 511	endLine = nls.atOffset(
 512		max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow
 513	)
 514	return startLine, endLine
 515}
 516
 517// getLines returns a slice of data containing the lines [low, high).
 518// low is 1-based and inclusive. high is 1-based and exclusive.
 519func (nls newlines) getLines(data []byte, low, high int) []byte {
 520	if low >= high {
 521		return nil
 522	}
 523
 524	return data[nls.lineStart(low):nls.lineStart(high)]
 525}
 526
 527const (
 528	// Query-dependent scoring signals. All of these together are bounded at ~9000
 529	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 530	scorePartialWordMatch = 50.0
 531	scoreWordMatch        = 500.0
 532	scoreBase             = 7000.0
 533	scorePartialBase      = 4000.0
 534	scoreSymbol           = 7000.0
 535	scorePartialSymbol    = 4000.0
 536	scoreKindMatch        = 100.0
 537	scoreFactorAtomMatch  = 400.0
 538
 539	// File-only scoring signals. For now these are also bounded ~9000 to give them
 540	// equal weight with the query-dependent signals.
 541	scoreFileRankFactor = 9000.0
 542
 543	// Used for ordering line and chunk matches within a file.
 544	scoreLineOrderFactor = 1.0
 545
 546	// Used for tiebreakers. The scores are not combined with the main score, but
 547	// are used to break ties between matches with the same score. The factors are
 548	// chosen to separate the tiebreakers from the main score and from each other.
 549	// If you make changes here, make sure to update indexData.scoreFile too.
 550	scoreRepoRankFactor  = 100.0
 551	scoreFileOrderFactor = 10.0
 552)
 553
 554// findMaxOverlappingSection returns the index of the section in secs that
 555// overlaps the most with the area defined by off and sz, relative to the size
 556// of the section. If no section overlaps, it returns 0, false. If multiple
 557// sections overlap the same amount, the first one is returned.
 558//
 559// The implementation assumes that sections do not overlap and are sorted by
 560// DocumentSection.Start.
 561func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
 562	start := off
 563	end := off + sz
 564
 565	// Find the first section that might overlap
 566	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
 567
 568	if j == len(secs) || secs[j].Start >= end {
 569		// No overlap.
 570		return 0, false
 571	}
 572
 573	relOverlap := func(j int) float64 {
 574		secSize := secs[j].End - secs[j].Start
 575		if secSize == 0 {
 576			return 0
 577		}
 578		// This cannot overflow because we make sure there is overlap before calling relOverlap
 579		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
 580		return float64(overlap) / float64(secSize)
 581	}
 582
 583	ol1 := relOverlap(j)
 584	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
 585		return uint32(j), ol1 > 0
 586	}
 587
 588	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
 589	// if the second section overlaps more than the first one, because a third
 590	// section can only overlap if the overlap with the second section is complete.
 591	ol2 := relOverlap(j + 1)
 592	if ol2 > ol1 {
 593		return uint32(j + 1), ol2 > 0
 594	}
 595
 596	return uint32(j), ol1 > 0
 597}
 598
 599func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool {
 600	if cm.fileName {
 601		return false
 602	}
 603
 604	// Check if this candidate came from a symbol matchTree
 605	if cm.symbol {
 606		return true
 607	}
 608
 609	// Check if it overlaps with a symbol.
 610	secs := p.docSections()
 611	_, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
 612	return ok
 613}
 614
 615func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
 616	if cm.fileName {
 617		return DocumentSection{}, nil, false
 618	}
 619
 620	secs := p.docSections()
 621
 622	secIdx, ok := cm.symbolIdx, cm.symbol
 623	if !ok {
 624		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
 625		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
 626	}
 627	if !ok {
 628		return DocumentSection{}, nil, false
 629	}
 630
 631	sec := secs[secIdx]
 632
 633	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
 634	// since some callsites do not need it stored, and that incurs an extra
 635	// copy.
 636	//
 637	// 2024-01-08 we are refactoring this and the code path indicates this can
 638	// fail, so callers need to handle nil symbol. However, it would be
 639	// surprising that we have a matching section but not symbol data.
 640	start := p.id.fileEndSymbol[p.idx]
 641	si := p.id.symbols.data(start + secIdx)
 642
 643	return sec, si, true
 644}
 645
 646// calculateTermFrequency computes the term frequency for the file match.
 647// Notes:
 648// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
 649// * Symbol matches also count more than content matches, to reward matches on symbol definitions.
 650func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
 651	// Treat each candidate match as a term and compute the frequencies. For now, ignore case
 652	// sensitivity and treat filenames and symbols the same as content.
 653	termFreqs := map[string]int{}
 654	for _, m := range cands {
 655		term := string(m.substrLowered)
 656		if m.fileName || p.matchesSymbol(m) {
 657			termFreqs[term] += 5
 658		} else {
 659			termFreqs[term]++
 660		}
 661	}
 662
 663	for term := range termFreqs {
 664		df[term] += 1
 665	}
 666	return termFreqs
 667}
 668
 669// scoredMatch holds the score information for a candidate match.
 670type scoredMatch struct {
 671	score      float64
 672	debugScore string
 673	match      *candidateMatch
 674}
 675
 676// candidateMatchScore scores all candidate matches and returns the best-scoring match plus its score information.
 677// Invariant: there should be at least one input candidate, len(ms) > 0.
 678func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (scoredMatch, []*Symbol) {
 679	score := 0.0
 680	what := ""
 681
 682	addScore := func(w string, s float64) {
 683		if s != 0 && debug {
 684			what += fmt.Sprintf("%s:%.2f, ", w, s)
 685		}
 686		score += s
 687	}
 688
 689	filename := p.data(true)
 690	var symbolInfo []*Symbol
 691
 692	var bestMatch scoredMatch
 693	for i, m := range ms {
 694		data := p.data(m.fileName)
 695
 696		endOffset := m.byteOffset + m.byteMatchSz
 697		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
 698		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
 699
 700		score = 0
 701		what = ""
 702
 703		if startBoundary && endBoundary {
 704			addScore("WordMatch", scoreWordMatch)
 705		} else if startBoundary || endBoundary {
 706			addScore("PartialWordMatch", scorePartialWordMatch)
 707		}
 708
 709		if m.fileName {
 710			sep := bytes.LastIndexByte(data, '/')
 711			startMatch := int(m.byteOffset) == sep+1
 712			endMatch := endOffset == uint32(len(data))
 713			if startMatch && endMatch {
 714				addScore("Base", scoreBase)
 715			} else if startMatch || endMatch {
 716				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 717			} else if sep < int(m.byteOffset) {
 718				addScore("InnerBase", scorePartialBase)
 719			}
 720		} else if sec, si, ok := p.findSymbol(m); ok {
 721			startMatch := sec.Start == m.byteOffset
 722			endMatch := sec.End == endOffset
 723			if startMatch && endMatch {
 724				addScore("Symbol", scoreSymbol)
 725			} else if startMatch || endMatch {
 726				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 727			} else {
 728				addScore("OverlapSymbol", scorePartialSymbol)
 729			}
 730
 731			// Score based on symbol data
 732			if si != nil {
 733				symbolKind := ctags.ParseSymbolKind(si.Kind)
 734				sym := sectionSlice(data, sec)
 735
 736				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 737
 738				// This is from a symbol tree, so we need to store the symbol
 739				// information.
 740				if m.symbol {
 741					if symbolInfo == nil {
 742						symbolInfo = make([]*Symbol, len(ms))
 743					}
 744					// findSymbols does not hydrate in Sym. So we need to store it.
 745					si.Sym = string(sym)
 746					symbolInfo[i] = si
 747				}
 748			}
 749		}
 750
 751		// scoreWeight != 1 means it affects score
 752		if !epsilonEqualsOne(m.scoreWeight) {
 753			score = score * m.scoreWeight
 754			if debug {
 755				what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
 756			}
 757		}
 758
 759		if score > bestMatch.score {
 760			bestMatch.score = score
 761			bestMatch.debugScore = what
 762			bestMatch.match = m
 763		}
 764	}
 765
 766	if debug {
 767		bestMatch.debugScore = fmt.Sprintf("score:%.2f <- %s", bestMatch.score, strings.TrimSuffix(bestMatch.debugScore, ", "))
 768	}
 769
 770	return bestMatch, symbolInfo
 771}
 772
 773// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 774// End such that it won't be out of range.
 775func sectionSlice(data []byte, sec DocumentSection) []byte {
 776	l := uint32(len(data))
 777	if sec.Start >= l {
 778		return nil
 779	}
 780	if sec.End > l {
 781		sec.End = l
 782	}
 783	return data[sec.Start:sec.End]
 784}
 785
 786// scoreSymbolKind boosts a match based on the combination of language, symbol
 787// and kind. The language string comes from go-enry, the symbol and kind from
 788// ctags.
 789func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
 790	var factor float64
 791
 792	// Generic ranking which will be overriden by language specific ranking
 793	switch kind {
 794	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 795		factor = 8
 796	case ctags.Class:
 797		factor = 10
 798	case ctags.Struct:
 799		factor = 9.5
 800	case ctags.Enum:
 801		factor = 9
 802	case ctags.Interface:
 803		factor = 8
 804	case ctags.Function, ctags.Method:
 805		factor = 7
 806	case ctags.Field:
 807		factor = 5.5
 808	case ctags.Constant:
 809		factor = 5
 810	case ctags.Variable:
 811		factor = 4
 812	default:
 813		// For all other kinds, assign a low score by default.
 814		factor = 1
 815	}
 816
 817	switch language {
 818	case "Java", "java":
 819		switch kind {
 820		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 821		// to "classes" instead of "c". We have to cover both cases to support existing
 822		// indexes.
 823		case ctags.Class:
 824			factor = 10
 825		case ctags.Enum:
 826			factor = 9
 827		case ctags.Interface:
 828			factor = 8
 829		case ctags.Method:
 830			factor = 7
 831		case ctags.Field:
 832			factor = 6
 833		case ctags.EnumConstant:
 834			factor = 5
 835		}
 836	case "Kotlin", "kotlin":
 837		switch kind {
 838		case ctags.Class:
 839			factor = 10
 840		case ctags.Interface:
 841			factor = 9
 842		case ctags.Method:
 843			factor = 8
 844		case ctags.TypeAlias:
 845			factor = 7
 846		case ctags.Constant:
 847			factor = 6
 848		case ctags.Variable:
 849			factor = 5
 850		}
 851	case "Go", "go":
 852		switch kind {
 853		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 854		// for each case a description of the fields in ctags in the comment
 855		case ctags.Type: // interface struct talias
 856			factor = 9
 857		case ctags.Interface: // interfaces
 858			factor = 10
 859		case ctags.Struct: // structs
 860			factor = 9
 861		case ctags.TypeAlias: // type aliases
 862			factor = 9
 863		case ctags.MethodSpec: // interface method specification
 864			factor = 8.5
 865		case ctags.Method, ctags.Function: // functions
 866			factor = 8
 867		case ctags.Field: // struct fields
 868			factor = 7
 869		case ctags.Constant: // constants
 870			factor = 6
 871		case ctags.Variable: // variables
 872			factor = 5
 873		}
 874
 875		// Boost exported go symbols. Same implementation as token.IsExported
 876		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 877			factor += 0.5
 878		}
 879
 880		if bytes.HasSuffix(filename, []byte("_test.go")) {
 881			factor *= 0.8
 882		}
 883
 884		// Could also rank on:
 885		//
 886		//   - anonMember  struct anonymous members
 887		//   - packageName name for specifying imported package
 888		//   - receiver    receivers
 889		//   - package     packages
 890		//   - type        types
 891		//   - unknown     unknown
 892	case "C++", "c++":
 893		switch kind {
 894		case ctags.Class: // classes
 895			factor = 10
 896		case ctags.Enum: // enumeration names
 897			factor = 9
 898		case ctags.Function: // function definitions
 899			factor = 8
 900		case ctags.Struct: // structure names
 901			factor = 7
 902		case ctags.Union: // union names
 903			factor = 6
 904		case ctags.TypeAlias: // typedefs
 905			factor = 5
 906		case ctags.Field: // class, struct, and union members
 907			factor = 4
 908		case ctags.Variable: // varialbe definitions
 909			factor = 3
 910		}
 911	// Could also rank on:
 912	// NAME        DESCRIPTION
 913	// macro       macro definitions
 914	// enumerator  enumerators (values inside an enumeration)
 915	// header      included header files
 916	// namespace   namespaces
 917	// variable    variable definitions
 918	case "Scala", "scala":
 919		switch kind {
 920		case ctags.Class:
 921			factor = 10
 922		case ctags.Interface:
 923			factor = 9
 924		case ctags.Object:
 925			factor = 8
 926		case ctags.Function:
 927			factor = 7
 928		case ctags.Type:
 929			factor = 6
 930		case ctags.Variable:
 931			factor = 5
 932		case ctags.Package:
 933			factor = 4
 934		}
 935	case "Python", "python":
 936		switch kind {
 937		case ctags.Class: // classes
 938			factor = 10
 939		case ctags.Function, ctags.Method: // function definitions
 940			factor = 8
 941		case ctags.Field: // class, struct, and union members
 942			factor = 4
 943		case ctags.Variable: // variable definitions
 944			factor = 3
 945		case ctags.Local: // local variables
 946			factor = 2
 947		}
 948		// Could also rank on:
 949		//
 950		//   - namespace name referring a module defined in other file
 951		//   - module    modules
 952		//   - unknown   name referring a class/variable/function/module defined in other module
 953		//   - parameter function parameters
 954	case "Ruby", "ruby":
 955		switch kind {
 956		case ctags.Class:
 957			factor = 10
 958		case ctags.Method:
 959			factor = 9
 960		case ctags.MethodAlias:
 961			factor = 8
 962		case ctags.Module:
 963			factor = 7
 964		case ctags.SingletonMethod:
 965			factor = 6
 966		case ctags.Constant:
 967			factor = 5
 968		case ctags.Accessor:
 969			factor = 4
 970		case ctags.Library:
 971			factor = 3
 972		}
 973	case "PHP", "php":
 974		switch kind {
 975		case ctags.Class:
 976			factor = 10
 977		case ctags.Interface:
 978			factor = 9
 979		case ctags.Function:
 980			factor = 8
 981		case ctags.Trait:
 982			factor = 7
 983		case ctags.Define:
 984			factor = 6
 985		case ctags.Namespace:
 986			factor = 5
 987		case ctags.MethodAlias:
 988			factor = 4
 989		case ctags.Variable:
 990			factor = 3
 991		case ctags.Local:
 992			factor = 3
 993		}
 994	case "GraphQL", "graphql":
 995		switch kind {
 996		case ctags.Type:
 997			factor = 10
 998		}
 999	case "Markdown", "markdown":
1000		// Headers are good signal in docs, but do not rank as highly as code.
1001		switch kind {
1002		case ctags.Chapter: // #
1003			factor = 4
1004		case ctags.Section: // ##
1005			factor = 3
1006		case ctags.Subsection: // ###
1007			factor = 2
1008		}
1009	}
1010
1011	return factor * scoreKindMatch
1012}
1013
1014type matchScoreSlice []LineMatch
1015
1016func (m matchScoreSlice) Len() int           { return len(m) }
1017func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1018func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1019
1020type chunkMatchScoreSlice []ChunkMatch
1021
1022func (m chunkMatchScoreSlice) Len() int           { return len(m) }
1023func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1024func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
1025
1026type fileMatchesByScore []FileMatch
1027
1028func (m fileMatchesByScore) Len() int           { return len(m) }
1029func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
1030func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
1031
1032func sortMatchesByScore(ms []LineMatch) {
1033	sort.Sort(matchScoreSlice(ms))
1034}
1035
1036func sortChunkMatchesByScore(ms []ChunkMatch) {
1037	sort.Sort(chunkMatchScoreSlice(ms))
1038}
1039
1040// SortFiles sorts files matches in the order we want to present results to
1041// users. The order depends on the match score, which includes both
1042// query-dependent signals like word overlap, and file-only signals like the
1043// file ranks (if file ranks are enabled).
1044//
1045// We don't only use the scores, we will also boost some results to present
1046// files with novel extensions.
1047func SortFiles(ms []FileMatch) {
1048	sort.Sort(fileMatchesByScore(ms))
1049
1050	// Boost a file extension not in the top 3 to the third filematch.
1051	boostNovelExtension(ms, 2, 0.9)
1052}
1053
1054func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1055	if len(ms) <= boostOffset+1 {
1056		return
1057	}
1058
1059	top := ms[:boostOffset]
1060	candidates := ms[boostOffset:]
1061
1062	// Don't bother boosting something which is significantly different to the
1063	// result it replaces.
1064	minScoreForNovelty := candidates[0].Score * minScoreRatio
1065
1066	// We want to look for an ext that isn't in the top exts
1067	exts := make([]string, len(top))
1068	for i := range top {
1069		exts[i] = path.Ext(top[i].FileName)
1070	}
1071
1072	for i := range candidates {
1073		// Do not assume sorted due to boostNovelExtension being called on subsets
1074		if candidates[i].Score < minScoreForNovelty {
1075			continue
1076		}
1077
1078		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1079			continue
1080		}
1081
1082		// Found what we are looking for, now boost to front of candidates (which
1083		// is ms[boostOffset])
1084		for ; i > 0; i-- {
1085			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1086		}
1087		return
1088	}
1089}
Configure Feed

Configure Feed