contentprovider.go at 06de7adb722e71f0fd38f4bcea416b66cea759ec · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 06de7adb722e71f0fd38f4bcea416b66cea759ec 30 kB View raw
Stefan Hengl ranking: allow partial overlap with symbol (#742) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"path"
  22	"slices"
  23	"sort"
  24	"strings"
  25	"unicode"
  26	"unicode/utf8"
  27
  28	"github.com/sourcegraph/zoekt/ctags"
  29)
  30
  31var _ = log.Println
  32
  33// contentProvider is an abstraction to treat matches for names and
  34// content with the same code.
  35type contentProvider struct {
  36	id    *indexData
  37	stats *Stats
  38
  39	// mutable
  40	err      error
  41	idx      uint32
  42	_data    []byte
  43	_nl      []uint32
  44	_nlBuf   []uint32
  45	_sects   []DocumentSection
  46	_sectBuf []DocumentSection
  47	fileSize uint32
  48}
  49
  50// setDocument skips to the given document.
  51func (p *contentProvider) setDocument(docID uint32) {
  52	fileStart := p.id.boundaries[docID]
  53
  54	p.idx = docID
  55	p.fileSize = p.id.boundaries[docID+1] - fileStart
  56
  57	p._nl = nil
  58	p._sects = nil
  59	p._data = nil
  60}
  61
  62func (p *contentProvider) docSections() []DocumentSection {
  63	if p._sects == nil {
  64		var sz uint32
  65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  66		p.stats.ContentBytesLoaded += int64(sz)
  67		p._sectBuf = p._sects
  68	}
  69	return p._sects
  70}
  71
  72func (p *contentProvider) newlines() newlines {
  73	if p._nl == nil {
  74		var sz uint32
  75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  76		p._nlBuf = p._nl
  77		p.stats.ContentBytesLoaded += int64(sz)
  78	}
  79	return newlines{locs: p._nl, fileSize: p.fileSize}
  80}
  81
  82func (p *contentProvider) data(fileName bool) []byte {
  83	if fileName {
  84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  85	}
  86
  87	if p._data == nil {
  88		p._data, p.err = p.id.readContents(p.idx)
  89		p.stats.FilesLoaded++
  90		p.stats.ContentBytesLoaded += int64(len(p._data))
  91	}
  92	return p._data
  93}
  94
  95// Find offset in bytes (relative to corpus start) for an offset in
  96// runes (relative to document start). If filename is set, the corpus
  97// is the set of filenames, with the document being the name itself.
  98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
  99	if p.id.metaData.PlainASCII {
 100		return r
 101	}
 102
 103	sample := p.id.runeOffsets
 104	runeEnds := p.id.fileEndRunes
 105	fileStartByte := p.id.boundaries[p.idx]
 106	if filename {
 107		sample = p.id.fileNameRuneOffsets
 108		runeEnds = p.id.fileNameEndRunes
 109		fileStartByte = p.id.fileNameIndex[p.idx]
 110	}
 111
 112	absR := r
 113	if p.idx > 0 {
 114		absR += runeEnds[p.idx-1]
 115	}
 116
 117	byteOff, left := sample.lookup(absR)
 118
 119	var data []byte
 120
 121	if filename {
 122		data = p.id.fileNameContent[byteOff:]
 123	} else {
 124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 125		if p.err != nil {
 126			return 0
 127		}
 128	}
 129	for left > 0 {
 130		_, sz := utf8.DecodeRune(data)
 131		byteOff += uint32(sz)
 132		data = data[sz:]
 133		left--
 134	}
 135
 136	byteOff -= fileStartByte
 137	return byteOff
 138}
 139
 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 141	var result []LineMatch
 142	if ms[0].fileName {
 143		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
 144
 145		// There is only "line" in a filename.
 146		res := LineMatch{
 147			Line:     p.id.fileName(p.idx),
 148			FileName: true,
 149
 150			Score:      score,
 151			DebugScore: debugScore,
 152		}
 153
 154		for _, m := range ms {
 155			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 156				LineOffset:  int(m.byteOffset),
 157				MatchLength: int(m.byteMatchSz),
 158				Offset:      m.byteOffset,
 159			})
 160
 161			result = []LineMatch{res}
 162		}
 163	} else {
 164		ms = breakMatchesOnNewlines(ms, p.data(false))
 165		result = p.fillContentMatches(ms, numContextLines, language, debug)
 166	}
 167
 168	return result
 169}
 170
 171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
 172//
 173// Performance invariant: ms is sorted and non-overlapping.
 174//
 175// Note: the byte slices may be backed by mmapped data, so before being
 176// returned by the API it needs to be copied.
 177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 178	var result []ChunkMatch
 179	if ms[0].fileName {
 180		// If the first match is a filename match, there will only be
 181		// one match and the matched content will be the filename.
 182
 183		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
 184
 185		fileName := p.id.fileName(p.idx)
 186		ranges := make([]Range, 0, len(ms))
 187		for _, m := range ms {
 188			ranges = append(ranges, Range{
 189				Start: Location{
 190					ByteOffset: m.byteOffset,
 191					LineNumber: 1,
 192					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 193				},
 194				End: Location{
 195					ByteOffset: m.byteOffset + m.byteMatchSz,
 196					LineNumber: 1,
 197					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 198				},
 199			})
 200		}
 201
 202		result = []ChunkMatch{{
 203			Content:      fileName,
 204			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 205			Ranges:       ranges,
 206			FileName:     true,
 207
 208			Score:      score,
 209			DebugScore: debugScore,
 210		}}
 211	} else {
 212		result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
 213	}
 214
 215	return result
 216}
 217
 218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 219	var result []LineMatch
 220	for len(ms) > 0 {
 221		m := ms[0]
 222		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
 223
 224		var lineCands []*candidateMatch
 225
 226		endMatch := m.byteOffset + m.byteMatchSz
 227
 228		for len(ms) > 0 {
 229			m := ms[0]
 230			if int(m.byteOffset) <= lineEnd {
 231				endMatch = m.byteOffset + m.byteMatchSz
 232				lineCands = append(lineCands, m)
 233				ms = ms[1:]
 234			} else {
 235				break
 236			}
 237		}
 238
 239		if len(lineCands) == 0 {
 240			log.Panicf(
 241				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 242				p.id.fileName(p.idx), p.id.metaData,
 243				num, lineStart, lineEnd,
 244				m.byteOffset)
 245		}
 246
 247		data := p.data(false)
 248
 249		// Due to merging matches, we may have a match that
 250		// crosses a line boundary. Prevent confusion by
 251		// taking lines until we pass the last match
 252		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
 253			next := bytes.IndexByte(data[lineEnd+1:], '\n')
 254			if next == -1 {
 255				lineEnd = len(data)
 256			} else {
 257				// TODO(hanwen): test that checks "+1" part here.
 258				lineEnd += next + 1
 259			}
 260		}
 261
 262		finalMatch := LineMatch{
 263			LineStart:  lineStart,
 264			LineEnd:    lineEnd,
 265			LineNumber: num,
 266		}
 267		finalMatch.Line = data[lineStart:lineEnd]
 268
 269		if numContextLines > 0 {
 270			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 271			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 272		}
 273
 274		score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
 275		finalMatch.Score = score
 276		finalMatch.DebugScore = debugScore
 277
 278		for i, m := range lineCands {
 279			fragment := LineFragmentMatch{
 280				Offset:      m.byteOffset,
 281				LineOffset:  int(m.byteOffset) - lineStart,
 282				MatchLength: int(m.byteMatchSz),
 283			}
 284			if i < len(symbolInfo) && symbolInfo[i] != nil {
 285				fragment.SymbolInfo = symbolInfo[i]
 286			}
 287
 288			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 289		}
 290		result = append(result, finalMatch)
 291	}
 292	return result
 293}
 294
 295func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 296	newlines := p.newlines()
 297	data := p.data(false)
 298
 299	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
 300	// However, it depends on ms being sorted by byteOffset and non-overlapping.
 301	// This invariant is true at the time of writing, but we conservatively
 302	// enforce this. Note: chunkCandidates preserves the sorting so safe to
 303	// transform now.
 304	columnHelper := columnHelper{data: data}
 305	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
 306		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
 307		sort.Sort((sortByOffsetSlice)(ms))
 308	}
 309
 310	chunks := chunkCandidates(ms, newlines, numContextLines)
 311	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 312	for _, chunk := range chunks {
 313		score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)
 314
 315		ranges := make([]Range, 0, len(chunk.candidates))
 316		for _, cm := range chunk.candidates {
 317			startOffset := cm.byteOffset
 318			endOffset := cm.byteOffset + cm.byteMatchSz
 319			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
 320			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
 321
 322			ranges = append(ranges, Range{
 323				Start: Location{
 324					ByteOffset: startOffset,
 325					LineNumber: uint32(startLine),
 326					Column:     columnHelper.get(startLineOffset, startOffset),
 327				},
 328				End: Location{
 329					ByteOffset: endOffset,
 330					LineNumber: uint32(endLine),
 331					Column:     columnHelper.get(endLineOffset, endOffset),
 332				},
 333			})
 334		}
 335
 336		firstLineNumber := int(chunk.firstLine) - numContextLines
 337		if firstLineNumber < 1 {
 338			firstLineNumber = 1
 339		}
 340		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
 341
 342		chunkMatches = append(chunkMatches, ChunkMatch{
 343			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 344			ContentStart: Location{
 345				ByteOffset: firstLineStart,
 346				LineNumber: uint32(firstLineNumber),
 347				Column:     1,
 348			},
 349			FileName:   false,
 350			Ranges:     ranges,
 351			SymbolInfo: symbolInfo,
 352			Score:      score,
 353			DebugScore: debugScore,
 354		})
 355	}
 356	return chunkMatches
 357}
 358
 359type candidateChunk struct {
 360	candidates []*candidateMatch
 361	firstLine  uint32 // 1-based, inclusive
 362	lastLine   uint32 // 1-based, inclusive
 363	minOffset  uint32 // 0-based, inclusive
 364	maxOffset  uint32 // 0-based, exclusive
 365}
 366
 367// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 368// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 369// it to overlap with an adjacent chunk.
 370//
 371// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
 372// output invariants: if you flatten candidates the input invariant is retained.
 373func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 374	var chunks []candidateChunk
 375	for _, m := range ms {
 376		startOffset := m.byteOffset
 377		endOffset := m.byteOffset + m.byteMatchSz
 378		firstLine, _, _ := newlines.atOffset(startOffset)
 379		lastLine, _, _ := newlines.atOffset(endOffset)
 380
 381		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 382			// If a new chunk created with the current candidateMatch would
 383			// overlap with the previous chunk, instead add the candidateMatch
 384			// to the last chunk and extend end of the last chunk.
 385			last := &chunks[len(chunks)-1]
 386			last.candidates = append(last.candidates, m)
 387			if last.maxOffset < endOffset {
 388				last.lastLine = uint32(lastLine)
 389				last.maxOffset = uint32(endOffset)
 390			}
 391		} else {
 392			chunks = append(chunks, candidateChunk{
 393				firstLine:  uint32(firstLine),
 394				lastLine:   uint32(lastLine),
 395				minOffset:  startOffset,
 396				maxOffset:  endOffset,
 397				candidates: []*candidateMatch{m},
 398			})
 399		}
 400	}
 401	return chunks
 402}
 403
 404// columnHelper is a helper struct which caches the number of runes last
 405// counted. If we naively use utf8.RuneCount for each match on a line, this
 406// leads to an O(nm) algorithm where m is the number of matches and n is the
 407// length of the line. Aassuming we our candidates are increasing in offset
 408// makes this operation O(n) instead.
 409type columnHelper struct {
 410	data []byte
 411
 412	// 0 values for all these are valid values
 413	lastLineOffset int
 414	lastOffset     uint32
 415	lastRuneCount  uint32
 416}
 417
 418// get returns the line column for offset. offset is the byte offset of the
 419// rune in data. lineOffset is the byte offset inside of data for the line
 420// containing offset.
 421func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
 422	var runeCount uint32
 423
 424	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
 425		// Can count from last calculation
 426		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
 427	} else {
 428		// Need to count from the beginning of line
 429		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
 430	}
 431
 432	c.lastLineOffset = lineOffset
 433	c.lastOffset = offset
 434	c.lastRuneCount = runeCount
 435
 436	return runeCount + 1
 437}
 438
 439type newlines struct {
 440	// locs is the sorted set of byte offsets of the newlines in the file
 441	locs []uint32
 442
 443	// fileSize is just the number of bytes in the file. It is stored
 444	// on this struct so we can safely know the length of the last line
 445	// in the file since not all files end in a newline.
 446	fileSize uint32
 447}
 448
 449// atOffset returns the line containing the offset. If the offset lands on
 450// the newline ending line M, we return M.  The line is characterized
 451// by its linenumber (base-1, byte index of line start, byte index of
 452// line end). The line end is the index of a newline, or the filesize
 453// (if matching the last line of the file.)
 454func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
 455	idx := sort.Search(len(nls.locs), func(n int) bool {
 456		return nls.locs[n] >= offset
 457	})
 458
 459	start, end := nls.lineBounds(idx + 1)
 460	return idx + 1, int(start), int(end)
 461}
 462
 463// lineBounds returns the byte offsets of the start and end of the 1-based
 464// lineNumber. The end offset is exclusive and will not contain the line-ending
 465// newline. If the line number is out of range of the lines in the file, start
 466// and end will be clamped to [0,fileSize].
 467func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
 468	// nls.locs[0] + 1 is the start of the 2nd line of data.
 469	startIdx := lineNumber - 2
 470	endIdx := lineNumber - 1
 471
 472	if startIdx < 0 {
 473		start = 0
 474	} else if startIdx >= len(nls.locs) {
 475		start = nls.fileSize
 476	} else {
 477		start = nls.locs[startIdx] + 1
 478	}
 479
 480	if endIdx < 0 {
 481		end = 0
 482	} else if endIdx >= len(nls.locs) {
 483		end = nls.fileSize
 484	} else {
 485		end = nls.locs[endIdx]
 486	}
 487
 488	return start, end
 489}
 490
 491// getLines returns a slice of data containing the lines [low, high).
 492// low is 1-based and inclusive. high is 1-based and exclusive.
 493func (nls newlines) getLines(data []byte, low, high int) []byte {
 494	if low >= high {
 495		return nil
 496	}
 497
 498	lowStart, _ := nls.lineBounds(low)
 499	_, highEnd := nls.lineBounds(high - 1)
 500
 501	// Drop any trailing newline. Editors do not treat a trailing newline as
 502	// the start of a new line, so we should not either. lineBounds clamps to
 503	// len(data) when an out-of-bounds line is requested.
 504	//
 505	// As an example, if we request lines 1-5 from a file with contents
 506	// `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
 507	// the three "lines" in the file, separated by newlines.
 508	if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
 509		highEnd = highEnd - 1
 510		lowStart = min(lowStart, highEnd)
 511	}
 512
 513	return data[lowStart:highEnd]
 514}
 515
 516const (
 517	// Query-dependent scoring signals. All of these together are bounded at ~9000
 518	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 519	scorePartialWordMatch = 50.0
 520	scoreWordMatch        = 500.0
 521	scoreBase             = 7000.0
 522	scorePartialBase      = 4000.0
 523	scoreSymbol           = 7000.0
 524	scorePartialSymbol    = 4000.0
 525	scoreKindMatch        = 100.0
 526	scoreFactorAtomMatch  = 400.0
 527
 528	// File-only scoring signals. For now these are also bounded ~9000 to give them
 529	// equal weight with the query-dependent signals.
 530	scoreFileRankFactor  = 9000.0
 531	scoreFileOrderFactor = 10.0
 532	scoreRepoRankFactor  = 20.0
 533
 534	// Used for ordering line and chunk matches within a file.
 535	scoreLineOrderFactor = 1.0
 536)
 537
 538// findSection checks whether a section defined by offset and size lies within
 539// one of the sections in secs.
 540func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
 541	j := sort.Search(len(secs), func(i int) bool {
 542		return secs[i].End >= off+sz
 543	})
 544
 545	if j == len(secs) {
 546		return 0, false
 547	}
 548
 549	if secs[j].Start <= off && off+sz <= secs[j].End {
 550		return uint32(j), true
 551	}
 552	return 0, false
 553}
 554
 555// findMaxOverlappingSection returns the index of the section in secs that
 556// overlaps the most with the area defined by off and sz, relative to the size
 557// of the section. If no section overlaps, it returns 0, false. If multiple
 558// sections overlap the same amount, the first one is returned.
 559//
 560// The implementation assumes that sections do not overlap and are sorted by
 561// DocumentSection.Start.
 562func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
 563	start := off
 564	end := off + sz
 565
 566	// Find the first section that might overlap
 567	j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start })
 568
 569	if j == len(secs) || secs[j].Start >= end {
 570		// No overlap.
 571		return 0, false
 572	}
 573
 574	relOverlap := func(j int) float64 {
 575		secSize := secs[j].End - secs[j].Start
 576		if secSize == 0 {
 577			return 0
 578		}
 579		// This cannot overflow because we make sure there is overlap before calling relOverlap
 580		overlap := min(secs[j].End, end) - max(secs[j].Start, start)
 581		return float64(overlap) / float64(secSize)
 582	}
 583
 584	ol1 := relOverlap(j)
 585	if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end {
 586		return uint32(j), ol1 > 0
 587	}
 588
 589	// We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check
 590	// if the second section overlaps more than the first one, because a third
 591	// section can only overlap if the overlap with the second section is complete.
 592	ol2 := relOverlap(j + 1)
 593	if ol2 > ol1 {
 594		return uint32(j + 1), ol2 > 0
 595	}
 596
 597	return uint32(j), ol1 > 0
 598}
 599
 600func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
 601	if cm.fileName {
 602		return DocumentSection{}, nil, false
 603	}
 604
 605	secs := p.docSections()
 606
 607	secIdx, ok := cm.symbolIdx, cm.symbol
 608	if !ok {
 609		// Not from a symbol matchTree. Let's see if it overlaps with a symbol.
 610		secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz)
 611	}
 612	if !ok {
 613		return DocumentSection{}, nil, false
 614	}
 615
 616	sec := secs[secIdx]
 617
 618	// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
 619	// since some callsites do not need it stored, and that incurs an extra
 620	// copy.
 621	//
 622	// 2024-01-08 we are refactoring this and the code path indicates this can
 623	// fail, so callers need to handle nil symbol. However, it would be
 624	// surprising that we have a matching section but not symbol data.
 625	start := p.id.fileEndSymbol[p.idx]
 626	si := p.id.symbols.data(start + secIdx)
 627
 628	return sec, si, true
 629}
 630
 631func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
 632	type debugScore struct {
 633		what  string
 634		score float64
 635	}
 636
 637	score := &debugScore{}
 638	maxScore := &debugScore{}
 639
 640	addScore := func(what string, s float64) {
 641		if s != 0 && debug {
 642			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 643		}
 644		score.score += s
 645	}
 646
 647	filename := p.data(true)
 648	var symbolInfo []*Symbol
 649
 650	for i, m := range ms {
 651		data := p.data(m.fileName)
 652
 653		endOffset := m.byteOffset + m.byteMatchSz
 654		startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
 655		endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))
 656
 657		score.score = 0
 658		score.what = ""
 659
 660		if startBoundary && endBoundary {
 661			addScore("WordMatch", scoreWordMatch)
 662		} else if startBoundary || endBoundary {
 663			addScore("PartialWordMatch", scorePartialWordMatch)
 664		}
 665
 666		if m.fileName {
 667			sep := bytes.LastIndexByte(data, '/')
 668			startMatch := int(m.byteOffset) == sep+1
 669			endMatch := endOffset == uint32(len(data))
 670			if startMatch && endMatch {
 671				addScore("Base", scoreBase)
 672			} else if startMatch || endMatch {
 673				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 674			} else if sep < int(m.byteOffset) {
 675				addScore("InnerBase", scorePartialBase)
 676			}
 677		} else if sec, si, ok := p.findSymbol(m); ok {
 678			startMatch := sec.Start == m.byteOffset
 679			endMatch := sec.End == endOffset
 680			if startMatch && endMatch {
 681				addScore("Symbol", scoreSymbol)
 682			} else if startMatch || endMatch {
 683				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 684			} else {
 685				addScore("OverlapSymbol", scorePartialSymbol)
 686			}
 687
 688			// Score based on symbol data
 689			if si != nil {
 690				symbolKind := ctags.ParseSymbolKind(si.Kind)
 691				sym := sectionSlice(data, sec)
 692
 693				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 694
 695				// This is from a symbol tree, so we need to store the symbol
 696				// information.
 697				if m.symbol {
 698					if symbolInfo == nil {
 699						symbolInfo = make([]*Symbol, len(ms))
 700					}
 701					// findSymbols does not hydrate in Sym. So we need to store it.
 702					si.Sym = string(sym)
 703					symbolInfo[i] = si
 704				}
 705			}
 706		}
 707
 708		// scoreWeight != 1 means it affects score
 709		if !epsilonEqualsOne(m.scoreWeight) {
 710			score.score = score.score * m.scoreWeight
 711			if debug {
 712				score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
 713			}
 714		}
 715
 716		if score.score > maxScore.score {
 717			maxScore.score = score.score
 718			maxScore.what = score.what
 719		}
 720	}
 721
 722	if debug {
 723		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 724	}
 725
 726	return maxScore.score, maxScore.what, symbolInfo
 727}
 728
 729// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 730// End such that it won't be out of range.
 731func sectionSlice(data []byte, sec DocumentSection) []byte {
 732	l := uint32(len(data))
 733	if sec.Start >= l {
 734		return nil
 735	}
 736	if sec.End > l {
 737		sec.End = l
 738	}
 739	return data[sec.Start:sec.End]
 740}
 741
 742// scoreSymbolKind boosts a match based on the combination of language, symbol
 743// and kind. The language string comes from go-enry, the symbol and kind from
 744// ctags.
 745func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
 746	var factor float64
 747
 748	// Generic ranking which will be overriden by language specific ranking
 749	switch kind {
 750	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 751		factor = 8
 752	case ctags.Class:
 753		factor = 10
 754	case ctags.Struct:
 755		factor = 9.5
 756	case ctags.Enum:
 757		factor = 9
 758	case ctags.Interface:
 759		factor = 8
 760	case ctags.Function, ctags.Method:
 761		factor = 7
 762	case ctags.Field:
 763		factor = 5.5
 764	case ctags.Constant:
 765		factor = 5
 766	case ctags.Variable:
 767		factor = 4
 768	default:
 769		// For all other kinds, assign a low score by default.
 770		factor = 1
 771	}
 772
 773	switch language {
 774	case "Java", "java":
 775		switch kind {
 776		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 777		// to "classes" instead of "c". We have to cover both cases to support existing
 778		// indexes.
 779		case ctags.Class:
 780			factor = 10
 781		case ctags.Enum:
 782			factor = 9
 783		case ctags.Interface:
 784			factor = 8
 785		case ctags.Method:
 786			factor = 7
 787		case ctags.Field:
 788			factor = 6
 789		case ctags.EnumConstant:
 790			factor = 5
 791		}
 792	case "Kotlin", "kotlin":
 793		switch kind {
 794		case ctags.Class:
 795			factor = 10
 796		case ctags.Interface:
 797			factor = 9
 798		case ctags.Method:
 799			factor = 8
 800		case ctags.TypeAlias:
 801			factor = 7
 802		case ctags.Constant:
 803			factor = 6
 804		case ctags.Variable:
 805			factor = 5
 806		}
 807	case "Go", "go":
 808		switch kind {
 809		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 810		// for each case a description of the fields in ctags in the comment
 811		case ctags.Type: // interface struct talias
 812			factor = 9
 813		case ctags.Interface: // interfaces
 814			factor = 10
 815		case ctags.Struct: // structs
 816			factor = 9
 817		case ctags.TypeAlias: // type aliases
 818			factor = 9
 819		case ctags.MethodSpec: // interface method specification
 820			factor = 8.5
 821		case ctags.Method, ctags.Function: // functions
 822			factor = 8
 823		case ctags.Field: // struct fields
 824			factor = 7
 825		case ctags.Constant: // constants
 826			factor = 6
 827		case ctags.Variable: // variables
 828			factor = 5
 829		}
 830
 831		// Boost exported go symbols. Same implementation as token.IsExported
 832		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 833			factor += 0.5
 834		}
 835
 836		if bytes.HasSuffix(filename, []byte("_test.go")) {
 837			factor *= 0.8
 838		}
 839
 840		// Could also rank on:
 841		//
 842		//   - anonMember  struct anonymous members
 843		//   - packageName name for specifying imported package
 844		//   - receiver    receivers
 845		//   - package     packages
 846		//   - type        types
 847		//   - unknown     unknown
 848	case "C++", "c++":
 849		switch kind {
 850		case ctags.Class: // classes
 851			factor = 10
 852		case ctags.Enum: // enumeration names
 853			factor = 9
 854		case ctags.Function: // function definitions
 855			factor = 8
 856		case ctags.Struct: // structure names
 857			factor = 7
 858		case ctags.Union: // union names
 859			factor = 6
 860		case ctags.TypeAlias: // typedefs
 861			factor = 5
 862		case ctags.Field: // class, struct, and union members
 863			factor = 4
 864		case ctags.Variable: // varialbe definitions
 865			factor = 3
 866		}
 867	// Could also rank on:
 868	// NAME        DESCRIPTION
 869	// macro       macro definitions
 870	// enumerator  enumerators (values inside an enumeration)
 871	// header      included header files
 872	// namespace   namespaces
 873	// variable    variable definitions
 874	case "Scala", "scala":
 875		switch kind {
 876		case ctags.Class:
 877			factor = 10
 878		case ctags.Interface:
 879			factor = 9
 880		case ctags.Object:
 881			factor = 8
 882		case ctags.Function:
 883			factor = 7
 884		case ctags.Type:
 885			factor = 6
 886		case ctags.Variable:
 887			factor = 5
 888		case ctags.Package:
 889			factor = 4
 890		}
 891	case "Python", "python":
 892		switch kind {
 893		case ctags.Class: // classes
 894			factor = 10
 895		case ctags.Function, ctags.Method: // function definitions
 896			factor = 8
 897		case ctags.Field: // class, struct, and union members
 898			factor = 4
 899		case ctags.Variable: // variable definitions
 900			factor = 3
 901		case ctags.Local: // local variables
 902			factor = 2
 903		}
 904		// Could also rank on:
 905		//
 906		//   - namespace name referring a module defined in other file
 907		//   - module    modules
 908		//   - unknown   name referring a class/variable/function/module defined in other module
 909		//   - parameter function parameters
 910	case "Ruby", "ruby":
 911		switch kind {
 912		case ctags.Class:
 913			factor = 10
 914		case ctags.Method:
 915			factor = 9
 916		case ctags.MethodAlias:
 917			factor = 8
 918		case ctags.Module:
 919			factor = 7
 920		case ctags.SingletonMethod:
 921			factor = 6
 922		case ctags.Constant:
 923			factor = 5
 924		case ctags.Accessor:
 925			factor = 4
 926		case ctags.Library:
 927			factor = 3
 928		}
 929	case "PHP", "php":
 930		switch kind {
 931		case ctags.Class:
 932			factor = 10
 933		case ctags.Interface:
 934			factor = 9
 935		case ctags.Function:
 936			factor = 8
 937		case ctags.Trait:
 938			factor = 7
 939		case ctags.Define:
 940			factor = 6
 941		case ctags.Namespace:
 942			factor = 5
 943		case ctags.MethodAlias:
 944			factor = 4
 945		case ctags.Variable:
 946			factor = 3
 947		case ctags.Local:
 948			factor = 3
 949		}
 950	case "GraphQL", "graphql":
 951		switch kind {
 952		case ctags.Type:
 953			factor = 10
 954		}
 955	case "Markdown", "markdown":
 956		// Headers are good signal in docs, but do not rank as highly as code.
 957		switch kind {
 958		case ctags.Chapter: // #
 959			factor = 4
 960		case ctags.Section: // ##
 961			factor = 3
 962		case ctags.Subsection: // ###
 963			factor = 2
 964		}
 965	}
 966
 967	return factor * scoreKindMatch
 968}
 969
 970type matchScoreSlice []LineMatch
 971
 972func (m matchScoreSlice) Len() int           { return len(m) }
 973func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 974func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 975
 976type chunkMatchScoreSlice []ChunkMatch
 977
 978func (m chunkMatchScoreSlice) Len() int           { return len(m) }
 979func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 980func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 981
 982type fileMatchesByScore []FileMatch
 983
 984func (m fileMatchesByScore) Len() int           { return len(m) }
 985func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 986func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
 987
 988func sortMatchesByScore(ms []LineMatch) {
 989	sort.Sort(matchScoreSlice(ms))
 990}
 991
 992func sortChunkMatchesByScore(ms []ChunkMatch) {
 993	sort.Sort(chunkMatchScoreSlice(ms))
 994}
 995
 996// SortFiles sorts files matches in the order we want to present results to
 997// users. The order depends on the match score, which includes both
 998// query-dependent signals like word overlap, and file-only signals like the
 999// file ranks (if file ranks are enabled).
1000//
1001// We don't only use the scores, we will also boost some results to present
1002// files with novel extensions.
1003func SortFiles(ms []FileMatch) {
1004	sort.Sort(fileMatchesByScore(ms))
1005
1006	// Boost a file extension not in the top 3 to the third filematch.
1007	boostNovelExtension(ms, 2, 0.9)
1008}
1009
1010func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1011	if len(ms) <= boostOffset+1 {
1012		return
1013	}
1014
1015	top := ms[:boostOffset]
1016	candidates := ms[boostOffset:]
1017
1018	// Don't bother boosting something which is significantly different to the
1019	// result it replaces.
1020	minScoreForNovelty := candidates[0].Score * minScoreRatio
1021
1022	// We want to look for an ext that isn't in the top exts
1023	exts := make([]string, len(top))
1024	for i := range top {
1025		exts[i] = path.Ext(top[i].FileName)
1026	}
1027
1028	for i := range candidates {
1029		// Do not assume sorted due to boostNovelExtension being called on subsets
1030		if candidates[i].Score < minScoreForNovelty {
1031			continue
1032		}
1033
1034		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1035			continue
1036		}
1037
1038		// Found what we are looking for, now boost to front of candidates (which
1039		// is ms[boostOffset])
1040		for ; i > 0; i-- {
1041			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1042		}
1043		return
1044	}
1045}
Configure Feed

Configure Feed