contentprovider.go at 155050eb98d3e71e629c3d527f11b3c2047886aa · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at 155050eb98d3e71e629c3d527f11b3c2047886aa 30 kB View raw
Keegan Carruthers-Smith chunkmatches: reuse last calculated column when filling (#711) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"os"
  22	"path"
  23	"sort"
  24	"strings"
  25	"unicode"
  26	"unicode/utf8"
  27
  28	"github.com/sourcegraph/zoekt/ctags"
  29	"golang.org/x/exp/slices"
  30)
  31
  32var _ = log.Println
  33
  34// contentProvider is an abstraction to treat matches for names and
  35// content with the same code.
  36type contentProvider struct {
  37	id    *indexData
  38	stats *Stats
  39
  40	// mutable
  41	err      error
  42	idx      uint32
  43	_data    []byte
  44	_nl      []uint32
  45	_nlBuf   []uint32
  46	_sects   []DocumentSection
  47	_sectBuf []DocumentSection
  48	fileSize uint32
  49}
  50
  51// setDocument skips to the given document.
  52func (p *contentProvider) setDocument(docID uint32) {
  53	fileStart := p.id.boundaries[docID]
  54
  55	p.idx = docID
  56	p.fileSize = p.id.boundaries[docID+1] - fileStart
  57
  58	p._nl = nil
  59	p._sects = nil
  60	p._data = nil
  61}
  62
  63func (p *contentProvider) docSections() []DocumentSection {
  64	if p._sects == nil {
  65		var sz uint32
  66		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  67		p.stats.ContentBytesLoaded += int64(sz)
  68		p._sectBuf = p._sects
  69	}
  70	return p._sects
  71}
  72
  73func (p *contentProvider) newlines() newlines {
  74	if p._nl == nil {
  75		var sz uint32
  76		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  77		p._nlBuf = p._nl
  78		p.stats.ContentBytesLoaded += int64(sz)
  79	}
  80	return newlines{locs: p._nl, fileSize: p.fileSize}
  81}
  82
  83func (p *contentProvider) data(fileName bool) []byte {
  84	if fileName {
  85		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  86	}
  87
  88	if p._data == nil {
  89		p._data, p.err = p.id.readContents(p.idx)
  90		p.stats.FilesLoaded++
  91		p.stats.ContentBytesLoaded += int64(len(p._data))
  92	}
  93	return p._data
  94}
  95
  96// Find offset in bytes (relative to corpus start) for an offset in
  97// runes (relative to document start). If filename is set, the corpus
  98// is the set of filenames, with the document being the name itself.
  99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 100	if p.id.metaData.PlainASCII {
 101		return r
 102	}
 103
 104	sample := p.id.runeOffsets
 105	runeEnds := p.id.fileEndRunes
 106	fileStartByte := p.id.boundaries[p.idx]
 107	if filename {
 108		sample = p.id.fileNameRuneOffsets
 109		runeEnds = p.id.fileNameEndRunes
 110		fileStartByte = p.id.fileNameIndex[p.idx]
 111	}
 112
 113	absR := r
 114	if p.idx > 0 {
 115		absR += runeEnds[p.idx-1]
 116	}
 117
 118	byteOff, left := sample.lookup(absR)
 119
 120	var data []byte
 121
 122	if filename {
 123		data = p.id.fileNameContent[byteOff:]
 124	} else {
 125		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 126		if p.err != nil {
 127			return 0
 128		}
 129	}
 130	for left > 0 {
 131		_, sz := utf8.DecodeRune(data)
 132		byteOff += uint32(sz)
 133		data = data[sz:]
 134		left--
 135	}
 136
 137	byteOff -= fileStartByte
 138	return byteOff
 139}
 140
 141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 142	var result []LineMatch
 143	if ms[0].fileName {
 144		// There is only "line" in a filename.
 145		res := LineMatch{
 146			Line:     p.id.fileName(p.idx),
 147			FileName: true,
 148		}
 149
 150		for _, m := range ms {
 151			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 152				LineOffset:  int(m.byteOffset),
 153				MatchLength: int(m.byteMatchSz),
 154				Offset:      m.byteOffset,
 155			})
 156
 157			result = []LineMatch{res}
 158		}
 159	} else {
 160		ms = breakMatchesOnNewlines(ms, p.data(false))
 161		result = p.fillContentMatches(ms, numContextLines)
 162	}
 163
 164	sects := p.docSections()
 165	for i, m := range result {
 166		result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
 167	}
 168
 169	return result
 170}
 171
 172// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
 173//
 174// Performance invariant: ms is sorted and non-overlapping.
 175//
 176// Note: the byte slices may be backed by mmapped data, so before being
 177// returned by the API it needs to be copied.
 178func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 179	var result []ChunkMatch
 180	if ms[0].fileName {
 181		// If the first match is a filename match, there will only be
 182		// one match and the matched content will be the filename.
 183
 184		fileName := p.id.fileName(p.idx)
 185		ranges := make([]Range, 0, len(ms))
 186		for _, m := range ms {
 187			ranges = append(ranges, Range{
 188				Start: Location{
 189					ByteOffset: m.byteOffset,
 190					LineNumber: 1,
 191					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 192				},
 193				End: Location{
 194					ByteOffset: m.byteOffset + m.byteMatchSz,
 195					LineNumber: 1,
 196					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 197				},
 198			})
 199		}
 200
 201		result = []ChunkMatch{{
 202			Content:      fileName,
 203			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 204			Ranges:       ranges,
 205			FileName:     true,
 206		}}
 207	} else {
 208		result = p.fillContentChunkMatches(ms, numContextLines)
 209	}
 210
 211	sects := p.docSections()
 212	for i, m := range result {
 213		result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
 214	}
 215
 216	return result
 217}
 218
 219func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
 220	var result []LineMatch
 221	for len(ms) > 0 {
 222		m := ms[0]
 223		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
 224
 225		var lineCands []*candidateMatch
 226
 227		endMatch := m.byteOffset + m.byteMatchSz
 228
 229		for len(ms) > 0 {
 230			m := ms[0]
 231			if int(m.byteOffset) <= lineEnd {
 232				endMatch = m.byteOffset + m.byteMatchSz
 233				lineCands = append(lineCands, m)
 234				ms = ms[1:]
 235			} else {
 236				break
 237			}
 238		}
 239
 240		if len(lineCands) == 0 {
 241			log.Panicf(
 242				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 243				p.id.fileName(p.idx), p.id.metaData,
 244				num, lineStart, lineEnd,
 245				m.byteOffset)
 246		}
 247
 248		data := p.data(false)
 249
 250		// Due to merging matches, we may have a match that
 251		// crosses a line boundary. Prevent confusion by
 252		// taking lines until we pass the last match
 253		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
 254			next := bytes.IndexByte(data[lineEnd+1:], '\n')
 255			if next == -1 {
 256				lineEnd = len(data)
 257			} else {
 258				// TODO(hanwen): test that checks "+1" part here.
 259				lineEnd += next + 1
 260			}
 261		}
 262
 263		finalMatch := LineMatch{
 264			LineStart:  lineStart,
 265			LineEnd:    lineEnd,
 266			LineNumber: num,
 267		}
 268		finalMatch.Line = data[lineStart:lineEnd]
 269
 270		if numContextLines > 0 {
 271			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 272			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 273		}
 274
 275		for _, m := range lineCands {
 276			fragment := LineFragmentMatch{
 277				Offset:      m.byteOffset,
 278				LineOffset:  int(m.byteOffset) - lineStart,
 279				MatchLength: int(m.byteMatchSz),
 280			}
 281			if m.symbol {
 282				start := p.id.fileEndSymbol[p.idx]
 283				fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
 284				if fragment.SymbolInfo != nil {
 285					sec := p.docSections()[m.symbolIdx]
 286					fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
 287				}
 288			}
 289
 290			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 291		}
 292		result = append(result, finalMatch)
 293	}
 294	return result
 295}
 296
 297func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
 298	newlines := p.newlines()
 299	data := p.data(false)
 300
 301	// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
 302	// However, it depends on ms being sorted by byteOffset and non-overlapping.
 303	// This invariant is true at the time of writing, but we conservatively
 304	// enforce this. Note: chunkCandidates preserves the sorting so safe to
 305	// transform now.
 306	columnHelper := columnHelper{data: data}
 307	if !sort.IsSorted((sortByOffsetSlice)(ms)) {
 308		log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.")
 309		sort.Sort((sortByOffsetSlice)(ms))
 310	}
 311
 312	chunks := chunkCandidates(ms, newlines, numContextLines)
 313	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 314	for _, chunk := range chunks {
 315		ranges := make([]Range, 0, len(chunk.candidates))
 316		var symbolInfo []*Symbol
 317		for i, cm := range chunk.candidates {
 318			startOffset := cm.byteOffset
 319			endOffset := cm.byteOffset + cm.byteMatchSz
 320			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
 321			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
 322
 323			ranges = append(ranges, Range{
 324				Start: Location{
 325					ByteOffset: startOffset,
 326					LineNumber: uint32(startLine),
 327					Column:     columnHelper.get(startLineOffset, startOffset),
 328				},
 329				End: Location{
 330					ByteOffset: endOffset,
 331					LineNumber: uint32(endLine),
 332					Column:     columnHelper.get(endLineOffset, endOffset),
 333				},
 334			})
 335
 336			if cm.symbol {
 337				if symbolInfo == nil {
 338					symbolInfo = make([]*Symbol, len(chunk.candidates))
 339				}
 340				start := p.id.fileEndSymbol[p.idx]
 341				si := p.id.symbols.data(start + cm.symbolIdx)
 342				if si != nil {
 343					sec := p.docSections()[cm.symbolIdx]
 344					si.Sym = string(data[sec.Start:sec.End])
 345				}
 346				symbolInfo[i] = si
 347			}
 348		}
 349
 350		firstLineNumber := int(chunk.firstLine) - numContextLines
 351		if firstLineNumber < 1 {
 352			firstLineNumber = 1
 353		}
 354		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
 355
 356		chunkMatches = append(chunkMatches, ChunkMatch{
 357			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 358			ContentStart: Location{
 359				ByteOffset: firstLineStart,
 360				LineNumber: uint32(firstLineNumber),
 361				Column:     1,
 362			},
 363			FileName:   false,
 364			Ranges:     ranges,
 365			SymbolInfo: symbolInfo,
 366		})
 367	}
 368	return chunkMatches
 369}
 370
 371type candidateChunk struct {
 372	firstLine  uint32 // 1-based, inclusive
 373	lastLine   uint32 // 1-based, inclusive
 374	minOffset  uint32 // 0-based, inclusive
 375	maxOffset  uint32 // 0-based, exclusive
 376	candidates []*candidateMatch
 377}
 378
 379// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 380// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 381// it to overlap with an adjacent chunk.
 382//
 383// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset.
 384// output invariants: if you flatten candidates the input invariant is retained.
 385func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 386	var chunks []candidateChunk
 387	for _, m := range ms {
 388		startOffset := m.byteOffset
 389		endOffset := m.byteOffset + m.byteMatchSz
 390		firstLine, _, _ := newlines.atOffset(startOffset)
 391		lastLine, _, _ := newlines.atOffset(endOffset)
 392
 393		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 394			// If a new chunk created with the current candidateMatch would
 395			// overlap with the previous chunk, instead add the candidateMatch
 396			// to the last chunk and extend end of the last chunk.
 397			last := &chunks[len(chunks)-1]
 398			last.candidates = append(last.candidates, m)
 399			if last.maxOffset < endOffset {
 400				last.lastLine = uint32(lastLine)
 401				last.maxOffset = uint32(endOffset)
 402			}
 403		} else {
 404			chunks = append(chunks, candidateChunk{
 405				firstLine:  uint32(firstLine),
 406				lastLine:   uint32(lastLine),
 407				minOffset:  startOffset,
 408				maxOffset:  endOffset,
 409				candidates: []*candidateMatch{m},
 410			})
 411		}
 412	}
 413	return chunks
 414}
 415
 416// columnHelper is a helper struct which caches the number of runes last
 417// counted. If we naively use utf8.RuneCount for each match on a line, this
 418// leads to an O(nm) algorithm where m is the number of matches and n is the
 419// length of the line. Aassuming we our candidates are increasing in offset
 420// makes this operation O(n) instead.
 421type columnHelper struct {
 422	data []byte
 423
 424	// 0 values for all these are valid values
 425	lastLineOffset int
 426	lastOffset     uint32
 427	lastRuneCount  uint32
 428}
 429
 430// get returns the line column for offset. offset is the byte offset of the
 431// rune in data. lineOffset is the byte offset inside of data for the line
 432// containing offset.
 433func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
 434	var runeCount uint32
 435
 436	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
 437		// Can count from last calculation
 438		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
 439	} else {
 440		// Need to count from the beginning of line
 441		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
 442	}
 443
 444	c.lastLineOffset = lineOffset
 445	c.lastOffset = offset
 446	c.lastRuneCount = runeCount
 447
 448	return runeCount + 1
 449}
 450
 451type newlines struct {
 452	// locs is the sorted set of byte offsets of the newlines in the file
 453	locs []uint32
 454
 455	// fileSize is just the number of bytes in the file. It is stored
 456	// on this struct so we can safely know the length of the last line
 457	// in the file since not all files end in a newline.
 458	fileSize uint32
 459}
 460
 461// atOffset returns the line containing the offset. If the offset lands on
 462// the newline ending line M, we return M.  The line is characterized
 463// by its linenumber (base-1, byte index of line start, byte index of
 464// line end). The line end is the index of a newline, or the filesize
 465// (if matching the last line of the file.)
 466func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
 467	idx := sort.Search(len(nls.locs), func(n int) bool {
 468		return nls.locs[n] >= offset
 469	})
 470
 471	start, end := nls.lineBounds(idx + 1)
 472	return idx + 1, int(start), int(end)
 473}
 474
 475// lineBounds returns the byte offsets of the start and end of the 1-based
 476// lineNumber. The end offset is exclusive and will not contain the line-ending
 477// newline. If the line number is out of range of the lines in the file, start
 478// and end will be clamped to [0,fileSize].
 479func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
 480	// nls.locs[0] + 1 is the start of the 2nd line of data.
 481	startIdx := lineNumber - 2
 482	endIdx := lineNumber - 1
 483
 484	if startIdx < 0 {
 485		start = 0
 486	} else if startIdx >= len(nls.locs) {
 487		start = nls.fileSize
 488	} else {
 489		start = nls.locs[startIdx] + 1
 490	}
 491
 492	if endIdx < 0 {
 493		end = 0
 494	} else if endIdx >= len(nls.locs) {
 495		end = nls.fileSize
 496	} else {
 497		end = nls.locs[endIdx]
 498	}
 499
 500	return start, end
 501}
 502
 503// getLines returns a slice of data containing the lines [low, high).
 504// low is 1-based and inclusive. high is 1-based and exclusive.
 505func (nls newlines) getLines(data []byte, low, high int) []byte {
 506	if low >= high {
 507		return nil
 508	}
 509
 510	lowStart, _ := nls.lineBounds(low)
 511	_, highEnd := nls.lineBounds(high - 1)
 512
 513	// Drop any trailing newline. Editors do not treat a trailing newline as
 514	// the start of a new line, so we should not either. lineBounds clamps to
 515	// len(data) when an out-of-bounds line is requested.
 516	//
 517	// As an example, if we request lines 1-5 from a file with contents
 518	// `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are
 519	// the three "lines" in the file, separated by newlines.
 520	if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) {
 521		highEnd = highEnd - 1
 522		lowStart = min(lowStart, highEnd)
 523	}
 524
 525	return data[lowStart:highEnd]
 526}
 527
 528const (
 529	// Query-dependent scoring signals. All of these together are bounded at ~9000
 530	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 531	scorePartialWordMatch = 50.0
 532	scoreWordMatch        = 500.0
 533	scoreBase             = 7000.0
 534	scorePartialBase      = 4000.0
 535	scoreSymbol           = 7000.0
 536	scorePartialSymbol    = 4000.0
 537	scoreKindMatch        = 100.0
 538	scoreFactorAtomMatch  = 400.0
 539
 540	// File-only scoring signals. For now these are also bounded ~9000 to give them
 541	// equal weight with the query-dependent signals.
 542	scoreFileRankFactor  = 9000.0
 543	scoreFileOrderFactor = 10.0
 544	scoreRepoRankFactor  = 20.0
 545
 546	// Used for ordering line and chunk matches within a file.
 547	scoreLineOrderFactor = 1.0
 548)
 549
 550// findSection checks whether a section defined by offset and size lies within
 551// one of the sections in secs.
 552func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
 553	j := sort.Search(len(secs), func(i int) bool {
 554		return secs[i].End >= off+sz
 555	})
 556
 557	if j == len(secs) {
 558		return 0, false
 559	}
 560
 561	if secs[j].Start <= off && off+sz <= secs[j].End {
 562		return j, true
 563	}
 564	return 0, false
 565}
 566
 567func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
 568	type debugScore struct {
 569		score float64
 570		what  string
 571	}
 572
 573	score := &debugScore{}
 574	maxScore := &debugScore{}
 575
 576	addScore := func(what string, s float64) {
 577		if s != 0 && debug {
 578			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 579		}
 580		score.score += s
 581	}
 582
 583	data := p.data(m.FileName)
 584	filename := p.data(true)
 585
 586	for i, r := range m.Ranges {
 587		// calculate the start and end offset relative to the start of the content
 588		relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
 589		relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
 590
 591		startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
 592		endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
 593
 594		score.score = 0
 595		score.what = ""
 596
 597		if startBoundary && endBoundary {
 598			addScore("WordMatch", scoreWordMatch)
 599		} else if startBoundary || endBoundary {
 600			addScore("PartialWordMatch", scorePartialWordMatch)
 601		}
 602
 603		if m.FileName {
 604			sep := bytes.LastIndexByte(m.Content, '/')
 605			startMatch := relStartOffset == sep+1
 606			endMatch := relEndOffset == len(m.Content)
 607			if startMatch && endMatch {
 608				addScore("Base", scoreBase)
 609			} else if startMatch || endMatch {
 610				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 611			} else if sep < relStartOffset {
 612				addScore("InnerBase", scorePartialBase)
 613			}
 614		} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
 615			sec := secs[secIdx]
 616			startMatch := sec.Start == uint32(r.Start.ByteOffset)
 617			endMatch := sec.End == uint32(r.End.ByteOffset)
 618			if startMatch && endMatch {
 619				addScore("Symbol", scoreSymbol)
 620			} else if startMatch || endMatch {
 621				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 622			} else {
 623				addScore("InnerSymbol", scorePartialSymbol)
 624			}
 625
 626			var si *Symbol
 627			if m.SymbolInfo != nil {
 628				si = m.SymbolInfo[i]
 629			}
 630			if si == nil {
 631				// for non-symbol queries, we need to hydrate in SymbolInfo.
 632				start := p.id.fileEndSymbol[p.idx]
 633				si = p.id.symbols.data(start + uint32(secIdx))
 634			}
 635			if si != nil {
 636				symbolKind := ctags.ParseSymbolKind(si.Kind)
 637				sym := sectionSlice(data, sec)
 638				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 639			}
 640		}
 641
 642		if score.score > maxScore.score {
 643			maxScore.score = score.score
 644			maxScore.what = score.what
 645		}
 646	}
 647
 648	if debug {
 649		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 650	}
 651
 652	return maxScore.score, maxScore.what
 653}
 654
 655func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
 656	type debugScore struct {
 657		score float64
 658		what  string
 659	}
 660
 661	score := &debugScore{}
 662	maxScore := &debugScore{}
 663
 664	addScore := func(what string, s float64) {
 665		if s != 0 && debug {
 666			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 667		}
 668		score.score += s
 669	}
 670
 671	data := p.data(m.FileName)
 672	filename := p.data(true)
 673
 674	for _, f := range m.LineFragments {
 675		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
 676
 677		end := int(f.LineOffset) + f.MatchLength
 678		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
 679
 680		score.score = 0
 681		score.what = ""
 682
 683		if startBoundary && endBoundary {
 684			addScore("WordMatch", scoreWordMatch)
 685		} else if startBoundary || endBoundary {
 686			addScore("PartialWordMatch", scorePartialWordMatch)
 687		}
 688
 689		if m.FileName {
 690			sep := bytes.LastIndexByte(m.Line, '/')
 691			startMatch := sep+1 == f.LineOffset
 692			endMatch := len(m.Line) == f.LineOffset+f.MatchLength
 693			if startMatch && endMatch {
 694				addScore("Base", scoreBase)
 695			} else if startMatch || endMatch {
 696				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 697			} else if sep < f.LineOffset {
 698				addScore("InnerBase", scorePartialBase)
 699			}
 700		} else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
 701			sec := secs[secIdx]
 702			startMatch := sec.Start == f.Offset
 703			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
 704			if startMatch && endMatch {
 705				addScore("Symbol", scoreSymbol)
 706			} else if startMatch || endMatch {
 707				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 708			} else {
 709				addScore("InnerSymbol", scorePartialSymbol)
 710			}
 711
 712			si := f.SymbolInfo
 713			if si == nil {
 714				// for non-symbol queries, we need to hydrate in SymbolInfo.
 715				start := p.id.fileEndSymbol[p.idx]
 716				si = p.id.symbols.data(start + uint32(secIdx))
 717			}
 718			if si != nil {
 719				// the LineFragment may not be on a symbol, then si will be nil.
 720				symbolKind := ctags.ParseSymbolKind(si.Kind)
 721				sym := sectionSlice(data, sec)
 722				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
 723			}
 724		}
 725
 726		if score.score > maxScore.score {
 727			maxScore.score = score.score
 728			maxScore.what = score.what
 729		}
 730	}
 731
 732	if debug {
 733		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 734	}
 735
 736	return maxScore.score, maxScore.what
 737}
 738
 739// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 740// End such that it won't be out of range.
 741func sectionSlice(data []byte, sec DocumentSection) []byte {
 742	l := uint32(len(data))
 743	if sec.Start >= l {
 744		return nil
 745	}
 746	if sec.End > l {
 747		sec.End = l
 748	}
 749	return data[sec.Start:sec.End]
 750}
 751
 752// scoreSymbolKind boosts a match based on the combination of language, symbol
 753// and kind. The language string comes from go-enry, the symbol and kind from
 754// ctags.
 755func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
 756	var factor float64
 757
 758	// Generic ranking which will be overriden by language specific ranking
 759	switch kind {
 760	case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 761		factor = 8
 762	case ctags.Class:
 763		factor = 10
 764	case ctags.Struct:
 765		factor = 9.5
 766	case ctags.Enum:
 767		factor = 9
 768	case ctags.Interface:
 769		factor = 8
 770	case ctags.Function, ctags.Method:
 771		factor = 7
 772	case ctags.Field:
 773		factor = 5.5
 774	case ctags.Constant:
 775		factor = 5
 776	case ctags.Variable:
 777		factor = 4
 778	default:
 779		// For all other kinds, assign a low score by default.
 780		factor = 1
 781	}
 782
 783	switch language {
 784	case "Java", "java":
 785		switch kind {
 786		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 787		// to "classes" instead of "c". We have to cover both cases to support existing
 788		// indexes.
 789		case ctags.Class:
 790			factor = 10
 791		case ctags.Enum:
 792			factor = 9
 793		case ctags.Interface:
 794			factor = 8
 795		case ctags.Method:
 796			factor = 7
 797		case ctags.Field:
 798			factor = 6
 799		case ctags.EnumConstant:
 800			factor = 5
 801		}
 802	case "Kotlin", "kotlin":
 803		switch kind {
 804		case ctags.Class:
 805			factor = 10
 806		case ctags.Interface:
 807			factor = 9
 808		case ctags.Method:
 809			factor = 8
 810		case ctags.TypeAlias:
 811			factor = 7
 812		case ctags.Constant:
 813			factor = 6
 814		case ctags.Variable:
 815			factor = 5
 816		}
 817	case "Go", "go":
 818		switch kind {
 819		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 820		// for each case a description of the fields in ctags in the comment
 821		case ctags.Type: // interface struct talias
 822			factor = 9
 823		case ctags.Interface: // interfaces
 824			factor = 10
 825		case ctags.Struct: // structs
 826			factor = 9
 827		case ctags.TypeAlias: // type aliases
 828			factor = 9
 829		case ctags.MethodSpec: // interface method specification
 830			factor = 8.5
 831		case ctags.Method, ctags.Function: // functions
 832			factor = 8
 833		case ctags.Field: // struct fields
 834			factor = 7
 835		case ctags.Constant: // constants
 836			factor = 6
 837		case ctags.Variable: // variables
 838			factor = 5
 839		}
 840
 841		// Boost exported go symbols. Same implementation as token.IsExported
 842		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 843			factor += 0.5
 844		}
 845
 846		if bytes.HasSuffix(filename, []byte("_test.go")) {
 847			factor *= 0.8
 848		}
 849
 850		// Could also rank on:
 851		//
 852		//   - anonMember  struct anonymous members
 853		//   - packageName name for specifying imported package
 854		//   - receiver    receivers
 855		//   - package     packages
 856		//   - type        types
 857		//   - unknown     unknown
 858	case "C++", "c++":
 859		switch kind {
 860		case ctags.Class: // classes
 861			factor = 10
 862		case ctags.Enum: // enumeration names
 863			factor = 9
 864		case ctags.Function: // function definitions
 865			factor = 8
 866		case ctags.Struct: // structure names
 867			factor = 7
 868		case ctags.Union: // union names
 869			factor = 6
 870		case ctags.TypeAlias: // typedefs
 871			factor = 5
 872		case ctags.Field: // class, struct, and union members
 873			factor = 4
 874		case ctags.Variable: // varialbe definitions
 875			factor = 3
 876		}
 877	// Could also rank on:
 878	// NAME        DESCRIPTION
 879	// macro       macro definitions
 880	// enumerator  enumerators (values inside an enumeration)
 881	// header      included header files
 882	// namespace   namespaces
 883	// variable    variable definitions
 884	case "Scala", "scala":
 885		switch kind {
 886		case ctags.Class:
 887			factor = 10
 888		case ctags.Interface:
 889			factor = 9
 890		case ctags.Object:
 891			factor = 8
 892		case ctags.Function:
 893			factor = 7
 894		case ctags.Type:
 895			factor = 6
 896		case ctags.Variable:
 897			factor = 5
 898		case ctags.Package:
 899			factor = 4
 900		}
 901	case "Python", "python":
 902		switch kind {
 903		case ctags.Class: // classes
 904			factor = 10
 905		case ctags.Function, ctags.Method: // function definitions
 906			factor = 8
 907		case ctags.Field: // class, struct, and union members
 908			factor = 4
 909		case ctags.Variable: // variable definitions
 910			factor = 3
 911		case ctags.Local: // local variables
 912			factor = 2
 913		}
 914		// Could also rank on:
 915		//
 916		//   - namespace name referring a module defined in other file
 917		//   - module    modules
 918		//   - unknown   name referring a class/variable/function/module defined in other module
 919		//   - parameter function parameters
 920	case "Ruby", "ruby":
 921		switch kind {
 922		case ctags.Class:
 923			factor = 10
 924		case ctags.Method:
 925			factor = 9
 926		case ctags.MethodAlias:
 927			factor = 8
 928		case ctags.Module:
 929			factor = 7
 930		case ctags.SingletonMethod:
 931			factor = 6
 932		case ctags.Constant:
 933			factor = 5
 934		case ctags.Accessor:
 935			factor = 4
 936		case ctags.Library:
 937			factor = 3
 938		}
 939	case "PHP", "php":
 940		switch kind {
 941		case ctags.Class:
 942			factor = 10
 943		case ctags.Interface:
 944			factor = 9
 945		case ctags.Function:
 946			factor = 8
 947		case ctags.Trait:
 948			factor = 7
 949		case ctags.Define:
 950			factor = 6
 951		case ctags.Namespace:
 952			factor = 5
 953		case ctags.MethodAlias:
 954			factor = 4
 955		case ctags.Variable:
 956			factor = 3
 957		case ctags.Local:
 958			factor = 3
 959		}
 960	case "GraphQL", "graphql":
 961		switch kind {
 962		case ctags.Type:
 963			factor = 10
 964		}
 965	case "Markdown", "markdown":
 966		// Headers are good signal in docs, but do not rank as highly as code.
 967		switch kind {
 968		case ctags.Chapter: // #
 969			factor = 4
 970		case ctags.Section: // ##
 971			factor = 3
 972		case ctags.Subsection: // ###
 973			factor = 2
 974		}
 975	}
 976
 977	return factor * scoreKindMatch
 978}
 979
 980type matchScoreSlice []LineMatch
 981
 982func (m matchScoreSlice) Len() int           { return len(m) }
 983func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 984func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 985
 986type chunkMatchScoreSlice []ChunkMatch
 987
 988func (m chunkMatchScoreSlice) Len() int           { return len(m) }
 989func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 990func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 991
 992type fileMatchesByScore []FileMatch
 993
 994func (m fileMatchesByScore) Len() int           { return len(m) }
 995func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 996func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
 997
 998func sortMatchesByScore(ms []LineMatch) {
 999	sort.Sort(matchScoreSlice(ms))
1000}
1001
1002func sortChunkMatchesByScore(ms []ChunkMatch) {
1003	sort.Sort(chunkMatchScoreSlice(ms))
1004}
1005
1006var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
1007
1008// SortFiles sorts files matches in the order we want to present results to
1009// users. The order depends on the match score, which includes both
1010// query-dependent signals like word overlap, and file-only signals like the
1011// file ranks (if file ranks are enabled).
1012//
1013// We don't only use the scores, we will also boost some results to present
1014// files with novel extensions.
1015func SortFiles(ms []FileMatch) {
1016	sort.Sort(fileMatchesByScore(ms))
1017
1018	if doNovelty {
1019		// Experimentally boost something into the third filematch
1020		boostNovelExtension(ms, 2, 0.9)
1021	}
1022}
1023
1024func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
1025	if len(ms) <= boostOffset+1 {
1026		return
1027	}
1028
1029	top := ms[:boostOffset]
1030	candidates := ms[boostOffset:]
1031
1032	// Don't bother boosting something which is significantly different to the
1033	// result it replaces.
1034	minScoreForNovelty := candidates[0].Score * minScoreRatio
1035
1036	// We want to look for an ext that isn't in the top exts
1037	exts := make([]string, len(top))
1038	for i := range top {
1039		exts[i] = path.Ext(top[i].FileName)
1040	}
1041
1042	for i := range candidates {
1043		// Do not assume sorted due to boostNovelExtension being called on subsets
1044		if candidates[i].Score < minScoreForNovelty {
1045			continue
1046		}
1047
1048		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
1049			continue
1050		}
1051
1052		// Found what we are looking for, now boost to front of candidates (which
1053		// is ms[boostOffset])
1054		for ; i > 0; i-- {
1055			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
1056		}
1057		return
1058	}
1059}
Configure Feed

Configure Feed