contentprovider.go at b5a5fdc86e830d80b2be3154bb9f896fd20c3e3e · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / contentprovider.go
at b5a5fdc86e830d80b2be3154bb9f896fd20c3e3e 28 kB View raw
Keegan Carruthers-Smith score: boost exported go ident and downrank _test.go (#675) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"os"
  22	"path"
  23	"sort"
  24	"strings"
  25	"unicode"
  26	"unicode/utf8"
  27
  28	"golang.org/x/exp/slices"
  29)
  30
  31var _ = log.Println
  32
  33// contentProvider is an abstraction to treat matches for names and
  34// content with the same code.
  35type contentProvider struct {
  36	id    *indexData
  37	stats *Stats
  38
  39	// mutable
  40	err      error
  41	idx      uint32
  42	_data    []byte
  43	_nl      []uint32
  44	_nlBuf   []uint32
  45	_sects   []DocumentSection
  46	_sectBuf []DocumentSection
  47	fileSize uint32
  48}
  49
  50// setDocument skips to the given document.
  51func (p *contentProvider) setDocument(docID uint32) {
  52	fileStart := p.id.boundaries[docID]
  53
  54	p.idx = docID
  55	p.fileSize = p.id.boundaries[docID+1] - fileStart
  56
  57	p._nl = nil
  58	p._sects = nil
  59	p._data = nil
  60}
  61
  62func (p *contentProvider) docSections() []DocumentSection {
  63	if p._sects == nil {
  64		var sz uint32
  65		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
  66		p.stats.ContentBytesLoaded += int64(sz)
  67		p._sectBuf = p._sects
  68	}
  69	return p._sects
  70}
  71
  72func (p *contentProvider) newlines() newlines {
  73	if p._nl == nil {
  74		var sz uint32
  75		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
  76		p._nlBuf = p._nl
  77		p.stats.ContentBytesLoaded += int64(sz)
  78	}
  79	return newlines{locs: p._nl, fileSize: p.fileSize}
  80}
  81
  82func (p *contentProvider) data(fileName bool) []byte {
  83	if fileName {
  84		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
  85	}
  86
  87	if p._data == nil {
  88		p._data, p.err = p.id.readContents(p.idx)
  89		p.stats.FilesLoaded++
  90		p.stats.ContentBytesLoaded += int64(len(p._data))
  91	}
  92	return p._data
  93}
  94
  95// Find offset in bytes (relative to corpus start) for an offset in
  96// runes (relative to document start). If filename is set, the corpus
  97// is the set of filenames, with the document being the name itself.
  98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
  99	if p.id.metaData.PlainASCII {
 100		return r
 101	}
 102
 103	sample := p.id.runeOffsets
 104	runeEnds := p.id.fileEndRunes
 105	fileStartByte := p.id.boundaries[p.idx]
 106	if filename {
 107		sample = p.id.fileNameRuneOffsets
 108		runeEnds = p.id.fileNameEndRunes
 109		fileStartByte = p.id.fileNameIndex[p.idx]
 110	}
 111
 112	absR := r
 113	if p.idx > 0 {
 114		absR += runeEnds[p.idx-1]
 115	}
 116
 117	byteOff, left := sample.lookup(absR)
 118
 119	var data []byte
 120
 121	if filename {
 122		data = p.id.fileNameContent[byteOff:]
 123	} else {
 124		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 125		if p.err != nil {
 126			return 0
 127		}
 128	}
 129	for left > 0 {
 130		_, sz := utf8.DecodeRune(data)
 131		byteOff += uint32(sz)
 132		data = data[sz:]
 133		left--
 134	}
 135
 136	byteOff -= fileStartByte
 137	return byteOff
 138}
 139
 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
 141	var result []LineMatch
 142	if ms[0].fileName {
 143		// There is only "line" in a filename.
 144		res := LineMatch{
 145			Line:     p.id.fileName(p.idx),
 146			FileName: true,
 147		}
 148
 149		for _, m := range ms {
 150			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 151				LineOffset:  int(m.byteOffset),
 152				MatchLength: int(m.byteMatchSz),
 153				Offset:      m.byteOffset,
 154			})
 155
 156			result = []LineMatch{res}
 157		}
 158	} else {
 159		ms = breakMatchesOnNewlines(ms, p.data(false))
 160		result = p.fillContentMatches(ms, numContextLines)
 161	}
 162
 163	sects := p.docSections()
 164	for i, m := range result {
 165		result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
 166	}
 167
 168	return result
 169}
 170
 171func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
 172	var result []ChunkMatch
 173	if ms[0].fileName {
 174		// If the first match is a filename match, there will only be
 175		// one match and the matched content will be the filename.
 176
 177		fileName := p.id.fileName(p.idx)
 178		ranges := make([]Range, 0, len(ms))
 179		for _, m := range ms {
 180			ranges = append(ranges, Range{
 181				Start: Location{
 182					ByteOffset: m.byteOffset,
 183					LineNumber: 1,
 184					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
 185				},
 186				End: Location{
 187					ByteOffset: m.byteOffset + m.byteMatchSz,
 188					LineNumber: 1,
 189					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
 190				},
 191			})
 192		}
 193
 194		result = []ChunkMatch{{
 195			Content:      fileName,
 196			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
 197			Ranges:       ranges,
 198			FileName:     true,
 199		}}
 200	} else {
 201		result = p.fillContentChunkMatches(ms, numContextLines)
 202	}
 203
 204	sects := p.docSections()
 205	for i, m := range result {
 206		result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
 207	}
 208
 209	return result
 210}
 211
 212func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
 213	var result []LineMatch
 214	for len(ms) > 0 {
 215		m := ms[0]
 216		num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset)
 217
 218		var lineCands []*candidateMatch
 219
 220		endMatch := m.byteOffset + m.byteMatchSz
 221
 222		for len(ms) > 0 {
 223			m := ms[0]
 224			if int(m.byteOffset) <= lineEnd {
 225				endMatch = m.byteOffset + m.byteMatchSz
 226				lineCands = append(lineCands, m)
 227				ms = ms[1:]
 228			} else {
 229				break
 230			}
 231		}
 232
 233		if len(lineCands) == 0 {
 234			log.Panicf(
 235				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 236				p.id.fileName(p.idx), p.id.metaData,
 237				num, lineStart, lineEnd,
 238				m.byteOffset)
 239		}
 240
 241		data := p.data(false)
 242
 243		// Due to merging matches, we may have a match that
 244		// crosses a line boundary. Prevent confusion by
 245		// taking lines until we pass the last match
 246		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
 247			next := bytes.IndexByte(data[lineEnd+1:], '\n')
 248			if next == -1 {
 249				lineEnd = len(data)
 250			} else {
 251				// TODO(hanwen): test that checks "+1" part here.
 252				lineEnd += next + 1
 253			}
 254		}
 255
 256		finalMatch := LineMatch{
 257			LineStart:  lineStart,
 258			LineEnd:    lineEnd,
 259			LineNumber: num,
 260		}
 261		finalMatch.Line = data[lineStart:lineEnd]
 262
 263		if numContextLines > 0 {
 264			finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num)
 265			finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
 266		}
 267
 268		for _, m := range lineCands {
 269			fragment := LineFragmentMatch{
 270				Offset:      m.byteOffset,
 271				LineOffset:  int(m.byteOffset) - lineStart,
 272				MatchLength: int(m.byteMatchSz),
 273			}
 274			if m.symbol {
 275				start := p.id.fileEndSymbol[p.idx]
 276				fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
 277				if fragment.SymbolInfo != nil {
 278					sec := p.docSections()[m.symbolIdx]
 279					fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
 280				}
 281			}
 282
 283			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 284		}
 285		result = append(result, finalMatch)
 286	}
 287	return result
 288}
 289
 290func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
 291	newlines := p.newlines()
 292	chunks := chunkCandidates(ms, newlines, numContextLines)
 293	data := p.data(false)
 294	chunkMatches := make([]ChunkMatch, 0, len(chunks))
 295	for _, chunk := range chunks {
 296		ranges := make([]Range, 0, len(chunk.candidates))
 297		var symbolInfo []*Symbol
 298		for i, cm := range chunk.candidates {
 299			startOffset := cm.byteOffset
 300			endOffset := cm.byteOffset + cm.byteMatchSz
 301			startLine, startLineOffset, _ := newlines.atOffset(startOffset)
 302			endLine, endLineOffset, _ := newlines.atOffset(endOffset)
 303
 304			ranges = append(ranges, Range{
 305				Start: Location{
 306					ByteOffset: startOffset,
 307					LineNumber: uint32(startLine),
 308					Column:     uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
 309				},
 310				End: Location{
 311					ByteOffset: endOffset,
 312					LineNumber: uint32(endLine),
 313					Column:     uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
 314				},
 315			})
 316
 317			if cm.symbol {
 318				if symbolInfo == nil {
 319					symbolInfo = make([]*Symbol, len(chunk.candidates))
 320				}
 321				start := p.id.fileEndSymbol[p.idx]
 322				si := p.id.symbols.data(start + cm.symbolIdx)
 323				if si != nil {
 324					sec := p.docSections()[cm.symbolIdx]
 325					si.Sym = string(data[sec.Start:sec.End])
 326				}
 327				symbolInfo[i] = si
 328			}
 329		}
 330
 331		firstLineNumber := int(chunk.firstLine) - numContextLines
 332		if firstLineNumber < 1 {
 333			firstLineNumber = 1
 334		}
 335		firstLineStart, _ := newlines.lineBounds(firstLineNumber)
 336
 337		chunkMatches = append(chunkMatches, ChunkMatch{
 338			Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
 339			ContentStart: Location{
 340				ByteOffset: firstLineStart,
 341				LineNumber: uint32(firstLineNumber),
 342				Column:     1,
 343			},
 344			FileName:   false,
 345			Ranges:     ranges,
 346			SymbolInfo: symbolInfo,
 347		})
 348	}
 349	return chunkMatches
 350}
 351
 352type candidateChunk struct {
 353	firstLine  uint32 // 1-based, inclusive
 354	lastLine   uint32 // 1-based, inclusive
 355	minOffset  uint32 // 0-based, inclusive
 356	maxOffset  uint32 // 0-based, exclusive
 357	candidates []*candidateMatch
 358}
 359
 360// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent
 361// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause
 362// it to overlap with an adjacent chunk.
 363func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk {
 364	var chunks []candidateChunk
 365	for _, m := range ms {
 366		startOffset := m.byteOffset
 367		endOffset := m.byteOffset + m.byteMatchSz
 368		firstLine, _, _ := newlines.atOffset(startOffset)
 369		lastLine, _, _ := newlines.atOffset(endOffset)
 370
 371		if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines {
 372			// If a new chunk created with the current candidateMatch would
 373			// overlap with the previous chunk, instead add the candidateMatch
 374			// to the last chunk and extend end of the last chunk.
 375			last := &chunks[len(chunks)-1]
 376			last.candidates = append(last.candidates, m)
 377			if last.maxOffset < endOffset {
 378				last.lastLine = uint32(lastLine)
 379				last.maxOffset = uint32(endOffset)
 380			}
 381		} else {
 382			chunks = append(chunks, candidateChunk{
 383				firstLine:  uint32(firstLine),
 384				lastLine:   uint32(lastLine),
 385				minOffset:  startOffset,
 386				maxOffset:  endOffset,
 387				candidates: []*candidateMatch{m},
 388			})
 389		}
 390	}
 391	return chunks
 392}
 393
 394type newlines struct {
 395	// locs is the sorted set of byte offsets of the newlines in the file
 396	locs []uint32
 397
 398	// fileSize is just the number of bytes in the file. It is stored
 399	// on this struct so we can safely know the length of the last line
 400	// in the file since not all files end in a newline.
 401	fileSize uint32
 402}
 403
 404// atOffset returns the line containing the offset. If the offset lands on
 405// the newline ending line M, we return M.  The line is characterized
 406// by its linenumber (base-1, byte index of line start, byte index of
 407// line end). The line end is the index of a newline, or the filesize
 408// (if matching the last line of the file.)
 409func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) {
 410	idx := sort.Search(len(nls.locs), func(n int) bool {
 411		return nls.locs[n] >= offset
 412	})
 413
 414	start, end := nls.lineBounds(idx + 1)
 415	return idx + 1, int(start), int(end)
 416}
 417
 418// lineBounds returns the byte offsets of the start and end of the 1-based
 419// lineNumber. The end offset is exclusive and will not contain the line-ending
 420// newline. If the line number is out of range of the lines in the file, start
 421// and end will be clamped to [0,fileSize].
 422func (nls newlines) lineBounds(lineNumber int) (start, end uint32) {
 423	// nls.locs[0] + 1 is the start of the 2nd line of data.
 424	startIdx := lineNumber - 2
 425	endIdx := lineNumber - 1
 426
 427	if startIdx < 0 {
 428		start = 0
 429	} else if startIdx >= len(nls.locs) {
 430		start = nls.fileSize
 431	} else {
 432		start = nls.locs[startIdx] + 1
 433	}
 434
 435	if endIdx < 0 {
 436		end = 0
 437	} else if endIdx >= len(nls.locs) {
 438		end = nls.fileSize
 439	} else {
 440		end = nls.locs[endIdx]
 441	}
 442
 443	return start, end
 444}
 445
 446// getLines returns a slice of data containing the lines [low, high).
 447// low is 1-based and inclusive. high is 1-based and exclusive.
 448func (nls newlines) getLines(data []byte, low, high int) []byte {
 449	if low >= high {
 450		return nil
 451	}
 452
 453	lowStart, _ := nls.lineBounds(low)
 454	_, highEnd := nls.lineBounds(high - 1)
 455
 456	return data[lowStart:highEnd]
 457}
 458
 459const (
 460	// Query-dependent scoring signals. All of these together are bounded at ~9000
 461	// (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch).
 462	scorePartialWordMatch = 50.0
 463	scoreWordMatch        = 500.0
 464	scoreBase             = 7000.0
 465	scorePartialBase      = 4000.0
 466	scoreSymbol           = 7000.0
 467	scorePartialSymbol    = 4000.0
 468	scoreKindMatch        = 100.0
 469	scoreFactorAtomMatch  = 400.0
 470
 471	// File-only scoring signals. For now these are also bounded ~9000 to give them
 472	// equal weight with the query-dependent signals.
 473	scoreFileRankFactor  = 9000.0
 474	scoreFileOrderFactor = 10.0
 475	scoreRepoRankFactor  = 20.0
 476
 477	// Used for ordering line and chunk matches within a file.
 478	scoreLineOrderFactor = 1.0
 479)
 480
 481// findSection checks whether a section defined by offset and size lies within
 482// one of the sections in secs.
 483func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
 484	j := sort.Search(len(secs), func(i int) bool {
 485		return secs[i].End >= off+sz
 486	})
 487
 488	if j == len(secs) {
 489		return 0, false
 490	}
 491
 492	if secs[j].Start <= off && off+sz <= secs[j].End {
 493		return j, true
 494	}
 495	return 0, false
 496}
 497
 498func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
 499	type debugScore struct {
 500		score float64
 501		what  string
 502	}
 503
 504	score := &debugScore{}
 505	maxScore := &debugScore{}
 506
 507	addScore := func(what string, s float64) {
 508		if s != 0 && debug {
 509			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 510		}
 511		score.score += s
 512	}
 513
 514	data := p.data(m.FileName)
 515	filename := p.data(true)
 516
 517	for i, r := range m.Ranges {
 518		// calculate the start and end offset relative to the start of the content
 519		relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
 520		relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
 521
 522		startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
 523		endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
 524
 525		score.score = 0
 526		score.what = ""
 527
 528		if startBoundary && endBoundary {
 529			addScore("WordMatch", scoreWordMatch)
 530		} else if startBoundary || endBoundary {
 531			addScore("PartialWordMatch", scorePartialWordMatch)
 532		}
 533
 534		if m.FileName {
 535			sep := bytes.LastIndexByte(m.Content, '/')
 536			startMatch := relStartOffset == sep+1
 537			endMatch := relEndOffset == len(m.Content)
 538			if startMatch && endMatch {
 539				addScore("Base", scoreBase)
 540			} else if startMatch || endMatch {
 541				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 542			} else if sep < relStartOffset {
 543				addScore("InnerBase", scorePartialBase)
 544			}
 545		} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
 546			sec := secs[secIdx]
 547			startMatch := sec.Start == uint32(r.Start.ByteOffset)
 548			endMatch := sec.End == uint32(r.End.ByteOffset)
 549			if startMatch && endMatch {
 550				addScore("Symbol", scoreSymbol)
 551			} else if startMatch || endMatch {
 552				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 553			} else {
 554				addScore("InnerSymbol", scorePartialSymbol)
 555			}
 556
 557			var si *Symbol
 558			if m.SymbolInfo != nil {
 559				si = m.SymbolInfo[i]
 560			}
 561			if si == nil {
 562				// for non-symbol queries, we need to hydrate in SymbolInfo.
 563				start := p.id.fileEndSymbol[p.idx]
 564				si = p.id.symbols.data(start + uint32(secIdx))
 565			}
 566			if si != nil {
 567				sym := sectionSlice(data, sec)
 568				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind))
 569			}
 570		}
 571
 572		if score.score > maxScore.score {
 573			maxScore.score = score.score
 574			maxScore.what = score.what
 575		}
 576	}
 577
 578	if debug {
 579		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 580	}
 581
 582	return maxScore.score, maxScore.what
 583}
 584
 585func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
 586	type debugScore struct {
 587		score float64
 588		what  string
 589	}
 590
 591	score := &debugScore{}
 592	maxScore := &debugScore{}
 593
 594	addScore := func(what string, s float64) {
 595		if s != 0 && debug {
 596			score.what += fmt.Sprintf("%s:%.2f, ", what, s)
 597		}
 598		score.score += s
 599	}
 600
 601	data := p.data(m.FileName)
 602	filename := p.data(true)
 603
 604	for _, f := range m.LineFragments {
 605		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
 606
 607		end := int(f.LineOffset) + f.MatchLength
 608		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
 609
 610		score.score = 0
 611		score.what = ""
 612
 613		if startBoundary && endBoundary {
 614			addScore("WordMatch", scoreWordMatch)
 615		} else if startBoundary || endBoundary {
 616			addScore("PartialWordMatch", scorePartialWordMatch)
 617		}
 618
 619		if m.FileName {
 620			sep := bytes.LastIndexByte(m.Line, '/')
 621			startMatch := sep+1 == f.LineOffset
 622			endMatch := len(m.Line) == f.LineOffset+f.MatchLength
 623			if startMatch && endMatch {
 624				addScore("Base", scoreBase)
 625			} else if startMatch || endMatch {
 626				addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
 627			} else if sep < f.LineOffset {
 628				addScore("InnerBase", scorePartialBase)
 629			}
 630		} else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok {
 631			sec := secs[secIdx]
 632			startMatch := sec.Start == f.Offset
 633			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
 634			if startMatch && endMatch {
 635				addScore("Symbol", scoreSymbol)
 636			} else if startMatch || endMatch {
 637				addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2)
 638			} else {
 639				addScore("InnerSymbol", scorePartialSymbol)
 640			}
 641
 642			si := f.SymbolInfo
 643			if si == nil {
 644				// for non-symbol queries, we need to hydrate in SymbolInfo.
 645				start := p.id.fileEndSymbol[p.idx]
 646				si = p.id.symbols.data(start + uint32(secIdx))
 647			}
 648			if si != nil {
 649				// the LineFragment may not be on a symbol, then si will be nil.
 650				sym := sectionSlice(data, sec)
 651				addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind))
 652			}
 653		}
 654
 655		if score.score > maxScore.score {
 656			maxScore.score = score.score
 657			maxScore.what = score.what
 658		}
 659	}
 660
 661	if debug {
 662		maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
 663	}
 664
 665	return maxScore.score, maxScore.what
 666}
 667
 668// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
 669// End such that it won't be out of range.
 670func sectionSlice(data []byte, sec DocumentSection) []byte {
 671	l := uint32(len(data))
 672	if sec.Start >= l {
 673		return nil
 674	}
 675	if sec.End > l {
 676		sec.End = l
 677	}
 678	return data[sec.Start:sec.End]
 679}
 680
 681// scoreSymbolKind boosts a match based on the combination of language, symbol
 682// and kind. The language string comes from go-enry, the symbol and kind from
 683// ctags.
 684func scoreSymbolKind(language string, filename []byte, sym []byte, kind string) float64 {
 685	var factor float64
 686
 687	// Generic ranking which will be overriden by language specific ranking
 688	switch kind {
 689	case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 690		factor = 8
 691	case "class":
 692		factor = 10
 693	case "struct":
 694		factor = 9.5
 695	case "enum":
 696		factor = 9
 697	case "interface":
 698		factor = 8
 699	case "function", "func", "method":
 700		factor = 7
 701	case "member", "field":
 702		factor = 5.5
 703	case "constant", "const":
 704		factor = 5
 705	case "var", "variable":
 706		factor = 4
 707
 708	default:
 709		// No idea what it is, but its something regarded as a symbol
 710		factor = 1
 711	}
 712
 713	// Refer to universal-ctags --list-kinds-full=<language> to learn about which
 714	// kinds are detected for which language.
 715	//
 716	// Note that go-ctags uses universal-ctags's interactive mode and thus returns
 717	// the full name for "kind" and not the one-letter abbreviation.
 718	switch language {
 719	case "Java", "java":
 720		switch kind {
 721		// 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind"
 722		// to "classes" instead of "c". We have to cover both cases to support existing
 723		// indexes.
 724		case "class", "classes":
 725			factor = 10
 726		case "enum":
 727			factor = 9
 728		case "interface":
 729			factor = 8
 730		case "method":
 731			factor = 7
 732		case "field":
 733			factor = 6
 734		case "enumConstant":
 735			factor = 5
 736		}
 737	case "Kotlin", "kotlin":
 738		switch kind {
 739		case "class":
 740			factor = 10
 741		case "interface":
 742			factor = 9
 743		case "method":
 744			factor = 8
 745		case "typealias":
 746			factor = 7
 747		case "constant":
 748			factor = 6
 749		case "variable":
 750			factor = 5
 751		}
 752	case "Go", "go":
 753		switch kind {
 754		// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
 755		// for each case a description of the fields in ctags in the comment
 756		case "type": // interface struct talias
 757			factor = 9
 758		case "method", "function": // methodSpec func
 759			factor = 8
 760		case "variable": // var member
 761			factor = 7
 762		case "constant": // const
 763			factor = 6
 764
 765		case "interface": // interfaces
 766			factor = 10
 767		case "struct": // structs
 768			factor = 9
 769		case "talias": // type aliases
 770			factor = 9
 771		case "methodSpec": // interface method specification
 772			factor = 8.5
 773		case "func": // functions
 774			factor = 8
 775		case "member": // struct members
 776			factor = 7
 777		case "const": // constants
 778			factor = 6
 779		case "var": // variables
 780			factor = 5
 781		}
 782
 783		// Boost exported go symbols. Same implementation as token.IsExported
 784		if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
 785			factor += 0.5
 786		}
 787
 788		if bytes.HasSuffix(filename, []byte("_test.go")) {
 789			factor *= 0.8
 790		}
 791
 792		// Could also rank on:
 793		//
 794		//   - anonMember  struct anonymous members
 795		//   - packageName name for specifying imported package
 796		//   - receiver    receivers
 797		//   - package     packages
 798		//   - type        types
 799		//   - unknown     unknown
 800	case "C++", "c++":
 801		switch kind {
 802		case "class": // classes
 803			factor = 10
 804		case "enum": // enumeration names
 805			factor = 9
 806		case "function": // function definitions
 807			factor = 8
 808		case "struct": // structure names
 809			factor = 7
 810		case "union": // union names
 811			factor = 6
 812		case "typdef": // typedefs
 813			factor = 5
 814		case "member": // class, struct, and union members
 815			factor = 4
 816		case "variable": // varialbe definitions
 817			factor = 3
 818		}
 819	// Could also rank on:
 820	// NAME        DESCRIPTION
 821	// macro       macro definitions
 822	// enumerator  enumerators (values inside an enumeration)
 823	// header      included header files
 824	// namespace   namespaces
 825	// variable    variable definitions
 826	case "Scala", "scala":
 827		switch kind {
 828		case "class":
 829			factor = 10
 830		case "interface":
 831			factor = 9
 832		case "object":
 833			factor = 8
 834		case "method":
 835			factor = 7
 836		case "type":
 837			factor = 6
 838		case "variable":
 839			factor = 5
 840		case "package":
 841			factor = 4
 842		}
 843	case "Python", "python":
 844		switch kind {
 845		case "class": // classes
 846			factor = 10
 847		case "function": // function definitions
 848			factor = 8
 849		case "member": // class, struct, and union members
 850			factor = 4
 851		case "variable": // variable definitions
 852			factor = 3
 853		case "local": // local variables
 854			factor = 2
 855		}
 856		// Could also rank on:
 857		//
 858		//   - namespace name referring a module defined in other file
 859		//   - module    modules
 860		//   - unknown   name referring a class/variable/function/module defined in other module
 861		//   - parameter function parameters
 862	case "Ruby", "ruby":
 863		switch kind {
 864		case "class":
 865			factor = 10
 866		case "method":
 867			factor = 9
 868		case "alias":
 869			factor = 8
 870		case "module":
 871			factor = 7
 872		case "singletonMethod":
 873			factor = 6
 874		case "constant":
 875			factor = 5
 876		case "accessor":
 877			factor = 4
 878		case "library":
 879			factor = 3
 880		}
 881	case "PHP", "php":
 882		switch kind {
 883		case "class":
 884			factor = 10
 885		case "interface":
 886			factor = 9
 887		case "function":
 888			factor = 8
 889		case "trait":
 890			factor = 7
 891		case "define":
 892			factor = 6
 893		case "namespace":
 894			factor = 5
 895		case "alias":
 896			factor = 4
 897		case "variable":
 898			factor = 3
 899		case "local":
 900			factor = 3
 901		}
 902	case "GraphQL", "graphql":
 903		switch kind {
 904		case "type":
 905			factor = 10
 906		}
 907	case "Markdown", "markdown":
 908		// Headers are good signal in docs, but do not rank as highly as code.
 909		switch kind {
 910		case "chapter": // #
 911			factor = 4
 912		case "section": // ##
 913			factor = 3
 914		case "subsection": // ###
 915			factor = 2
 916		}
 917	}
 918
 919	return factor * scoreKindMatch
 920}
 921
 922type matchScoreSlice []LineMatch
 923
 924func (m matchScoreSlice) Len() int           { return len(m) }
 925func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 926func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 927
 928type chunkMatchScoreSlice []ChunkMatch
 929
 930func (m chunkMatchScoreSlice) Len() int           { return len(m) }
 931func (m chunkMatchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 932func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
 933
 934type fileMatchesByScore []FileMatch
 935
 936func (m fileMatchesByScore) Len() int           { return len(m) }
 937func (m fileMatchesByScore) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 938func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score }
 939
 940func sortMatchesByScore(ms []LineMatch) {
 941	sort.Sort(matchScoreSlice(ms))
 942}
 943
 944func sortChunkMatchesByScore(ms []ChunkMatch) {
 945	sort.Sort(chunkMatchScoreSlice(ms))
 946}
 947
 948var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""
 949
 950// SortFiles sorts files matches in the order we want to present results to
 951// users. The order depends on the match score, which includes both
 952// query-dependent signals like word overlap, and file-only signals like the
 953// file ranks (if file ranks are enabled).
 954//
 955// We don't only use the scores, we will also boost some results to present
 956// files with novel extensions.
 957func SortFiles(ms []FileMatch) {
 958	sort.Sort(fileMatchesByScore(ms))
 959
 960	if doNovelty {
 961		// Experimentally boost something into the third filematch
 962		boostNovelExtension(ms, 2, 0.9)
 963	}
 964}
 965
 966func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
 967	if len(ms) <= boostOffset+1 {
 968		return
 969	}
 970
 971	top := ms[:boostOffset]
 972	candidates := ms[boostOffset:]
 973
 974	// Don't bother boosting something which is significantly different to the
 975	// result it replaces.
 976	minScoreForNovelty := candidates[0].Score * minScoreRatio
 977
 978	// We want to look for an ext that isn't in the top exts
 979	exts := make([]string, len(top))
 980	for i := range top {
 981		exts[i] = path.Ext(top[i].FileName)
 982	}
 983
 984	for i := range candidates {
 985		// Do not assume sorted due to boostNovelExtension being called on subsets
 986		if candidates[i].Score < minScoreForNovelty {
 987			continue
 988		}
 989
 990		if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
 991			continue
 992		}
 993
 994		// Found what we are looking for, now boost to front of candidates (which
 995		// is ms[boostOffset])
 996		for ; i > 0; i-- {
 997			candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
 998		}
 999		return
1000	}
1001}
Configure Feed

Configure Feed