fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "path" 22 "slices" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt/ctags" 29) 30 31var _ = log.Println 32 33// contentProvider is an abstraction to treat matches for names and 34// content with the same code. 35type contentProvider struct { 36 id *indexData 37 stats *Stats 38 39 // mutable 40 err error 41 idx uint32 42 _data []byte 43 _nl []uint32 44 _nlBuf []uint32 45 _sects []DocumentSection 46 _sectBuf []DocumentSection 47 fileSize uint32 48} 49 50// setDocument skips to the given document. 51func (p *contentProvider) setDocument(docID uint32) { 52 fileStart := p.id.boundaries[docID] 53 54 p.idx = docID 55 p.fileSize = p.id.boundaries[docID+1] - fileStart 56 57 p._nl = nil 58 p._sects = nil 59 p._data = nil 60} 61 62func (p *contentProvider) docSections() []DocumentSection { 63 if p._sects == nil { 64 var sz uint32 65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 66 p.stats.ContentBytesLoaded += int64(sz) 67 p._sectBuf = p._sects 68 } 69 return p._sects 70} 71 72func (p *contentProvider) newlines() newlines { 73 if p._nl == nil { 74 var sz uint32 75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 76 p._nlBuf = p._nl 77 p.stats.ContentBytesLoaded += int64(sz) 78 } 79 return newlines{locs: p._nl, fileSize: p.fileSize} 80} 81 82func (p *contentProvider) data(fileName bool) []byte { 83 if fileName { 84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 85 } 86 87 if p._data == nil { 88 p._data, p.err = p.id.readContents(p.idx) 89 p.stats.FilesLoaded++ 90 p.stats.ContentBytesLoaded += int64(len(p._data)) 91 } 92 return p._data 93} 94 95// Find offset in bytes (relative to corpus start) for an offset in 96// runes (relative to document start). If filename is set, the corpus 97// is the set of filenames, with the document being the name itself. 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 99 if p.id.metaData.PlainASCII { 100 return r 101 } 102 103 sample := p.id.runeOffsets 104 runeEnds := p.id.fileEndRunes 105 fileStartByte := p.id.boundaries[p.idx] 106 if filename { 107 sample = p.id.fileNameRuneOffsets 108 runeEnds = p.id.fileNameEndRunes 109 fileStartByte = p.id.fileNameIndex[p.idx] 110 } 111 112 absR := r 113 if p.idx > 0 { 114 absR += runeEnds[p.idx-1] 115 } 116 117 byteOff, left := sample.lookup(absR) 118 119 var data []byte 120 121 if filename { 122 data = p.id.fileNameContent[byteOff:] 123 } else { 124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 125 if p.err != nil { 126 return 0 127 } 128 } 129 for left > 0 { 130 _, sz := utf8.DecodeRune(data) 131 byteOff += uint32(sz) 132 data = data[sz:] 133 left-- 134 } 135 136 byteOff -= fileStartByte 137 return byteOff 138} 139 140// fillMatches converts the internal candidateMatch slice into our API's LineMatch. 141// It only ever returns content XOR filename matches, not both. If there are any 142// content matches, these are always returned, and we omit filename matches. 143// 144// Performance invariant: ms is sorted and non-overlapping. 145// 146// Note: the byte slices may be backed by mmapped data, so before being 147// returned by the API it needs to be copied. 148func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 149 var filenameMatches []*candidateMatch 150 contentMatches := make([]*candidateMatch, 0, len(ms)) 151 152 for _, m := range ms { 153 if m.fileName { 154 filenameMatches = append(filenameMatches, m) 155 } else { 156 contentMatches = append(contentMatches, m) 157 } 158 } 159 160 // If there are any content matches, we only return these and skip filename matches. 161 if len(contentMatches) > 0 { 162 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 163 return p.fillContentMatches(contentMatches, numContextLines, language, debug) 164 } 165 166 // Otherwise, we return a single line containing the filematch match. 167 bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug) 168 res := LineMatch{ 169 Line: p.id.fileName(p.idx), 170 FileName: true, 171 Score: bestMatch.score, 172 DebugScore: bestMatch.debugScore, 173 } 174 175 for _, m := range ms { 176 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 177 LineOffset: int(m.byteOffset), 178 MatchLength: int(m.byteMatchSz), 179 Offset: m.byteOffset, 180 }) 181 } 182 183 return []LineMatch{res} 184 185} 186 187// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 188// It only ever returns content XOR filename matches, not both. If there are any content 189// matches, these are always returned, and we omit filename matches. 190// 191// Performance invariant: ms is sorted and non-overlapping. 192// 193// Note: the byte slices may be backed by mmapped data, so before being 194// returned by the API it needs to be copied. 195func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 196 var filenameMatches []*candidateMatch 197 contentMatches := make([]*candidateMatch, 0, len(ms)) 198 199 for _, m := range ms { 200 if m.fileName { 201 filenameMatches = append(filenameMatches, m) 202 } else { 203 contentMatches = append(contentMatches, m) 204 } 205 } 206 207 // If there are any content matches, we only return these and skip filename matches. 208 if len(contentMatches) > 0 { 209 return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) 210 } 211 212 // Otherwise, we return a single chunk representing the filename match. 213 bestMatch, _ := p.candidateMatchScore(filenameMatches, language, debug) 214 fileName := p.id.fileName(p.idx) 215 ranges := make([]Range, 0, len(ms)) 216 for _, m := range ms { 217 ranges = append(ranges, Range{ 218 Start: Location{ 219 ByteOffset: m.byteOffset, 220 LineNumber: 1, 221 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 222 }, 223 End: Location{ 224 ByteOffset: m.byteOffset + m.byteMatchSz, 225 LineNumber: 1, 226 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 227 }, 228 }) 229 } 230 231 return []ChunkMatch{{ 232 Content: fileName, 233 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 234 Ranges: ranges, 235 FileName: true, 236 Score: bestMatch.score, 237 DebugScore: bestMatch.debugScore, 238 }} 239} 240 241func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 242 var result []LineMatch 243 for len(ms) > 0 { 244 m := ms[0] 245 num := p.newlines().atOffset(m.byteOffset) 246 lineStart := int(p.newlines().lineStart(num)) 247 nextLineStart := int(p.newlines().lineStart(num + 1)) 248 249 var lineCands []*candidateMatch 250 251 endMatch := m.byteOffset + m.byteMatchSz 252 253 for len(ms) > 0 { 254 m := ms[0] 255 if int(m.byteOffset) < nextLineStart { 256 endMatch = m.byteOffset + m.byteMatchSz 257 lineCands = append(lineCands, m) 258 ms = ms[1:] 259 } else { 260 break 261 } 262 } 263 264 if len(lineCands) == 0 { 265 log.Panicf( 266 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 267 p.id.fileName(p.idx), p.id.metaData, 268 num, lineStart, nextLineStart, 269 m.byteOffset) 270 } 271 272 data := p.data(false) 273 274 // Due to merging matches, we may have a match that 275 // crosses a line boundary. Prevent confusion by 276 // taking lines until we pass the last match 277 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) { 278 next := bytes.IndexByte(data[nextLineStart:], '\n') 279 if next == -1 { 280 nextLineStart = len(data) 281 } else { 282 // TODO(hanwen): test that checks "+1" part here. 283 nextLineStart += next + 1 284 } 285 } 286 287 finalMatch := LineMatch{ 288 LineStart: lineStart, 289 LineEnd: nextLineStart, 290 LineNumber: num, 291 } 292 finalMatch.Line = data[lineStart:nextLineStart] 293 294 if numContextLines > 0 { 295 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 296 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 297 } 298 299 bestMatch, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 300 finalMatch.Score = bestMatch.score 301 finalMatch.DebugScore = bestMatch.debugScore 302 303 for i, m := range lineCands { 304 fragment := LineFragmentMatch{ 305 Offset: m.byteOffset, 306 LineOffset: int(m.byteOffset) - lineStart, 307 MatchLength: int(m.byteMatchSz), 308 } 309 if i < len(symbolInfo) && symbolInfo[i] != nil { 310 fragment.SymbolInfo = symbolInfo[i] 311 } 312 313 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 314 } 315 result = append(result, finalMatch) 316 } 317 return result 318} 319 320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 321 newlines := p.newlines() 322 data := p.data(false) 323 324 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 325 // However, it depends on ms being sorted by byteOffset and non-overlapping. 326 // This invariant is true at the time of writing, but we conservatively 327 // enforce this. Note: chunkCandidates preserves the sorting so safe to 328 // transform now. 329 columnHelper := columnHelper{data: data} 330 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 331 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 332 sort.Sort((sortByOffsetSlice)(ms)) 333 } 334 335 chunks := chunkCandidates(ms, newlines, numContextLines) 336 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 337 for _, chunk := range chunks { 338 bestMatch, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 339 340 ranges := make([]Range, 0, len(chunk.candidates)) 341 for _, cm := range chunk.candidates { 342 startOffset := cm.byteOffset 343 endOffset := cm.byteOffset + cm.byteMatchSz 344 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 345 346 ranges = append(ranges, Range{ 347 Start: Location{ 348 ByteOffset: startOffset, 349 LineNumber: uint32(startLine), 350 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset), 351 }, 352 End: Location{ 353 ByteOffset: endOffset, 354 LineNumber: uint32(endLine), 355 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset), 356 }, 357 }) 358 } 359 360 firstLineNumber := int(chunk.firstLine) - numContextLines 361 if firstLineNumber < 1 { 362 firstLineNumber = 1 363 } 364 firstLineStart := newlines.lineStart(firstLineNumber) 365 366 bestLineMatch := 0 367 if bestMatch.match != nil { 368 bestLineMatch = newlines.atOffset(bestMatch.match.byteOffset) 369 if debug { 370 bestMatch.debugScore = fmt.Sprintf("%s, (line: %d)", bestMatch.debugScore, bestLineMatch) 371 } 372 } 373 374 chunkMatches = append(chunkMatches, ChunkMatch{ 375 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 376 ContentStart: Location{ 377 ByteOffset: firstLineStart, 378 LineNumber: uint32(firstLineNumber), 379 Column: 1, 380 }, 381 FileName: false, 382 Ranges: ranges, 383 SymbolInfo: symbolInfo, 384 BestLineMatch: uint32(bestLineMatch), 385 Score: bestMatch.score, 386 DebugScore: bestMatch.debugScore, 387 }) 388 } 389 return chunkMatches 390} 391 392type candidateChunk struct { 393 candidates []*candidateMatch 394 firstLine uint32 // 1-based, inclusive 395 lastLine uint32 // 1-based, inclusive 396 minOffset uint32 // 0-based, inclusive 397 maxOffset uint32 // 0-based, exclusive 398} 399 400// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 401// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 402// it to overlap with an adjacent chunk. 403// 404// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 405// output invariants: if you flatten candidates the input invariant is retained. 406func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 407 var chunks []candidateChunk 408 for _, m := range ms { 409 startOffset := m.byteOffset 410 endOffset := m.byteOffset + m.byteMatchSz 411 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 412 413 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 414 // If a new chunk created with the current candidateMatch would 415 // overlap with the previous chunk, instead add the candidateMatch 416 // to the last chunk and extend end of the last chunk. 417 last := &chunks[len(chunks)-1] 418 last.candidates = append(last.candidates, m) 419 if last.maxOffset < endOffset { 420 last.lastLine = uint32(lastLine) 421 last.maxOffset = uint32(endOffset) 422 } 423 } else { 424 chunks = append(chunks, candidateChunk{ 425 firstLine: uint32(firstLine), 426 lastLine: uint32(lastLine), 427 minOffset: startOffset, 428 maxOffset: endOffset, 429 candidates: []*candidateMatch{m}, 430 }) 431 } 432 } 433 return chunks 434} 435 436// columnHelper is a helper struct which caches the number of runes last 437// counted. If we naively use utf8.RuneCount for each match on a line, this 438// leads to an O(nm) algorithm where m is the number of matches and n is the 439// length of the line. Aassuming we our candidates are increasing in offset 440// makes this operation O(n) instead. 441type columnHelper struct { 442 data []byte 443 444 // 0 values for all these are valid values 445 lastLineOffset int 446 lastOffset uint32 447 lastRuneCount uint32 448} 449 450// get returns the line column for offset. offset is the byte offset of the 451// rune in data. lineOffset is the byte offset inside of data for the line 452// containing offset. 453func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 454 var runeCount uint32 455 456 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 457 // Can count from last calculation 458 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 459 } else { 460 // Need to count from the beginning of line 461 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 462 } 463 464 c.lastLineOffset = lineOffset 465 c.lastOffset = offset 466 c.lastRuneCount = runeCount 467 468 return runeCount + 1 469} 470 471type newlines struct { 472 // locs is the sorted set of byte offsets of the newlines in the file 473 locs []uint32 474 475 // fileSize is just the number of bytes in the file. It is stored 476 // on this struct so we can safely know the length of the last line 477 // in the file since not all files end in a newline. 478 fileSize uint32 479} 480 481// atOffset returns the line containing the offset. If the offset lands on 482// the newline ending line M, we return M. 483func (nls newlines) atOffset(offset uint32) (lineNumber int) { 484 idx := sort.Search(len(nls.locs), func(n int) bool { 485 return nls.locs[n] >= offset 486 }) 487 return idx + 1 488} 489 490// lineStart returns the byte offset of the beginning of the given line. 491// lineNumber is 1-based. If lineNumber is out of range of the lines in the 492// file, the return value will be clamped to [0,fileSize]. 493func (nls newlines) lineStart(lineNumber int) uint32 { 494 // nls.locs[0] + 1 is the start of the 2nd line of data. 495 startIdx := lineNumber - 2 496 497 if startIdx < 0 { 498 return 0 499 } else if startIdx >= len(nls.locs) { 500 return nls.fileSize 501 } else { 502 return nls.locs[startIdx] + 1 503 } 504} 505 506// offsetRangeToLineRange returns range of lines that fully contains the given byte range. 507// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset). 508// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine]. 509func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) { 510 startLine = nls.atOffset(startOffset) 511 endLine = nls.atOffset( 512 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow 513 ) 514 return startLine, endLine 515} 516 517// getLines returns a slice of data containing the lines [low, high). 518// low is 1-based and inclusive. high is 1-based and exclusive. 519func (nls newlines) getLines(data []byte, low, high int) []byte { 520 if low >= high { 521 return nil 522 } 523 524 return data[nls.lineStart(low):nls.lineStart(high)] 525} 526 527const ( 528 // Query-dependent scoring signals. All of these together are bounded at ~9000 529 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 530 scorePartialWordMatch = 50.0 531 scoreWordMatch = 500.0 532 scoreBase = 7000.0 533 scorePartialBase = 4000.0 534 scoreSymbol = 7000.0 535 scorePartialSymbol = 4000.0 536 scoreKindMatch = 100.0 537 scoreFactorAtomMatch = 400.0 538 539 // File-only scoring signals. For now these are also bounded ~9000 to give them 540 // equal weight with the query-dependent signals. 541 scoreFileRankFactor = 9000.0 542 543 // Used for ordering line and chunk matches within a file. 544 scoreLineOrderFactor = 1.0 545 546 // Used for tiebreakers. The scores are not combined with the main score, but 547 // are used to break ties between matches with the same score. The factors are 548 // chosen to separate the tiebreakers from the main score and from each other. 549 // If you make changes here, make sure to update indexData.scoreFile too. 550 scoreRepoRankFactor = 100.0 551 scoreFileOrderFactor = 10.0 552) 553 554// findMaxOverlappingSection returns the index of the section in secs that 555// overlaps the most with the area defined by off and sz, relative to the size 556// of the section. If no section overlaps, it returns 0, false. If multiple 557// sections overlap the same amount, the first one is returned. 558// 559// The implementation assumes that sections do not overlap and are sorted by 560// DocumentSection.Start. 561func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 562 start := off 563 end := off + sz 564 565 // Find the first section that might overlap 566 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) 567 568 if j == len(secs) || secs[j].Start >= end { 569 // No overlap. 570 return 0, false 571 } 572 573 relOverlap := func(j int) float64 { 574 secSize := secs[j].End - secs[j].Start 575 if secSize == 0 { 576 return 0 577 } 578 // This cannot overflow because we make sure there is overlap before calling relOverlap 579 overlap := min(secs[j].End, end) - max(secs[j].Start, start) 580 return float64(overlap) / float64(secSize) 581 } 582 583 ol1 := relOverlap(j) 584 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { 585 return uint32(j), ol1 > 0 586 } 587 588 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check 589 // if the second section overlaps more than the first one, because a third 590 // section can only overlap if the overlap with the second section is complete. 591 ol2 := relOverlap(j + 1) 592 if ol2 > ol1 { 593 return uint32(j + 1), ol2 > 0 594 } 595 596 return uint32(j), ol1 > 0 597} 598 599func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool { 600 if cm.fileName { 601 return false 602 } 603 604 // Check if this candidate came from a symbol matchTree 605 if cm.symbol { 606 return true 607 } 608 609 // Check if it overlaps with a symbol. 610 secs := p.docSections() 611 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 612 return ok 613} 614 615func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 616 if cm.fileName { 617 return DocumentSection{}, nil, false 618 } 619 620 secs := p.docSections() 621 622 secIdx, ok := cm.symbolIdx, cm.symbol 623 if !ok { 624 // Not from a symbol matchTree. Let's see if it overlaps with a symbol. 625 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 626 } 627 if !ok { 628 return DocumentSection{}, nil, false 629 } 630 631 sec := secs[secIdx] 632 633 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 634 // since some callsites do not need it stored, and that incurs an extra 635 // copy. 636 // 637 // 2024-01-08 we are refactoring this and the code path indicates this can 638 // fail, so callers need to handle nil symbol. However, it would be 639 // surprising that we have a matching section but not symbol data. 640 start := p.id.fileEndSymbol[p.idx] 641 si := p.id.symbols.data(start + secIdx) 642 643 return sec, si, true 644} 645 646// calculateTermFrequency computes the term frequency for the file match. 647// Notes: 648// * Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. 649// * Symbol matches also count more than content matches, to reward matches on symbol definitions. 650func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { 651 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 652 // sensitivity and treat filenames and symbols the same as content. 653 termFreqs := map[string]int{} 654 for _, m := range cands { 655 term := string(m.substrLowered) 656 if m.fileName || p.matchesSymbol(m) { 657 termFreqs[term] += 5 658 } else { 659 termFreqs[term]++ 660 } 661 } 662 663 for term := range termFreqs { 664 df[term] += 1 665 } 666 return termFreqs 667} 668 669// scoredMatch holds the score information for a candidate match. 670type scoredMatch struct { 671 score float64 672 debugScore string 673 match *candidateMatch 674} 675 676// candidateMatchScore scores all candidate matches and returns the best-scoring match plus its score information. 677// Invariant: there should be at least one input candidate, len(ms) > 0. 678func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (scoredMatch, []*Symbol) { 679 score := 0.0 680 what := "" 681 682 addScore := func(w string, s float64) { 683 if s != 0 && debug { 684 what += fmt.Sprintf("%s:%.2f, ", w, s) 685 } 686 score += s 687 } 688 689 filename := p.data(true) 690 var symbolInfo []*Symbol 691 692 var bestMatch scoredMatch 693 for i, m := range ms { 694 data := p.data(m.fileName) 695 696 endOffset := m.byteOffset + m.byteMatchSz 697 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 698 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 699 700 score = 0 701 what = "" 702 703 if startBoundary && endBoundary { 704 addScore("WordMatch", scoreWordMatch) 705 } else if startBoundary || endBoundary { 706 addScore("PartialWordMatch", scorePartialWordMatch) 707 } 708 709 if m.fileName { 710 sep := bytes.LastIndexByte(data, '/') 711 startMatch := int(m.byteOffset) == sep+1 712 endMatch := endOffset == uint32(len(data)) 713 if startMatch && endMatch { 714 addScore("Base", scoreBase) 715 } else if startMatch || endMatch { 716 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 717 } else if sep < int(m.byteOffset) { 718 addScore("InnerBase", scorePartialBase) 719 } 720 } else if sec, si, ok := p.findSymbol(m); ok { 721 startMatch := sec.Start == m.byteOffset 722 endMatch := sec.End == endOffset 723 if startMatch && endMatch { 724 addScore("Symbol", scoreSymbol) 725 } else if startMatch || endMatch { 726 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 727 } else { 728 addScore("OverlapSymbol", scorePartialSymbol) 729 } 730 731 // Score based on symbol data 732 if si != nil { 733 symbolKind := ctags.ParseSymbolKind(si.Kind) 734 sym := sectionSlice(data, sec) 735 736 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 737 738 // This is from a symbol tree, so we need to store the symbol 739 // information. 740 if m.symbol { 741 if symbolInfo == nil { 742 symbolInfo = make([]*Symbol, len(ms)) 743 } 744 // findSymbols does not hydrate in Sym. So we need to store it. 745 si.Sym = string(sym) 746 symbolInfo[i] = si 747 } 748 } 749 } 750 751 // scoreWeight != 1 means it affects score 752 if !epsilonEqualsOne(m.scoreWeight) { 753 score = score * m.scoreWeight 754 if debug { 755 what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 756 } 757 } 758 759 if score > bestMatch.score { 760 bestMatch.score = score 761 bestMatch.debugScore = what 762 bestMatch.match = m 763 } 764 } 765 766 if debug { 767 bestMatch.debugScore = fmt.Sprintf("score:%.2f <- %s", bestMatch.score, strings.TrimSuffix(bestMatch.debugScore, ", ")) 768 } 769 770 return bestMatch, symbolInfo 771} 772 773// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 774// End such that it won't be out of range. 775func sectionSlice(data []byte, sec DocumentSection) []byte { 776 l := uint32(len(data)) 777 if sec.Start >= l { 778 return nil 779 } 780 if sec.End > l { 781 sec.End = l 782 } 783 return data[sec.Start:sec.End] 784} 785 786// scoreSymbolKind boosts a match based on the combination of language, symbol 787// and kind. The language string comes from go-enry, the symbol and kind from 788// ctags. 789func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 790 var factor float64 791 792 // Generic ranking which will be overriden by language specific ranking 793 switch kind { 794 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 795 factor = 8 796 case ctags.Class: 797 factor = 10 798 case ctags.Struct: 799 factor = 9.5 800 case ctags.Enum: 801 factor = 9 802 case ctags.Interface: 803 factor = 8 804 case ctags.Function, ctags.Method: 805 factor = 7 806 case ctags.Field: 807 factor = 5.5 808 case ctags.Constant: 809 factor = 5 810 case ctags.Variable: 811 factor = 4 812 default: 813 // For all other kinds, assign a low score by default. 814 factor = 1 815 } 816 817 switch language { 818 case "Java", "java": 819 switch kind { 820 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 821 // to "classes" instead of "c". We have to cover both cases to support existing 822 // indexes. 823 case ctags.Class: 824 factor = 10 825 case ctags.Enum: 826 factor = 9 827 case ctags.Interface: 828 factor = 8 829 case ctags.Method: 830 factor = 7 831 case ctags.Field: 832 factor = 6 833 case ctags.EnumConstant: 834 factor = 5 835 } 836 case "Kotlin", "kotlin": 837 switch kind { 838 case ctags.Class: 839 factor = 10 840 case ctags.Interface: 841 factor = 9 842 case ctags.Method: 843 factor = 8 844 case ctags.TypeAlias: 845 factor = 7 846 case ctags.Constant: 847 factor = 6 848 case ctags.Variable: 849 factor = 5 850 } 851 case "Go", "go": 852 switch kind { 853 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 854 // for each case a description of the fields in ctags in the comment 855 case ctags.Type: // interface struct talias 856 factor = 9 857 case ctags.Interface: // interfaces 858 factor = 10 859 case ctags.Struct: // structs 860 factor = 9 861 case ctags.TypeAlias: // type aliases 862 factor = 9 863 case ctags.MethodSpec: // interface method specification 864 factor = 8.5 865 case ctags.Method, ctags.Function: // functions 866 factor = 8 867 case ctags.Field: // struct fields 868 factor = 7 869 case ctags.Constant: // constants 870 factor = 6 871 case ctags.Variable: // variables 872 factor = 5 873 } 874 875 // Boost exported go symbols. Same implementation as token.IsExported 876 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 877 factor += 0.5 878 } 879 880 if bytes.HasSuffix(filename, []byte("_test.go")) { 881 factor *= 0.8 882 } 883 884 // Could also rank on: 885 // 886 // - anonMember struct anonymous members 887 // - packageName name for specifying imported package 888 // - receiver receivers 889 // - package packages 890 // - type types 891 // - unknown unknown 892 case "C++", "c++": 893 switch kind { 894 case ctags.Class: // classes 895 factor = 10 896 case ctags.Enum: // enumeration names 897 factor = 9 898 case ctags.Function: // function definitions 899 factor = 8 900 case ctags.Struct: // structure names 901 factor = 7 902 case ctags.Union: // union names 903 factor = 6 904 case ctags.TypeAlias: // typedefs 905 factor = 5 906 case ctags.Field: // class, struct, and union members 907 factor = 4 908 case ctags.Variable: // varialbe definitions 909 factor = 3 910 } 911 // Could also rank on: 912 // NAME DESCRIPTION 913 // macro macro definitions 914 // enumerator enumerators (values inside an enumeration) 915 // header included header files 916 // namespace namespaces 917 // variable variable definitions 918 case "Scala", "scala": 919 switch kind { 920 case ctags.Class: 921 factor = 10 922 case ctags.Interface: 923 factor = 9 924 case ctags.Object: 925 factor = 8 926 case ctags.Function: 927 factor = 7 928 case ctags.Type: 929 factor = 6 930 case ctags.Variable: 931 factor = 5 932 case ctags.Package: 933 factor = 4 934 } 935 case "Python", "python": 936 switch kind { 937 case ctags.Class: // classes 938 factor = 10 939 case ctags.Function, ctags.Method: // function definitions 940 factor = 8 941 case ctags.Field: // class, struct, and union members 942 factor = 4 943 case ctags.Variable: // variable definitions 944 factor = 3 945 case ctags.Local: // local variables 946 factor = 2 947 } 948 // Could also rank on: 949 // 950 // - namespace name referring a module defined in other file 951 // - module modules 952 // - unknown name referring a class/variable/function/module defined in other module 953 // - parameter function parameters 954 case "Ruby", "ruby": 955 switch kind { 956 case ctags.Class: 957 factor = 10 958 case ctags.Method: 959 factor = 9 960 case ctags.MethodAlias: 961 factor = 8 962 case ctags.Module: 963 factor = 7 964 case ctags.SingletonMethod: 965 factor = 6 966 case ctags.Constant: 967 factor = 5 968 case ctags.Accessor: 969 factor = 4 970 case ctags.Library: 971 factor = 3 972 } 973 case "PHP", "php": 974 switch kind { 975 case ctags.Class: 976 factor = 10 977 case ctags.Interface: 978 factor = 9 979 case ctags.Function: 980 factor = 8 981 case ctags.Trait: 982 factor = 7 983 case ctags.Define: 984 factor = 6 985 case ctags.Namespace: 986 factor = 5 987 case ctags.MethodAlias: 988 factor = 4 989 case ctags.Variable: 990 factor = 3 991 case ctags.Local: 992 factor = 3 993 } 994 case "GraphQL", "graphql": 995 switch kind { 996 case ctags.Type: 997 factor = 10 998 } 999 case "Markdown", "markdown": 1000 // Headers are good signal in docs, but do not rank as highly as code. 1001 switch kind { 1002 case ctags.Chapter: // # 1003 factor = 4 1004 case ctags.Section: // ## 1005 factor = 3 1006 case ctags.Subsection: // ### 1007 factor = 2 1008 } 1009 } 1010 1011 return factor * scoreKindMatch 1012} 1013 1014type matchScoreSlice []LineMatch 1015 1016func (m matchScoreSlice) Len() int { return len(m) } 1017func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 1018func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 1019 1020type chunkMatchScoreSlice []ChunkMatch 1021 1022func (m chunkMatchScoreSlice) Len() int { return len(m) } 1023func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 1024func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 1025 1026type fileMatchesByScore []FileMatch 1027 1028func (m fileMatchesByScore) Len() int { return len(m) } 1029func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 1030func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 1031 1032func sortMatchesByScore(ms []LineMatch) { 1033 sort.Sort(matchScoreSlice(ms)) 1034} 1035 1036func sortChunkMatchesByScore(ms []ChunkMatch) { 1037 sort.Sort(chunkMatchScoreSlice(ms)) 1038} 1039 1040// SortFiles sorts files matches in the order we want to present results to 1041// users. The order depends on the match score, which includes both 1042// query-dependent signals like word overlap, and file-only signals like the 1043// file ranks (if file ranks are enabled). 1044// 1045// We don't only use the scores, we will also boost some results to present 1046// files with novel extensions. 1047func SortFiles(ms []FileMatch) { 1048 sort.Sort(fileMatchesByScore(ms)) 1049 1050 // Boost a file extension not in the top 3 to the third filematch. 1051 boostNovelExtension(ms, 2, 0.9) 1052} 1053 1054func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 1055 if len(ms) <= boostOffset+1 { 1056 return 1057 } 1058 1059 top := ms[:boostOffset] 1060 candidates := ms[boostOffset:] 1061 1062 // Don't bother boosting something which is significantly different to the 1063 // result it replaces. 1064 minScoreForNovelty := candidates[0].Score * minScoreRatio 1065 1066 // We want to look for an ext that isn't in the top exts 1067 exts := make([]string, len(top)) 1068 for i := range top { 1069 exts[i] = path.Ext(top[i].FileName) 1070 } 1071 1072 for i := range candidates { 1073 // Do not assume sorted due to boostNovelExtension being called on subsets 1074 if candidates[i].Score < minScoreForNovelty { 1075 continue 1076 } 1077 1078 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 1079 continue 1080 } 1081 1082 // Found what we are looking for, now boost to front of candidates (which 1083 // is ms[boostOffset]) 1084 for ; i > 0; i-- { 1085 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 1086 } 1087 return 1088 } 1089}