fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "path" 22 "slices" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt/ctags" 29) 30 31var _ = log.Println 32 33// contentProvider is an abstraction to treat matches for names and 34// content with the same code. 35type contentProvider struct { 36 id *indexData 37 stats *Stats 38 39 // mutable 40 err error 41 idx uint32 42 _data []byte 43 _nl []uint32 44 _nlBuf []uint32 45 _sects []DocumentSection 46 _sectBuf []DocumentSection 47 fileSize uint32 48} 49 50// setDocument skips to the given document. 51func (p *contentProvider) setDocument(docID uint32) { 52 fileStart := p.id.boundaries[docID] 53 54 p.idx = docID 55 p.fileSize = p.id.boundaries[docID+1] - fileStart 56 57 p._nl = nil 58 p._sects = nil 59 p._data = nil 60} 61 62func (p *contentProvider) docSections() []DocumentSection { 63 if p._sects == nil { 64 var sz uint32 65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 66 p.stats.ContentBytesLoaded += int64(sz) 67 p._sectBuf = p._sects 68 } 69 return p._sects 70} 71 72func (p *contentProvider) newlines() newlines { 73 if p._nl == nil { 74 var sz uint32 75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 76 p._nlBuf = p._nl 77 p.stats.ContentBytesLoaded += int64(sz) 78 } 79 return newlines{locs: p._nl, fileSize: p.fileSize} 80} 81 82func (p *contentProvider) data(fileName bool) []byte { 83 if fileName { 84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 85 } 86 87 if p._data == nil { 88 p._data, p.err = p.id.readContents(p.idx) 89 p.stats.FilesLoaded++ 90 p.stats.ContentBytesLoaded += int64(len(p._data)) 91 } 92 return p._data 93} 94 95// Find offset in bytes (relative to corpus start) for an offset in 96// runes (relative to document start). If filename is set, the corpus 97// is the set of filenames, with the document being the name itself. 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 99 if p.id.metaData.PlainASCII { 100 return r 101 } 102 103 sample := p.id.runeOffsets 104 runeEnds := p.id.fileEndRunes 105 fileStartByte := p.id.boundaries[p.idx] 106 if filename { 107 sample = p.id.fileNameRuneOffsets 108 runeEnds = p.id.fileNameEndRunes 109 fileStartByte = p.id.fileNameIndex[p.idx] 110 } 111 112 absR := r 113 if p.idx > 0 { 114 absR += runeEnds[p.idx-1] 115 } 116 117 byteOff, left := sample.lookup(absR) 118 119 var data []byte 120 121 if filename { 122 data = p.id.fileNameContent[byteOff:] 123 } else { 124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 125 if p.err != nil { 126 return 0 127 } 128 } 129 for left > 0 { 130 _, sz := utf8.DecodeRune(data) 131 byteOff += uint32(sz) 132 data = data[sz:] 133 left-- 134 } 135 136 byteOff -= fileStartByte 137 return byteOff 138} 139 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 var result []LineMatch 142 if ms[0].fileName { 143 score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 144 145 // There is only "line" in a filename. 146 res := LineMatch{ 147 Line: p.id.fileName(p.idx), 148 FileName: true, 149 150 Score: score, 151 DebugScore: debugScore, 152 } 153 154 for _, m := range ms { 155 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 156 LineOffset: int(m.byteOffset), 157 MatchLength: int(m.byteMatchSz), 158 Offset: m.byteOffset, 159 }) 160 161 result = []LineMatch{res} 162 } 163 } else { 164 ms = breakMatchesOnNewlines(ms, p.data(false)) 165 result = p.fillContentMatches(ms, numContextLines, language, debug) 166 } 167 168 return result 169} 170 171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 172// 173// Performance invariant: ms is sorted and non-overlapping. 174// 175// Note: the byte slices may be backed by mmapped data, so before being 176// returned by the API it needs to be copied. 177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 178 var result []ChunkMatch 179 if ms[0].fileName { 180 // If the first match is a filename match, there will only be 181 // one match and the matched content will be the filename. 182 183 score, debugScore, _ := p.candidateMatchScore(ms, language, debug) 184 185 fileName := p.id.fileName(p.idx) 186 ranges := make([]Range, 0, len(ms)) 187 for _, m := range ms { 188 ranges = append(ranges, Range{ 189 Start: Location{ 190 ByteOffset: m.byteOffset, 191 LineNumber: 1, 192 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 193 }, 194 End: Location{ 195 ByteOffset: m.byteOffset + m.byteMatchSz, 196 LineNumber: 1, 197 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 198 }, 199 }) 200 } 201 202 result = []ChunkMatch{{ 203 Content: fileName, 204 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 205 Ranges: ranges, 206 FileName: true, 207 208 Score: score, 209 DebugScore: debugScore, 210 }} 211 } else { 212 result = p.fillContentChunkMatches(ms, numContextLines, language, debug) 213 } 214 215 return result 216} 217 218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 219 var result []LineMatch 220 for len(ms) > 0 { 221 m := ms[0] 222 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 223 224 var lineCands []*candidateMatch 225 226 endMatch := m.byteOffset + m.byteMatchSz 227 228 for len(ms) > 0 { 229 m := ms[0] 230 if int(m.byteOffset) <= lineEnd { 231 endMatch = m.byteOffset + m.byteMatchSz 232 lineCands = append(lineCands, m) 233 ms = ms[1:] 234 } else { 235 break 236 } 237 } 238 239 if len(lineCands) == 0 { 240 log.Panicf( 241 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 242 p.id.fileName(p.idx), p.id.metaData, 243 num, lineStart, lineEnd, 244 m.byteOffset) 245 } 246 247 data := p.data(false) 248 249 // Due to merging matches, we may have a match that 250 // crosses a line boundary. Prevent confusion by 251 // taking lines until we pass the last match 252 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 253 next := bytes.IndexByte(data[lineEnd+1:], '\n') 254 if next == -1 { 255 lineEnd = len(data) 256 } else { 257 // TODO(hanwen): test that checks "+1" part here. 258 lineEnd += next + 1 259 } 260 } 261 262 finalMatch := LineMatch{ 263 LineStart: lineStart, 264 LineEnd: lineEnd, 265 LineNumber: num, 266 } 267 finalMatch.Line = data[lineStart:lineEnd] 268 269 if numContextLines > 0 { 270 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 272 } 273 274 score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug) 275 finalMatch.Score = score 276 finalMatch.DebugScore = debugScore 277 278 for i, m := range lineCands { 279 fragment := LineFragmentMatch{ 280 Offset: m.byteOffset, 281 LineOffset: int(m.byteOffset) - lineStart, 282 MatchLength: int(m.byteMatchSz), 283 } 284 if i < len(symbolInfo) && symbolInfo[i] != nil { 285 fragment.SymbolInfo = symbolInfo[i] 286 } 287 288 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 289 } 290 result = append(result, finalMatch) 291 } 292 return result 293} 294 295func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 296 newlines := p.newlines() 297 data := p.data(false) 298 299 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 300 // However, it depends on ms being sorted by byteOffset and non-overlapping. 301 // This invariant is true at the time of writing, but we conservatively 302 // enforce this. Note: chunkCandidates preserves the sorting so safe to 303 // transform now. 304 columnHelper := columnHelper{data: data} 305 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 306 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 307 sort.Sort((sortByOffsetSlice)(ms)) 308 } 309 310 chunks := chunkCandidates(ms, newlines, numContextLines) 311 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 312 for _, chunk := range chunks { 313 score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug) 314 315 ranges := make([]Range, 0, len(chunk.candidates)) 316 for _, cm := range chunk.candidates { 317 startOffset := cm.byteOffset 318 endOffset := cm.byteOffset + cm.byteMatchSz 319 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 320 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 321 322 ranges = append(ranges, Range{ 323 Start: Location{ 324 ByteOffset: startOffset, 325 LineNumber: uint32(startLine), 326 Column: columnHelper.get(startLineOffset, startOffset), 327 }, 328 End: Location{ 329 ByteOffset: endOffset, 330 LineNumber: uint32(endLine), 331 Column: columnHelper.get(endLineOffset, endOffset), 332 }, 333 }) 334 } 335 336 firstLineNumber := int(chunk.firstLine) - numContextLines 337 if firstLineNumber < 1 { 338 firstLineNumber = 1 339 } 340 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 341 342 chunkMatches = append(chunkMatches, ChunkMatch{ 343 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 344 ContentStart: Location{ 345 ByteOffset: firstLineStart, 346 LineNumber: uint32(firstLineNumber), 347 Column: 1, 348 }, 349 FileName: false, 350 Ranges: ranges, 351 SymbolInfo: symbolInfo, 352 Score: score, 353 DebugScore: debugScore, 354 }) 355 } 356 return chunkMatches 357} 358 359type candidateChunk struct { 360 candidates []*candidateMatch 361 firstLine uint32 // 1-based, inclusive 362 lastLine uint32 // 1-based, inclusive 363 minOffset uint32 // 0-based, inclusive 364 maxOffset uint32 // 0-based, exclusive 365} 366 367// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 368// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 369// it to overlap with an adjacent chunk. 370// 371// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 372// output invariants: if you flatten candidates the input invariant is retained. 373func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 374 var chunks []candidateChunk 375 for _, m := range ms { 376 startOffset := m.byteOffset 377 endOffset := m.byteOffset + m.byteMatchSz 378 firstLine, _, _ := newlines.atOffset(startOffset) 379 lastLine, _, _ := newlines.atOffset(endOffset) 380 381 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 382 // If a new chunk created with the current candidateMatch would 383 // overlap with the previous chunk, instead add the candidateMatch 384 // to the last chunk and extend end of the last chunk. 385 last := &chunks[len(chunks)-1] 386 last.candidates = append(last.candidates, m) 387 if last.maxOffset < endOffset { 388 last.lastLine = uint32(lastLine) 389 last.maxOffset = uint32(endOffset) 390 } 391 } else { 392 chunks = append(chunks, candidateChunk{ 393 firstLine: uint32(firstLine), 394 lastLine: uint32(lastLine), 395 minOffset: startOffset, 396 maxOffset: endOffset, 397 candidates: []*candidateMatch{m}, 398 }) 399 } 400 } 401 return chunks 402} 403 404// columnHelper is a helper struct which caches the number of runes last 405// counted. If we naively use utf8.RuneCount for each match on a line, this 406// leads to an O(nm) algorithm where m is the number of matches and n is the 407// length of the line. Aassuming we our candidates are increasing in offset 408// makes this operation O(n) instead. 409type columnHelper struct { 410 data []byte 411 412 // 0 values for all these are valid values 413 lastLineOffset int 414 lastOffset uint32 415 lastRuneCount uint32 416} 417 418// get returns the line column for offset. offset is the byte offset of the 419// rune in data. lineOffset is the byte offset inside of data for the line 420// containing offset. 421func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 422 var runeCount uint32 423 424 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 425 // Can count from last calculation 426 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 427 } else { 428 // Need to count from the beginning of line 429 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 430 } 431 432 c.lastLineOffset = lineOffset 433 c.lastOffset = offset 434 c.lastRuneCount = runeCount 435 436 return runeCount + 1 437} 438 439type newlines struct { 440 // locs is the sorted set of byte offsets of the newlines in the file 441 locs []uint32 442 443 // fileSize is just the number of bytes in the file. It is stored 444 // on this struct so we can safely know the length of the last line 445 // in the file since not all files end in a newline. 446 fileSize uint32 447} 448 449// atOffset returns the line containing the offset. If the offset lands on 450// the newline ending line M, we return M. The line is characterized 451// by its linenumber (base-1, byte index of line start, byte index of 452// line end). The line end is the index of a newline, or the filesize 453// (if matching the last line of the file.) 454func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 455 idx := sort.Search(len(nls.locs), func(n int) bool { 456 return nls.locs[n] >= offset 457 }) 458 459 start, end := nls.lineBounds(idx + 1) 460 return idx + 1, int(start), int(end) 461} 462 463// lineBounds returns the byte offsets of the start and end of the 1-based 464// lineNumber. The end offset is exclusive and will not contain the line-ending 465// newline. If the line number is out of range of the lines in the file, start 466// and end will be clamped to [0,fileSize]. 467func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 468 // nls.locs[0] + 1 is the start of the 2nd line of data. 469 startIdx := lineNumber - 2 470 endIdx := lineNumber - 1 471 472 if startIdx < 0 { 473 start = 0 474 } else if startIdx >= len(nls.locs) { 475 start = nls.fileSize 476 } else { 477 start = nls.locs[startIdx] + 1 478 } 479 480 if endIdx < 0 { 481 end = 0 482 } else if endIdx >= len(nls.locs) { 483 end = nls.fileSize 484 } else { 485 end = nls.locs[endIdx] 486 } 487 488 return start, end 489} 490 491// getLines returns a slice of data containing the lines [low, high). 492// low is 1-based and inclusive. high is 1-based and exclusive. 493func (nls newlines) getLines(data []byte, low, high int) []byte { 494 if low >= high { 495 return nil 496 } 497 498 lowStart, _ := nls.lineBounds(low) 499 _, highEnd := nls.lineBounds(high - 1) 500 501 // Drop any trailing newline. Editors do not treat a trailing newline as 502 // the start of a new line, so we should not either. lineBounds clamps to 503 // len(data) when an out-of-bounds line is requested. 504 // 505 // As an example, if we request lines 1-5 from a file with contents 506 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are 507 // the three "lines" in the file, separated by newlines. 508 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) { 509 highEnd = highEnd - 1 510 lowStart = min(lowStart, highEnd) 511 } 512 513 return data[lowStart:highEnd] 514} 515 516const ( 517 // Query-dependent scoring signals. All of these together are bounded at ~9000 518 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 519 scorePartialWordMatch = 50.0 520 scoreWordMatch = 500.0 521 scoreBase = 7000.0 522 scorePartialBase = 4000.0 523 scoreSymbol = 7000.0 524 scorePartialSymbol = 4000.0 525 scoreKindMatch = 100.0 526 scoreFactorAtomMatch = 400.0 527 528 // File-only scoring signals. For now these are also bounded ~9000 to give them 529 // equal weight with the query-dependent signals. 530 scoreFileRankFactor = 9000.0 531 scoreFileOrderFactor = 10.0 532 scoreRepoRankFactor = 20.0 533 534 // Used for ordering line and chunk matches within a file. 535 scoreLineOrderFactor = 1.0 536) 537 538// findSection checks whether a section defined by offset and size lies within 539// one of the sections in secs. 540func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 541 j := sort.Search(len(secs), func(i int) bool { 542 return secs[i].End >= off+sz 543 }) 544 545 if j == len(secs) { 546 return 0, false 547 } 548 549 if secs[j].Start <= off && off+sz <= secs[j].End { 550 return uint32(j), true 551 } 552 return 0, false 553} 554 555// findMaxOverlappingSection returns the index of the section in secs that 556// overlaps the most with the area defined by off and sz, relative to the size 557// of the section. If no section overlaps, it returns 0, false. If multiple 558// sections overlap the same amount, the first one is returned. 559// 560// The implementation assumes that sections do not overlap and are sorted by 561// DocumentSection.Start. 562func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 563 start := off 564 end := off + sz 565 566 // Find the first section that might overlap 567 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) 568 569 if j == len(secs) || secs[j].Start >= end { 570 // No overlap. 571 return 0, false 572 } 573 574 relOverlap := func(j int) float64 { 575 secSize := secs[j].End - secs[j].Start 576 if secSize == 0 { 577 return 0 578 } 579 // This cannot overflow because we make sure there is overlap before calling relOverlap 580 overlap := min(secs[j].End, end) - max(secs[j].Start, start) 581 return float64(overlap) / float64(secSize) 582 } 583 584 ol1 := relOverlap(j) 585 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { 586 return uint32(j), ol1 > 0 587 } 588 589 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check 590 // if the second section overlaps more than the first one, because a third 591 // section can only overlap if the overlap with the second section is complete. 592 ol2 := relOverlap(j + 1) 593 if ol2 > ol1 { 594 return uint32(j + 1), ol2 > 0 595 } 596 597 return uint32(j), ol1 > 0 598} 599 600func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 601 if cm.fileName { 602 return DocumentSection{}, nil, false 603 } 604 605 secs := p.docSections() 606 607 secIdx, ok := cm.symbolIdx, cm.symbol 608 if !ok { 609 // Not from a symbol matchTree. Let's see if it overlaps with a symbol. 610 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 611 } 612 if !ok { 613 return DocumentSection{}, nil, false 614 } 615 616 sec := secs[secIdx] 617 618 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 619 // since some callsites do not need it stored, and that incurs an extra 620 // copy. 621 // 622 // 2024-01-08 we are refactoring this and the code path indicates this can 623 // fail, so callers need to handle nil symbol. However, it would be 624 // surprising that we have a matching section but not symbol data. 625 start := p.id.fileEndSymbol[p.idx] 626 si := p.id.symbols.data(start + secIdx) 627 628 return sec, si, true 629} 630 631func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) { 632 type debugScore struct { 633 what string 634 score float64 635 } 636 637 score := &debugScore{} 638 maxScore := &debugScore{} 639 640 addScore := func(what string, s float64) { 641 if s != 0 && debug { 642 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 643 } 644 score.score += s 645 } 646 647 filename := p.data(true) 648 var symbolInfo []*Symbol 649 650 for i, m := range ms { 651 data := p.data(m.fileName) 652 653 endOffset := m.byteOffset + m.byteMatchSz 654 startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset])) 655 endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset])) 656 657 score.score = 0 658 score.what = "" 659 660 if startBoundary && endBoundary { 661 addScore("WordMatch", scoreWordMatch) 662 } else if startBoundary || endBoundary { 663 addScore("PartialWordMatch", scorePartialWordMatch) 664 } 665 666 if m.fileName { 667 sep := bytes.LastIndexByte(data, '/') 668 startMatch := int(m.byteOffset) == sep+1 669 endMatch := endOffset == uint32(len(data)) 670 if startMatch && endMatch { 671 addScore("Base", scoreBase) 672 } else if startMatch || endMatch { 673 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 674 } else if sep < int(m.byteOffset) { 675 addScore("InnerBase", scorePartialBase) 676 } 677 } else if sec, si, ok := p.findSymbol(m); ok { 678 startMatch := sec.Start == m.byteOffset 679 endMatch := sec.End == endOffset 680 if startMatch && endMatch { 681 addScore("Symbol", scoreSymbol) 682 } else if startMatch || endMatch { 683 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 684 } else { 685 addScore("OverlapSymbol", scorePartialSymbol) 686 } 687 688 // Score based on symbol data 689 if si != nil { 690 symbolKind := ctags.ParseSymbolKind(si.Kind) 691 sym := sectionSlice(data, sec) 692 693 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 694 695 // This is from a symbol tree, so we need to store the symbol 696 // information. 697 if m.symbol { 698 if symbolInfo == nil { 699 symbolInfo = make([]*Symbol, len(ms)) 700 } 701 // findSymbols does not hydrate in Sym. So we need to store it. 702 si.Sym = string(sym) 703 symbolInfo[i] = si 704 } 705 } 706 } 707 708 // scoreWeight != 1 means it affects score 709 if !epsilonEqualsOne(m.scoreWeight) { 710 score.score = score.score * m.scoreWeight 711 if debug { 712 score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 713 } 714 } 715 716 if score.score > maxScore.score { 717 maxScore.score = score.score 718 maxScore.what = score.what 719 } 720 } 721 722 if debug { 723 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 724 } 725 726 return maxScore.score, maxScore.what, symbolInfo 727} 728 729// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 730// End such that it won't be out of range. 731func sectionSlice(data []byte, sec DocumentSection) []byte { 732 l := uint32(len(data)) 733 if sec.Start >= l { 734 return nil 735 } 736 if sec.End > l { 737 sec.End = l 738 } 739 return data[sec.Start:sec.End] 740} 741 742// scoreSymbolKind boosts a match based on the combination of language, symbol 743// and kind. The language string comes from go-enry, the symbol and kind from 744// ctags. 745func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 746 var factor float64 747 748 // Generic ranking which will be overriden by language specific ranking 749 switch kind { 750 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 751 factor = 8 752 case ctags.Class: 753 factor = 10 754 case ctags.Struct: 755 factor = 9.5 756 case ctags.Enum: 757 factor = 9 758 case ctags.Interface: 759 factor = 8 760 case ctags.Function, ctags.Method: 761 factor = 7 762 case ctags.Field: 763 factor = 5.5 764 case ctags.Constant: 765 factor = 5 766 case ctags.Variable: 767 factor = 4 768 default: 769 // For all other kinds, assign a low score by default. 770 factor = 1 771 } 772 773 switch language { 774 case "Java", "java": 775 switch kind { 776 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 777 // to "classes" instead of "c". We have to cover both cases to support existing 778 // indexes. 779 case ctags.Class: 780 factor = 10 781 case ctags.Enum: 782 factor = 9 783 case ctags.Interface: 784 factor = 8 785 case ctags.Method: 786 factor = 7 787 case ctags.Field: 788 factor = 6 789 case ctags.EnumConstant: 790 factor = 5 791 } 792 case "Kotlin", "kotlin": 793 switch kind { 794 case ctags.Class: 795 factor = 10 796 case ctags.Interface: 797 factor = 9 798 case ctags.Method: 799 factor = 8 800 case ctags.TypeAlias: 801 factor = 7 802 case ctags.Constant: 803 factor = 6 804 case ctags.Variable: 805 factor = 5 806 } 807 case "Go", "go": 808 switch kind { 809 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 810 // for each case a description of the fields in ctags in the comment 811 case ctags.Type: // interface struct talias 812 factor = 9 813 case ctags.Interface: // interfaces 814 factor = 10 815 case ctags.Struct: // structs 816 factor = 9 817 case ctags.TypeAlias: // type aliases 818 factor = 9 819 case ctags.MethodSpec: // interface method specification 820 factor = 8.5 821 case ctags.Method, ctags.Function: // functions 822 factor = 8 823 case ctags.Field: // struct fields 824 factor = 7 825 case ctags.Constant: // constants 826 factor = 6 827 case ctags.Variable: // variables 828 factor = 5 829 } 830 831 // Boost exported go symbols. Same implementation as token.IsExported 832 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 833 factor += 0.5 834 } 835 836 if bytes.HasSuffix(filename, []byte("_test.go")) { 837 factor *= 0.8 838 } 839 840 // Could also rank on: 841 // 842 // - anonMember struct anonymous members 843 // - packageName name for specifying imported package 844 // - receiver receivers 845 // - package packages 846 // - type types 847 // - unknown unknown 848 case "C++", "c++": 849 switch kind { 850 case ctags.Class: // classes 851 factor = 10 852 case ctags.Enum: // enumeration names 853 factor = 9 854 case ctags.Function: // function definitions 855 factor = 8 856 case ctags.Struct: // structure names 857 factor = 7 858 case ctags.Union: // union names 859 factor = 6 860 case ctags.TypeAlias: // typedefs 861 factor = 5 862 case ctags.Field: // class, struct, and union members 863 factor = 4 864 case ctags.Variable: // varialbe definitions 865 factor = 3 866 } 867 // Could also rank on: 868 // NAME DESCRIPTION 869 // macro macro definitions 870 // enumerator enumerators (values inside an enumeration) 871 // header included header files 872 // namespace namespaces 873 // variable variable definitions 874 case "Scala", "scala": 875 switch kind { 876 case ctags.Class: 877 factor = 10 878 case ctags.Interface: 879 factor = 9 880 case ctags.Object: 881 factor = 8 882 case ctags.Function: 883 factor = 7 884 case ctags.Type: 885 factor = 6 886 case ctags.Variable: 887 factor = 5 888 case ctags.Package: 889 factor = 4 890 } 891 case "Python", "python": 892 switch kind { 893 case ctags.Class: // classes 894 factor = 10 895 case ctags.Function, ctags.Method: // function definitions 896 factor = 8 897 case ctags.Field: // class, struct, and union members 898 factor = 4 899 case ctags.Variable: // variable definitions 900 factor = 3 901 case ctags.Local: // local variables 902 factor = 2 903 } 904 // Could also rank on: 905 // 906 // - namespace name referring a module defined in other file 907 // - module modules 908 // - unknown name referring a class/variable/function/module defined in other module 909 // - parameter function parameters 910 case "Ruby", "ruby": 911 switch kind { 912 case ctags.Class: 913 factor = 10 914 case ctags.Method: 915 factor = 9 916 case ctags.MethodAlias: 917 factor = 8 918 case ctags.Module: 919 factor = 7 920 case ctags.SingletonMethod: 921 factor = 6 922 case ctags.Constant: 923 factor = 5 924 case ctags.Accessor: 925 factor = 4 926 case ctags.Library: 927 factor = 3 928 } 929 case "PHP", "php": 930 switch kind { 931 case ctags.Class: 932 factor = 10 933 case ctags.Interface: 934 factor = 9 935 case ctags.Function: 936 factor = 8 937 case ctags.Trait: 938 factor = 7 939 case ctags.Define: 940 factor = 6 941 case ctags.Namespace: 942 factor = 5 943 case ctags.MethodAlias: 944 factor = 4 945 case ctags.Variable: 946 factor = 3 947 case ctags.Local: 948 factor = 3 949 } 950 case "GraphQL", "graphql": 951 switch kind { 952 case ctags.Type: 953 factor = 10 954 } 955 case "Markdown", "markdown": 956 // Headers are good signal in docs, but do not rank as highly as code. 957 switch kind { 958 case ctags.Chapter: // # 959 factor = 4 960 case ctags.Section: // ## 961 factor = 3 962 case ctags.Subsection: // ### 963 factor = 2 964 } 965 } 966 967 return factor * scoreKindMatch 968} 969 970type matchScoreSlice []LineMatch 971 972func (m matchScoreSlice) Len() int { return len(m) } 973func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 974func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 975 976type chunkMatchScoreSlice []ChunkMatch 977 978func (m chunkMatchScoreSlice) Len() int { return len(m) } 979func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 980func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 981 982type fileMatchesByScore []FileMatch 983 984func (m fileMatchesByScore) Len() int { return len(m) } 985func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 986func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 987 988func sortMatchesByScore(ms []LineMatch) { 989 sort.Sort(matchScoreSlice(ms)) 990} 991 992func sortChunkMatchesByScore(ms []ChunkMatch) { 993 sort.Sort(chunkMatchScoreSlice(ms)) 994} 995 996// SortFiles sorts files matches in the order we want to present results to 997// users. The order depends on the match score, which includes both 998// query-dependent signals like word overlap, and file-only signals like the 999// file ranks (if file ranks are enabled). 1000// 1001// We don't only use the scores, we will also boost some results to present 1002// files with novel extensions. 1003func SortFiles(ms []FileMatch) { 1004 sort.Sort(fileMatchesByScore(ms)) 1005 1006 // Boost a file extension not in the top 3 to the third filematch. 1007 boostNovelExtension(ms, 2, 0.9) 1008} 1009 1010func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 1011 if len(ms) <= boostOffset+1 { 1012 return 1013 } 1014 1015 top := ms[:boostOffset] 1016 candidates := ms[boostOffset:] 1017 1018 // Don't bother boosting something which is significantly different to the 1019 // result it replaces. 1020 minScoreForNovelty := candidates[0].Score * minScoreRatio 1021 1022 // We want to look for an ext that isn't in the top exts 1023 exts := make([]string, len(top)) 1024 for i := range top { 1025 exts[i] = path.Ext(top[i].FileName) 1026 } 1027 1028 for i := range candidates { 1029 // Do not assume sorted due to boostNovelExtension being called on subsets 1030 if candidates[i].Score < minScoreForNovelty { 1031 continue 1032 } 1033 1034 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 1035 continue 1036 } 1037 1038 // Found what we are looking for, now boost to front of candidates (which 1039 // is ms[boostOffset]) 1040 for ; i > 0; i-- { 1041 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 1042 } 1043 return 1044 } 1045}