fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "log" 20 "path" 21 "slices" 22 "sort" 23 "unicode" 24 "unicode/utf8" 25 26 "github.com/sourcegraph/zoekt" 27 "github.com/sourcegraph/zoekt/internal/ctags" 28) 29 30var _ = log.Println 31 32// contentProvider is an abstraction to treat matches for names and 33// content with the same code. 34type contentProvider struct { 35 id *indexData 36 stats *zoekt.Stats 37 38 // mutable 39 err error 40 idx uint32 41 _data []byte 42 _nl []uint32 43 _nlBuf []uint32 44 _sects []DocumentSection 45 _sectBuf []DocumentSection 46 fileSize uint32 47} 48 49// setDocument skips to the given document. 50func (p *contentProvider) setDocument(docID uint32) { 51 fileStart := p.id.boundaries[docID] 52 53 p.idx = docID 54 p.fileSize = p.id.boundaries[docID+1] - fileStart 55 56 p._nl = nil 57 p._sects = nil 58 p._data = nil 59} 60 61func (p *contentProvider) docSections() []DocumentSection { 62 if p._sects == nil { 63 var sz uint32 64 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 65 p.stats.ContentBytesLoaded += int64(sz) 66 p._sectBuf = p._sects 67 } 68 return p._sects 69} 70 71func (p *contentProvider) newlines() newlines { 72 if p._nl == nil { 73 var sz uint32 74 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 75 p._nlBuf = p._nl 76 p.stats.ContentBytesLoaded += int64(sz) 77 } 78 return newlines{locs: p._nl, fileSize: p.fileSize} 79} 80 81func (p *contentProvider) data(fileName bool) []byte { 82 if fileName { 83 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 84 } 85 86 if p._data == nil { 87 p._data, p.err = p.id.readContents(p.idx) 88 p.stats.FilesLoaded++ 89 p.stats.ContentBytesLoaded += int64(len(p._data)) 90 } 91 return p._data 92} 93 94// Find offset in bytes (relative to corpus start) for an offset in 95// runes (relative to document start). If filename is set, the corpus 96// is the set of filenames, with the document being the name itself. 97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 98 if p.id.metaData.PlainASCII { 99 return r 100 } 101 102 sample := p.id.runeOffsets 103 runeEnds := p.id.fileEndRunes 104 fileStartByte := p.id.boundaries[p.idx] 105 if filename { 106 sample = p.id.fileNameRuneOffsets 107 runeEnds = p.id.fileNameEndRunes 108 fileStartByte = p.id.fileNameIndex[p.idx] 109 } 110 111 absR := r 112 if p.idx > 0 { 113 absR += runeEnds[p.idx-1] 114 } 115 116 byteOff, left := sample.lookup(absR) 117 118 var data []byte 119 120 if filename { 121 data = p.id.fileNameContent[byteOff:] 122 } else { 123 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 124 if p.err != nil { 125 return 0 126 } 127 } 128 for left > 0 { 129 _, sz := utf8.DecodeRune(data) 130 byteOff += uint32(sz) 131 data = data[sz:] 132 left-- 133 } 134 135 byteOff -= fileStartByte 136 return byteOff 137} 138 139// fillMatches converts the internal candidateMatch slice into our API's LineMatch. 140// It only ever returns content XOR filename matches, not both. If there are any 141// content matches, these are always returned, and we omit filename matches. 142// 143// Performance invariant: ms is sorted and non-overlapping. 144// 145// Note: the byte slices may be backed by mmapped data, so before being 146// returned by the API it needs to be copied. 147func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch { 148 var filenameMatches []*candidateMatch 149 contentMatches := make([]*candidateMatch, 0, len(ms)) 150 151 for _, m := range ms { 152 if m.fileName { 153 filenameMatches = append(filenameMatches, m) 154 } else { 155 contentMatches = append(contentMatches, m) 156 } 157 } 158 159 // If there are any content matches, we only return these and skip filename matches. 160 if len(contentMatches) > 0 { 161 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 162 return p.fillContentMatches(contentMatches, numContextLines, language, opts) 163 } 164 165 // Otherwise, we return a single line containing the filematch index. 166 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 167 res := zoekt.LineMatch{ 168 Line: p.id.fileName(p.idx), 169 FileName: true, 170 Score: lineScore.score, 171 DebugScore: lineScore.debugScore, 172 } 173 174 for _, m := range ms { 175 res.LineFragments = append(res.LineFragments, zoekt.LineFragmentMatch{ 176 LineOffset: int(m.byteOffset), 177 MatchLength: int(m.byteMatchSz), 178 Offset: m.byteOffset, 179 }) 180 } 181 182 return []zoekt.LineMatch{res} 183 184} 185 186// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 187// It only ever returns content XOR filename matches, not both. If there are any content 188// matches, these are always returned, and we omit filename matches. 189// 190// Performance invariant: ms is sorted and non-overlapping. 191// 192// Note: the byte slices may be backed by mmapped data, so before being 193// returned by the API it needs to be copied. 194func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch { 195 var filenameMatches []*candidateMatch 196 contentMatches := make([]*candidateMatch, 0, len(ms)) 197 198 for _, m := range ms { 199 if m.fileName { 200 filenameMatches = append(filenameMatches, m) 201 } else { 202 contentMatches = append(contentMatches, m) 203 } 204 } 205 206 // If there are any content matches, we only return these and skip filename matches. 207 if len(contentMatches) > 0 { 208 return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts) 209 } 210 211 // Otherwise, we return a single chunk representing the filename index. 212 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 213 fileName := p.id.fileName(p.idx) 214 ranges := make([]zoekt.Range, 0, len(ms)) 215 for _, m := range ms { 216 ranges = append(ranges, zoekt.Range{ 217 Start: zoekt.Location{ 218 ByteOffset: m.byteOffset, 219 LineNumber: 1, 220 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 221 }, 222 End: zoekt.Location{ 223 ByteOffset: m.byteOffset + m.byteMatchSz, 224 LineNumber: 1, 225 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 226 }, 227 }) 228 } 229 230 return []zoekt.ChunkMatch{{ 231 Content: fileName, 232 ContentStart: zoekt.Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 233 Ranges: ranges, 234 FileName: true, 235 Score: lineScore.score, 236 DebugScore: lineScore.debugScore, 237 }} 238} 239 240func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.LineMatch { 241 var result []zoekt.LineMatch 242 for len(ms) > 0 { 243 m := ms[0] 244 num := p.newlines().atOffset(m.byteOffset) 245 lineStart := int(p.newlines().lineStart(num)) 246 nextLineStart := int(p.newlines().lineStart(num + 1)) 247 248 var lineCands []*candidateMatch 249 250 endMatch := m.byteOffset + m.byteMatchSz 251 252 for len(ms) > 0 { 253 m := ms[0] 254 if int(m.byteOffset) < nextLineStart { 255 endMatch = m.byteOffset + m.byteMatchSz 256 lineCands = append(lineCands, m) 257 ms = ms[1:] 258 } else { 259 break 260 } 261 } 262 263 if len(lineCands) == 0 { 264 log.Panicf( 265 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 266 p.id.fileName(p.idx), p.id.metaData, 267 num, lineStart, nextLineStart, 268 m.byteOffset) 269 } 270 271 data := p.data(false) 272 273 // Due to merging matches, we may have a match that 274 // crosses a line boundary. Prevent confusion by 275 // taking lines until we pass the last index 276 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) { 277 next := bytes.IndexByte(data[nextLineStart:], '\n') 278 if next == -1 { 279 nextLineStart = len(data) 280 } else { 281 // TODO(hanwen): test that checks "+1" part here. 282 nextLineStart += next + 1 283 } 284 } 285 286 finalMatch := zoekt.LineMatch{ 287 LineStart: lineStart, 288 LineEnd: nextLineStart, 289 LineNumber: num, 290 } 291 finalMatch.Line = data[lineStart:nextLineStart] 292 293 if numContextLines > 0 { 294 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 295 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 296 } 297 298 lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts) 299 finalMatch.Score = lineScore.score 300 finalMatch.DebugScore = lineScore.debugScore 301 302 for i, m := range lineCands { 303 fragment := zoekt.LineFragmentMatch{ 304 Offset: m.byteOffset, 305 LineOffset: int(m.byteOffset) - lineStart, 306 MatchLength: int(m.byteMatchSz), 307 } 308 309 if i < len(symbolInfo) && symbolInfo[i] != nil { 310 fragment.SymbolInfo = symbolInfo[i] 311 } 312 313 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 314 } 315 result = append(result, finalMatch) 316 } 317 return result 318} 319 320func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *zoekt.SearchOptions) []zoekt.ChunkMatch { 321 data := p.data(false) 322 323 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 324 // However, it depends on ms being sorted by byteOffset and non-overlapping. 325 // This invariant is true at the time of writing, but we conservatively 326 // enforce this. Note: chunkCandidates preserves the sorting so safe to 327 // transform now. 328 columnHelper := columnHelper{data: data} 329 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 330 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 331 sort.Sort((sortByOffsetSlice)(ms)) 332 } 333 334 newlines := p.newlines() 335 chunks := chunkCandidates(ms, newlines, numContextLines) 336 chunkMatches := make([]zoekt.ChunkMatch, 0, len(chunks)) 337 for _, chunk := range chunks { 338 ranges := make([]zoekt.Range, 0, len(chunk.candidates)) 339 for _, cm := range chunk.candidates { 340 startOffset := cm.byteOffset 341 endOffset := cm.byteOffset + cm.byteMatchSz 342 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 343 344 ranges = append(ranges, zoekt.Range{ 345 Start: zoekt.Location{ 346 ByteOffset: startOffset, 347 LineNumber: uint32(startLine), 348 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset), 349 }, 350 End: zoekt.Location{ 351 ByteOffset: endOffset, 352 LineNumber: uint32(endLine), 353 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset), 354 }, 355 }) 356 } 357 358 firstLineNumber := int(chunk.firstLine) - numContextLines 359 if firstLineNumber < 1 { 360 firstLineNumber = 1 361 } 362 firstLineStart := newlines.lineStart(firstLineNumber) 363 364 chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts) 365 chunkMatches = append(chunkMatches, zoekt.ChunkMatch{ 366 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 367 ContentStart: zoekt.Location{ 368 ByteOffset: firstLineStart, 369 LineNumber: uint32(firstLineNumber), 370 Column: 1, 371 }, 372 FileName: false, 373 Ranges: ranges, 374 SymbolInfo: symbolInfo, 375 BestLineMatch: uint32(chunkScore.bestLine), 376 Score: chunkScore.score, 377 DebugScore: chunkScore.debugScore, 378 }) 379 } 380 return chunkMatches 381} 382 383type candidateChunk struct { 384 candidates []*candidateMatch 385 firstLine uint32 // 1-based, inclusive 386 lastLine uint32 // 1-based, inclusive 387 minOffset uint32 // 0-based, inclusive 388 maxOffset uint32 // 0-based, exclusive 389} 390 391// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 392// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 393// it to overlap with an adjacent chunk. 394// 395// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 396// output invariants: if you flatten candidates the input invariant is retained. 397func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 398 var chunks []candidateChunk 399 400 for _, m := range ms { 401 startOffset := m.byteOffset 402 endOffset := m.byteOffset + m.byteMatchSz 403 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 404 405 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 406 // If a new chunk created with the current candidateMatch would 407 // overlap with the previous chunk, instead add the candidateMatch 408 // to the last chunk and extend end of the last chunk. 409 last := &chunks[len(chunks)-1] 410 last.candidates = append(last.candidates, m) 411 if last.maxOffset < endOffset { 412 last.lastLine = uint32(lastLine) 413 last.maxOffset = uint32(endOffset) 414 } 415 } else { 416 chunks = append(chunks, candidateChunk{ 417 firstLine: uint32(firstLine), 418 lastLine: uint32(lastLine), 419 minOffset: startOffset, 420 maxOffset: endOffset, 421 candidates: []*candidateMatch{m}, 422 }) 423 } 424 } 425 return chunks 426} 427 428// columnHelper is a helper struct which caches the number of runes last 429// counted. If we naively use utf8.RuneCount for each match on a line, this 430// leads to an O(nm) algorithm where m is the number of matches and n is the 431// length of the line. Aassuming we our candidates are increasing in offset 432// makes this operation O(n) instead. 433type columnHelper struct { 434 data []byte 435 436 // 0 values for all these are valid values 437 lastLineOffset int 438 lastOffset uint32 439 lastRuneCount uint32 440} 441 442// get returns the line column for offset. offset is the byte offset of the 443// rune in data. lineOffset is the byte offset inside of data for the line 444// containing offset. 445func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 446 var runeCount uint32 447 448 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 449 // Can count from last calculation 450 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 451 } else { 452 // Need to count from the beginning of line 453 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 454 } 455 456 c.lastLineOffset = lineOffset 457 c.lastOffset = offset 458 c.lastRuneCount = runeCount 459 460 return runeCount + 1 461} 462 463type newlines struct { 464 // locs is the sorted set of byte offsets of the newlines in the file 465 locs []uint32 466 467 // fileSize is just the number of bytes in the file. It is stored 468 // on this struct so we can safely know the length of the last line 469 // in the file since not all files end in a newline. 470 fileSize uint32 471} 472 473// atOffset returns the line containing the offset. If the offset lands on 474// the newline ending line M, we return M. 475func (nls newlines) atOffset(offset uint32) (lineNumber int) { 476 idx := sort.Search(len(nls.locs), func(n int) bool { 477 return nls.locs[n] >= offset 478 }) 479 return idx + 1 480} 481 482// lineStart returns the byte offset of the beginning of the given line. 483// lineNumber is 1-based. If lineNumber is out of range of the lines in the 484// file, the return value will be clamped to [0,fileSize]. 485func (nls newlines) lineStart(lineNumber int) uint32 { 486 // nls.locs[0] + 1 is the start of the 2nd line of data. 487 startIdx := lineNumber - 2 488 489 if startIdx < 0 { 490 return 0 491 } else if startIdx >= len(nls.locs) { 492 return nls.fileSize 493 } else { 494 return nls.locs[startIdx] + 1 495 } 496} 497 498// offsetRangeToLineRange returns range of lines that fully contains the given byte range. 499// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset). 500// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine]. 501func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) { 502 startLine = nls.atOffset(startOffset) 503 endLine = nls.atOffset( 504 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow 505 ) 506 return startLine, endLine 507} 508 509// getLines returns a slice of data containing the lines [low, high). 510// low is 1-based and inclusive. high is 1-based and exclusive. 511func (nls newlines) getLines(data []byte, low, high int) []byte { 512 if low >= high { 513 return nil 514 } 515 516 return data[nls.lineStart(low):nls.lineStart(high)] 517} 518 519const ( 520 // Query-dependent scoring signals. All of these together are bounded at ~9000 521 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 522 scorePartialWordMatch = 50.0 523 scoreWordMatch = 500.0 524 scoreBase = 7000.0 525 scorePartialBase = 4000.0 526 scoreSymbol = 7000.0 527 scorePartialSymbol = 4000.0 528 scoreKindMatch = 100.0 529 scoreFactorAtomMatch = 400.0 530 531 // Used for ordering line and chunk matches within a file. 532 scoreLineOrderFactor = 1.0 533 534 // Used for tiebreakers. The scores are not combined with the main score, but 535 // are used to break ties between matches with the same score. The factors are 536 // chosen to separate the tiebreakers from the main score and from each other. 537 // If you make changes here, make sure to update indexData.scoreFile too. 538 scoreRepoRankFactor = 100.0 539 scoreFileOrderFactor = 10.0 540) 541 542// findMaxOverlappingSection returns the index of the section in secs that 543// overlaps the most with the area defined by off and sz, relative to the size 544// of the section. If no section overlaps, it returns 0, false. If multiple 545// sections overlap the same amount, the first one is returned. 546// 547// The implementation assumes that sections do not overlap and are sorted by 548// DocumentSection.Start. 549func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 550 start := off 551 end := off + sz 552 553 // Find the first section that might overlap 554 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) 555 556 if j == len(secs) || secs[j].Start >= end { 557 // No overlap. 558 return 0, false 559 } 560 561 relOverlap := func(j int) float64 { 562 secSize := secs[j].End - secs[j].Start 563 if secSize == 0 { 564 return 0 565 } 566 // This cannot overflow because we make sure there is overlap before calling relOverlap 567 overlap := min(secs[j].End, end) - max(secs[j].Start, start) 568 return float64(overlap) / float64(secSize) 569 } 570 571 ol1 := relOverlap(j) 572 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { 573 return uint32(j), ol1 > 0 574 } 575 576 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check 577 // if the second section overlaps more than the first one, because a third 578 // section can only overlap if the overlap with the second section is complete. 579 ol2 := relOverlap(j + 1) 580 if ol2 > ol1 { 581 return uint32(j + 1), ol2 > 0 582 } 583 584 return uint32(j), ol1 > 0 585} 586 587func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool { 588 if cm.fileName { 589 return false 590 } 591 592 // Check if this candidate came from a symbol matchTree 593 if cm.symbol { 594 return true 595 } 596 597 // Check if it overlaps with a symbol. 598 secs := p.docSections() 599 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 600 return ok 601} 602 603func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *zoekt.Symbol, bool) { 604 if cm.fileName { 605 return DocumentSection{}, nil, false 606 } 607 608 secs := p.docSections() 609 610 secIdx, ok := cm.symbolIdx, cm.symbol 611 if !ok { 612 // Not from a symbol matchTree. Let's see if it overlaps with a symbol. 613 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 614 } 615 if !ok { 616 return DocumentSection{}, nil, false 617 } 618 619 sec := secs[secIdx] 620 621 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 622 // since some callsites do not need it stored, and that incurs an extra 623 // copy. 624 // 625 // 2024-01-08 we are refactoring this and the code path indicates this can 626 // fail, so callers need to handle nil symbol. However, it would be 627 // surprising that we have a matching section but not symbol data. 628 start := p.id.fileEndSymbol[p.idx] 629 si := p.id.symbols.data(start + secIdx) 630 631 return sec, si, true 632} 633 634// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 635// End such that it won't be out of range. 636func sectionSlice(data []byte, sec DocumentSection) []byte { 637 l := uint32(len(data)) 638 if sec.Start >= l { 639 return nil 640 } 641 if sec.End > l { 642 sec.End = l 643 } 644 return data[sec.Start:sec.End] 645} 646 647// scoreSymbolKind boosts a match based on the combination of language, symbol 648// and kind. The language string comes from go-enry, the symbol and kind from 649// ctags. 650func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 651 var factor float64 652 653 // Generic ranking which will be overriden by language specific ranking 654 switch kind { 655 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 656 factor = 8 657 case ctags.Class: 658 factor = 10 659 case ctags.Struct: 660 factor = 9.5 661 case ctags.Enum: 662 factor = 9 663 case ctags.Interface: 664 factor = 8 665 case ctags.Function, ctags.Method: 666 factor = 7 667 case ctags.Field: 668 factor = 5.5 669 case ctags.Constant: 670 factor = 5 671 case ctags.Variable: 672 factor = 4 673 default: 674 // For all other kinds, assign a low score by default. 675 factor = 1 676 } 677 678 switch language { 679 case "Java", "java": 680 switch kind { 681 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 682 // to "classes" instead of "c". We have to cover both cases to support existing 683 // indexes. 684 case ctags.Class: 685 factor = 10 686 case ctags.Enum: 687 factor = 9 688 case ctags.Interface: 689 factor = 8 690 case ctags.Method: 691 factor = 7 692 case ctags.Field: 693 factor = 6 694 case ctags.EnumConstant: 695 factor = 5 696 } 697 case "Kotlin", "kotlin": 698 switch kind { 699 case ctags.Class: 700 factor = 10 701 case ctags.Interface: 702 factor = 9 703 case ctags.Method: 704 factor = 8 705 case ctags.TypeAlias: 706 factor = 7 707 case ctags.Constant: 708 factor = 6 709 case ctags.Variable: 710 factor = 5 711 } 712 case "Go", "go": 713 switch kind { 714 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 715 // for each case a description of the fields in ctags in the comment 716 case ctags.Type: // interface struct talias 717 factor = 9 718 case ctags.Interface: // interfaces 719 factor = 10 720 case ctags.Struct: // structs 721 factor = 9 722 case ctags.TypeAlias: // type aliases 723 factor = 9 724 case ctags.MethodSpec: // interface method specification 725 factor = 8.5 726 case ctags.Method, ctags.Function: // functions 727 factor = 8 728 case ctags.Field: // struct fields 729 factor = 7 730 case ctags.Constant: // constants 731 factor = 6 732 case ctags.Variable: // variables 733 factor = 5 734 } 735 736 // Boost exported go symbols. Same implementation as token.IsExported 737 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 738 factor += 0.5 739 } 740 741 if bytes.HasSuffix(filename, []byte("_test.go")) { 742 factor *= 0.8 743 } 744 745 // Could also rank on: 746 // 747 // - anonMember struct anonymous members 748 // - packageName name for specifying imported package 749 // - receiver receivers 750 // - package packages 751 // - type types 752 // - unknown unknown 753 case "C++", "c++": 754 switch kind { 755 case ctags.Class: // classes 756 factor = 10 757 case ctags.Enum: // enumeration names 758 factor = 9 759 case ctags.Function: // function definitions 760 factor = 8 761 case ctags.Struct: // structure names 762 factor = 7 763 case ctags.Union: // union names 764 factor = 6 765 case ctags.TypeAlias: // typedefs 766 factor = 5 767 case ctags.Field: // class, struct, and union members 768 factor = 4 769 case ctags.Variable: // varialbe definitions 770 factor = 3 771 } 772 // Could also rank on: 773 // NAME DESCRIPTION 774 // macro macro definitions 775 // enumerator enumerators (values inside an enumeration) 776 // header included header files 777 // namespace namespaces 778 // variable variable definitions 779 case "Scala", "scala": 780 switch kind { 781 case ctags.Class: 782 factor = 10 783 case ctags.Interface: 784 factor = 9 785 case ctags.Object: 786 factor = 8 787 case ctags.Function: 788 factor = 7 789 case ctags.Type: 790 factor = 6 791 case ctags.Variable: 792 factor = 5 793 case ctags.Package: 794 factor = 4 795 } 796 case "Python", "python": 797 switch kind { 798 case ctags.Class: // classes 799 factor = 10 800 case ctags.Function, ctags.Method: // function definitions 801 factor = 8 802 case ctags.Field: // class, struct, and union members 803 factor = 4 804 case ctags.Variable: // variable definitions 805 factor = 3 806 case ctags.Local: // local variables 807 factor = 2 808 } 809 // Could also rank on: 810 // 811 // - namespace name referring a module defined in other file 812 // - module modules 813 // - unknown name referring a class/variable/function/module defined in other module 814 // - parameter function parameters 815 case "Ruby", "ruby": 816 switch kind { 817 case ctags.Class: 818 factor = 10 819 case ctags.Method: 820 factor = 9 821 case ctags.MethodAlias: 822 factor = 8 823 case ctags.Module: 824 factor = 7 825 case ctags.SingletonMethod: 826 factor = 6 827 case ctags.Constant: 828 factor = 5 829 case ctags.Accessor: 830 factor = 4 831 case ctags.Library: 832 factor = 3 833 } 834 case "PHP", "php": 835 switch kind { 836 case ctags.Class: 837 factor = 10 838 case ctags.Interface: 839 factor = 9 840 case ctags.Function: 841 factor = 8 842 case ctags.Trait: 843 factor = 7 844 case ctags.Define: 845 factor = 6 846 case ctags.Namespace: 847 factor = 5 848 case ctags.MethodAlias: 849 factor = 4 850 case ctags.Variable: 851 factor = 3 852 case ctags.Local: 853 factor = 3 854 } 855 case "GraphQL", "graphql": 856 switch kind { 857 case ctags.Type: 858 factor = 10 859 } 860 case "Markdown", "markdown": 861 // Headers are good signal in docs, but do not rank as highly as code. 862 switch kind { 863 case ctags.Chapter: // # 864 factor = 4 865 case ctags.Section: // ## 866 factor = 3 867 case ctags.Subsection: // ### 868 factor = 2 869 } 870 } 871 872 return factor * scoreKindMatch 873} 874 875type matchScoreSlice []zoekt.LineMatch 876 877func (m matchScoreSlice) Len() int { return len(m) } 878func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 879func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 880 881type chunkMatchScoreSlice []zoekt.ChunkMatch 882 883func (m chunkMatchScoreSlice) Len() int { return len(m) } 884func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 885func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 886 887type fileMatchesByScore []zoekt.FileMatch 888 889func (m fileMatchesByScore) Len() int { return len(m) } 890func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 891func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 892 893func sortMatchesByScore(ms []zoekt.LineMatch) { 894 sort.Sort(matchScoreSlice(ms)) 895} 896 897func sortChunkMatchesByScore(ms []zoekt.ChunkMatch) { 898 sort.Sort(chunkMatchScoreSlice(ms)) 899} 900 901// SortFiles sorts files matches in the order we want to present results to 902// users. The order depends on the match score, which includes both 903// query-dependent signals like word overlap, and file-only signals like the 904// file ranks (if file ranks are enabled). 905// 906// We don't only use the scores, we will also boost some results to present 907// files with novel extensions. 908func SortFiles(ms []zoekt.FileMatch) { 909 sort.Sort(fileMatchesByScore(ms)) 910 911 // Boost a file extension not in the top 3 to the third filematch. 912 boostNovelExtension(ms, 2, 0.9) 913} 914 915func boostNovelExtension(ms []zoekt.FileMatch, boostOffset int, minScoreRatio float64) { 916 if len(ms) <= boostOffset+1 { 917 return 918 } 919 920 top := ms[:boostOffset] 921 candidates := ms[boostOffset:] 922 923 // Don't bother boosting something which is significantly different to the 924 // result it replaces. 925 minScoreForNovelty := candidates[0].Score * minScoreRatio 926 927 // We want to look for an ext that isn't in the top exts 928 exts := make([]string, len(top)) 929 for i := range top { 930 exts[i] = path.Ext(top[i].FileName) 931 } 932 933 for i := range candidates { 934 // Do not assume sorted due to boostNovelExtension being called on subsets 935 if candidates[i].Score < minScoreForNovelty { 936 continue 937 } 938 939 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 940 continue 941 } 942 943 // Found what we are looking for, now boost to front of candidates (which 944 // is ms[boostOffset]) 945 for ; i > 0; i-- { 946 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 947 } 948 return 949 } 950}