fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "log" 20 "path" 21 "slices" 22 "sort" 23 "unicode" 24 "unicode/utf8" 25 26 "github.com/sourcegraph/zoekt/ctags" 27) 28 29var _ = log.Println 30 31// contentProvider is an abstraction to treat matches for names and 32// content with the same code. 33type contentProvider struct { 34 id *indexData 35 stats *Stats 36 37 // mutable 38 err error 39 idx uint32 40 _data []byte 41 _nl []uint32 42 _nlBuf []uint32 43 _sects []DocumentSection 44 _sectBuf []DocumentSection 45 fileSize uint32 46} 47 48// setDocument skips to the given document. 49func (p *contentProvider) setDocument(docID uint32) { 50 fileStart := p.id.boundaries[docID] 51 52 p.idx = docID 53 p.fileSize = p.id.boundaries[docID+1] - fileStart 54 55 p._nl = nil 56 p._sects = nil 57 p._data = nil 58} 59 60func (p *contentProvider) docSections() []DocumentSection { 61 if p._sects == nil { 62 var sz uint32 63 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 64 p.stats.ContentBytesLoaded += int64(sz) 65 p._sectBuf = p._sects 66 } 67 return p._sects 68} 69 70func (p *contentProvider) newlines() newlines { 71 if p._nl == nil { 72 var sz uint32 73 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 74 p._nlBuf = p._nl 75 p.stats.ContentBytesLoaded += int64(sz) 76 } 77 return newlines{locs: p._nl, fileSize: p.fileSize} 78} 79 80func (p *contentProvider) data(fileName bool) []byte { 81 if fileName { 82 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 83 } 84 85 if p._data == nil { 86 p._data, p.err = p.id.readContents(p.idx) 87 p.stats.FilesLoaded++ 88 p.stats.ContentBytesLoaded += int64(len(p._data)) 89 } 90 return p._data 91} 92 93// Find offset in bytes (relative to corpus start) for an offset in 94// runes (relative to document start). If filename is set, the corpus 95// is the set of filenames, with the document being the name itself. 96func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 97 if p.id.metaData.PlainASCII { 98 return r 99 } 100 101 sample := p.id.runeOffsets 102 runeEnds := p.id.fileEndRunes 103 fileStartByte := p.id.boundaries[p.idx] 104 if filename { 105 sample = p.id.fileNameRuneOffsets 106 runeEnds = p.id.fileNameEndRunes 107 fileStartByte = p.id.fileNameIndex[p.idx] 108 } 109 110 absR := r 111 if p.idx > 0 { 112 absR += runeEnds[p.idx-1] 113 } 114 115 byteOff, left := sample.lookup(absR) 116 117 var data []byte 118 119 if filename { 120 data = p.id.fileNameContent[byteOff:] 121 } else { 122 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 123 if p.err != nil { 124 return 0 125 } 126 } 127 for left > 0 { 128 _, sz := utf8.DecodeRune(data) 129 byteOff += uint32(sz) 130 data = data[sz:] 131 left-- 132 } 133 134 byteOff -= fileStartByte 135 return byteOff 136} 137 138// fillMatches converts the internal candidateMatch slice into our API's LineMatch. 139// It only ever returns content XOR filename matches, not both. If there are any 140// content matches, these are always returned, and we omit filename matches. 141// 142// Performance invariant: ms is sorted and non-overlapping. 143// 144// Note: the byte slices may be backed by mmapped data, so before being 145// returned by the API it needs to be copied. 146func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch { 147 var filenameMatches []*candidateMatch 148 contentMatches := make([]*candidateMatch, 0, len(ms)) 149 150 for _, m := range ms { 151 if m.fileName { 152 filenameMatches = append(filenameMatches, m) 153 } else { 154 contentMatches = append(contentMatches, m) 155 } 156 } 157 158 // If there are any content matches, we only return these and skip filename matches. 159 if len(contentMatches) > 0 { 160 contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) 161 return p.fillContentMatches(contentMatches, numContextLines, language, opts) 162 } 163 164 // Otherwise, we return a single line containing the filematch match. 165 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 166 res := LineMatch{ 167 Line: p.id.fileName(p.idx), 168 FileName: true, 169 Score: lineScore.score, 170 DebugScore: lineScore.debugScore, 171 } 172 173 for _, m := range ms { 174 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 175 LineOffset: int(m.byteOffset), 176 MatchLength: int(m.byteMatchSz), 177 Offset: m.byteOffset, 178 }) 179 } 180 181 return []LineMatch{res} 182 183} 184 185// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. 186// It only ever returns content XOR filename matches, not both. If there are any content 187// matches, these are always returned, and we omit filename matches. 188// 189// Performance invariant: ms is sorted and non-overlapping. 190// 191// Note: the byte slices may be backed by mmapped data, so before being 192// returned by the API it needs to be copied. 193func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch { 194 var filenameMatches []*candidateMatch 195 contentMatches := make([]*candidateMatch, 0, len(ms)) 196 197 for _, m := range ms { 198 if m.fileName { 199 filenameMatches = append(filenameMatches, m) 200 } else { 201 contentMatches = append(contentMatches, m) 202 } 203 } 204 205 // If there are any content matches, we only return these and skip filename matches. 206 if len(contentMatches) > 0 { 207 return p.fillContentChunkMatches(contentMatches, numContextLines, language, opts) 208 } 209 210 // Otherwise, we return a single chunk representing the filename match. 211 lineScore, _ := p.scoreLine(filenameMatches, language, -1 /* must pass -1 for filenames */, opts) 212 fileName := p.id.fileName(p.idx) 213 ranges := make([]Range, 0, len(ms)) 214 for _, m := range ms { 215 ranges = append(ranges, Range{ 216 Start: Location{ 217 ByteOffset: m.byteOffset, 218 LineNumber: 1, 219 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 220 }, 221 End: Location{ 222 ByteOffset: m.byteOffset + m.byteMatchSz, 223 LineNumber: 1, 224 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 225 }, 226 }) 227 } 228 229 return []ChunkMatch{{ 230 Content: fileName, 231 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 232 Ranges: ranges, 233 FileName: true, 234 Score: lineScore.score, 235 DebugScore: lineScore.debugScore, 236 }} 237} 238 239func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []LineMatch { 240 var result []LineMatch 241 for len(ms) > 0 { 242 m := ms[0] 243 num := p.newlines().atOffset(m.byteOffset) 244 lineStart := int(p.newlines().lineStart(num)) 245 nextLineStart := int(p.newlines().lineStart(num + 1)) 246 247 var lineCands []*candidateMatch 248 249 endMatch := m.byteOffset + m.byteMatchSz 250 251 for len(ms) > 0 { 252 m := ms[0] 253 if int(m.byteOffset) < nextLineStart { 254 endMatch = m.byteOffset + m.byteMatchSz 255 lineCands = append(lineCands, m) 256 ms = ms[1:] 257 } else { 258 break 259 } 260 } 261 262 if len(lineCands) == 0 { 263 log.Panicf( 264 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 265 p.id.fileName(p.idx), p.id.metaData, 266 num, lineStart, nextLineStart, 267 m.byteOffset) 268 } 269 270 data := p.data(false) 271 272 // Due to merging matches, we may have a match that 273 // crosses a line boundary. Prevent confusion by 274 // taking lines until we pass the last match 275 for nextLineStart < len(data) && endMatch > uint32(nextLineStart) { 276 next := bytes.IndexByte(data[nextLineStart:], '\n') 277 if next == -1 { 278 nextLineStart = len(data) 279 } else { 280 // TODO(hanwen): test that checks "+1" part here. 281 nextLineStart += next + 1 282 } 283 } 284 285 finalMatch := LineMatch{ 286 LineStart: lineStart, 287 LineEnd: nextLineStart, 288 LineNumber: num, 289 } 290 finalMatch.Line = data[lineStart:nextLineStart] 291 292 if numContextLines > 0 { 293 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 294 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 295 } 296 297 lineScore, symbolInfo := p.scoreLine(lineCands, language, num, opts) 298 finalMatch.Score = lineScore.score 299 finalMatch.DebugScore = lineScore.debugScore 300 301 for i, m := range lineCands { 302 fragment := LineFragmentMatch{ 303 Offset: m.byteOffset, 304 LineOffset: int(m.byteOffset) - lineStart, 305 MatchLength: int(m.byteMatchSz), 306 } 307 308 if i < len(symbolInfo) && symbolInfo[i] != nil { 309 fragment.SymbolInfo = symbolInfo[i] 310 } 311 312 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 313 } 314 result = append(result, finalMatch) 315 } 316 return result 317} 318 319func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, opts *SearchOptions) []ChunkMatch { 320 data := p.data(false) 321 322 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 323 // However, it depends on ms being sorted by byteOffset and non-overlapping. 324 // This invariant is true at the time of writing, but we conservatively 325 // enforce this. Note: chunkCandidates preserves the sorting so safe to 326 // transform now. 327 columnHelper := columnHelper{data: data} 328 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 329 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 330 sort.Sort((sortByOffsetSlice)(ms)) 331 } 332 333 newlines := p.newlines() 334 chunks := chunkCandidates(ms, newlines, numContextLines) 335 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 336 for _, chunk := range chunks { 337 ranges := make([]Range, 0, len(chunk.candidates)) 338 for _, cm := range chunk.candidates { 339 startOffset := cm.byteOffset 340 endOffset := cm.byteOffset + cm.byteMatchSz 341 startLine, endLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 342 343 ranges = append(ranges, Range{ 344 Start: Location{ 345 ByteOffset: startOffset, 346 LineNumber: uint32(startLine), 347 Column: columnHelper.get(int(newlines.lineStart(startLine)), startOffset), 348 }, 349 End: Location{ 350 ByteOffset: endOffset, 351 LineNumber: uint32(endLine), 352 Column: columnHelper.get(int(newlines.lineStart(endLine)), endOffset), 353 }, 354 }) 355 } 356 357 firstLineNumber := int(chunk.firstLine) - numContextLines 358 if firstLineNumber < 1 { 359 firstLineNumber = 1 360 } 361 firstLineStart := newlines.lineStart(firstLineNumber) 362 363 chunkScore, symbolInfo := p.scoreChunk(chunk.candidates, language, opts) 364 chunkMatches = append(chunkMatches, ChunkMatch{ 365 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 366 ContentStart: Location{ 367 ByteOffset: firstLineStart, 368 LineNumber: uint32(firstLineNumber), 369 Column: 1, 370 }, 371 FileName: false, 372 Ranges: ranges, 373 SymbolInfo: symbolInfo, 374 BestLineMatch: uint32(chunkScore.bestLine), 375 Score: chunkScore.score, 376 DebugScore: chunkScore.debugScore, 377 }) 378 } 379 return chunkMatches 380} 381 382type candidateChunk struct { 383 candidates []*candidateMatch 384 firstLine uint32 // 1-based, inclusive 385 lastLine uint32 // 1-based, inclusive 386 minOffset uint32 // 0-based, inclusive 387 maxOffset uint32 // 0-based, exclusive 388} 389 390// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 391// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 392// it to overlap with an adjacent chunk. 393// 394// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 395// output invariants: if you flatten candidates the input invariant is retained. 396func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 397 var chunks []candidateChunk 398 399 for _, m := range ms { 400 startOffset := m.byteOffset 401 endOffset := m.byteOffset + m.byteMatchSz 402 firstLine, lastLine := newlines.offsetRangeToLineRange(startOffset, endOffset) 403 404 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 405 // If a new chunk created with the current candidateMatch would 406 // overlap with the previous chunk, instead add the candidateMatch 407 // to the last chunk and extend end of the last chunk. 408 last := &chunks[len(chunks)-1] 409 last.candidates = append(last.candidates, m) 410 if last.maxOffset < endOffset { 411 last.lastLine = uint32(lastLine) 412 last.maxOffset = uint32(endOffset) 413 } 414 } else { 415 chunks = append(chunks, candidateChunk{ 416 firstLine: uint32(firstLine), 417 lastLine: uint32(lastLine), 418 minOffset: startOffset, 419 maxOffset: endOffset, 420 candidates: []*candidateMatch{m}, 421 }) 422 } 423 } 424 return chunks 425} 426 427// columnHelper is a helper struct which caches the number of runes last 428// counted. If we naively use utf8.RuneCount for each match on a line, this 429// leads to an O(nm) algorithm where m is the number of matches and n is the 430// length of the line. Aassuming we our candidates are increasing in offset 431// makes this operation O(n) instead. 432type columnHelper struct { 433 data []byte 434 435 // 0 values for all these are valid values 436 lastLineOffset int 437 lastOffset uint32 438 lastRuneCount uint32 439} 440 441// get returns the line column for offset. offset is the byte offset of the 442// rune in data. lineOffset is the byte offset inside of data for the line 443// containing offset. 444func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 445 var runeCount uint32 446 447 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 448 // Can count from last calculation 449 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 450 } else { 451 // Need to count from the beginning of line 452 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 453 } 454 455 c.lastLineOffset = lineOffset 456 c.lastOffset = offset 457 c.lastRuneCount = runeCount 458 459 return runeCount + 1 460} 461 462type newlines struct { 463 // locs is the sorted set of byte offsets of the newlines in the file 464 locs []uint32 465 466 // fileSize is just the number of bytes in the file. It is stored 467 // on this struct so we can safely know the length of the last line 468 // in the file since not all files end in a newline. 469 fileSize uint32 470} 471 472// atOffset returns the line containing the offset. If the offset lands on 473// the newline ending line M, we return M. 474func (nls newlines) atOffset(offset uint32) (lineNumber int) { 475 idx := sort.Search(len(nls.locs), func(n int) bool { 476 return nls.locs[n] >= offset 477 }) 478 return idx + 1 479} 480 481// lineStart returns the byte offset of the beginning of the given line. 482// lineNumber is 1-based. If lineNumber is out of range of the lines in the 483// file, the return value will be clamped to [0,fileSize]. 484func (nls newlines) lineStart(lineNumber int) uint32 { 485 // nls.locs[0] + 1 is the start of the 2nd line of data. 486 startIdx := lineNumber - 2 487 488 if startIdx < 0 { 489 return 0 490 } else if startIdx >= len(nls.locs) { 491 return nls.fileSize 492 } else { 493 return nls.locs[startIdx] + 1 494 } 495} 496 497// offsetRangeToLineRange returns range of lines that fully contains the given byte range. 498// The inputs are 0-based byte offsets into the file representing the (exclusive) range [startOffset, endOffset). 499// The return values are 1-based line numbers representing the (inclusive) range [startLine, endLine]. 500func (nls newlines) offsetRangeToLineRange(startOffset, endOffset uint32) (startLine, endLine int) { 501 startLine = nls.atOffset(startOffset) 502 endLine = nls.atOffset( 503 max(startOffset, max(endOffset, 1)-1), // clamp endOffset and prevent underflow 504 ) 505 return startLine, endLine 506} 507 508// getLines returns a slice of data containing the lines [low, high). 509// low is 1-based and inclusive. high is 1-based and exclusive. 510func (nls newlines) getLines(data []byte, low, high int) []byte { 511 if low >= high { 512 return nil 513 } 514 515 return data[nls.lineStart(low):nls.lineStart(high)] 516} 517 518const ( 519 // Query-dependent scoring signals. All of these together are bounded at ~9000 520 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 521 scorePartialWordMatch = 50.0 522 scoreWordMatch = 500.0 523 scoreBase = 7000.0 524 scorePartialBase = 4000.0 525 scoreSymbol = 7000.0 526 scorePartialSymbol = 4000.0 527 scoreKindMatch = 100.0 528 scoreFactorAtomMatch = 400.0 529 530 // Used for ordering line and chunk matches within a file. 531 scoreLineOrderFactor = 1.0 532 533 // Used for tiebreakers. The scores are not combined with the main score, but 534 // are used to break ties between matches with the same score. The factors are 535 // chosen to separate the tiebreakers from the main score and from each other. 536 // If you make changes here, make sure to update indexData.scoreFile too. 537 scoreRepoRankFactor = 100.0 538 scoreFileOrderFactor = 10.0 539) 540 541// findMaxOverlappingSection returns the index of the section in secs that 542// overlaps the most with the area defined by off and sz, relative to the size 543// of the section. If no section overlaps, it returns 0, false. If multiple 544// sections overlap the same amount, the first one is returned. 545// 546// The implementation assumes that sections do not overlap and are sorted by 547// DocumentSection.Start. 548func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { 549 start := off 550 end := off + sz 551 552 // Find the first section that might overlap 553 j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) 554 555 if j == len(secs) || secs[j].Start >= end { 556 // No overlap. 557 return 0, false 558 } 559 560 relOverlap := func(j int) float64 { 561 secSize := secs[j].End - secs[j].Start 562 if secSize == 0 { 563 return 0 564 } 565 // This cannot overflow because we make sure there is overlap before calling relOverlap 566 overlap := min(secs[j].End, end) - max(secs[j].Start, start) 567 return float64(overlap) / float64(secSize) 568 } 569 570 ol1 := relOverlap(j) 571 if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { 572 return uint32(j), ol1 > 0 573 } 574 575 // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check 576 // if the second section overlaps more than the first one, because a third 577 // section can only overlap if the overlap with the second section is complete. 578 ol2 := relOverlap(j + 1) 579 if ol2 > ol1 { 580 return uint32(j + 1), ol2 > 0 581 } 582 583 return uint32(j), ol1 > 0 584} 585 586func (p *contentProvider) matchesSymbol(cm *candidateMatch) bool { 587 if cm.fileName { 588 return false 589 } 590 591 // Check if this candidate came from a symbol matchTree 592 if cm.symbol { 593 return true 594 } 595 596 // Check if it overlaps with a symbol. 597 secs := p.docSections() 598 _, ok := findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 599 return ok 600} 601 602func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { 603 if cm.fileName { 604 return DocumentSection{}, nil, false 605 } 606 607 secs := p.docSections() 608 609 secIdx, ok := cm.symbolIdx, cm.symbol 610 if !ok { 611 // Not from a symbol matchTree. Let's see if it overlaps with a symbol. 612 secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) 613 } 614 if !ok { 615 return DocumentSection{}, nil, false 616 } 617 618 sec := secs[secIdx] 619 620 // Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym 621 // since some callsites do not need it stored, and that incurs an extra 622 // copy. 623 // 624 // 2024-01-08 we are refactoring this and the code path indicates this can 625 // fail, so callers need to handle nil symbol. However, it would be 626 // surprising that we have a matching section but not symbol data. 627 start := p.id.fileEndSymbol[p.idx] 628 si := p.id.symbols.data(start + secIdx) 629 630 return sec, si, true 631} 632 633// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 634// End such that it won't be out of range. 635func sectionSlice(data []byte, sec DocumentSection) []byte { 636 l := uint32(len(data)) 637 if sec.Start >= l { 638 return nil 639 } 640 if sec.End > l { 641 sec.End = l 642 } 643 return data[sec.Start:sec.End] 644} 645 646// scoreSymbolKind boosts a match based on the combination of language, symbol 647// and kind. The language string comes from go-enry, the symbol and kind from 648// ctags. 649func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 650 var factor float64 651 652 // Generic ranking which will be overriden by language specific ranking 653 switch kind { 654 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 655 factor = 8 656 case ctags.Class: 657 factor = 10 658 case ctags.Struct: 659 factor = 9.5 660 case ctags.Enum: 661 factor = 9 662 case ctags.Interface: 663 factor = 8 664 case ctags.Function, ctags.Method: 665 factor = 7 666 case ctags.Field: 667 factor = 5.5 668 case ctags.Constant: 669 factor = 5 670 case ctags.Variable: 671 factor = 4 672 default: 673 // For all other kinds, assign a low score by default. 674 factor = 1 675 } 676 677 switch language { 678 case "Java", "java": 679 switch kind { 680 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 681 // to "classes" instead of "c". We have to cover both cases to support existing 682 // indexes. 683 case ctags.Class: 684 factor = 10 685 case ctags.Enum: 686 factor = 9 687 case ctags.Interface: 688 factor = 8 689 case ctags.Method: 690 factor = 7 691 case ctags.Field: 692 factor = 6 693 case ctags.EnumConstant: 694 factor = 5 695 } 696 case "Kotlin", "kotlin": 697 switch kind { 698 case ctags.Class: 699 factor = 10 700 case ctags.Interface: 701 factor = 9 702 case ctags.Method: 703 factor = 8 704 case ctags.TypeAlias: 705 factor = 7 706 case ctags.Constant: 707 factor = 6 708 case ctags.Variable: 709 factor = 5 710 } 711 case "Go", "go": 712 switch kind { 713 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 714 // for each case a description of the fields in ctags in the comment 715 case ctags.Type: // interface struct talias 716 factor = 9 717 case ctags.Interface: // interfaces 718 factor = 10 719 case ctags.Struct: // structs 720 factor = 9 721 case ctags.TypeAlias: // type aliases 722 factor = 9 723 case ctags.MethodSpec: // interface method specification 724 factor = 8.5 725 case ctags.Method, ctags.Function: // functions 726 factor = 8 727 case ctags.Field: // struct fields 728 factor = 7 729 case ctags.Constant: // constants 730 factor = 6 731 case ctags.Variable: // variables 732 factor = 5 733 } 734 735 // Boost exported go symbols. Same implementation as token.IsExported 736 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 737 factor += 0.5 738 } 739 740 if bytes.HasSuffix(filename, []byte("_test.go")) { 741 factor *= 0.8 742 } 743 744 // Could also rank on: 745 // 746 // - anonMember struct anonymous members 747 // - packageName name for specifying imported package 748 // - receiver receivers 749 // - package packages 750 // - type types 751 // - unknown unknown 752 case "C++", "c++": 753 switch kind { 754 case ctags.Class: // classes 755 factor = 10 756 case ctags.Enum: // enumeration names 757 factor = 9 758 case ctags.Function: // function definitions 759 factor = 8 760 case ctags.Struct: // structure names 761 factor = 7 762 case ctags.Union: // union names 763 factor = 6 764 case ctags.TypeAlias: // typedefs 765 factor = 5 766 case ctags.Field: // class, struct, and union members 767 factor = 4 768 case ctags.Variable: // varialbe definitions 769 factor = 3 770 } 771 // Could also rank on: 772 // NAME DESCRIPTION 773 // macro macro definitions 774 // enumerator enumerators (values inside an enumeration) 775 // header included header files 776 // namespace namespaces 777 // variable variable definitions 778 case "Scala", "scala": 779 switch kind { 780 case ctags.Class: 781 factor = 10 782 case ctags.Interface: 783 factor = 9 784 case ctags.Object: 785 factor = 8 786 case ctags.Function: 787 factor = 7 788 case ctags.Type: 789 factor = 6 790 case ctags.Variable: 791 factor = 5 792 case ctags.Package: 793 factor = 4 794 } 795 case "Python", "python": 796 switch kind { 797 case ctags.Class: // classes 798 factor = 10 799 case ctags.Function, ctags.Method: // function definitions 800 factor = 8 801 case ctags.Field: // class, struct, and union members 802 factor = 4 803 case ctags.Variable: // variable definitions 804 factor = 3 805 case ctags.Local: // local variables 806 factor = 2 807 } 808 // Could also rank on: 809 // 810 // - namespace name referring a module defined in other file 811 // - module modules 812 // - unknown name referring a class/variable/function/module defined in other module 813 // - parameter function parameters 814 case "Ruby", "ruby": 815 switch kind { 816 case ctags.Class: 817 factor = 10 818 case ctags.Method: 819 factor = 9 820 case ctags.MethodAlias: 821 factor = 8 822 case ctags.Module: 823 factor = 7 824 case ctags.SingletonMethod: 825 factor = 6 826 case ctags.Constant: 827 factor = 5 828 case ctags.Accessor: 829 factor = 4 830 case ctags.Library: 831 factor = 3 832 } 833 case "PHP", "php": 834 switch kind { 835 case ctags.Class: 836 factor = 10 837 case ctags.Interface: 838 factor = 9 839 case ctags.Function: 840 factor = 8 841 case ctags.Trait: 842 factor = 7 843 case ctags.Define: 844 factor = 6 845 case ctags.Namespace: 846 factor = 5 847 case ctags.MethodAlias: 848 factor = 4 849 case ctags.Variable: 850 factor = 3 851 case ctags.Local: 852 factor = 3 853 } 854 case "GraphQL", "graphql": 855 switch kind { 856 case ctags.Type: 857 factor = 10 858 } 859 case "Markdown", "markdown": 860 // Headers are good signal in docs, but do not rank as highly as code. 861 switch kind { 862 case ctags.Chapter: // # 863 factor = 4 864 case ctags.Section: // ## 865 factor = 3 866 case ctags.Subsection: // ### 867 factor = 2 868 } 869 } 870 871 return factor * scoreKindMatch 872} 873 874type matchScoreSlice []LineMatch 875 876func (m matchScoreSlice) Len() int { return len(m) } 877func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 878func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 879 880type chunkMatchScoreSlice []ChunkMatch 881 882func (m chunkMatchScoreSlice) Len() int { return len(m) } 883func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 884func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 885 886type fileMatchesByScore []FileMatch 887 888func (m fileMatchesByScore) Len() int { return len(m) } 889func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 890func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 891 892func sortMatchesByScore(ms []LineMatch) { 893 sort.Sort(matchScoreSlice(ms)) 894} 895 896func sortChunkMatchesByScore(ms []ChunkMatch) { 897 sort.Sort(chunkMatchScoreSlice(ms)) 898} 899 900// SortFiles sorts files matches in the order we want to present results to 901// users. The order depends on the match score, which includes both 902// query-dependent signals like word overlap, and file-only signals like the 903// file ranks (if file ranks are enabled). 904// 905// We don't only use the scores, we will also boost some results to present 906// files with novel extensions. 907func SortFiles(ms []FileMatch) { 908 sort.Sort(fileMatchesByScore(ms)) 909 910 // Boost a file extension not in the top 3 to the third filematch. 911 boostNovelExtension(ms, 2, 0.9) 912} 913 914func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 915 if len(ms) <= boostOffset+1 { 916 return 917 } 918 919 top := ms[:boostOffset] 920 candidates := ms[boostOffset:] 921 922 // Don't bother boosting something which is significantly different to the 923 // result it replaces. 924 minScoreForNovelty := candidates[0].Score * minScoreRatio 925 926 // We want to look for an ext that isn't in the top exts 927 exts := make([]string, len(top)) 928 for i := range top { 929 exts[i] = path.Ext(top[i].FileName) 930 } 931 932 for i := range candidates { 933 // Do not assume sorted due to boostNovelExtension being called on subsets 934 if candidates[i].Score < minScoreForNovelty { 935 continue 936 } 937 938 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 939 continue 940 } 941 942 // Found what we are looking for, now boost to front of candidates (which 943 // is ms[boostOffset]) 944 for ; i > 0; i-- { 945 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 946 } 947 return 948 } 949}