fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "path" 22 "sort" 23 "strings" 24 "unicode" 25 "unicode/utf8" 26 27 "github.com/sourcegraph/zoekt/ctags" 28 "golang.org/x/exp/slices" 29) 30 31var _ = log.Println 32 33// contentProvider is an abstraction to treat matches for names and 34// content with the same code. 35type contentProvider struct { 36 id *indexData 37 stats *Stats 38 39 // mutable 40 err error 41 idx uint32 42 _data []byte 43 _nl []uint32 44 _nlBuf []uint32 45 _sects []DocumentSection 46 _sectBuf []DocumentSection 47 fileSize uint32 48} 49 50// setDocument skips to the given document. 51func (p *contentProvider) setDocument(docID uint32) { 52 fileStart := p.id.boundaries[docID] 53 54 p.idx = docID 55 p.fileSize = p.id.boundaries[docID+1] - fileStart 56 57 p._nl = nil 58 p._sects = nil 59 p._data = nil 60} 61 62func (p *contentProvider) docSections() []DocumentSection { 63 if p._sects == nil { 64 var sz uint32 65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 66 p.stats.ContentBytesLoaded += int64(sz) 67 p._sectBuf = p._sects 68 } 69 return p._sects 70} 71 72func (p *contentProvider) newlines() newlines { 73 if p._nl == nil { 74 var sz uint32 75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 76 p._nlBuf = p._nl 77 p.stats.ContentBytesLoaded += int64(sz) 78 } 79 return newlines{locs: p._nl, fileSize: p.fileSize} 80} 81 82func (p *contentProvider) data(fileName bool) []byte { 83 if fileName { 84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 85 } 86 87 if p._data == nil { 88 p._data, p.err = p.id.readContents(p.idx) 89 p.stats.FilesLoaded++ 90 p.stats.ContentBytesLoaded += int64(len(p._data)) 91 } 92 return p._data 93} 94 95// Find offset in bytes (relative to corpus start) for an offset in 96// runes (relative to document start). If filename is set, the corpus 97// is the set of filenames, with the document being the name itself. 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 99 if p.id.metaData.PlainASCII { 100 return r 101 } 102 103 sample := p.id.runeOffsets 104 runeEnds := p.id.fileEndRunes 105 fileStartByte := p.id.boundaries[p.idx] 106 if filename { 107 sample = p.id.fileNameRuneOffsets 108 runeEnds = p.id.fileNameEndRunes 109 fileStartByte = p.id.fileNameIndex[p.idx] 110 } 111 112 absR := r 113 if p.idx > 0 { 114 absR += runeEnds[p.idx-1] 115 } 116 117 byteOff, left := sample.lookup(absR) 118 119 var data []byte 120 121 if filename { 122 data = p.id.fileNameContent[byteOff:] 123 } else { 124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 125 if p.err != nil { 126 return 0 127 } 128 } 129 for left > 0 { 130 _, sz := utf8.DecodeRune(data) 131 byteOff += uint32(sz) 132 data = data[sz:] 133 left-- 134 } 135 136 byteOff -= fileStartByte 137 return byteOff 138} 139 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 var result []LineMatch 142 if ms[0].fileName { 143 // There is only "line" in a filename. 144 res := LineMatch{ 145 Line: p.id.fileName(p.idx), 146 FileName: true, 147 } 148 149 for _, m := range ms { 150 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 151 LineOffset: int(m.byteOffset), 152 MatchLength: int(m.byteMatchSz), 153 Offset: m.byteOffset, 154 }) 155 156 result = []LineMatch{res} 157 } 158 } else { 159 ms = breakMatchesOnNewlines(ms, p.data(false)) 160 result = p.fillContentMatches(ms, numContextLines) 161 } 162 163 sects := p.docSections() 164 for i, m := range result { 165 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 166 } 167 168 return result 169} 170 171// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 172// 173// Performance invariant: ms is sorted and non-overlapping. 174// 175// Note: the byte slices may be backed by mmapped data, so before being 176// returned by the API it needs to be copied. 177func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 178 var result []ChunkMatch 179 if ms[0].fileName { 180 // If the first match is a filename match, there will only be 181 // one match and the matched content will be the filename. 182 183 fileName := p.id.fileName(p.idx) 184 ranges := make([]Range, 0, len(ms)) 185 for _, m := range ms { 186 ranges = append(ranges, Range{ 187 Start: Location{ 188 ByteOffset: m.byteOffset, 189 LineNumber: 1, 190 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 191 }, 192 End: Location{ 193 ByteOffset: m.byteOffset + m.byteMatchSz, 194 LineNumber: 1, 195 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 196 }, 197 }) 198 } 199 200 result = []ChunkMatch{{ 201 Content: fileName, 202 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 203 Ranges: ranges, 204 FileName: true, 205 }} 206 } else { 207 result = p.fillContentChunkMatches(ms, numContextLines) 208 } 209 210 sects := p.docSections() 211 for i, m := range result { 212 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 213 } 214 215 return result 216} 217 218func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 219 var result []LineMatch 220 for len(ms) > 0 { 221 m := ms[0] 222 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 223 224 var lineCands []*candidateMatch 225 226 endMatch := m.byteOffset + m.byteMatchSz 227 228 for len(ms) > 0 { 229 m := ms[0] 230 if int(m.byteOffset) <= lineEnd { 231 endMatch = m.byteOffset + m.byteMatchSz 232 lineCands = append(lineCands, m) 233 ms = ms[1:] 234 } else { 235 break 236 } 237 } 238 239 if len(lineCands) == 0 { 240 log.Panicf( 241 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 242 p.id.fileName(p.idx), p.id.metaData, 243 num, lineStart, lineEnd, 244 m.byteOffset) 245 } 246 247 data := p.data(false) 248 249 // Due to merging matches, we may have a match that 250 // crosses a line boundary. Prevent confusion by 251 // taking lines until we pass the last match 252 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 253 next := bytes.IndexByte(data[lineEnd+1:], '\n') 254 if next == -1 { 255 lineEnd = len(data) 256 } else { 257 // TODO(hanwen): test that checks "+1" part here. 258 lineEnd += next + 1 259 } 260 } 261 262 finalMatch := LineMatch{ 263 LineStart: lineStart, 264 LineEnd: lineEnd, 265 LineNumber: num, 266 } 267 finalMatch.Line = data[lineStart:lineEnd] 268 269 if numContextLines > 0 { 270 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 271 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 272 } 273 274 for _, m := range lineCands { 275 fragment := LineFragmentMatch{ 276 Offset: m.byteOffset, 277 LineOffset: int(m.byteOffset) - lineStart, 278 MatchLength: int(m.byteMatchSz), 279 } 280 if m.symbol { 281 start := p.id.fileEndSymbol[p.idx] 282 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 283 if fragment.SymbolInfo != nil { 284 sec := p.docSections()[m.symbolIdx] 285 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 286 } 287 } 288 289 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 290 } 291 result = append(result, finalMatch) 292 } 293 return result 294} 295 296func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 297 newlines := p.newlines() 298 data := p.data(false) 299 300 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 301 // However, it depends on ms being sorted by byteOffset and non-overlapping. 302 // This invariant is true at the time of writing, but we conservatively 303 // enforce this. Note: chunkCandidates preserves the sorting so safe to 304 // transform now. 305 columnHelper := columnHelper{data: data} 306 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 307 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 308 sort.Sort((sortByOffsetSlice)(ms)) 309 } 310 311 chunks := chunkCandidates(ms, newlines, numContextLines) 312 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 313 for _, chunk := range chunks { 314 ranges := make([]Range, 0, len(chunk.candidates)) 315 var symbolInfo []*Symbol 316 for i, cm := range chunk.candidates { 317 startOffset := cm.byteOffset 318 endOffset := cm.byteOffset + cm.byteMatchSz 319 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 320 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 321 322 ranges = append(ranges, Range{ 323 Start: Location{ 324 ByteOffset: startOffset, 325 LineNumber: uint32(startLine), 326 Column: columnHelper.get(startLineOffset, startOffset), 327 }, 328 End: Location{ 329 ByteOffset: endOffset, 330 LineNumber: uint32(endLine), 331 Column: columnHelper.get(endLineOffset, endOffset), 332 }, 333 }) 334 335 if cm.symbol { 336 if symbolInfo == nil { 337 symbolInfo = make([]*Symbol, len(chunk.candidates)) 338 } 339 start := p.id.fileEndSymbol[p.idx] 340 si := p.id.symbols.data(start + cm.symbolIdx) 341 if si != nil { 342 sec := p.docSections()[cm.symbolIdx] 343 si.Sym = string(data[sec.Start:sec.End]) 344 } 345 symbolInfo[i] = si 346 } 347 } 348 349 firstLineNumber := int(chunk.firstLine) - numContextLines 350 if firstLineNumber < 1 { 351 firstLineNumber = 1 352 } 353 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 354 355 chunkMatches = append(chunkMatches, ChunkMatch{ 356 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 357 ContentStart: Location{ 358 ByteOffset: firstLineStart, 359 LineNumber: uint32(firstLineNumber), 360 Column: 1, 361 }, 362 FileName: false, 363 Ranges: ranges, 364 SymbolInfo: symbolInfo, 365 }) 366 } 367 return chunkMatches 368} 369 370type candidateChunk struct { 371 candidates []*candidateMatch 372 firstLine uint32 // 1-based, inclusive 373 lastLine uint32 // 1-based, inclusive 374 minOffset uint32 // 0-based, inclusive 375 maxOffset uint32 // 0-based, exclusive 376} 377 378// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 379// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 380// it to overlap with an adjacent chunk. 381// 382// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 383// output invariants: if you flatten candidates the input invariant is retained. 384func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 385 var chunks []candidateChunk 386 for _, m := range ms { 387 startOffset := m.byteOffset 388 endOffset := m.byteOffset + m.byteMatchSz 389 firstLine, _, _ := newlines.atOffset(startOffset) 390 lastLine, _, _ := newlines.atOffset(endOffset) 391 392 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 393 // If a new chunk created with the current candidateMatch would 394 // overlap with the previous chunk, instead add the candidateMatch 395 // to the last chunk and extend end of the last chunk. 396 last := &chunks[len(chunks)-1] 397 last.candidates = append(last.candidates, m) 398 if last.maxOffset < endOffset { 399 last.lastLine = uint32(lastLine) 400 last.maxOffset = uint32(endOffset) 401 } 402 } else { 403 chunks = append(chunks, candidateChunk{ 404 firstLine: uint32(firstLine), 405 lastLine: uint32(lastLine), 406 minOffset: startOffset, 407 maxOffset: endOffset, 408 candidates: []*candidateMatch{m}, 409 }) 410 } 411 } 412 return chunks 413} 414 415// columnHelper is a helper struct which caches the number of runes last 416// counted. If we naively use utf8.RuneCount for each match on a line, this 417// leads to an O(nm) algorithm where m is the number of matches and n is the 418// length of the line. Aassuming we our candidates are increasing in offset 419// makes this operation O(n) instead. 420type columnHelper struct { 421 data []byte 422 423 // 0 values for all these are valid values 424 lastLineOffset int 425 lastOffset uint32 426 lastRuneCount uint32 427} 428 429// get returns the line column for offset. offset is the byte offset of the 430// rune in data. lineOffset is the byte offset inside of data for the line 431// containing offset. 432func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 433 var runeCount uint32 434 435 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 436 // Can count from last calculation 437 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 438 } else { 439 // Need to count from the beginning of line 440 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 441 } 442 443 c.lastLineOffset = lineOffset 444 c.lastOffset = offset 445 c.lastRuneCount = runeCount 446 447 return runeCount + 1 448} 449 450type newlines struct { 451 // locs is the sorted set of byte offsets of the newlines in the file 452 locs []uint32 453 454 // fileSize is just the number of bytes in the file. It is stored 455 // on this struct so we can safely know the length of the last line 456 // in the file since not all files end in a newline. 457 fileSize uint32 458} 459 460// atOffset returns the line containing the offset. If the offset lands on 461// the newline ending line M, we return M. The line is characterized 462// by its linenumber (base-1, byte index of line start, byte index of 463// line end). The line end is the index of a newline, or the filesize 464// (if matching the last line of the file.) 465func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 466 idx := sort.Search(len(nls.locs), func(n int) bool { 467 return nls.locs[n] >= offset 468 }) 469 470 start, end := nls.lineBounds(idx + 1) 471 return idx + 1, int(start), int(end) 472} 473 474// lineBounds returns the byte offsets of the start and end of the 1-based 475// lineNumber. The end offset is exclusive and will not contain the line-ending 476// newline. If the line number is out of range of the lines in the file, start 477// and end will be clamped to [0,fileSize]. 478func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 479 // nls.locs[0] + 1 is the start of the 2nd line of data. 480 startIdx := lineNumber - 2 481 endIdx := lineNumber - 1 482 483 if startIdx < 0 { 484 start = 0 485 } else if startIdx >= len(nls.locs) { 486 start = nls.fileSize 487 } else { 488 start = nls.locs[startIdx] + 1 489 } 490 491 if endIdx < 0 { 492 end = 0 493 } else if endIdx >= len(nls.locs) { 494 end = nls.fileSize 495 } else { 496 end = nls.locs[endIdx] 497 } 498 499 return start, end 500} 501 502// getLines returns a slice of data containing the lines [low, high). 503// low is 1-based and inclusive. high is 1-based and exclusive. 504func (nls newlines) getLines(data []byte, low, high int) []byte { 505 if low >= high { 506 return nil 507 } 508 509 lowStart, _ := nls.lineBounds(low) 510 _, highEnd := nls.lineBounds(high - 1) 511 512 // Drop any trailing newline. Editors do not treat a trailing newline as 513 // the start of a new line, so we should not either. lineBounds clamps to 514 // len(data) when an out-of-bounds line is requested. 515 // 516 // As an example, if we request lines 1-5 from a file with contents 517 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are 518 // the three "lines" in the file, separated by newlines. 519 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) { 520 highEnd = highEnd - 1 521 lowStart = min(lowStart, highEnd) 522 } 523 524 return data[lowStart:highEnd] 525} 526 527const ( 528 // Query-dependent scoring signals. All of these together are bounded at ~9000 529 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 530 scorePartialWordMatch = 50.0 531 scoreWordMatch = 500.0 532 scoreBase = 7000.0 533 scorePartialBase = 4000.0 534 scoreSymbol = 7000.0 535 scorePartialSymbol = 4000.0 536 scoreKindMatch = 100.0 537 scoreFactorAtomMatch = 400.0 538 539 // File-only scoring signals. For now these are also bounded ~9000 to give them 540 // equal weight with the query-dependent signals. 541 scoreFileRankFactor = 9000.0 542 scoreFileOrderFactor = 10.0 543 scoreRepoRankFactor = 20.0 544 545 // Used for ordering line and chunk matches within a file. 546 scoreLineOrderFactor = 1.0 547) 548 549// findSection checks whether a section defined by offset and size lies within 550// one of the sections in secs. 551func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 552 j := sort.Search(len(secs), func(i int) bool { 553 return secs[i].End >= off+sz 554 }) 555 556 if j == len(secs) { 557 return 0, false 558 } 559 560 if secs[j].Start <= off && off+sz <= secs[j].End { 561 return j, true 562 } 563 return 0, false 564} 565 566func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 567 type debugScore struct { 568 what string 569 score float64 570 } 571 572 score := &debugScore{} 573 maxScore := &debugScore{} 574 575 addScore := func(what string, s float64) { 576 if s != 0 && debug { 577 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 578 } 579 score.score += s 580 } 581 582 data := p.data(m.FileName) 583 filename := p.data(true) 584 585 for i, r := range m.Ranges { 586 // calculate the start and end offset relative to the start of the content 587 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 588 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 589 590 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 591 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 592 593 score.score = 0 594 score.what = "" 595 596 if startBoundary && endBoundary { 597 addScore("WordMatch", scoreWordMatch) 598 } else if startBoundary || endBoundary { 599 addScore("PartialWordMatch", scorePartialWordMatch) 600 } 601 602 if m.FileName { 603 sep := bytes.LastIndexByte(m.Content, '/') 604 startMatch := relStartOffset == sep+1 605 endMatch := relEndOffset == len(m.Content) 606 if startMatch && endMatch { 607 addScore("Base", scoreBase) 608 } else if startMatch || endMatch { 609 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 610 } else if sep < relStartOffset { 611 addScore("InnerBase", scorePartialBase) 612 } 613 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 614 sec := secs[secIdx] 615 startMatch := sec.Start == uint32(r.Start.ByteOffset) 616 endMatch := sec.End == uint32(r.End.ByteOffset) 617 if startMatch && endMatch { 618 addScore("Symbol", scoreSymbol) 619 } else if startMatch || endMatch { 620 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 621 } else { 622 addScore("InnerSymbol", scorePartialSymbol) 623 } 624 625 var si *Symbol 626 if m.SymbolInfo != nil { 627 si = m.SymbolInfo[i] 628 } 629 if si == nil { 630 // for non-symbol queries, we need to hydrate in SymbolInfo. 631 start := p.id.fileEndSymbol[p.idx] 632 si = p.id.symbols.data(start + uint32(secIdx)) 633 } 634 if si != nil { 635 symbolKind := ctags.ParseSymbolKind(si.Kind) 636 sym := sectionSlice(data, sec) 637 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 638 } 639 } 640 641 if score.score > maxScore.score { 642 maxScore.score = score.score 643 maxScore.what = score.what 644 } 645 } 646 647 if debug { 648 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 649 } 650 651 return maxScore.score, maxScore.what 652} 653 654func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) { 655 type debugScore struct { 656 what string 657 score float64 658 } 659 660 score := &debugScore{} 661 maxScore := &debugScore{} 662 663 addScore := func(what string, s float64) { 664 if s != 0 && debug { 665 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 666 } 667 score.score += s 668 } 669 670 data := p.data(m.FileName) 671 filename := p.data(true) 672 673 for _, f := range m.LineFragments { 674 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) 675 676 end := int(f.LineOffset) + f.MatchLength 677 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) 678 679 score.score = 0 680 score.what = "" 681 682 if startBoundary && endBoundary { 683 addScore("WordMatch", scoreWordMatch) 684 } else if startBoundary || endBoundary { 685 addScore("PartialWordMatch", scorePartialWordMatch) 686 } 687 688 if m.FileName { 689 sep := bytes.LastIndexByte(m.Line, '/') 690 startMatch := sep+1 == f.LineOffset 691 endMatch := len(m.Line) == f.LineOffset+f.MatchLength 692 if startMatch && endMatch { 693 addScore("Base", scoreBase) 694 } else if startMatch || endMatch { 695 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 696 } else if sep < f.LineOffset { 697 addScore("InnerBase", scorePartialBase) 698 } 699 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 700 sec := secs[secIdx] 701 startMatch := sec.Start == f.Offset 702 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 703 if startMatch && endMatch { 704 addScore("Symbol", scoreSymbol) 705 } else if startMatch || endMatch { 706 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 707 } else { 708 addScore("InnerSymbol", scorePartialSymbol) 709 } 710 711 si := f.SymbolInfo 712 if si == nil { 713 // for non-symbol queries, we need to hydrate in SymbolInfo. 714 start := p.id.fileEndSymbol[p.idx] 715 si = p.id.symbols.data(start + uint32(secIdx)) 716 } 717 if si != nil { 718 // the LineFragment may not be on a symbol, then si will be nil. 719 symbolKind := ctags.ParseSymbolKind(si.Kind) 720 sym := sectionSlice(data, sec) 721 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 722 } 723 } 724 725 if score.score > maxScore.score { 726 maxScore.score = score.score 727 maxScore.what = score.what 728 } 729 } 730 731 if debug { 732 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 733 } 734 735 return maxScore.score, maxScore.what 736} 737 738// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 739// End such that it won't be out of range. 740func sectionSlice(data []byte, sec DocumentSection) []byte { 741 l := uint32(len(data)) 742 if sec.Start >= l { 743 return nil 744 } 745 if sec.End > l { 746 sec.End = l 747 } 748 return data[sec.Start:sec.End] 749} 750 751// scoreSymbolKind boosts a match based on the combination of language, symbol 752// and kind. The language string comes from go-enry, the symbol and kind from 753// ctags. 754func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 755 var factor float64 756 757 // Generic ranking which will be overriden by language specific ranking 758 switch kind { 759 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 760 factor = 8 761 case ctags.Class: 762 factor = 10 763 case ctags.Struct: 764 factor = 9.5 765 case ctags.Enum: 766 factor = 9 767 case ctags.Interface: 768 factor = 8 769 case ctags.Function, ctags.Method: 770 factor = 7 771 case ctags.Field: 772 factor = 5.5 773 case ctags.Constant: 774 factor = 5 775 case ctags.Variable: 776 factor = 4 777 default: 778 // For all other kinds, assign a low score by default. 779 factor = 1 780 } 781 782 switch language { 783 case "Java", "java": 784 switch kind { 785 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 786 // to "classes" instead of "c". We have to cover both cases to support existing 787 // indexes. 788 case ctags.Class: 789 factor = 10 790 case ctags.Enum: 791 factor = 9 792 case ctags.Interface: 793 factor = 8 794 case ctags.Method: 795 factor = 7 796 case ctags.Field: 797 factor = 6 798 case ctags.EnumConstant: 799 factor = 5 800 } 801 case "Kotlin", "kotlin": 802 switch kind { 803 case ctags.Class: 804 factor = 10 805 case ctags.Interface: 806 factor = 9 807 case ctags.Method: 808 factor = 8 809 case ctags.TypeAlias: 810 factor = 7 811 case ctags.Constant: 812 factor = 6 813 case ctags.Variable: 814 factor = 5 815 } 816 case "Go", "go": 817 switch kind { 818 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 819 // for each case a description of the fields in ctags in the comment 820 case ctags.Type: // interface struct talias 821 factor = 9 822 case ctags.Interface: // interfaces 823 factor = 10 824 case ctags.Struct: // structs 825 factor = 9 826 case ctags.TypeAlias: // type aliases 827 factor = 9 828 case ctags.MethodSpec: // interface method specification 829 factor = 8.5 830 case ctags.Method, ctags.Function: // functions 831 factor = 8 832 case ctags.Field: // struct fields 833 factor = 7 834 case ctags.Constant: // constants 835 factor = 6 836 case ctags.Variable: // variables 837 factor = 5 838 } 839 840 // Boost exported go symbols. Same implementation as token.IsExported 841 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 842 factor += 0.5 843 } 844 845 if bytes.HasSuffix(filename, []byte("_test.go")) { 846 factor *= 0.8 847 } 848 849 // Could also rank on: 850 // 851 // - anonMember struct anonymous members 852 // - packageName name for specifying imported package 853 // - receiver receivers 854 // - package packages 855 // - type types 856 // - unknown unknown 857 case "C++", "c++": 858 switch kind { 859 case ctags.Class: // classes 860 factor = 10 861 case ctags.Enum: // enumeration names 862 factor = 9 863 case ctags.Function: // function definitions 864 factor = 8 865 case ctags.Struct: // structure names 866 factor = 7 867 case ctags.Union: // union names 868 factor = 6 869 case ctags.TypeAlias: // typedefs 870 factor = 5 871 case ctags.Field: // class, struct, and union members 872 factor = 4 873 case ctags.Variable: // varialbe definitions 874 factor = 3 875 } 876 // Could also rank on: 877 // NAME DESCRIPTION 878 // macro macro definitions 879 // enumerator enumerators (values inside an enumeration) 880 // header included header files 881 // namespace namespaces 882 // variable variable definitions 883 case "Scala", "scala": 884 switch kind { 885 case ctags.Class: 886 factor = 10 887 case ctags.Interface: 888 factor = 9 889 case ctags.Object: 890 factor = 8 891 case ctags.Function: 892 factor = 7 893 case ctags.Type: 894 factor = 6 895 case ctags.Variable: 896 factor = 5 897 case ctags.Package: 898 factor = 4 899 } 900 case "Python", "python": 901 switch kind { 902 case ctags.Class: // classes 903 factor = 10 904 case ctags.Function, ctags.Method: // function definitions 905 factor = 8 906 case ctags.Field: // class, struct, and union members 907 factor = 4 908 case ctags.Variable: // variable definitions 909 factor = 3 910 case ctags.Local: // local variables 911 factor = 2 912 } 913 // Could also rank on: 914 // 915 // - namespace name referring a module defined in other file 916 // - module modules 917 // - unknown name referring a class/variable/function/module defined in other module 918 // - parameter function parameters 919 case "Ruby", "ruby": 920 switch kind { 921 case ctags.Class: 922 factor = 10 923 case ctags.Method: 924 factor = 9 925 case ctags.MethodAlias: 926 factor = 8 927 case ctags.Module: 928 factor = 7 929 case ctags.SingletonMethod: 930 factor = 6 931 case ctags.Constant: 932 factor = 5 933 case ctags.Accessor: 934 factor = 4 935 case ctags.Library: 936 factor = 3 937 } 938 case "PHP", "php": 939 switch kind { 940 case ctags.Class: 941 factor = 10 942 case ctags.Interface: 943 factor = 9 944 case ctags.Function: 945 factor = 8 946 case ctags.Trait: 947 factor = 7 948 case ctags.Define: 949 factor = 6 950 case ctags.Namespace: 951 factor = 5 952 case ctags.MethodAlias: 953 factor = 4 954 case ctags.Variable: 955 factor = 3 956 case ctags.Local: 957 factor = 3 958 } 959 case "GraphQL", "graphql": 960 switch kind { 961 case ctags.Type: 962 factor = 10 963 } 964 case "Markdown", "markdown": 965 // Headers are good signal in docs, but do not rank as highly as code. 966 switch kind { 967 case ctags.Chapter: // # 968 factor = 4 969 case ctags.Section: // ## 970 factor = 3 971 case ctags.Subsection: // ### 972 factor = 2 973 } 974 } 975 976 return factor * scoreKindMatch 977} 978 979type matchScoreSlice []LineMatch 980 981func (m matchScoreSlice) Len() int { return len(m) } 982func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 983func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 984 985type chunkMatchScoreSlice []ChunkMatch 986 987func (m chunkMatchScoreSlice) Len() int { return len(m) } 988func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 989func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 990 991type fileMatchesByScore []FileMatch 992 993func (m fileMatchesByScore) Len() int { return len(m) } 994func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 995func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 996 997func sortMatchesByScore(ms []LineMatch) { 998 sort.Sort(matchScoreSlice(ms)) 999} 1000 1001func sortChunkMatchesByScore(ms []ChunkMatch) { 1002 sort.Sort(chunkMatchScoreSlice(ms)) 1003} 1004 1005// SortFiles sorts files matches in the order we want to present results to 1006// users. The order depends on the match score, which includes both 1007// query-dependent signals like word overlap, and file-only signals like the 1008// file ranks (if file ranks are enabled). 1009// 1010// We don't only use the scores, we will also boost some results to present 1011// files with novel extensions. 1012func SortFiles(ms []FileMatch) { 1013 sort.Sort(fileMatchesByScore(ms)) 1014 1015 // Boost a file extension not in the top 3 to the third filematch. 1016 boostNovelExtension(ms, 2, 0.9) 1017} 1018 1019func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 1020 if len(ms) <= boostOffset+1 { 1021 return 1022 } 1023 1024 top := ms[:boostOffset] 1025 candidates := ms[boostOffset:] 1026 1027 // Don't bother boosting something which is significantly different to the 1028 // result it replaces. 1029 minScoreForNovelty := candidates[0].Score * minScoreRatio 1030 1031 // We want to look for an ext that isn't in the top exts 1032 exts := make([]string, len(top)) 1033 for i := range top { 1034 exts[i] = path.Ext(top[i].FileName) 1035 } 1036 1037 for i := range candidates { 1038 // Do not assume sorted due to boostNovelExtension being called on subsets 1039 if candidates[i].Score < minScoreForNovelty { 1040 continue 1041 } 1042 1043 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 1044 continue 1045 } 1046 1047 // Found what we are looking for, now boost to front of candidates (which 1048 // is ms[boostOffset]) 1049 for ; i > 0; i-- { 1050 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 1051 } 1052 return 1053 } 1054}