fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "os" 22 "path" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt/ctags" 29 "golang.org/x/exp/slices" 30) 31 32var _ = log.Println 33 34// contentProvider is an abstraction to treat matches for names and 35// content with the same code. 36type contentProvider struct { 37 id *indexData 38 stats *Stats 39 40 // mutable 41 err error 42 idx uint32 43 _data []byte 44 _nl []uint32 45 _nlBuf []uint32 46 _sects []DocumentSection 47 _sectBuf []DocumentSection 48 fileSize uint32 49} 50 51// setDocument skips to the given document. 52func (p *contentProvider) setDocument(docID uint32) { 53 fileStart := p.id.boundaries[docID] 54 55 p.idx = docID 56 p.fileSize = p.id.boundaries[docID+1] - fileStart 57 58 p._nl = nil 59 p._sects = nil 60 p._data = nil 61} 62 63func (p *contentProvider) docSections() []DocumentSection { 64 if p._sects == nil { 65 var sz uint32 66 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 67 p.stats.ContentBytesLoaded += int64(sz) 68 p._sectBuf = p._sects 69 } 70 return p._sects 71} 72 73func (p *contentProvider) newlines() newlines { 74 if p._nl == nil { 75 var sz uint32 76 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 77 p._nlBuf = p._nl 78 p.stats.ContentBytesLoaded += int64(sz) 79 } 80 return newlines{locs: p._nl, fileSize: p.fileSize} 81} 82 83func (p *contentProvider) data(fileName bool) []byte { 84 if fileName { 85 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 86 } 87 88 if p._data == nil { 89 p._data, p.err = p.id.readContents(p.idx) 90 p.stats.FilesLoaded++ 91 p.stats.ContentBytesLoaded += int64(len(p._data)) 92 } 93 return p._data 94} 95 96// Find offset in bytes (relative to corpus start) for an offset in 97// runes (relative to document start). If filename is set, the corpus 98// is the set of filenames, with the document being the name itself. 99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 100 if p.id.metaData.PlainASCII { 101 return r 102 } 103 104 sample := p.id.runeOffsets 105 runeEnds := p.id.fileEndRunes 106 fileStartByte := p.id.boundaries[p.idx] 107 if filename { 108 sample = p.id.fileNameRuneOffsets 109 runeEnds = p.id.fileNameEndRunes 110 fileStartByte = p.id.fileNameIndex[p.idx] 111 } 112 113 absR := r 114 if p.idx > 0 { 115 absR += runeEnds[p.idx-1] 116 } 117 118 byteOff, left := sample.lookup(absR) 119 120 var data []byte 121 122 if filename { 123 data = p.id.fileNameContent[byteOff:] 124 } else { 125 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 126 if p.err != nil { 127 return 0 128 } 129 } 130 for left > 0 { 131 _, sz := utf8.DecodeRune(data) 132 byteOff += uint32(sz) 133 data = data[sz:] 134 left-- 135 } 136 137 byteOff -= fileStartByte 138 return byteOff 139} 140 141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 142 var result []LineMatch 143 if ms[0].fileName { 144 // There is only "line" in a filename. 145 res := LineMatch{ 146 Line: p.id.fileName(p.idx), 147 FileName: true, 148 } 149 150 for _, m := range ms { 151 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 152 LineOffset: int(m.byteOffset), 153 MatchLength: int(m.byteMatchSz), 154 Offset: m.byteOffset, 155 }) 156 157 result = []LineMatch{res} 158 } 159 } else { 160 ms = breakMatchesOnNewlines(ms, p.data(false)) 161 result = p.fillContentMatches(ms, numContextLines) 162 } 163 164 sects := p.docSections() 165 for i, m := range result { 166 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 167 } 168 169 return result 170} 171 172// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 173// 174// Performance invariant: ms is sorted and non-overlapping. 175// 176// Note: the byte slices may be backed by mmapped data, so before being 177// returned by the API it needs to be copied. 178func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 179 var result []ChunkMatch 180 if ms[0].fileName { 181 // If the first match is a filename match, there will only be 182 // one match and the matched content will be the filename. 183 184 fileName := p.id.fileName(p.idx) 185 ranges := make([]Range, 0, len(ms)) 186 for _, m := range ms { 187 ranges = append(ranges, Range{ 188 Start: Location{ 189 ByteOffset: m.byteOffset, 190 LineNumber: 1, 191 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 192 }, 193 End: Location{ 194 ByteOffset: m.byteOffset + m.byteMatchSz, 195 LineNumber: 1, 196 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 197 }, 198 }) 199 } 200 201 result = []ChunkMatch{{ 202 Content: fileName, 203 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 204 Ranges: ranges, 205 FileName: true, 206 }} 207 } else { 208 result = p.fillContentChunkMatches(ms, numContextLines) 209 } 210 211 sects := p.docSections() 212 for i, m := range result { 213 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 214 } 215 216 return result 217} 218 219func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 220 var result []LineMatch 221 for len(ms) > 0 { 222 m := ms[0] 223 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 224 225 var lineCands []*candidateMatch 226 227 endMatch := m.byteOffset + m.byteMatchSz 228 229 for len(ms) > 0 { 230 m := ms[0] 231 if int(m.byteOffset) <= lineEnd { 232 endMatch = m.byteOffset + m.byteMatchSz 233 lineCands = append(lineCands, m) 234 ms = ms[1:] 235 } else { 236 break 237 } 238 } 239 240 if len(lineCands) == 0 { 241 log.Panicf( 242 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 243 p.id.fileName(p.idx), p.id.metaData, 244 num, lineStart, lineEnd, 245 m.byteOffset) 246 } 247 248 data := p.data(false) 249 250 // Due to merging matches, we may have a match that 251 // crosses a line boundary. Prevent confusion by 252 // taking lines until we pass the last match 253 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 254 next := bytes.IndexByte(data[lineEnd+1:], '\n') 255 if next == -1 { 256 lineEnd = len(data) 257 } else { 258 // TODO(hanwen): test that checks "+1" part here. 259 lineEnd += next + 1 260 } 261 } 262 263 finalMatch := LineMatch{ 264 LineStart: lineStart, 265 LineEnd: lineEnd, 266 LineNumber: num, 267 } 268 finalMatch.Line = data[lineStart:lineEnd] 269 270 if numContextLines > 0 { 271 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 272 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 273 } 274 275 for _, m := range lineCands { 276 fragment := LineFragmentMatch{ 277 Offset: m.byteOffset, 278 LineOffset: int(m.byteOffset) - lineStart, 279 MatchLength: int(m.byteMatchSz), 280 } 281 if m.symbol { 282 start := p.id.fileEndSymbol[p.idx] 283 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 284 if fragment.SymbolInfo != nil { 285 sec := p.docSections()[m.symbolIdx] 286 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 287 } 288 } 289 290 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 291 } 292 result = append(result, finalMatch) 293 } 294 return result 295} 296 297func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 298 newlines := p.newlines() 299 data := p.data(false) 300 301 // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 302 // However, it depends on ms being sorted by byteOffset and non-overlapping. 303 // This invariant is true at the time of writing, but we conservatively 304 // enforce this. Note: chunkCandidates preserves the sorting so safe to 305 // transform now. 306 columnHelper := columnHelper{data: data} 307 if !sort.IsSorted((sortByOffsetSlice)(ms)) { 308 log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") 309 sort.Sort((sortByOffsetSlice)(ms)) 310 } 311 312 chunks := chunkCandidates(ms, newlines, numContextLines) 313 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 314 for _, chunk := range chunks { 315 ranges := make([]Range, 0, len(chunk.candidates)) 316 var symbolInfo []*Symbol 317 for i, cm := range chunk.candidates { 318 startOffset := cm.byteOffset 319 endOffset := cm.byteOffset + cm.byteMatchSz 320 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 321 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 322 323 ranges = append(ranges, Range{ 324 Start: Location{ 325 ByteOffset: startOffset, 326 LineNumber: uint32(startLine), 327 Column: columnHelper.get(startLineOffset, startOffset), 328 }, 329 End: Location{ 330 ByteOffset: endOffset, 331 LineNumber: uint32(endLine), 332 Column: columnHelper.get(endLineOffset, endOffset), 333 }, 334 }) 335 336 if cm.symbol { 337 if symbolInfo == nil { 338 symbolInfo = make([]*Symbol, len(chunk.candidates)) 339 } 340 start := p.id.fileEndSymbol[p.idx] 341 si := p.id.symbols.data(start + cm.symbolIdx) 342 if si != nil { 343 sec := p.docSections()[cm.symbolIdx] 344 si.Sym = string(data[sec.Start:sec.End]) 345 } 346 symbolInfo[i] = si 347 } 348 } 349 350 firstLineNumber := int(chunk.firstLine) - numContextLines 351 if firstLineNumber < 1 { 352 firstLineNumber = 1 353 } 354 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 355 356 chunkMatches = append(chunkMatches, ChunkMatch{ 357 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 358 ContentStart: Location{ 359 ByteOffset: firstLineStart, 360 LineNumber: uint32(firstLineNumber), 361 Column: 1, 362 }, 363 FileName: false, 364 Ranges: ranges, 365 SymbolInfo: symbolInfo, 366 }) 367 } 368 return chunkMatches 369} 370 371type candidateChunk struct { 372 firstLine uint32 // 1-based, inclusive 373 lastLine uint32 // 1-based, inclusive 374 minOffset uint32 // 0-based, inclusive 375 maxOffset uint32 // 0-based, exclusive 376 candidates []*candidateMatch 377} 378 379// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 380// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 381// it to overlap with an adjacent chunk. 382// 383// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 384// output invariants: if you flatten candidates the input invariant is retained. 385func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 386 var chunks []candidateChunk 387 for _, m := range ms { 388 startOffset := m.byteOffset 389 endOffset := m.byteOffset + m.byteMatchSz 390 firstLine, _, _ := newlines.atOffset(startOffset) 391 lastLine, _, _ := newlines.atOffset(endOffset) 392 393 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 394 // If a new chunk created with the current candidateMatch would 395 // overlap with the previous chunk, instead add the candidateMatch 396 // to the last chunk and extend end of the last chunk. 397 last := &chunks[len(chunks)-1] 398 last.candidates = append(last.candidates, m) 399 if last.maxOffset < endOffset { 400 last.lastLine = uint32(lastLine) 401 last.maxOffset = uint32(endOffset) 402 } 403 } else { 404 chunks = append(chunks, candidateChunk{ 405 firstLine: uint32(firstLine), 406 lastLine: uint32(lastLine), 407 minOffset: startOffset, 408 maxOffset: endOffset, 409 candidates: []*candidateMatch{m}, 410 }) 411 } 412 } 413 return chunks 414} 415 416// columnHelper is a helper struct which caches the number of runes last 417// counted. If we naively use utf8.RuneCount for each match on a line, this 418// leads to an O(nm) algorithm where m is the number of matches and n is the 419// length of the line. Aassuming we our candidates are increasing in offset 420// makes this operation O(n) instead. 421type columnHelper struct { 422 data []byte 423 424 // 0 values for all these are valid values 425 lastLineOffset int 426 lastOffset uint32 427 lastRuneCount uint32 428} 429 430// get returns the line column for offset. offset is the byte offset of the 431// rune in data. lineOffset is the byte offset inside of data for the line 432// containing offset. 433func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 434 var runeCount uint32 435 436 if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 437 // Can count from last calculation 438 runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 439 } else { 440 // Need to count from the beginning of line 441 runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 442 } 443 444 c.lastLineOffset = lineOffset 445 c.lastOffset = offset 446 c.lastRuneCount = runeCount 447 448 return runeCount + 1 449} 450 451type newlines struct { 452 // locs is the sorted set of byte offsets of the newlines in the file 453 locs []uint32 454 455 // fileSize is just the number of bytes in the file. It is stored 456 // on this struct so we can safely know the length of the last line 457 // in the file since not all files end in a newline. 458 fileSize uint32 459} 460 461// atOffset returns the line containing the offset. If the offset lands on 462// the newline ending line M, we return M. The line is characterized 463// by its linenumber (base-1, byte index of line start, byte index of 464// line end). The line end is the index of a newline, or the filesize 465// (if matching the last line of the file.) 466func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 467 idx := sort.Search(len(nls.locs), func(n int) bool { 468 return nls.locs[n] >= offset 469 }) 470 471 start, end := nls.lineBounds(idx + 1) 472 return idx + 1, int(start), int(end) 473} 474 475// lineBounds returns the byte offsets of the start and end of the 1-based 476// lineNumber. The end offset is exclusive and will not contain the line-ending 477// newline. If the line number is out of range of the lines in the file, start 478// and end will be clamped to [0,fileSize]. 479func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 480 // nls.locs[0] + 1 is the start of the 2nd line of data. 481 startIdx := lineNumber - 2 482 endIdx := lineNumber - 1 483 484 if startIdx < 0 { 485 start = 0 486 } else if startIdx >= len(nls.locs) { 487 start = nls.fileSize 488 } else { 489 start = nls.locs[startIdx] + 1 490 } 491 492 if endIdx < 0 { 493 end = 0 494 } else if endIdx >= len(nls.locs) { 495 end = nls.fileSize 496 } else { 497 end = nls.locs[endIdx] 498 } 499 500 return start, end 501} 502 503// getLines returns a slice of data containing the lines [low, high). 504// low is 1-based and inclusive. high is 1-based and exclusive. 505func (nls newlines) getLines(data []byte, low, high int) []byte { 506 if low >= high { 507 return nil 508 } 509 510 lowStart, _ := nls.lineBounds(low) 511 _, highEnd := nls.lineBounds(high - 1) 512 513 // Drop any trailing newline. Editors do not treat a trailing newline as 514 // the start of a new line, so we should not either. lineBounds clamps to 515 // len(data) when an out-of-bounds line is requested. 516 // 517 // As an example, if we request lines 1-5 from a file with contents 518 // `one\ntwo\nthree\n`, we should return `one\ntwo\nthree` because those are 519 // the three "lines" in the file, separated by newlines. 520 if highEnd == uint32(len(data)) && bytes.HasSuffix(data, []byte{'\n'}) { 521 highEnd = highEnd - 1 522 lowStart = min(lowStart, highEnd) 523 } 524 525 return data[lowStart:highEnd] 526} 527 528const ( 529 // Query-dependent scoring signals. All of these together are bounded at ~9000 530 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 531 scorePartialWordMatch = 50.0 532 scoreWordMatch = 500.0 533 scoreBase = 7000.0 534 scorePartialBase = 4000.0 535 scoreSymbol = 7000.0 536 scorePartialSymbol = 4000.0 537 scoreKindMatch = 100.0 538 scoreFactorAtomMatch = 400.0 539 540 // File-only scoring signals. For now these are also bounded ~9000 to give them 541 // equal weight with the query-dependent signals. 542 scoreFileRankFactor = 9000.0 543 scoreFileOrderFactor = 10.0 544 scoreRepoRankFactor = 20.0 545 546 // Used for ordering line and chunk matches within a file. 547 scoreLineOrderFactor = 1.0 548) 549 550// findSection checks whether a section defined by offset and size lies within 551// one of the sections in secs. 552func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 553 j := sort.Search(len(secs), func(i int) bool { 554 return secs[i].End >= off+sz 555 }) 556 557 if j == len(secs) { 558 return 0, false 559 } 560 561 if secs[j].Start <= off && off+sz <= secs[j].End { 562 return j, true 563 } 564 return 0, false 565} 566 567func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 568 type debugScore struct { 569 score float64 570 what string 571 } 572 573 score := &debugScore{} 574 maxScore := &debugScore{} 575 576 addScore := func(what string, s float64) { 577 if s != 0 && debug { 578 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 579 } 580 score.score += s 581 } 582 583 data := p.data(m.FileName) 584 filename := p.data(true) 585 586 for i, r := range m.Ranges { 587 // calculate the start and end offset relative to the start of the content 588 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 589 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 590 591 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 592 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 593 594 score.score = 0 595 score.what = "" 596 597 if startBoundary && endBoundary { 598 addScore("WordMatch", scoreWordMatch) 599 } else if startBoundary || endBoundary { 600 addScore("PartialWordMatch", scorePartialWordMatch) 601 } 602 603 if m.FileName { 604 sep := bytes.LastIndexByte(m.Content, '/') 605 startMatch := relStartOffset == sep+1 606 endMatch := relEndOffset == len(m.Content) 607 if startMatch && endMatch { 608 addScore("Base", scoreBase) 609 } else if startMatch || endMatch { 610 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 611 } else if sep < relStartOffset { 612 addScore("InnerBase", scorePartialBase) 613 } 614 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 615 sec := secs[secIdx] 616 startMatch := sec.Start == uint32(r.Start.ByteOffset) 617 endMatch := sec.End == uint32(r.End.ByteOffset) 618 if startMatch && endMatch { 619 addScore("Symbol", scoreSymbol) 620 } else if startMatch || endMatch { 621 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 622 } else { 623 addScore("InnerSymbol", scorePartialSymbol) 624 } 625 626 var si *Symbol 627 if m.SymbolInfo != nil { 628 si = m.SymbolInfo[i] 629 } 630 if si == nil { 631 // for non-symbol queries, we need to hydrate in SymbolInfo. 632 start := p.id.fileEndSymbol[p.idx] 633 si = p.id.symbols.data(start + uint32(secIdx)) 634 } 635 if si != nil { 636 symbolKind := ctags.ParseSymbolKind(si.Kind) 637 sym := sectionSlice(data, sec) 638 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 639 } 640 } 641 642 if score.score > maxScore.score { 643 maxScore.score = score.score 644 maxScore.what = score.what 645 } 646 } 647 648 if debug { 649 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 650 } 651 652 return maxScore.score, maxScore.what 653} 654 655func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) { 656 type debugScore struct { 657 score float64 658 what string 659 } 660 661 score := &debugScore{} 662 maxScore := &debugScore{} 663 664 addScore := func(what string, s float64) { 665 if s != 0 && debug { 666 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 667 } 668 score.score += s 669 } 670 671 data := p.data(m.FileName) 672 filename := p.data(true) 673 674 for _, f := range m.LineFragments { 675 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) 676 677 end := int(f.LineOffset) + f.MatchLength 678 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) 679 680 score.score = 0 681 score.what = "" 682 683 if startBoundary && endBoundary { 684 addScore("WordMatch", scoreWordMatch) 685 } else if startBoundary || endBoundary { 686 addScore("PartialWordMatch", scorePartialWordMatch) 687 } 688 689 if m.FileName { 690 sep := bytes.LastIndexByte(m.Line, '/') 691 startMatch := sep+1 == f.LineOffset 692 endMatch := len(m.Line) == f.LineOffset+f.MatchLength 693 if startMatch && endMatch { 694 addScore("Base", scoreBase) 695 } else if startMatch || endMatch { 696 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 697 } else if sep < f.LineOffset { 698 addScore("InnerBase", scorePartialBase) 699 } 700 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 701 sec := secs[secIdx] 702 startMatch := sec.Start == f.Offset 703 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 704 if startMatch && endMatch { 705 addScore("Symbol", scoreSymbol) 706 } else if startMatch || endMatch { 707 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 708 } else { 709 addScore("InnerSymbol", scorePartialSymbol) 710 } 711 712 si := f.SymbolInfo 713 if si == nil { 714 // for non-symbol queries, we need to hydrate in SymbolInfo. 715 start := p.id.fileEndSymbol[p.idx] 716 si = p.id.symbols.data(start + uint32(secIdx)) 717 } 718 if si != nil { 719 // the LineFragment may not be on a symbol, then si will be nil. 720 symbolKind := ctags.ParseSymbolKind(si.Kind) 721 sym := sectionSlice(data, sec) 722 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 723 } 724 } 725 726 if score.score > maxScore.score { 727 maxScore.score = score.score 728 maxScore.what = score.what 729 } 730 } 731 732 if debug { 733 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 734 } 735 736 return maxScore.score, maxScore.what 737} 738 739// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 740// End such that it won't be out of range. 741func sectionSlice(data []byte, sec DocumentSection) []byte { 742 l := uint32(len(data)) 743 if sec.Start >= l { 744 return nil 745 } 746 if sec.End > l { 747 sec.End = l 748 } 749 return data[sec.Start:sec.End] 750} 751 752// scoreSymbolKind boosts a match based on the combination of language, symbol 753// and kind. The language string comes from go-enry, the symbol and kind from 754// ctags. 755func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 756 var factor float64 757 758 // Generic ranking which will be overriden by language specific ranking 759 switch kind { 760 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 761 factor = 8 762 case ctags.Class: 763 factor = 10 764 case ctags.Struct: 765 factor = 9.5 766 case ctags.Enum: 767 factor = 9 768 case ctags.Interface: 769 factor = 8 770 case ctags.Function, ctags.Method: 771 factor = 7 772 case ctags.Field: 773 factor = 5.5 774 case ctags.Constant: 775 factor = 5 776 case ctags.Variable: 777 factor = 4 778 default: 779 // For all other kinds, assign a low score by default. 780 factor = 1 781 } 782 783 switch language { 784 case "Java", "java": 785 switch kind { 786 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 787 // to "classes" instead of "c". We have to cover both cases to support existing 788 // indexes. 789 case ctags.Class: 790 factor = 10 791 case ctags.Enum: 792 factor = 9 793 case ctags.Interface: 794 factor = 8 795 case ctags.Method: 796 factor = 7 797 case ctags.Field: 798 factor = 6 799 case ctags.EnumConstant: 800 factor = 5 801 } 802 case "Kotlin", "kotlin": 803 switch kind { 804 case ctags.Class: 805 factor = 10 806 case ctags.Interface: 807 factor = 9 808 case ctags.Method: 809 factor = 8 810 case ctags.TypeAlias: 811 factor = 7 812 case ctags.Constant: 813 factor = 6 814 case ctags.Variable: 815 factor = 5 816 } 817 case "Go", "go": 818 switch kind { 819 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 820 // for each case a description of the fields in ctags in the comment 821 case ctags.Type: // interface struct talias 822 factor = 9 823 case ctags.Interface: // interfaces 824 factor = 10 825 case ctags.Struct: // structs 826 factor = 9 827 case ctags.TypeAlias: // type aliases 828 factor = 9 829 case ctags.MethodSpec: // interface method specification 830 factor = 8.5 831 case ctags.Method, ctags.Function: // functions 832 factor = 8 833 case ctags.Field: // struct fields 834 factor = 7 835 case ctags.Constant: // constants 836 factor = 6 837 case ctags.Variable: // variables 838 factor = 5 839 } 840 841 // Boost exported go symbols. Same implementation as token.IsExported 842 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 843 factor += 0.5 844 } 845 846 if bytes.HasSuffix(filename, []byte("_test.go")) { 847 factor *= 0.8 848 } 849 850 // Could also rank on: 851 // 852 // - anonMember struct anonymous members 853 // - packageName name for specifying imported package 854 // - receiver receivers 855 // - package packages 856 // - type types 857 // - unknown unknown 858 case "C++", "c++": 859 switch kind { 860 case ctags.Class: // classes 861 factor = 10 862 case ctags.Enum: // enumeration names 863 factor = 9 864 case ctags.Function: // function definitions 865 factor = 8 866 case ctags.Struct: // structure names 867 factor = 7 868 case ctags.Union: // union names 869 factor = 6 870 case ctags.TypeAlias: // typedefs 871 factor = 5 872 case ctags.Field: // class, struct, and union members 873 factor = 4 874 case ctags.Variable: // varialbe definitions 875 factor = 3 876 } 877 // Could also rank on: 878 // NAME DESCRIPTION 879 // macro macro definitions 880 // enumerator enumerators (values inside an enumeration) 881 // header included header files 882 // namespace namespaces 883 // variable variable definitions 884 case "Scala", "scala": 885 switch kind { 886 case ctags.Class: 887 factor = 10 888 case ctags.Interface: 889 factor = 9 890 case ctags.Object: 891 factor = 8 892 case ctags.Function: 893 factor = 7 894 case ctags.Type: 895 factor = 6 896 case ctags.Variable: 897 factor = 5 898 case ctags.Package: 899 factor = 4 900 } 901 case "Python", "python": 902 switch kind { 903 case ctags.Class: // classes 904 factor = 10 905 case ctags.Function, ctags.Method: // function definitions 906 factor = 8 907 case ctags.Field: // class, struct, and union members 908 factor = 4 909 case ctags.Variable: // variable definitions 910 factor = 3 911 case ctags.Local: // local variables 912 factor = 2 913 } 914 // Could also rank on: 915 // 916 // - namespace name referring a module defined in other file 917 // - module modules 918 // - unknown name referring a class/variable/function/module defined in other module 919 // - parameter function parameters 920 case "Ruby", "ruby": 921 switch kind { 922 case ctags.Class: 923 factor = 10 924 case ctags.Method: 925 factor = 9 926 case ctags.MethodAlias: 927 factor = 8 928 case ctags.Module: 929 factor = 7 930 case ctags.SingletonMethod: 931 factor = 6 932 case ctags.Constant: 933 factor = 5 934 case ctags.Accessor: 935 factor = 4 936 case ctags.Library: 937 factor = 3 938 } 939 case "PHP", "php": 940 switch kind { 941 case ctags.Class: 942 factor = 10 943 case ctags.Interface: 944 factor = 9 945 case ctags.Function: 946 factor = 8 947 case ctags.Trait: 948 factor = 7 949 case ctags.Define: 950 factor = 6 951 case ctags.Namespace: 952 factor = 5 953 case ctags.MethodAlias: 954 factor = 4 955 case ctags.Variable: 956 factor = 3 957 case ctags.Local: 958 factor = 3 959 } 960 case "GraphQL", "graphql": 961 switch kind { 962 case ctags.Type: 963 factor = 10 964 } 965 case "Markdown", "markdown": 966 // Headers are good signal in docs, but do not rank as highly as code. 967 switch kind { 968 case ctags.Chapter: // # 969 factor = 4 970 case ctags.Section: // ## 971 factor = 3 972 case ctags.Subsection: // ### 973 factor = 2 974 } 975 } 976 977 return factor * scoreKindMatch 978} 979 980type matchScoreSlice []LineMatch 981 982func (m matchScoreSlice) Len() int { return len(m) } 983func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 984func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 985 986type chunkMatchScoreSlice []ChunkMatch 987 988func (m chunkMatchScoreSlice) Len() int { return len(m) } 989func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 990func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 991 992type fileMatchesByScore []FileMatch 993 994func (m fileMatchesByScore) Len() int { return len(m) } 995func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 996func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 997 998func sortMatchesByScore(ms []LineMatch) { 999 sort.Sort(matchScoreSlice(ms)) 1000} 1001 1002func sortChunkMatchesByScore(ms []ChunkMatch) { 1003 sort.Sort(chunkMatchScoreSlice(ms)) 1004} 1005 1006var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == "" 1007 1008// SortFiles sorts files matches in the order we want to present results to 1009// users. The order depends on the match score, which includes both 1010// query-dependent signals like word overlap, and file-only signals like the 1011// file ranks (if file ranks are enabled). 1012// 1013// We don't only use the scores, we will also boost some results to present 1014// files with novel extensions. 1015func SortFiles(ms []FileMatch) { 1016 sort.Sort(fileMatchesByScore(ms)) 1017 1018 if doNovelty { 1019 // Experimentally boost something into the third filematch 1020 boostNovelExtension(ms, 2, 0.9) 1021 } 1022} 1023 1024func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 1025 if len(ms) <= boostOffset+1 { 1026 return 1027 } 1028 1029 top := ms[:boostOffset] 1030 candidates := ms[boostOffset:] 1031 1032 // Don't bother boosting something which is significantly different to the 1033 // result it replaces. 1034 minScoreForNovelty := candidates[0].Score * minScoreRatio 1035 1036 // We want to look for an ext that isn't in the top exts 1037 exts := make([]string, len(top)) 1038 for i := range top { 1039 exts[i] = path.Ext(top[i].FileName) 1040 } 1041 1042 for i := range candidates { 1043 // Do not assume sorted due to boostNovelExtension being called on subsets 1044 if candidates[i].Score < minScoreForNovelty { 1045 continue 1046 } 1047 1048 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 1049 continue 1050 } 1051 1052 // Found what we are looking for, now boost to front of candidates (which 1053 // is ms[boostOffset]) 1054 for ; i > 0; i-- { 1055 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 1056 } 1057 return 1058 } 1059}