fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "os" 22 "path" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "github.com/sourcegraph/zoekt/ctags" 29 "golang.org/x/exp/slices" 30) 31 32var _ = log.Println 33 34// contentProvider is an abstraction to treat matches for names and 35// content with the same code. 36type contentProvider struct { 37 id *indexData 38 stats *Stats 39 40 // mutable 41 err error 42 idx uint32 43 _data []byte 44 _nl []uint32 45 _nlBuf []uint32 46 _sects []DocumentSection 47 _sectBuf []DocumentSection 48 fileSize uint32 49} 50 51// setDocument skips to the given document. 52func (p *contentProvider) setDocument(docID uint32) { 53 fileStart := p.id.boundaries[docID] 54 55 p.idx = docID 56 p.fileSize = p.id.boundaries[docID+1] - fileStart 57 58 p._nl = nil 59 p._sects = nil 60 p._data = nil 61} 62 63func (p *contentProvider) docSections() []DocumentSection { 64 if p._sects == nil { 65 var sz uint32 66 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 67 p.stats.ContentBytesLoaded += int64(sz) 68 p._sectBuf = p._sects 69 } 70 return p._sects 71} 72 73func (p *contentProvider) newlines() newlines { 74 if p._nl == nil { 75 var sz uint32 76 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 77 p._nlBuf = p._nl 78 p.stats.ContentBytesLoaded += int64(sz) 79 } 80 return newlines{locs: p._nl, fileSize: p.fileSize} 81} 82 83func (p *contentProvider) data(fileName bool) []byte { 84 if fileName { 85 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 86 } 87 88 if p._data == nil { 89 p._data, p.err = p.id.readContents(p.idx) 90 p.stats.FilesLoaded++ 91 p.stats.ContentBytesLoaded += int64(len(p._data)) 92 } 93 return p._data 94} 95 96// Find offset in bytes (relative to corpus start) for an offset in 97// runes (relative to document start). If filename is set, the corpus 98// is the set of filenames, with the document being the name itself. 99func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 100 if p.id.metaData.PlainASCII { 101 return r 102 } 103 104 sample := p.id.runeOffsets 105 runeEnds := p.id.fileEndRunes 106 fileStartByte := p.id.boundaries[p.idx] 107 if filename { 108 sample = p.id.fileNameRuneOffsets 109 runeEnds = p.id.fileNameEndRunes 110 fileStartByte = p.id.fileNameIndex[p.idx] 111 } 112 113 absR := r 114 if p.idx > 0 { 115 absR += runeEnds[p.idx-1] 116 } 117 118 byteOff, left := sample.lookup(absR) 119 120 var data []byte 121 122 if filename { 123 data = p.id.fileNameContent[byteOff:] 124 } else { 125 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 126 if p.err != nil { 127 return 0 128 } 129 } 130 for left > 0 { 131 _, sz := utf8.DecodeRune(data) 132 byteOff += uint32(sz) 133 data = data[sz:] 134 left-- 135 } 136 137 byteOff -= fileStartByte 138 return byteOff 139} 140 141func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 142 var result []LineMatch 143 if ms[0].fileName { 144 // There is only "line" in a filename. 145 res := LineMatch{ 146 Line: p.id.fileName(p.idx), 147 FileName: true, 148 } 149 150 for _, m := range ms { 151 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 152 LineOffset: int(m.byteOffset), 153 MatchLength: int(m.byteMatchSz), 154 Offset: m.byteOffset, 155 }) 156 157 result = []LineMatch{res} 158 } 159 } else { 160 ms = breakMatchesOnNewlines(ms, p.data(false)) 161 result = p.fillContentMatches(ms, numContextLines) 162 } 163 164 sects := p.docSections() 165 for i, m := range result { 166 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 167 } 168 169 return result 170} 171 172func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 173 var result []ChunkMatch 174 if ms[0].fileName { 175 // If the first match is a filename match, there will only be 176 // one match and the matched content will be the filename. 177 178 fileName := p.id.fileName(p.idx) 179 ranges := make([]Range, 0, len(ms)) 180 for _, m := range ms { 181 ranges = append(ranges, Range{ 182 Start: Location{ 183 ByteOffset: m.byteOffset, 184 LineNumber: 1, 185 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 186 }, 187 End: Location{ 188 ByteOffset: m.byteOffset + m.byteMatchSz, 189 LineNumber: 1, 190 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 191 }, 192 }) 193 } 194 195 result = []ChunkMatch{{ 196 Content: fileName, 197 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 198 Ranges: ranges, 199 FileName: true, 200 }} 201 } else { 202 result = p.fillContentChunkMatches(ms, numContextLines) 203 } 204 205 sects := p.docSections() 206 for i, m := range result { 207 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 208 } 209 210 return result 211} 212 213func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 214 var result []LineMatch 215 for len(ms) > 0 { 216 m := ms[0] 217 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 218 219 var lineCands []*candidateMatch 220 221 endMatch := m.byteOffset + m.byteMatchSz 222 223 for len(ms) > 0 { 224 m := ms[0] 225 if int(m.byteOffset) <= lineEnd { 226 endMatch = m.byteOffset + m.byteMatchSz 227 lineCands = append(lineCands, m) 228 ms = ms[1:] 229 } else { 230 break 231 } 232 } 233 234 if len(lineCands) == 0 { 235 log.Panicf( 236 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 237 p.id.fileName(p.idx), p.id.metaData, 238 num, lineStart, lineEnd, 239 m.byteOffset) 240 } 241 242 data := p.data(false) 243 244 // Due to merging matches, we may have a match that 245 // crosses a line boundary. Prevent confusion by 246 // taking lines until we pass the last match 247 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 248 next := bytes.IndexByte(data[lineEnd+1:], '\n') 249 if next == -1 { 250 lineEnd = len(data) 251 } else { 252 // TODO(hanwen): test that checks "+1" part here. 253 lineEnd += next + 1 254 } 255 } 256 257 finalMatch := LineMatch{ 258 LineStart: lineStart, 259 LineEnd: lineEnd, 260 LineNumber: num, 261 } 262 finalMatch.Line = data[lineStart:lineEnd] 263 264 if numContextLines > 0 { 265 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 266 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 267 } 268 269 for _, m := range lineCands { 270 fragment := LineFragmentMatch{ 271 Offset: m.byteOffset, 272 LineOffset: int(m.byteOffset) - lineStart, 273 MatchLength: int(m.byteMatchSz), 274 } 275 if m.symbol { 276 start := p.id.fileEndSymbol[p.idx] 277 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 278 if fragment.SymbolInfo != nil { 279 sec := p.docSections()[m.symbolIdx] 280 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 281 } 282 } 283 284 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 285 } 286 result = append(result, finalMatch) 287 } 288 return result 289} 290 291func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 292 newlines := p.newlines() 293 chunks := chunkCandidates(ms, newlines, numContextLines) 294 data := p.data(false) 295 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 296 for _, chunk := range chunks { 297 ranges := make([]Range, 0, len(chunk.candidates)) 298 var symbolInfo []*Symbol 299 for i, cm := range chunk.candidates { 300 startOffset := cm.byteOffset 301 endOffset := cm.byteOffset + cm.byteMatchSz 302 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 303 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 304 305 ranges = append(ranges, Range{ 306 Start: Location{ 307 ByteOffset: startOffset, 308 LineNumber: uint32(startLine), 309 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), 310 }, 311 End: Location{ 312 ByteOffset: endOffset, 313 LineNumber: uint32(endLine), 314 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), 315 }, 316 }) 317 318 if cm.symbol { 319 if symbolInfo == nil { 320 symbolInfo = make([]*Symbol, len(chunk.candidates)) 321 } 322 start := p.id.fileEndSymbol[p.idx] 323 si := p.id.symbols.data(start + cm.symbolIdx) 324 if si != nil { 325 sec := p.docSections()[cm.symbolIdx] 326 si.Sym = string(data[sec.Start:sec.End]) 327 } 328 symbolInfo[i] = si 329 } 330 } 331 332 firstLineNumber := int(chunk.firstLine) - numContextLines 333 if firstLineNumber < 1 { 334 firstLineNumber = 1 335 } 336 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 337 338 chunkMatches = append(chunkMatches, ChunkMatch{ 339 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 340 ContentStart: Location{ 341 ByteOffset: firstLineStart, 342 LineNumber: uint32(firstLineNumber), 343 Column: 1, 344 }, 345 FileName: false, 346 Ranges: ranges, 347 SymbolInfo: symbolInfo, 348 }) 349 } 350 return chunkMatches 351} 352 353type candidateChunk struct { 354 firstLine uint32 // 1-based, inclusive 355 lastLine uint32 // 1-based, inclusive 356 minOffset uint32 // 0-based, inclusive 357 maxOffset uint32 // 0-based, exclusive 358 candidates []*candidateMatch 359} 360 361// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 362// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 363// it to overlap with an adjacent chunk. 364func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 365 var chunks []candidateChunk 366 for _, m := range ms { 367 startOffset := m.byteOffset 368 endOffset := m.byteOffset + m.byteMatchSz 369 firstLine, _, _ := newlines.atOffset(startOffset) 370 lastLine, _, _ := newlines.atOffset(endOffset) 371 372 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 373 // If a new chunk created with the current candidateMatch would 374 // overlap with the previous chunk, instead add the candidateMatch 375 // to the last chunk and extend end of the last chunk. 376 last := &chunks[len(chunks)-1] 377 last.candidates = append(last.candidates, m) 378 if last.maxOffset < endOffset { 379 last.lastLine = uint32(lastLine) 380 last.maxOffset = uint32(endOffset) 381 } 382 } else { 383 chunks = append(chunks, candidateChunk{ 384 firstLine: uint32(firstLine), 385 lastLine: uint32(lastLine), 386 minOffset: startOffset, 387 maxOffset: endOffset, 388 candidates: []*candidateMatch{m}, 389 }) 390 } 391 } 392 return chunks 393} 394 395type newlines struct { 396 // locs is the sorted set of byte offsets of the newlines in the file 397 locs []uint32 398 399 // fileSize is just the number of bytes in the file. It is stored 400 // on this struct so we can safely know the length of the last line 401 // in the file since not all files end in a newline. 402 fileSize uint32 403} 404 405// atOffset returns the line containing the offset. If the offset lands on 406// the newline ending line M, we return M. The line is characterized 407// by its linenumber (base-1, byte index of line start, byte index of 408// line end). The line end is the index of a newline, or the filesize 409// (if matching the last line of the file.) 410func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 411 idx := sort.Search(len(nls.locs), func(n int) bool { 412 return nls.locs[n] >= offset 413 }) 414 415 start, end := nls.lineBounds(idx + 1) 416 return idx + 1, int(start), int(end) 417} 418 419// lineBounds returns the byte offsets of the start and end of the 1-based 420// lineNumber. The end offset is exclusive and will not contain the line-ending 421// newline. If the line number is out of range of the lines in the file, start 422// and end will be clamped to [0,fileSize]. 423func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 424 // nls.locs[0] + 1 is the start of the 2nd line of data. 425 startIdx := lineNumber - 2 426 endIdx := lineNumber - 1 427 428 if startIdx < 0 { 429 start = 0 430 } else if startIdx >= len(nls.locs) { 431 start = nls.fileSize 432 } else { 433 start = nls.locs[startIdx] + 1 434 } 435 436 if endIdx < 0 { 437 end = 0 438 } else if endIdx >= len(nls.locs) { 439 end = nls.fileSize 440 } else { 441 end = nls.locs[endIdx] 442 } 443 444 return start, end 445} 446 447// getLines returns a slice of data containing the lines [low, high). 448// low is 1-based and inclusive. high is 1-based and exclusive. 449func (nls newlines) getLines(data []byte, low, high int) []byte { 450 if low >= high { 451 return nil 452 } 453 454 lowStart, _ := nls.lineBounds(low) 455 _, highEnd := nls.lineBounds(high - 1) 456 457 return data[lowStart:highEnd] 458} 459 460const ( 461 // Query-dependent scoring signals. All of these together are bounded at ~9000 462 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 463 scorePartialWordMatch = 50.0 464 scoreWordMatch = 500.0 465 scoreBase = 7000.0 466 scorePartialBase = 4000.0 467 scoreSymbol = 7000.0 468 scorePartialSymbol = 4000.0 469 scoreKindMatch = 100.0 470 scoreFactorAtomMatch = 400.0 471 472 // File-only scoring signals. For now these are also bounded ~9000 to give them 473 // equal weight with the query-dependent signals. 474 scoreFileRankFactor = 9000.0 475 scoreFileOrderFactor = 10.0 476 scoreRepoRankFactor = 20.0 477 478 // Used for ordering line and chunk matches within a file. 479 scoreLineOrderFactor = 1.0 480) 481 482// findSection checks whether a section defined by offset and size lies within 483// one of the sections in secs. 484func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 485 j := sort.Search(len(secs), func(i int) bool { 486 return secs[i].End >= off+sz 487 }) 488 489 if j == len(secs) { 490 return 0, false 491 } 492 493 if secs[j].Start <= off && off+sz <= secs[j].End { 494 return j, true 495 } 496 return 0, false 497} 498 499func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 500 type debugScore struct { 501 score float64 502 what string 503 } 504 505 score := &debugScore{} 506 maxScore := &debugScore{} 507 508 addScore := func(what string, s float64) { 509 if s != 0 && debug { 510 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 511 } 512 score.score += s 513 } 514 515 data := p.data(m.FileName) 516 filename := p.data(true) 517 518 for i, r := range m.Ranges { 519 // calculate the start and end offset relative to the start of the content 520 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 521 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 522 523 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 524 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 525 526 score.score = 0 527 score.what = "" 528 529 if startBoundary && endBoundary { 530 addScore("WordMatch", scoreWordMatch) 531 } else if startBoundary || endBoundary { 532 addScore("PartialWordMatch", scorePartialWordMatch) 533 } 534 535 if m.FileName { 536 sep := bytes.LastIndexByte(m.Content, '/') 537 startMatch := relStartOffset == sep+1 538 endMatch := relEndOffset == len(m.Content) 539 if startMatch && endMatch { 540 addScore("Base", scoreBase) 541 } else if startMatch || endMatch { 542 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 543 } else if sep < relStartOffset { 544 addScore("InnerBase", scorePartialBase) 545 } 546 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 547 sec := secs[secIdx] 548 startMatch := sec.Start == uint32(r.Start.ByteOffset) 549 endMatch := sec.End == uint32(r.End.ByteOffset) 550 if startMatch && endMatch { 551 addScore("Symbol", scoreSymbol) 552 } else if startMatch || endMatch { 553 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 554 } else { 555 addScore("InnerSymbol", scorePartialSymbol) 556 } 557 558 var si *Symbol 559 if m.SymbolInfo != nil { 560 si = m.SymbolInfo[i] 561 } 562 if si == nil { 563 // for non-symbol queries, we need to hydrate in SymbolInfo. 564 start := p.id.fileEndSymbol[p.idx] 565 si = p.id.symbols.data(start + uint32(secIdx)) 566 } 567 if si != nil { 568 symbolKind := ctags.ParseSymbolKind(si.Kind) 569 sym := sectionSlice(data, sec) 570 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 571 } 572 } 573 574 if score.score > maxScore.score { 575 maxScore.score = score.score 576 maxScore.what = score.what 577 } 578 } 579 580 if debug { 581 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 582 } 583 584 return maxScore.score, maxScore.what 585} 586 587func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) { 588 type debugScore struct { 589 score float64 590 what string 591 } 592 593 score := &debugScore{} 594 maxScore := &debugScore{} 595 596 addScore := func(what string, s float64) { 597 if s != 0 && debug { 598 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 599 } 600 score.score += s 601 } 602 603 data := p.data(m.FileName) 604 filename := p.data(true) 605 606 for _, f := range m.LineFragments { 607 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) 608 609 end := int(f.LineOffset) + f.MatchLength 610 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) 611 612 score.score = 0 613 score.what = "" 614 615 if startBoundary && endBoundary { 616 addScore("WordMatch", scoreWordMatch) 617 } else if startBoundary || endBoundary { 618 addScore("PartialWordMatch", scorePartialWordMatch) 619 } 620 621 if m.FileName { 622 sep := bytes.LastIndexByte(m.Line, '/') 623 startMatch := sep+1 == f.LineOffset 624 endMatch := len(m.Line) == f.LineOffset+f.MatchLength 625 if startMatch && endMatch { 626 addScore("Base", scoreBase) 627 } else if startMatch || endMatch { 628 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 629 } else if sep < f.LineOffset { 630 addScore("InnerBase", scorePartialBase) 631 } 632 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 633 sec := secs[secIdx] 634 startMatch := sec.Start == f.Offset 635 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 636 if startMatch && endMatch { 637 addScore("Symbol", scoreSymbol) 638 } else if startMatch || endMatch { 639 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 640 } else { 641 addScore("InnerSymbol", scorePartialSymbol) 642 } 643 644 si := f.SymbolInfo 645 if si == nil { 646 // for non-symbol queries, we need to hydrate in SymbolInfo. 647 start := p.id.fileEndSymbol[p.idx] 648 si = p.id.symbols.data(start + uint32(secIdx)) 649 } 650 if si != nil { 651 // the LineFragment may not be on a symbol, then si will be nil. 652 symbolKind := ctags.ParseSymbolKind(si.Kind) 653 sym := sectionSlice(data, sec) 654 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind)) 655 } 656 } 657 658 if score.score > maxScore.score { 659 maxScore.score = score.score 660 maxScore.what = score.what 661 } 662 } 663 664 if debug { 665 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 666 } 667 668 return maxScore.score, maxScore.what 669} 670 671// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 672// End such that it won't be out of range. 673func sectionSlice(data []byte, sec DocumentSection) []byte { 674 l := uint32(len(data)) 675 if sec.Start >= l { 676 return nil 677 } 678 if sec.End > l { 679 sec.End = l 680 } 681 return data[sec.Start:sec.End] 682} 683 684 685// scoreSymbolKind boosts a match based on the combination of language, symbol 686// and kind. The language string comes from go-enry, the symbol and kind from 687// ctags. 688func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 { 689 var factor float64 690 691 // Generic ranking which will be overriden by language specific ranking 692 switch kind { 693 case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 694 factor = 8 695 case ctags.Class: 696 factor = 10 697 case ctags.Struct: 698 factor = 9.5 699 case ctags.Enum: 700 factor = 9 701 case ctags.Interface: 702 factor = 8 703 case ctags.Function, ctags.Method: 704 factor = 7 705 case ctags.Field: 706 factor = 5.5 707 case ctags.Constant: 708 factor = 5 709 case ctags.Variable: 710 factor = 4 711 default: 712 // For all other kinds, assign a low score by default. 713 factor = 1 714 } 715 716 switch language { 717 case "Java", "java": 718 switch kind { 719 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 720 // to "classes" instead of "c". We have to cover both cases to support existing 721 // indexes. 722 case ctags.Class: 723 factor = 10 724 case ctags.Enum: 725 factor = 9 726 case ctags.Interface: 727 factor = 8 728 case ctags.Method: 729 factor = 7 730 case ctags.Field: 731 factor = 6 732 case ctags.EnumConstant: 733 factor = 5 734 } 735 case "Kotlin", "kotlin": 736 switch kind { 737 case ctags.Class: 738 factor = 10 739 case ctags.Interface: 740 factor = 9 741 case ctags.Method: 742 factor = 8 743 case ctags.TypeAlias: 744 factor = 7 745 case ctags.Constant: 746 factor = 6 747 case ctags.Variable: 748 factor = 5 749 } 750 case "Go", "go": 751 switch kind { 752 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 753 // for each case a description of the fields in ctags in the comment 754 case ctags.Type: // interface struct talias 755 factor = 9 756 case ctags.Interface: // interfaces 757 factor = 10 758 case ctags.Struct: // structs 759 factor = 9 760 case ctags.TypeAlias: // type aliases 761 factor = 9 762 case ctags.MethodSpec: // interface method specification 763 factor = 8.5 764 case ctags.Method, ctags.Function: // functions 765 factor = 8 766 case ctags.Field: // struct fields 767 factor = 7 768 case ctags.Constant: // constants 769 factor = 6 770 case ctags.Variable: // variables 771 factor = 5 772 } 773 774 // Boost exported go symbols. Same implementation as token.IsExported 775 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 776 factor += 0.5 777 } 778 779 if bytes.HasSuffix(filename, []byte("_test.go")) { 780 factor *= 0.8 781 } 782 783 // Could also rank on: 784 // 785 // - anonMember struct anonymous members 786 // - packageName name for specifying imported package 787 // - receiver receivers 788 // - package packages 789 // - type types 790 // - unknown unknown 791 case "C++", "c++": 792 switch kind { 793 case ctags.Class: // classes 794 factor = 10 795 case ctags.Enum: // enumeration names 796 factor = 9 797 case ctags.Function: // function definitions 798 factor = 8 799 case ctags.Struct: // structure names 800 factor = 7 801 case ctags.Union: // union names 802 factor = 6 803 case ctags.TypeAlias: // typedefs 804 factor = 5 805 case ctags.Field: // class, struct, and union members 806 factor = 4 807 case ctags.Variable: // varialbe definitions 808 factor = 3 809 } 810 // Could also rank on: 811 // NAME DESCRIPTION 812 // macro macro definitions 813 // enumerator enumerators (values inside an enumeration) 814 // header included header files 815 // namespace namespaces 816 // variable variable definitions 817 case "Scala", "scala": 818 switch kind { 819 case ctags.Class: 820 factor = 10 821 case ctags.Interface: 822 factor = 9 823 case ctags.Object: 824 factor = 8 825 case ctags.Function: 826 factor = 7 827 case ctags.Type: 828 factor = 6 829 case ctags.Variable: 830 factor = 5 831 case ctags.Package: 832 factor = 4 833 } 834 case "Python", "python": 835 switch kind { 836 case ctags.Class: // classes 837 factor = 10 838 case ctags.Function, ctags.Method: // function definitions 839 factor = 8 840 case ctags.Field: // class, struct, and union members 841 factor = 4 842 case ctags.Variable: // variable definitions 843 factor = 3 844 case ctags.Local: // local variables 845 factor = 2 846 } 847 // Could also rank on: 848 // 849 // - namespace name referring a module defined in other file 850 // - module modules 851 // - unknown name referring a class/variable/function/module defined in other module 852 // - parameter function parameters 853 case "Ruby", "ruby": 854 switch kind { 855 case ctags.Class: 856 factor = 10 857 case ctags.Method: 858 factor = 9 859 case ctags.MethodAlias: 860 factor = 8 861 case ctags.Module: 862 factor = 7 863 case ctags.SingletonMethod: 864 factor = 6 865 case ctags.Constant: 866 factor = 5 867 case ctags.Accessor: 868 factor = 4 869 case ctags.Library: 870 factor = 3 871 } 872 case "PHP", "php": 873 switch kind { 874 case ctags.Class: 875 factor = 10 876 case ctags.Interface: 877 factor = 9 878 case ctags.Function: 879 factor = 8 880 case ctags.Trait: 881 factor = 7 882 case ctags.Define: 883 factor = 6 884 case ctags.Namespace: 885 factor = 5 886 case ctags.MethodAlias: 887 factor = 4 888 case ctags.Variable: 889 factor = 3 890 case ctags.Local: 891 factor = 3 892 } 893 case "GraphQL", "graphql": 894 switch kind { 895 case ctags.Type: 896 factor = 10 897 } 898 case "Markdown", "markdown": 899 // Headers are good signal in docs, but do not rank as highly as code. 900 switch kind { 901 case ctags.Chapter: // # 902 factor = 4 903 case ctags.Section: // ## 904 factor = 3 905 case ctags.Subsection: // ### 906 factor = 2 907 } 908 } 909 910 return factor * scoreKindMatch 911} 912 913type matchScoreSlice []LineMatch 914 915func (m matchScoreSlice) Len() int { return len(m) } 916func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 917func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 918 919type chunkMatchScoreSlice []ChunkMatch 920 921func (m chunkMatchScoreSlice) Len() int { return len(m) } 922func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 923func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 924 925type fileMatchesByScore []FileMatch 926 927func (m fileMatchesByScore) Len() int { return len(m) } 928func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 929func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 930 931func sortMatchesByScore(ms []LineMatch) { 932 sort.Sort(matchScoreSlice(ms)) 933} 934 935func sortChunkMatchesByScore(ms []ChunkMatch) { 936 sort.Sort(chunkMatchScoreSlice(ms)) 937} 938 939var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == "" 940 941// SortFiles sorts files matches in the order we want to present results to 942// users. The order depends on the match score, which includes both 943// query-dependent signals like word overlap, and file-only signals like the 944// file ranks (if file ranks are enabled). 945// 946// We don't only use the scores, we will also boost some results to present 947// files with novel extensions. 948func SortFiles(ms []FileMatch) { 949 sort.Sort(fileMatchesByScore(ms)) 950 951 if doNovelty { 952 // Experimentally boost something into the third filematch 953 boostNovelExtension(ms, 2, 0.9) 954 } 955} 956 957func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 958 if len(ms) <= boostOffset+1 { 959 return 960 } 961 962 top := ms[:boostOffset] 963 candidates := ms[boostOffset:] 964 965 // Don't bother boosting something which is significantly different to the 966 // result it replaces. 967 minScoreForNovelty := candidates[0].Score * minScoreRatio 968 969 // We want to look for an ext that isn't in the top exts 970 exts := make([]string, len(top)) 971 for i := range top { 972 exts[i] = path.Ext(top[i].FileName) 973 } 974 975 for i := range candidates { 976 // Do not assume sorted due to boostNovelExtension being called on subsets 977 if candidates[i].Score < minScoreForNovelty { 978 continue 979 } 980 981 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 982 continue 983 } 984 985 // Found what we are looking for, now boost to front of candidates (which 986 // is ms[boostOffset]) 987 for ; i > 0; i-- { 988 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 989 } 990 return 991 } 992}