fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "os" 22 "path" 23 "sort" 24 "strings" 25 "unicode/utf8" 26 27 "golang.org/x/exp/slices" 28) 29 30var _ = log.Println 31 32// contentProvider is an abstraction to treat matches for names and 33// content with the same code. 34type contentProvider struct { 35 id *indexData 36 stats *Stats 37 38 // mutable 39 err error 40 idx uint32 41 _data []byte 42 _nl []uint32 43 _nlBuf []uint32 44 _sects []DocumentSection 45 _sectBuf []DocumentSection 46 fileSize uint32 47} 48 49// setDocument skips to the given document. 50func (p *contentProvider) setDocument(docID uint32) { 51 fileStart := p.id.boundaries[docID] 52 53 p.idx = docID 54 p.fileSize = p.id.boundaries[docID+1] - fileStart 55 56 p._nl = nil 57 p._sects = nil 58 p._data = nil 59} 60 61func (p *contentProvider) docSections() []DocumentSection { 62 if p._sects == nil { 63 var sz uint32 64 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 65 p.stats.ContentBytesLoaded += int64(sz) 66 p._sectBuf = p._sects 67 } 68 return p._sects 69} 70 71func (p *contentProvider) newlines() newlines { 72 if p._nl == nil { 73 var sz uint32 74 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 75 p._nlBuf = p._nl 76 p.stats.ContentBytesLoaded += int64(sz) 77 } 78 return newlines{locs: p._nl, fileSize: p.fileSize} 79} 80 81func (p *contentProvider) data(fileName bool) []byte { 82 if fileName { 83 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 84 } 85 86 if p._data == nil { 87 p._data, p.err = p.id.readContents(p.idx) 88 p.stats.FilesLoaded++ 89 p.stats.ContentBytesLoaded += int64(len(p._data)) 90 } 91 return p._data 92} 93 94// Find offset in bytes (relative to corpus start) for an offset in 95// runes (relative to document start). If filename is set, the corpus 96// is the set of filenames, with the document being the name itself. 97func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 98 if p.id.metaData.PlainASCII { 99 return r 100 } 101 102 sample := p.id.runeOffsets 103 runeEnds := p.id.fileEndRunes 104 fileStartByte := p.id.boundaries[p.idx] 105 if filename { 106 sample = p.id.fileNameRuneOffsets 107 runeEnds = p.id.fileNameEndRunes 108 fileStartByte = p.id.fileNameIndex[p.idx] 109 } 110 111 absR := r 112 if p.idx > 0 { 113 absR += runeEnds[p.idx-1] 114 } 115 116 byteOff, left := sample.lookup(absR) 117 118 var data []byte 119 120 if filename { 121 data = p.id.fileNameContent[byteOff:] 122 } else { 123 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 124 if p.err != nil { 125 return 0 126 } 127 } 128 for left > 0 { 129 _, sz := utf8.DecodeRune(data) 130 byteOff += uint32(sz) 131 data = data[sz:] 132 left-- 133 } 134 135 byteOff -= fileStartByte 136 return byteOff 137} 138 139func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 140 var result []LineMatch 141 if ms[0].fileName { 142 // There is only "line" in a filename. 143 res := LineMatch{ 144 Line: p.id.fileName(p.idx), 145 FileName: true, 146 } 147 148 for _, m := range ms { 149 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 150 LineOffset: int(m.byteOffset), 151 MatchLength: int(m.byteMatchSz), 152 Offset: m.byteOffset, 153 }) 154 155 result = []LineMatch{res} 156 } 157 } else { 158 ms = breakMatchesOnNewlines(ms, p.data(false)) 159 result = p.fillContentMatches(ms, numContextLines) 160 } 161 162 sects := p.docSections() 163 for i, m := range result { 164 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 165 } 166 167 return result 168} 169 170func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 171 var result []ChunkMatch 172 if ms[0].fileName { 173 // If the first match is a filename match, there will only be 174 // one match and the matched content will be the filename. 175 176 fileName := p.id.fileName(p.idx) 177 ranges := make([]Range, 0, len(ms)) 178 for _, m := range ms { 179 ranges = append(ranges, Range{ 180 Start: Location{ 181 ByteOffset: m.byteOffset, 182 LineNumber: 1, 183 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 184 }, 185 End: Location{ 186 ByteOffset: m.byteOffset + m.byteMatchSz, 187 LineNumber: 1, 188 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 189 }, 190 }) 191 } 192 193 result = []ChunkMatch{{ 194 Content: fileName, 195 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 196 Ranges: ranges, 197 FileName: true, 198 }} 199 } else { 200 result = p.fillContentChunkMatches(ms, numContextLines) 201 } 202 203 sects := p.docSections() 204 for i, m := range result { 205 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 206 } 207 208 return result 209} 210 211func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 212 var result []LineMatch 213 for len(ms) > 0 { 214 m := ms[0] 215 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 216 217 var lineCands []*candidateMatch 218 219 endMatch := m.byteOffset + m.byteMatchSz 220 221 for len(ms) > 0 { 222 m := ms[0] 223 if int(m.byteOffset) <= lineEnd { 224 endMatch = m.byteOffset + m.byteMatchSz 225 lineCands = append(lineCands, m) 226 ms = ms[1:] 227 } else { 228 break 229 } 230 } 231 232 if len(lineCands) == 0 { 233 log.Panicf( 234 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 235 p.id.fileName(p.idx), p.id.metaData, 236 num, lineStart, lineEnd, 237 m.byteOffset) 238 } 239 240 data := p.data(false) 241 242 // Due to merging matches, we may have a match that 243 // crosses a line boundary. Prevent confusion by 244 // taking lines until we pass the last match 245 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 246 next := bytes.IndexByte(data[lineEnd+1:], '\n') 247 if next == -1 { 248 lineEnd = len(data) 249 } else { 250 // TODO(hanwen): test that checks "+1" part here. 251 lineEnd += next + 1 252 } 253 } 254 255 finalMatch := LineMatch{ 256 LineStart: lineStart, 257 LineEnd: lineEnd, 258 LineNumber: num, 259 } 260 finalMatch.Line = data[lineStart:lineEnd] 261 262 if numContextLines > 0 { 263 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 264 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 265 } 266 267 for _, m := range lineCands { 268 fragment := LineFragmentMatch{ 269 Offset: m.byteOffset, 270 LineOffset: int(m.byteOffset) - lineStart, 271 MatchLength: int(m.byteMatchSz), 272 } 273 if m.symbol { 274 start := p.id.fileEndSymbol[p.idx] 275 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 276 if fragment.SymbolInfo != nil { 277 sec := p.docSections()[m.symbolIdx] 278 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 279 } 280 } 281 282 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 283 } 284 result = append(result, finalMatch) 285 } 286 return result 287} 288 289func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 290 newlines := p.newlines() 291 chunks := chunkCandidates(ms, newlines, numContextLines) 292 data := p.data(false) 293 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 294 for _, chunk := range chunks { 295 ranges := make([]Range, 0, len(chunk.candidates)) 296 var symbolInfo []*Symbol 297 for i, cm := range chunk.candidates { 298 startOffset := cm.byteOffset 299 endOffset := cm.byteOffset + cm.byteMatchSz 300 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 301 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 302 303 ranges = append(ranges, Range{ 304 Start: Location{ 305 ByteOffset: startOffset, 306 LineNumber: uint32(startLine), 307 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), 308 }, 309 End: Location{ 310 ByteOffset: endOffset, 311 LineNumber: uint32(endLine), 312 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), 313 }, 314 }) 315 316 if cm.symbol { 317 if symbolInfo == nil { 318 symbolInfo = make([]*Symbol, len(chunk.candidates)) 319 } 320 start := p.id.fileEndSymbol[p.idx] 321 si := p.id.symbols.data(start + cm.symbolIdx) 322 if si != nil { 323 sec := p.docSections()[cm.symbolIdx] 324 si.Sym = string(data[sec.Start:sec.End]) 325 } 326 symbolInfo[i] = si 327 } 328 } 329 330 firstLineNumber := int(chunk.firstLine) - numContextLines 331 if firstLineNumber < 1 { 332 firstLineNumber = 1 333 } 334 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 335 336 chunkMatches = append(chunkMatches, ChunkMatch{ 337 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 338 ContentStart: Location{ 339 ByteOffset: firstLineStart, 340 LineNumber: uint32(firstLineNumber), 341 Column: 1, 342 }, 343 FileName: false, 344 Ranges: ranges, 345 SymbolInfo: symbolInfo, 346 }) 347 } 348 return chunkMatches 349} 350 351type candidateChunk struct { 352 firstLine uint32 // 1-based, inclusive 353 lastLine uint32 // 1-based, inclusive 354 minOffset uint32 // 0-based, inclusive 355 maxOffset uint32 // 0-based, exclusive 356 candidates []*candidateMatch 357} 358 359// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 360// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 361// it to overlap with an adjacent chunk. 362func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 363 var chunks []candidateChunk 364 for _, m := range ms { 365 startOffset := m.byteOffset 366 endOffset := m.byteOffset + m.byteMatchSz 367 firstLine, _, _ := newlines.atOffset(startOffset) 368 lastLine, _, _ := newlines.atOffset(endOffset) 369 370 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 371 // If a new chunk created with the current candidateMatch would 372 // overlap with the previous chunk, instead add the candidateMatch 373 // to the last chunk and extend end of the last chunk. 374 last := &chunks[len(chunks)-1] 375 last.candidates = append(last.candidates, m) 376 if last.maxOffset < endOffset { 377 last.lastLine = uint32(lastLine) 378 last.maxOffset = uint32(endOffset) 379 } 380 } else { 381 chunks = append(chunks, candidateChunk{ 382 firstLine: uint32(firstLine), 383 lastLine: uint32(lastLine), 384 minOffset: startOffset, 385 maxOffset: endOffset, 386 candidates: []*candidateMatch{m}, 387 }) 388 } 389 } 390 return chunks 391} 392 393type newlines struct { 394 // locs is the sorted set of byte offsets of the newlines in the file 395 locs []uint32 396 397 // fileSize is just the number of bytes in the file. It is stored 398 // on this struct so we can safely know the length of the last line 399 // in the file since not all files end in a newline. 400 fileSize uint32 401} 402 403// atOffset returns the line containing the offset. If the offset lands on 404// the newline ending line M, we return M. The line is characterized 405// by its linenumber (base-1, byte index of line start, byte index of 406// line end). The line end is the index of a newline, or the filesize 407// (if matching the last line of the file.) 408func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 409 idx := sort.Search(len(nls.locs), func(n int) bool { 410 return nls.locs[n] >= offset 411 }) 412 413 start, end := nls.lineBounds(idx + 1) 414 return idx + 1, int(start), int(end) 415} 416 417// lineBounds returns the byte offsets of the start and end of the 1-based 418// lineNumber. The end offset is exclusive and will not contain the line-ending 419// newline. If the line number is out of range of the lines in the file, start 420// and end will be clamped to [0,fileSize]. 421func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 422 // nls.locs[0] + 1 is the start of the 2nd line of data. 423 startIdx := lineNumber - 2 424 endIdx := lineNumber - 1 425 426 if startIdx < 0 { 427 start = 0 428 } else if startIdx >= len(nls.locs) { 429 start = nls.fileSize 430 } else { 431 start = nls.locs[startIdx] + 1 432 } 433 434 if endIdx < 0 { 435 end = 0 436 } else if endIdx >= len(nls.locs) { 437 end = nls.fileSize 438 } else { 439 end = nls.locs[endIdx] 440 } 441 442 return start, end 443} 444 445// getLines returns a slice of data containing the lines [low, high). 446// low is 1-based and inclusive. high is 1-based and exclusive. 447func (nls newlines) getLines(data []byte, low, high int) []byte { 448 if low >= high { 449 return nil 450 } 451 452 lowStart, _ := nls.lineBounds(low) 453 _, highEnd := nls.lineBounds(high - 1) 454 455 return data[lowStart:highEnd] 456} 457 458const ( 459 // Query-dependent scoring signals. All of these together are bounded at ~9000 460 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 461 scorePartialWordMatch = 50.0 462 scoreWordMatch = 500.0 463 scoreBase = 7000.0 464 scorePartialBase = 4000.0 465 scoreSymbol = 7000.0 466 scorePartialSymbol = 4000.0 467 scoreKindMatch = 100.0 468 scoreFactorAtomMatch = 400.0 469 470 // File-only scoring signals. For now these are also bounded ~9000 to give them 471 // equal weight with the query-dependent signals. 472 scoreFileRankFactor = 9000.0 473 scoreFileOrderFactor = 10.0 474 scoreRepoRankFactor = 20.0 475 476 // Used for ordering line and chunk matches within a file. 477 scoreLineOrderFactor = 1.0 478) 479 480// findSection checks whether a section defined by offset and size lies within 481// one of the sections in secs. 482func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 483 j := sort.Search(len(secs), func(i int) bool { 484 return secs[i].End >= off+sz 485 }) 486 487 if j == len(secs) { 488 return 0, false 489 } 490 491 if secs[j].Start <= off && off+sz <= secs[j].End { 492 return j, true 493 } 494 return 0, false 495} 496 497func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 498 type debugScore struct { 499 score float64 500 what string 501 } 502 503 score := &debugScore{} 504 maxScore := &debugScore{} 505 506 addScore := func(what string, s float64) { 507 if s != 0 && debug { 508 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 509 } 510 score.score += s 511 } 512 513 for i, r := range m.Ranges { 514 // calculate the start and end offset relative to the start of the content 515 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 516 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 517 518 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 519 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 520 521 score.score = 0 522 score.what = "" 523 524 if startBoundary && endBoundary { 525 addScore("WordMatch", scoreWordMatch) 526 } else if startBoundary || endBoundary { 527 addScore("PartialWordMatch", scorePartialWordMatch) 528 } 529 530 if m.FileName { 531 sep := bytes.LastIndexByte(m.Content, '/') 532 startMatch := relStartOffset == sep+1 533 endMatch := relEndOffset == len(m.Content) 534 if startMatch && endMatch { 535 addScore("Base", scoreBase) 536 } else if startMatch || endMatch { 537 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 538 } else if sep < relStartOffset { 539 addScore("InnerBase", scorePartialBase) 540 } 541 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 542 sec := secs[secIdx] 543 startMatch := sec.Start == uint32(r.Start.ByteOffset) 544 endMatch := sec.End == uint32(r.End.ByteOffset) 545 if startMatch && endMatch { 546 addScore("Symbol", scoreSymbol) 547 } else if startMatch || endMatch { 548 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 549 } else { 550 addScore("InnerSymbol", scorePartialSymbol) 551 } 552 553 var si *Symbol 554 if m.SymbolInfo != nil { 555 si = m.SymbolInfo[i] 556 } 557 if si == nil { 558 // for non-symbol queries, we need to hydrate in SymbolInfo. 559 start := p.id.fileEndSymbol[p.idx] 560 si = p.id.symbols.data(start + uint32(secIdx)) 561 } 562 if si != nil { 563 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) 564 } 565 } 566 567 if score.score > maxScore.score { 568 maxScore.score = score.score 569 maxScore.what = score.what 570 } 571 } 572 573 if debug { 574 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 575 } 576 577 return maxScore.score, maxScore.what 578} 579 580func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) { 581 type debugScore struct { 582 score float64 583 what string 584 } 585 586 score := &debugScore{} 587 maxScore := &debugScore{} 588 589 addScore := func(what string, s float64) { 590 if s != 0 && debug { 591 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 592 } 593 score.score += s 594 } 595 596 for _, f := range m.LineFragments { 597 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) 598 599 end := int(f.LineOffset) + f.MatchLength 600 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) 601 602 score.score = 0 603 score.what = "" 604 605 if startBoundary && endBoundary { 606 addScore("WordMatch", scoreWordMatch) 607 } else if startBoundary || endBoundary { 608 addScore("PartialWordMatch", scorePartialWordMatch) 609 } 610 611 if m.FileName { 612 sep := bytes.LastIndexByte(m.Line, '/') 613 startMatch := sep+1 == f.LineOffset 614 endMatch := len(m.Line) == f.LineOffset+f.MatchLength 615 if startMatch && endMatch { 616 addScore("Base", scoreBase) 617 } else if startMatch || endMatch { 618 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 619 } else if sep < f.LineOffset { 620 addScore("InnerBase", scorePartialBase) 621 } 622 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 623 sec := secs[secIdx] 624 startMatch := sec.Start == f.Offset 625 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 626 if startMatch && endMatch { 627 addScore("Symbol", scoreSymbol) 628 } else if startMatch || endMatch { 629 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 630 } else { 631 addScore("InnerSymbol", scorePartialSymbol) 632 } 633 634 si := f.SymbolInfo 635 if si == nil { 636 // for non-symbol queries, we need to hydrate in SymbolInfo. 637 start := p.id.fileEndSymbol[p.idx] 638 si = p.id.symbols.data(start + uint32(secIdx)) 639 } 640 if si != nil { 641 // the LineFragment may not be on a symbol, then si will be nil. 642 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) 643 } 644 } 645 646 if score.score > maxScore.score { 647 maxScore.score = score.score 648 maxScore.what = score.what 649 } 650 } 651 652 if debug { 653 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 654 } 655 656 return maxScore.score, maxScore.what 657} 658 659// scoreKind boosts a match based on the combination of language and kind. The 660// language string comes from go-enry, the kind string from ctags. 661func scoreKind(language string, kind string) float64 { 662 var factor float64 663 664 // Generic ranking which will be overriden by language specific ranking 665 switch kind { 666 case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 667 factor = 8 668 case "class": 669 factor = 10 670 case "struct": 671 factor = 9.5 672 case "enum": 673 factor = 9 674 case "interface": 675 factor = 8 676 case "function", "func", "method": 677 factor = 7 678 case "member", "field": 679 factor = 5.5 680 case "constant", "const": 681 factor = 5 682 case "var", "variable": 683 factor = 4 684 685 default: 686 // No idea what it is, but its something regarded as a symbol 687 factor = 1 688 } 689 690 // Refer to universal-ctags --list-kinds-full=<language> to learn about which 691 // kinds are detected for which language. 692 // 693 // Note that go-ctags uses universal-ctags's interactive mode and thus returns 694 // the full name for "kind" and not the one-letter abbreviation. 695 switch language { 696 case "Java", "java": 697 switch kind { 698 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 699 // to "classes" instead of "c". We have to cover both cases to support existing 700 // indexes. 701 case "class", "classes": 702 factor = 10 703 case "enum": 704 factor = 9 705 case "interface": 706 factor = 8 707 case "method": 708 factor = 7 709 case "field": 710 factor = 6 711 case "enumConstant": 712 factor = 5 713 } 714 case "Kotlin", "kotlin": 715 switch kind { 716 case "class": 717 factor = 10 718 case "interface": 719 factor = 9 720 case "method": 721 factor = 8 722 case "typealias": 723 factor = 7 724 case "constant": 725 factor = 6 726 case "variable": 727 factor = 5 728 } 729 case "Go", "go": 730 switch kind { 731 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 732 // for each case a description of the fields in ctags in the comment 733 case "type": // interface struct talias 734 factor = 10 735 case "method", "function": // methodSpec 736 factor = 8 737 case "variable": // var member 738 factor = 7 739 case "constant": // const 740 factor = 6 741 742 case "interface": // interfaces 743 factor = 10 744 case "struct": // structs 745 factor = 9 746 case "talias": // type aliases 747 factor = 9 748 case "methodSpec": // interface method specification 749 factor = 8.5 750 case "func": // functions 751 factor = 8 752 case "member": // struct members 753 factor = 7 754 case "const": // constants 755 factor = 6 756 case "var": // variables 757 factor = 5 758 } 759 // Could also rank on: 760 // 761 // - anonMember struct anonymous members 762 // - packageName name for specifying imported package 763 // - receiver receivers 764 // - package packages 765 // - type types 766 // - unknown unknown 767 case "C++", "c++": 768 switch kind { 769 case "class": // classes 770 factor = 10 771 case "enum": // enumeration names 772 factor = 9 773 case "function": // function definitions 774 factor = 8 775 case "struct": // structure names 776 factor = 7 777 case "union": // union names 778 factor = 6 779 case "typdef": // typedefs 780 factor = 5 781 case "member": // class, struct, and union members 782 factor = 4 783 case "variable": // varialbe definitions 784 factor = 3 785 } 786 // Could also rank on: 787 // NAME DESCRIPTION 788 // macro macro definitions 789 // enumerator enumerators (values inside an enumeration) 790 // header included header files 791 // namespace namespaces 792 // variable variable definitions 793 case "Scala", "scala": 794 switch kind { 795 case "class": 796 factor = 10 797 case "interface": 798 factor = 9 799 case "object": 800 factor = 8 801 case "method": 802 factor = 7 803 case "type": 804 factor = 6 805 case "variable": 806 factor = 5 807 case "package": 808 factor = 4 809 } 810 case "Python", "python": 811 switch kind { 812 case "class": // classes 813 factor = 10 814 case "function": // function definitions 815 factor = 8 816 case "member": // class, struct, and union members 817 factor = 4 818 case "variable": // variable definitions 819 factor = 3 820 case "local": // local variables 821 factor = 2 822 } 823 // Could also rank on: 824 // 825 // - namespace name referring a module defined in other file 826 // - module modules 827 // - unknown name referring a class/variable/function/module defined in other module 828 // - parameter function parameters 829 case "Ruby", "ruby": 830 switch kind { 831 case "class": 832 factor = 10 833 case "method": 834 factor = 9 835 case "alias": 836 factor = 8 837 case "module": 838 factor = 7 839 case "singletonMethod": 840 factor = 6 841 case "constant": 842 factor = 5 843 case "accessor": 844 factor = 4 845 case "library": 846 factor = 3 847 } 848 case "PHP", "php": 849 switch kind { 850 case "class": 851 factor = 10 852 case "interface": 853 factor = 9 854 case "function": 855 factor = 8 856 case "trait": 857 factor = 7 858 case "define": 859 factor = 6 860 case "namespace": 861 factor = 5 862 case "alias": 863 factor = 4 864 case "variable": 865 factor = 3 866 case "local": 867 factor = 3 868 } 869 case "GraphQL", "graphql": 870 switch kind { 871 case "type": 872 factor = 10 873 } 874 case "Markdown", "markdown": 875 // Headers are good signal in docs, but do not rank as highly as code. 876 switch kind { 877 case "chapter": // # 878 factor = 4 879 case "section": // ## 880 factor = 3 881 case "subsection": // ### 882 factor = 2 883 } 884 } 885 886 return factor * scoreKindMatch 887} 888 889type matchScoreSlice []LineMatch 890 891func (m matchScoreSlice) Len() int { return len(m) } 892func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 893func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 894 895type chunkMatchScoreSlice []ChunkMatch 896 897func (m chunkMatchScoreSlice) Len() int { return len(m) } 898func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 899func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 900 901type fileMatchesByScore []FileMatch 902 903func (m fileMatchesByScore) Len() int { return len(m) } 904func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 905func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 906 907func sortMatchesByScore(ms []LineMatch) { 908 sort.Sort(matchScoreSlice(ms)) 909} 910 911func sortChunkMatchesByScore(ms []ChunkMatch) { 912 sort.Sort(chunkMatchScoreSlice(ms)) 913} 914 915var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == "" 916 917// SortFiles sorts files matches in the order we want to present results to 918// users. The order depends on the match score, which includes both 919// query-dependent signals like word overlap, and file-only signals like the 920// file ranks (if file ranks are enabled). 921// 922// We don't only use the scores, we will also boost some results to present 923// files with novel extensions. 924func SortFiles(ms []FileMatch) { 925 sort.Sort(fileMatchesByScore(ms)) 926 927 if doNovelty { 928 // Experimentally boost something into the third filematch 929 boostNovelExtension(ms, 2, 0.9) 930 } 931} 932 933func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 934 if len(ms) <= boostOffset+1 { 935 return 936 } 937 938 top := ms[:boostOffset] 939 candidates := ms[boostOffset:] 940 941 // Don't bother boosting something which is significantly different to the 942 // result it replaces. 943 minScoreForNovelty := candidates[0].Score * minScoreRatio 944 945 // We want to look for an ext that isn't in the top exts 946 exts := make([]string, len(top)) 947 for i := range top { 948 exts[i] = path.Ext(top[i].FileName) 949 } 950 951 for i := range candidates { 952 // Do not assume sorted due to boostNovelExtension being called on subsets 953 if candidates[i].Score < minScoreForNovelty { 954 continue 955 } 956 957 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 958 continue 959 } 960 961 // Found what we are looking for, now boost to front of candidates (which 962 // is ms[boostOffset]) 963 for ; i > 0; i-- { 964 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 965 } 966 return 967 } 968}