fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "os" 22 "path" 23 "sort" 24 "strings" 25 "unicode" 26 "unicode/utf8" 27 28 "golang.org/x/exp/slices" 29) 30 31var _ = log.Println 32 33// contentProvider is an abstraction to treat matches for names and 34// content with the same code. 35type contentProvider struct { 36 id *indexData 37 stats *Stats 38 39 // mutable 40 err error 41 idx uint32 42 _data []byte 43 _nl []uint32 44 _nlBuf []uint32 45 _sects []DocumentSection 46 _sectBuf []DocumentSection 47 fileSize uint32 48} 49 50// setDocument skips to the given document. 51func (p *contentProvider) setDocument(docID uint32) { 52 fileStart := p.id.boundaries[docID] 53 54 p.idx = docID 55 p.fileSize = p.id.boundaries[docID+1] - fileStart 56 57 p._nl = nil 58 p._sects = nil 59 p._data = nil 60} 61 62func (p *contentProvider) docSections() []DocumentSection { 63 if p._sects == nil { 64 var sz uint32 65 p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf) 66 p.stats.ContentBytesLoaded += int64(sz) 67 p._sectBuf = p._sects 68 } 69 return p._sects 70} 71 72func (p *contentProvider) newlines() newlines { 73 if p._nl == nil { 74 var sz uint32 75 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 76 p._nlBuf = p._nl 77 p.stats.ContentBytesLoaded += int64(sz) 78 } 79 return newlines{locs: p._nl, fileSize: p.fileSize} 80} 81 82func (p *contentProvider) data(fileName bool) []byte { 83 if fileName { 84 return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]] 85 } 86 87 if p._data == nil { 88 p._data, p.err = p.id.readContents(p.idx) 89 p.stats.FilesLoaded++ 90 p.stats.ContentBytesLoaded += int64(len(p._data)) 91 } 92 return p._data 93} 94 95// Find offset in bytes (relative to corpus start) for an offset in 96// runes (relative to document start). If filename is set, the corpus 97// is the set of filenames, with the document being the name itself. 98func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { 99 if p.id.metaData.PlainASCII { 100 return r 101 } 102 103 sample := p.id.runeOffsets 104 runeEnds := p.id.fileEndRunes 105 fileStartByte := p.id.boundaries[p.idx] 106 if filename { 107 sample = p.id.fileNameRuneOffsets 108 runeEnds = p.id.fileNameEndRunes 109 fileStartByte = p.id.fileNameIndex[p.idx] 110 } 111 112 absR := r 113 if p.idx > 0 { 114 absR += runeEnds[p.idx-1] 115 } 116 117 byteOff, left := sample.lookup(absR) 118 119 var data []byte 120 121 if filename { 122 data = p.id.fileNameContent[byteOff:] 123 } else { 124 data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency) 125 if p.err != nil { 126 return 0 127 } 128 } 129 for left > 0 { 130 _, sz := utf8.DecodeRune(data) 131 byteOff += uint32(sz) 132 data = data[sz:] 133 left-- 134 } 135 136 byteOff -= fileStartByte 137 return byteOff 138} 139 140func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { 141 var result []LineMatch 142 if ms[0].fileName { 143 // There is only "line" in a filename. 144 res := LineMatch{ 145 Line: p.id.fileName(p.idx), 146 FileName: true, 147 } 148 149 for _, m := range ms { 150 res.LineFragments = append(res.LineFragments, LineFragmentMatch{ 151 LineOffset: int(m.byteOffset), 152 MatchLength: int(m.byteMatchSz), 153 Offset: m.byteOffset, 154 }) 155 156 result = []LineMatch{res} 157 } 158 } else { 159 ms = breakMatchesOnNewlines(ms, p.data(false)) 160 result = p.fillContentMatches(ms, numContextLines) 161 } 162 163 sects := p.docSections() 164 for i, m := range result { 165 result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug) 166 } 167 168 return result 169} 170 171func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 172 var result []ChunkMatch 173 if ms[0].fileName { 174 // If the first match is a filename match, there will only be 175 // one match and the matched content will be the filename. 176 177 fileName := p.id.fileName(p.idx) 178 ranges := make([]Range, 0, len(ms)) 179 for _, m := range ms { 180 ranges = append(ranges, Range{ 181 Start: Location{ 182 ByteOffset: m.byteOffset, 183 LineNumber: 1, 184 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), 185 }, 186 End: Location{ 187 ByteOffset: m.byteOffset + m.byteMatchSz, 188 LineNumber: 1, 189 Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), 190 }, 191 }) 192 } 193 194 result = []ChunkMatch{{ 195 Content: fileName, 196 ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 197 Ranges: ranges, 198 FileName: true, 199 }} 200 } else { 201 result = p.fillContentChunkMatches(ms, numContextLines) 202 } 203 204 sects := p.docSections() 205 for i, m := range result { 206 result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug) 207 } 208 209 return result 210} 211 212func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { 213 var result []LineMatch 214 for len(ms) > 0 { 215 m := ms[0] 216 num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 217 218 var lineCands []*candidateMatch 219 220 endMatch := m.byteOffset + m.byteMatchSz 221 222 for len(ms) > 0 { 223 m := ms[0] 224 if int(m.byteOffset) <= lineEnd { 225 endMatch = m.byteOffset + m.byteMatchSz 226 lineCands = append(lineCands, m) 227 ms = ms[1:] 228 } else { 229 break 230 } 231 } 232 233 if len(lineCands) == 0 { 234 log.Panicf( 235 "%s %v infinite loop: num %d start,end %d,%d, offset %d", 236 p.id.fileName(p.idx), p.id.metaData, 237 num, lineStart, lineEnd, 238 m.byteOffset) 239 } 240 241 data := p.data(false) 242 243 // Due to merging matches, we may have a match that 244 // crosses a line boundary. Prevent confusion by 245 // taking lines until we pass the last match 246 for lineEnd < len(data) && endMatch > uint32(lineEnd) { 247 next := bytes.IndexByte(data[lineEnd+1:], '\n') 248 if next == -1 { 249 lineEnd = len(data) 250 } else { 251 // TODO(hanwen): test that checks "+1" part here. 252 lineEnd += next + 1 253 } 254 } 255 256 finalMatch := LineMatch{ 257 LineStart: lineStart, 258 LineEnd: lineEnd, 259 LineNumber: num, 260 } 261 finalMatch.Line = data[lineStart:lineEnd] 262 263 if numContextLines > 0 { 264 finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 265 finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 266 } 267 268 for _, m := range lineCands { 269 fragment := LineFragmentMatch{ 270 Offset: m.byteOffset, 271 LineOffset: int(m.byteOffset) - lineStart, 272 MatchLength: int(m.byteMatchSz), 273 } 274 if m.symbol { 275 start := p.id.fileEndSymbol[p.idx] 276 fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx) 277 if fragment.SymbolInfo != nil { 278 sec := p.docSections()[m.symbolIdx] 279 fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End]) 280 } 281 } 282 283 finalMatch.LineFragments = append(finalMatch.LineFragments, fragment) 284 } 285 result = append(result, finalMatch) 286 } 287 return result 288} 289 290func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 291 newlines := p.newlines() 292 chunks := chunkCandidates(ms, newlines, numContextLines) 293 data := p.data(false) 294 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 295 for _, chunk := range chunks { 296 ranges := make([]Range, 0, len(chunk.candidates)) 297 var symbolInfo []*Symbol 298 for i, cm := range chunk.candidates { 299 startOffset := cm.byteOffset 300 endOffset := cm.byteOffset + cm.byteMatchSz 301 startLine, startLineOffset, _ := newlines.atOffset(startOffset) 302 endLine, endLineOffset, _ := newlines.atOffset(endOffset) 303 304 ranges = append(ranges, Range{ 305 Start: Location{ 306 ByteOffset: startOffset, 307 LineNumber: uint32(startLine), 308 Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), 309 }, 310 End: Location{ 311 ByteOffset: endOffset, 312 LineNumber: uint32(endLine), 313 Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), 314 }, 315 }) 316 317 if cm.symbol { 318 if symbolInfo == nil { 319 symbolInfo = make([]*Symbol, len(chunk.candidates)) 320 } 321 start := p.id.fileEndSymbol[p.idx] 322 si := p.id.symbols.data(start + cm.symbolIdx) 323 if si != nil { 324 sec := p.docSections()[cm.symbolIdx] 325 si.Sym = string(data[sec.Start:sec.End]) 326 } 327 symbolInfo[i] = si 328 } 329 } 330 331 firstLineNumber := int(chunk.firstLine) - numContextLines 332 if firstLineNumber < 1 { 333 firstLineNumber = 1 334 } 335 firstLineStart, _ := newlines.lineBounds(firstLineNumber) 336 337 chunkMatches = append(chunkMatches, ChunkMatch{ 338 Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1), 339 ContentStart: Location{ 340 ByteOffset: firstLineStart, 341 LineNumber: uint32(firstLineNumber), 342 Column: 1, 343 }, 344 FileName: false, 345 Ranges: ranges, 346 SymbolInfo: symbolInfo, 347 }) 348 } 349 return chunkMatches 350} 351 352type candidateChunk struct { 353 firstLine uint32 // 1-based, inclusive 354 lastLine uint32 // 1-based, inclusive 355 minOffset uint32 // 0-based, inclusive 356 maxOffset uint32 // 0-based, exclusive 357 candidates []*candidateMatch 358} 359 360// chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 361// chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 362// it to overlap with an adjacent chunk. 363func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 364 var chunks []candidateChunk 365 for _, m := range ms { 366 startOffset := m.byteOffset 367 endOffset := m.byteOffset + m.byteMatchSz 368 firstLine, _, _ := newlines.atOffset(startOffset) 369 lastLine, _, _ := newlines.atOffset(endOffset) 370 371 if len(chunks) > 0 && int(chunks[len(chunks)-1].lastLine)+numContextLines >= firstLine-numContextLines { 372 // If a new chunk created with the current candidateMatch would 373 // overlap with the previous chunk, instead add the candidateMatch 374 // to the last chunk and extend end of the last chunk. 375 last := &chunks[len(chunks)-1] 376 last.candidates = append(last.candidates, m) 377 if last.maxOffset < endOffset { 378 last.lastLine = uint32(lastLine) 379 last.maxOffset = uint32(endOffset) 380 } 381 } else { 382 chunks = append(chunks, candidateChunk{ 383 firstLine: uint32(firstLine), 384 lastLine: uint32(lastLine), 385 minOffset: startOffset, 386 maxOffset: endOffset, 387 candidates: []*candidateMatch{m}, 388 }) 389 } 390 } 391 return chunks 392} 393 394type newlines struct { 395 // locs is the sorted set of byte offsets of the newlines in the file 396 locs []uint32 397 398 // fileSize is just the number of bytes in the file. It is stored 399 // on this struct so we can safely know the length of the last line 400 // in the file since not all files end in a newline. 401 fileSize uint32 402} 403 404// atOffset returns the line containing the offset. If the offset lands on 405// the newline ending line M, we return M. The line is characterized 406// by its linenumber (base-1, byte index of line start, byte index of 407// line end). The line end is the index of a newline, or the filesize 408// (if matching the last line of the file.) 409func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 410 idx := sort.Search(len(nls.locs), func(n int) bool { 411 return nls.locs[n] >= offset 412 }) 413 414 start, end := nls.lineBounds(idx + 1) 415 return idx + 1, int(start), int(end) 416} 417 418// lineBounds returns the byte offsets of the start and end of the 1-based 419// lineNumber. The end offset is exclusive and will not contain the line-ending 420// newline. If the line number is out of range of the lines in the file, start 421// and end will be clamped to [0,fileSize]. 422func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { 423 // nls.locs[0] + 1 is the start of the 2nd line of data. 424 startIdx := lineNumber - 2 425 endIdx := lineNumber - 1 426 427 if startIdx < 0 { 428 start = 0 429 } else if startIdx >= len(nls.locs) { 430 start = nls.fileSize 431 } else { 432 start = nls.locs[startIdx] + 1 433 } 434 435 if endIdx < 0 { 436 end = 0 437 } else if endIdx >= len(nls.locs) { 438 end = nls.fileSize 439 } else { 440 end = nls.locs[endIdx] 441 } 442 443 return start, end 444} 445 446// getLines returns a slice of data containing the lines [low, high). 447// low is 1-based and inclusive. high is 1-based and exclusive. 448func (nls newlines) getLines(data []byte, low, high int) []byte { 449 if low >= high { 450 return nil 451 } 452 453 lowStart, _ := nls.lineBounds(low) 454 _, highEnd := nls.lineBounds(high - 1) 455 456 return data[lowStart:highEnd] 457} 458 459const ( 460 // Query-dependent scoring signals. All of these together are bounded at ~9000 461 // (scoreWordMatch + scoreSymbol + scoreKindMatch * 10 + scoreFactorAtomMatch). 462 scorePartialWordMatch = 50.0 463 scoreWordMatch = 500.0 464 scoreBase = 7000.0 465 scorePartialBase = 4000.0 466 scoreSymbol = 7000.0 467 scorePartialSymbol = 4000.0 468 scoreKindMatch = 100.0 469 scoreFactorAtomMatch = 400.0 470 471 // File-only scoring signals. For now these are also bounded ~9000 to give them 472 // equal weight with the query-dependent signals. 473 scoreFileRankFactor = 9000.0 474 scoreFileOrderFactor = 10.0 475 scoreRepoRankFactor = 20.0 476 477 // Used for ordering line and chunk matches within a file. 478 scoreLineOrderFactor = 1.0 479) 480 481// findSection checks whether a section defined by offset and size lies within 482// one of the sections in secs. 483func findSection(secs []DocumentSection, off, sz uint32) (int, bool) { 484 j := sort.Search(len(secs), func(i int) bool { 485 return secs[i].End >= off+sz 486 }) 487 488 if j == len(secs) { 489 return 0, false 490 } 491 492 if secs[j].Start <= off && off+sz <= secs[j].End { 493 return j, true 494 } 495 return 0, false 496} 497 498func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) { 499 type debugScore struct { 500 score float64 501 what string 502 } 503 504 score := &debugScore{} 505 maxScore := &debugScore{} 506 507 addScore := func(what string, s float64) { 508 if s != 0 && debug { 509 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 510 } 511 score.score += s 512 } 513 514 data := p.data(m.FileName) 515 filename := p.data(true) 516 517 for i, r := range m.Ranges { 518 // calculate the start and end offset relative to the start of the content 519 relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 520 relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset) 521 522 startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset])) 523 endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset])) 524 525 score.score = 0 526 score.what = "" 527 528 if startBoundary && endBoundary { 529 addScore("WordMatch", scoreWordMatch) 530 } else if startBoundary || endBoundary { 531 addScore("PartialWordMatch", scorePartialWordMatch) 532 } 533 534 if m.FileName { 535 sep := bytes.LastIndexByte(m.Content, '/') 536 startMatch := relStartOffset == sep+1 537 endMatch := relEndOffset == len(m.Content) 538 if startMatch && endMatch { 539 addScore("Base", scoreBase) 540 } else if startMatch || endMatch { 541 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 542 } else if sep < relStartOffset { 543 addScore("InnerBase", scorePartialBase) 544 } 545 } else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok { 546 sec := secs[secIdx] 547 startMatch := sec.Start == uint32(r.Start.ByteOffset) 548 endMatch := sec.End == uint32(r.End.ByteOffset) 549 if startMatch && endMatch { 550 addScore("Symbol", scoreSymbol) 551 } else if startMatch || endMatch { 552 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 553 } else { 554 addScore("InnerSymbol", scorePartialSymbol) 555 } 556 557 var si *Symbol 558 if m.SymbolInfo != nil { 559 si = m.SymbolInfo[i] 560 } 561 if si == nil { 562 // for non-symbol queries, we need to hydrate in SymbolInfo. 563 start := p.id.fileEndSymbol[p.idx] 564 si = p.id.symbols.data(start + uint32(secIdx)) 565 } 566 if si != nil { 567 sym := sectionSlice(data, sec) 568 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) 569 } 570 } 571 572 if score.score > maxScore.score { 573 maxScore.score = score.score 574 maxScore.what = score.what 575 } 576 } 577 578 if debug { 579 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 580 } 581 582 return maxScore.score, maxScore.what 583} 584 585func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) { 586 type debugScore struct { 587 score float64 588 what string 589 } 590 591 score := &debugScore{} 592 maxScore := &debugScore{} 593 594 addScore := func(what string, s float64) { 595 if s != 0 && debug { 596 score.what += fmt.Sprintf("%s:%.2f, ", what, s) 597 } 598 score.score += s 599 } 600 601 data := p.data(m.FileName) 602 filename := p.data(true) 603 604 for _, f := range m.LineFragments { 605 startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) 606 607 end := int(f.LineOffset) + f.MatchLength 608 endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end])) 609 610 score.score = 0 611 score.what = "" 612 613 if startBoundary && endBoundary { 614 addScore("WordMatch", scoreWordMatch) 615 } else if startBoundary || endBoundary { 616 addScore("PartialWordMatch", scorePartialWordMatch) 617 } 618 619 if m.FileName { 620 sep := bytes.LastIndexByte(m.Line, '/') 621 startMatch := sep+1 == f.LineOffset 622 endMatch := len(m.Line) == f.LineOffset+f.MatchLength 623 if startMatch && endMatch { 624 addScore("Base", scoreBase) 625 } else if startMatch || endMatch { 626 addScore("EdgeBase", (scoreBase+scorePartialBase)/2) 627 } else if sep < f.LineOffset { 628 addScore("InnerBase", scorePartialBase) 629 } 630 } else if secIdx, ok := findSection(secs, f.Offset, uint32(f.MatchLength)); ok { 631 sec := secs[secIdx] 632 startMatch := sec.Start == f.Offset 633 endMatch := sec.End == f.Offset+uint32(f.MatchLength) 634 if startMatch && endMatch { 635 addScore("Symbol", scoreSymbol) 636 } else if startMatch || endMatch { 637 addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) 638 } else { 639 addScore("InnerSymbol", scorePartialSymbol) 640 } 641 642 si := f.SymbolInfo 643 if si == nil { 644 // for non-symbol queries, we need to hydrate in SymbolInfo. 645 start := p.id.fileEndSymbol[p.idx] 646 si = p.id.symbols.data(start + uint32(secIdx)) 647 } 648 if si != nil { 649 // the LineFragment may not be on a symbol, then si will be nil. 650 sym := sectionSlice(data, sec) 651 addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) 652 } 653 } 654 655 if score.score > maxScore.score { 656 maxScore.score = score.score 657 maxScore.what = score.what 658 } 659 } 660 661 if debug { 662 maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", ")) 663 } 664 665 return maxScore.score, maxScore.what 666} 667 668// sectionSlice will return data[sec.Start:sec.End] but will clip Start and 669// End such that it won't be out of range. 670func sectionSlice(data []byte, sec DocumentSection) []byte { 671 l := uint32(len(data)) 672 if sec.Start >= l { 673 return nil 674 } 675 if sec.End > l { 676 sec.End = l 677 } 678 return data[sec.Start:sec.End] 679} 680 681// scoreSymbolKind boosts a match based on the combination of language, symbol 682// and kind. The language string comes from go-enry, the symbol and kind from 683// ctags. 684func scoreSymbolKind(language string, filename []byte, sym []byte, kind string) float64 { 685 var factor float64 686 687 // Generic ranking which will be overriden by language specific ranking 688 switch kind { 689 case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 690 factor = 8 691 case "class": 692 factor = 10 693 case "struct": 694 factor = 9.5 695 case "enum": 696 factor = 9 697 case "interface": 698 factor = 8 699 case "function", "func", "method": 700 factor = 7 701 case "member", "field": 702 factor = 5.5 703 case "constant", "const": 704 factor = 5 705 case "var", "variable": 706 factor = 4 707 708 default: 709 // No idea what it is, but its something regarded as a symbol 710 factor = 1 711 } 712 713 // Refer to universal-ctags --list-kinds-full=<language> to learn about which 714 // kinds are detected for which language. 715 // 716 // Note that go-ctags uses universal-ctags's interactive mode and thus returns 717 // the full name for "kind" and not the one-letter abbreviation. 718 switch language { 719 case "Java", "java": 720 switch kind { 721 // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" 722 // to "classes" instead of "c". We have to cover both cases to support existing 723 // indexes. 724 case "class", "classes": 725 factor = 10 726 case "enum": 727 factor = 9 728 case "interface": 729 factor = 8 730 case "method": 731 factor = 7 732 case "field": 733 factor = 6 734 case "enumConstant": 735 factor = 5 736 } 737 case "Kotlin", "kotlin": 738 switch kind { 739 case "class": 740 factor = 10 741 case "interface": 742 factor = 9 743 case "method": 744 factor = 8 745 case "typealias": 746 factor = 7 747 case "constant": 748 factor = 6 749 case "variable": 750 factor = 5 751 } 752 case "Go", "go": 753 switch kind { 754 // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 755 // for each case a description of the fields in ctags in the comment 756 case "type": // interface struct talias 757 factor = 9 758 case "method", "function": // methodSpec func 759 factor = 8 760 case "variable": // var member 761 factor = 7 762 case "constant": // const 763 factor = 6 764 765 case "interface": // interfaces 766 factor = 10 767 case "struct": // structs 768 factor = 9 769 case "talias": // type aliases 770 factor = 9 771 case "methodSpec": // interface method specification 772 factor = 8.5 773 case "func": // functions 774 factor = 8 775 case "member": // struct members 776 factor = 7 777 case "const": // constants 778 factor = 6 779 case "var": // variables 780 factor = 5 781 } 782 783 // Boost exported go symbols. Same implementation as token.IsExported 784 if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { 785 factor += 0.5 786 } 787 788 if bytes.HasSuffix(filename, []byte("_test.go")) { 789 factor *= 0.8 790 } 791 792 // Could also rank on: 793 // 794 // - anonMember struct anonymous members 795 // - packageName name for specifying imported package 796 // - receiver receivers 797 // - package packages 798 // - type types 799 // - unknown unknown 800 case "C++", "c++": 801 switch kind { 802 case "class": // classes 803 factor = 10 804 case "enum": // enumeration names 805 factor = 9 806 case "function": // function definitions 807 factor = 8 808 case "struct": // structure names 809 factor = 7 810 case "union": // union names 811 factor = 6 812 case "typdef": // typedefs 813 factor = 5 814 case "member": // class, struct, and union members 815 factor = 4 816 case "variable": // varialbe definitions 817 factor = 3 818 } 819 // Could also rank on: 820 // NAME DESCRIPTION 821 // macro macro definitions 822 // enumerator enumerators (values inside an enumeration) 823 // header included header files 824 // namespace namespaces 825 // variable variable definitions 826 case "Scala", "scala": 827 switch kind { 828 case "class": 829 factor = 10 830 case "interface": 831 factor = 9 832 case "object": 833 factor = 8 834 case "method": 835 factor = 7 836 case "type": 837 factor = 6 838 case "variable": 839 factor = 5 840 case "package": 841 factor = 4 842 } 843 case "Python", "python": 844 switch kind { 845 case "class": // classes 846 factor = 10 847 case "function": // function definitions 848 factor = 8 849 case "member": // class, struct, and union members 850 factor = 4 851 case "variable": // variable definitions 852 factor = 3 853 case "local": // local variables 854 factor = 2 855 } 856 // Could also rank on: 857 // 858 // - namespace name referring a module defined in other file 859 // - module modules 860 // - unknown name referring a class/variable/function/module defined in other module 861 // - parameter function parameters 862 case "Ruby", "ruby": 863 switch kind { 864 case "class": 865 factor = 10 866 case "method": 867 factor = 9 868 case "alias": 869 factor = 8 870 case "module": 871 factor = 7 872 case "singletonMethod": 873 factor = 6 874 case "constant": 875 factor = 5 876 case "accessor": 877 factor = 4 878 case "library": 879 factor = 3 880 } 881 case "PHP", "php": 882 switch kind { 883 case "class": 884 factor = 10 885 case "interface": 886 factor = 9 887 case "function": 888 factor = 8 889 case "trait": 890 factor = 7 891 case "define": 892 factor = 6 893 case "namespace": 894 factor = 5 895 case "alias": 896 factor = 4 897 case "variable": 898 factor = 3 899 case "local": 900 factor = 3 901 } 902 case "GraphQL", "graphql": 903 switch kind { 904 case "type": 905 factor = 10 906 } 907 case "Markdown", "markdown": 908 // Headers are good signal in docs, but do not rank as highly as code. 909 switch kind { 910 case "chapter": // # 911 factor = 4 912 case "section": // ## 913 factor = 3 914 case "subsection": // ### 915 factor = 2 916 } 917 } 918 919 return factor * scoreKindMatch 920} 921 922type matchScoreSlice []LineMatch 923 924func (m matchScoreSlice) Len() int { return len(m) } 925func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 926func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 927 928type chunkMatchScoreSlice []ChunkMatch 929 930func (m chunkMatchScoreSlice) Len() int { return len(m) } 931func (m chunkMatchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 932func (m chunkMatchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score } 933 934type fileMatchesByScore []FileMatch 935 936func (m fileMatchesByScore) Len() int { return len(m) } 937func (m fileMatchesByScore) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 938func (m fileMatchesByScore) Less(i, j int) bool { return m[i].Score > m[j].Score } 939 940func sortMatchesByScore(ms []LineMatch) { 941 sort.Sort(matchScoreSlice(ms)) 942} 943 944func sortChunkMatchesByScore(ms []ChunkMatch) { 945 sort.Sort(chunkMatchScoreSlice(ms)) 946} 947 948var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == "" 949 950// SortFiles sorts files matches in the order we want to present results to 951// users. The order depends on the match score, which includes both 952// query-dependent signals like word overlap, and file-only signals like the 953// file ranks (if file ranks are enabled). 954// 955// We don't only use the scores, we will also boost some results to present 956// files with novel extensions. 957func SortFiles(ms []FileMatch) { 958 sort.Sort(fileMatchesByScore(ms)) 959 960 if doNovelty { 961 // Experimentally boost something into the third filematch 962 boostNovelExtension(ms, 2, 0.9) 963 } 964} 965 966func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) { 967 if len(ms) <= boostOffset+1 { 968 return 969 } 970 971 top := ms[:boostOffset] 972 candidates := ms[boostOffset:] 973 974 // Don't bother boosting something which is significantly different to the 975 // result it replaces. 976 minScoreForNovelty := candidates[0].Score * minScoreRatio 977 978 // We want to look for an ext that isn't in the top exts 979 exts := make([]string, len(top)) 980 for i := range top { 981 exts[i] = path.Ext(top[i].FileName) 982 } 983 984 for i := range candidates { 985 // Do not assume sorted due to boostNovelExtension being called on subsets 986 if candidates[i].Score < minScoreForNovelty { 987 continue 988 } 989 990 if slices.Contains(exts, path.Ext(candidates[i].FileName)) { 991 continue 992 } 993 994 // Found what we are looking for, now boost to front of candidates (which 995 // is ms[boostOffset]) 996 for ; i > 0; i-- { 997 candidates[i], candidates[i-1] = candidates[i-1], candidates[i] 998 } 999 return 1000 } 1001}