fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/cespare/xxhash/v2" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt" 30 "github.com/sourcegraph/zoekt/internal/syntaxutil" 31 "github.com/sourcegraph/zoekt/query" 32) 33 34// A docIterator iterates over documents in order. 35type docIterator interface { 36 // provide the next document where we may find something interesting. 37 // 38 // This is like a "peek" and shouldn't mutate state. prepare is what should 39 // change state. 40 nextDoc() uint32 41 42 // clears any per-document state of the docIterator, and 43 // prepares for evaluating the given doc. The argument is 44 // strictly increasing over time. 45 prepare(nextDoc uint32) 46} 47 48// costs are passed in increasing order to matchTree.matches until they do not 49// return matchesRequiresHigherCost. 50const ( 51 costConst = 0 52 costMemory = 1 53 costContent = 2 54 costRegexp = 3 55) 56 57const ( 58 costMin = costConst 59 costMax = costRegexp 60) 61 62// matchesState is an enum for the state of a matchTree after a call to 63// matchTree.matches. 64type matchesState uint8 65 66const ( 67 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 68 // a search yet since the cost value is not high enough. 69 matchesRequiresHigherCost matchesState = iota 70 71 // matchesFound is returned when matchTree.matches has done a search and 72 // found one or more matches. 73 matchesFound 74 75 // matchesNone is returned when matchTree.matches has done a search and 76 // found nothing. 77 matchesNone 78) 79 80// matchesStatePred is a helper which returns matchesFound if b is true 81// otherwise returns matchesNone. 82func matchesStatePred(b bool) matchesState { 83 if b { 84 return matchesFound 85 } 86 return matchesNone 87} 88 89// matchesStateForSlice is a helper which returns matchesFound if v is 90// non-empty otherwise returns matchesNone. 91func matchesStateForSlice[T any](v []T) matchesState { 92 return matchesStatePred(len(v) > 0) 93} 94 95// An expression tree coupled with matches. The matchtree has two 96// functions: 97// 98// * it implements boolean combinations (and, or, not) 99// 100// * it implements shortcuts, where we skip documents (for example: if 101// there are no trigram matches, we can be sure there are no substring 102// matches). The matchtree iterates over the documents as they are 103// ordered in the shard. 104// 105// The general process for a given (shard, query) is: 106// 107// - construct matchTree for the query 108// 109// - find all different leaf matchTrees (substring, regexp, etc.) 110// 111// in a loop: 112// 113// - find next doc to process using nextDoc 114// 115// - evaluate atoms (leaf expressions that match text) 116// 117// - evaluate the tree using matches(), storing the result in map. 118// 119// - if the complete tree returns (matches() != matchesRequiresHigherCost) 120// for the document, collect all text matches by looking at leaf 121// matchTrees. 122type matchTree interface { 123 docIterator 124 125 // matches if cost is high enough. See documentation for matchesState's 126 // values. 127 // 128 // Note: Do not call this directly, rather use evalMatchTree which uses 129 // known to cache responses once the state transitions away from 130 // matchesRequiresHigherCost. 131 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 132} 133 134// docMatchTree iterates over documents for which predicate(docID) returns true. 135type docMatchTree struct { 136 // the number of documents in a shard. 137 numDocs uint32 138 139 predicate func(docID uint32) bool 140 141 // provides additional information about the reason why the docMatchTree was 142 // created. 143 reason string 144 145 // mutable 146 firstDone bool 147 docID uint32 148} 149 150type bruteForceMatchTree struct { 151 // mutable 152 firstDone bool 153 docID uint32 154} 155 156type andLineMatchTree struct { 157 andMatchTree 158} 159 160type andMatchTree struct { 161 children []matchTree 162} 163 164type orMatchTree struct { 165 children []matchTree 166} 167 168type notMatchTree struct { 169 child matchTree 170} 171 172// Returns only the filename of child matches. 173type fileNameMatchTree struct { 174 child matchTree 175} 176 177type boostMatchTree struct { 178 child matchTree 179 boost float64 180} 181 182// Don't visit this subtree for collecting matches. 183type noVisitMatchTree struct { 184 matchTree 185} 186 187type regexpMatchTree struct { 188 regexp *regexp.Regexp 189 190 // origRegexp is the original parsed regexp from the query structure. It 191 // does not include mutations such as case sensitivity. 192 origRegexp *syntax.Regexp 193 194 fileName bool 195 196 // mutable 197 reEvaluated bool 198 found []*candidateMatch 199 200 // nextDoc, prepare. 201 bruteForceMatchTree 202} 203 204func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 205 prefix := "" 206 if !s.CaseSensitive { 207 prefix = "(?i)" 208 } 209 210 return &regexpMatchTree{ 211 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 212 origRegexp: s.Regexp, 213 fileName: s.FileName, 214 } 215} 216 217// \bLITERAL\b 218type wordMatchTree struct { 219 word string 220 221 fileName bool 222 223 // mutable 224 evaluated bool 225 found []*candidateMatch 226 227 // nextDoc, prepare. 228 bruteForceMatchTree 229} 230 231type substrMatchTree struct { 232 matchIterator 233 234 query *query.Substring 235 caseSensitive bool 236 fileName bool 237 238 // mutable 239 current []*candidateMatch 240 contEvaluated bool 241} 242 243type branchQueryMatchTree struct { 244 fileMasks []uint64 245 masks []uint64 246 repos []uint16 247 248 // mutable 249 firstDone bool 250 docID uint32 251} 252 253func (t *branchQueryMatchTree) branchMask() uint64 { 254 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 255} 256 257type symbolRegexpMatchTree struct { 258 matchTree 259 regexp *regexp.Regexp 260 all bool // skips regex match if .* 261 262 reEvaluated bool 263 found []*candidateMatch 264} 265 266func (t *symbolRegexpMatchTree) prepare(doc uint32) { 267 t.reEvaluated = false 268 t.found = t.found[:0] 269 t.matchTree.prepare(doc) 270} 271 272func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 273 if t.reEvaluated { 274 return matchesStateForSlice(t.found) 275 } 276 277 if cost < costRegexp { 278 return matchesRequiresHigherCost 279 } 280 281 sections := cp.docSections() 282 content := cp.data(false) 283 284 found := t.found[:0] 285 for i, sec := range sections { 286 var idx []int 287 if t.all { 288 idx = []int{0, int(sec.End - sec.Start)} 289 } else { 290 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 291 if idx == nil { 292 continue 293 } 294 } 295 296 cm := &candidateMatch{ 297 byteOffset: sec.Start + uint32(idx[0]), 298 byteMatchSz: uint32(idx[1] - idx[0]), 299 symbol: true, 300 symbolIdx: uint32(i), 301 } 302 found = append(found, cm) 303 } 304 t.found = found 305 t.reEvaluated = true 306 307 return matchesStateForSlice(t.found) 308} 309 310type symbolSubstrMatchTree struct { 311 *substrMatchTree 312 313 patternSize uint32 314 fileEndRunes []uint32 315 fileEndSymbol []uint32 316 317 doc uint32 318 sections []DocumentSection 319 320 secID uint32 321} 322 323func (t *symbolSubstrMatchTree) prepare(doc uint32) { 324 t.substrMatchTree.prepare(doc) 325 t.doc = doc 326 327 var fileStart uint32 328 if doc > 0 { 329 fileStart = t.fileEndRunes[doc-1] 330 } 331 332 var sections []DocumentSection 333 if len(t.sections) > 0 { 334 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 335 if most == uint32(len(t.sections)) { 336 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 337 } else { 338 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 339 t.secID++ 340 } 341 342 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 343 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 344 symbolEnd++ 345 } 346 347 sections = t.sections[t.secID:symbolEnd] 348 } 349 } 350 351 secIdx := 0 352 trimmed := t.current[:0] 353 for len(sections) > secIdx && len(t.current) > 0 { 354 start := fileStart + t.current[0].runeOffset 355 end := start + t.patternSize 356 if start >= sections[secIdx].End { 357 secIdx++ 358 continue 359 } 360 361 if start < sections[secIdx].Start { 362 t.current = t.current[1:] 363 continue 364 } 365 366 if end <= sections[secIdx].End { 367 t.current[0].symbol = true 368 t.current[0].symbolIdx = uint32(secIdx) 369 trimmed = append(trimmed, t.current[0]) 370 } 371 372 t.current = t.current[1:] 373 } 374 t.current = trimmed 375} 376 377// all prepare methods 378 379func (t *bruteForceMatchTree) prepare(doc uint32) { 380 t.docID = doc 381 t.firstDone = true 382} 383 384func (t *docMatchTree) prepare(doc uint32) { 385 t.docID = doc 386 t.firstDone = true 387} 388 389func (t *andMatchTree) prepare(doc uint32) { 390 for _, c := range t.children { 391 c.prepare(doc) 392 } 393} 394 395func (t *regexpMatchTree) prepare(doc uint32) { 396 t.found = t.found[:0] 397 t.reEvaluated = false 398 t.bruteForceMatchTree.prepare(doc) 399} 400 401func (t *wordMatchTree) prepare(doc uint32) { 402 t.found = t.found[:0] 403 t.evaluated = false 404 t.bruteForceMatchTree.prepare(doc) 405} 406 407func (t *orMatchTree) prepare(doc uint32) { 408 for _, c := range t.children { 409 c.prepare(doc) 410 } 411} 412 413func (t *notMatchTree) prepare(doc uint32) { 414 t.child.prepare(doc) 415} 416 417func (t *fileNameMatchTree) prepare(doc uint32) { 418 t.child.prepare(doc) 419} 420 421func (t *boostMatchTree) prepare(doc uint32) { 422 t.child.prepare(doc) 423} 424 425func (t *substrMatchTree) prepare(nextDoc uint32) { 426 t.matchIterator.prepare(nextDoc) 427 t.current = t.matchIterator.candidates() 428 t.contEvaluated = false 429} 430 431func (t *branchQueryMatchTree) prepare(doc uint32) { 432 t.firstDone = true 433 t.docID = doc 434} 435 436// nextDoc 437 438func (t *docMatchTree) nextDoc() uint32 { 439 var start uint32 440 if t.firstDone { 441 start = t.docID + 1 442 } 443 for i := start; i < t.numDocs; i++ { 444 if t.predicate(i) { 445 return i 446 } 447 } 448 return math.MaxUint32 449} 450 451func (t *bruteForceMatchTree) nextDoc() uint32 { 452 if !t.firstDone { 453 return 0 454 } 455 return t.docID + 1 456} 457 458func (t *andMatchTree) nextDoc() uint32 { 459 var max uint32 460 for _, c := range t.children { 461 m := c.nextDoc() 462 if m > max { 463 max = m 464 } 465 } 466 return max 467} 468 469func (t *orMatchTree) nextDoc() uint32 { 470 min := uint32(math.MaxUint32) 471 for _, c := range t.children { 472 m := c.nextDoc() 473 if m < min { 474 min = m 475 } 476 } 477 return min 478} 479 480func (t *notMatchTree) nextDoc() uint32 { 481 return 0 482} 483 484func (t *fileNameMatchTree) nextDoc() uint32 { 485 return t.child.nextDoc() 486} 487 488func (t *boostMatchTree) nextDoc() uint32 { 489 return t.child.nextDoc() 490} 491 492func (t *branchQueryMatchTree) nextDoc() uint32 { 493 var start uint32 494 if t.firstDone { 495 start = t.docID + 1 496 } 497 498 for i := start; i < uint32(len(t.fileMasks)); i++ { 499 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 500 return i 501 } 502 } 503 return math.MaxUint32 504} 505 506// all String methods 507 508func (t *bruteForceMatchTree) String() string { 509 return "all" 510} 511 512func (t *docMatchTree) String() string { 513 return fmt.Sprintf("doc(%s)", t.reason) 514} 515 516func (t *andMatchTree) String() string { 517 return fmt.Sprintf("and%v", t.children) 518} 519 520func (t *regexpMatchTree) String() string { 521 f := "" 522 if t.fileName { 523 f = "f" 524 } 525 return fmt.Sprintf("%sre(%s)", f, t.regexp) 526} 527 528func (t *wordMatchTree) String() string { 529 f := "" 530 if t.fileName { 531 f = "f" 532 } 533 return fmt.Sprintf("%sword(%s)", f, t.word) 534} 535 536func (t *orMatchTree) String() string { 537 return fmt.Sprintf("or%v", t.children) 538} 539 540func (t *notMatchTree) String() string { 541 return fmt.Sprintf("not(%v)", t.child) 542} 543 544func (t *noVisitMatchTree) String() string { 545 return fmt.Sprintf("novisit(%v)", t.matchTree) 546} 547 548func (t *fileNameMatchTree) String() string { 549 return fmt.Sprintf("f(%v)", t.child) 550} 551 552func (t *boostMatchTree) String() string { 553 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 554} 555 556func (t *substrMatchTree) String() string { 557 f := "" 558 if t.fileName { 559 f = "f" 560 } 561 562 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 563} 564 565func (t *branchQueryMatchTree) String() string { 566 return fmt.Sprintf("branch(%x)", t.masks) 567} 568 569func (t *symbolSubstrMatchTree) String() string { 570 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 571} 572 573func (t *symbolRegexpMatchTree) String() string { 574 return fmt.Sprintf("symbol(%v)", t.matchTree) 575} 576 577// visitMatches visits all atoms in matchTree. Note: This visits 578// noVisitMatchTree. For collecting matches use visitMatches. 579func visitMatchTree(t matchTree, f func(matchTree)) { 580 switch s := t.(type) { 581 case *andMatchTree: 582 for _, ch := range s.children { 583 visitMatchTree(ch, f) 584 } 585 case *orMatchTree: 586 for _, ch := range s.children { 587 visitMatchTree(ch, f) 588 } 589 case *andLineMatchTree: 590 visitMatchTree(&s.andMatchTree, f) 591 case *noVisitMatchTree: 592 visitMatchTree(s.matchTree, f) 593 case *notMatchTree: 594 visitMatchTree(s.child, f) 595 case *fileNameMatchTree: 596 visitMatchTree(s.child, f) 597 case *boostMatchTree: 598 visitMatchTree(s.child, f) 599 case *symbolSubstrMatchTree: 600 visitMatchTree(s.substrMatchTree, f) 601 case *symbolRegexpMatchTree: 602 visitMatchTree(s.matchTree, f) 603 default: 604 f(t) 605 } 606} 607 608// updateMatchTreeStats calls updateStats on all atoms in mt which have that 609// function defined. 610func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) { 611 visitMatchTree(mt, func(mt matchTree) { 612 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok { 613 atom.updateStats(stats) 614 } 615 }) 616} 617 618func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 619 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 620 f(mt) 621 }) 622} 623 624// visitMatches visits all atoms which can contribute matches. Note: This 625// skips noVisitMatchTree. 626func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 627 switch s := t.(type) { 628 case *andMatchTree: 629 for _, ch := range s.children { 630 if known[ch] { 631 visitMatches(ch, known, weight, f) 632 } 633 } 634 case *andLineMatchTree: 635 visitMatches(&s.andMatchTree, known, weight, f) 636 case *orMatchTree: 637 for _, ch := range s.children { 638 if known[ch] { 639 visitMatches(ch, known, weight, f) 640 } 641 } 642 case *boostMatchTree: 643 visitMatches(s.child, known, weight*s.boost, f) 644 case *symbolSubstrMatchTree: 645 visitMatches(s.substrMatchTree, known, weight, f) 646 case *notMatchTree: 647 case *noVisitMatchTree: 648 // don't collect into negative trees. 649 case *fileNameMatchTree: 650 // We will just gather the filename if we do not visit this tree. 651 default: 652 f(s, weight) 653 } 654} 655 656// all matches() methods. 657 658func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 659 return matchesStatePred(t.predicate(cp.idx)) 660} 661 662func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 663 return matchesFound 664} 665 666// andLineMatchTree is a performance optimization of andMatchTree. For content 667// searches we don't want to run the regex engine if there is no line that 668// contains matches from all terms. 669func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 670 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 671 return state 672 } 673 674 // Invariant: all children have matches. If any line contains all of them we 675 // can return MatchesFound. 676 677 // find child with fewest candidates 678 minCount := math.MaxInt 679 fewestChildren := 0 680 for ix, child := range t.children { 681 v, ok := child.(*substrMatchTree) 682 // make sure we are running a content search and that all candidates are a 683 // substrMatchTree 684 if !ok || v.fileName { 685 return matchesFound 686 } 687 if len(v.current) < minCount { 688 minCount = len(v.current) 689 fewestChildren = ix 690 } 691 } 692 693 type lineRange struct { 694 start int 695 end int 696 } 697 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 698 prev := -1 699 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 700 line := cp.newlines().atOffset(candidate.byteOffset) 701 if line == prev { 702 continue 703 } 704 prev = line 705 byteStart := int(cp.newlines().lineStart(line)) 706 byteEnd := int(cp.newlines().lineStart(line + 1)) 707 lines = append(lines, lineRange{byteStart, byteEnd}) 708 } 709 710 // children keeps track of the children's candidates we have already seen. 711 children := make([][]*candidateMatch, 0, len(t.children)-1) 712 for j, child := range t.children { 713 if j == fewestChildren { 714 continue 715 } 716 children = append(children, child.(*substrMatchTree).current) 717 } 718 719nextLine: 720 for i := 0; i < len(lines); i++ { 721 hits := 1 722 nextChild: 723 for j := range children { 724 nextCandidate: 725 for len(children[j]) > 0 { 726 candidate := children[j][0] 727 bo := int(cp.findOffset(false, candidate.runeOffset)) 728 if bo < lines[i].start { 729 children[j] = children[j][1:] 730 continue nextCandidate 731 } 732 if bo < lines[i].end { 733 hits++ 734 continue nextChild 735 } 736 // move the `lines` iterator forward until bo < line.end 737 for i < len(lines) && bo >= lines[i].end { 738 i++ 739 } 740 i-- 741 continue nextLine 742 } 743 } 744 // return early once we found any line that contains matches from all children 745 if hits == len(t.children) { 746 return matchesFound 747 } 748 } 749 return matchesNone 750} 751 752func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 753 // We have found matches unless a child needs to do more work or it hasn't 754 // found matches. 755 state := matchesFound 756 757 for _, ch := range t.children { 758 switch evalMatchTree(cp, cost, known, ch) { 759 case matchesRequiresHigherCost: 760 // keep evaluating other children incase we come across matchesNone 761 state = matchesRequiresHigherCost 762 case matchesFound: 763 // will return this if every child has this value 764 case matchesNone: 765 return matchesNone 766 } 767 } 768 769 return state 770} 771 772func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 773 // we could short-circuit, but we want to use the other possibilities as a 774 // ranking signal. So we always return the most conservative state. 775 state := matchesNone 776 for _, ch := range t.children { 777 switch evalMatchTree(cp, cost, known, ch) { 778 case matchesRequiresHigherCost: 779 state = matchesRequiresHigherCost 780 case matchesFound: 781 if state != matchesRequiresHigherCost { 782 state = matchesFound 783 } 784 case matchesNone: 785 // noop 786 } 787 } 788 return state 789} 790 791func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 792 return matchesStatePred(t.branchMask() != 0) 793} 794 795func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 796 if t.reEvaluated { 797 return matchesStateForSlice(t.found) 798 } 799 800 if cost < costRegexp { 801 return matchesRequiresHigherCost 802 } 803 804 cp.stats.RegexpsConsidered++ 805 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 806 found := t.found[:0] 807 for _, idx := range idxs { 808 cm := &candidateMatch{ 809 byteOffset: uint32(idx[0]), 810 byteMatchSz: uint32(idx[1] - idx[0]), 811 fileName: t.fileName, 812 } 813 814 found = append(found, cm) 815 } 816 t.found = found 817 t.reEvaluated = true 818 819 return matchesStateForSlice(t.found) 820} 821 822func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 823 if t.evaluated { 824 return matchesStateForSlice(t.found) 825 } 826 827 if cost < costRegexp { 828 return matchesRequiresHigherCost 829 } 830 831 data := cp.data(t.fileName) 832 offset := 0 833 found := t.found[:0] 834 for { 835 idx := bytes.Index(data[offset:], []byte(t.word)) 836 if idx < 0 { 837 break 838 } 839 840 relStartOffset := offset + idx 841 relEndOffset := relStartOffset + len(t.word) 842 843 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 844 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 845 if startBoundary && endBoundary { 846 found = append(found, &candidateMatch{ 847 byteOffset: uint32(offset + idx), 848 byteMatchSz: uint32(len(t.word)), 849 fileName: t.fileName, 850 }) 851 } 852 offset += idx + len(t.word) 853 } 854 855 t.found = found 856 t.evaluated = true 857 858 return matchesStateForSlice(t.found) 859} 860 861// breakMatchesOnNewlines returns matches resulting from breaking each element 862// of cms on newlines within text. 863func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 864 var lineCMs []*candidateMatch 865 for _, cm := range cms { 866 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 867 } 868 return lineCMs 869} 870 871// breakOnNewlines returns matches resulting from breaking cm on newlines 872// within text. 873func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 874 var cms []*candidateMatch 875 addMe := &candidateMatch{} 876 *addMe = *cm 877 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 878 if text[i] == '\n' { 879 addMe.byteMatchSz = i - addMe.byteOffset 880 if addMe.byteMatchSz != 0 { 881 cms = append(cms, addMe) 882 } 883 884 addMe = &candidateMatch{} 885 *addMe = *cm 886 addMe.byteOffset = i + 1 887 } 888 } 889 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 890 if addMe.byteMatchSz != 0 { 891 cms = append(cms, addMe) 892 } 893 return cms 894} 895 896// evalMatchTree should be called instead of directly calling 897// matchTree.matches. It cache known values for future evaluation at higher 898// costs. 899func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 900 if v, ok := known[mt]; ok { 901 return matchesStatePred(v) 902 } 903 904 ms := mt.matches(cp, cost, known) 905 if ms != matchesRequiresHigherCost { 906 known[mt] = ms == matchesFound 907 } 908 909 return ms 910} 911 912func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 913 switch evalMatchTree(cp, cost, known, t.child) { 914 case matchesRequiresHigherCost: 915 return matchesRequiresHigherCost 916 case matchesFound: 917 return matchesNone 918 case matchesNone: 919 return matchesFound 920 default: 921 panic("unreachable") 922 } 923} 924 925func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 926 return evalMatchTree(cp, cost, known, t.child) 927} 928 929func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 930 return evalMatchTree(cp, cost, known, t.child) 931} 932 933func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 934 if t.contEvaluated { 935 return matchesStateForSlice(t.current) 936 } 937 938 if len(t.current) == 0 { 939 return matchesNone 940 } 941 942 if t.fileName && cost < costMemory { 943 return matchesRequiresHigherCost 944 } 945 946 if !t.fileName && cost < costContent { 947 return matchesRequiresHigherCost 948 } 949 950 pruned := t.current[:0] 951 for _, m := range t.current { 952 if m.byteOffset == 0 && m.runeOffset > 0 { 953 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 954 } 955 if m.matchContent(cp.data(m.fileName)) { 956 pruned = append(pruned, m) 957 } 958 } 959 t.current = pruned 960 t.contEvaluated = true 961 962 return matchesStateForSlice(t.current) 963} 964 965type matchTreeOpt struct { 966 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 967 // This was added since we do not support wordMatchTree with symbol search. 968 DisableWordMatchOptimization bool 969} 970 971func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 972 if q == nil { 973 return nil, fmt.Errorf("got nil (sub)query") 974 } 975 976 switch s := q.(type) { 977 case *query.Regexp: 978 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 979 // superset of the regexp. If the returned matchTree is equivalent to the 980 // original regexp, it returns true. An equivalent matchTree has the same 981 // behaviour as the original regexp and can be used instead. 982 // 983 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 984 if err != nil { 985 return nil, err 986 } 987 // if the query can be used in place of the regexp 988 // return the subtree 989 if isEq { 990 return subMT, nil 991 } 992 993 var tr matchTree 994 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 995 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 996 // provide something faster. 997 tr = wmt 998 } else { 999 tr = newRegexpMatchTree(s) 1000 } 1001 1002 return &andMatchTree{ 1003 children: []matchTree{ 1004 tr, &noVisitMatchTree{subMT}, 1005 }, 1006 }, nil 1007 case *query.And: 1008 var r []matchTree 1009 for _, ch := range s.Children { 1010 ct, err := d.newMatchTree(ch, opt) 1011 if err != nil { 1012 return nil, err 1013 } 1014 r = append(r, ct) 1015 } 1016 return &andMatchTree{r}, nil 1017 case *query.Or: 1018 var r []matchTree 1019 for _, ch := range s.Children { 1020 ct, err := d.newMatchTree(ch, opt) 1021 if err != nil { 1022 return nil, err 1023 } 1024 r = append(r, ct) 1025 } 1026 return &orMatchTree{r}, nil 1027 case *query.Not: 1028 ct, err := d.newMatchTree(s.Child, opt) 1029 return &notMatchTree{ 1030 child: ct, 1031 }, err 1032 1033 case *query.Type: 1034 if s.Type != query.TypeFileName { 1035 break 1036 } 1037 1038 ct, err := d.newMatchTree(s.Child, opt) 1039 if err != nil { 1040 return nil, err 1041 } 1042 1043 return &fileNameMatchTree{ 1044 child: ct, 1045 }, nil 1046 1047 case *query.Boost: 1048 ct, err := d.newMatchTree(s.Child, opt) 1049 if err != nil { 1050 return nil, err 1051 } 1052 1053 return &boostMatchTree{ 1054 child: ct, 1055 boost: s.Boost, 1056 }, nil 1057 1058 case *query.Meta: 1059 checksum := queryMetaChecksum(s.Field, s.Value) 1060 cacheKeyField := "Meta" 1061 if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok { 1062 return cached, nil 1063 } 1064 1065 reposWant := make([]bool, len(d.repoMetaData)) 1066 for repoIdx, r := range d.repoMetaData { 1067 if r.Metadata != nil { 1068 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) { 1069 reposWant[repoIdx] = true 1070 } 1071 } 1072 } 1073 1074 mt := &docMatchTree{ 1075 reason: "Meta", 1076 numDocs: d.numDocs(), 1077 predicate: func(docID uint32) bool { 1078 repoIdx := d.repos[docID] 1079 if int(repoIdx) >= len(reposWant) { 1080 return false 1081 } 1082 return reposWant[repoIdx] 1083 }, 1084 } 1085 d.docMatchTreeCache.Add(cacheKeyField, checksum, mt) 1086 return mt, nil 1087 1088 case *query.Substring: 1089 return d.newSubstringMatchTree(s) 1090 1091 case *query.Branch: 1092 masks := make([]uint64, 0, len(d.repoMetaData)) 1093 if s.Pattern == "HEAD" { 1094 for range d.repoMetaData { 1095 masks = append(masks, 1) 1096 } 1097 } else { 1098 for _, branchIDs := range d.branchIDs { 1099 mask := uint64(0) 1100 for nm, m := range branchIDs { 1101 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1102 mask |= uint64(m) 1103 } 1104 } 1105 masks = append(masks, mask) 1106 } 1107 } 1108 return &branchQueryMatchTree{ 1109 masks: masks, 1110 fileMasks: d.fileBranchMasks, 1111 repos: d.repos, 1112 }, nil 1113 case *query.Const: 1114 if s.Value { 1115 return &bruteForceMatchTree{}, nil 1116 } else { 1117 return &noMatchTree{Why: "const"}, nil 1118 } 1119 case *query.Language: 1120 code, ok := d.metaData.LanguageMap[s.Language] 1121 if !ok { 1122 return &noMatchTree{Why: "lang"}, nil 1123 } 1124 return &docMatchTree{ 1125 reason: "language", 1126 numDocs: d.numDocs(), 1127 predicate: func(docID uint32) bool { 1128 return d.getLanguage(docID) == code 1129 }, 1130 }, nil 1131 1132 case *query.Symbol: 1133 // Disable WordMatchTree since we don't support it in symbols yet. 1134 optCopy := opt 1135 optCopy.DisableWordMatchOptimization = true 1136 1137 subMT, err := d.newMatchTree(s.Expr, optCopy) 1138 if err != nil { 1139 return nil, err 1140 } 1141 1142 if substr, ok := subMT.(*substrMatchTree); ok { 1143 return &symbolSubstrMatchTree{ 1144 substrMatchTree: substr, 1145 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1146 fileEndRunes: d.fileEndRunes, 1147 fileEndSymbol: d.fileEndSymbol, 1148 sections: d.runeDocSections, 1149 }, nil 1150 } 1151 1152 var regexpMT *regexpMatchTree 1153 visitMatchTree(subMT, func(mt matchTree) { 1154 if t, ok := mt.(*regexpMatchTree); ok { 1155 regexpMT = t 1156 } 1157 }) 1158 if regexpMT == nil { 1159 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1160 } 1161 1162 return &symbolRegexpMatchTree{ 1163 regexp: regexpMT.regexp, 1164 all: isRegexpAll(regexpMT.origRegexp), 1165 matchTree: subMT, 1166 }, nil 1167 1168 case *query.FileNameSet: 1169 return &docMatchTree{ 1170 reason: "FileNameSet", 1171 numDocs: d.numDocs(), 1172 predicate: func(docID uint32) bool { 1173 fileName := d.fileName(docID) 1174 _, ok := s.Set[string(fileName)] 1175 return ok 1176 }, 1177 }, nil 1178 1179 case *query.BranchesRepos: 1180 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1181 for repoIdx := range d.repoMetaData { 1182 var mask uint64 1183 for _, br := range s.List { 1184 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1185 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1186 } 1187 } 1188 reposBranchesWant[repoIdx] = mask 1189 } 1190 return &docMatchTree{ 1191 reason: "BranchesRepos", 1192 numDocs: d.numDocs(), 1193 predicate: func(docID uint32) bool { 1194 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1195 }, 1196 }, nil 1197 1198 case *query.RepoSet: 1199 reposWant := make([]bool, len(d.repoMetaData)) 1200 for repoIdx, r := range d.repoMetaData { 1201 if _, ok := s.Set[r.Name]; ok { 1202 reposWant[repoIdx] = true 1203 } 1204 } 1205 return &docMatchTree{ 1206 reason: "RepoSet", 1207 numDocs: d.numDocs(), 1208 predicate: func(docID uint32) bool { 1209 return reposWant[d.repos[docID]] 1210 }, 1211 }, nil 1212 1213 case *query.RepoIDs: 1214 reposWant := make([]bool, len(d.repoMetaData)) 1215 for repoIdx, r := range d.repoMetaData { 1216 if s.Repos.Contains(r.ID) { 1217 reposWant[repoIdx] = true 1218 } 1219 } 1220 return &docMatchTree{ 1221 reason: "RepoIDs", 1222 numDocs: d.numDocs(), 1223 predicate: func(docID uint32) bool { 1224 return reposWant[d.repos[docID]] 1225 }, 1226 }, nil 1227 1228 case *query.Repo: 1229 reposWant := make([]bool, len(d.repoMetaData)) 1230 for repoIdx, r := range d.repoMetaData { 1231 if s.Regexp.MatchString(r.Name) { 1232 reposWant[repoIdx] = true 1233 } 1234 } 1235 return &docMatchTree{ 1236 reason: "Repo", 1237 numDocs: d.numDocs(), 1238 predicate: func(docID uint32) bool { 1239 return reposWant[d.repos[docID]] 1240 }, 1241 }, nil 1242 1243 case *query.RepoRegexp: 1244 reposWant := make([]bool, len(d.repoMetaData)) 1245 for repoIdx, r := range d.repoMetaData { 1246 if s.Regexp.MatchString(r.Name) { 1247 reposWant[repoIdx] = true 1248 } 1249 } 1250 return &docMatchTree{ 1251 reason: "RepoRegexp", 1252 numDocs: d.numDocs(), 1253 predicate: func(docID uint32) bool { 1254 return reposWant[d.repos[docID]] 1255 }, 1256 }, nil 1257 1258 case query.RawConfig: 1259 return &docMatchTree{ 1260 reason: s.String(), 1261 numDocs: d.numDocs(), 1262 predicate: func(docID uint32) bool { 1263 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1264 }, 1265 }, nil 1266 } 1267 log.Panicf("type %T", q) 1268 return nil, nil 1269} 1270 1271func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1272 st := &substrMatchTree{ 1273 query: s, 1274 caseSensitive: s.CaseSensitive, 1275 fileName: s.FileName, 1276 } 1277 1278 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1279 return newRegexpMatchTree(&query.Regexp{ 1280 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1281 FileName: s.FileName, 1282 Content: s.Content, 1283 CaseSensitive: s.CaseSensitive, 1284 }), nil 1285 } 1286 1287 result, err := d.iterateNgrams(s) 1288 if err != nil { 1289 return nil, err 1290 } 1291 st.matchIterator = result 1292 return st, nil 1293} 1294 1295func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1296 if opt.DisableWordMatchOptimization { 1297 return nil, false 1298 } 1299 // Needs to be case sensitive 1300 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1301 return nil, false 1302 } 1303 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1304 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1305 return nil, false 1306 } 1307 sub := q.Regexp.Sub 1308 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1309 return nil, false 1310 } 1311 1312 return &wordMatchTree{ 1313 word: string(sub[1].Rune), 1314 fileName: q.FileName, 1315 }, true 1316} 1317 1318// pruneMatchTree removes impossible branches from the matchTree, as indicated 1319// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1320func pruneMatchTree(mt matchTree) (matchTree, error) { 1321 var err error 1322 switch mt := mt.(type) { 1323 // leaf nodes that we test with the filter: 1324 case *substrMatchTree: 1325 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1326 if _, ok := res.matchIterator.(*noMatchTree); ok { 1327 return nil, nil 1328 } 1329 } 1330 // recursive tree structures: 1331 case *andMatchTree: 1332 // Any branch of an and becoming impossible means the entire clause 1333 // is impossible. Otherwise, just handle rewrites. 1334 for i, child := range mt.children { 1335 newChild, err := pruneMatchTree(child) 1336 if err != nil { 1337 return nil, err 1338 } 1339 if newChild == nil { 1340 return nil, nil 1341 } 1342 mt.children[i] = newChild 1343 } 1344 case *orMatchTree: 1345 // *All* branches of an OR becoming impossible means the entire clause 1346 // is impossible. Otherwise, drop impossible subclauses and handle 1347 // rewrites, including simplifying to a singular resulting child branch. 1348 n := 0 1349 for _, child := range mt.children { 1350 newChild, err := pruneMatchTree(child) 1351 if err != nil { 1352 return nil, err 1353 } 1354 if newChild != nil { 1355 mt.children[n] = newChild 1356 n++ 1357 } 1358 } 1359 mt.children = mt.children[:n] 1360 if len(mt.children) == 1 { 1361 return mt.children[0], nil 1362 } else if len(mt.children) == 0 { 1363 return nil, nil 1364 } 1365 case *noVisitMatchTree: 1366 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1367 if err != nil { 1368 return nil, err 1369 } 1370 if mt.matchTree == nil { 1371 return nil, nil 1372 } 1373 case *fileNameMatchTree: 1374 mt.child, err = pruneMatchTree(mt.child) 1375 if err != nil { 1376 return nil, err 1377 } 1378 if mt.child == nil { 1379 return nil, nil 1380 } 1381 case *boostMatchTree: 1382 mt.child, err = pruneMatchTree(mt.child) 1383 if err != nil { 1384 return nil, err 1385 } 1386 if mt.child == nil { 1387 return nil, nil 1388 } 1389 case *andLineMatchTree: 1390 child, err := pruneMatchTree(&mt.andMatchTree) 1391 if err != nil { 1392 return nil, err 1393 } 1394 if child == nil { 1395 return nil, nil 1396 } 1397 if c, ok := child.(*andMatchTree); ok { 1398 mt.andMatchTree = *c 1399 } else { 1400 // the and was simplified to a single clause, 1401 // so the linematch portion is irrelevant. 1402 return mt, nil 1403 } 1404 case *notMatchTree: 1405 mt.child, err = pruneMatchTree(mt.child) 1406 if err != nil { 1407 return nil, err 1408 } 1409 if mt.child == nil { 1410 // not false => true 1411 return &bruteForceMatchTree{}, nil 1412 } 1413 // unhandled: 1414 case *docMatchTree: 1415 case *bruteForceMatchTree: 1416 case *regexpMatchTree: 1417 case *wordMatchTree: 1418 } 1419 return mt, err 1420} 1421 1422// isRegexpAll returns true if the query matches all possible lines. 1423// 1424// Note: it is possible for a funky regex to actually match all but this 1425// returns false. This returns true for normal looking regexes like ".*" or 1426// "(?-s:.*)". 1427func isRegexpAll(r *syntax.Regexp) bool { 1428 // Note: we don't care about any flags since we are looking for .* and we 1429 // don't mind if . matches all or everything but newline. 1430 1431 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1432 for { 1433 // Our main target: .* 1434 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1435 // (?s:.) or (?-s:.) 1436 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1437 } 1438 1439 // Strip away expressions being wrapped in paranthesis 1440 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1441 r = r.Sub[0] 1442 continue 1443 } 1444 1445 return false 1446 } 1447} 1448 1449func queryMetaChecksum(field string, value *regexp.Regexp) string { 1450 h := xxhash.New() 1451 h.Write([]byte(field)) 1452 h.Write([]byte{':'}) 1453 h.Write([]byte(value.String())) 1454 return fmt.Sprintf("%x", h.Sum64()) 1455}