fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we can may find something 33 // interesting. 34 nextDoc() uint32 35 36 // clears any per-document state of the docIterator, and 37 // prepares for evaluating the given doc. The argument is 38 // strictly increasing over time. 39 prepare(nextDoc uint32) 40} 41 42const ( 43 costConst = 0 44 costMemory = 1 45 costContent = 2 46 costRegexp = 3 47) 48 49const ( 50 costMin = costConst 51 costMax = costRegexp 52) 53 54// An expression tree coupled with matches. The matchtree has two 55// functions: 56// 57// * it implements boolean combinations (and, or, not) 58// 59// * it implements shortcuts, where we skip documents (for example: if 60// there are no trigram matches, we can be sure there are no substring 61// matches). The matchtree iterates over the documents as they are 62// ordered in the shard. 63// 64// The general process for a given (shard, query) is: 65// 66// - construct matchTree for the query 67// 68// - find all different leaf matchTrees (substring, regexp, etc.) 69// 70// in a loop: 71// 72// - find next doc to process using nextDoc 73// 74// - evaluate atoms (leaf expressions that match text) 75// 76// - evaluate the tree using matches(), storing the result in map. 77// 78// - if the complete tree returns (matches() == true) for the document, 79// collect all text matches by looking at leaf matchTrees 80type matchTree interface { 81 docIterator 82 83 // returns whether this matches, and if we are sure. 84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 85} 86 87// docMatchTree iterates over documents for which predicate(docID) returns true. 88type docMatchTree struct { 89 // the number of documents in a shard. 90 numDocs uint32 91 92 predicate func(docID uint32) bool 93 94 // provides additional information about the reason why the docMatchTree was 95 // created. 96 reason string 97 98 // mutable 99 firstDone bool 100 docID uint32 101} 102 103type bruteForceMatchTree struct { 104 // mutable 105 firstDone bool 106 docID uint32 107} 108 109type andLineMatchTree struct { 110 andMatchTree 111} 112 113type andMatchTree struct { 114 children []matchTree 115} 116 117type orMatchTree struct { 118 children []matchTree 119} 120 121type notMatchTree struct { 122 child matchTree 123} 124 125// Returns only the filename of child matches. 126type fileNameMatchTree struct { 127 child matchTree 128} 129 130// Don't visit this subtree for collecting matches. 131type noVisitMatchTree struct { 132 matchTree 133} 134 135type regexpMatchTree struct { 136 regexp *regexp.Regexp 137 138 fileName bool 139 140 // mutable 141 reEvaluated bool 142 found []*candidateMatch 143 144 // nextDoc, prepare. 145 bruteForceMatchTree 146} 147 148// \bLITERAL\b 149type wordMatchTree struct { 150 word string 151 152 fileName bool 153 154 // mutable 155 evaluated bool 156 found []*candidateMatch 157 158 // nextDoc, prepare. 159 bruteForceMatchTree 160} 161 162type substrMatchTree struct { 163 matchIterator 164 165 query *query.Substring 166 caseSensitive bool 167 fileName bool 168 169 // mutable 170 current []*candidateMatch 171 contEvaluated bool 172} 173 174type branchQueryMatchTree struct { 175 fileMasks []uint64 176 masks []uint64 177 repos []uint16 178 179 // mutable 180 firstDone bool 181 docID uint32 182} 183 184func (t *branchQueryMatchTree) branchMask() uint64 { 185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 186} 187 188type symbolRegexpMatchTree struct { 189 matchTree 190 regexp *regexp.Regexp 191 all bool // skips regex match if .* 192 193 reEvaluated bool 194 found []*candidateMatch 195} 196 197func (t *symbolRegexpMatchTree) prepare(doc uint32) { 198 t.reEvaluated = false 199 t.found = t.found[:0] 200 t.matchTree.prepare(doc) 201} 202 203func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 204 if t.reEvaluated { 205 return len(t.found) > 0, true 206 } 207 208 if cost < costRegexp { 209 return false, false 210 } 211 212 sections := cp.docSections() 213 content := cp.data(false) 214 215 found := t.found[:0] 216 for i, sec := range sections { 217 var idx []int 218 if t.all { 219 idx = []int{0, int(sec.End - sec.Start)} 220 } else { 221 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 222 if idx == nil { 223 continue 224 } 225 } 226 227 cm := &candidateMatch{ 228 byteOffset: sec.Start + uint32(idx[0]), 229 byteMatchSz: uint32(idx[1] - idx[0]), 230 symbol: true, 231 symbolIdx: uint32(i), 232 } 233 found = append(found, cm) 234 } 235 t.found = found 236 t.reEvaluated = true 237 238 return len(t.found) > 0, true 239} 240 241type symbolSubstrMatchTree struct { 242 *substrMatchTree 243 244 patternSize uint32 245 fileEndRunes []uint32 246 fileEndSymbol []uint32 247 248 doc uint32 249 sections []DocumentSection 250 251 secID uint32 252} 253 254func (t *symbolSubstrMatchTree) prepare(doc uint32) { 255 t.substrMatchTree.prepare(doc) 256 t.doc = doc 257 258 var fileStart uint32 259 if doc > 0 { 260 fileStart = t.fileEndRunes[doc-1] 261 } 262 263 var sections []DocumentSection 264 if len(t.sections) > 0 { 265 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 266 if most == uint32(len(t.sections)) { 267 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 268 } else { 269 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 270 t.secID++ 271 } 272 273 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 274 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 275 symbolEnd++ 276 } 277 278 sections = t.sections[t.secID:symbolEnd] 279 } 280 } 281 282 secIdx := 0 283 trimmed := t.current[:0] 284 for len(sections) > secIdx && len(t.current) > 0 { 285 start := fileStart + t.current[0].runeOffset 286 end := start + t.patternSize 287 if start >= sections[secIdx].End { 288 secIdx++ 289 continue 290 } 291 292 if start < sections[secIdx].Start { 293 t.current = t.current[1:] 294 continue 295 } 296 297 if end <= sections[secIdx].End { 298 t.current[0].symbol = true 299 t.current[0].symbolIdx = uint32(secIdx) 300 trimmed = append(trimmed, t.current[0]) 301 } 302 303 t.current = t.current[1:] 304 } 305 t.current = trimmed 306} 307 308// all prepare methods 309 310func (t *bruteForceMatchTree) prepare(doc uint32) { 311 t.docID = doc 312 t.firstDone = true 313} 314 315func (t *docMatchTree) prepare(doc uint32) { 316 t.docID = doc 317 t.firstDone = true 318} 319 320func (t *andMatchTree) prepare(doc uint32) { 321 for _, c := range t.children { 322 c.prepare(doc) 323 } 324} 325 326func (t *regexpMatchTree) prepare(doc uint32) { 327 t.found = t.found[:0] 328 t.reEvaluated = false 329 t.bruteForceMatchTree.prepare(doc) 330} 331 332func (t *wordMatchTree) prepare(doc uint32) { 333 t.found = t.found[:0] 334 t.evaluated = false 335 t.bruteForceMatchTree.prepare(doc) 336} 337 338func (t *orMatchTree) prepare(doc uint32) { 339 for _, c := range t.children { 340 c.prepare(doc) 341 } 342} 343 344func (t *notMatchTree) prepare(doc uint32) { 345 t.child.prepare(doc) 346} 347 348func (t *fileNameMatchTree) prepare(doc uint32) { 349 t.child.prepare(doc) 350} 351 352func (t *substrMatchTree) prepare(nextDoc uint32) { 353 t.matchIterator.prepare(nextDoc) 354 t.current = t.matchIterator.candidates() 355 t.contEvaluated = false 356} 357 358func (t *branchQueryMatchTree) prepare(doc uint32) { 359 t.firstDone = true 360 t.docID = doc 361} 362 363// nextDoc 364 365func (t *docMatchTree) nextDoc() uint32 { 366 var start uint32 367 if t.firstDone { 368 start = t.docID + 1 369 } 370 for i := start; i < t.numDocs; i++ { 371 if t.predicate(i) { 372 return i 373 } 374 } 375 return maxUInt32 376} 377 378func (t *bruteForceMatchTree) nextDoc() uint32 { 379 if !t.firstDone { 380 return 0 381 } 382 return t.docID + 1 383} 384 385func (t *andMatchTree) nextDoc() uint32 { 386 var max uint32 387 for _, c := range t.children { 388 m := c.nextDoc() 389 if m > max { 390 max = m 391 } 392 } 393 return max 394} 395 396func (t *orMatchTree) nextDoc() uint32 { 397 min := uint32(maxUInt32) 398 for _, c := range t.children { 399 m := c.nextDoc() 400 if m < min { 401 min = m 402 } 403 } 404 return min 405} 406 407func (t *notMatchTree) nextDoc() uint32 { 408 return 0 409} 410 411func (t *fileNameMatchTree) nextDoc() uint32 { 412 return t.child.nextDoc() 413} 414 415func (t *branchQueryMatchTree) nextDoc() uint32 { 416 var start uint32 417 if t.firstDone { 418 start = t.docID + 1 419 } 420 421 for i := start; i < uint32(len(t.fileMasks)); i++ { 422 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 423 return i 424 } 425 } 426 return maxUInt32 427} 428 429// all String methods 430 431func (t *bruteForceMatchTree) String() string { 432 return "all" 433} 434 435func (t *docMatchTree) String() string { 436 return fmt.Sprintf("doc(%s)", t.reason) 437} 438 439func (t *andMatchTree) String() string { 440 return fmt.Sprintf("and%v", t.children) 441} 442 443func (t *regexpMatchTree) String() string { 444 f := "" 445 if t.fileName { 446 f = "f" 447 } 448 return fmt.Sprintf("%sre(%s)", f, t.regexp) 449} 450 451func (t *wordMatchTree) String() string { 452 f := "" 453 if t.fileName { 454 f = "f" 455 } 456 return fmt.Sprintf("%sword(%s)", f, t.word) 457} 458 459func (t *orMatchTree) String() string { 460 return fmt.Sprintf("or%v", t.children) 461} 462 463func (t *notMatchTree) String() string { 464 return fmt.Sprintf("not(%v)", t.child) 465} 466 467func (t *noVisitMatchTree) String() string { 468 return fmt.Sprintf("novisit(%v)", t.matchTree) 469} 470 471func (t *fileNameMatchTree) String() string { 472 return fmt.Sprintf("f(%v)", t.child) 473} 474 475func (t *substrMatchTree) String() string { 476 f := "" 477 if t.fileName { 478 f = "f" 479 } 480 481 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 482} 483 484func (t *branchQueryMatchTree) String() string { 485 return fmt.Sprintf("branch(%x)", t.masks) 486} 487 488func (t *symbolSubstrMatchTree) String() string { 489 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 490} 491 492func (t *symbolRegexpMatchTree) String() string { 493 return fmt.Sprintf("symbol(%v)", t.matchTree) 494} 495 496// visitMatches visits all atoms in matchTree. Note: This visits 497// noVisitMatchTree. For collecting matches use visitMatches. 498func visitMatchTree(t matchTree, f func(matchTree)) { 499 switch s := t.(type) { 500 case *andMatchTree: 501 for _, ch := range s.children { 502 visitMatchTree(ch, f) 503 } 504 case *orMatchTree: 505 for _, ch := range s.children { 506 visitMatchTree(ch, f) 507 } 508 case *andLineMatchTree: 509 visitMatchTree(&s.andMatchTree, f) 510 case *noVisitMatchTree: 511 visitMatchTree(s.matchTree, f) 512 case *notMatchTree: 513 visitMatchTree(s.child, f) 514 case *fileNameMatchTree: 515 visitMatchTree(s.child, f) 516 case *symbolSubstrMatchTree: 517 visitMatchTree(s.substrMatchTree, f) 518 case *symbolRegexpMatchTree: 519 visitMatchTree(s.matchTree, f) 520 default: 521 f(t) 522 } 523} 524 525// updateMatchTreeStats calls updateStats on all atoms in mt which have that 526// function defined. 527func updateMatchTreeStats(mt matchTree, stats *Stats) { 528 visitMatchTree(mt, func(mt matchTree) { 529 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 530 atom.updateStats(stats) 531 } 532 }) 533} 534 535// visitMatches visits all atoms which can contribute matches. Note: This 536// skips noVisitMatchTree. 537func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 538 switch s := t.(type) { 539 case *andMatchTree: 540 for _, ch := range s.children { 541 if known[ch] { 542 visitMatches(ch, known, f) 543 } 544 } 545 case *andLineMatchTree: 546 visitMatches(&s.andMatchTree, known, f) 547 case *orMatchTree: 548 for _, ch := range s.children { 549 if known[ch] { 550 visitMatches(ch, known, f) 551 } 552 } 553 case *symbolSubstrMatchTree: 554 visitMatches(s.substrMatchTree, known, f) 555 case *notMatchTree: 556 case *noVisitMatchTree: 557 // don't collect into negative trees. 558 case *fileNameMatchTree: 559 // We will just gather the filename if we do not visit this tree. 560 default: 561 f(s) 562 } 563} 564 565// all matches() methods. 566 567func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 568 return t.predicate(cp.idx), true 569} 570 571func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 572 return true, true 573} 574 575// andLineMatchTree is a performance optimization of andMatchTree. For content 576// searches we don't want to run the regex engine if there is no line that 577// contains matches from all terms. 578func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 579 matches, sure := t.andMatchTree.matches(cp, cost, known) 580 if !(sure && matches) { 581 return matches, sure 582 } 583 584 // find child with fewest candidates 585 min := maxUInt32 586 fewestChildren := 0 587 for ix, child := range t.children { 588 v, ok := child.(*substrMatchTree) 589 // make sure we are running a content search and that all candidates are a 590 // substrMatchTree 591 if !ok || v.fileName { 592 return matches, sure 593 } 594 if len(v.current) < min { 595 min = len(v.current) 596 fewestChildren = ix 597 } 598 } 599 600 type lineRange struct { 601 start int 602 end int 603 } 604 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 605 prev := -1 606 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 607 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 608 if line == prev { 609 continue 610 } 611 prev = line 612 lines = append(lines, lineRange{byteStart, byteEnd}) 613 } 614 615 // children keeps track of the children's candidates we have already seen. 616 children := make([][]*candidateMatch, 0, len(t.children)-1) 617 for j, child := range t.children { 618 if j == fewestChildren { 619 continue 620 } 621 children = append(children, child.(*substrMatchTree).current) 622 } 623 624nextLine: 625 for i := 0; i < len(lines); i++ { 626 hits := 1 627 nextChild: 628 for j := range children { 629 nextCandidate: 630 for len(children[j]) > 0 { 631 candidate := children[j][0] 632 bo := int(cp.findOffset(false, candidate.runeOffset)) 633 if bo < lines[i].start { 634 children[j] = children[j][1:] 635 continue nextCandidate 636 } 637 if bo <= lines[i].end { 638 hits++ 639 continue nextChild 640 } 641 // move the `lines` iterator forward until bo <= line.end 642 for i < len(lines) && bo > lines[i].end { 643 i++ 644 } 645 i-- 646 continue nextLine 647 } 648 } 649 // return early once we found any line that contains matches from all children 650 if hits == len(t.children) { 651 return matches, true 652 } 653 } 654 return false, true 655} 656 657func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 658 sure := true 659 660 for _, ch := range t.children { 661 v, ok := evalMatchTree(cp, cost, known, ch) 662 if ok && !v { 663 return false, true 664 } 665 if !ok { 666 sure = false 667 } 668 } 669 670 return true, sure 671} 672 673func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 674 matches := false 675 sure := true 676 for _, ch := range t.children { 677 v, ok := evalMatchTree(cp, cost, known, ch) 678 if ok { 679 // we could short-circuit, but we want to use 680 // the other possibilities as a ranking 681 // signal. 682 matches = matches || v 683 } else { 684 sure = false 685 } 686 } 687 return matches, sure 688} 689 690func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 691 return t.branchMask() != 0, true 692} 693 694func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 695 if t.reEvaluated { 696 return len(t.found) > 0, true 697 } 698 699 if cost < costRegexp { 700 return false, false 701 } 702 703 cp.stats.RegexpsConsidered++ 704 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 705 found := t.found[:0] 706 for _, idx := range idxs { 707 cm := &candidateMatch{ 708 byteOffset: uint32(idx[0]), 709 byteMatchSz: uint32(idx[1] - idx[0]), 710 fileName: t.fileName, 711 } 712 713 found = append(found, cm) 714 } 715 t.found = found 716 t.reEvaluated = true 717 718 return len(t.found) > 0, true 719} 720 721func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 722 if t.evaluated { 723 return len(t.found) > 0, true 724 } 725 726 if cost < costRegexp { 727 return false, false 728 } 729 730 data := cp.data(t.fileName) 731 offset := 0 732 found := t.found[:0] 733 for { 734 idx := bytes.Index(data[offset:], []byte(t.word)) 735 if idx < 0 { 736 break 737 } 738 739 relStartOffset := offset + idx 740 relEndOffset := relStartOffset + len(t.word) 741 742 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 743 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 744 if startBoundary && endBoundary { 745 found = append(found, &candidateMatch{ 746 byteOffset: uint32(offset + idx), 747 byteMatchSz: uint32(len(t.word)), 748 fileName: t.fileName, 749 }) 750 } 751 offset += idx + len(t.word) 752 } 753 754 t.found = found 755 t.evaluated = true 756 757 return len(t.found) > 0, true 758} 759 760// breakMatchesOnNewlines returns matches resulting from breaking each element 761// of cms on newlines within text. 762func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 763 var lineCMs []*candidateMatch 764 for _, cm := range cms { 765 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 766 } 767 return lineCMs 768} 769 770// breakOnNewlines returns matches resulting from breaking cm on newlines 771// within text. 772func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 773 var cms []*candidateMatch 774 addMe := &candidateMatch{} 775 *addMe = *cm 776 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 777 if text[i] == '\n' { 778 addMe.byteMatchSz = i - addMe.byteOffset 779 if addMe.byteMatchSz != 0 { 780 cms = append(cms, addMe) 781 } 782 783 addMe = &candidateMatch{} 784 *addMe = *cm 785 addMe.byteOffset = i + 1 786 } 787 } 788 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 789 if addMe.byteMatchSz != 0 { 790 cms = append(cms, addMe) 791 } 792 return cms 793} 794 795func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 796 if v, ok := known[mt]; ok { 797 return v, true 798 } 799 800 v, ok := mt.matches(cp, cost, known) 801 if ok { 802 known[mt] = v 803 } 804 805 return v, ok 806} 807 808func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 809 v, ok := evalMatchTree(cp, cost, known, t.child) 810 return !v, ok 811} 812 813func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 814 return evalMatchTree(cp, cost, known, t.child) 815} 816 817func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 818 if t.contEvaluated { 819 return len(t.current) > 0, true 820 } 821 822 if len(t.current) == 0 { 823 return false, true 824 } 825 826 if t.fileName && cost < costMemory { 827 return false, false 828 } 829 830 if !t.fileName && cost < costContent { 831 return false, false 832 } 833 834 pruned := t.current[:0] 835 for _, m := range t.current { 836 if m.byteOffset == 0 && m.runeOffset > 0 { 837 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 838 } 839 if m.matchContent(cp.data(m.fileName)) { 840 pruned = append(pruned, m) 841 } 842 } 843 t.current = pruned 844 t.contEvaluated = true 845 846 return len(t.current) > 0, true 847} 848 849type matchTreeOpt struct { 850 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 851 // This was added since we do not support wordMatchTree with symbol search. 852 DisableWordMatchOptimization bool 853} 854 855func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 856 if q == nil { 857 return nil, fmt.Errorf("got nil (sub)query") 858 } 859 switch s := q.(type) { 860 case *query.Regexp: 861 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 862 // superset of the regexp. If the returned matchTree is equivalent to the 863 // original regexp, it returns true. An equivalent matchTree has the same 864 // behaviour as the original regexp and can be used instead. 865 // 866 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 867 if err != nil { 868 return nil, err 869 } 870 // if the query can be used in place of the regexp 871 // return the subtree 872 if isEq { 873 return subMT, nil 874 } 875 876 var tr matchTree 877 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 878 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 879 // provide something faster. 880 tr = wmt 881 } else { 882 prefix := "" 883 if !s.CaseSensitive { 884 prefix = "(?i)" 885 } 886 887 tr = &regexpMatchTree{ 888 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 889 fileName: s.FileName, 890 } 891 } 892 893 return &andMatchTree{ 894 children: []matchTree{ 895 tr, &noVisitMatchTree{subMT}, 896 }, 897 }, nil 898 case *query.And: 899 var r []matchTree 900 for _, ch := range s.Children { 901 ct, err := d.newMatchTree(ch, opt) 902 if err != nil { 903 return nil, err 904 } 905 r = append(r, ct) 906 } 907 return &andMatchTree{r}, nil 908 case *query.Or: 909 var r []matchTree 910 for _, ch := range s.Children { 911 ct, err := d.newMatchTree(ch, opt) 912 if err != nil { 913 return nil, err 914 } 915 r = append(r, ct) 916 } 917 return &orMatchTree{r}, nil 918 case *query.Not: 919 ct, err := d.newMatchTree(s.Child, opt) 920 return &notMatchTree{ 921 child: ct, 922 }, err 923 924 case *query.Type: 925 if s.Type != query.TypeFileName { 926 break 927 } 928 929 ct, err := d.newMatchTree(s.Child, opt) 930 if err != nil { 931 return nil, err 932 } 933 934 return &fileNameMatchTree{ 935 child: ct, 936 }, nil 937 938 case *query.Substring: 939 return d.newSubstringMatchTree(s) 940 941 case *query.Branch: 942 masks := make([]uint64, 0, len(d.repoMetaData)) 943 if s.Pattern == "HEAD" { 944 for i := 0; i < len(d.repoMetaData); i++ { 945 masks = append(masks, 1) 946 } 947 } else { 948 for _, branchIDs := range d.branchIDs { 949 mask := uint64(0) 950 for nm, m := range branchIDs { 951 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 952 mask |= uint64(m) 953 } 954 } 955 masks = append(masks, mask) 956 } 957 958 } 959 return &branchQueryMatchTree{ 960 masks: masks, 961 fileMasks: d.fileBranchMasks, 962 repos: d.repos, 963 }, nil 964 case *query.Const: 965 if s.Value { 966 return &bruteForceMatchTree{}, nil 967 } else { 968 return &noMatchTree{Why: "const"}, nil 969 } 970 case *query.Language: 971 code, ok := d.metaData.LanguageMap[s.Language] 972 if !ok { 973 return &noMatchTree{Why: "lang"}, nil 974 } 975 return &docMatchTree{ 976 reason: "language", 977 numDocs: d.numDocs(), 978 predicate: func(docID uint32) bool { 979 return d.getLanguage(docID) == code 980 }, 981 }, nil 982 983 case *query.Symbol: 984 // Disable WordMatchTree since we don't support it in symbols yet. 985 optCopy := opt 986 optCopy.DisableWordMatchOptimization = true 987 988 subMT, err := d.newMatchTree(s.Expr, optCopy) 989 if err != nil { 990 return nil, err 991 } 992 993 if substr, ok := subMT.(*substrMatchTree); ok { 994 return &symbolSubstrMatchTree{ 995 substrMatchTree: substr, 996 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 997 fileEndRunes: d.fileEndRunes, 998 fileEndSymbol: d.fileEndSymbol, 999 sections: d.runeDocSections, 1000 }, nil 1001 } 1002 1003 var regexp *regexp.Regexp 1004 visitMatchTree(subMT, func(mt matchTree) { 1005 if t, ok := mt.(*regexpMatchTree); ok { 1006 regexp = t.regexp 1007 } 1008 }) 1009 if regexp == nil { 1010 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1011 } 1012 1013 return &symbolRegexpMatchTree{ 1014 regexp: regexp, 1015 all: regexp.String() == "(?i)(?-s:.)*", 1016 matchTree: subMT, 1017 }, nil 1018 1019 case *query.FileNameSet: 1020 return &docMatchTree{ 1021 reason: "FileNameSet", 1022 numDocs: d.numDocs(), 1023 predicate: func(docID uint32) bool { 1024 fileName := d.fileName(docID) 1025 _, ok := s.Set[string(fileName)] 1026 return ok 1027 }, 1028 }, nil 1029 1030 case *query.BranchesRepos: 1031 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1032 for repoIdx := range d.repoMetaData { 1033 var mask uint64 1034 for _, br := range s.List { 1035 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1036 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1037 } 1038 } 1039 reposBranchesWant[repoIdx] = mask 1040 } 1041 return &docMatchTree{ 1042 reason: "BranchesRepos", 1043 numDocs: d.numDocs(), 1044 predicate: func(docID uint32) bool { 1045 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1046 }, 1047 }, nil 1048 1049 case *query.RepoSet: 1050 reposWant := make([]bool, len(d.repoMetaData)) 1051 for repoIdx, r := range d.repoMetaData { 1052 if _, ok := s.Set[r.Name]; ok { 1053 reposWant[repoIdx] = true 1054 } 1055 } 1056 return &docMatchTree{ 1057 reason: "RepoSet", 1058 numDocs: d.numDocs(), 1059 predicate: func(docID uint32) bool { 1060 return reposWant[d.repos[docID]] 1061 }, 1062 }, nil 1063 1064 case *query.RepoIDs: 1065 reposWant := make([]bool, len(d.repoMetaData)) 1066 for repoIdx, r := range d.repoMetaData { 1067 if s.Repos.Contains(r.ID) { 1068 reposWant[repoIdx] = true 1069 } 1070 } 1071 return &docMatchTree{ 1072 reason: "RepoIDs", 1073 numDocs: d.numDocs(), 1074 predicate: func(docID uint32) bool { 1075 return reposWant[d.repos[docID]] 1076 }, 1077 }, nil 1078 1079 case *query.Repo: 1080 reposWant := make([]bool, len(d.repoMetaData)) 1081 for repoIdx, r := range d.repoMetaData { 1082 if s.Regexp.MatchString(r.Name) { 1083 reposWant[repoIdx] = true 1084 } 1085 } 1086 return &docMatchTree{ 1087 reason: "Repo", 1088 numDocs: d.numDocs(), 1089 predicate: func(docID uint32) bool { 1090 return reposWant[d.repos[docID]] 1091 }, 1092 }, nil 1093 1094 case *query.RepoRegexp: 1095 reposWant := make([]bool, len(d.repoMetaData)) 1096 for repoIdx, r := range d.repoMetaData { 1097 if s.Regexp.MatchString(r.Name) { 1098 reposWant[repoIdx] = true 1099 } 1100 } 1101 return &docMatchTree{ 1102 reason: "RepoRegexp", 1103 numDocs: d.numDocs(), 1104 predicate: func(docID uint32) bool { 1105 return reposWant[d.repos[docID]] 1106 }, 1107 }, nil 1108 1109 case query.RawConfig: 1110 return &docMatchTree{ 1111 reason: s.String(), 1112 numDocs: d.numDocs(), 1113 predicate: func(docID uint32) bool { 1114 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1115 }, 1116 }, nil 1117 } 1118 log.Panicf("type %T", q) 1119 return nil, nil 1120} 1121 1122func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1123 st := &substrMatchTree{ 1124 query: s, 1125 caseSensitive: s.CaseSensitive, 1126 fileName: s.FileName, 1127 } 1128 1129 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1130 prefix := "" 1131 if !s.CaseSensitive { 1132 prefix = "(?i)" 1133 } 1134 t := &regexpMatchTree{ 1135 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1136 fileName: s.FileName, 1137 } 1138 return t, nil 1139 } 1140 1141 result, err := d.iterateNgrams(s) 1142 if err != nil { 1143 return nil, err 1144 } 1145 st.matchIterator = result 1146 return st, nil 1147} 1148 1149func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1150 if opt.DisableWordMatchOptimization { 1151 return nil, false 1152 } 1153 // Needs to be case sensitive 1154 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1155 return nil, false 1156 } 1157 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1158 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1159 return nil, false 1160 } 1161 sub := q.Regexp.Sub 1162 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1163 return nil, false 1164 } 1165 1166 return &wordMatchTree{ 1167 word: string(sub[1].Rune), 1168 fileName: q.FileName, 1169 }, true 1170} 1171 1172// pruneMatchTree removes impossible branches from the matchTree, as indicated 1173// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1174func pruneMatchTree(mt matchTree) (matchTree, error) { 1175 var err error 1176 switch mt := mt.(type) { 1177 // leaf nodes that we test with the filter: 1178 case *substrMatchTree: 1179 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1180 if _, ok := res.matchIterator.(*noMatchTree); ok { 1181 return nil, nil 1182 } 1183 } 1184 // recursive tree structures: 1185 case *andMatchTree: 1186 // Any branch of an and becoming impossible means the entire clause 1187 // is impossible. Otherwise, just handle rewrites. 1188 for i, child := range mt.children { 1189 newChild, err := pruneMatchTree(child) 1190 if err != nil { 1191 return nil, err 1192 } 1193 if newChild == nil { 1194 return nil, nil 1195 } 1196 mt.children[i] = newChild 1197 } 1198 case *orMatchTree: 1199 // *All* branches of an OR becoming impossible means the entire clause 1200 // is impossible. Otherwise, drop impossible subclauses and handle 1201 // rewrites, including simplifying to a singular resulting child branch. 1202 n := 0 1203 for _, child := range mt.children { 1204 newChild, err := pruneMatchTree(child) 1205 if err != nil { 1206 return nil, err 1207 } 1208 if newChild != nil { 1209 mt.children[n] = newChild 1210 n++ 1211 } 1212 } 1213 mt.children = mt.children[:n] 1214 if len(mt.children) == 1 { 1215 return mt.children[0], nil 1216 } else if len(mt.children) == 0 { 1217 return nil, nil 1218 } 1219 case *noVisitMatchTree: 1220 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1221 if err != nil { 1222 return nil, err 1223 } 1224 if mt.matchTree == nil { 1225 return nil, nil 1226 } 1227 case *fileNameMatchTree: 1228 mt.child, err = pruneMatchTree(mt.child) 1229 case *andLineMatchTree: 1230 child, err := pruneMatchTree(&mt.andMatchTree) 1231 if err != nil { 1232 return nil, err 1233 } 1234 if child == nil { 1235 return nil, nil 1236 } 1237 if c, ok := child.(*andMatchTree); ok { 1238 mt.andMatchTree = *c 1239 } else { 1240 // the and was simplified to a single clause, 1241 // so the linematch portion is irrelevant. 1242 return mt, nil 1243 } 1244 case *notMatchTree: 1245 mt.child, err = pruneMatchTree(mt.child) 1246 if err != nil { 1247 return nil, err 1248 } 1249 if mt.child == nil { 1250 // not false => true 1251 return &bruteForceMatchTree{}, nil 1252 } 1253 // unhandled: 1254 case *docMatchTree: 1255 case *bruteForceMatchTree: 1256 case *regexpMatchTree: 1257 case *wordMatchTree: 1258 } 1259 return mt, err 1260}