fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we can may find something 33 // interesting. 34 nextDoc() uint32 35 36 // clears any per-document state of the docIterator, and 37 // prepares for evaluating the given doc. The argument is 38 // strictly increasing over time. 39 prepare(nextDoc uint32) 40} 41 42const ( 43 costConst = 0 44 costMemory = 1 45 costContent = 2 46 costRegexp = 3 47) 48 49const ( 50 costMin = costConst 51 costMax = costRegexp 52) 53 54// An expression tree coupled with matches. The matchtree has two 55// functions: 56// 57// * it implements boolean combinations (and, or, not) 58// 59// * it implements shortcuts, where we skip documents (for example: if 60// there are no trigram matches, we can be sure there are no substring 61// matches). The matchtree iterates over the documents as they are 62// ordered in the shard. 63// 64// The general process for a given (shard, query) is: 65// 66// - construct matchTree for the query 67// 68// - find all different leaf matchTrees (substring, regexp, etc.) 69// 70// in a loop: 71// 72// - find next doc to process using nextDoc 73// 74// - evaluate atoms (leaf expressions that match text) 75// 76// - evaluate the tree using matches(), storing the result in map. 77// 78// - if the complete tree returns (matches() == true) for the document, 79// collect all text matches by looking at leaf matchTrees 80type matchTree interface { 81 docIterator 82 83 // returns whether this matches, and if we are sure. 84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 85} 86 87// docMatchTree iterates over documents for which predicate(docID) returns true. 88type docMatchTree struct { 89 // the number of documents in a shard. 90 numDocs uint32 91 92 predicate func(docID uint32) bool 93 94 // provides additional information about the reason why the docMatchTree was 95 // created. 96 reason string 97 98 // mutable 99 firstDone bool 100 docID uint32 101} 102 103type bruteForceMatchTree struct { 104 // mutable 105 firstDone bool 106 docID uint32 107} 108 109type andLineMatchTree struct { 110 andMatchTree 111} 112 113type andMatchTree struct { 114 children []matchTree 115} 116 117type orMatchTree struct { 118 children []matchTree 119} 120 121type notMatchTree struct { 122 child matchTree 123} 124 125// Returns only the filename of child matches. 126type fileNameMatchTree struct { 127 child matchTree 128} 129 130// Don't visit this subtree for collecting matches. 131type noVisitMatchTree struct { 132 matchTree 133} 134 135type regexpMatchTree struct { 136 regexp *regexp.Regexp 137 138 fileName bool 139 140 // mutable 141 reEvaluated bool 142 found []*candidateMatch 143 144 // nextDoc, prepare. 145 bruteForceMatchTree 146} 147 148// \bLITERAL\b 149type wordMatchTree struct { 150 word string 151 152 fileName bool 153 154 // mutable 155 evaluated bool 156 found []*candidateMatch 157 158 // nextDoc, prepare. 159 bruteForceMatchTree 160} 161 162type substrMatchTree struct { 163 matchIterator 164 165 query *query.Substring 166 caseSensitive bool 167 fileName bool 168 169 // mutable 170 current []*candidateMatch 171 contEvaluated bool 172} 173 174type branchQueryMatchTree struct { 175 fileMasks []uint64 176 masks []uint64 177 repos []uint16 178 179 // mutable 180 firstDone bool 181 docID uint32 182} 183 184func (t *branchQueryMatchTree) branchMask() uint64 { 185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 186} 187 188type symbolRegexpMatchTree struct { 189 matchTree 190 regexp *regexp.Regexp 191 all bool // skips regex match if .* 192 193 reEvaluated bool 194 found []*candidateMatch 195} 196 197func (t *symbolRegexpMatchTree) prepare(doc uint32) { 198 t.reEvaluated = false 199} 200 201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 202 if t.reEvaluated { 203 return len(t.found) > 0, true 204 } 205 206 if cost < costRegexp { 207 return false, false 208 } 209 210 sections := cp.docSections() 211 content := cp.data(false) 212 213 found := t.found[:0] 214 for i, sec := range sections { 215 var idx []int 216 if t.all { 217 idx = []int{0, int(sec.End - sec.Start)} 218 } else { 219 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 220 if idx == nil { 221 continue 222 } 223 } 224 225 cm := &candidateMatch{ 226 byteOffset: sec.Start + uint32(idx[0]), 227 byteMatchSz: uint32(idx[1] - idx[0]), 228 symbol: true, 229 symbolIdx: uint32(i), 230 } 231 found = append(found, cm) 232 } 233 t.found = found 234 t.reEvaluated = true 235 236 return len(t.found) > 0, true 237} 238 239type symbolSubstrMatchTree struct { 240 *substrMatchTree 241 242 patternSize uint32 243 fileEndRunes []uint32 244 fileEndSymbol []uint32 245 246 doc uint32 247 sections []DocumentSection 248 249 secID uint32 250} 251 252func (t *symbolSubstrMatchTree) prepare(doc uint32) { 253 t.substrMatchTree.prepare(doc) 254 t.doc = doc 255 256 var fileStart uint32 257 if doc > 0 { 258 fileStart = t.fileEndRunes[doc-1] 259 } 260 261 var sections []DocumentSection 262 if len(t.sections) > 0 { 263 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 264 if most == uint32(len(t.sections)) { 265 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 266 } else { 267 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 268 t.secID++ 269 } 270 271 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 272 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 273 symbolEnd++ 274 } 275 276 sections = t.sections[t.secID:symbolEnd] 277 } 278 } 279 280 secIdx := 0 281 trimmed := t.current[:0] 282 for len(sections) > secIdx && len(t.current) > 0 { 283 start := fileStart + t.current[0].runeOffset 284 end := start + t.patternSize 285 if start >= sections[secIdx].End { 286 secIdx++ 287 continue 288 } 289 290 if start < sections[secIdx].Start { 291 t.current = t.current[1:] 292 continue 293 } 294 295 if end <= sections[secIdx].End { 296 t.current[0].symbol = true 297 t.current[0].symbolIdx = uint32(secIdx) 298 trimmed = append(trimmed, t.current[0]) 299 } 300 301 t.current = t.current[1:] 302 } 303 t.current = trimmed 304} 305 306// all prepare methods 307 308func (t *bruteForceMatchTree) prepare(doc uint32) { 309 t.docID = doc 310 t.firstDone = true 311} 312 313func (t *docMatchTree) prepare(doc uint32) { 314 t.docID = doc 315 t.firstDone = true 316} 317 318func (t *andMatchTree) prepare(doc uint32) { 319 for _, c := range t.children { 320 c.prepare(doc) 321 } 322} 323 324func (t *regexpMatchTree) prepare(doc uint32) { 325 t.found = t.found[:0] 326 t.reEvaluated = false 327 t.bruteForceMatchTree.prepare(doc) 328} 329 330func (t *wordMatchTree) prepare(doc uint32) { 331 t.found = t.found[:0] 332 t.evaluated = false 333 t.bruteForceMatchTree.prepare(doc) 334} 335 336func (t *orMatchTree) prepare(doc uint32) { 337 for _, c := range t.children { 338 c.prepare(doc) 339 } 340} 341 342func (t *notMatchTree) prepare(doc uint32) { 343 t.child.prepare(doc) 344} 345 346func (t *fileNameMatchTree) prepare(doc uint32) { 347 t.child.prepare(doc) 348} 349 350func (t *substrMatchTree) prepare(nextDoc uint32) { 351 t.matchIterator.prepare(nextDoc) 352 t.current = t.matchIterator.candidates() 353 t.contEvaluated = false 354} 355 356func (t *branchQueryMatchTree) prepare(doc uint32) { 357 t.firstDone = true 358 t.docID = doc 359} 360 361// nextDoc 362 363func (t *docMatchTree) nextDoc() uint32 { 364 var start uint32 365 if t.firstDone { 366 start = t.docID + 1 367 } 368 for i := start; i < t.numDocs; i++ { 369 if t.predicate(i) { 370 return i 371 } 372 } 373 return maxUInt32 374} 375 376func (t *bruteForceMatchTree) nextDoc() uint32 { 377 if !t.firstDone { 378 return 0 379 } 380 return t.docID + 1 381} 382 383func (t *andMatchTree) nextDoc() uint32 { 384 var max uint32 385 for _, c := range t.children { 386 m := c.nextDoc() 387 if m > max { 388 max = m 389 } 390 } 391 return max 392} 393 394func (t *orMatchTree) nextDoc() uint32 { 395 min := uint32(maxUInt32) 396 for _, c := range t.children { 397 m := c.nextDoc() 398 if m < min { 399 min = m 400 } 401 } 402 return min 403} 404 405func (t *notMatchTree) nextDoc() uint32 { 406 return 0 407} 408 409func (t *fileNameMatchTree) nextDoc() uint32 { 410 return t.child.nextDoc() 411} 412 413func (t *branchQueryMatchTree) nextDoc() uint32 { 414 var start uint32 415 if t.firstDone { 416 start = t.docID + 1 417 } 418 419 for i := start; i < uint32(len(t.fileMasks)); i++ { 420 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 421 return i 422 } 423 } 424 return maxUInt32 425} 426 427// all String methods 428 429func (t *bruteForceMatchTree) String() string { 430 return "all" 431} 432 433func (t *docMatchTree) String() string { 434 return fmt.Sprintf("doc(%s)", t.reason) 435} 436 437func (t *andMatchTree) String() string { 438 return fmt.Sprintf("and%v", t.children) 439} 440 441func (t *regexpMatchTree) String() string { 442 f := "" 443 if t.fileName { 444 f = "f" 445 } 446 return fmt.Sprintf("%sre(%s)", f, t.regexp) 447} 448 449func (t *wordMatchTree) String() string { 450 f := "" 451 if t.fileName { 452 f = "f" 453 } 454 return fmt.Sprintf("%sword(%s)", f, t.word) 455} 456 457func (t *orMatchTree) String() string { 458 return fmt.Sprintf("or%v", t.children) 459} 460 461func (t *notMatchTree) String() string { 462 return fmt.Sprintf("not(%v)", t.child) 463} 464 465func (t *noVisitMatchTree) String() string { 466 return fmt.Sprintf("novisit(%v)", t.matchTree) 467} 468 469func (t *fileNameMatchTree) String() string { 470 return fmt.Sprintf("f(%v)", t.child) 471} 472 473func (t *substrMatchTree) String() string { 474 f := "" 475 if t.fileName { 476 f = "f" 477 } 478 479 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 480} 481 482func (t *branchQueryMatchTree) String() string { 483 return fmt.Sprintf("branch(%x)", t.masks) 484} 485 486func (t *symbolSubstrMatchTree) String() string { 487 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 488} 489 490func (t *symbolRegexpMatchTree) String() string { 491 return fmt.Sprintf("symbol(%v)", t.matchTree) 492} 493 494// visitMatches visits all atoms in matchTree. Note: This visits 495// noVisitMatchTree. For collecting matches use visitMatches. 496func visitMatchTree(t matchTree, f func(matchTree)) { 497 switch s := t.(type) { 498 case *andMatchTree: 499 for _, ch := range s.children { 500 visitMatchTree(ch, f) 501 } 502 case *orMatchTree: 503 for _, ch := range s.children { 504 visitMatchTree(ch, f) 505 } 506 case *andLineMatchTree: 507 visitMatchTree(&s.andMatchTree, f) 508 case *noVisitMatchTree: 509 visitMatchTree(s.matchTree, f) 510 case *notMatchTree: 511 visitMatchTree(s.child, f) 512 case *fileNameMatchTree: 513 visitMatchTree(s.child, f) 514 case *symbolSubstrMatchTree: 515 visitMatchTree(s.substrMatchTree, f) 516 case *symbolRegexpMatchTree: 517 visitMatchTree(s.matchTree, f) 518 default: 519 f(t) 520 } 521} 522 523// updateMatchTreeStats calls updateStats on all atoms in mt which have that 524// function defined. 525func updateMatchTreeStats(mt matchTree, stats *Stats) { 526 visitMatchTree(mt, func(mt matchTree) { 527 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 528 atom.updateStats(stats) 529 } 530 }) 531} 532 533// visitMatches visits all atoms which can contribute matches. Note: This 534// skips noVisitMatchTree. 535func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 536 switch s := t.(type) { 537 case *andMatchTree: 538 for _, ch := range s.children { 539 if known[ch] { 540 visitMatches(ch, known, f) 541 } 542 } 543 case *andLineMatchTree: 544 visitMatches(&s.andMatchTree, known, f) 545 case *orMatchTree: 546 for _, ch := range s.children { 547 if known[ch] { 548 visitMatches(ch, known, f) 549 } 550 } 551 case *symbolSubstrMatchTree: 552 visitMatches(s.substrMatchTree, known, f) 553 case *notMatchTree: 554 case *noVisitMatchTree: 555 // don't collect into negative trees. 556 case *fileNameMatchTree: 557 // We will just gather the filename if we do not visit this tree. 558 default: 559 f(s) 560 } 561} 562 563// all matches() methods. 564 565func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 566 return t.predicate(cp.idx), true 567} 568 569func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 570 return true, true 571} 572 573// andLineMatchTree is a performance optimization of andMatchTree. For content 574// searches we don't want to run the regex engine if there is no line that 575// contains matches from all terms. 576func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 577 matches, sure := t.andMatchTree.matches(cp, cost, known) 578 if !(sure && matches) { 579 return matches, sure 580 } 581 582 // find child with fewest candidates 583 min := maxUInt32 584 fewestChildren := 0 585 for ix, child := range t.children { 586 v, ok := child.(*substrMatchTree) 587 // make sure we are running a content search and that all candidates are a 588 // substrMatchTree 589 if !ok || v.fileName { 590 return matches, sure 591 } 592 if len(v.current) < min { 593 min = len(v.current) 594 fewestChildren = ix 595 } 596 } 597 598 type lineRange struct { 599 start int 600 end int 601 } 602 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 603 prev := -1 604 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 605 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 606 if line == prev { 607 continue 608 } 609 prev = line 610 lines = append(lines, lineRange{byteStart, byteEnd}) 611 } 612 613 // children keeps track of the children's candidates we have already seen. 614 children := make([][]*candidateMatch, 0, len(t.children)-1) 615 for j, child := range t.children { 616 if j == fewestChildren { 617 continue 618 } 619 children = append(children, child.(*substrMatchTree).current) 620 } 621 622nextLine: 623 for i := 0; i < len(lines); i++ { 624 hits := 1 625 nextChild: 626 for j := range children { 627 nextCandidate: 628 for len(children[j]) > 0 { 629 candidate := children[j][0] 630 bo := int(cp.findOffset(false, candidate.runeOffset)) 631 if bo < lines[i].start { 632 children[j] = children[j][1:] 633 continue nextCandidate 634 } 635 if bo <= lines[i].end { 636 hits++ 637 continue nextChild 638 } 639 // move the `lines` iterator forward until bo <= line.end 640 for i < len(lines) && bo > lines[i].end { 641 i++ 642 } 643 i-- 644 continue nextLine 645 } 646 } 647 // return early once we found any line that contains matches from all children 648 if hits == len(t.children) { 649 return matches, true 650 } 651 } 652 return false, true 653} 654 655func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 656 sure := true 657 658 for _, ch := range t.children { 659 v, ok := evalMatchTree(cp, cost, known, ch) 660 if ok && !v { 661 return false, true 662 } 663 if !ok { 664 sure = false 665 } 666 } 667 668 return true, sure 669} 670 671func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 672 matches := false 673 sure := true 674 for _, ch := range t.children { 675 v, ok := evalMatchTree(cp, cost, known, ch) 676 if ok { 677 // we could short-circuit, but we want to use 678 // the other possibilities as a ranking 679 // signal. 680 matches = matches || v 681 } else { 682 sure = false 683 } 684 } 685 return matches, sure 686} 687 688func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 689 return t.branchMask() != 0, true 690} 691 692func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 693 if t.reEvaluated { 694 return len(t.found) > 0, true 695 } 696 697 if cost < costRegexp { 698 return false, false 699 } 700 701 cp.stats.RegexpsConsidered++ 702 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 703 found := t.found[:0] 704 for _, idx := range idxs { 705 cm := &candidateMatch{ 706 byteOffset: uint32(idx[0]), 707 byteMatchSz: uint32(idx[1] - idx[0]), 708 fileName: t.fileName, 709 } 710 711 found = append(found, cm) 712 } 713 t.found = found 714 t.reEvaluated = true 715 716 return len(t.found) > 0, true 717} 718 719func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 720 if t.evaluated { 721 return len(t.found) > 0, true 722 } 723 724 if cost < costRegexp { 725 return false, false 726 } 727 728 data := cp.data(t.fileName) 729 offset := 0 730 found := t.found[:0] 731 for { 732 idx := bytes.Index(data[offset:], []byte(t.word)) 733 if idx < 0 { 734 break 735 } 736 737 relStartOffset := offset + idx 738 relEndOffset := relStartOffset + len(t.word) 739 740 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 741 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 742 if startBoundary && endBoundary { 743 found = append(found, &candidateMatch{ 744 byteOffset: uint32(offset + idx), 745 byteMatchSz: uint32(len(t.word)), 746 fileName: t.fileName, 747 }) 748 } 749 offset += idx + len(t.word) 750 } 751 752 t.found = found 753 t.evaluated = true 754 755 return len(t.found) > 0, true 756} 757 758// breakMatchesOnNewlines returns matches resulting from breaking each element 759// of cms on newlines within text. 760func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 761 var lineCMs []*candidateMatch 762 for _, cm := range cms { 763 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 764 } 765 return lineCMs 766} 767 768// breakOnNewlines returns matches resulting from breaking cm on newlines 769// within text. 770func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 771 var cms []*candidateMatch 772 addMe := &candidateMatch{} 773 *addMe = *cm 774 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 775 if text[i] == '\n' { 776 addMe.byteMatchSz = i - addMe.byteOffset 777 if addMe.byteMatchSz != 0 { 778 cms = append(cms, addMe) 779 } 780 781 addMe = &candidateMatch{} 782 *addMe = *cm 783 addMe.byteOffset = i + 1 784 } 785 } 786 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 787 if addMe.byteMatchSz != 0 { 788 cms = append(cms, addMe) 789 } 790 return cms 791} 792 793func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 794 if v, ok := known[mt]; ok { 795 return v, true 796 } 797 798 v, ok := mt.matches(cp, cost, known) 799 if ok { 800 known[mt] = v 801 } 802 803 return v, ok 804} 805 806func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 807 v, ok := evalMatchTree(cp, cost, known, t.child) 808 return !v, ok 809} 810 811func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 812 return evalMatchTree(cp, cost, known, t.child) 813} 814 815func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 816 if t.contEvaluated { 817 return len(t.current) > 0, true 818 } 819 820 if len(t.current) == 0 { 821 return false, true 822 } 823 824 if t.fileName && cost < costMemory { 825 return false, false 826 } 827 828 if !t.fileName && cost < costContent { 829 return false, false 830 } 831 832 pruned := t.current[:0] 833 for _, m := range t.current { 834 if m.byteOffset == 0 && m.runeOffset > 0 { 835 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 836 } 837 if m.matchContent(cp.data(m.fileName)) { 838 pruned = append(pruned, m) 839 } 840 } 841 t.current = pruned 842 t.contEvaluated = true 843 844 return len(t.current) > 0, true 845} 846 847type matchTreeOpt struct { 848 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 849 // This was added since we do not support wordMatchTree with symbol search. 850 DisableWordMatchOptimization bool 851} 852 853func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 854 if q == nil { 855 return nil, fmt.Errorf("got nil (sub)query") 856 } 857 switch s := q.(type) { 858 case *query.Regexp: 859 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 860 // superset of the regexp. If the returned matchTree is equivalent to the 861 // original regexp, it returns true. An equivalent matchTree has the same 862 // behaviour as the original regexp and can be used instead. 863 // 864 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 865 if err != nil { 866 return nil, err 867 } 868 // if the query can be used in place of the regexp 869 // return the subtree 870 if isEq { 871 return subMT, nil 872 } 873 874 var tr matchTree 875 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 876 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 877 // provide something faster. 878 tr = wmt 879 } else { 880 prefix := "" 881 if !s.CaseSensitive { 882 prefix = "(?i)" 883 } 884 885 tr = &regexpMatchTree{ 886 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 887 fileName: s.FileName, 888 } 889 } 890 891 return &andMatchTree{ 892 children: []matchTree{ 893 tr, &noVisitMatchTree{subMT}, 894 }, 895 }, nil 896 case *query.And: 897 var r []matchTree 898 for _, ch := range s.Children { 899 ct, err := d.newMatchTree(ch, opt) 900 if err != nil { 901 return nil, err 902 } 903 r = append(r, ct) 904 } 905 return &andMatchTree{r}, nil 906 case *query.Or: 907 var r []matchTree 908 for _, ch := range s.Children { 909 ct, err := d.newMatchTree(ch, opt) 910 if err != nil { 911 return nil, err 912 } 913 r = append(r, ct) 914 } 915 return &orMatchTree{r}, nil 916 case *query.Not: 917 ct, err := d.newMatchTree(s.Child, opt) 918 return &notMatchTree{ 919 child: ct, 920 }, err 921 922 case *query.Type: 923 if s.Type != query.TypeFileName { 924 break 925 } 926 927 ct, err := d.newMatchTree(s.Child, opt) 928 if err != nil { 929 return nil, err 930 } 931 932 return &fileNameMatchTree{ 933 child: ct, 934 }, nil 935 936 case *query.Substring: 937 return d.newSubstringMatchTree(s) 938 939 case *query.Branch: 940 masks := make([]uint64, 0, len(d.repoMetaData)) 941 if s.Pattern == "HEAD" { 942 for i := 0; i < len(d.repoMetaData); i++ { 943 masks = append(masks, 1) 944 } 945 } else { 946 for _, branchIDs := range d.branchIDs { 947 mask := uint64(0) 948 for nm, m := range branchIDs { 949 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 950 mask |= uint64(m) 951 } 952 } 953 masks = append(masks, mask) 954 } 955 956 } 957 return &branchQueryMatchTree{ 958 masks: masks, 959 fileMasks: d.fileBranchMasks, 960 repos: d.repos, 961 }, nil 962 case *query.Const: 963 if s.Value { 964 return &bruteForceMatchTree{}, nil 965 } else { 966 return &noMatchTree{Why: "const"}, nil 967 } 968 case *query.Language: 969 code, ok := d.metaData.LanguageMap[s.Language] 970 if !ok { 971 return &noMatchTree{Why: "lang"}, nil 972 } 973 return &docMatchTree{ 974 reason: "language", 975 numDocs: d.numDocs(), 976 predicate: func(docID uint32) bool { 977 return d.getLanguage(docID) == code 978 }, 979 }, nil 980 981 case *query.Symbol: 982 // Disable WordMatchTree since we don't support it in symbols yet. 983 optCopy := opt 984 optCopy.DisableWordMatchOptimization = true 985 986 subMT, err := d.newMatchTree(s.Expr, optCopy) 987 if err != nil { 988 return nil, err 989 } 990 991 if substr, ok := subMT.(*substrMatchTree); ok { 992 return &symbolSubstrMatchTree{ 993 substrMatchTree: substr, 994 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 995 fileEndRunes: d.fileEndRunes, 996 fileEndSymbol: d.fileEndSymbol, 997 sections: d.runeDocSections, 998 }, nil 999 } 1000 1001 var regexp *regexp.Regexp 1002 visitMatchTree(subMT, func(mt matchTree) { 1003 if t, ok := mt.(*regexpMatchTree); ok { 1004 regexp = t.regexp 1005 } 1006 }) 1007 if regexp == nil { 1008 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1009 } 1010 1011 return &symbolRegexpMatchTree{ 1012 regexp: regexp, 1013 all: regexp.String() == "(?i)(?-s:.)*", 1014 matchTree: subMT, 1015 }, nil 1016 1017 case *query.FileNameSet: 1018 return &docMatchTree{ 1019 reason: "FileNameSet", 1020 numDocs: d.numDocs(), 1021 predicate: func(docID uint32) bool { 1022 fileName := d.fileName(docID) 1023 _, ok := s.Set[string(fileName)] 1024 return ok 1025 }, 1026 }, nil 1027 1028 case *query.BranchesRepos: 1029 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1030 for repoIdx := range d.repoMetaData { 1031 var mask uint64 1032 for _, br := range s.List { 1033 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1034 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1035 } 1036 } 1037 reposBranchesWant[repoIdx] = mask 1038 } 1039 return &docMatchTree{ 1040 reason: "BranchesRepos", 1041 numDocs: d.numDocs(), 1042 predicate: func(docID uint32) bool { 1043 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1044 }, 1045 }, nil 1046 1047 case *query.RepoSet: 1048 reposWant := make([]bool, len(d.repoMetaData)) 1049 for repoIdx, r := range d.repoMetaData { 1050 if _, ok := s.Set[r.Name]; ok { 1051 reposWant[repoIdx] = true 1052 } 1053 } 1054 return &docMatchTree{ 1055 reason: "RepoSet", 1056 numDocs: d.numDocs(), 1057 predicate: func(docID uint32) bool { 1058 return reposWant[d.repos[docID]] 1059 }, 1060 }, nil 1061 1062 case *query.RepoIDs: 1063 reposWant := make([]bool, len(d.repoMetaData)) 1064 for repoIdx, r := range d.repoMetaData { 1065 if s.Repos.Contains(r.ID) { 1066 reposWant[repoIdx] = true 1067 } 1068 } 1069 return &docMatchTree{ 1070 reason: "RepoIDs", 1071 numDocs: d.numDocs(), 1072 predicate: func(docID uint32) bool { 1073 return reposWant[d.repos[docID]] 1074 }, 1075 }, nil 1076 1077 case *query.Repo: 1078 reposWant := make([]bool, len(d.repoMetaData)) 1079 for repoIdx, r := range d.repoMetaData { 1080 if s.Regexp.MatchString(r.Name) { 1081 reposWant[repoIdx] = true 1082 } 1083 } 1084 return &docMatchTree{ 1085 reason: "Repo", 1086 numDocs: d.numDocs(), 1087 predicate: func(docID uint32) bool { 1088 return reposWant[d.repos[docID]] 1089 }, 1090 }, nil 1091 1092 case *query.RepoRegexp: 1093 reposWant := make([]bool, len(d.repoMetaData)) 1094 for repoIdx, r := range d.repoMetaData { 1095 if s.Regexp.MatchString(r.Name) { 1096 reposWant[repoIdx] = true 1097 } 1098 } 1099 return &docMatchTree{ 1100 reason: "RepoRegexp", 1101 numDocs: d.numDocs(), 1102 predicate: func(docID uint32) bool { 1103 return reposWant[d.repos[docID]] 1104 }, 1105 }, nil 1106 1107 case query.RawConfig: 1108 return &docMatchTree{ 1109 reason: s.String(), 1110 numDocs: d.numDocs(), 1111 predicate: func(docID uint32) bool { 1112 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1113 }, 1114 }, nil 1115 } 1116 log.Panicf("type %T", q) 1117 return nil, nil 1118} 1119 1120func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1121 st := &substrMatchTree{ 1122 query: s, 1123 caseSensitive: s.CaseSensitive, 1124 fileName: s.FileName, 1125 } 1126 1127 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1128 prefix := "" 1129 if !s.CaseSensitive { 1130 prefix = "(?i)" 1131 } 1132 t := &regexpMatchTree{ 1133 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1134 fileName: s.FileName, 1135 } 1136 return t, nil 1137 } 1138 1139 result, err := d.iterateNgrams(s) 1140 if err != nil { 1141 return nil, err 1142 } 1143 st.matchIterator = result 1144 return st, nil 1145} 1146 1147func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1148 if opt.DisableWordMatchOptimization { 1149 return nil, false 1150 } 1151 // Needs to be case sensitive 1152 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1153 return nil, false 1154 } 1155 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1156 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1157 return nil, false 1158 } 1159 sub := q.Regexp.Sub 1160 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1161 return nil, false 1162 } 1163 1164 return &wordMatchTree{ 1165 word: string(sub[1].Rune), 1166 fileName: q.FileName, 1167 }, true 1168} 1169 1170// pruneMatchTree removes impossible branches from the matchTree, as indicated 1171// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1172func pruneMatchTree(mt matchTree) (matchTree, error) { 1173 var err error 1174 switch mt := mt.(type) { 1175 // leaf nodes that we test with the filter: 1176 case *substrMatchTree: 1177 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1178 if _, ok := res.matchIterator.(*noMatchTree); ok { 1179 return nil, nil 1180 } 1181 } 1182 // recursive tree structures: 1183 case *andMatchTree: 1184 // Any branch of an and becoming impossible means the entire clause 1185 // is impossible. Otherwise, just handle rewrites. 1186 for i, child := range mt.children { 1187 newChild, err := pruneMatchTree(child) 1188 if err != nil { 1189 return nil, err 1190 } 1191 if newChild == nil { 1192 return nil, nil 1193 } 1194 mt.children[i] = newChild 1195 } 1196 case *orMatchTree: 1197 // *All* branches of an OR becoming impossible means the entire clause 1198 // is impossible. Otherwise, drop impossible subclauses and handle 1199 // rewrites, including simplifying to a singular resulting child branch. 1200 n := 0 1201 for _, child := range mt.children { 1202 newChild, err := pruneMatchTree(child) 1203 if err != nil { 1204 return nil, err 1205 } 1206 if newChild != nil { 1207 mt.children[n] = newChild 1208 n++ 1209 } 1210 } 1211 mt.children = mt.children[:n] 1212 if len(mt.children) == 1 { 1213 return mt.children[0], nil 1214 } else if len(mt.children) == 0 { 1215 return nil, nil 1216 } 1217 case *noVisitMatchTree: 1218 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1219 if err != nil { 1220 return nil, err 1221 } 1222 if mt.matchTree == nil { 1223 return nil, nil 1224 } 1225 case *fileNameMatchTree: 1226 mt.child, err = pruneMatchTree(mt.child) 1227 case *andLineMatchTree: 1228 child, err := pruneMatchTree(&mt.andMatchTree) 1229 if err != nil { 1230 return nil, err 1231 } 1232 if child == nil { 1233 return nil, nil 1234 } 1235 if c, ok := child.(*andMatchTree); ok { 1236 mt.andMatchTree = *c 1237 } else { 1238 // the and was simplified to a single clause, 1239 // so the linematch portion is irrelevant. 1240 return mt, nil 1241 } 1242 case *notMatchTree: 1243 mt.child, err = pruneMatchTree(mt.child) 1244 if err != nil { 1245 return nil, err 1246 } 1247 if mt.child == nil { 1248 // not false => true 1249 return &bruteForceMatchTree{}, nil 1250 } 1251 // unhandled: 1252 case *docMatchTree: 1253 case *bruteForceMatchTree: 1254 case *regexpMatchTree: 1255 case *wordMatchTree: 1256 } 1257 return mt, err 1258}