fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we can may find something 33 // interesting. 34 nextDoc() uint32 35 36 // clears any per-document state of the docIterator, and 37 // prepares for evaluating the given doc. The argument is 38 // strictly increasing over time. 39 prepare(nextDoc uint32) 40} 41 42const ( 43 costConst = 0 44 costMemory = 1 45 costContent = 2 46 costRegexp = 3 47) 48 49const ( 50 costMin = costConst 51 costMax = costRegexp 52) 53 54// An expression tree coupled with matches. The matchtree has two 55// functions: 56// 57// * it implements boolean combinations (and, or, not) 58// 59// * it implements shortcuts, where we skip documents (for example: if 60// there are no trigram matches, we can be sure there are no substring 61// matches). The matchtree iterates over the documents as they are 62// ordered in the shard. 63// 64// The general process for a given (shard, query) is: 65// 66// - construct matchTree for the query 67// 68// - find all different leaf matchTrees (substring, regexp, etc.) 69// 70// in a loop: 71// 72// - find next doc to process using nextDoc 73// 74// - evaluate atoms (leaf expressions that match text) 75// 76// - evaluate the tree using matches(), storing the result in map. 77// 78// - if the complete tree returns (matches() == true) for the document, 79// collect all text matches by looking at leaf matchTrees 80type matchTree interface { 81 docIterator 82 83 // returns whether this matches, and if we are sure. 84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 85} 86 87// docMatchTree iterates over documents for which predicate(docID) returns true. 88type docMatchTree struct { 89 // the number of documents in a shard. 90 numDocs uint32 91 92 predicate func(docID uint32) bool 93 94 // provides additional information about the reason why the docMatchTree was 95 // created. 96 reason string 97 98 // mutable 99 firstDone bool 100 docID uint32 101} 102 103type bruteForceMatchTree struct { 104 // mutable 105 firstDone bool 106 docID uint32 107} 108 109type andLineMatchTree struct { 110 andMatchTree 111} 112 113type andMatchTree struct { 114 children []matchTree 115} 116 117type orMatchTree struct { 118 children []matchTree 119} 120 121type notMatchTree struct { 122 child matchTree 123} 124 125// Returns only the filename of child matches. 126type fileNameMatchTree struct { 127 child matchTree 128} 129 130// Don't visit this subtree for collecting matches. 131type noVisitMatchTree struct { 132 matchTree 133} 134 135type regexpMatchTree struct { 136 regexp *regexp.Regexp 137 138 fileName bool 139 140 // mutable 141 reEvaluated bool 142 found []*candidateMatch 143 144 // nextDoc, prepare. 145 bruteForceMatchTree 146} 147 148// \bLITERAL\b 149type wordMatchTree struct { 150 word string 151 152 fileName bool 153 154 // mutable 155 evaluated bool 156 found []*candidateMatch 157 158 // nextDoc, prepare. 159 bruteForceMatchTree 160} 161 162type substrMatchTree struct { 163 matchIterator 164 165 query *query.Substring 166 caseSensitive bool 167 fileName bool 168 169 // mutable 170 current []*candidateMatch 171 contEvaluated bool 172} 173 174type branchQueryMatchTree struct { 175 fileMasks []uint64 176 masks []uint64 177 repos []uint16 178 179 // mutable 180 firstDone bool 181 docID uint32 182} 183 184type symbolRegexpMatchTree struct { 185 matchTree 186 regexp *regexp.Regexp 187 all bool // skips regex match if .* 188 189 reEvaluated bool 190 found []*candidateMatch 191} 192 193func (t *symbolRegexpMatchTree) prepare(doc uint32) { 194 t.reEvaluated = false 195} 196 197func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 198 if t.reEvaluated { 199 return len(t.found) > 0, true 200 } 201 202 if cost < costRegexp { 203 return false, false 204 } 205 206 sections := cp.docSections() 207 content := cp.data(false) 208 209 found := t.found[:0] 210 for i, sec := range sections { 211 var idx []int 212 if t.all { 213 idx = []int{0, int(sec.End - sec.Start)} 214 } else { 215 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 216 if idx == nil { 217 continue 218 } 219 } 220 221 cm := &candidateMatch{ 222 byteOffset: sec.Start + uint32(idx[0]), 223 byteMatchSz: uint32(idx[1] - idx[0]), 224 symbol: true, 225 symbolIdx: uint32(i), 226 } 227 found = append(found, cm) 228 } 229 t.found = found 230 t.reEvaluated = true 231 232 return len(t.found) > 0, true 233} 234 235type symbolSubstrMatchTree struct { 236 *substrMatchTree 237 238 patternSize uint32 239 fileEndRunes []uint32 240 fileEndSymbol []uint32 241 242 doc uint32 243 sections []DocumentSection 244 245 secID uint32 246} 247 248func (t *symbolSubstrMatchTree) prepare(doc uint32) { 249 t.substrMatchTree.prepare(doc) 250 t.doc = doc 251 252 var fileStart uint32 253 if doc > 0 { 254 fileStart = t.fileEndRunes[doc-1] 255 } 256 257 var sections []DocumentSection 258 if len(t.sections) > 0 { 259 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 260 if most == uint32(len(t.sections)) { 261 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 262 } else { 263 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 264 t.secID++ 265 } 266 267 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 268 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 269 symbolEnd++ 270 } 271 272 sections = t.sections[t.secID:symbolEnd] 273 } 274 } 275 276 secIdx := 0 277 trimmed := t.current[:0] 278 for len(sections) > secIdx && len(t.current) > 0 { 279 start := fileStart + t.current[0].runeOffset 280 end := start + t.patternSize 281 if start >= sections[secIdx].End { 282 secIdx++ 283 continue 284 } 285 286 if start < sections[secIdx].Start { 287 t.current = t.current[1:] 288 continue 289 } 290 291 if end <= sections[secIdx].End { 292 t.current[0].symbol = true 293 t.current[0].symbolIdx = uint32(secIdx) 294 trimmed = append(trimmed, t.current[0]) 295 } 296 297 t.current = t.current[1:] 298 } 299 t.current = trimmed 300} 301 302// all prepare methods 303 304func (t *bruteForceMatchTree) prepare(doc uint32) { 305 t.docID = doc 306 t.firstDone = true 307} 308 309func (t *docMatchTree) prepare(doc uint32) { 310 t.docID = doc 311 t.firstDone = true 312} 313 314func (t *andMatchTree) prepare(doc uint32) { 315 for _, c := range t.children { 316 c.prepare(doc) 317 } 318} 319 320func (t *regexpMatchTree) prepare(doc uint32) { 321 t.found = t.found[:0] 322 t.reEvaluated = false 323 t.bruteForceMatchTree.prepare(doc) 324} 325 326func (t *wordMatchTree) prepare(doc uint32) { 327 t.found = t.found[:0] 328 t.evaluated = false 329 t.bruteForceMatchTree.prepare(doc) 330} 331 332func (t *orMatchTree) prepare(doc uint32) { 333 for _, c := range t.children { 334 c.prepare(doc) 335 } 336} 337 338func (t *notMatchTree) prepare(doc uint32) { 339 t.child.prepare(doc) 340} 341 342func (t *fileNameMatchTree) prepare(doc uint32) { 343 t.child.prepare(doc) 344} 345 346func (t *substrMatchTree) prepare(nextDoc uint32) { 347 t.matchIterator.prepare(nextDoc) 348 t.current = t.matchIterator.candidates() 349 t.contEvaluated = false 350} 351 352func (t *branchQueryMatchTree) prepare(doc uint32) { 353 t.firstDone = true 354 t.docID = doc 355} 356 357// nextDoc 358 359func (t *docMatchTree) nextDoc() uint32 { 360 var start uint32 361 if t.firstDone { 362 start = t.docID + 1 363 } 364 for i := start; i < t.numDocs; i++ { 365 if t.predicate(i) { 366 return i 367 } 368 } 369 return maxUInt32 370} 371 372func (t *bruteForceMatchTree) nextDoc() uint32 { 373 if !t.firstDone { 374 return 0 375 } 376 return t.docID + 1 377} 378 379func (t *andMatchTree) nextDoc() uint32 { 380 var max uint32 381 for _, c := range t.children { 382 m := c.nextDoc() 383 if m > max { 384 max = m 385 } 386 } 387 return max 388} 389 390func (t *orMatchTree) nextDoc() uint32 { 391 min := uint32(maxUInt32) 392 for _, c := range t.children { 393 m := c.nextDoc() 394 if m < min { 395 min = m 396 } 397 } 398 return min 399} 400 401func (t *notMatchTree) nextDoc() uint32 { 402 return 0 403} 404 405func (t *fileNameMatchTree) nextDoc() uint32 { 406 return t.child.nextDoc() 407} 408 409func (t *branchQueryMatchTree) nextDoc() uint32 { 410 var start uint32 411 if t.firstDone { 412 start = t.docID + 1 413 } 414 415 for i := start; i < uint32(len(t.fileMasks)); i++ { 416 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 417 return i 418 } 419 } 420 return maxUInt32 421} 422 423// all String methods 424 425func (t *bruteForceMatchTree) String() string { 426 return "all" 427} 428 429func (t *docMatchTree) String() string { 430 return fmt.Sprintf("doc(%s)", t.reason) 431} 432 433func (t *andMatchTree) String() string { 434 return fmt.Sprintf("and%v", t.children) 435} 436 437func (t *regexpMatchTree) String() string { 438 f := "" 439 if t.fileName { 440 f = "f" 441 } 442 return fmt.Sprintf("%sre(%s)", f, t.regexp) 443} 444 445func (t *wordMatchTree) String() string { 446 f := "" 447 if t.fileName { 448 f = "f" 449 } 450 return fmt.Sprintf("%sword(%s)", f, t.word) 451} 452 453func (t *orMatchTree) String() string { 454 return fmt.Sprintf("or%v", t.children) 455} 456 457func (t *notMatchTree) String() string { 458 return fmt.Sprintf("not(%v)", t.child) 459} 460 461func (t *noVisitMatchTree) String() string { 462 return fmt.Sprintf("novisit(%v)", t.matchTree) 463} 464 465func (t *fileNameMatchTree) String() string { 466 return fmt.Sprintf("f(%v)", t.child) 467} 468 469func (t *substrMatchTree) String() string { 470 f := "" 471 if t.fileName { 472 f = "f" 473 } 474 475 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 476} 477 478func (t *branchQueryMatchTree) String() string { 479 return fmt.Sprintf("branch(%x)", t.masks) 480} 481 482func (t *symbolSubstrMatchTree) String() string { 483 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 484} 485 486func (t *symbolRegexpMatchTree) String() string { 487 return fmt.Sprintf("symbol(%v)", t.matchTree) 488} 489 490// visitMatches visits all atoms in matchTree. Note: This visits 491// noVisitMatchTree. For collecting matches use visitMatches. 492func visitMatchTree(t matchTree, f func(matchTree)) { 493 switch s := t.(type) { 494 case *andMatchTree: 495 for _, ch := range s.children { 496 visitMatchTree(ch, f) 497 } 498 case *orMatchTree: 499 for _, ch := range s.children { 500 visitMatchTree(ch, f) 501 } 502 case *andLineMatchTree: 503 visitMatchTree(&s.andMatchTree, f) 504 case *noVisitMatchTree: 505 visitMatchTree(s.matchTree, f) 506 case *notMatchTree: 507 visitMatchTree(s.child, f) 508 case *fileNameMatchTree: 509 visitMatchTree(s.child, f) 510 case *symbolSubstrMatchTree: 511 visitMatchTree(s.substrMatchTree, f) 512 case *symbolRegexpMatchTree: 513 visitMatchTree(s.matchTree, f) 514 default: 515 f(t) 516 } 517} 518 519// visitMatches visits all atoms which can contribute matches. Note: This 520// skips noVisitMatchTree. 521func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 522 switch s := t.(type) { 523 case *andMatchTree: 524 for _, ch := range s.children { 525 if known[ch] { 526 visitMatches(ch, known, f) 527 } 528 } 529 case *andLineMatchTree: 530 visitMatches(&s.andMatchTree, known, f) 531 case *orMatchTree: 532 for _, ch := range s.children { 533 if known[ch] { 534 visitMatches(ch, known, f) 535 } 536 } 537 case *symbolSubstrMatchTree: 538 visitMatches(s.substrMatchTree, known, f) 539 case *notMatchTree: 540 case *noVisitMatchTree: 541 // don't collect into negative trees. 542 case *fileNameMatchTree: 543 // We will just gather the filename if we do not visit this tree. 544 default: 545 f(s) 546 } 547} 548 549// all matches() methods. 550 551func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 552 return t.predicate(cp.idx), true 553} 554 555func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 556 return true, true 557} 558 559// andLineMatchTree is a performance optimization of andMatchTree. For content 560// searches we don't want to run the regex engine if there is no line that 561// contains matches from all terms. 562func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 563 matches, sure := t.andMatchTree.matches(cp, cost, known) 564 if !(sure && matches) { 565 return matches, sure 566 } 567 568 // find child with fewest candidates 569 min := maxUInt32 570 fewestChildren := 0 571 for ix, child := range t.children { 572 v, ok := child.(*substrMatchTree) 573 // make sure we are running a content search and that all candidates are a 574 // substrMatchTree 575 if !ok || v.fileName { 576 return matches, sure 577 } 578 if len(v.current) < min { 579 min = len(v.current) 580 fewestChildren = ix 581 } 582 } 583 584 type lineRange struct { 585 start int 586 end int 587 } 588 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 589 prev := -1 590 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 591 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 592 if line == prev { 593 continue 594 } 595 prev = line 596 lines = append(lines, lineRange{byteStart, byteEnd}) 597 } 598 599 // children keeps track of the children's candidates we have already seen. 600 children := make([][]*candidateMatch, 0, len(t.children)-1) 601 for j, child := range t.children { 602 if j == fewestChildren { 603 continue 604 } 605 children = append(children, child.(*substrMatchTree).current) 606 } 607 608nextLine: 609 for i := 0; i < len(lines); i++ { 610 hits := 1 611 nextChild: 612 for j := range children { 613 nextCandidate: 614 for len(children[j]) > 0 { 615 candidate := children[j][0] 616 bo := int(cp.findOffset(false, candidate.runeOffset)) 617 if bo < lines[i].start { 618 children[j] = children[j][1:] 619 continue nextCandidate 620 } 621 if bo <= lines[i].end { 622 hits++ 623 continue nextChild 624 } 625 // move the `lines` iterator forward until bo <= line.end 626 for i < len(lines) && bo > lines[i].end { 627 i++ 628 } 629 i-- 630 continue nextLine 631 } 632 } 633 // return early once we found any line that contains matches from all children 634 if hits == len(t.children) { 635 return matches, true 636 } 637 } 638 return false, true 639} 640 641func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 642 sure := true 643 644 for _, ch := range t.children { 645 v, ok := evalMatchTree(cp, cost, known, ch) 646 if ok && !v { 647 return false, true 648 } 649 if !ok { 650 sure = false 651 } 652 } 653 654 return true, sure 655} 656 657func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 658 matches := false 659 sure := true 660 for _, ch := range t.children { 661 v, ok := evalMatchTree(cp, cost, known, ch) 662 if ok { 663 // we could short-circuit, but we want to use 664 // the other possibilities as a ranking 665 // signal. 666 matches = matches || v 667 } else { 668 sure = false 669 } 670 } 671 return matches, sure 672} 673 674func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 675 return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true 676} 677 678func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 679 if t.reEvaluated { 680 return len(t.found) > 0, true 681 } 682 683 if cost < costRegexp { 684 return false, false 685 } 686 687 cp.stats.RegexpsConsidered++ 688 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 689 found := t.found[:0] 690 for _, idx := range idxs { 691 cm := &candidateMatch{ 692 byteOffset: uint32(idx[0]), 693 byteMatchSz: uint32(idx[1] - idx[0]), 694 fileName: t.fileName, 695 } 696 697 found = append(found, cm) 698 } 699 t.found = found 700 t.reEvaluated = true 701 702 return len(t.found) > 0, true 703} 704 705func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 706 if t.evaluated { 707 return len(t.found) > 0, true 708 } 709 710 if cost < costRegexp { 711 return false, false 712 } 713 714 data := cp.data(t.fileName) 715 offset := 0 716 found := t.found[:0] 717 for { 718 idx := bytes.Index(data[offset:], []byte(t.word)) 719 if idx < 0 { 720 break 721 } 722 723 relStartOffset := offset + idx 724 relEndOffset := relStartOffset + len(t.word) 725 726 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 727 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 728 if startBoundary && endBoundary { 729 found = append(found, &candidateMatch{ 730 byteOffset: uint32(offset + idx), 731 byteMatchSz: uint32(len(t.word)), 732 fileName: t.fileName, 733 }) 734 } 735 offset += idx + len(t.word) 736 } 737 738 t.found = found 739 t.evaluated = true 740 741 return len(t.found) > 0, true 742} 743 744// breakMatchesOnNewlines returns matches resulting from breaking each element 745// of cms on newlines within text. 746func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 747 var lineCMs []*candidateMatch 748 for _, cm := range cms { 749 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 750 } 751 return lineCMs 752} 753 754// breakOnNewlines returns matches resulting from breaking cm on newlines 755// within text. 756func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 757 var cms []*candidateMatch 758 addMe := &candidateMatch{} 759 *addMe = *cm 760 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 761 if text[i] == '\n' { 762 addMe.byteMatchSz = i - addMe.byteOffset 763 if addMe.byteMatchSz != 0 { 764 cms = append(cms, addMe) 765 } 766 767 addMe = &candidateMatch{} 768 *addMe = *cm 769 addMe.byteOffset = i + 1 770 } 771 } 772 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 773 if addMe.byteMatchSz != 0 { 774 cms = append(cms, addMe) 775 } 776 return cms 777} 778 779func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 780 if v, ok := known[mt]; ok { 781 return v, true 782 } 783 784 v, ok := mt.matches(cp, cost, known) 785 if ok { 786 known[mt] = v 787 } 788 789 return v, ok 790} 791 792func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 793 v, ok := evalMatchTree(cp, cost, known, t.child) 794 return !v, ok 795} 796 797func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 798 return evalMatchTree(cp, cost, known, t.child) 799} 800 801func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 802 if t.contEvaluated { 803 return len(t.current) > 0, true 804 } 805 806 if len(t.current) == 0 { 807 return false, true 808 } 809 810 if t.fileName && cost < costMemory { 811 return false, false 812 } 813 814 if !t.fileName && cost < costContent { 815 return false, false 816 } 817 818 pruned := t.current[:0] 819 for _, m := range t.current { 820 if m.byteOffset == 0 && m.runeOffset > 0 { 821 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 822 } 823 if m.matchContent(cp.data(m.fileName)) { 824 pruned = append(pruned, m) 825 } 826 } 827 t.current = pruned 828 t.contEvaluated = true 829 830 return len(t.current) > 0, true 831} 832 833func (d *indexData) newMatchTree(q query.Q) (matchTree, error) { 834 if q == nil { 835 return nil, fmt.Errorf("got nil (sub)query") 836 } 837 switch s := q.(type) { 838 case *query.Regexp: 839 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 840 // superset of the regexp. If the returned matchTree is equivalent to the 841 // original regexp, it returns true. An equivalent matchTree has the same 842 // behaviour as the original regexp and can be used instead. 843 // 844 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 845 if err != nil { 846 return nil, err 847 } 848 // if the query can be used in place of the regexp 849 // return the subtree 850 if isEq { 851 return subMT, nil 852 } 853 854 var tr matchTree 855 if wmt, ok := regexpToWordMatchTree(s); ok { 856 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 857 // provide something faster. 858 tr = wmt 859 } else { 860 prefix := "" 861 if !s.CaseSensitive { 862 prefix = "(?i)" 863 } 864 865 tr = &regexpMatchTree{ 866 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 867 fileName: s.FileName, 868 } 869 } 870 871 return &andMatchTree{ 872 children: []matchTree{ 873 tr, &noVisitMatchTree{subMT}, 874 }, 875 }, nil 876 case *query.And: 877 var r []matchTree 878 for _, ch := range s.Children { 879 ct, err := d.newMatchTree(ch) 880 if err != nil { 881 return nil, err 882 } 883 r = append(r, ct) 884 } 885 return &andMatchTree{r}, nil 886 case *query.Or: 887 var r []matchTree 888 for _, ch := range s.Children { 889 ct, err := d.newMatchTree(ch) 890 if err != nil { 891 return nil, err 892 } 893 r = append(r, ct) 894 } 895 return &orMatchTree{r}, nil 896 case *query.Not: 897 ct, err := d.newMatchTree(s.Child) 898 return &notMatchTree{ 899 child: ct, 900 }, err 901 902 case *query.Type: 903 if s.Type != query.TypeFileName { 904 break 905 } 906 907 ct, err := d.newMatchTree(s.Child) 908 if err != nil { 909 return nil, err 910 } 911 912 return &fileNameMatchTree{ 913 child: ct, 914 }, nil 915 916 case *query.Substring: 917 return d.newSubstringMatchTree(s) 918 919 case *query.Branch: 920 masks := make([]uint64, 0, len(d.repoMetaData)) 921 if s.Pattern == "HEAD" { 922 for i := 0; i < len(d.repoMetaData); i++ { 923 masks = append(masks, 1) 924 } 925 } else { 926 for _, branchIDs := range d.branchIDs { 927 mask := uint64(0) 928 for nm, m := range branchIDs { 929 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 930 mask |= uint64(m) 931 } 932 } 933 masks = append(masks, mask) 934 } 935 936 } 937 return &branchQueryMatchTree{ 938 masks: masks, 939 fileMasks: d.fileBranchMasks, 940 repos: d.repos, 941 }, nil 942 case *query.Const: 943 if s.Value { 944 return &bruteForceMatchTree{}, nil 945 } else { 946 return &noMatchTree{"const"}, nil 947 } 948 case *query.Language: 949 code, ok := d.metaData.LanguageMap[s.Language] 950 if !ok { 951 return &noMatchTree{"lang"}, nil 952 } 953 return &docMatchTree{ 954 reason: "language", 955 numDocs: d.numDocs(), 956 predicate: func(docID uint32) bool { 957 return d.getLanguage(docID) == code 958 }, 959 }, nil 960 961 case *query.Symbol: 962 subMT, err := d.newMatchTree(s.Expr) 963 if err != nil { 964 return nil, err 965 } 966 967 if substr, ok := subMT.(*substrMatchTree); ok { 968 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is 969 // non-nil it means we need to lazily decode on request. 970 sections := d.runeDocSections 971 if sections == nil && d.runeDocSectionsRaw != nil { 972 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil) 973 } 974 975 return &symbolSubstrMatchTree{ 976 substrMatchTree: substr, 977 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 978 fileEndRunes: d.fileEndRunes, 979 fileEndSymbol: d.fileEndSymbol, 980 sections: sections, 981 }, nil 982 } 983 984 var regexp *regexp.Regexp 985 visitMatchTree(subMT, func(mt matchTree) { 986 if t, ok := mt.(*regexpMatchTree); ok { 987 regexp = t.regexp 988 } 989 }) 990 if regexp == nil { 991 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 992 } 993 994 return &symbolRegexpMatchTree{ 995 regexp: regexp, 996 all: regexp.String() == "(?i)(?-s:.)*", 997 matchTree: subMT, 998 }, nil 999 1000 case *query.FileNameSet: 1001 return &docMatchTree{ 1002 reason: "FileNameSet", 1003 numDocs: d.numDocs(), 1004 predicate: func(docID uint32) bool { 1005 fileName := d.fileName(docID) 1006 _, ok := s.Set[string(fileName)] 1007 return ok 1008 }, 1009 }, nil 1010 1011 case *query.BranchesRepos: 1012 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1013 for repoIdx := range d.repoMetaData { 1014 var mask uint64 1015 for _, br := range s.List { 1016 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1017 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1018 } 1019 } 1020 reposBranchesWant[repoIdx] = mask 1021 } 1022 return &docMatchTree{ 1023 reason: "BranchesRepos", 1024 numDocs: d.numDocs(), 1025 predicate: func(docID uint32) bool { 1026 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1027 }, 1028 }, nil 1029 1030 case *query.RepoSet: 1031 reposWant := make([]bool, len(d.repoMetaData)) 1032 for repoIdx, r := range d.repoMetaData { 1033 if _, ok := s.Set[r.Name]; ok { 1034 reposWant[repoIdx] = true 1035 } 1036 } 1037 return &docMatchTree{ 1038 reason: "RepoSet", 1039 numDocs: d.numDocs(), 1040 predicate: func(docID uint32) bool { 1041 return reposWant[d.repos[docID]] 1042 }, 1043 }, nil 1044 1045 case *query.RepoIDs: 1046 reposWant := make([]bool, len(d.repoMetaData)) 1047 for repoIdx, r := range d.repoMetaData { 1048 if s.Repos.Contains(r.ID) { 1049 reposWant[repoIdx] = true 1050 } 1051 } 1052 return &docMatchTree{ 1053 reason: "RepoIDs", 1054 numDocs: d.numDocs(), 1055 predicate: func(docID uint32) bool { 1056 return reposWant[d.repos[docID]] 1057 }, 1058 }, nil 1059 1060 case *query.Repo: 1061 reposWant := make([]bool, len(d.repoMetaData)) 1062 for repoIdx, r := range d.repoMetaData { 1063 if s.Regexp.MatchString(r.Name) { 1064 reposWant[repoIdx] = true 1065 } 1066 } 1067 return &docMatchTree{ 1068 reason: "Repo", 1069 numDocs: d.numDocs(), 1070 predicate: func(docID uint32) bool { 1071 return reposWant[d.repos[docID]] 1072 }, 1073 }, nil 1074 1075 case *query.RepoRegexp: 1076 reposWant := make([]bool, len(d.repoMetaData)) 1077 for repoIdx, r := range d.repoMetaData { 1078 if s.Regexp.MatchString(r.Name) { 1079 reposWant[repoIdx] = true 1080 } 1081 } 1082 return &docMatchTree{ 1083 reason: "RepoRegexp", 1084 numDocs: d.numDocs(), 1085 predicate: func(docID uint32) bool { 1086 return reposWant[d.repos[docID]] 1087 }, 1088 }, nil 1089 1090 case query.RawConfig: 1091 return &docMatchTree{ 1092 reason: s.String(), 1093 numDocs: d.numDocs(), 1094 predicate: func(docID uint32) bool { 1095 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1096 }, 1097 }, nil 1098 } 1099 log.Panicf("type %T", q) 1100 return nil, nil 1101} 1102 1103func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1104 st := &substrMatchTree{ 1105 query: s, 1106 caseSensitive: s.CaseSensitive, 1107 fileName: s.FileName, 1108 } 1109 1110 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1111 prefix := "" 1112 if !s.CaseSensitive { 1113 prefix = "(?i)" 1114 } 1115 t := &regexpMatchTree{ 1116 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1117 fileName: s.FileName, 1118 } 1119 return t, nil 1120 } 1121 1122 result, err := d.iterateNgrams(s) 1123 if err != nil { 1124 return nil, err 1125 } 1126 st.matchIterator = result 1127 return st, nil 1128} 1129 1130func regexpToWordMatchTree(q *query.Regexp) (_ *wordMatchTree, ok bool) { 1131 // Needs to be case sensitive 1132 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1133 return nil, false 1134 } 1135 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1136 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1137 return nil, false 1138 } 1139 sub := q.Regexp.Sub 1140 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1141 return nil, false 1142 } 1143 1144 return &wordMatchTree{ 1145 word: string(sub[1].Rune), 1146 fileName: q.FileName, 1147 }, true 1148} 1149 1150// pruneMatchTree removes impossible branches from the matchTree, as indicated 1151// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1152func pruneMatchTree(mt matchTree) (matchTree, error) { 1153 var err error 1154 switch mt := mt.(type) { 1155 // leaf nodes that we test with the filter: 1156 case *substrMatchTree: 1157 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1158 if _, ok := res.matchIterator.(*noMatchTree); ok { 1159 return nil, nil 1160 } 1161 } 1162 // recursive tree structures: 1163 case *andMatchTree: 1164 // Any branch of an and becoming impossible means the entire clause 1165 // is impossible. Otherwise, just handle rewrites. 1166 for i, child := range mt.children { 1167 newChild, err := pruneMatchTree(child) 1168 if err != nil { 1169 return nil, err 1170 } 1171 if newChild == nil { 1172 return nil, nil 1173 } 1174 mt.children[i] = newChild 1175 } 1176 case *orMatchTree: 1177 // *All* branches of an OR becoming impossible means the entire clause 1178 // is impossible. Otherwise, drop impossible subclauses and handle 1179 // rewrites, including simplifying to a singular resulting child branch. 1180 n := 0 1181 for _, child := range mt.children { 1182 newChild, err := pruneMatchTree(child) 1183 if err != nil { 1184 return nil, err 1185 } 1186 if newChild != nil { 1187 mt.children[n] = newChild 1188 n++ 1189 } 1190 } 1191 mt.children = mt.children[:n] 1192 if len(mt.children) == 1 { 1193 return mt.children[0], nil 1194 } else if len(mt.children) == 0 { 1195 return nil, nil 1196 } 1197 case *noVisitMatchTree: 1198 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1199 if err != nil { 1200 return nil, err 1201 } 1202 if mt.matchTree == nil { 1203 return nil, nil 1204 } 1205 case *fileNameMatchTree: 1206 mt.child, err = pruneMatchTree(mt.child) 1207 case *andLineMatchTree: 1208 child, err := pruneMatchTree(&mt.andMatchTree) 1209 if err != nil { 1210 return nil, err 1211 } 1212 if child == nil { 1213 return nil, nil 1214 } 1215 if c, ok := child.(*andMatchTree); ok { 1216 mt.andMatchTree = *c 1217 } else { 1218 // the and was simplified to a single clause, 1219 // so the linematch portion is irrelevant. 1220 return mt, nil 1221 } 1222 case *notMatchTree: 1223 mt.child, err = pruneMatchTree(mt.child) 1224 if err != nil { 1225 return nil, err 1226 } 1227 if mt.child == nil { 1228 // not false => true 1229 return &bruteForceMatchTree{}, nil 1230 } 1231 // unhandled: 1232 case *docMatchTree: 1233 case *bruteForceMatchTree: 1234 case *regexpMatchTree: 1235 case *wordMatchTree: 1236 } 1237 return mt, err 1238}