fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we can may find something 33 // interesting. 34 nextDoc() uint32 35 36 // clears any per-document state of the docIterator, and 37 // prepares for evaluating the given doc. The argument is 38 // strictly increasing over time. 39 prepare(nextDoc uint32) 40} 41 42const ( 43 costConst = 0 44 costMemory = 1 45 costContent = 2 46 costRegexp = 3 47) 48 49const ( 50 costMin = costConst 51 costMax = costRegexp 52) 53 54// An expression tree coupled with matches. The matchtree has two 55// functions: 56// 57// * it implements boolean combinations (and, or, not) 58// 59// * it implements shortcuts, where we skip documents (for example: if 60// there are no trigram matches, we can be sure there are no substring 61// matches). The matchtree iterates over the documents as they are 62// ordered in the shard. 63// 64// The general process for a given (shard, query) is: 65// 66// - construct matchTree for the query 67// 68// - find all different leaf matchTrees (substring, regexp, etc.) 69// 70// in a loop: 71// 72// - find next doc to process using nextDoc 73// 74// - evaluate atoms (leaf expressions that match text) 75// 76// - evaluate the tree using matches(), storing the result in map. 77// 78// - if the complete tree returns (matches() == true) for the document, 79// collect all text matches by looking at leaf matchTrees 80type matchTree interface { 81 docIterator 82 83 // returns whether this matches, and if we are sure. 84 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 85} 86 87// docMatchTree iterates over documents for which predicate(docID) returns true. 88type docMatchTree struct { 89 // the number of documents in a shard. 90 numDocs uint32 91 92 predicate func(docID uint32) bool 93 94 // provides additional information about the reason why the docMatchTree was 95 // created. 96 reason string 97 98 // mutable 99 firstDone bool 100 docID uint32 101} 102 103type bruteForceMatchTree struct { 104 // mutable 105 firstDone bool 106 docID uint32 107} 108 109type andLineMatchTree struct { 110 andMatchTree 111} 112 113type andMatchTree struct { 114 children []matchTree 115} 116 117type orMatchTree struct { 118 children []matchTree 119} 120 121type notMatchTree struct { 122 child matchTree 123} 124 125// Returns only the filename of child matches. 126type fileNameMatchTree struct { 127 child matchTree 128} 129 130// Don't visit this subtree for collecting matches. 131type noVisitMatchTree struct { 132 matchTree 133} 134 135type regexpMatchTree struct { 136 regexp *regexp.Regexp 137 138 fileName bool 139 140 // mutable 141 reEvaluated bool 142 found []*candidateMatch 143 144 // nextDoc, prepare. 145 bruteForceMatchTree 146} 147 148// \bLITERAL\b 149type wordMatchTree struct { 150 word string 151 152 fileName bool 153 154 // mutable 155 evaluated bool 156 found []*candidateMatch 157 158 // nextDoc, prepare. 159 bruteForceMatchTree 160} 161 162type substrMatchTree struct { 163 matchIterator 164 165 query *query.Substring 166 caseSensitive bool 167 fileName bool 168 169 // mutable 170 current []*candidateMatch 171 contEvaluated bool 172} 173 174type branchQueryMatchTree struct { 175 fileMasks []uint64 176 masks []uint64 177 repos []uint16 178 179 // mutable 180 firstDone bool 181 docID uint32 182} 183 184func (t *branchQueryMatchTree) branchMask() uint64 { 185 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 186} 187 188type symbolRegexpMatchTree struct { 189 matchTree 190 regexp *regexp.Regexp 191 all bool // skips regex match if .* 192 193 reEvaluated bool 194 found []*candidateMatch 195} 196 197func (t *symbolRegexpMatchTree) prepare(doc uint32) { 198 t.reEvaluated = false 199} 200 201func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 202 if t.reEvaluated { 203 return len(t.found) > 0, true 204 } 205 206 if cost < costRegexp { 207 return false, false 208 } 209 210 sections := cp.docSections() 211 content := cp.data(false) 212 213 found := t.found[:0] 214 for i, sec := range sections { 215 var idx []int 216 if t.all { 217 idx = []int{0, int(sec.End - sec.Start)} 218 } else { 219 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 220 if idx == nil { 221 continue 222 } 223 } 224 225 cm := &candidateMatch{ 226 byteOffset: sec.Start + uint32(idx[0]), 227 byteMatchSz: uint32(idx[1] - idx[0]), 228 symbol: true, 229 symbolIdx: uint32(i), 230 } 231 found = append(found, cm) 232 } 233 t.found = found 234 t.reEvaluated = true 235 236 return len(t.found) > 0, true 237} 238 239type symbolSubstrMatchTree struct { 240 *substrMatchTree 241 242 patternSize uint32 243 fileEndRunes []uint32 244 fileEndSymbol []uint32 245 246 doc uint32 247 sections []DocumentSection 248 249 secID uint32 250} 251 252func (t *symbolSubstrMatchTree) prepare(doc uint32) { 253 t.substrMatchTree.prepare(doc) 254 t.doc = doc 255 256 var fileStart uint32 257 if doc > 0 { 258 fileStart = t.fileEndRunes[doc-1] 259 } 260 261 var sections []DocumentSection 262 if len(t.sections) > 0 { 263 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 264 if most == uint32(len(t.sections)) { 265 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 266 } else { 267 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 268 t.secID++ 269 } 270 271 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 272 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 273 symbolEnd++ 274 } 275 276 sections = t.sections[t.secID:symbolEnd] 277 } 278 } 279 280 secIdx := 0 281 trimmed := t.current[:0] 282 for len(sections) > secIdx && len(t.current) > 0 { 283 start := fileStart + t.current[0].runeOffset 284 end := start + t.patternSize 285 if start >= sections[secIdx].End { 286 secIdx++ 287 continue 288 } 289 290 if start < sections[secIdx].Start { 291 t.current = t.current[1:] 292 continue 293 } 294 295 if end <= sections[secIdx].End { 296 t.current[0].symbol = true 297 t.current[0].symbolIdx = uint32(secIdx) 298 trimmed = append(trimmed, t.current[0]) 299 } 300 301 t.current = t.current[1:] 302 } 303 t.current = trimmed 304} 305 306// all prepare methods 307 308func (t *bruteForceMatchTree) prepare(doc uint32) { 309 t.docID = doc 310 t.firstDone = true 311} 312 313func (t *docMatchTree) prepare(doc uint32) { 314 t.docID = doc 315 t.firstDone = true 316} 317 318func (t *andMatchTree) prepare(doc uint32) { 319 for _, c := range t.children { 320 c.prepare(doc) 321 } 322} 323 324func (t *regexpMatchTree) prepare(doc uint32) { 325 t.found = t.found[:0] 326 t.reEvaluated = false 327 t.bruteForceMatchTree.prepare(doc) 328} 329 330func (t *wordMatchTree) prepare(doc uint32) { 331 t.found = t.found[:0] 332 t.evaluated = false 333 t.bruteForceMatchTree.prepare(doc) 334} 335 336func (t *orMatchTree) prepare(doc uint32) { 337 for _, c := range t.children { 338 c.prepare(doc) 339 } 340} 341 342func (t *notMatchTree) prepare(doc uint32) { 343 t.child.prepare(doc) 344} 345 346func (t *fileNameMatchTree) prepare(doc uint32) { 347 t.child.prepare(doc) 348} 349 350func (t *substrMatchTree) prepare(nextDoc uint32) { 351 t.matchIterator.prepare(nextDoc) 352 t.current = t.matchIterator.candidates() 353 t.contEvaluated = false 354} 355 356func (t *branchQueryMatchTree) prepare(doc uint32) { 357 t.firstDone = true 358 t.docID = doc 359} 360 361// nextDoc 362 363func (t *docMatchTree) nextDoc() uint32 { 364 var start uint32 365 if t.firstDone { 366 start = t.docID + 1 367 } 368 for i := start; i < t.numDocs; i++ { 369 if t.predicate(i) { 370 return i 371 } 372 } 373 return maxUInt32 374} 375 376func (t *bruteForceMatchTree) nextDoc() uint32 { 377 if !t.firstDone { 378 return 0 379 } 380 return t.docID + 1 381} 382 383func (t *andMatchTree) nextDoc() uint32 { 384 var max uint32 385 for _, c := range t.children { 386 m := c.nextDoc() 387 if m > max { 388 max = m 389 } 390 } 391 return max 392} 393 394func (t *orMatchTree) nextDoc() uint32 { 395 min := uint32(maxUInt32) 396 for _, c := range t.children { 397 m := c.nextDoc() 398 if m < min { 399 min = m 400 } 401 } 402 return min 403} 404 405func (t *notMatchTree) nextDoc() uint32 { 406 return 0 407} 408 409func (t *fileNameMatchTree) nextDoc() uint32 { 410 return t.child.nextDoc() 411} 412 413func (t *branchQueryMatchTree) nextDoc() uint32 { 414 var start uint32 415 if t.firstDone { 416 start = t.docID + 1 417 } 418 419 for i := start; i < uint32(len(t.fileMasks)); i++ { 420 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 421 return i 422 } 423 } 424 return maxUInt32 425} 426 427// all String methods 428 429func (t *bruteForceMatchTree) String() string { 430 return "all" 431} 432 433func (t *docMatchTree) String() string { 434 return fmt.Sprintf("doc(%s)", t.reason) 435} 436 437func (t *andMatchTree) String() string { 438 return fmt.Sprintf("and%v", t.children) 439} 440 441func (t *regexpMatchTree) String() string { 442 f := "" 443 if t.fileName { 444 f = "f" 445 } 446 return fmt.Sprintf("%sre(%s)", f, t.regexp) 447} 448 449func (t *wordMatchTree) String() string { 450 f := "" 451 if t.fileName { 452 f = "f" 453 } 454 return fmt.Sprintf("%sword(%s)", f, t.word) 455} 456 457func (t *orMatchTree) String() string { 458 return fmt.Sprintf("or%v", t.children) 459} 460 461func (t *notMatchTree) String() string { 462 return fmt.Sprintf("not(%v)", t.child) 463} 464 465func (t *noVisitMatchTree) String() string { 466 return fmt.Sprintf("novisit(%v)", t.matchTree) 467} 468 469func (t *fileNameMatchTree) String() string { 470 return fmt.Sprintf("f(%v)", t.child) 471} 472 473func (t *substrMatchTree) String() string { 474 f := "" 475 if t.fileName { 476 f = "f" 477 } 478 479 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 480} 481 482func (t *branchQueryMatchTree) String() string { 483 return fmt.Sprintf("branch(%x)", t.masks) 484} 485 486func (t *symbolSubstrMatchTree) String() string { 487 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 488} 489 490func (t *symbolRegexpMatchTree) String() string { 491 return fmt.Sprintf("symbol(%v)", t.matchTree) 492} 493 494// visitMatches visits all atoms in matchTree. Note: This visits 495// noVisitMatchTree. For collecting matches use visitMatches. 496func visitMatchTree(t matchTree, f func(matchTree)) { 497 switch s := t.(type) { 498 case *andMatchTree: 499 for _, ch := range s.children { 500 visitMatchTree(ch, f) 501 } 502 case *orMatchTree: 503 for _, ch := range s.children { 504 visitMatchTree(ch, f) 505 } 506 case *andLineMatchTree: 507 visitMatchTree(&s.andMatchTree, f) 508 case *noVisitMatchTree: 509 visitMatchTree(s.matchTree, f) 510 case *notMatchTree: 511 visitMatchTree(s.child, f) 512 case *fileNameMatchTree: 513 visitMatchTree(s.child, f) 514 case *symbolSubstrMatchTree: 515 visitMatchTree(s.substrMatchTree, f) 516 case *symbolRegexpMatchTree: 517 visitMatchTree(s.matchTree, f) 518 default: 519 f(t) 520 } 521} 522 523// visitMatches visits all atoms which can contribute matches. Note: This 524// skips noVisitMatchTree. 525func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 526 switch s := t.(type) { 527 case *andMatchTree: 528 for _, ch := range s.children { 529 if known[ch] { 530 visitMatches(ch, known, f) 531 } 532 } 533 case *andLineMatchTree: 534 visitMatches(&s.andMatchTree, known, f) 535 case *orMatchTree: 536 for _, ch := range s.children { 537 if known[ch] { 538 visitMatches(ch, known, f) 539 } 540 } 541 case *symbolSubstrMatchTree: 542 visitMatches(s.substrMatchTree, known, f) 543 case *notMatchTree: 544 case *noVisitMatchTree: 545 // don't collect into negative trees. 546 case *fileNameMatchTree: 547 // We will just gather the filename if we do not visit this tree. 548 default: 549 f(s) 550 } 551} 552 553// all matches() methods. 554 555func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 556 return t.predicate(cp.idx), true 557} 558 559func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 560 return true, true 561} 562 563// andLineMatchTree is a performance optimization of andMatchTree. For content 564// searches we don't want to run the regex engine if there is no line that 565// contains matches from all terms. 566func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 567 matches, sure := t.andMatchTree.matches(cp, cost, known) 568 if !(sure && matches) { 569 return matches, sure 570 } 571 572 // find child with fewest candidates 573 min := maxUInt32 574 fewestChildren := 0 575 for ix, child := range t.children { 576 v, ok := child.(*substrMatchTree) 577 // make sure we are running a content search and that all candidates are a 578 // substrMatchTree 579 if !ok || v.fileName { 580 return matches, sure 581 } 582 if len(v.current) < min { 583 min = len(v.current) 584 fewestChildren = ix 585 } 586 } 587 588 type lineRange struct { 589 start int 590 end int 591 } 592 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 593 prev := -1 594 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 595 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 596 if line == prev { 597 continue 598 } 599 prev = line 600 lines = append(lines, lineRange{byteStart, byteEnd}) 601 } 602 603 // children keeps track of the children's candidates we have already seen. 604 children := make([][]*candidateMatch, 0, len(t.children)-1) 605 for j, child := range t.children { 606 if j == fewestChildren { 607 continue 608 } 609 children = append(children, child.(*substrMatchTree).current) 610 } 611 612nextLine: 613 for i := 0; i < len(lines); i++ { 614 hits := 1 615 nextChild: 616 for j := range children { 617 nextCandidate: 618 for len(children[j]) > 0 { 619 candidate := children[j][0] 620 bo := int(cp.findOffset(false, candidate.runeOffset)) 621 if bo < lines[i].start { 622 children[j] = children[j][1:] 623 continue nextCandidate 624 } 625 if bo <= lines[i].end { 626 hits++ 627 continue nextChild 628 } 629 // move the `lines` iterator forward until bo <= line.end 630 for i < len(lines) && bo > lines[i].end { 631 i++ 632 } 633 i-- 634 continue nextLine 635 } 636 } 637 // return early once we found any line that contains matches from all children 638 if hits == len(t.children) { 639 return matches, true 640 } 641 } 642 return false, true 643} 644 645func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 646 sure := true 647 648 for _, ch := range t.children { 649 v, ok := evalMatchTree(cp, cost, known, ch) 650 if ok && !v { 651 return false, true 652 } 653 if !ok { 654 sure = false 655 } 656 } 657 658 return true, sure 659} 660 661func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 662 matches := false 663 sure := true 664 for _, ch := range t.children { 665 v, ok := evalMatchTree(cp, cost, known, ch) 666 if ok { 667 // we could short-circuit, but we want to use 668 // the other possibilities as a ranking 669 // signal. 670 matches = matches || v 671 } else { 672 sure = false 673 } 674 } 675 return matches, sure 676} 677 678func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 679 return t.branchMask() != 0, true 680} 681 682func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 683 if t.reEvaluated { 684 return len(t.found) > 0, true 685 } 686 687 if cost < costRegexp { 688 return false, false 689 } 690 691 cp.stats.RegexpsConsidered++ 692 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 693 found := t.found[:0] 694 for _, idx := range idxs { 695 cm := &candidateMatch{ 696 byteOffset: uint32(idx[0]), 697 byteMatchSz: uint32(idx[1] - idx[0]), 698 fileName: t.fileName, 699 } 700 701 found = append(found, cm) 702 } 703 t.found = found 704 t.reEvaluated = true 705 706 return len(t.found) > 0, true 707} 708 709func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 710 if t.evaluated { 711 return len(t.found) > 0, true 712 } 713 714 if cost < costRegexp { 715 return false, false 716 } 717 718 data := cp.data(t.fileName) 719 offset := 0 720 found := t.found[:0] 721 for { 722 idx := bytes.Index(data[offset:], []byte(t.word)) 723 if idx < 0 { 724 break 725 } 726 727 relStartOffset := offset + idx 728 relEndOffset := relStartOffset + len(t.word) 729 730 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 731 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 732 if startBoundary && endBoundary { 733 found = append(found, &candidateMatch{ 734 byteOffset: uint32(offset + idx), 735 byteMatchSz: uint32(len(t.word)), 736 fileName: t.fileName, 737 }) 738 } 739 offset += idx + len(t.word) 740 } 741 742 t.found = found 743 t.evaluated = true 744 745 return len(t.found) > 0, true 746} 747 748// breakMatchesOnNewlines returns matches resulting from breaking each element 749// of cms on newlines within text. 750func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 751 var lineCMs []*candidateMatch 752 for _, cm := range cms { 753 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 754 } 755 return lineCMs 756} 757 758// breakOnNewlines returns matches resulting from breaking cm on newlines 759// within text. 760func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 761 var cms []*candidateMatch 762 addMe := &candidateMatch{} 763 *addMe = *cm 764 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 765 if text[i] == '\n' { 766 addMe.byteMatchSz = i - addMe.byteOffset 767 if addMe.byteMatchSz != 0 { 768 cms = append(cms, addMe) 769 } 770 771 addMe = &candidateMatch{} 772 *addMe = *cm 773 addMe.byteOffset = i + 1 774 } 775 } 776 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 777 if addMe.byteMatchSz != 0 { 778 cms = append(cms, addMe) 779 } 780 return cms 781} 782 783func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 784 if v, ok := known[mt]; ok { 785 return v, true 786 } 787 788 v, ok := mt.matches(cp, cost, known) 789 if ok { 790 known[mt] = v 791 } 792 793 return v, ok 794} 795 796func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 797 v, ok := evalMatchTree(cp, cost, known, t.child) 798 return !v, ok 799} 800 801func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 802 return evalMatchTree(cp, cost, known, t.child) 803} 804 805func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 806 if t.contEvaluated { 807 return len(t.current) > 0, true 808 } 809 810 if len(t.current) == 0 { 811 return false, true 812 } 813 814 if t.fileName && cost < costMemory { 815 return false, false 816 } 817 818 if !t.fileName && cost < costContent { 819 return false, false 820 } 821 822 pruned := t.current[:0] 823 for _, m := range t.current { 824 if m.byteOffset == 0 && m.runeOffset > 0 { 825 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 826 } 827 if m.matchContent(cp.data(m.fileName)) { 828 pruned = append(pruned, m) 829 } 830 } 831 t.current = pruned 832 t.contEvaluated = true 833 834 return len(t.current) > 0, true 835} 836 837type matchTreeOpt struct { 838 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 839 // This was added since we do not support wordMatchTree with symbol search. 840 DisableWordMatchOptimization bool 841} 842 843func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 844 if q == nil { 845 return nil, fmt.Errorf("got nil (sub)query") 846 } 847 switch s := q.(type) { 848 case *query.Regexp: 849 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 850 // superset of the regexp. If the returned matchTree is equivalent to the 851 // original regexp, it returns true. An equivalent matchTree has the same 852 // behaviour as the original regexp and can be used instead. 853 // 854 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 855 if err != nil { 856 return nil, err 857 } 858 // if the query can be used in place of the regexp 859 // return the subtree 860 if isEq { 861 return subMT, nil 862 } 863 864 var tr matchTree 865 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 866 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 867 // provide something faster. 868 tr = wmt 869 } else { 870 prefix := "" 871 if !s.CaseSensitive { 872 prefix = "(?i)" 873 } 874 875 tr = &regexpMatchTree{ 876 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 877 fileName: s.FileName, 878 } 879 } 880 881 return &andMatchTree{ 882 children: []matchTree{ 883 tr, &noVisitMatchTree{subMT}, 884 }, 885 }, nil 886 case *query.And: 887 var r []matchTree 888 for _, ch := range s.Children { 889 ct, err := d.newMatchTree(ch, opt) 890 if err != nil { 891 return nil, err 892 } 893 r = append(r, ct) 894 } 895 return &andMatchTree{r}, nil 896 case *query.Or: 897 var r []matchTree 898 for _, ch := range s.Children { 899 ct, err := d.newMatchTree(ch, opt) 900 if err != nil { 901 return nil, err 902 } 903 r = append(r, ct) 904 } 905 return &orMatchTree{r}, nil 906 case *query.Not: 907 ct, err := d.newMatchTree(s.Child, opt) 908 return &notMatchTree{ 909 child: ct, 910 }, err 911 912 case *query.Type: 913 if s.Type != query.TypeFileName { 914 break 915 } 916 917 ct, err := d.newMatchTree(s.Child, opt) 918 if err != nil { 919 return nil, err 920 } 921 922 return &fileNameMatchTree{ 923 child: ct, 924 }, nil 925 926 case *query.Substring: 927 return d.newSubstringMatchTree(s) 928 929 case *query.Branch: 930 masks := make([]uint64, 0, len(d.repoMetaData)) 931 if s.Pattern == "HEAD" { 932 for i := 0; i < len(d.repoMetaData); i++ { 933 masks = append(masks, 1) 934 } 935 } else { 936 for _, branchIDs := range d.branchIDs { 937 mask := uint64(0) 938 for nm, m := range branchIDs { 939 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 940 mask |= uint64(m) 941 } 942 } 943 masks = append(masks, mask) 944 } 945 946 } 947 return &branchQueryMatchTree{ 948 masks: masks, 949 fileMasks: d.fileBranchMasks, 950 repos: d.repos, 951 }, nil 952 case *query.Const: 953 if s.Value { 954 return &bruteForceMatchTree{}, nil 955 } else { 956 return &noMatchTree{"const"}, nil 957 } 958 case *query.Language: 959 code, ok := d.metaData.LanguageMap[s.Language] 960 if !ok { 961 return &noMatchTree{"lang"}, nil 962 } 963 return &docMatchTree{ 964 reason: "language", 965 numDocs: d.numDocs(), 966 predicate: func(docID uint32) bool { 967 return d.getLanguage(docID) == code 968 }, 969 }, nil 970 971 case *query.Symbol: 972 // Disable WordMatchTree since we don't support it in symbols yet. 973 optCopy := opt 974 optCopy.DisableWordMatchOptimization = true 975 976 subMT, err := d.newMatchTree(s.Expr, optCopy) 977 if err != nil { 978 return nil, err 979 } 980 981 if substr, ok := subMT.(*substrMatchTree); ok { 982 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is 983 // non-nil it means we need to lazily decode on request. 984 sections := d.runeDocSections 985 if sections == nil && d.runeDocSectionsRaw != nil { 986 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil) 987 } 988 989 return &symbolSubstrMatchTree{ 990 substrMatchTree: substr, 991 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 992 fileEndRunes: d.fileEndRunes, 993 fileEndSymbol: d.fileEndSymbol, 994 sections: sections, 995 }, nil 996 } 997 998 var regexp *regexp.Regexp 999 visitMatchTree(subMT, func(mt matchTree) { 1000 if t, ok := mt.(*regexpMatchTree); ok { 1001 regexp = t.regexp 1002 } 1003 }) 1004 if regexp == nil { 1005 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1006 } 1007 1008 return &symbolRegexpMatchTree{ 1009 regexp: regexp, 1010 all: regexp.String() == "(?i)(?-s:.)*", 1011 matchTree: subMT, 1012 }, nil 1013 1014 case *query.FileNameSet: 1015 return &docMatchTree{ 1016 reason: "FileNameSet", 1017 numDocs: d.numDocs(), 1018 predicate: func(docID uint32) bool { 1019 fileName := d.fileName(docID) 1020 _, ok := s.Set[string(fileName)] 1021 return ok 1022 }, 1023 }, nil 1024 1025 case *query.BranchesRepos: 1026 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1027 for repoIdx := range d.repoMetaData { 1028 var mask uint64 1029 for _, br := range s.List { 1030 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1031 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1032 } 1033 } 1034 reposBranchesWant[repoIdx] = mask 1035 } 1036 return &docMatchTree{ 1037 reason: "BranchesRepos", 1038 numDocs: d.numDocs(), 1039 predicate: func(docID uint32) bool { 1040 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1041 }, 1042 }, nil 1043 1044 case *query.RepoSet: 1045 reposWant := make([]bool, len(d.repoMetaData)) 1046 for repoIdx, r := range d.repoMetaData { 1047 if _, ok := s.Set[r.Name]; ok { 1048 reposWant[repoIdx] = true 1049 } 1050 } 1051 return &docMatchTree{ 1052 reason: "RepoSet", 1053 numDocs: d.numDocs(), 1054 predicate: func(docID uint32) bool { 1055 return reposWant[d.repos[docID]] 1056 }, 1057 }, nil 1058 1059 case *query.RepoIDs: 1060 reposWant := make([]bool, len(d.repoMetaData)) 1061 for repoIdx, r := range d.repoMetaData { 1062 if s.Repos.Contains(r.ID) { 1063 reposWant[repoIdx] = true 1064 } 1065 } 1066 return &docMatchTree{ 1067 reason: "RepoIDs", 1068 numDocs: d.numDocs(), 1069 predicate: func(docID uint32) bool { 1070 return reposWant[d.repos[docID]] 1071 }, 1072 }, nil 1073 1074 case *query.Repo: 1075 reposWant := make([]bool, len(d.repoMetaData)) 1076 for repoIdx, r := range d.repoMetaData { 1077 if s.Regexp.MatchString(r.Name) { 1078 reposWant[repoIdx] = true 1079 } 1080 } 1081 return &docMatchTree{ 1082 reason: "Repo", 1083 numDocs: d.numDocs(), 1084 predicate: func(docID uint32) bool { 1085 return reposWant[d.repos[docID]] 1086 }, 1087 }, nil 1088 1089 case *query.RepoRegexp: 1090 reposWant := make([]bool, len(d.repoMetaData)) 1091 for repoIdx, r := range d.repoMetaData { 1092 if s.Regexp.MatchString(r.Name) { 1093 reposWant[repoIdx] = true 1094 } 1095 } 1096 return &docMatchTree{ 1097 reason: "RepoRegexp", 1098 numDocs: d.numDocs(), 1099 predicate: func(docID uint32) bool { 1100 return reposWant[d.repos[docID]] 1101 }, 1102 }, nil 1103 1104 case query.RawConfig: 1105 return &docMatchTree{ 1106 reason: s.String(), 1107 numDocs: d.numDocs(), 1108 predicate: func(docID uint32) bool { 1109 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1110 }, 1111 }, nil 1112 } 1113 log.Panicf("type %T", q) 1114 return nil, nil 1115} 1116 1117func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1118 st := &substrMatchTree{ 1119 query: s, 1120 caseSensitive: s.CaseSensitive, 1121 fileName: s.FileName, 1122 } 1123 1124 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1125 prefix := "" 1126 if !s.CaseSensitive { 1127 prefix = "(?i)" 1128 } 1129 t := &regexpMatchTree{ 1130 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1131 fileName: s.FileName, 1132 } 1133 return t, nil 1134 } 1135 1136 result, err := d.iterateNgrams(s) 1137 if err != nil { 1138 return nil, err 1139 } 1140 st.matchIterator = result 1141 return st, nil 1142} 1143 1144func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1145 if opt.DisableWordMatchOptimization { 1146 return nil, false 1147 } 1148 // Needs to be case sensitive 1149 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1150 return nil, false 1151 } 1152 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1153 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1154 return nil, false 1155 } 1156 sub := q.Regexp.Sub 1157 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1158 return nil, false 1159 } 1160 1161 return &wordMatchTree{ 1162 word: string(sub[1].Rune), 1163 fileName: q.FileName, 1164 }, true 1165} 1166 1167// pruneMatchTree removes impossible branches from the matchTree, as indicated 1168// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1169func pruneMatchTree(mt matchTree) (matchTree, error) { 1170 var err error 1171 switch mt := mt.(type) { 1172 // leaf nodes that we test with the filter: 1173 case *substrMatchTree: 1174 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1175 if _, ok := res.matchIterator.(*noMatchTree); ok { 1176 return nil, nil 1177 } 1178 } 1179 // recursive tree structures: 1180 case *andMatchTree: 1181 // Any branch of an and becoming impossible means the entire clause 1182 // is impossible. Otherwise, just handle rewrites. 1183 for i, child := range mt.children { 1184 newChild, err := pruneMatchTree(child) 1185 if err != nil { 1186 return nil, err 1187 } 1188 if newChild == nil { 1189 return nil, nil 1190 } 1191 mt.children[i] = newChild 1192 } 1193 case *orMatchTree: 1194 // *All* branches of an OR becoming impossible means the entire clause 1195 // is impossible. Otherwise, drop impossible subclauses and handle 1196 // rewrites, including simplifying to a singular resulting child branch. 1197 n := 0 1198 for _, child := range mt.children { 1199 newChild, err := pruneMatchTree(child) 1200 if err != nil { 1201 return nil, err 1202 } 1203 if newChild != nil { 1204 mt.children[n] = newChild 1205 n++ 1206 } 1207 } 1208 mt.children = mt.children[:n] 1209 if len(mt.children) == 1 { 1210 return mt.children[0], nil 1211 } else if len(mt.children) == 0 { 1212 return nil, nil 1213 } 1214 case *noVisitMatchTree: 1215 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1216 if err != nil { 1217 return nil, err 1218 } 1219 if mt.matchTree == nil { 1220 return nil, nil 1221 } 1222 case *fileNameMatchTree: 1223 mt.child, err = pruneMatchTree(mt.child) 1224 case *andLineMatchTree: 1225 child, err := pruneMatchTree(&mt.andMatchTree) 1226 if err != nil { 1227 return nil, err 1228 } 1229 if child == nil { 1230 return nil, nil 1231 } 1232 if c, ok := child.(*andMatchTree); ok { 1233 mt.andMatchTree = *c 1234 } else { 1235 // the and was simplified to a single clause, 1236 // so the linematch portion is irrelevant. 1237 return mt, nil 1238 } 1239 case *notMatchTree: 1240 mt.child, err = pruneMatchTree(mt.child) 1241 if err != nil { 1242 return nil, err 1243 } 1244 if mt.child == nil { 1245 // not false => true 1246 return &bruteForceMatchTree{}, nil 1247 } 1248 // unhandled: 1249 case *docMatchTree: 1250 case *bruteForceMatchTree: 1251 case *regexpMatchTree: 1252 case *wordMatchTree: 1253 } 1254 return mt, err 1255}