fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "fmt" 19 "log" 20 "strings" 21 "unicode/utf8" 22 23 "github.com/grafana/regexp" 24 "github.com/sourcegraph/zoekt/query" 25) 26 27// A docIterator iterates over documents in order. 28type docIterator interface { 29 // provide the next document where we can may find something 30 // interesting. 31 nextDoc() uint32 32 33 // clears any per-document state of the docIterator, and 34 // prepares for evaluating the given doc. The argument is 35 // strictly increasing over time. 36 prepare(nextDoc uint32) 37} 38 39const ( 40 costConst = 0 41 costMemory = 1 42 costContent = 2 43 costRegexp = 3 44) 45 46const ( 47 costMin = costConst 48 costMax = costRegexp 49) 50 51// An expression tree coupled with matches. The matchtree has two 52// functions: 53// 54// * it implements boolean combinations (and, or, not) 55// 56// * it implements shortcuts, where we skip documents (for example: if 57// there are no trigram matches, we can be sure there are no substring 58// matches). The matchtree iterates over the documents as they are 59// ordered in the shard. 60// 61// The general process for a given (shard, query) is: 62// 63// - construct matchTree for the query 64// 65// - find all different leaf matchTrees (substring, regexp, etc.) 66// 67// in a loop: 68// 69// - find next doc to process using nextDoc 70// 71// - evaluate atoms (leaf expressions that match text) 72// 73// - evaluate the tree using matches(), storing the result in map. 74// 75// - if the complete tree returns (matches() == true) for the document, 76// collect all text matches by looking at leaf matchTrees 77type matchTree interface { 78 docIterator 79 80 // returns whether this matches, and if we are sure. 81 matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 82} 83 84// docMatchTree iterates over documents for which predicate(docID) returns true. 85type docMatchTree struct { 86 // the number of documents in a shard. 87 numDocs uint32 88 89 predicate func(docID uint32) bool 90 91 // provides additional information about the reason why the docMatchTree was 92 // created. 93 reason string 94 95 // mutable 96 firstDone bool 97 docID uint32 98} 99 100type bruteForceMatchTree struct { 101 // mutable 102 firstDone bool 103 docID uint32 104} 105 106type andLineMatchTree struct { 107 andMatchTree 108} 109 110type andMatchTree struct { 111 children []matchTree 112} 113 114type orMatchTree struct { 115 children []matchTree 116} 117 118type notMatchTree struct { 119 child matchTree 120} 121 122// Returns only the filename of child matches. 123type fileNameMatchTree struct { 124 child matchTree 125} 126 127// Don't visit this subtree for collecting matches. 128type noVisitMatchTree struct { 129 matchTree 130} 131 132type regexpMatchTree struct { 133 regexp *regexp.Regexp 134 135 fileName bool 136 137 // mutable 138 reEvaluated bool 139 found []*candidateMatch 140 141 // nextDoc, prepare. 142 bruteForceMatchTree 143} 144 145type substrMatchTree struct { 146 matchIterator 147 148 query *query.Substring 149 caseSensitive bool 150 fileName bool 151 152 // mutable 153 current []*candidateMatch 154 contEvaluated bool 155} 156 157type branchQueryMatchTree struct { 158 fileMasks []uint64 159 masks []uint64 160 repos []uint16 161 162 // mutable 163 firstDone bool 164 docID uint32 165} 166 167type symbolRegexpMatchTree struct { 168 matchTree 169 regexp *regexp.Regexp 170 all bool // skips regex match if .* 171 172 reEvaluated bool 173 found []*candidateMatch 174} 175 176func (t *symbolRegexpMatchTree) prepare(doc uint32) { 177 t.reEvaluated = false 178} 179 180func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 181 if t.reEvaluated { 182 return len(t.found) > 0, true 183 } 184 185 if cost < costRegexp { 186 return false, false 187 } 188 189 sections := cp.docSections() 190 content := cp.data(false) 191 192 found := t.found[:0] 193 for i, sec := range sections { 194 var idx []int 195 if t.all { 196 idx = []int{0, int(sec.End - sec.Start)} 197 } else { 198 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 199 if idx == nil { 200 continue 201 } 202 } 203 204 cm := &candidateMatch{ 205 byteOffset: sec.Start + uint32(idx[0]), 206 byteMatchSz: uint32(idx[1] - idx[0]), 207 symbol: true, 208 symbolIdx: uint32(i), 209 } 210 found = append(found, cm) 211 } 212 t.found = found 213 t.reEvaluated = true 214 215 return len(t.found) > 0, true 216} 217 218type symbolSubstrMatchTree struct { 219 *substrMatchTree 220 221 patternSize uint32 222 fileEndRunes []uint32 223 fileEndSymbol []uint32 224 225 doc uint32 226 sections []DocumentSection 227 228 secID uint32 229} 230 231func (t *symbolSubstrMatchTree) prepare(doc uint32) { 232 t.substrMatchTree.prepare(doc) 233 t.doc = doc 234 235 var fileStart uint32 236 if doc > 0 { 237 fileStart = t.fileEndRunes[doc-1] 238 } 239 240 var sections []DocumentSection 241 if len(t.sections) > 0 { 242 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 243 if most == uint32(len(t.sections)) { 244 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 245 } else { 246 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 247 t.secID++ 248 } 249 250 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 251 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 252 symbolEnd++ 253 } 254 255 sections = t.sections[t.secID:symbolEnd] 256 } 257 } 258 259 secIdx := 0 260 trimmed := t.current[:0] 261 for len(sections) > secIdx && len(t.current) > 0 { 262 start := fileStart + t.current[0].runeOffset 263 end := start + t.patternSize 264 if start >= sections[secIdx].End { 265 secIdx++ 266 continue 267 } 268 269 if start < sections[secIdx].Start { 270 t.current = t.current[1:] 271 continue 272 } 273 274 if end <= sections[secIdx].End { 275 t.current[0].symbol = true 276 t.current[0].symbolIdx = uint32(secIdx) 277 trimmed = append(trimmed, t.current[0]) 278 } 279 280 t.current = t.current[1:] 281 } 282 t.current = trimmed 283} 284 285// all prepare methods 286 287func (t *bruteForceMatchTree) prepare(doc uint32) { 288 t.docID = doc 289 t.firstDone = true 290} 291 292func (t *docMatchTree) prepare(doc uint32) { 293 t.docID = doc 294 t.firstDone = true 295} 296 297func (t *andMatchTree) prepare(doc uint32) { 298 for _, c := range t.children { 299 c.prepare(doc) 300 } 301} 302 303func (t *regexpMatchTree) prepare(doc uint32) { 304 t.found = t.found[:0] 305 t.reEvaluated = false 306 t.bruteForceMatchTree.prepare(doc) 307} 308 309func (t *orMatchTree) prepare(doc uint32) { 310 for _, c := range t.children { 311 c.prepare(doc) 312 } 313} 314 315func (t *notMatchTree) prepare(doc uint32) { 316 t.child.prepare(doc) 317} 318 319func (t *fileNameMatchTree) prepare(doc uint32) { 320 t.child.prepare(doc) 321} 322 323func (t *substrMatchTree) prepare(nextDoc uint32) { 324 t.matchIterator.prepare(nextDoc) 325 t.current = t.matchIterator.candidates() 326 t.contEvaluated = false 327} 328 329func (t *branchQueryMatchTree) prepare(doc uint32) { 330 t.firstDone = true 331 t.docID = doc 332} 333 334// nextDoc 335 336func (t *docMatchTree) nextDoc() uint32 { 337 var start uint32 338 if t.firstDone { 339 start = t.docID + 1 340 } 341 for i := start; i < t.numDocs; i++ { 342 if t.predicate(i) { 343 return i 344 } 345 } 346 return maxUInt32 347} 348 349func (t *bruteForceMatchTree) nextDoc() uint32 { 350 if !t.firstDone { 351 return 0 352 } 353 return t.docID + 1 354} 355 356func (t *andMatchTree) nextDoc() uint32 { 357 var max uint32 358 for _, c := range t.children { 359 m := c.nextDoc() 360 if m > max { 361 max = m 362 } 363 } 364 return max 365} 366 367func (t *orMatchTree) nextDoc() uint32 { 368 min := uint32(maxUInt32) 369 for _, c := range t.children { 370 m := c.nextDoc() 371 if m < min { 372 min = m 373 } 374 } 375 return min 376} 377 378func (t *notMatchTree) nextDoc() uint32 { 379 return 0 380} 381 382func (t *fileNameMatchTree) nextDoc() uint32 { 383 return t.child.nextDoc() 384} 385 386func (t *branchQueryMatchTree) nextDoc() uint32 { 387 var start uint32 388 if t.firstDone { 389 start = t.docID + 1 390 } 391 392 for i := start; i < uint32(len(t.fileMasks)); i++ { 393 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 394 return i 395 } 396 } 397 return maxUInt32 398} 399 400// all String methods 401 402func (t *bruteForceMatchTree) String() string { 403 return "all" 404} 405 406func (t *docMatchTree) String() string { 407 return fmt.Sprintf("doc(%s)", t.reason) 408} 409 410func (t *andMatchTree) String() string { 411 return fmt.Sprintf("and%v", t.children) 412} 413 414func (t *regexpMatchTree) String() string { 415 f := "" 416 if t.fileName { 417 f = "f" 418 } 419 return fmt.Sprintf("%sre(%s)", f, t.regexp) 420} 421 422func (t *orMatchTree) String() string { 423 return fmt.Sprintf("or%v", t.children) 424} 425 426func (t *notMatchTree) String() string { 427 return fmt.Sprintf("not(%v)", t.child) 428} 429 430func (t *noVisitMatchTree) String() string { 431 return fmt.Sprintf("novisit(%v)", t.matchTree) 432} 433 434func (t *fileNameMatchTree) String() string { 435 return fmt.Sprintf("f(%v)", t.child) 436} 437 438func (t *substrMatchTree) String() string { 439 f := "" 440 if t.fileName { 441 f = "f" 442 } 443 444 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 445} 446 447func (t *branchQueryMatchTree) String() string { 448 return fmt.Sprintf("branch(%x)", t.masks) 449} 450 451func (t *symbolSubstrMatchTree) String() string { 452 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 453} 454 455func (t *symbolRegexpMatchTree) String() string { 456 return fmt.Sprintf("symbol(%v)", t.matchTree) 457} 458 459// visitMatches visits all atoms in matchTree. Note: This visits 460// noVisitMatchTree. For collecting matches use visitMatches. 461func visitMatchTree(t matchTree, f func(matchTree)) { 462 switch s := t.(type) { 463 case *andMatchTree: 464 for _, ch := range s.children { 465 visitMatchTree(ch, f) 466 } 467 case *orMatchTree: 468 for _, ch := range s.children { 469 visitMatchTree(ch, f) 470 } 471 case *andLineMatchTree: 472 visitMatchTree(&s.andMatchTree, f) 473 case *noVisitMatchTree: 474 visitMatchTree(s.matchTree, f) 475 case *notMatchTree: 476 visitMatchTree(s.child, f) 477 case *fileNameMatchTree: 478 visitMatchTree(s.child, f) 479 case *symbolSubstrMatchTree: 480 visitMatchTree(s.substrMatchTree, f) 481 case *symbolRegexpMatchTree: 482 visitMatchTree(s.matchTree, f) 483 default: 484 f(t) 485 } 486} 487 488// visitMatches visits all atoms which can contribute matches. Note: This 489// skips noVisitMatchTree. 490func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 491 switch s := t.(type) { 492 case *andMatchTree: 493 for _, ch := range s.children { 494 if known[ch] { 495 visitMatches(ch, known, f) 496 } 497 } 498 case *andLineMatchTree: 499 visitMatches(&s.andMatchTree, known, f) 500 case *orMatchTree: 501 for _, ch := range s.children { 502 if known[ch] { 503 visitMatches(ch, known, f) 504 } 505 } 506 case *symbolSubstrMatchTree: 507 visitMatches(s.substrMatchTree, known, f) 508 case *notMatchTree: 509 case *noVisitMatchTree: 510 // don't collect into negative trees. 511 case *fileNameMatchTree: 512 // We will just gather the filename if we do not visit this tree. 513 default: 514 f(s) 515 } 516} 517 518// all matches() methods. 519 520func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 521 return t.predicate(cp.idx), true 522} 523 524func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 525 return true, true 526} 527 528// andLineMatchTree is a performance optimization of andMatchTree. For content 529// searches we don't want to run the regex engine if there is no line that 530// contains matches from all terms. 531func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 532 matches, sure := t.andMatchTree.matches(cp, cost, known) 533 if !(sure && matches) { 534 return matches, sure 535 } 536 537 // find child with fewest candidates 538 min := maxUInt32 539 fewestChildren := 0 540 for ix, child := range t.children { 541 v, ok := child.(*substrMatchTree) 542 // make sure we are running a content search and that all candidates are a 543 // substrMatchTree 544 if !ok || v.fileName { 545 return matches, sure 546 } 547 if len(v.current) < min { 548 min = len(v.current) 549 fewestChildren = ix 550 } 551 } 552 553 type lineRange struct { 554 start int 555 end int 556 } 557 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 558 prev := -1 559 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 560 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 561 if line == prev { 562 continue 563 } 564 prev = line 565 lines = append(lines, lineRange{byteStart, byteEnd}) 566 } 567 568 // children keeps track of the children's candidates we have already seen. 569 children := make([][]*candidateMatch, 0, len(t.children)-1) 570 for j, child := range t.children { 571 if j == fewestChildren { 572 continue 573 } 574 children = append(children, child.(*substrMatchTree).current) 575 } 576 577nextLine: 578 for i := 0; i < len(lines); i++ { 579 hits := 1 580 nextChild: 581 for j := range children { 582 nextCandidate: 583 for len(children[j]) > 0 { 584 candidate := children[j][0] 585 bo := int(cp.findOffset(false, candidate.runeOffset)) 586 if bo < lines[i].start { 587 children[j] = children[j][1:] 588 continue nextCandidate 589 } 590 if bo <= lines[i].end { 591 hits++ 592 continue nextChild 593 } 594 // move the `lines` iterator forward until bo <= line.end 595 for i < len(lines) && bo > lines[i].end { 596 i++ 597 } 598 i-- 599 continue nextLine 600 } 601 } 602 // return early once we found any line that contains matches from all children 603 if hits == len(t.children) { 604 return matches, true 605 } 606 } 607 return false, true 608} 609 610func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 611 sure := true 612 613 for _, ch := range t.children { 614 v, ok := evalMatchTree(cp, cost, known, ch) 615 if ok && !v { 616 return false, true 617 } 618 if !ok { 619 sure = false 620 } 621 } 622 623 return true, sure 624} 625 626func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 627 matches := false 628 sure := true 629 for _, ch := range t.children { 630 v, ok := evalMatchTree(cp, cost, known, ch) 631 if ok { 632 // we could short-circuit, but we want to use 633 // the other possibilities as a ranking 634 // signal. 635 matches = matches || v 636 } else { 637 sure = false 638 } 639 } 640 return matches, sure 641} 642 643func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 644 return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true 645} 646 647func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 648 if t.reEvaluated { 649 return len(t.found) > 0, true 650 } 651 652 if cost < costRegexp { 653 return false, false 654 } 655 656 cp.stats.RegexpsConsidered++ 657 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 658 found := t.found[:0] 659 for _, idx := range idxs { 660 cm := &candidateMatch{ 661 byteOffset: uint32(idx[0]), 662 byteMatchSz: uint32(idx[1] - idx[0]), 663 fileName: t.fileName, 664 } 665 666 found = append(found, cm) 667 } 668 t.found = found 669 t.reEvaluated = true 670 671 return len(t.found) > 0, true 672} 673 674// breakMatchesOnNewlines returns matches resulting from breaking each element 675// of cms on newlines within text. 676func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 677 var lineCMs []*candidateMatch 678 for _, cm := range cms { 679 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 680 } 681 return lineCMs 682} 683 684// breakOnNewlines returns matches resulting from breaking cm on newlines 685// within text. 686func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 687 var cms []*candidateMatch 688 addMe := &candidateMatch{} 689 *addMe = *cm 690 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 691 if text[i] == '\n' { 692 addMe.byteMatchSz = i - addMe.byteOffset 693 if addMe.byteMatchSz != 0 { 694 cms = append(cms, addMe) 695 } 696 697 addMe = &candidateMatch{} 698 *addMe = *cm 699 addMe.byteOffset = i + 1 700 } 701 } 702 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 703 if addMe.byteMatchSz != 0 { 704 cms = append(cms, addMe) 705 } 706 return cms 707} 708 709func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 710 if v, ok := known[mt]; ok { 711 return v, true 712 } 713 714 v, ok := mt.matches(cp, cost, known) 715 if ok { 716 known[mt] = v 717 } 718 719 return v, ok 720} 721 722func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 723 v, ok := evalMatchTree(cp, cost, known, t.child) 724 return !v, ok 725} 726 727func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 728 return evalMatchTree(cp, cost, known, t.child) 729} 730 731func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 732 if t.contEvaluated { 733 return len(t.current) > 0, true 734 } 735 736 if len(t.current) == 0 { 737 return false, true 738 } 739 740 if t.fileName && cost < costMemory { 741 return false, false 742 } 743 744 if !t.fileName && cost < costContent { 745 return false, false 746 } 747 748 pruned := t.current[:0] 749 for _, m := range t.current { 750 if m.byteOffset == 0 && m.runeOffset > 0 { 751 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 752 } 753 if m.matchContent(cp.data(m.fileName)) { 754 pruned = append(pruned, m) 755 } 756 } 757 t.current = pruned 758 t.contEvaluated = true 759 760 return len(t.current) > 0, true 761} 762 763func (d *indexData) newMatchTree(q query.Q) (matchTree, error) { 764 if q == nil { 765 return nil, fmt.Errorf("got nil (sub)query") 766 } 767 switch s := q.(type) { 768 case *query.Regexp: 769 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 770 // superset of the regexp. If the returned matchTree is equivalent to the 771 // original regexp, it returns true. An equivalent matchTree has the same 772 // behaviour as the original regexp and can be used instead. 773 // 774 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 775 if err != nil { 776 return nil, err 777 } 778 // if the query can be used in place of the regexp 779 // return the subtree 780 if isEq { 781 return subMT, nil 782 } 783 784 prefix := "" 785 if !s.CaseSensitive { 786 prefix = "(?i)" 787 } 788 789 tr := &regexpMatchTree{ 790 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 791 fileName: s.FileName, 792 } 793 794 return &andMatchTree{ 795 children: []matchTree{ 796 tr, &noVisitMatchTree{subMT}, 797 }, 798 }, nil 799 case *query.And: 800 var r []matchTree 801 for _, ch := range s.Children { 802 ct, err := d.newMatchTree(ch) 803 if err != nil { 804 return nil, err 805 } 806 r = append(r, ct) 807 } 808 return &andMatchTree{r}, nil 809 case *query.Or: 810 var r []matchTree 811 for _, ch := range s.Children { 812 ct, err := d.newMatchTree(ch) 813 if err != nil { 814 return nil, err 815 } 816 r = append(r, ct) 817 } 818 return &orMatchTree{r}, nil 819 case *query.Not: 820 ct, err := d.newMatchTree(s.Child) 821 return &notMatchTree{ 822 child: ct, 823 }, err 824 825 case *query.Type: 826 if s.Type != query.TypeFileName { 827 break 828 } 829 830 ct, err := d.newMatchTree(s.Child) 831 if err != nil { 832 return nil, err 833 } 834 835 return &fileNameMatchTree{ 836 child: ct, 837 }, nil 838 839 case *query.Substring: 840 return d.newSubstringMatchTree(s) 841 842 case *query.Branch: 843 masks := make([]uint64, 0, len(d.repoMetaData)) 844 if s.Pattern == "HEAD" { 845 for i := 0; i < len(d.repoMetaData); i++ { 846 masks = append(masks, 1) 847 } 848 } else { 849 for _, branchIDs := range d.branchIDs { 850 mask := uint64(0) 851 for nm, m := range branchIDs { 852 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 853 mask |= uint64(m) 854 } 855 } 856 masks = append(masks, mask) 857 } 858 859 } 860 return &branchQueryMatchTree{ 861 masks: masks, 862 fileMasks: d.fileBranchMasks, 863 repos: d.repos, 864 }, nil 865 case *query.Const: 866 if s.Value { 867 return &bruteForceMatchTree{}, nil 868 } else { 869 return &noMatchTree{"const"}, nil 870 } 871 case *query.Language: 872 code, ok := d.metaData.LanguageMap[s.Language] 873 if !ok { 874 return &noMatchTree{"lang"}, nil 875 } 876 return &docMatchTree{ 877 reason: "language", 878 numDocs: d.numDocs(), 879 predicate: func(docID uint32) bool { 880 return d.getLanguage(docID) == code 881 }, 882 }, nil 883 884 case *query.Symbol: 885 subMT, err := d.newMatchTree(s.Expr) 886 if err != nil { 887 return nil, err 888 } 889 890 if substr, ok := subMT.(*substrMatchTree); ok { 891 // We have a feature flag for lazy decoding. If runeDocSectionsRaw is 892 // non-nil it means we need to lazily decode on request. 893 sections := d.runeDocSections 894 if sections == nil && d.runeDocSectionsRaw != nil { 895 sections = unmarshalDocSections(d.runeDocSectionsRaw, nil) 896 } 897 898 return &symbolSubstrMatchTree{ 899 substrMatchTree: substr, 900 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 901 fileEndRunes: d.fileEndRunes, 902 fileEndSymbol: d.fileEndSymbol, 903 sections: sections, 904 }, nil 905 } 906 907 var regexp *regexp.Regexp 908 visitMatchTree(subMT, func(mt matchTree) { 909 if t, ok := mt.(*regexpMatchTree); ok { 910 regexp = t.regexp 911 } 912 }) 913 if regexp == nil { 914 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 915 } 916 917 return &symbolRegexpMatchTree{ 918 regexp: regexp, 919 all: regexp.String() == "(?i)(?-s:.)*", 920 matchTree: subMT, 921 }, nil 922 923 case *query.FileNameSet: 924 return &docMatchTree{ 925 reason: "FileNameSet", 926 numDocs: d.numDocs(), 927 predicate: func(docID uint32) bool { 928 fileName := d.fileName(docID) 929 _, ok := s.Set[string(fileName)] 930 return ok 931 }, 932 }, nil 933 934 case *query.BranchesRepos: 935 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 936 for repoIdx := range d.repoMetaData { 937 var mask uint64 938 for _, br := range s.List { 939 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 940 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 941 } 942 } 943 reposBranchesWant[repoIdx] = mask 944 } 945 return &docMatchTree{ 946 reason: "BranchesRepos", 947 numDocs: d.numDocs(), 948 predicate: func(docID uint32) bool { 949 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 950 }, 951 }, nil 952 953 case *query.RepoSet: 954 reposWant := make([]bool, len(d.repoMetaData)) 955 for repoIdx, r := range d.repoMetaData { 956 if _, ok := s.Set[r.Name]; ok { 957 reposWant[repoIdx] = true 958 } 959 } 960 return &docMatchTree{ 961 reason: "RepoSet", 962 numDocs: d.numDocs(), 963 predicate: func(docID uint32) bool { 964 return reposWant[d.repos[docID]] 965 }, 966 }, nil 967 968 case *query.Repo: 969 reposWant := make([]bool, len(d.repoMetaData)) 970 for repoIdx, r := range d.repoMetaData { 971 if s.Regexp.MatchString(r.Name) { 972 reposWant[repoIdx] = true 973 } 974 } 975 return &docMatchTree{ 976 reason: "Repo", 977 numDocs: d.numDocs(), 978 predicate: func(docID uint32) bool { 979 return reposWant[d.repos[docID]] 980 }, 981 }, nil 982 983 case *query.RepoRegexp: 984 reposWant := make([]bool, len(d.repoMetaData)) 985 for repoIdx, r := range d.repoMetaData { 986 if s.Regexp.MatchString(r.Name) { 987 reposWant[repoIdx] = true 988 } 989 } 990 return &docMatchTree{ 991 reason: "RepoRegexp", 992 numDocs: d.numDocs(), 993 predicate: func(docID uint32) bool { 994 return reposWant[d.repos[docID]] 995 }, 996 }, nil 997 998 case query.RawConfig: 999 return &docMatchTree{ 1000 reason: s.String(), 1001 numDocs: d.numDocs(), 1002 predicate: func(docID uint32) bool { 1003 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1004 }, 1005 }, nil 1006 } 1007 log.Panicf("type %T", q) 1008 return nil, nil 1009} 1010 1011func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1012 st := &substrMatchTree{ 1013 query: s, 1014 caseSensitive: s.CaseSensitive, 1015 fileName: s.FileName, 1016 } 1017 1018 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1019 prefix := "" 1020 if !s.CaseSensitive { 1021 prefix = "(?i)" 1022 } 1023 t := &regexpMatchTree{ 1024 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1025 fileName: s.FileName, 1026 } 1027 return t, nil 1028 } 1029 1030 result, err := d.iterateNgrams(s) 1031 if err != nil { 1032 return nil, err 1033 } 1034 st.matchIterator = result 1035 return st, nil 1036} 1037 1038// pruneMatchTree removes impossible branches from the matchTree, as indicated 1039// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1040func pruneMatchTree(mt matchTree) (matchTree, error) { 1041 var err error 1042 switch mt := mt.(type) { 1043 // leaf nodes that we test with the filter: 1044 case *substrMatchTree: 1045 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1046 if _, ok := res.matchIterator.(*noMatchTree); ok { 1047 return nil, nil 1048 } 1049 } 1050 // recursive tree structures: 1051 case *andMatchTree: 1052 // Any branch of an and becoming impossible means the entire clause 1053 // is impossible. Otherwise, just handle rewrites. 1054 for i, child := range mt.children { 1055 newChild, err := pruneMatchTree(child) 1056 if err != nil { 1057 return nil, err 1058 } 1059 if newChild == nil { 1060 return nil, nil 1061 } 1062 mt.children[i] = newChild 1063 } 1064 case *orMatchTree: 1065 // *All* branches of an OR becoming impossible means the entire clause 1066 // is impossible. Otherwise, drop impossible subclauses and handle 1067 // rewrites, including simplifying to a singular resulting child branch. 1068 n := 0 1069 for _, child := range mt.children { 1070 newChild, err := pruneMatchTree(child) 1071 if err != nil { 1072 return nil, err 1073 } 1074 if newChild != nil { 1075 mt.children[n] = newChild 1076 n++ 1077 } 1078 } 1079 mt.children = mt.children[:n] 1080 if len(mt.children) == 1 { 1081 return mt.children[0], nil 1082 } else if len(mt.children) == 0 { 1083 return nil, nil 1084 } 1085 case *noVisitMatchTree: 1086 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1087 if err != nil { 1088 return nil, err 1089 } 1090 if mt.matchTree == nil { 1091 return nil, nil 1092 } 1093 case *fileNameMatchTree: 1094 mt.child, err = pruneMatchTree(mt.child) 1095 case *andLineMatchTree: 1096 child, err := pruneMatchTree(&mt.andMatchTree) 1097 if err != nil { 1098 return nil, err 1099 } 1100 if child == nil { 1101 return nil, nil 1102 } 1103 if c, ok := child.(*andMatchTree); ok { 1104 mt.andMatchTree = *c 1105 } else { 1106 // the and was simplified to a single clause, 1107 // so the linematch portion is irrelevant. 1108 return mt, nil 1109 } 1110 case *notMatchTree: 1111 mt.child, err = pruneMatchTree(mt.child) 1112 if err != nil { 1113 return nil, err 1114 } 1115 if mt.child == nil { 1116 // not false => true 1117 return &bruteForceMatchTree{}, nil 1118 } 1119 // unhandled: 1120 case *docMatchTree: 1121 case *bruteForceMatchTree: 1122 case *regexpMatchTree: 1123 } 1124 return mt, err 1125}