fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we may find something interesting. 33 // 34 // This is like a "peek" and shouldn't mutate state. prepare is what should 35 // change state. 36 nextDoc() uint32 37 38 // clears any per-document state of the docIterator, and 39 // prepares for evaluating the given doc. The argument is 40 // strictly increasing over time. 41 prepare(nextDoc uint32) 42} 43 44// costs are passed in increasing order to matchTree.matches until they do not 45// return matchesRequiresHigherCost. 46const ( 47 costConst = 0 48 costMemory = 1 49 costContent = 2 50 costRegexp = 3 51) 52 53const ( 54 costMin = costConst 55 costMax = costRegexp 56) 57 58// matchesState is an enum for the state of a matchTree after a call to 59// matchTree.matches. 60type matchesState uint8 61 62const ( 63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 64 // a search yet since the cost value is not high enough. 65 matchesRequiresHigherCost matchesState = iota 66 67 // matchesFound is returned when matchTree.matches has done a search and 68 // found one or more matches. 69 matchesFound 70 71 // matchesNone is returned when matchTree.matches has done a search and 72 // found nothing. 73 matchesNone 74) 75 76// matchesStatePred is a helper which returns matchesFound if b is true 77// otherwise returns matchesNone. 78func matchesStatePred(b bool) matchesState { 79 if b { 80 return matchesFound 81 } 82 return matchesNone 83} 84 85// matchesStateForSlice is a helper which returns matchesFound if v is 86// non-empty otherwise returns matchesNone. 87func matchesStateForSlice[T any](v []T) matchesState { 88 return matchesStatePred(len(v) > 0) 89} 90 91// An expression tree coupled with matches. The matchtree has two 92// functions: 93// 94// * it implements boolean combinations (and, or, not) 95// 96// * it implements shortcuts, where we skip documents (for example: if 97// there are no trigram matches, we can be sure there are no substring 98// matches). The matchtree iterates over the documents as they are 99// ordered in the shard. 100// 101// The general process for a given (shard, query) is: 102// 103// - construct matchTree for the query 104// 105// - find all different leaf matchTrees (substring, regexp, etc.) 106// 107// in a loop: 108// 109// - find next doc to process using nextDoc 110// 111// - evaluate atoms (leaf expressions that match text) 112// 113// - evaluate the tree using matches(), storing the result in map. 114// 115// - if the complete tree returns (matches() != matchesRequiresHigherCost) 116// for the document, collect all text matches by looking at leaf 117// matchTrees. 118type matchTree interface { 119 docIterator 120 121 // matches if cost is high enough. See documentation for matchesState's 122 // values. 123 // 124 // Note: Do not call this directly, rather use evalMatchTree which uses 125 // known to cache responses once the state transitions away from 126 // matchesRequiresHigherCost. 127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 128} 129 130// docMatchTree iterates over documents for which predicate(docID) returns true. 131type docMatchTree struct { 132 // the number of documents in a shard. 133 numDocs uint32 134 135 predicate func(docID uint32) bool 136 137 // provides additional information about the reason why the docMatchTree was 138 // created. 139 reason string 140 141 // mutable 142 firstDone bool 143 docID uint32 144} 145 146type bruteForceMatchTree struct { 147 // mutable 148 firstDone bool 149 docID uint32 150} 151 152type andLineMatchTree struct { 153 andMatchTree 154} 155 156type andMatchTree struct { 157 children []matchTree 158} 159 160type orMatchTree struct { 161 children []matchTree 162} 163 164type notMatchTree struct { 165 child matchTree 166} 167 168// Returns only the filename of child matches. 169type fileNameMatchTree struct { 170 child matchTree 171} 172 173// Don't visit this subtree for collecting matches. 174type noVisitMatchTree struct { 175 matchTree 176} 177 178type regexpMatchTree struct { 179 regexp *regexp.Regexp 180 181 fileName bool 182 183 // mutable 184 reEvaluated bool 185 found []*candidateMatch 186 187 // nextDoc, prepare. 188 bruteForceMatchTree 189} 190 191// \bLITERAL\b 192type wordMatchTree struct { 193 word string 194 195 fileName bool 196 197 // mutable 198 evaluated bool 199 found []*candidateMatch 200 201 // nextDoc, prepare. 202 bruteForceMatchTree 203} 204 205type substrMatchTree struct { 206 matchIterator 207 208 query *query.Substring 209 caseSensitive bool 210 fileName bool 211 212 // mutable 213 current []*candidateMatch 214 contEvaluated bool 215} 216 217type branchQueryMatchTree struct { 218 fileMasks []uint64 219 masks []uint64 220 repos []uint16 221 222 // mutable 223 firstDone bool 224 docID uint32 225} 226 227func (t *branchQueryMatchTree) branchMask() uint64 { 228 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 229} 230 231type symbolRegexpMatchTree struct { 232 matchTree 233 regexp *regexp.Regexp 234 all bool // skips regex match if .* 235 236 reEvaluated bool 237 found []*candidateMatch 238} 239 240func (t *symbolRegexpMatchTree) prepare(doc uint32) { 241 t.reEvaluated = false 242 t.found = t.found[:0] 243 t.matchTree.prepare(doc) 244} 245 246func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 247 if t.reEvaluated { 248 return matchesStateForSlice(t.found) 249 } 250 251 if cost < costRegexp { 252 return matchesRequiresHigherCost 253 } 254 255 sections := cp.docSections() 256 content := cp.data(false) 257 258 found := t.found[:0] 259 for i, sec := range sections { 260 var idx []int 261 if t.all { 262 idx = []int{0, int(sec.End - sec.Start)} 263 } else { 264 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 265 if idx == nil { 266 continue 267 } 268 } 269 270 cm := &candidateMatch{ 271 byteOffset: sec.Start + uint32(idx[0]), 272 byteMatchSz: uint32(idx[1] - idx[0]), 273 symbol: true, 274 symbolIdx: uint32(i), 275 } 276 found = append(found, cm) 277 } 278 t.found = found 279 t.reEvaluated = true 280 281 return matchesStateForSlice(t.found) 282} 283 284type symbolSubstrMatchTree struct { 285 *substrMatchTree 286 287 patternSize uint32 288 fileEndRunes []uint32 289 fileEndSymbol []uint32 290 291 doc uint32 292 sections []DocumentSection 293 294 secID uint32 295} 296 297func (t *symbolSubstrMatchTree) prepare(doc uint32) { 298 t.substrMatchTree.prepare(doc) 299 t.doc = doc 300 301 var fileStart uint32 302 if doc > 0 { 303 fileStart = t.fileEndRunes[doc-1] 304 } 305 306 var sections []DocumentSection 307 if len(t.sections) > 0 { 308 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 309 if most == uint32(len(t.sections)) { 310 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 311 } else { 312 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 313 t.secID++ 314 } 315 316 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 317 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 318 symbolEnd++ 319 } 320 321 sections = t.sections[t.secID:symbolEnd] 322 } 323 } 324 325 secIdx := 0 326 trimmed := t.current[:0] 327 for len(sections) > secIdx && len(t.current) > 0 { 328 start := fileStart + t.current[0].runeOffset 329 end := start + t.patternSize 330 if start >= sections[secIdx].End { 331 secIdx++ 332 continue 333 } 334 335 if start < sections[secIdx].Start { 336 t.current = t.current[1:] 337 continue 338 } 339 340 if end <= sections[secIdx].End { 341 t.current[0].symbol = true 342 t.current[0].symbolIdx = uint32(secIdx) 343 trimmed = append(trimmed, t.current[0]) 344 } 345 346 t.current = t.current[1:] 347 } 348 t.current = trimmed 349} 350 351// all prepare methods 352 353func (t *bruteForceMatchTree) prepare(doc uint32) { 354 t.docID = doc 355 t.firstDone = true 356} 357 358func (t *docMatchTree) prepare(doc uint32) { 359 t.docID = doc 360 t.firstDone = true 361} 362 363func (t *andMatchTree) prepare(doc uint32) { 364 for _, c := range t.children { 365 c.prepare(doc) 366 } 367} 368 369func (t *regexpMatchTree) prepare(doc uint32) { 370 t.found = t.found[:0] 371 t.reEvaluated = false 372 t.bruteForceMatchTree.prepare(doc) 373} 374 375func (t *wordMatchTree) prepare(doc uint32) { 376 t.found = t.found[:0] 377 t.evaluated = false 378 t.bruteForceMatchTree.prepare(doc) 379} 380 381func (t *orMatchTree) prepare(doc uint32) { 382 for _, c := range t.children { 383 c.prepare(doc) 384 } 385} 386 387func (t *notMatchTree) prepare(doc uint32) { 388 t.child.prepare(doc) 389} 390 391func (t *fileNameMatchTree) prepare(doc uint32) { 392 t.child.prepare(doc) 393} 394 395func (t *substrMatchTree) prepare(nextDoc uint32) { 396 t.matchIterator.prepare(nextDoc) 397 t.current = t.matchIterator.candidates() 398 t.contEvaluated = false 399} 400 401func (t *branchQueryMatchTree) prepare(doc uint32) { 402 t.firstDone = true 403 t.docID = doc 404} 405 406// nextDoc 407 408func (t *docMatchTree) nextDoc() uint32 { 409 var start uint32 410 if t.firstDone { 411 start = t.docID + 1 412 } 413 for i := start; i < t.numDocs; i++ { 414 if t.predicate(i) { 415 return i 416 } 417 } 418 return maxUInt32 419} 420 421func (t *bruteForceMatchTree) nextDoc() uint32 { 422 if !t.firstDone { 423 return 0 424 } 425 return t.docID + 1 426} 427 428func (t *andMatchTree) nextDoc() uint32 { 429 var max uint32 430 for _, c := range t.children { 431 m := c.nextDoc() 432 if m > max { 433 max = m 434 } 435 } 436 return max 437} 438 439func (t *orMatchTree) nextDoc() uint32 { 440 min := uint32(maxUInt32) 441 for _, c := range t.children { 442 m := c.nextDoc() 443 if m < min { 444 min = m 445 } 446 } 447 return min 448} 449 450func (t *notMatchTree) nextDoc() uint32 { 451 return 0 452} 453 454func (t *fileNameMatchTree) nextDoc() uint32 { 455 return t.child.nextDoc() 456} 457 458func (t *branchQueryMatchTree) nextDoc() uint32 { 459 var start uint32 460 if t.firstDone { 461 start = t.docID + 1 462 } 463 464 for i := start; i < uint32(len(t.fileMasks)); i++ { 465 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 466 return i 467 } 468 } 469 return maxUInt32 470} 471 472// all String methods 473 474func (t *bruteForceMatchTree) String() string { 475 return "all" 476} 477 478func (t *docMatchTree) String() string { 479 return fmt.Sprintf("doc(%s)", t.reason) 480} 481 482func (t *andMatchTree) String() string { 483 return fmt.Sprintf("and%v", t.children) 484} 485 486func (t *regexpMatchTree) String() string { 487 f := "" 488 if t.fileName { 489 f = "f" 490 } 491 return fmt.Sprintf("%sre(%s)", f, t.regexp) 492} 493 494func (t *wordMatchTree) String() string { 495 f := "" 496 if t.fileName { 497 f = "f" 498 } 499 return fmt.Sprintf("%sword(%s)", f, t.word) 500} 501 502func (t *orMatchTree) String() string { 503 return fmt.Sprintf("or%v", t.children) 504} 505 506func (t *notMatchTree) String() string { 507 return fmt.Sprintf("not(%v)", t.child) 508} 509 510func (t *noVisitMatchTree) String() string { 511 return fmt.Sprintf("novisit(%v)", t.matchTree) 512} 513 514func (t *fileNameMatchTree) String() string { 515 return fmt.Sprintf("f(%v)", t.child) 516} 517 518func (t *substrMatchTree) String() string { 519 f := "" 520 if t.fileName { 521 f = "f" 522 } 523 524 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 525} 526 527func (t *branchQueryMatchTree) String() string { 528 return fmt.Sprintf("branch(%x)", t.masks) 529} 530 531func (t *symbolSubstrMatchTree) String() string { 532 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 533} 534 535func (t *symbolRegexpMatchTree) String() string { 536 return fmt.Sprintf("symbol(%v)", t.matchTree) 537} 538 539// visitMatches visits all atoms in matchTree. Note: This visits 540// noVisitMatchTree. For collecting matches use visitMatches. 541func visitMatchTree(t matchTree, f func(matchTree)) { 542 switch s := t.(type) { 543 case *andMatchTree: 544 for _, ch := range s.children { 545 visitMatchTree(ch, f) 546 } 547 case *orMatchTree: 548 for _, ch := range s.children { 549 visitMatchTree(ch, f) 550 } 551 case *andLineMatchTree: 552 visitMatchTree(&s.andMatchTree, f) 553 case *noVisitMatchTree: 554 visitMatchTree(s.matchTree, f) 555 case *notMatchTree: 556 visitMatchTree(s.child, f) 557 case *fileNameMatchTree: 558 visitMatchTree(s.child, f) 559 case *symbolSubstrMatchTree: 560 visitMatchTree(s.substrMatchTree, f) 561 case *symbolRegexpMatchTree: 562 visitMatchTree(s.matchTree, f) 563 default: 564 f(t) 565 } 566} 567 568// updateMatchTreeStats calls updateStats on all atoms in mt which have that 569// function defined. 570func updateMatchTreeStats(mt matchTree, stats *Stats) { 571 visitMatchTree(mt, func(mt matchTree) { 572 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 573 atom.updateStats(stats) 574 } 575 }) 576} 577 578// visitMatches visits all atoms which can contribute matches. Note: This 579// skips noVisitMatchTree. 580func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 581 switch s := t.(type) { 582 case *andMatchTree: 583 for _, ch := range s.children { 584 if known[ch] { 585 visitMatches(ch, known, f) 586 } 587 } 588 case *andLineMatchTree: 589 visitMatches(&s.andMatchTree, known, f) 590 case *orMatchTree: 591 for _, ch := range s.children { 592 if known[ch] { 593 visitMatches(ch, known, f) 594 } 595 } 596 case *symbolSubstrMatchTree: 597 visitMatches(s.substrMatchTree, known, f) 598 case *notMatchTree: 599 case *noVisitMatchTree: 600 // don't collect into negative trees. 601 case *fileNameMatchTree: 602 // We will just gather the filename if we do not visit this tree. 603 default: 604 f(s) 605 } 606} 607 608// all matches() methods. 609 610func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 611 return matchesStatePred(t.predicate(cp.idx)) 612} 613 614func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 615 return matchesFound 616} 617 618// andLineMatchTree is a performance optimization of andMatchTree. For content 619// searches we don't want to run the regex engine if there is no line that 620// contains matches from all terms. 621func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 622 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 623 return state 624 } 625 626 // Invariant: all children have matches. If any line contains all of them we 627 // can return MatchesFound. 628 629 // find child with fewest candidates 630 min := maxUInt32 631 fewestChildren := 0 632 for ix, child := range t.children { 633 v, ok := child.(*substrMatchTree) 634 // make sure we are running a content search and that all candidates are a 635 // substrMatchTree 636 if !ok || v.fileName { 637 return matchesFound 638 } 639 if len(v.current) < min { 640 min = len(v.current) 641 fewestChildren = ix 642 } 643 } 644 645 type lineRange struct { 646 start int 647 end int 648 } 649 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 650 prev := -1 651 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 652 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 653 if line == prev { 654 continue 655 } 656 prev = line 657 lines = append(lines, lineRange{byteStart, byteEnd}) 658 } 659 660 // children keeps track of the children's candidates we have already seen. 661 children := make([][]*candidateMatch, 0, len(t.children)-1) 662 for j, child := range t.children { 663 if j == fewestChildren { 664 continue 665 } 666 children = append(children, child.(*substrMatchTree).current) 667 } 668 669nextLine: 670 for i := 0; i < len(lines); i++ { 671 hits := 1 672 nextChild: 673 for j := range children { 674 nextCandidate: 675 for len(children[j]) > 0 { 676 candidate := children[j][0] 677 bo := int(cp.findOffset(false, candidate.runeOffset)) 678 if bo < lines[i].start { 679 children[j] = children[j][1:] 680 continue nextCandidate 681 } 682 if bo <= lines[i].end { 683 hits++ 684 continue nextChild 685 } 686 // move the `lines` iterator forward until bo <= line.end 687 for i < len(lines) && bo > lines[i].end { 688 i++ 689 } 690 i-- 691 continue nextLine 692 } 693 } 694 // return early once we found any line that contains matches from all children 695 if hits == len(t.children) { 696 return matchesFound 697 } 698 } 699 return matchesNone 700} 701 702func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 703 // We have found matches unless a child needs to do more work or it hasn't 704 // found matches. 705 state := matchesFound 706 707 for _, ch := range t.children { 708 switch evalMatchTree(cp, cost, known, ch) { 709 case matchesRequiresHigherCost: 710 // keep evaluating other children incase we come across matchesNone 711 state = matchesRequiresHigherCost 712 case matchesFound: 713 // will return this if every child has this value 714 case matchesNone: 715 return matchesNone 716 } 717 } 718 719 return state 720} 721 722func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 723 // we could short-circuit, but we want to use the other possibilities as a 724 // ranking signal. So we always return the most conservative state. 725 state := matchesNone 726 for _, ch := range t.children { 727 switch evalMatchTree(cp, cost, known, ch) { 728 case matchesRequiresHigherCost: 729 state = matchesRequiresHigherCost 730 case matchesFound: 731 if state != matchesRequiresHigherCost { 732 state = matchesFound 733 } 734 case matchesNone: 735 // noop 736 } 737 } 738 return state 739} 740 741func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 742 return matchesStatePred(t.branchMask() != 0) 743} 744 745func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 746 if t.reEvaluated { 747 return matchesStateForSlice(t.found) 748 } 749 750 if cost < costRegexp { 751 return matchesRequiresHigherCost 752 } 753 754 cp.stats.RegexpsConsidered++ 755 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 756 found := t.found[:0] 757 for _, idx := range idxs { 758 cm := &candidateMatch{ 759 byteOffset: uint32(idx[0]), 760 byteMatchSz: uint32(idx[1] - idx[0]), 761 fileName: t.fileName, 762 } 763 764 found = append(found, cm) 765 } 766 t.found = found 767 t.reEvaluated = true 768 769 return matchesStateForSlice(t.found) 770} 771 772func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 773 if t.evaluated { 774 return matchesStateForSlice(t.found) 775 } 776 777 if cost < costRegexp { 778 return matchesRequiresHigherCost 779 } 780 781 data := cp.data(t.fileName) 782 offset := 0 783 found := t.found[:0] 784 for { 785 idx := bytes.Index(data[offset:], []byte(t.word)) 786 if idx < 0 { 787 break 788 } 789 790 relStartOffset := offset + idx 791 relEndOffset := relStartOffset + len(t.word) 792 793 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 794 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 795 if startBoundary && endBoundary { 796 found = append(found, &candidateMatch{ 797 byteOffset: uint32(offset + idx), 798 byteMatchSz: uint32(len(t.word)), 799 fileName: t.fileName, 800 }) 801 } 802 offset += idx + len(t.word) 803 } 804 805 t.found = found 806 t.evaluated = true 807 808 return matchesStateForSlice(t.found) 809} 810 811// breakMatchesOnNewlines returns matches resulting from breaking each element 812// of cms on newlines within text. 813func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 814 var lineCMs []*candidateMatch 815 for _, cm := range cms { 816 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 817 } 818 return lineCMs 819} 820 821// breakOnNewlines returns matches resulting from breaking cm on newlines 822// within text. 823func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 824 var cms []*candidateMatch 825 addMe := &candidateMatch{} 826 *addMe = *cm 827 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 828 if text[i] == '\n' { 829 addMe.byteMatchSz = i - addMe.byteOffset 830 if addMe.byteMatchSz != 0 { 831 cms = append(cms, addMe) 832 } 833 834 addMe = &candidateMatch{} 835 *addMe = *cm 836 addMe.byteOffset = i + 1 837 } 838 } 839 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 840 if addMe.byteMatchSz != 0 { 841 cms = append(cms, addMe) 842 } 843 return cms 844} 845 846// evalMatchTree should be called instead of directly calling 847// matchTree.matches. It cache known values for future evaluation at higher 848// costs. 849func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 850 if v, ok := known[mt]; ok { 851 return matchesStatePred(v) 852 } 853 854 ms := mt.matches(cp, cost, known) 855 if ms != matchesRequiresHigherCost { 856 known[mt] = ms == matchesFound 857 } 858 859 return ms 860} 861 862func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 863 switch evalMatchTree(cp, cost, known, t.child) { 864 case matchesRequiresHigherCost: 865 return matchesRequiresHigherCost 866 case matchesFound: 867 return matchesNone 868 case matchesNone: 869 return matchesFound 870 default: 871 panic("unreachable") 872 } 873} 874 875func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 876 return evalMatchTree(cp, cost, known, t.child) 877} 878 879func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 880 if t.contEvaluated { 881 return matchesStateForSlice(t.current) 882 } 883 884 if len(t.current) == 0 { 885 return matchesNone 886 } 887 888 if t.fileName && cost < costMemory { 889 return matchesRequiresHigherCost 890 } 891 892 if !t.fileName && cost < costContent { 893 return matchesRequiresHigherCost 894 } 895 896 pruned := t.current[:0] 897 for _, m := range t.current { 898 if m.byteOffset == 0 && m.runeOffset > 0 { 899 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 900 } 901 if m.matchContent(cp.data(m.fileName)) { 902 pruned = append(pruned, m) 903 } 904 } 905 t.current = pruned 906 t.contEvaluated = true 907 908 return matchesStateForSlice(t.current) 909} 910 911type matchTreeOpt struct { 912 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 913 // This was added since we do not support wordMatchTree with symbol search. 914 DisableWordMatchOptimization bool 915} 916 917func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 918 if q == nil { 919 return nil, fmt.Errorf("got nil (sub)query") 920 } 921 switch s := q.(type) { 922 case *query.Regexp: 923 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 924 // superset of the regexp. If the returned matchTree is equivalent to the 925 // original regexp, it returns true. An equivalent matchTree has the same 926 // behaviour as the original regexp and can be used instead. 927 // 928 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 929 if err != nil { 930 return nil, err 931 } 932 // if the query can be used in place of the regexp 933 // return the subtree 934 if isEq { 935 return subMT, nil 936 } 937 938 var tr matchTree 939 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 940 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 941 // provide something faster. 942 tr = wmt 943 } else { 944 prefix := "" 945 if !s.CaseSensitive { 946 prefix = "(?i)" 947 } 948 949 tr = &regexpMatchTree{ 950 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 951 fileName: s.FileName, 952 } 953 } 954 955 return &andMatchTree{ 956 children: []matchTree{ 957 tr, &noVisitMatchTree{subMT}, 958 }, 959 }, nil 960 case *query.And: 961 var r []matchTree 962 for _, ch := range s.Children { 963 ct, err := d.newMatchTree(ch, opt) 964 if err != nil { 965 return nil, err 966 } 967 r = append(r, ct) 968 } 969 return &andMatchTree{r}, nil 970 case *query.Or: 971 var r []matchTree 972 for _, ch := range s.Children { 973 ct, err := d.newMatchTree(ch, opt) 974 if err != nil { 975 return nil, err 976 } 977 r = append(r, ct) 978 } 979 return &orMatchTree{r}, nil 980 case *query.Not: 981 ct, err := d.newMatchTree(s.Child, opt) 982 return &notMatchTree{ 983 child: ct, 984 }, err 985 986 case *query.Type: 987 if s.Type != query.TypeFileName { 988 break 989 } 990 991 ct, err := d.newMatchTree(s.Child, opt) 992 if err != nil { 993 return nil, err 994 } 995 996 return &fileNameMatchTree{ 997 child: ct, 998 }, nil 999 1000 case *query.Substring: 1001 return d.newSubstringMatchTree(s) 1002 1003 case *query.Branch: 1004 masks := make([]uint64, 0, len(d.repoMetaData)) 1005 if s.Pattern == "HEAD" { 1006 for i := 0; i < len(d.repoMetaData); i++ { 1007 masks = append(masks, 1) 1008 } 1009 } else { 1010 for _, branchIDs := range d.branchIDs { 1011 mask := uint64(0) 1012 for nm, m := range branchIDs { 1013 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1014 mask |= uint64(m) 1015 } 1016 } 1017 masks = append(masks, mask) 1018 } 1019 1020 } 1021 return &branchQueryMatchTree{ 1022 masks: masks, 1023 fileMasks: d.fileBranchMasks, 1024 repos: d.repos, 1025 }, nil 1026 case *query.Const: 1027 if s.Value { 1028 return &bruteForceMatchTree{}, nil 1029 } else { 1030 return &noMatchTree{Why: "const"}, nil 1031 } 1032 case *query.Language: 1033 code, ok := d.metaData.LanguageMap[s.Language] 1034 if !ok { 1035 return &noMatchTree{Why: "lang"}, nil 1036 } 1037 return &docMatchTree{ 1038 reason: "language", 1039 numDocs: d.numDocs(), 1040 predicate: func(docID uint32) bool { 1041 return d.getLanguage(docID) == code 1042 }, 1043 }, nil 1044 1045 case *query.Symbol: 1046 // Disable WordMatchTree since we don't support it in symbols yet. 1047 optCopy := opt 1048 optCopy.DisableWordMatchOptimization = true 1049 1050 subMT, err := d.newMatchTree(s.Expr, optCopy) 1051 if err != nil { 1052 return nil, err 1053 } 1054 1055 if substr, ok := subMT.(*substrMatchTree); ok { 1056 return &symbolSubstrMatchTree{ 1057 substrMatchTree: substr, 1058 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1059 fileEndRunes: d.fileEndRunes, 1060 fileEndSymbol: d.fileEndSymbol, 1061 sections: d.runeDocSections, 1062 }, nil 1063 } 1064 1065 var regexp *regexp.Regexp 1066 visitMatchTree(subMT, func(mt matchTree) { 1067 if t, ok := mt.(*regexpMatchTree); ok { 1068 regexp = t.regexp 1069 } 1070 }) 1071 if regexp == nil { 1072 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1073 } 1074 1075 return &symbolRegexpMatchTree{ 1076 regexp: regexp, 1077 all: regexp.String() == "(?i)(?-s:.)*", 1078 matchTree: subMT, 1079 }, nil 1080 1081 case *query.FileNameSet: 1082 return &docMatchTree{ 1083 reason: "FileNameSet", 1084 numDocs: d.numDocs(), 1085 predicate: func(docID uint32) bool { 1086 fileName := d.fileName(docID) 1087 _, ok := s.Set[string(fileName)] 1088 return ok 1089 }, 1090 }, nil 1091 1092 case *query.BranchesRepos: 1093 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1094 for repoIdx := range d.repoMetaData { 1095 var mask uint64 1096 for _, br := range s.List { 1097 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1098 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1099 } 1100 } 1101 reposBranchesWant[repoIdx] = mask 1102 } 1103 return &docMatchTree{ 1104 reason: "BranchesRepos", 1105 numDocs: d.numDocs(), 1106 predicate: func(docID uint32) bool { 1107 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1108 }, 1109 }, nil 1110 1111 case *query.RepoSet: 1112 reposWant := make([]bool, len(d.repoMetaData)) 1113 for repoIdx, r := range d.repoMetaData { 1114 if _, ok := s.Set[r.Name]; ok { 1115 reposWant[repoIdx] = true 1116 } 1117 } 1118 return &docMatchTree{ 1119 reason: "RepoSet", 1120 numDocs: d.numDocs(), 1121 predicate: func(docID uint32) bool { 1122 return reposWant[d.repos[docID]] 1123 }, 1124 }, nil 1125 1126 case *query.RepoIDs: 1127 reposWant := make([]bool, len(d.repoMetaData)) 1128 for repoIdx, r := range d.repoMetaData { 1129 if s.Repos.Contains(r.ID) { 1130 reposWant[repoIdx] = true 1131 } 1132 } 1133 return &docMatchTree{ 1134 reason: "RepoIDs", 1135 numDocs: d.numDocs(), 1136 predicate: func(docID uint32) bool { 1137 return reposWant[d.repos[docID]] 1138 }, 1139 }, nil 1140 1141 case *query.Repo: 1142 reposWant := make([]bool, len(d.repoMetaData)) 1143 for repoIdx, r := range d.repoMetaData { 1144 if s.Regexp.MatchString(r.Name) { 1145 reposWant[repoIdx] = true 1146 } 1147 } 1148 return &docMatchTree{ 1149 reason: "Repo", 1150 numDocs: d.numDocs(), 1151 predicate: func(docID uint32) bool { 1152 return reposWant[d.repos[docID]] 1153 }, 1154 }, nil 1155 1156 case *query.RepoRegexp: 1157 reposWant := make([]bool, len(d.repoMetaData)) 1158 for repoIdx, r := range d.repoMetaData { 1159 if s.Regexp.MatchString(r.Name) { 1160 reposWant[repoIdx] = true 1161 } 1162 } 1163 return &docMatchTree{ 1164 reason: "RepoRegexp", 1165 numDocs: d.numDocs(), 1166 predicate: func(docID uint32) bool { 1167 return reposWant[d.repos[docID]] 1168 }, 1169 }, nil 1170 1171 case query.RawConfig: 1172 return &docMatchTree{ 1173 reason: s.String(), 1174 numDocs: d.numDocs(), 1175 predicate: func(docID uint32) bool { 1176 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1177 }, 1178 }, nil 1179 } 1180 log.Panicf("type %T", q) 1181 return nil, nil 1182} 1183 1184func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1185 st := &substrMatchTree{ 1186 query: s, 1187 caseSensitive: s.CaseSensitive, 1188 fileName: s.FileName, 1189 } 1190 1191 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1192 prefix := "" 1193 if !s.CaseSensitive { 1194 prefix = "(?i)" 1195 } 1196 t := &regexpMatchTree{ 1197 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1198 fileName: s.FileName, 1199 } 1200 return t, nil 1201 } 1202 1203 result, err := d.iterateNgrams(s) 1204 if err != nil { 1205 return nil, err 1206 } 1207 st.matchIterator = result 1208 return st, nil 1209} 1210 1211func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1212 if opt.DisableWordMatchOptimization { 1213 return nil, false 1214 } 1215 // Needs to be case sensitive 1216 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1217 return nil, false 1218 } 1219 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1220 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1221 return nil, false 1222 } 1223 sub := q.Regexp.Sub 1224 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1225 return nil, false 1226 } 1227 1228 return &wordMatchTree{ 1229 word: string(sub[1].Rune), 1230 fileName: q.FileName, 1231 }, true 1232} 1233 1234// pruneMatchTree removes impossible branches from the matchTree, as indicated 1235// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1236func pruneMatchTree(mt matchTree) (matchTree, error) { 1237 var err error 1238 switch mt := mt.(type) { 1239 // leaf nodes that we test with the filter: 1240 case *substrMatchTree: 1241 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1242 if _, ok := res.matchIterator.(*noMatchTree); ok { 1243 return nil, nil 1244 } 1245 } 1246 // recursive tree structures: 1247 case *andMatchTree: 1248 // Any branch of an and becoming impossible means the entire clause 1249 // is impossible. Otherwise, just handle rewrites. 1250 for i, child := range mt.children { 1251 newChild, err := pruneMatchTree(child) 1252 if err != nil { 1253 return nil, err 1254 } 1255 if newChild == nil { 1256 return nil, nil 1257 } 1258 mt.children[i] = newChild 1259 } 1260 case *orMatchTree: 1261 // *All* branches of an OR becoming impossible means the entire clause 1262 // is impossible. Otherwise, drop impossible subclauses and handle 1263 // rewrites, including simplifying to a singular resulting child branch. 1264 n := 0 1265 for _, child := range mt.children { 1266 newChild, err := pruneMatchTree(child) 1267 if err != nil { 1268 return nil, err 1269 } 1270 if newChild != nil { 1271 mt.children[n] = newChild 1272 n++ 1273 } 1274 } 1275 mt.children = mt.children[:n] 1276 if len(mt.children) == 1 { 1277 return mt.children[0], nil 1278 } else if len(mt.children) == 0 { 1279 return nil, nil 1280 } 1281 case *noVisitMatchTree: 1282 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1283 if err != nil { 1284 return nil, err 1285 } 1286 if mt.matchTree == nil { 1287 return nil, nil 1288 } 1289 case *fileNameMatchTree: 1290 mt.child, err = pruneMatchTree(mt.child) 1291 case *andLineMatchTree: 1292 child, err := pruneMatchTree(&mt.andMatchTree) 1293 if err != nil { 1294 return nil, err 1295 } 1296 if child == nil { 1297 return nil, nil 1298 } 1299 if c, ok := child.(*andMatchTree); ok { 1300 mt.andMatchTree = *c 1301 } else { 1302 // the and was simplified to a single clause, 1303 // so the linematch portion is irrelevant. 1304 return mt, nil 1305 } 1306 case *notMatchTree: 1307 mt.child, err = pruneMatchTree(mt.child) 1308 if err != nil { 1309 return nil, err 1310 } 1311 if mt.child == nil { 1312 // not false => true 1313 return &bruteForceMatchTree{}, nil 1314 } 1315 // unhandled: 1316 case *docMatchTree: 1317 case *bruteForceMatchTree: 1318 case *regexpMatchTree: 1319 case *wordMatchTree: 1320 } 1321 return mt, err 1322}