fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we may find something interesting. 33 // 34 // This is like a "peek" and shouldn't mutate state. prepare is what should 35 // change state. 36 nextDoc() uint32 37 38 // clears any per-document state of the docIterator, and 39 // prepares for evaluating the given doc. The argument is 40 // strictly increasing over time. 41 prepare(nextDoc uint32) 42} 43 44// costs are passed in increasing order to matchTree.matches until they do not 45// return matchesRequiresHigherCost. 46const ( 47 costConst = 0 48 costMemory = 1 49 costContent = 2 50 costRegexp = 3 51) 52 53const ( 54 costMin = costConst 55 costMax = costRegexp 56) 57 58// matchesState is an enum for the state of a matchTree after a call to 59// matchTree.matches. 60type matchesState uint8 61 62const ( 63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 64 // a search yet since the cost value is not high enough. 65 matchesRequiresHigherCost matchesState = iota 66 67 // matchesFound is returned when matchTree.matches has done a search and 68 // found one or more matches. 69 matchesFound 70 71 // matchesNone is returned when matchTree.matches has done a search and 72 // found nothing. 73 matchesNone 74) 75 76// matchesStatePred is a helper which returns matchesFound if b is true 77// otherwise returns matchesNone. 78func matchesStatePred(b bool) matchesState { 79 if b { 80 return matchesFound 81 } 82 return matchesNone 83} 84 85// matchesStateForSlice is a helper which returns matchesFound if v is 86// non-empty otherwise returns matchesNone. 87func matchesStateForSlice[T any](v []T) matchesState { 88 return matchesStatePred(len(v) > 0) 89} 90 91// An expression tree coupled with matches. The matchtree has two 92// functions: 93// 94// * it implements boolean combinations (and, or, not) 95// 96// * it implements shortcuts, where we skip documents (for example: if 97// there are no trigram matches, we can be sure there are no substring 98// matches). The matchtree iterates over the documents as they are 99// ordered in the shard. 100// 101// The general process for a given (shard, query) is: 102// 103// - construct matchTree for the query 104// 105// - find all different leaf matchTrees (substring, regexp, etc.) 106// 107// in a loop: 108// 109// - find next doc to process using nextDoc 110// 111// - evaluate atoms (leaf expressions that match text) 112// 113// - evaluate the tree using matches(), storing the result in map. 114// 115// - if the complete tree returns (matches() != matchesRequiresHigherCost) 116// for the document, collect all text matches by looking at leaf 117// matchTrees. 118type matchTree interface { 119 docIterator 120 121 // matches if cost is high enough. See documentation for matchesState's 122 // values. 123 // 124 // Note: Do not call this directly, rather use evalMatchTree which uses 125 // known to cache responses once the state transitions away from 126 // matchesRequiresHigherCost. 127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 128} 129 130// docMatchTree iterates over documents for which predicate(docID) returns true. 131type docMatchTree struct { 132 // the number of documents in a shard. 133 numDocs uint32 134 135 predicate func(docID uint32) bool 136 137 // provides additional information about the reason why the docMatchTree was 138 // created. 139 reason string 140 141 // mutable 142 firstDone bool 143 docID uint32 144} 145 146type bruteForceMatchTree struct { 147 // mutable 148 firstDone bool 149 docID uint32 150} 151 152type andLineMatchTree struct { 153 andMatchTree 154} 155 156type andMatchTree struct { 157 children []matchTree 158} 159 160type orMatchTree struct { 161 children []matchTree 162} 163 164type notMatchTree struct { 165 child matchTree 166} 167 168// Returns only the filename of child matches. 169type fileNameMatchTree struct { 170 child matchTree 171} 172 173type boostMatchTree struct { 174 child matchTree 175 boost float64 176} 177 178// Don't visit this subtree for collecting matches. 179type noVisitMatchTree struct { 180 matchTree 181} 182 183type regexpMatchTree struct { 184 regexp *regexp.Regexp 185 186 // origRegexp is the original parsed regexp from the query structure. It 187 // does not include mutations such as case sensitivity. 188 origRegexp *syntax.Regexp 189 190 fileName bool 191 192 // mutable 193 reEvaluated bool 194 found []*candidateMatch 195 196 // nextDoc, prepare. 197 bruteForceMatchTree 198} 199 200func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 201 prefix := "" 202 if !s.CaseSensitive { 203 prefix = "(?i)" 204 } 205 206 return &regexpMatchTree{ 207 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 208 origRegexp: s.Regexp, 209 fileName: s.FileName, 210 } 211} 212 213// \bLITERAL\b 214type wordMatchTree struct { 215 word string 216 217 fileName bool 218 219 // mutable 220 evaluated bool 221 found []*candidateMatch 222 223 // nextDoc, prepare. 224 bruteForceMatchTree 225} 226 227type substrMatchTree struct { 228 matchIterator 229 230 query *query.Substring 231 caseSensitive bool 232 fileName bool 233 234 // mutable 235 current []*candidateMatch 236 contEvaluated bool 237} 238 239type branchQueryMatchTree struct { 240 fileMasks []uint64 241 masks []uint64 242 repos []uint16 243 244 // mutable 245 firstDone bool 246 docID uint32 247} 248 249func (t *branchQueryMatchTree) branchMask() uint64 { 250 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 251} 252 253type symbolRegexpMatchTree struct { 254 matchTree 255 regexp *regexp.Regexp 256 all bool // skips regex match if .* 257 258 reEvaluated bool 259 found []*candidateMatch 260} 261 262func (t *symbolRegexpMatchTree) prepare(doc uint32) { 263 t.reEvaluated = false 264 t.found = t.found[:0] 265 t.matchTree.prepare(doc) 266} 267 268func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 269 if t.reEvaluated { 270 return matchesStateForSlice(t.found) 271 } 272 273 if cost < costRegexp { 274 return matchesRequiresHigherCost 275 } 276 277 sections := cp.docSections() 278 content := cp.data(false) 279 280 found := t.found[:0] 281 for i, sec := range sections { 282 var idx []int 283 if t.all { 284 idx = []int{0, int(sec.End - sec.Start)} 285 } else { 286 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 287 if idx == nil { 288 continue 289 } 290 } 291 292 cm := &candidateMatch{ 293 byteOffset: sec.Start + uint32(idx[0]), 294 byteMatchSz: uint32(idx[1] - idx[0]), 295 symbol: true, 296 symbolIdx: uint32(i), 297 } 298 found = append(found, cm) 299 } 300 t.found = found 301 t.reEvaluated = true 302 303 return matchesStateForSlice(t.found) 304} 305 306type symbolSubstrMatchTree struct { 307 *substrMatchTree 308 309 patternSize uint32 310 fileEndRunes []uint32 311 fileEndSymbol []uint32 312 313 doc uint32 314 sections []DocumentSection 315 316 secID uint32 317} 318 319func (t *symbolSubstrMatchTree) prepare(doc uint32) { 320 t.substrMatchTree.prepare(doc) 321 t.doc = doc 322 323 var fileStart uint32 324 if doc > 0 { 325 fileStart = t.fileEndRunes[doc-1] 326 } 327 328 var sections []DocumentSection 329 if len(t.sections) > 0 { 330 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 331 if most == uint32(len(t.sections)) { 332 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 333 } else { 334 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 335 t.secID++ 336 } 337 338 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 339 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 340 symbolEnd++ 341 } 342 343 sections = t.sections[t.secID:symbolEnd] 344 } 345 } 346 347 secIdx := 0 348 trimmed := t.current[:0] 349 for len(sections) > secIdx && len(t.current) > 0 { 350 start := fileStart + t.current[0].runeOffset 351 end := start + t.patternSize 352 if start >= sections[secIdx].End { 353 secIdx++ 354 continue 355 } 356 357 if start < sections[secIdx].Start { 358 t.current = t.current[1:] 359 continue 360 } 361 362 if end <= sections[secIdx].End { 363 t.current[0].symbol = true 364 t.current[0].symbolIdx = uint32(secIdx) 365 trimmed = append(trimmed, t.current[0]) 366 } 367 368 t.current = t.current[1:] 369 } 370 t.current = trimmed 371} 372 373// all prepare methods 374 375func (t *bruteForceMatchTree) prepare(doc uint32) { 376 t.docID = doc 377 t.firstDone = true 378} 379 380func (t *docMatchTree) prepare(doc uint32) { 381 t.docID = doc 382 t.firstDone = true 383} 384 385func (t *andMatchTree) prepare(doc uint32) { 386 for _, c := range t.children { 387 c.prepare(doc) 388 } 389} 390 391func (t *regexpMatchTree) prepare(doc uint32) { 392 t.found = t.found[:0] 393 t.reEvaluated = false 394 t.bruteForceMatchTree.prepare(doc) 395} 396 397func (t *wordMatchTree) prepare(doc uint32) { 398 t.found = t.found[:0] 399 t.evaluated = false 400 t.bruteForceMatchTree.prepare(doc) 401} 402 403func (t *orMatchTree) prepare(doc uint32) { 404 for _, c := range t.children { 405 c.prepare(doc) 406 } 407} 408 409func (t *notMatchTree) prepare(doc uint32) { 410 t.child.prepare(doc) 411} 412 413func (t *fileNameMatchTree) prepare(doc uint32) { 414 t.child.prepare(doc) 415} 416 417func (t *boostMatchTree) prepare(doc uint32) { 418 t.child.prepare(doc) 419} 420 421func (t *substrMatchTree) prepare(nextDoc uint32) { 422 t.matchIterator.prepare(nextDoc) 423 t.current = t.matchIterator.candidates() 424 t.contEvaluated = false 425} 426 427func (t *branchQueryMatchTree) prepare(doc uint32) { 428 t.firstDone = true 429 t.docID = doc 430} 431 432// nextDoc 433 434func (t *docMatchTree) nextDoc() uint32 { 435 var start uint32 436 if t.firstDone { 437 start = t.docID + 1 438 } 439 for i := start; i < t.numDocs; i++ { 440 if t.predicate(i) { 441 return i 442 } 443 } 444 return maxUInt32 445} 446 447func (t *bruteForceMatchTree) nextDoc() uint32 { 448 if !t.firstDone { 449 return 0 450 } 451 return t.docID + 1 452} 453 454func (t *andMatchTree) nextDoc() uint32 { 455 var max uint32 456 for _, c := range t.children { 457 m := c.nextDoc() 458 if m > max { 459 max = m 460 } 461 } 462 return max 463} 464 465func (t *orMatchTree) nextDoc() uint32 { 466 min := uint32(maxUInt32) 467 for _, c := range t.children { 468 m := c.nextDoc() 469 if m < min { 470 min = m 471 } 472 } 473 return min 474} 475 476func (t *notMatchTree) nextDoc() uint32 { 477 return 0 478} 479 480func (t *fileNameMatchTree) nextDoc() uint32 { 481 return t.child.nextDoc() 482} 483 484func (t *boostMatchTree) nextDoc() uint32 { 485 return t.child.nextDoc() 486} 487 488func (t *branchQueryMatchTree) nextDoc() uint32 { 489 var start uint32 490 if t.firstDone { 491 start = t.docID + 1 492 } 493 494 for i := start; i < uint32(len(t.fileMasks)); i++ { 495 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 496 return i 497 } 498 } 499 return maxUInt32 500} 501 502// all String methods 503 504func (t *bruteForceMatchTree) String() string { 505 return "all" 506} 507 508func (t *docMatchTree) String() string { 509 return fmt.Sprintf("doc(%s)", t.reason) 510} 511 512func (t *andMatchTree) String() string { 513 return fmt.Sprintf("and%v", t.children) 514} 515 516func (t *regexpMatchTree) String() string { 517 f := "" 518 if t.fileName { 519 f = "f" 520 } 521 return fmt.Sprintf("%sre(%s)", f, t.regexp) 522} 523 524func (t *wordMatchTree) String() string { 525 f := "" 526 if t.fileName { 527 f = "f" 528 } 529 return fmt.Sprintf("%sword(%s)", f, t.word) 530} 531 532func (t *orMatchTree) String() string { 533 return fmt.Sprintf("or%v", t.children) 534} 535 536func (t *notMatchTree) String() string { 537 return fmt.Sprintf("not(%v)", t.child) 538} 539 540func (t *noVisitMatchTree) String() string { 541 return fmt.Sprintf("novisit(%v)", t.matchTree) 542} 543 544func (t *fileNameMatchTree) String() string { 545 return fmt.Sprintf("f(%v)", t.child) 546} 547 548func (t *boostMatchTree) String() string { 549 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 550} 551 552func (t *substrMatchTree) String() string { 553 f := "" 554 if t.fileName { 555 f = "f" 556 } 557 558 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 559} 560 561func (t *branchQueryMatchTree) String() string { 562 return fmt.Sprintf("branch(%x)", t.masks) 563} 564 565func (t *symbolSubstrMatchTree) String() string { 566 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 567} 568 569func (t *symbolRegexpMatchTree) String() string { 570 return fmt.Sprintf("symbol(%v)", t.matchTree) 571} 572 573// visitMatches visits all atoms in matchTree. Note: This visits 574// noVisitMatchTree. For collecting matches use visitMatches. 575func visitMatchTree(t matchTree, f func(matchTree)) { 576 switch s := t.(type) { 577 case *andMatchTree: 578 for _, ch := range s.children { 579 visitMatchTree(ch, f) 580 } 581 case *orMatchTree: 582 for _, ch := range s.children { 583 visitMatchTree(ch, f) 584 } 585 case *andLineMatchTree: 586 visitMatchTree(&s.andMatchTree, f) 587 case *noVisitMatchTree: 588 visitMatchTree(s.matchTree, f) 589 case *notMatchTree: 590 visitMatchTree(s.child, f) 591 case *fileNameMatchTree: 592 visitMatchTree(s.child, f) 593 case *boostMatchTree: 594 visitMatchTree(s.child, f) 595 case *symbolSubstrMatchTree: 596 visitMatchTree(s.substrMatchTree, f) 597 case *symbolRegexpMatchTree: 598 visitMatchTree(s.matchTree, f) 599 default: 600 f(t) 601 } 602} 603 604// updateMatchTreeStats calls updateStats on all atoms in mt which have that 605// function defined. 606func updateMatchTreeStats(mt matchTree, stats *Stats) { 607 visitMatchTree(mt, func(mt matchTree) { 608 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 609 atom.updateStats(stats) 610 } 611 }) 612} 613 614func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 615 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 616 f(mt) 617 }) 618} 619 620// visitMatches visits all atoms which can contribute matches. Note: This 621// skips noVisitMatchTree. 622func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 623 switch s := t.(type) { 624 case *andMatchTree: 625 for _, ch := range s.children { 626 if known[ch] { 627 visitMatches(ch, known, weight, f) 628 } 629 } 630 case *andLineMatchTree: 631 visitMatches(&s.andMatchTree, known, weight, f) 632 case *orMatchTree: 633 for _, ch := range s.children { 634 if known[ch] { 635 visitMatches(ch, known, weight, f) 636 } 637 } 638 case *boostMatchTree: 639 visitMatches(s.child, known, weight*s.boost, f) 640 case *symbolSubstrMatchTree: 641 visitMatches(s.substrMatchTree, known, weight, f) 642 case *notMatchTree: 643 case *noVisitMatchTree: 644 // don't collect into negative trees. 645 case *fileNameMatchTree: 646 // We will just gather the filename if we do not visit this tree. 647 default: 648 f(s, weight) 649 } 650} 651 652// all matches() methods. 653 654func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 655 return matchesStatePred(t.predicate(cp.idx)) 656} 657 658func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 659 return matchesFound 660} 661 662// andLineMatchTree is a performance optimization of andMatchTree. For content 663// searches we don't want to run the regex engine if there is no line that 664// contains matches from all terms. 665func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 666 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 667 return state 668 } 669 670 // Invariant: all children have matches. If any line contains all of them we 671 // can return MatchesFound. 672 673 // find child with fewest candidates 674 min := maxUInt32 675 fewestChildren := 0 676 for ix, child := range t.children { 677 v, ok := child.(*substrMatchTree) 678 // make sure we are running a content search and that all candidates are a 679 // substrMatchTree 680 if !ok || v.fileName { 681 return matchesFound 682 } 683 if len(v.current) < min { 684 min = len(v.current) 685 fewestChildren = ix 686 } 687 } 688 689 type lineRange struct { 690 start int 691 end int 692 } 693 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 694 prev := -1 695 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 696 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 697 if line == prev { 698 continue 699 } 700 prev = line 701 lines = append(lines, lineRange{byteStart, byteEnd}) 702 } 703 704 // children keeps track of the children's candidates we have already seen. 705 children := make([][]*candidateMatch, 0, len(t.children)-1) 706 for j, child := range t.children { 707 if j == fewestChildren { 708 continue 709 } 710 children = append(children, child.(*substrMatchTree).current) 711 } 712 713nextLine: 714 for i := 0; i < len(lines); i++ { 715 hits := 1 716 nextChild: 717 for j := range children { 718 nextCandidate: 719 for len(children[j]) > 0 { 720 candidate := children[j][0] 721 bo := int(cp.findOffset(false, candidate.runeOffset)) 722 if bo < lines[i].start { 723 children[j] = children[j][1:] 724 continue nextCandidate 725 } 726 if bo <= lines[i].end { 727 hits++ 728 continue nextChild 729 } 730 // move the `lines` iterator forward until bo <= line.end 731 for i < len(lines) && bo > lines[i].end { 732 i++ 733 } 734 i-- 735 continue nextLine 736 } 737 } 738 // return early once we found any line that contains matches from all children 739 if hits == len(t.children) { 740 return matchesFound 741 } 742 } 743 return matchesNone 744} 745 746func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 747 // We have found matches unless a child needs to do more work or it hasn't 748 // found matches. 749 state := matchesFound 750 751 for _, ch := range t.children { 752 switch evalMatchTree(cp, cost, known, ch) { 753 case matchesRequiresHigherCost: 754 // keep evaluating other children incase we come across matchesNone 755 state = matchesRequiresHigherCost 756 case matchesFound: 757 // will return this if every child has this value 758 case matchesNone: 759 return matchesNone 760 } 761 } 762 763 return state 764} 765 766func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 767 // we could short-circuit, but we want to use the other possibilities as a 768 // ranking signal. So we always return the most conservative state. 769 state := matchesNone 770 for _, ch := range t.children { 771 switch evalMatchTree(cp, cost, known, ch) { 772 case matchesRequiresHigherCost: 773 state = matchesRequiresHigherCost 774 case matchesFound: 775 if state != matchesRequiresHigherCost { 776 state = matchesFound 777 } 778 case matchesNone: 779 // noop 780 } 781 } 782 return state 783} 784 785func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 786 return matchesStatePred(t.branchMask() != 0) 787} 788 789func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 790 if t.reEvaluated { 791 return matchesStateForSlice(t.found) 792 } 793 794 if cost < costRegexp { 795 return matchesRequiresHigherCost 796 } 797 798 cp.stats.RegexpsConsidered++ 799 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 800 found := t.found[:0] 801 for _, idx := range idxs { 802 cm := &candidateMatch{ 803 byteOffset: uint32(idx[0]), 804 byteMatchSz: uint32(idx[1] - idx[0]), 805 fileName: t.fileName, 806 } 807 808 found = append(found, cm) 809 } 810 t.found = found 811 t.reEvaluated = true 812 813 return matchesStateForSlice(t.found) 814} 815 816func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 817 if t.evaluated { 818 return matchesStateForSlice(t.found) 819 } 820 821 if cost < costRegexp { 822 return matchesRequiresHigherCost 823 } 824 825 data := cp.data(t.fileName) 826 offset := 0 827 found := t.found[:0] 828 for { 829 idx := bytes.Index(data[offset:], []byte(t.word)) 830 if idx < 0 { 831 break 832 } 833 834 relStartOffset := offset + idx 835 relEndOffset := relStartOffset + len(t.word) 836 837 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 838 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 839 if startBoundary && endBoundary { 840 found = append(found, &candidateMatch{ 841 byteOffset: uint32(offset + idx), 842 byteMatchSz: uint32(len(t.word)), 843 fileName: t.fileName, 844 }) 845 } 846 offset += idx + len(t.word) 847 } 848 849 t.found = found 850 t.evaluated = true 851 852 return matchesStateForSlice(t.found) 853} 854 855// breakMatchesOnNewlines returns matches resulting from breaking each element 856// of cms on newlines within text. 857func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 858 var lineCMs []*candidateMatch 859 for _, cm := range cms { 860 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 861 } 862 return lineCMs 863} 864 865// breakOnNewlines returns matches resulting from breaking cm on newlines 866// within text. 867func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 868 var cms []*candidateMatch 869 addMe := &candidateMatch{} 870 *addMe = *cm 871 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 872 if text[i] == '\n' { 873 addMe.byteMatchSz = i - addMe.byteOffset 874 if addMe.byteMatchSz != 0 { 875 cms = append(cms, addMe) 876 } 877 878 addMe = &candidateMatch{} 879 *addMe = *cm 880 addMe.byteOffset = i + 1 881 } 882 } 883 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 884 if addMe.byteMatchSz != 0 { 885 cms = append(cms, addMe) 886 } 887 return cms 888} 889 890// evalMatchTree should be called instead of directly calling 891// matchTree.matches. It cache known values for future evaluation at higher 892// costs. 893func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 894 if v, ok := known[mt]; ok { 895 return matchesStatePred(v) 896 } 897 898 ms := mt.matches(cp, cost, known) 899 if ms != matchesRequiresHigherCost { 900 known[mt] = ms == matchesFound 901 } 902 903 return ms 904} 905 906func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 907 switch evalMatchTree(cp, cost, known, t.child) { 908 case matchesRequiresHigherCost: 909 return matchesRequiresHigherCost 910 case matchesFound: 911 return matchesNone 912 case matchesNone: 913 return matchesFound 914 default: 915 panic("unreachable") 916 } 917} 918 919func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 920 return evalMatchTree(cp, cost, known, t.child) 921} 922 923func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 924 return evalMatchTree(cp, cost, known, t.child) 925} 926 927func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 928 if t.contEvaluated { 929 return matchesStateForSlice(t.current) 930 } 931 932 if len(t.current) == 0 { 933 return matchesNone 934 } 935 936 if t.fileName && cost < costMemory { 937 return matchesRequiresHigherCost 938 } 939 940 if !t.fileName && cost < costContent { 941 return matchesRequiresHigherCost 942 } 943 944 pruned := t.current[:0] 945 for _, m := range t.current { 946 if m.byteOffset == 0 && m.runeOffset > 0 { 947 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 948 } 949 if m.matchContent(cp.data(m.fileName)) { 950 pruned = append(pruned, m) 951 } 952 } 953 t.current = pruned 954 t.contEvaluated = true 955 956 return matchesStateForSlice(t.current) 957} 958 959type matchTreeOpt struct { 960 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 961 // This was added since we do not support wordMatchTree with symbol search. 962 DisableWordMatchOptimization bool 963} 964 965func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 966 if q == nil { 967 return nil, fmt.Errorf("got nil (sub)query") 968 } 969 switch s := q.(type) { 970 case *query.Regexp: 971 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 972 // superset of the regexp. If the returned matchTree is equivalent to the 973 // original regexp, it returns true. An equivalent matchTree has the same 974 // behaviour as the original regexp and can be used instead. 975 // 976 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 977 if err != nil { 978 return nil, err 979 } 980 // if the query can be used in place of the regexp 981 // return the subtree 982 if isEq { 983 return subMT, nil 984 } 985 986 var tr matchTree 987 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 988 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 989 // provide something faster. 990 tr = wmt 991 } else { 992 tr = newRegexpMatchTree(s) 993 } 994 995 return &andMatchTree{ 996 children: []matchTree{ 997 tr, &noVisitMatchTree{subMT}, 998 }, 999 }, nil 1000 case *query.And: 1001 var r []matchTree 1002 for _, ch := range s.Children { 1003 ct, err := d.newMatchTree(ch, opt) 1004 if err != nil { 1005 return nil, err 1006 } 1007 r = append(r, ct) 1008 } 1009 return &andMatchTree{r}, nil 1010 case *query.Or: 1011 var r []matchTree 1012 for _, ch := range s.Children { 1013 ct, err := d.newMatchTree(ch, opt) 1014 if err != nil { 1015 return nil, err 1016 } 1017 r = append(r, ct) 1018 } 1019 return &orMatchTree{r}, nil 1020 case *query.Not: 1021 ct, err := d.newMatchTree(s.Child, opt) 1022 return &notMatchTree{ 1023 child: ct, 1024 }, err 1025 1026 case *query.Type: 1027 if s.Type != query.TypeFileName { 1028 break 1029 } 1030 1031 ct, err := d.newMatchTree(s.Child, opt) 1032 if err != nil { 1033 return nil, err 1034 } 1035 1036 return &fileNameMatchTree{ 1037 child: ct, 1038 }, nil 1039 1040 case *query.Boost: 1041 ct, err := d.newMatchTree(s.Child, opt) 1042 if err != nil { 1043 return nil, err 1044 } 1045 1046 return &boostMatchTree{ 1047 child: ct, 1048 boost: s.Boost, 1049 }, nil 1050 1051 case *query.Substring: 1052 return d.newSubstringMatchTree(s) 1053 1054 case *query.Branch: 1055 masks := make([]uint64, 0, len(d.repoMetaData)) 1056 if s.Pattern == "HEAD" { 1057 for i := 0; i < len(d.repoMetaData); i++ { 1058 masks = append(masks, 1) 1059 } 1060 } else { 1061 for _, branchIDs := range d.branchIDs { 1062 mask := uint64(0) 1063 for nm, m := range branchIDs { 1064 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1065 mask |= uint64(m) 1066 } 1067 } 1068 masks = append(masks, mask) 1069 } 1070 } 1071 return &branchQueryMatchTree{ 1072 masks: masks, 1073 fileMasks: d.fileBranchMasks, 1074 repos: d.repos, 1075 }, nil 1076 case *query.Const: 1077 if s.Value { 1078 return &bruteForceMatchTree{}, nil 1079 } else { 1080 return &noMatchTree{Why: "const"}, nil 1081 } 1082 case *query.Language: 1083 code, ok := d.metaData.LanguageMap[s.Language] 1084 if !ok { 1085 return &noMatchTree{Why: "lang"}, nil 1086 } 1087 return &docMatchTree{ 1088 reason: "language", 1089 numDocs: d.numDocs(), 1090 predicate: func(docID uint32) bool { 1091 return d.getLanguage(docID) == code 1092 }, 1093 }, nil 1094 1095 case *query.Symbol: 1096 // Disable WordMatchTree since we don't support it in symbols yet. 1097 optCopy := opt 1098 optCopy.DisableWordMatchOptimization = true 1099 1100 subMT, err := d.newMatchTree(s.Expr, optCopy) 1101 if err != nil { 1102 return nil, err 1103 } 1104 1105 if substr, ok := subMT.(*substrMatchTree); ok { 1106 return &symbolSubstrMatchTree{ 1107 substrMatchTree: substr, 1108 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1109 fileEndRunes: d.fileEndRunes, 1110 fileEndSymbol: d.fileEndSymbol, 1111 sections: d.runeDocSections, 1112 }, nil 1113 } 1114 1115 var regexpMT *regexpMatchTree 1116 visitMatchTree(subMT, func(mt matchTree) { 1117 if t, ok := mt.(*regexpMatchTree); ok { 1118 regexpMT = t 1119 } 1120 }) 1121 if regexpMT == nil { 1122 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1123 } 1124 1125 return &symbolRegexpMatchTree{ 1126 regexp: regexpMT.regexp, 1127 all: isRegexpAll(regexpMT.origRegexp), 1128 matchTree: subMT, 1129 }, nil 1130 1131 case *query.FileNameSet: 1132 return &docMatchTree{ 1133 reason: "FileNameSet", 1134 numDocs: d.numDocs(), 1135 predicate: func(docID uint32) bool { 1136 fileName := d.fileName(docID) 1137 _, ok := s.Set[string(fileName)] 1138 return ok 1139 }, 1140 }, nil 1141 1142 case *query.BranchesRepos: 1143 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1144 for repoIdx := range d.repoMetaData { 1145 var mask uint64 1146 for _, br := range s.List { 1147 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1148 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1149 } 1150 } 1151 reposBranchesWant[repoIdx] = mask 1152 } 1153 return &docMatchTree{ 1154 reason: "BranchesRepos", 1155 numDocs: d.numDocs(), 1156 predicate: func(docID uint32) bool { 1157 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1158 }, 1159 }, nil 1160 1161 case *query.RepoSet: 1162 reposWant := make([]bool, len(d.repoMetaData)) 1163 for repoIdx, r := range d.repoMetaData { 1164 if _, ok := s.Set[r.Name]; ok { 1165 reposWant[repoIdx] = true 1166 } 1167 } 1168 return &docMatchTree{ 1169 reason: "RepoSet", 1170 numDocs: d.numDocs(), 1171 predicate: func(docID uint32) bool { 1172 return reposWant[d.repos[docID]] 1173 }, 1174 }, nil 1175 1176 case *query.RepoIDs: 1177 reposWant := make([]bool, len(d.repoMetaData)) 1178 for repoIdx, r := range d.repoMetaData { 1179 if s.Repos.Contains(r.ID) { 1180 reposWant[repoIdx] = true 1181 } 1182 } 1183 return &docMatchTree{ 1184 reason: "RepoIDs", 1185 numDocs: d.numDocs(), 1186 predicate: func(docID uint32) bool { 1187 return reposWant[d.repos[docID]] 1188 }, 1189 }, nil 1190 1191 case *query.Repo: 1192 reposWant := make([]bool, len(d.repoMetaData)) 1193 for repoIdx, r := range d.repoMetaData { 1194 if s.Regexp.MatchString(r.Name) { 1195 reposWant[repoIdx] = true 1196 } 1197 } 1198 return &docMatchTree{ 1199 reason: "Repo", 1200 numDocs: d.numDocs(), 1201 predicate: func(docID uint32) bool { 1202 return reposWant[d.repos[docID]] 1203 }, 1204 }, nil 1205 1206 case *query.RepoRegexp: 1207 reposWant := make([]bool, len(d.repoMetaData)) 1208 for repoIdx, r := range d.repoMetaData { 1209 if s.Regexp.MatchString(r.Name) { 1210 reposWant[repoIdx] = true 1211 } 1212 } 1213 return &docMatchTree{ 1214 reason: "RepoRegexp", 1215 numDocs: d.numDocs(), 1216 predicate: func(docID uint32) bool { 1217 return reposWant[d.repos[docID]] 1218 }, 1219 }, nil 1220 1221 case query.RawConfig: 1222 return &docMatchTree{ 1223 reason: s.String(), 1224 numDocs: d.numDocs(), 1225 predicate: func(docID uint32) bool { 1226 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1227 }, 1228 }, nil 1229 } 1230 log.Panicf("type %T", q) 1231 return nil, nil 1232} 1233 1234func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1235 st := &substrMatchTree{ 1236 query: s, 1237 caseSensitive: s.CaseSensitive, 1238 fileName: s.FileName, 1239 } 1240 1241 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1242 return newRegexpMatchTree(&query.Regexp{ 1243 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1244 FileName: s.FileName, 1245 Content: s.Content, 1246 CaseSensitive: s.CaseSensitive, 1247 }), nil 1248 } 1249 1250 result, err := d.iterateNgrams(s) 1251 if err != nil { 1252 return nil, err 1253 } 1254 st.matchIterator = result 1255 return st, nil 1256} 1257 1258func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1259 if opt.DisableWordMatchOptimization { 1260 return nil, false 1261 } 1262 // Needs to be case sensitive 1263 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1264 return nil, false 1265 } 1266 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1267 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1268 return nil, false 1269 } 1270 sub := q.Regexp.Sub 1271 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1272 return nil, false 1273 } 1274 1275 return &wordMatchTree{ 1276 word: string(sub[1].Rune), 1277 fileName: q.FileName, 1278 }, true 1279} 1280 1281// pruneMatchTree removes impossible branches from the matchTree, as indicated 1282// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1283func pruneMatchTree(mt matchTree) (matchTree, error) { 1284 var err error 1285 switch mt := mt.(type) { 1286 // leaf nodes that we test with the filter: 1287 case *substrMatchTree: 1288 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1289 if _, ok := res.matchIterator.(*noMatchTree); ok { 1290 return nil, nil 1291 } 1292 } 1293 // recursive tree structures: 1294 case *andMatchTree: 1295 // Any branch of an and becoming impossible means the entire clause 1296 // is impossible. Otherwise, just handle rewrites. 1297 for i, child := range mt.children { 1298 newChild, err := pruneMatchTree(child) 1299 if err != nil { 1300 return nil, err 1301 } 1302 if newChild == nil { 1303 return nil, nil 1304 } 1305 mt.children[i] = newChild 1306 } 1307 case *orMatchTree: 1308 // *All* branches of an OR becoming impossible means the entire clause 1309 // is impossible. Otherwise, drop impossible subclauses and handle 1310 // rewrites, including simplifying to a singular resulting child branch. 1311 n := 0 1312 for _, child := range mt.children { 1313 newChild, err := pruneMatchTree(child) 1314 if err != nil { 1315 return nil, err 1316 } 1317 if newChild != nil { 1318 mt.children[n] = newChild 1319 n++ 1320 } 1321 } 1322 mt.children = mt.children[:n] 1323 if len(mt.children) == 1 { 1324 return mt.children[0], nil 1325 } else if len(mt.children) == 0 { 1326 return nil, nil 1327 } 1328 case *noVisitMatchTree: 1329 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1330 if err != nil { 1331 return nil, err 1332 } 1333 if mt.matchTree == nil { 1334 return nil, nil 1335 } 1336 case *fileNameMatchTree: 1337 mt.child, err = pruneMatchTree(mt.child) 1338 if err != nil { 1339 return nil, err 1340 } 1341 if mt.child == nil { 1342 return nil, nil 1343 } 1344 case *boostMatchTree: 1345 mt.child, err = pruneMatchTree(mt.child) 1346 if err != nil { 1347 return nil, err 1348 } 1349 if mt.child == nil { 1350 return nil, nil 1351 } 1352 case *andLineMatchTree: 1353 child, err := pruneMatchTree(&mt.andMatchTree) 1354 if err != nil { 1355 return nil, err 1356 } 1357 if child == nil { 1358 return nil, nil 1359 } 1360 if c, ok := child.(*andMatchTree); ok { 1361 mt.andMatchTree = *c 1362 } else { 1363 // the and was simplified to a single clause, 1364 // so the linematch portion is irrelevant. 1365 return mt, nil 1366 } 1367 case *notMatchTree: 1368 mt.child, err = pruneMatchTree(mt.child) 1369 if err != nil { 1370 return nil, err 1371 } 1372 if mt.child == nil { 1373 // not false => true 1374 return &bruteForceMatchTree{}, nil 1375 } 1376 // unhandled: 1377 case *docMatchTree: 1378 case *bruteForceMatchTree: 1379 case *regexpMatchTree: 1380 case *wordMatchTree: 1381 } 1382 return mt, err 1383} 1384 1385// isRegexpAll returns true if the query matches all possible lines. 1386// 1387// Note: it is possible for a funky regex to actually match all but this 1388// returns false. This returns true for normal looking regexes like ".*" or 1389// "(?-s:.*)". 1390func isRegexpAll(r *syntax.Regexp) bool { 1391 // Note: we don't care about any flags since we are looking for .* and we 1392 // don't mind if . matches all or everything but newline. 1393 1394 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1395 for { 1396 // Our main target: .* 1397 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1398 // (?s:.) or (?-s:.) 1399 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1400 } 1401 1402 // Strip away expressions being wrapped in paranthesis 1403 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1404 r = r.Sub[0] 1405 continue 1406 } 1407 1408 return false 1409 } 1410}