fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/grafana/regexp" 27 28 "github.com/sourcegraph/zoekt" 29 "github.com/sourcegraph/zoekt/internal/syntaxutil" 30 "github.com/sourcegraph/zoekt/query" 31) 32 33// A docIterator iterates over documents in order. 34type docIterator interface { 35 // provide the next document where we may find something interesting. 36 // 37 // This is like a "peek" and shouldn't mutate state. prepare is what should 38 // change state. 39 nextDoc() uint32 40 41 // clears any per-document state of the docIterator, and 42 // prepares for evaluating the given doc. The argument is 43 // strictly increasing over time. 44 prepare(nextDoc uint32) 45} 46 47// costs are passed in increasing order to matchTree.matches until they do not 48// return matchesRequiresHigherCost. 49const ( 50 costConst = 0 51 costMemory = 1 52 costContent = 2 53 costRegexp = 3 54) 55 56const ( 57 costMin = costConst 58 costMax = costRegexp 59) 60 61// matchesState is an enum for the state of a matchTree after a call to 62// matchTree.matches. 63type matchesState uint8 64 65const ( 66 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 67 // a search yet since the cost value is not high enough. 68 matchesRequiresHigherCost matchesState = iota 69 70 // matchesFound is returned when matchTree.matches has done a search and 71 // found one or more matches. 72 matchesFound 73 74 // matchesNone is returned when matchTree.matches has done a search and 75 // found nothing. 76 matchesNone 77) 78 79// matchesStatePred is a helper which returns matchesFound if b is true 80// otherwise returns matchesNone. 81func matchesStatePred(b bool) matchesState { 82 if b { 83 return matchesFound 84 } 85 return matchesNone 86} 87 88// matchesStateForSlice is a helper which returns matchesFound if v is 89// non-empty otherwise returns matchesNone. 90func matchesStateForSlice[T any](v []T) matchesState { 91 return matchesStatePred(len(v) > 0) 92} 93 94// An expression tree coupled with matches. The matchtree has two 95// functions: 96// 97// * it implements boolean combinations (and, or, not) 98// 99// * it implements shortcuts, where we skip documents (for example: if 100// there are no trigram matches, we can be sure there are no substring 101// matches). The matchtree iterates over the documents as they are 102// ordered in the shard. 103// 104// The general process for a given (shard, query) is: 105// 106// - construct matchTree for the query 107// 108// - find all different leaf matchTrees (substring, regexp, etc.) 109// 110// in a loop: 111// 112// - find next doc to process using nextDoc 113// 114// - evaluate atoms (leaf expressions that match text) 115// 116// - evaluate the tree using matches(), storing the result in map. 117// 118// - if the complete tree returns (matches() != matchesRequiresHigherCost) 119// for the document, collect all text matches by looking at leaf 120// matchTrees. 121type matchTree interface { 122 docIterator 123 124 // matches if cost is high enough. See documentation for matchesState's 125 // values. 126 // 127 // Note: Do not call this directly, rather use evalMatchTree which uses 128 // known to cache responses once the state transitions away from 129 // matchesRequiresHigherCost. 130 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 131} 132 133// docMatchTree iterates over documents for which predicate(docID) returns true. 134type docMatchTree struct { 135 // the number of documents in a shard. 136 numDocs uint32 137 138 predicate func(docID uint32) bool 139 140 // provides additional information about the reason why the docMatchTree was 141 // created. 142 reason string 143 144 // mutable 145 firstDone bool 146 docID uint32 147} 148 149type bruteForceMatchTree struct { 150 // mutable 151 firstDone bool 152 docID uint32 153} 154 155type andLineMatchTree struct { 156 andMatchTree 157} 158 159type andMatchTree struct { 160 children []matchTree 161} 162 163type orMatchTree struct { 164 children []matchTree 165} 166 167type notMatchTree struct { 168 child matchTree 169} 170 171// Returns only the filename of child matches. 172type fileNameMatchTree struct { 173 child matchTree 174} 175 176type boostMatchTree struct { 177 child matchTree 178 boost float64 179} 180 181// Don't visit this subtree for collecting matches. 182type noVisitMatchTree struct { 183 matchTree 184} 185 186type regexpMatchTree struct { 187 regexp *regexp.Regexp 188 189 // origRegexp is the original parsed regexp from the query structure. It 190 // does not include mutations such as case sensitivity. 191 origRegexp *syntax.Regexp 192 193 fileName bool 194 195 // mutable 196 reEvaluated bool 197 found []*candidateMatch 198 199 // nextDoc, prepare. 200 bruteForceMatchTree 201} 202 203func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 204 prefix := "" 205 if !s.CaseSensitive { 206 prefix = "(?i)" 207 } 208 209 return &regexpMatchTree{ 210 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 211 origRegexp: s.Regexp, 212 fileName: s.FileName, 213 } 214} 215 216// \bLITERAL\b 217type wordMatchTree struct { 218 word string 219 220 fileName bool 221 222 // mutable 223 evaluated bool 224 found []*candidateMatch 225 226 // nextDoc, prepare. 227 bruteForceMatchTree 228} 229 230type substrMatchTree struct { 231 matchIterator 232 233 query *query.Substring 234 caseSensitive bool 235 fileName bool 236 237 // mutable 238 current []*candidateMatch 239 contEvaluated bool 240} 241 242type branchQueryMatchTree struct { 243 fileMasks []uint64 244 masks []uint64 245 repos []uint16 246 247 // mutable 248 firstDone bool 249 docID uint32 250} 251 252func (t *branchQueryMatchTree) branchMask() uint64 { 253 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 254} 255 256type symbolRegexpMatchTree struct { 257 matchTree 258 regexp *regexp.Regexp 259 all bool // skips regex match if .* 260 261 reEvaluated bool 262 found []*candidateMatch 263} 264 265func (t *symbolRegexpMatchTree) prepare(doc uint32) { 266 t.reEvaluated = false 267 t.found = t.found[:0] 268 t.matchTree.prepare(doc) 269} 270 271func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 272 if t.reEvaluated { 273 return matchesStateForSlice(t.found) 274 } 275 276 if cost < costRegexp { 277 return matchesRequiresHigherCost 278 } 279 280 sections := cp.docSections() 281 content := cp.data(false) 282 283 found := t.found[:0] 284 for i, sec := range sections { 285 var idx []int 286 if t.all { 287 idx = []int{0, int(sec.End - sec.Start)} 288 } else { 289 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 290 if idx == nil { 291 continue 292 } 293 } 294 295 cm := &candidateMatch{ 296 byteOffset: sec.Start + uint32(idx[0]), 297 byteMatchSz: uint32(idx[1] - idx[0]), 298 symbol: true, 299 symbolIdx: uint32(i), 300 } 301 found = append(found, cm) 302 } 303 t.found = found 304 t.reEvaluated = true 305 306 return matchesStateForSlice(t.found) 307} 308 309type symbolSubstrMatchTree struct { 310 *substrMatchTree 311 312 patternSize uint32 313 fileEndRunes []uint32 314 fileEndSymbol []uint32 315 316 doc uint32 317 sections []DocumentSection 318 319 secID uint32 320} 321 322func (t *symbolSubstrMatchTree) prepare(doc uint32) { 323 t.substrMatchTree.prepare(doc) 324 t.doc = doc 325 326 var fileStart uint32 327 if doc > 0 { 328 fileStart = t.fileEndRunes[doc-1] 329 } 330 331 var sections []DocumentSection 332 if len(t.sections) > 0 { 333 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 334 if most == uint32(len(t.sections)) { 335 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 336 } else { 337 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 338 t.secID++ 339 } 340 341 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 342 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 343 symbolEnd++ 344 } 345 346 sections = t.sections[t.secID:symbolEnd] 347 } 348 } 349 350 secIdx := 0 351 trimmed := t.current[:0] 352 for len(sections) > secIdx && len(t.current) > 0 { 353 start := fileStart + t.current[0].runeOffset 354 end := start + t.patternSize 355 if start >= sections[secIdx].End { 356 secIdx++ 357 continue 358 } 359 360 if start < sections[secIdx].Start { 361 t.current = t.current[1:] 362 continue 363 } 364 365 if end <= sections[secIdx].End { 366 t.current[0].symbol = true 367 t.current[0].symbolIdx = uint32(secIdx) 368 trimmed = append(trimmed, t.current[0]) 369 } 370 371 t.current = t.current[1:] 372 } 373 t.current = trimmed 374} 375 376// all prepare methods 377 378func (t *bruteForceMatchTree) prepare(doc uint32) { 379 t.docID = doc 380 t.firstDone = true 381} 382 383func (t *docMatchTree) prepare(doc uint32) { 384 t.docID = doc 385 t.firstDone = true 386} 387 388func (t *andMatchTree) prepare(doc uint32) { 389 for _, c := range t.children { 390 c.prepare(doc) 391 } 392} 393 394func (t *regexpMatchTree) prepare(doc uint32) { 395 t.found = t.found[:0] 396 t.reEvaluated = false 397 t.bruteForceMatchTree.prepare(doc) 398} 399 400func (t *wordMatchTree) prepare(doc uint32) { 401 t.found = t.found[:0] 402 t.evaluated = false 403 t.bruteForceMatchTree.prepare(doc) 404} 405 406func (t *orMatchTree) prepare(doc uint32) { 407 for _, c := range t.children { 408 c.prepare(doc) 409 } 410} 411 412func (t *notMatchTree) prepare(doc uint32) { 413 t.child.prepare(doc) 414} 415 416func (t *fileNameMatchTree) prepare(doc uint32) { 417 t.child.prepare(doc) 418} 419 420func (t *boostMatchTree) prepare(doc uint32) { 421 t.child.prepare(doc) 422} 423 424func (t *substrMatchTree) prepare(nextDoc uint32) { 425 t.matchIterator.prepare(nextDoc) 426 t.current = t.matchIterator.candidates() 427 t.contEvaluated = false 428} 429 430func (t *branchQueryMatchTree) prepare(doc uint32) { 431 t.firstDone = true 432 t.docID = doc 433} 434 435// nextDoc 436 437func (t *docMatchTree) nextDoc() uint32 { 438 var start uint32 439 if t.firstDone { 440 start = t.docID + 1 441 } 442 for i := start; i < t.numDocs; i++ { 443 if t.predicate(i) { 444 return i 445 } 446 } 447 return math.MaxUint32 448} 449 450func (t *bruteForceMatchTree) nextDoc() uint32 { 451 if !t.firstDone { 452 return 0 453 } 454 return t.docID + 1 455} 456 457func (t *andMatchTree) nextDoc() uint32 { 458 var max uint32 459 for _, c := range t.children { 460 m := c.nextDoc() 461 if m > max { 462 max = m 463 } 464 } 465 return max 466} 467 468func (t *orMatchTree) nextDoc() uint32 { 469 min := uint32(math.MaxUint32) 470 for _, c := range t.children { 471 m := c.nextDoc() 472 if m < min { 473 min = m 474 } 475 } 476 return min 477} 478 479func (t *notMatchTree) nextDoc() uint32 { 480 return 0 481} 482 483func (t *fileNameMatchTree) nextDoc() uint32 { 484 return t.child.nextDoc() 485} 486 487func (t *boostMatchTree) nextDoc() uint32 { 488 return t.child.nextDoc() 489} 490 491func (t *branchQueryMatchTree) nextDoc() uint32 { 492 var start uint32 493 if t.firstDone { 494 start = t.docID + 1 495 } 496 497 for i := start; i < uint32(len(t.fileMasks)); i++ { 498 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 499 return i 500 } 501 } 502 return math.MaxUint32 503} 504 505// all String methods 506 507func (t *bruteForceMatchTree) String() string { 508 return "all" 509} 510 511func (t *docMatchTree) String() string { 512 return fmt.Sprintf("doc(%s)", t.reason) 513} 514 515func (t *andMatchTree) String() string { 516 return fmt.Sprintf("and%v", t.children) 517} 518 519func (t *regexpMatchTree) String() string { 520 f := "" 521 if t.fileName { 522 f = "f" 523 } 524 return fmt.Sprintf("%sre(%s)", f, t.regexp) 525} 526 527func (t *wordMatchTree) String() string { 528 f := "" 529 if t.fileName { 530 f = "f" 531 } 532 return fmt.Sprintf("%sword(%s)", f, t.word) 533} 534 535func (t *orMatchTree) String() string { 536 return fmt.Sprintf("or%v", t.children) 537} 538 539func (t *notMatchTree) String() string { 540 return fmt.Sprintf("not(%v)", t.child) 541} 542 543func (t *noVisitMatchTree) String() string { 544 return fmt.Sprintf("novisit(%v)", t.matchTree) 545} 546 547func (t *fileNameMatchTree) String() string { 548 return fmt.Sprintf("f(%v)", t.child) 549} 550 551func (t *boostMatchTree) String() string { 552 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 553} 554 555func (t *substrMatchTree) String() string { 556 f := "" 557 if t.fileName { 558 f = "f" 559 } 560 561 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 562} 563 564func (t *branchQueryMatchTree) String() string { 565 return fmt.Sprintf("branch(%x)", t.masks) 566} 567 568func (t *symbolSubstrMatchTree) String() string { 569 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 570} 571 572func (t *symbolRegexpMatchTree) String() string { 573 return fmt.Sprintf("symbol(%v)", t.matchTree) 574} 575 576// visitMatches visits all atoms in matchTree. Note: This visits 577// noVisitMatchTree. For collecting matches use visitMatches. 578func visitMatchTree(t matchTree, f func(matchTree)) { 579 switch s := t.(type) { 580 case *andMatchTree: 581 for _, ch := range s.children { 582 visitMatchTree(ch, f) 583 } 584 case *orMatchTree: 585 for _, ch := range s.children { 586 visitMatchTree(ch, f) 587 } 588 case *andLineMatchTree: 589 visitMatchTree(&s.andMatchTree, f) 590 case *noVisitMatchTree: 591 visitMatchTree(s.matchTree, f) 592 case *notMatchTree: 593 visitMatchTree(s.child, f) 594 case *fileNameMatchTree: 595 visitMatchTree(s.child, f) 596 case *boostMatchTree: 597 visitMatchTree(s.child, f) 598 case *symbolSubstrMatchTree: 599 visitMatchTree(s.substrMatchTree, f) 600 case *symbolRegexpMatchTree: 601 visitMatchTree(s.matchTree, f) 602 default: 603 f(t) 604 } 605} 606 607// updateMatchTreeStats calls updateStats on all atoms in mt which have that 608// function defined. 609func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) { 610 visitMatchTree(mt, func(mt matchTree) { 611 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok { 612 atom.updateStats(stats) 613 } 614 }) 615} 616 617func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 618 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 619 f(mt) 620 }) 621} 622 623// visitMatches visits all atoms which can contribute matches. Note: This 624// skips noVisitMatchTree. 625func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 626 switch s := t.(type) { 627 case *andMatchTree: 628 for _, ch := range s.children { 629 if known[ch] { 630 visitMatches(ch, known, weight, f) 631 } 632 } 633 case *andLineMatchTree: 634 visitMatches(&s.andMatchTree, known, weight, f) 635 case *orMatchTree: 636 for _, ch := range s.children { 637 if known[ch] { 638 visitMatches(ch, known, weight, f) 639 } 640 } 641 case *boostMatchTree: 642 visitMatches(s.child, known, weight*s.boost, f) 643 case *symbolSubstrMatchTree: 644 visitMatches(s.substrMatchTree, known, weight, f) 645 case *notMatchTree: 646 case *noVisitMatchTree: 647 // don't collect into negative trees. 648 case *fileNameMatchTree: 649 // We will just gather the filename if we do not visit this tree. 650 default: 651 f(s, weight) 652 } 653} 654 655// all matches() methods. 656 657func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 658 return matchesStatePred(t.predicate(cp.idx)) 659} 660 661func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 662 return matchesFound 663} 664 665// andLineMatchTree is a performance optimization of andMatchTree. For content 666// searches we don't want to run the regex engine if there is no line that 667// contains matches from all terms. 668func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 669 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 670 return state 671 } 672 673 // Invariant: all children have matches. If any line contains all of them we 674 // can return MatchesFound. 675 676 // find child with fewest candidates 677 minCount := math.MaxInt 678 fewestChildren := 0 679 for ix, child := range t.children { 680 v, ok := child.(*substrMatchTree) 681 // make sure we are running a content search and that all candidates are a 682 // substrMatchTree 683 if !ok || v.fileName { 684 return matchesFound 685 } 686 if len(v.current) < minCount { 687 minCount = len(v.current) 688 fewestChildren = ix 689 } 690 } 691 692 type lineRange struct { 693 start int 694 end int 695 } 696 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 697 prev := -1 698 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 699 line := cp.newlines().atOffset(candidate.byteOffset) 700 if line == prev { 701 continue 702 } 703 prev = line 704 byteStart := int(cp.newlines().lineStart(line)) 705 byteEnd := int(cp.newlines().lineStart(line + 1)) 706 lines = append(lines, lineRange{byteStart, byteEnd}) 707 } 708 709 // children keeps track of the children's candidates we have already seen. 710 children := make([][]*candidateMatch, 0, len(t.children)-1) 711 for j, child := range t.children { 712 if j == fewestChildren { 713 continue 714 } 715 children = append(children, child.(*substrMatchTree).current) 716 } 717 718nextLine: 719 for i := 0; i < len(lines); i++ { 720 hits := 1 721 nextChild: 722 for j := range children { 723 nextCandidate: 724 for len(children[j]) > 0 { 725 candidate := children[j][0] 726 bo := int(cp.findOffset(false, candidate.runeOffset)) 727 if bo < lines[i].start { 728 children[j] = children[j][1:] 729 continue nextCandidate 730 } 731 if bo < lines[i].end { 732 hits++ 733 continue nextChild 734 } 735 // move the `lines` iterator forward until bo < line.end 736 for i < len(lines) && bo >= lines[i].end { 737 i++ 738 } 739 i-- 740 continue nextLine 741 } 742 } 743 // return early once we found any line that contains matches from all children 744 if hits == len(t.children) { 745 return matchesFound 746 } 747 } 748 return matchesNone 749} 750 751func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 752 // We have found matches unless a child needs to do more work or it hasn't 753 // found matches. 754 state := matchesFound 755 756 for _, ch := range t.children { 757 switch evalMatchTree(cp, cost, known, ch) { 758 case matchesRequiresHigherCost: 759 // keep evaluating other children incase we come across matchesNone 760 state = matchesRequiresHigherCost 761 case matchesFound: 762 // will return this if every child has this value 763 case matchesNone: 764 return matchesNone 765 } 766 } 767 768 return state 769} 770 771func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 772 // we could short-circuit, but we want to use the other possibilities as a 773 // ranking signal. So we always return the most conservative state. 774 state := matchesNone 775 for _, ch := range t.children { 776 switch evalMatchTree(cp, cost, known, ch) { 777 case matchesRequiresHigherCost: 778 state = matchesRequiresHigherCost 779 case matchesFound: 780 if state != matchesRequiresHigherCost { 781 state = matchesFound 782 } 783 case matchesNone: 784 // noop 785 } 786 } 787 return state 788} 789 790func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 791 return matchesStatePred(t.branchMask() != 0) 792} 793 794func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 795 if t.reEvaluated { 796 return matchesStateForSlice(t.found) 797 } 798 799 if cost < costRegexp { 800 return matchesRequiresHigherCost 801 } 802 803 cp.stats.RegexpsConsidered++ 804 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 805 found := t.found[:0] 806 for _, idx := range idxs { 807 cm := &candidateMatch{ 808 byteOffset: uint32(idx[0]), 809 byteMatchSz: uint32(idx[1] - idx[0]), 810 fileName: t.fileName, 811 } 812 813 found = append(found, cm) 814 } 815 t.found = found 816 t.reEvaluated = true 817 818 return matchesStateForSlice(t.found) 819} 820 821func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 822 if t.evaluated { 823 return matchesStateForSlice(t.found) 824 } 825 826 if cost < costRegexp { 827 return matchesRequiresHigherCost 828 } 829 830 data := cp.data(t.fileName) 831 offset := 0 832 found := t.found[:0] 833 for { 834 idx := bytes.Index(data[offset:], []byte(t.word)) 835 if idx < 0 { 836 break 837 } 838 839 relStartOffset := offset + idx 840 relEndOffset := relStartOffset + len(t.word) 841 842 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 843 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 844 if startBoundary && endBoundary { 845 found = append(found, &candidateMatch{ 846 byteOffset: uint32(offset + idx), 847 byteMatchSz: uint32(len(t.word)), 848 fileName: t.fileName, 849 }) 850 } 851 offset += idx + len(t.word) 852 } 853 854 t.found = found 855 t.evaluated = true 856 857 return matchesStateForSlice(t.found) 858} 859 860// breakMatchesOnNewlines returns matches resulting from breaking each element 861// of cms on newlines within text. 862func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 863 var lineCMs []*candidateMatch 864 for _, cm := range cms { 865 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 866 } 867 return lineCMs 868} 869 870// breakOnNewlines returns matches resulting from breaking cm on newlines 871// within text. 872func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 873 var cms []*candidateMatch 874 addMe := &candidateMatch{} 875 *addMe = *cm 876 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 877 if text[i] == '\n' { 878 addMe.byteMatchSz = i - addMe.byteOffset 879 if addMe.byteMatchSz != 0 { 880 cms = append(cms, addMe) 881 } 882 883 addMe = &candidateMatch{} 884 *addMe = *cm 885 addMe.byteOffset = i + 1 886 } 887 } 888 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 889 if addMe.byteMatchSz != 0 { 890 cms = append(cms, addMe) 891 } 892 return cms 893} 894 895// evalMatchTree should be called instead of directly calling 896// matchTree.matches. It cache known values for future evaluation at higher 897// costs. 898func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 899 if v, ok := known[mt]; ok { 900 return matchesStatePred(v) 901 } 902 903 ms := mt.matches(cp, cost, known) 904 if ms != matchesRequiresHigherCost { 905 known[mt] = ms == matchesFound 906 } 907 908 return ms 909} 910 911func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 912 switch evalMatchTree(cp, cost, known, t.child) { 913 case matchesRequiresHigherCost: 914 return matchesRequiresHigherCost 915 case matchesFound: 916 return matchesNone 917 case matchesNone: 918 return matchesFound 919 default: 920 panic("unreachable") 921 } 922} 923 924func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 925 return evalMatchTree(cp, cost, known, t.child) 926} 927 928func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 929 return evalMatchTree(cp, cost, known, t.child) 930} 931 932func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 933 if t.contEvaluated { 934 return matchesStateForSlice(t.current) 935 } 936 937 if len(t.current) == 0 { 938 return matchesNone 939 } 940 941 if t.fileName && cost < costMemory { 942 return matchesRequiresHigherCost 943 } 944 945 if !t.fileName && cost < costContent { 946 return matchesRequiresHigherCost 947 } 948 949 pruned := t.current[:0] 950 for _, m := range t.current { 951 if m.byteOffset == 0 && m.runeOffset > 0 { 952 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 953 } 954 if m.matchContent(cp.data(m.fileName)) { 955 pruned = append(pruned, m) 956 } 957 } 958 t.current = pruned 959 t.contEvaluated = true 960 961 return matchesStateForSlice(t.current) 962} 963 964type matchTreeOpt struct { 965 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 966 // This was added since we do not support wordMatchTree with symbol search. 967 DisableWordMatchOptimization bool 968} 969 970func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 971 if q == nil { 972 return nil, fmt.Errorf("got nil (sub)query") 973 } 974 switch s := q.(type) { 975 case *query.Regexp: 976 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 977 // superset of the regexp. If the returned matchTree is equivalent to the 978 // original regexp, it returns true. An equivalent matchTree has the same 979 // behaviour as the original regexp and can be used instead. 980 // 981 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 982 if err != nil { 983 return nil, err 984 } 985 // if the query can be used in place of the regexp 986 // return the subtree 987 if isEq { 988 return subMT, nil 989 } 990 991 var tr matchTree 992 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 993 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 994 // provide something faster. 995 tr = wmt 996 } else { 997 tr = newRegexpMatchTree(s) 998 } 999 1000 return &andMatchTree{ 1001 children: []matchTree{ 1002 tr, &noVisitMatchTree{subMT}, 1003 }, 1004 }, nil 1005 case *query.And: 1006 var r []matchTree 1007 for _, ch := range s.Children { 1008 ct, err := d.newMatchTree(ch, opt) 1009 if err != nil { 1010 return nil, err 1011 } 1012 r = append(r, ct) 1013 } 1014 return &andMatchTree{r}, nil 1015 case *query.Or: 1016 var r []matchTree 1017 for _, ch := range s.Children { 1018 ct, err := d.newMatchTree(ch, opt) 1019 if err != nil { 1020 return nil, err 1021 } 1022 r = append(r, ct) 1023 } 1024 return &orMatchTree{r}, nil 1025 case *query.Not: 1026 ct, err := d.newMatchTree(s.Child, opt) 1027 return &notMatchTree{ 1028 child: ct, 1029 }, err 1030 1031 case *query.Type: 1032 if s.Type != query.TypeFileName { 1033 break 1034 } 1035 1036 ct, err := d.newMatchTree(s.Child, opt) 1037 if err != nil { 1038 return nil, err 1039 } 1040 1041 return &fileNameMatchTree{ 1042 child: ct, 1043 }, nil 1044 1045 case *query.Boost: 1046 ct, err := d.newMatchTree(s.Child, opt) 1047 if err != nil { 1048 return nil, err 1049 } 1050 1051 return &boostMatchTree{ 1052 child: ct, 1053 boost: s.Boost, 1054 }, nil 1055 1056 case *query.Meta: 1057 reposWant := make([]bool, len(d.repoMetaData)) 1058 for repoIdx, r := range d.repoMetaData { 1059 if r.Metadata != nil { 1060 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) { 1061 reposWant[repoIdx] = true 1062 } 1063 } 1064 } 1065 1066 return &docMatchTree{ 1067 reason: "Meta", 1068 numDocs: d.numDocs(), 1069 predicate: func(docID uint32) bool { 1070 repoIdx := d.repos[docID] 1071 if int(repoIdx) >= len(reposWant) { 1072 return false 1073 } 1074 return reposWant[repoIdx] 1075 }, 1076 }, nil 1077 1078 case *query.Substring: 1079 return d.newSubstringMatchTree(s) 1080 1081 case *query.Branch: 1082 masks := make([]uint64, 0, len(d.repoMetaData)) 1083 if s.Pattern == "HEAD" { 1084 for range d.repoMetaData { 1085 masks = append(masks, 1) 1086 } 1087 } else { 1088 for _, branchIDs := range d.branchIDs { 1089 mask := uint64(0) 1090 for nm, m := range branchIDs { 1091 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1092 mask |= uint64(m) 1093 } 1094 } 1095 masks = append(masks, mask) 1096 } 1097 } 1098 return &branchQueryMatchTree{ 1099 masks: masks, 1100 fileMasks: d.fileBranchMasks, 1101 repos: d.repos, 1102 }, nil 1103 case *query.Const: 1104 if s.Value { 1105 return &bruteForceMatchTree{}, nil 1106 } else { 1107 return &noMatchTree{Why: "const"}, nil 1108 } 1109 case *query.Language: 1110 code, ok := d.metaData.LanguageMap[s.Language] 1111 if !ok { 1112 return &noMatchTree{Why: "lang"}, nil 1113 } 1114 return &docMatchTree{ 1115 reason: "language", 1116 numDocs: d.numDocs(), 1117 predicate: func(docID uint32) bool { 1118 return d.getLanguage(docID) == code 1119 }, 1120 }, nil 1121 1122 case *query.Symbol: 1123 // Disable WordMatchTree since we don't support it in symbols yet. 1124 optCopy := opt 1125 optCopy.DisableWordMatchOptimization = true 1126 1127 subMT, err := d.newMatchTree(s.Expr, optCopy) 1128 if err != nil { 1129 return nil, err 1130 } 1131 1132 if substr, ok := subMT.(*substrMatchTree); ok { 1133 return &symbolSubstrMatchTree{ 1134 substrMatchTree: substr, 1135 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1136 fileEndRunes: d.fileEndRunes, 1137 fileEndSymbol: d.fileEndSymbol, 1138 sections: d.runeDocSections, 1139 }, nil 1140 } 1141 1142 var regexpMT *regexpMatchTree 1143 visitMatchTree(subMT, func(mt matchTree) { 1144 if t, ok := mt.(*regexpMatchTree); ok { 1145 regexpMT = t 1146 } 1147 }) 1148 if regexpMT == nil { 1149 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1150 } 1151 1152 return &symbolRegexpMatchTree{ 1153 regexp: regexpMT.regexp, 1154 all: isRegexpAll(regexpMT.origRegexp), 1155 matchTree: subMT, 1156 }, nil 1157 1158 case *query.FileNameSet: 1159 return &docMatchTree{ 1160 reason: "FileNameSet", 1161 numDocs: d.numDocs(), 1162 predicate: func(docID uint32) bool { 1163 fileName := d.fileName(docID) 1164 _, ok := s.Set[string(fileName)] 1165 return ok 1166 }, 1167 }, nil 1168 1169 case *query.BranchesRepos: 1170 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1171 for repoIdx := range d.repoMetaData { 1172 var mask uint64 1173 for _, br := range s.List { 1174 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1175 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1176 } 1177 } 1178 reposBranchesWant[repoIdx] = mask 1179 } 1180 return &docMatchTree{ 1181 reason: "BranchesRepos", 1182 numDocs: d.numDocs(), 1183 predicate: func(docID uint32) bool { 1184 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1185 }, 1186 }, nil 1187 1188 case *query.RepoSet: 1189 reposWant := make([]bool, len(d.repoMetaData)) 1190 for repoIdx, r := range d.repoMetaData { 1191 if _, ok := s.Set[r.Name]; ok { 1192 reposWant[repoIdx] = true 1193 } 1194 } 1195 return &docMatchTree{ 1196 reason: "RepoSet", 1197 numDocs: d.numDocs(), 1198 predicate: func(docID uint32) bool { 1199 return reposWant[d.repos[docID]] 1200 }, 1201 }, nil 1202 1203 case *query.RepoIDs: 1204 reposWant := make([]bool, len(d.repoMetaData)) 1205 for repoIdx, r := range d.repoMetaData { 1206 if s.Repos.Contains(r.ID) { 1207 reposWant[repoIdx] = true 1208 } 1209 } 1210 return &docMatchTree{ 1211 reason: "RepoIDs", 1212 numDocs: d.numDocs(), 1213 predicate: func(docID uint32) bool { 1214 return reposWant[d.repos[docID]] 1215 }, 1216 }, nil 1217 1218 case *query.Repo: 1219 reposWant := make([]bool, len(d.repoMetaData)) 1220 for repoIdx, r := range d.repoMetaData { 1221 if s.Regexp.MatchString(r.Name) { 1222 reposWant[repoIdx] = true 1223 } 1224 } 1225 return &docMatchTree{ 1226 reason: "Repo", 1227 numDocs: d.numDocs(), 1228 predicate: func(docID uint32) bool { 1229 return reposWant[d.repos[docID]] 1230 }, 1231 }, nil 1232 1233 case *query.RepoRegexp: 1234 reposWant := make([]bool, len(d.repoMetaData)) 1235 for repoIdx, r := range d.repoMetaData { 1236 if s.Regexp.MatchString(r.Name) { 1237 reposWant[repoIdx] = true 1238 } 1239 } 1240 return &docMatchTree{ 1241 reason: "RepoRegexp", 1242 numDocs: d.numDocs(), 1243 predicate: func(docID uint32) bool { 1244 return reposWant[d.repos[docID]] 1245 }, 1246 }, nil 1247 1248 case query.RawConfig: 1249 return &docMatchTree{ 1250 reason: s.String(), 1251 numDocs: d.numDocs(), 1252 predicate: func(docID uint32) bool { 1253 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1254 }, 1255 }, nil 1256 } 1257 log.Panicf("type %T", q) 1258 return nil, nil 1259} 1260 1261func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1262 st := &substrMatchTree{ 1263 query: s, 1264 caseSensitive: s.CaseSensitive, 1265 fileName: s.FileName, 1266 } 1267 1268 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1269 return newRegexpMatchTree(&query.Regexp{ 1270 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1271 FileName: s.FileName, 1272 Content: s.Content, 1273 CaseSensitive: s.CaseSensitive, 1274 }), nil 1275 } 1276 1277 result, err := d.iterateNgrams(s) 1278 if err != nil { 1279 return nil, err 1280 } 1281 st.matchIterator = result 1282 return st, nil 1283} 1284 1285func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1286 if opt.DisableWordMatchOptimization { 1287 return nil, false 1288 } 1289 // Needs to be case sensitive 1290 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1291 return nil, false 1292 } 1293 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1294 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1295 return nil, false 1296 } 1297 sub := q.Regexp.Sub 1298 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1299 return nil, false 1300 } 1301 1302 return &wordMatchTree{ 1303 word: string(sub[1].Rune), 1304 fileName: q.FileName, 1305 }, true 1306} 1307 1308// pruneMatchTree removes impossible branches from the matchTree, as indicated 1309// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1310func pruneMatchTree(mt matchTree) (matchTree, error) { 1311 var err error 1312 switch mt := mt.(type) { 1313 // leaf nodes that we test with the filter: 1314 case *substrMatchTree: 1315 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1316 if _, ok := res.matchIterator.(*noMatchTree); ok { 1317 return nil, nil 1318 } 1319 } 1320 // recursive tree structures: 1321 case *andMatchTree: 1322 // Any branch of an and becoming impossible means the entire clause 1323 // is impossible. Otherwise, just handle rewrites. 1324 for i, child := range mt.children { 1325 newChild, err := pruneMatchTree(child) 1326 if err != nil { 1327 return nil, err 1328 } 1329 if newChild == nil { 1330 return nil, nil 1331 } 1332 mt.children[i] = newChild 1333 } 1334 case *orMatchTree: 1335 // *All* branches of an OR becoming impossible means the entire clause 1336 // is impossible. Otherwise, drop impossible subclauses and handle 1337 // rewrites, including simplifying to a singular resulting child branch. 1338 n := 0 1339 for _, child := range mt.children { 1340 newChild, err := pruneMatchTree(child) 1341 if err != nil { 1342 return nil, err 1343 } 1344 if newChild != nil { 1345 mt.children[n] = newChild 1346 n++ 1347 } 1348 } 1349 mt.children = mt.children[:n] 1350 if len(mt.children) == 1 { 1351 return mt.children[0], nil 1352 } else if len(mt.children) == 0 { 1353 return nil, nil 1354 } 1355 case *noVisitMatchTree: 1356 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1357 if err != nil { 1358 return nil, err 1359 } 1360 if mt.matchTree == nil { 1361 return nil, nil 1362 } 1363 case *fileNameMatchTree: 1364 mt.child, err = pruneMatchTree(mt.child) 1365 if err != nil { 1366 return nil, err 1367 } 1368 if mt.child == nil { 1369 return nil, nil 1370 } 1371 case *boostMatchTree: 1372 mt.child, err = pruneMatchTree(mt.child) 1373 if err != nil { 1374 return nil, err 1375 } 1376 if mt.child == nil { 1377 return nil, nil 1378 } 1379 case *andLineMatchTree: 1380 child, err := pruneMatchTree(&mt.andMatchTree) 1381 if err != nil { 1382 return nil, err 1383 } 1384 if child == nil { 1385 return nil, nil 1386 } 1387 if c, ok := child.(*andMatchTree); ok { 1388 mt.andMatchTree = *c 1389 } else { 1390 // the and was simplified to a single clause, 1391 // so the linematch portion is irrelevant. 1392 return mt, nil 1393 } 1394 case *notMatchTree: 1395 mt.child, err = pruneMatchTree(mt.child) 1396 if err != nil { 1397 return nil, err 1398 } 1399 if mt.child == nil { 1400 // not false => true 1401 return &bruteForceMatchTree{}, nil 1402 } 1403 // unhandled: 1404 case *docMatchTree: 1405 case *bruteForceMatchTree: 1406 case *regexpMatchTree: 1407 case *wordMatchTree: 1408 } 1409 return mt, err 1410} 1411 1412// isRegexpAll returns true if the query matches all possible lines. 1413// 1414// Note: it is possible for a funky regex to actually match all but this 1415// returns false. This returns true for normal looking regexes like ".*" or 1416// "(?-s:.*)". 1417func isRegexpAll(r *syntax.Regexp) bool { 1418 // Note: we don't care about any flags since we are looking for .* and we 1419 // don't mind if . matches all or everything but newline. 1420 1421 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1422 for { 1423 // Our main target: .* 1424 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1425 // (?s:.) or (?-s:.) 1426 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1427 } 1428 1429 // Strip away expressions being wrapped in paranthesis 1430 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1431 r = r.Sub[0] 1432 continue 1433 } 1434 1435 return false 1436 } 1437}