fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/grafana/regexp" 27 "github.com/sourcegraph/zoekt" 28 "github.com/sourcegraph/zoekt/internal/syntaxutil" 29 "github.com/sourcegraph/zoekt/query" 30) 31 32// A docIterator iterates over documents in order. 33type docIterator interface { 34 // provide the next document where we may find something interesting. 35 // 36 // This is like a "peek" and shouldn't mutate state. prepare is what should 37 // change state. 38 nextDoc() uint32 39 40 // clears any per-document state of the docIterator, and 41 // prepares for evaluating the given doc. The argument is 42 // strictly increasing over time. 43 prepare(nextDoc uint32) 44} 45 46// costs are passed in increasing order to matchTree.matches until they do not 47// return matchesRequiresHigherCost. 48const ( 49 costConst = 0 50 costMemory = 1 51 costContent = 2 52 costRegexp = 3 53) 54 55const ( 56 costMin = costConst 57 costMax = costRegexp 58) 59 60// matchesState is an enum for the state of a matchTree after a call to 61// matchTree.matches. 62type matchesState uint8 63 64const ( 65 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 66 // a search yet since the cost value is not high enough. 67 matchesRequiresHigherCost matchesState = iota 68 69 // matchesFound is returned when matchTree.matches has done a search and 70 // found one or more matches. 71 matchesFound 72 73 // matchesNone is returned when matchTree.matches has done a search and 74 // found nothing. 75 matchesNone 76) 77 78// matchesStatePred is a helper which returns matchesFound if b is true 79// otherwise returns matchesNone. 80func matchesStatePred(b bool) matchesState { 81 if b { 82 return matchesFound 83 } 84 return matchesNone 85} 86 87// matchesStateForSlice is a helper which returns matchesFound if v is 88// non-empty otherwise returns matchesNone. 89func matchesStateForSlice[T any](v []T) matchesState { 90 return matchesStatePred(len(v) > 0) 91} 92 93// An expression tree coupled with matches. The matchtree has two 94// functions: 95// 96// * it implements boolean combinations (and, or, not) 97// 98// * it implements shortcuts, where we skip documents (for example: if 99// there are no trigram matches, we can be sure there are no substring 100// matches). The matchtree iterates over the documents as they are 101// ordered in the shard. 102// 103// The general process for a given (shard, query) is: 104// 105// - construct matchTree for the query 106// 107// - find all different leaf matchTrees (substring, regexp, etc.) 108// 109// in a loop: 110// 111// - find next doc to process using nextDoc 112// 113// - evaluate atoms (leaf expressions that match text) 114// 115// - evaluate the tree using matches(), storing the result in map. 116// 117// - if the complete tree returns (matches() != matchesRequiresHigherCost) 118// for the document, collect all text matches by looking at leaf 119// matchTrees. 120type matchTree interface { 121 docIterator 122 123 // matches if cost is high enough. See documentation for matchesState's 124 // values. 125 // 126 // Note: Do not call this directly, rather use evalMatchTree which uses 127 // known to cache responses once the state transitions away from 128 // matchesRequiresHigherCost. 129 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 130} 131 132// docMatchTree iterates over documents for which predicate(docID) returns true. 133type docMatchTree struct { 134 // the number of documents in a shard. 135 numDocs uint32 136 137 predicate func(docID uint32) bool 138 139 // provides additional information about the reason why the docMatchTree was 140 // created. 141 reason string 142 143 // mutable 144 firstDone bool 145 docID uint32 146} 147 148type bruteForceMatchTree struct { 149 // mutable 150 firstDone bool 151 docID uint32 152} 153 154type andLineMatchTree struct { 155 andMatchTree 156} 157 158type andMatchTree struct { 159 children []matchTree 160} 161 162type orMatchTree struct { 163 children []matchTree 164} 165 166type notMatchTree struct { 167 child matchTree 168} 169 170// Returns only the filename of child matches. 171type fileNameMatchTree struct { 172 child matchTree 173} 174 175type boostMatchTree struct { 176 child matchTree 177 boost float64 178} 179 180// Don't visit this subtree for collecting matches. 181type noVisitMatchTree struct { 182 matchTree 183} 184 185type regexpMatchTree struct { 186 regexp *regexp.Regexp 187 188 // origRegexp is the original parsed regexp from the query structure. It 189 // does not include mutations such as case sensitivity. 190 origRegexp *syntax.Regexp 191 192 fileName bool 193 194 // mutable 195 reEvaluated bool 196 found []*candidateMatch 197 198 // nextDoc, prepare. 199 bruteForceMatchTree 200} 201 202func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 203 prefix := "" 204 if !s.CaseSensitive { 205 prefix = "(?i)" 206 } 207 208 return &regexpMatchTree{ 209 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 210 origRegexp: s.Regexp, 211 fileName: s.FileName, 212 } 213} 214 215// \bLITERAL\b 216type wordMatchTree struct { 217 word string 218 219 fileName bool 220 221 // mutable 222 evaluated bool 223 found []*candidateMatch 224 225 // nextDoc, prepare. 226 bruteForceMatchTree 227} 228 229type substrMatchTree struct { 230 matchIterator 231 232 query *query.Substring 233 caseSensitive bool 234 fileName bool 235 236 // mutable 237 current []*candidateMatch 238 contEvaluated bool 239} 240 241type branchQueryMatchTree struct { 242 fileMasks []uint64 243 masks []uint64 244 repos []uint16 245 246 // mutable 247 firstDone bool 248 docID uint32 249} 250 251func (t *branchQueryMatchTree) branchMask() uint64 { 252 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 253} 254 255type symbolRegexpMatchTree struct { 256 matchTree 257 regexp *regexp.Regexp 258 all bool // skips regex match if .* 259 260 reEvaluated bool 261 found []*candidateMatch 262} 263 264func (t *symbolRegexpMatchTree) prepare(doc uint32) { 265 t.reEvaluated = false 266 t.found = t.found[:0] 267 t.matchTree.prepare(doc) 268} 269 270func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 271 if t.reEvaluated { 272 return matchesStateForSlice(t.found) 273 } 274 275 if cost < costRegexp { 276 return matchesRequiresHigherCost 277 } 278 279 sections := cp.docSections() 280 content := cp.data(false) 281 282 found := t.found[:0] 283 for i, sec := range sections { 284 var idx []int 285 if t.all { 286 idx = []int{0, int(sec.End - sec.Start)} 287 } else { 288 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 289 if idx == nil { 290 continue 291 } 292 } 293 294 cm := &candidateMatch{ 295 byteOffset: sec.Start + uint32(idx[0]), 296 byteMatchSz: uint32(idx[1] - idx[0]), 297 symbol: true, 298 symbolIdx: uint32(i), 299 } 300 found = append(found, cm) 301 } 302 t.found = found 303 t.reEvaluated = true 304 305 return matchesStateForSlice(t.found) 306} 307 308type symbolSubstrMatchTree struct { 309 *substrMatchTree 310 311 patternSize uint32 312 fileEndRunes []uint32 313 fileEndSymbol []uint32 314 315 doc uint32 316 sections []DocumentSection 317 318 secID uint32 319} 320 321func (t *symbolSubstrMatchTree) prepare(doc uint32) { 322 t.substrMatchTree.prepare(doc) 323 t.doc = doc 324 325 var fileStart uint32 326 if doc > 0 { 327 fileStart = t.fileEndRunes[doc-1] 328 } 329 330 var sections []DocumentSection 331 if len(t.sections) > 0 { 332 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 333 if most == uint32(len(t.sections)) { 334 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 335 } else { 336 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 337 t.secID++ 338 } 339 340 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 341 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 342 symbolEnd++ 343 } 344 345 sections = t.sections[t.secID:symbolEnd] 346 } 347 } 348 349 secIdx := 0 350 trimmed := t.current[:0] 351 for len(sections) > secIdx && len(t.current) > 0 { 352 start := fileStart + t.current[0].runeOffset 353 end := start + t.patternSize 354 if start >= sections[secIdx].End { 355 secIdx++ 356 continue 357 } 358 359 if start < sections[secIdx].Start { 360 t.current = t.current[1:] 361 continue 362 } 363 364 if end <= sections[secIdx].End { 365 t.current[0].symbol = true 366 t.current[0].symbolIdx = uint32(secIdx) 367 trimmed = append(trimmed, t.current[0]) 368 } 369 370 t.current = t.current[1:] 371 } 372 t.current = trimmed 373} 374 375// all prepare methods 376 377func (t *bruteForceMatchTree) prepare(doc uint32) { 378 t.docID = doc 379 t.firstDone = true 380} 381 382func (t *docMatchTree) prepare(doc uint32) { 383 t.docID = doc 384 t.firstDone = true 385} 386 387func (t *andMatchTree) prepare(doc uint32) { 388 for _, c := range t.children { 389 c.prepare(doc) 390 } 391} 392 393func (t *regexpMatchTree) prepare(doc uint32) { 394 t.found = t.found[:0] 395 t.reEvaluated = false 396 t.bruteForceMatchTree.prepare(doc) 397} 398 399func (t *wordMatchTree) prepare(doc uint32) { 400 t.found = t.found[:0] 401 t.evaluated = false 402 t.bruteForceMatchTree.prepare(doc) 403} 404 405func (t *orMatchTree) prepare(doc uint32) { 406 for _, c := range t.children { 407 c.prepare(doc) 408 } 409} 410 411func (t *notMatchTree) prepare(doc uint32) { 412 t.child.prepare(doc) 413} 414 415func (t *fileNameMatchTree) prepare(doc uint32) { 416 t.child.prepare(doc) 417} 418 419func (t *boostMatchTree) prepare(doc uint32) { 420 t.child.prepare(doc) 421} 422 423func (t *substrMatchTree) prepare(nextDoc uint32) { 424 t.matchIterator.prepare(nextDoc) 425 t.current = t.matchIterator.candidates() 426 t.contEvaluated = false 427} 428 429func (t *branchQueryMatchTree) prepare(doc uint32) { 430 t.firstDone = true 431 t.docID = doc 432} 433 434// nextDoc 435 436func (t *docMatchTree) nextDoc() uint32 { 437 var start uint32 438 if t.firstDone { 439 start = t.docID + 1 440 } 441 for i := start; i < t.numDocs; i++ { 442 if t.predicate(i) { 443 return i 444 } 445 } 446 return math.MaxUint32 447} 448 449func (t *bruteForceMatchTree) nextDoc() uint32 { 450 if !t.firstDone { 451 return 0 452 } 453 return t.docID + 1 454} 455 456func (t *andMatchTree) nextDoc() uint32 { 457 var max uint32 458 for _, c := range t.children { 459 m := c.nextDoc() 460 if m > max { 461 max = m 462 } 463 } 464 return max 465} 466 467func (t *orMatchTree) nextDoc() uint32 { 468 min := uint32(math.MaxUint32) 469 for _, c := range t.children { 470 m := c.nextDoc() 471 if m < min { 472 min = m 473 } 474 } 475 return min 476} 477 478func (t *notMatchTree) nextDoc() uint32 { 479 return 0 480} 481 482func (t *fileNameMatchTree) nextDoc() uint32 { 483 return t.child.nextDoc() 484} 485 486func (t *boostMatchTree) nextDoc() uint32 { 487 return t.child.nextDoc() 488} 489 490func (t *branchQueryMatchTree) nextDoc() uint32 { 491 var start uint32 492 if t.firstDone { 493 start = t.docID + 1 494 } 495 496 for i := start; i < uint32(len(t.fileMasks)); i++ { 497 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 498 return i 499 } 500 } 501 return math.MaxUint32 502} 503 504// all String methods 505 506func (t *bruteForceMatchTree) String() string { 507 return "all" 508} 509 510func (t *docMatchTree) String() string { 511 return fmt.Sprintf("doc(%s)", t.reason) 512} 513 514func (t *andMatchTree) String() string { 515 return fmt.Sprintf("and%v", t.children) 516} 517 518func (t *regexpMatchTree) String() string { 519 f := "" 520 if t.fileName { 521 f = "f" 522 } 523 return fmt.Sprintf("%sre(%s)", f, t.regexp) 524} 525 526func (t *wordMatchTree) String() string { 527 f := "" 528 if t.fileName { 529 f = "f" 530 } 531 return fmt.Sprintf("%sword(%s)", f, t.word) 532} 533 534func (t *orMatchTree) String() string { 535 return fmt.Sprintf("or%v", t.children) 536} 537 538func (t *notMatchTree) String() string { 539 return fmt.Sprintf("not(%v)", t.child) 540} 541 542func (t *noVisitMatchTree) String() string { 543 return fmt.Sprintf("novisit(%v)", t.matchTree) 544} 545 546func (t *fileNameMatchTree) String() string { 547 return fmt.Sprintf("f(%v)", t.child) 548} 549 550func (t *boostMatchTree) String() string { 551 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 552} 553 554func (t *substrMatchTree) String() string { 555 f := "" 556 if t.fileName { 557 f = "f" 558 } 559 560 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 561} 562 563func (t *branchQueryMatchTree) String() string { 564 return fmt.Sprintf("branch(%x)", t.masks) 565} 566 567func (t *symbolSubstrMatchTree) String() string { 568 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 569} 570 571func (t *symbolRegexpMatchTree) String() string { 572 return fmt.Sprintf("symbol(%v)", t.matchTree) 573} 574 575// visitMatches visits all atoms in matchTree. Note: This visits 576// noVisitMatchTree. For collecting matches use visitMatches. 577func visitMatchTree(t matchTree, f func(matchTree)) { 578 switch s := t.(type) { 579 case *andMatchTree: 580 for _, ch := range s.children { 581 visitMatchTree(ch, f) 582 } 583 case *orMatchTree: 584 for _, ch := range s.children { 585 visitMatchTree(ch, f) 586 } 587 case *andLineMatchTree: 588 visitMatchTree(&s.andMatchTree, f) 589 case *noVisitMatchTree: 590 visitMatchTree(s.matchTree, f) 591 case *notMatchTree: 592 visitMatchTree(s.child, f) 593 case *fileNameMatchTree: 594 visitMatchTree(s.child, f) 595 case *boostMatchTree: 596 visitMatchTree(s.child, f) 597 case *symbolSubstrMatchTree: 598 visitMatchTree(s.substrMatchTree, f) 599 case *symbolRegexpMatchTree: 600 visitMatchTree(s.matchTree, f) 601 default: 602 f(t) 603 } 604} 605 606// updateMatchTreeStats calls updateStats on all atoms in mt which have that 607// function defined. 608func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) { 609 visitMatchTree(mt, func(mt matchTree) { 610 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok { 611 atom.updateStats(stats) 612 } 613 }) 614} 615 616func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 617 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 618 f(mt) 619 }) 620} 621 622// visitMatches visits all atoms which can contribute matches. Note: This 623// skips noVisitMatchTree. 624func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 625 switch s := t.(type) { 626 case *andMatchTree: 627 for _, ch := range s.children { 628 if known[ch] { 629 visitMatches(ch, known, weight, f) 630 } 631 } 632 case *andLineMatchTree: 633 visitMatches(&s.andMatchTree, known, weight, f) 634 case *orMatchTree: 635 for _, ch := range s.children { 636 if known[ch] { 637 visitMatches(ch, known, weight, f) 638 } 639 } 640 case *boostMatchTree: 641 visitMatches(s.child, known, weight*s.boost, f) 642 case *symbolSubstrMatchTree: 643 visitMatches(s.substrMatchTree, known, weight, f) 644 case *notMatchTree: 645 case *noVisitMatchTree: 646 // don't collect into negative trees. 647 case *fileNameMatchTree: 648 // We will just gather the filename if we do not visit this tree. 649 default: 650 f(s, weight) 651 } 652} 653 654// all matches() methods. 655 656func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 657 return matchesStatePred(t.predicate(cp.idx)) 658} 659 660func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 661 return matchesFound 662} 663 664// andLineMatchTree is a performance optimization of andMatchTree. For content 665// searches we don't want to run the regex engine if there is no line that 666// contains matches from all terms. 667func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 668 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 669 return state 670 } 671 672 // Invariant: all children have matches. If any line contains all of them we 673 // can return MatchesFound. 674 675 // find child with fewest candidates 676 minCount := math.MaxInt 677 fewestChildren := 0 678 for ix, child := range t.children { 679 v, ok := child.(*substrMatchTree) 680 // make sure we are running a content search and that all candidates are a 681 // substrMatchTree 682 if !ok || v.fileName { 683 return matchesFound 684 } 685 if len(v.current) < minCount { 686 minCount = len(v.current) 687 fewestChildren = ix 688 } 689 } 690 691 type lineRange struct { 692 start int 693 end int 694 } 695 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 696 prev := -1 697 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 698 line := cp.newlines().atOffset(candidate.byteOffset) 699 if line == prev { 700 continue 701 } 702 prev = line 703 byteStart := int(cp.newlines().lineStart(line)) 704 byteEnd := int(cp.newlines().lineStart(line + 1)) 705 lines = append(lines, lineRange{byteStart, byteEnd}) 706 } 707 708 // children keeps track of the children's candidates we have already seen. 709 children := make([][]*candidateMatch, 0, len(t.children)-1) 710 for j, child := range t.children { 711 if j == fewestChildren { 712 continue 713 } 714 children = append(children, child.(*substrMatchTree).current) 715 } 716 717nextLine: 718 for i := 0; i < len(lines); i++ { 719 hits := 1 720 nextChild: 721 for j := range children { 722 nextCandidate: 723 for len(children[j]) > 0 { 724 candidate := children[j][0] 725 bo := int(cp.findOffset(false, candidate.runeOffset)) 726 if bo < lines[i].start { 727 children[j] = children[j][1:] 728 continue nextCandidate 729 } 730 if bo < lines[i].end { 731 hits++ 732 continue nextChild 733 } 734 // move the `lines` iterator forward until bo < line.end 735 for i < len(lines) && bo >= lines[i].end { 736 i++ 737 } 738 i-- 739 continue nextLine 740 } 741 } 742 // return early once we found any line that contains matches from all children 743 if hits == len(t.children) { 744 return matchesFound 745 } 746 } 747 return matchesNone 748} 749 750func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 751 // We have found matches unless a child needs to do more work or it hasn't 752 // found matches. 753 state := matchesFound 754 755 for _, ch := range t.children { 756 switch evalMatchTree(cp, cost, known, ch) { 757 case matchesRequiresHigherCost: 758 // keep evaluating other children incase we come across matchesNone 759 state = matchesRequiresHigherCost 760 case matchesFound: 761 // will return this if every child has this value 762 case matchesNone: 763 return matchesNone 764 } 765 } 766 767 return state 768} 769 770func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 771 // we could short-circuit, but we want to use the other possibilities as a 772 // ranking signal. So we always return the most conservative state. 773 state := matchesNone 774 for _, ch := range t.children { 775 switch evalMatchTree(cp, cost, known, ch) { 776 case matchesRequiresHigherCost: 777 state = matchesRequiresHigherCost 778 case matchesFound: 779 if state != matchesRequiresHigherCost { 780 state = matchesFound 781 } 782 case matchesNone: 783 // noop 784 } 785 } 786 return state 787} 788 789func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 790 return matchesStatePred(t.branchMask() != 0) 791} 792 793func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 794 if t.reEvaluated { 795 return matchesStateForSlice(t.found) 796 } 797 798 if cost < costRegexp { 799 return matchesRequiresHigherCost 800 } 801 802 cp.stats.RegexpsConsidered++ 803 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 804 found := t.found[:0] 805 for _, idx := range idxs { 806 cm := &candidateMatch{ 807 byteOffset: uint32(idx[0]), 808 byteMatchSz: uint32(idx[1] - idx[0]), 809 fileName: t.fileName, 810 } 811 812 found = append(found, cm) 813 } 814 t.found = found 815 t.reEvaluated = true 816 817 return matchesStateForSlice(t.found) 818} 819 820func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 821 if t.evaluated { 822 return matchesStateForSlice(t.found) 823 } 824 825 if cost < costRegexp { 826 return matchesRequiresHigherCost 827 } 828 829 data := cp.data(t.fileName) 830 offset := 0 831 found := t.found[:0] 832 for { 833 idx := bytes.Index(data[offset:], []byte(t.word)) 834 if idx < 0 { 835 break 836 } 837 838 relStartOffset := offset + idx 839 relEndOffset := relStartOffset + len(t.word) 840 841 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 842 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 843 if startBoundary && endBoundary { 844 found = append(found, &candidateMatch{ 845 byteOffset: uint32(offset + idx), 846 byteMatchSz: uint32(len(t.word)), 847 fileName: t.fileName, 848 }) 849 } 850 offset += idx + len(t.word) 851 } 852 853 t.found = found 854 t.evaluated = true 855 856 return matchesStateForSlice(t.found) 857} 858 859// breakMatchesOnNewlines returns matches resulting from breaking each element 860// of cms on newlines within text. 861func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 862 var lineCMs []*candidateMatch 863 for _, cm := range cms { 864 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 865 } 866 return lineCMs 867} 868 869// breakOnNewlines returns matches resulting from breaking cm on newlines 870// within text. 871func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 872 var cms []*candidateMatch 873 addMe := &candidateMatch{} 874 *addMe = *cm 875 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 876 if text[i] == '\n' { 877 addMe.byteMatchSz = i - addMe.byteOffset 878 if addMe.byteMatchSz != 0 { 879 cms = append(cms, addMe) 880 } 881 882 addMe = &candidateMatch{} 883 *addMe = *cm 884 addMe.byteOffset = i + 1 885 } 886 } 887 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 888 if addMe.byteMatchSz != 0 { 889 cms = append(cms, addMe) 890 } 891 return cms 892} 893 894// evalMatchTree should be called instead of directly calling 895// matchTree.matches. It cache known values for future evaluation at higher 896// costs. 897func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 898 if v, ok := known[mt]; ok { 899 return matchesStatePred(v) 900 } 901 902 ms := mt.matches(cp, cost, known) 903 if ms != matchesRequiresHigherCost { 904 known[mt] = ms == matchesFound 905 } 906 907 return ms 908} 909 910func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 911 switch evalMatchTree(cp, cost, known, t.child) { 912 case matchesRequiresHigherCost: 913 return matchesRequiresHigherCost 914 case matchesFound: 915 return matchesNone 916 case matchesNone: 917 return matchesFound 918 default: 919 panic("unreachable") 920 } 921} 922 923func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 924 return evalMatchTree(cp, cost, known, t.child) 925} 926 927func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 928 return evalMatchTree(cp, cost, known, t.child) 929} 930 931func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 932 if t.contEvaluated { 933 return matchesStateForSlice(t.current) 934 } 935 936 if len(t.current) == 0 { 937 return matchesNone 938 } 939 940 if t.fileName && cost < costMemory { 941 return matchesRequiresHigherCost 942 } 943 944 if !t.fileName && cost < costContent { 945 return matchesRequiresHigherCost 946 } 947 948 pruned := t.current[:0] 949 for _, m := range t.current { 950 if m.byteOffset == 0 && m.runeOffset > 0 { 951 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 952 } 953 if m.matchContent(cp.data(m.fileName)) { 954 pruned = append(pruned, m) 955 } 956 } 957 t.current = pruned 958 t.contEvaluated = true 959 960 return matchesStateForSlice(t.current) 961} 962 963type matchTreeOpt struct { 964 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 965 // This was added since we do not support wordMatchTree with symbol search. 966 DisableWordMatchOptimization bool 967} 968 969func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 970 if q == nil { 971 return nil, fmt.Errorf("got nil (sub)query") 972 } 973 switch s := q.(type) { 974 case *query.Regexp: 975 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 976 // superset of the regexp. If the returned matchTree is equivalent to the 977 // original regexp, it returns true. An equivalent matchTree has the same 978 // behaviour as the original regexp and can be used instead. 979 // 980 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 981 if err != nil { 982 return nil, err 983 } 984 // if the query can be used in place of the regexp 985 // return the subtree 986 if isEq { 987 return subMT, nil 988 } 989 990 var tr matchTree 991 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 992 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 993 // provide something faster. 994 tr = wmt 995 } else { 996 tr = newRegexpMatchTree(s) 997 } 998 999 return &andMatchTree{ 1000 children: []matchTree{ 1001 tr, &noVisitMatchTree{subMT}, 1002 }, 1003 }, nil 1004 case *query.And: 1005 var r []matchTree 1006 for _, ch := range s.Children { 1007 ct, err := d.newMatchTree(ch, opt) 1008 if err != nil { 1009 return nil, err 1010 } 1011 r = append(r, ct) 1012 } 1013 return &andMatchTree{r}, nil 1014 case *query.Or: 1015 var r []matchTree 1016 for _, ch := range s.Children { 1017 ct, err := d.newMatchTree(ch, opt) 1018 if err != nil { 1019 return nil, err 1020 } 1021 r = append(r, ct) 1022 } 1023 return &orMatchTree{r}, nil 1024 case *query.Not: 1025 ct, err := d.newMatchTree(s.Child, opt) 1026 return &notMatchTree{ 1027 child: ct, 1028 }, err 1029 1030 case *query.Type: 1031 if s.Type != query.TypeFileName { 1032 break 1033 } 1034 1035 ct, err := d.newMatchTree(s.Child, opt) 1036 if err != nil { 1037 return nil, err 1038 } 1039 1040 return &fileNameMatchTree{ 1041 child: ct, 1042 }, nil 1043 1044 case *query.Boost: 1045 ct, err := d.newMatchTree(s.Child, opt) 1046 if err != nil { 1047 return nil, err 1048 } 1049 1050 return &boostMatchTree{ 1051 child: ct, 1052 boost: s.Boost, 1053 }, nil 1054 1055 case *query.Substring: 1056 return d.newSubstringMatchTree(s) 1057 1058 case *query.Branch: 1059 masks := make([]uint64, 0, len(d.repoMetaData)) 1060 if s.Pattern == "HEAD" { 1061 for range d.repoMetaData { 1062 masks = append(masks, 1) 1063 } 1064 } else { 1065 for _, branchIDs := range d.branchIDs { 1066 mask := uint64(0) 1067 for nm, m := range branchIDs { 1068 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1069 mask |= uint64(m) 1070 } 1071 } 1072 masks = append(masks, mask) 1073 } 1074 } 1075 return &branchQueryMatchTree{ 1076 masks: masks, 1077 fileMasks: d.fileBranchMasks, 1078 repos: d.repos, 1079 }, nil 1080 case *query.Const: 1081 if s.Value { 1082 return &bruteForceMatchTree{}, nil 1083 } else { 1084 return &noMatchTree{Why: "const"}, nil 1085 } 1086 case *query.Language: 1087 code, ok := d.metaData.LanguageMap[s.Language] 1088 if !ok { 1089 return &noMatchTree{Why: "lang"}, nil 1090 } 1091 return &docMatchTree{ 1092 reason: "language", 1093 numDocs: d.numDocs(), 1094 predicate: func(docID uint32) bool { 1095 return d.getLanguage(docID) == code 1096 }, 1097 }, nil 1098 1099 case *query.Symbol: 1100 // Disable WordMatchTree since we don't support it in symbols yet. 1101 optCopy := opt 1102 optCopy.DisableWordMatchOptimization = true 1103 1104 subMT, err := d.newMatchTree(s.Expr, optCopy) 1105 if err != nil { 1106 return nil, err 1107 } 1108 1109 if substr, ok := subMT.(*substrMatchTree); ok { 1110 return &symbolSubstrMatchTree{ 1111 substrMatchTree: substr, 1112 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1113 fileEndRunes: d.fileEndRunes, 1114 fileEndSymbol: d.fileEndSymbol, 1115 sections: d.runeDocSections, 1116 }, nil 1117 } 1118 1119 var regexpMT *regexpMatchTree 1120 visitMatchTree(subMT, func(mt matchTree) { 1121 if t, ok := mt.(*regexpMatchTree); ok { 1122 regexpMT = t 1123 } 1124 }) 1125 if regexpMT == nil { 1126 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1127 } 1128 1129 return &symbolRegexpMatchTree{ 1130 regexp: regexpMT.regexp, 1131 all: isRegexpAll(regexpMT.origRegexp), 1132 matchTree: subMT, 1133 }, nil 1134 1135 case *query.FileNameSet: 1136 return &docMatchTree{ 1137 reason: "FileNameSet", 1138 numDocs: d.numDocs(), 1139 predicate: func(docID uint32) bool { 1140 fileName := d.fileName(docID) 1141 _, ok := s.Set[string(fileName)] 1142 return ok 1143 }, 1144 }, nil 1145 1146 case *query.BranchesRepos: 1147 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1148 for repoIdx := range d.repoMetaData { 1149 var mask uint64 1150 for _, br := range s.List { 1151 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1152 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1153 } 1154 } 1155 reposBranchesWant[repoIdx] = mask 1156 } 1157 return &docMatchTree{ 1158 reason: "BranchesRepos", 1159 numDocs: d.numDocs(), 1160 predicate: func(docID uint32) bool { 1161 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1162 }, 1163 }, nil 1164 1165 case *query.RepoSet: 1166 reposWant := make([]bool, len(d.repoMetaData)) 1167 for repoIdx, r := range d.repoMetaData { 1168 if _, ok := s.Set[r.Name]; ok { 1169 reposWant[repoIdx] = true 1170 } 1171 } 1172 return &docMatchTree{ 1173 reason: "RepoSet", 1174 numDocs: d.numDocs(), 1175 predicate: func(docID uint32) bool { 1176 return reposWant[d.repos[docID]] 1177 }, 1178 }, nil 1179 1180 case *query.RepoIDs: 1181 reposWant := make([]bool, len(d.repoMetaData)) 1182 for repoIdx, r := range d.repoMetaData { 1183 if s.Repos.Contains(r.ID) { 1184 reposWant[repoIdx] = true 1185 } 1186 } 1187 return &docMatchTree{ 1188 reason: "RepoIDs", 1189 numDocs: d.numDocs(), 1190 predicate: func(docID uint32) bool { 1191 return reposWant[d.repos[docID]] 1192 }, 1193 }, nil 1194 1195 case *query.Repo: 1196 reposWant := make([]bool, len(d.repoMetaData)) 1197 for repoIdx, r := range d.repoMetaData { 1198 if s.Regexp.MatchString(r.Name) { 1199 reposWant[repoIdx] = true 1200 } 1201 } 1202 return &docMatchTree{ 1203 reason: "Repo", 1204 numDocs: d.numDocs(), 1205 predicate: func(docID uint32) bool { 1206 return reposWant[d.repos[docID]] 1207 }, 1208 }, nil 1209 1210 case *query.RepoRegexp: 1211 reposWant := make([]bool, len(d.repoMetaData)) 1212 for repoIdx, r := range d.repoMetaData { 1213 if s.Regexp.MatchString(r.Name) { 1214 reposWant[repoIdx] = true 1215 } 1216 } 1217 return &docMatchTree{ 1218 reason: "RepoRegexp", 1219 numDocs: d.numDocs(), 1220 predicate: func(docID uint32) bool { 1221 return reposWant[d.repos[docID]] 1222 }, 1223 }, nil 1224 1225 case query.RawConfig: 1226 return &docMatchTree{ 1227 reason: s.String(), 1228 numDocs: d.numDocs(), 1229 predicate: func(docID uint32) bool { 1230 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1231 }, 1232 }, nil 1233 } 1234 log.Panicf("type %T", q) 1235 return nil, nil 1236} 1237 1238func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1239 st := &substrMatchTree{ 1240 query: s, 1241 caseSensitive: s.CaseSensitive, 1242 fileName: s.FileName, 1243 } 1244 1245 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1246 return newRegexpMatchTree(&query.Regexp{ 1247 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1248 FileName: s.FileName, 1249 Content: s.Content, 1250 CaseSensitive: s.CaseSensitive, 1251 }), nil 1252 } 1253 1254 result, err := d.iterateNgrams(s) 1255 if err != nil { 1256 return nil, err 1257 } 1258 st.matchIterator = result 1259 return st, nil 1260} 1261 1262func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1263 if opt.DisableWordMatchOptimization { 1264 return nil, false 1265 } 1266 // Needs to be case sensitive 1267 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1268 return nil, false 1269 } 1270 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1271 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1272 return nil, false 1273 } 1274 sub := q.Regexp.Sub 1275 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1276 return nil, false 1277 } 1278 1279 return &wordMatchTree{ 1280 word: string(sub[1].Rune), 1281 fileName: q.FileName, 1282 }, true 1283} 1284 1285// pruneMatchTree removes impossible branches from the matchTree, as indicated 1286// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1287func pruneMatchTree(mt matchTree) (matchTree, error) { 1288 var err error 1289 switch mt := mt.(type) { 1290 // leaf nodes that we test with the filter: 1291 case *substrMatchTree: 1292 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1293 if _, ok := res.matchIterator.(*noMatchTree); ok { 1294 return nil, nil 1295 } 1296 } 1297 // recursive tree structures: 1298 case *andMatchTree: 1299 // Any branch of an and becoming impossible means the entire clause 1300 // is impossible. Otherwise, just handle rewrites. 1301 for i, child := range mt.children { 1302 newChild, err := pruneMatchTree(child) 1303 if err != nil { 1304 return nil, err 1305 } 1306 if newChild == nil { 1307 return nil, nil 1308 } 1309 mt.children[i] = newChild 1310 } 1311 case *orMatchTree: 1312 // *All* branches of an OR becoming impossible means the entire clause 1313 // is impossible. Otherwise, drop impossible subclauses and handle 1314 // rewrites, including simplifying to a singular resulting child branch. 1315 n := 0 1316 for _, child := range mt.children { 1317 newChild, err := pruneMatchTree(child) 1318 if err != nil { 1319 return nil, err 1320 } 1321 if newChild != nil { 1322 mt.children[n] = newChild 1323 n++ 1324 } 1325 } 1326 mt.children = mt.children[:n] 1327 if len(mt.children) == 1 { 1328 return mt.children[0], nil 1329 } else if len(mt.children) == 0 { 1330 return nil, nil 1331 } 1332 case *noVisitMatchTree: 1333 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1334 if err != nil { 1335 return nil, err 1336 } 1337 if mt.matchTree == nil { 1338 return nil, nil 1339 } 1340 case *fileNameMatchTree: 1341 mt.child, err = pruneMatchTree(mt.child) 1342 if err != nil { 1343 return nil, err 1344 } 1345 if mt.child == nil { 1346 return nil, nil 1347 } 1348 case *boostMatchTree: 1349 mt.child, err = pruneMatchTree(mt.child) 1350 if err != nil { 1351 return nil, err 1352 } 1353 if mt.child == nil { 1354 return nil, nil 1355 } 1356 case *andLineMatchTree: 1357 child, err := pruneMatchTree(&mt.andMatchTree) 1358 if err != nil { 1359 return nil, err 1360 } 1361 if child == nil { 1362 return nil, nil 1363 } 1364 if c, ok := child.(*andMatchTree); ok { 1365 mt.andMatchTree = *c 1366 } else { 1367 // the and was simplified to a single clause, 1368 // so the linematch portion is irrelevant. 1369 return mt, nil 1370 } 1371 case *notMatchTree: 1372 mt.child, err = pruneMatchTree(mt.child) 1373 if err != nil { 1374 return nil, err 1375 } 1376 if mt.child == nil { 1377 // not false => true 1378 return &bruteForceMatchTree{}, nil 1379 } 1380 // unhandled: 1381 case *docMatchTree: 1382 case *bruteForceMatchTree: 1383 case *regexpMatchTree: 1384 case *wordMatchTree: 1385 } 1386 return mt, err 1387} 1388 1389// isRegexpAll returns true if the query matches all possible lines. 1390// 1391// Note: it is possible for a funky regex to actually match all but this 1392// returns false. This returns true for normal looking regexes like ".*" or 1393// "(?-s:.*)". 1394func isRegexpAll(r *syntax.Regexp) bool { 1395 // Note: we don't care about any flags since we are looking for .* and we 1396 // don't mind if . matches all or everything but newline. 1397 1398 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1399 for { 1400 // Our main target: .* 1401 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1402 // (?s:.) or (?-s:.) 1403 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1404 } 1405 1406 // Strip away expressions being wrapped in paranthesis 1407 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1408 r = r.Sub[0] 1409 continue 1410 } 1411 1412 return false 1413 } 1414}