fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 36 kB View raw
1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/cespare/xxhash/v2" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt" 30 "github.com/sourcegraph/zoekt/internal/hybridre2" 31 "github.com/sourcegraph/zoekt/internal/syntaxutil" 32 "github.com/sourcegraph/zoekt/query" 33) 34 35// A docIterator iterates over documents in order. 36type docIterator interface { 37 // provide the next document where we may find something interesting. 38 // 39 // This is like a "peek" and shouldn't mutate state. prepare is what should 40 // change state. 41 nextDoc() uint32 42 43 // clears any per-document state of the docIterator, and 44 // prepares for evaluating the given doc. The argument is 45 // strictly increasing over time. 46 prepare(nextDoc uint32) 47} 48 49// costs are passed in increasing order to matchTree.matches until they do not 50// return matchesRequiresHigherCost. 51const ( 52 costConst = 0 53 costMemory = 1 54 costContent = 2 55 costRegexp = 3 56) 57 58const ( 59 costMin = costConst 60 costMax = costRegexp 61) 62 63// matchesState is an enum for the state of a matchTree after a call to 64// matchTree.matches. 65type matchesState uint8 66 67const ( 68 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 69 // a search yet since the cost value is not high enough. 70 matchesRequiresHigherCost matchesState = iota 71 72 // matchesFound is returned when matchTree.matches has done a search and 73 // found one or more matches. 74 matchesFound 75 76 // matchesNone is returned when matchTree.matches has done a search and 77 // found nothing. 78 matchesNone 79) 80 81// matchesStatePred is a helper which returns matchesFound if b is true 82// otherwise returns matchesNone. 83func matchesStatePred(b bool) matchesState { 84 if b { 85 return matchesFound 86 } 87 return matchesNone 88} 89 90// matchesStateForSlice is a helper which returns matchesFound if v is 91// non-empty otherwise returns matchesNone. 92func matchesStateForSlice[T any](v []T) matchesState { 93 return matchesStatePred(len(v) > 0) 94} 95 96// An expression tree coupled with matches. The matchtree has two 97// functions: 98// 99// * it implements boolean combinations (and, or, not) 100// 101// * it implements shortcuts, where we skip documents (for example: if 102// there are no trigram matches, we can be sure there are no substring 103// matches). The matchtree iterates over the documents as they are 104// ordered in the shard. 105// 106// The general process for a given (shard, query) is: 107// 108// - construct matchTree for the query 109// 110// - find all different leaf matchTrees (substring, regexp, etc.) 111// 112// in a loop: 113// 114// - find next doc to process using nextDoc 115// 116// - evaluate atoms (leaf expressions that match text) 117// 118// - evaluate the tree using matches(), storing the result in map. 119// 120// - if the complete tree returns (matches() != matchesRequiresHigherCost) 121// for the document, collect all text matches by looking at leaf 122// matchTrees. 123type matchTree interface { 124 docIterator 125 126 // matches if cost is high enough. See documentation for matchesState's 127 // values. 128 // 129 // Note: Do not call this directly, rather use evalMatchTree which uses 130 // known to cache responses once the state transitions away from 131 // matchesRequiresHigherCost. 132 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 133} 134 135// docMatchTree iterates over documents for which predicate(docID) returns true. 136type docMatchTree struct { 137 // the number of documents in a shard. 138 numDocs uint32 139 140 predicate func(docID uint32) bool 141 142 // provides additional information about the reason why the docMatchTree was 143 // created. 144 reason string 145 146 // mutable 147 firstDone bool 148 docID uint32 149} 150 151type bruteForceMatchTree struct { 152 // mutable 153 firstDone bool 154 docID uint32 155} 156 157type andLineMatchTree struct { 158 andMatchTree 159} 160 161type andMatchTree struct { 162 children []matchTree 163} 164 165type orMatchTree struct { 166 children []matchTree 167} 168 169type notMatchTree struct { 170 child matchTree 171} 172 173// Returns only the filename of child matches. 174type fileNameMatchTree struct { 175 child matchTree 176} 177 178type boostMatchTree struct { 179 child matchTree 180 boost float64 181} 182 183// Don't visit this subtree for collecting matches. 184type noVisitMatchTree struct { 185 matchTree 186} 187 188type regexpMatchTree struct { 189 regexp *regexp.Regexp 190 191 // hybridRegexp is a size-aware wrapper that dispatches to go-re2 for 192 // large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured. 193 // For small inputs and filename matches, regexp is used directly. 194 hybridRegexp *hybridre2.Regexp 195 196 // origRegexp is the original parsed regexp from the query structure. It 197 // does not include mutations such as case sensitivity. 198 origRegexp *syntax.Regexp 199 200 fileName bool 201 202 // mutable 203 reEvaluated bool 204 found []*candidateMatch 205 206 // nextDoc, prepare. 207 bruteForceMatchTree 208} 209 210func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 211 prefix := "" 212 if !s.CaseSensitive { 213 prefix = "(?i)" 214 } 215 216 pattern := prefix + syntaxutil.RegexpString(s.Regexp) 217 218 // hybridRegexp is only used for file content matching; skip the RE2 219 // compilation overhead for filename-only regexps. 220 var hr *hybridre2.Regexp 221 if !s.FileName { 222 hr = hybridre2.MustCompile(pattern) 223 } 224 return &regexpMatchTree{ 225 regexp: regexp.MustCompile(pattern), 226 hybridRegexp: hr, 227 origRegexp: s.Regexp, 228 fileName: s.FileName, 229 } 230} 231 232// \bLITERAL\b 233type wordMatchTree struct { 234 word string 235 236 fileName bool 237 238 // mutable 239 evaluated bool 240 found []*candidateMatch 241 242 // nextDoc, prepare. 243 bruteForceMatchTree 244} 245 246type substrMatchTree struct { 247 matchIterator 248 249 query *query.Substring 250 caseSensitive bool 251 fileName bool 252 253 // mutable 254 current []*candidateMatch 255 contEvaluated bool 256} 257 258type branchQueryMatchTree struct { 259 fileMasks []uint64 260 masks []uint64 261 repos []uint16 262 263 // mutable 264 firstDone bool 265 docID uint32 266} 267 268func (t *branchQueryMatchTree) branchMask() uint64 { 269 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 270} 271 272type symbolRegexpMatchTree struct { 273 matchTree 274 regexp *regexp.Regexp 275 all bool // skips regex match if .* 276 277 reEvaluated bool 278 found []*candidateMatch 279} 280 281func (t *symbolRegexpMatchTree) prepare(doc uint32) { 282 t.reEvaluated = false 283 t.found = t.found[:0] 284 t.matchTree.prepare(doc) 285} 286 287func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 288 if t.reEvaluated { 289 return matchesStateForSlice(t.found) 290 } 291 292 if cost < costRegexp { 293 return matchesRequiresHigherCost 294 } 295 296 sections := cp.docSections() 297 content := cp.data(false) 298 299 found := t.found[:0] 300 for i, sec := range sections { 301 var idx []int 302 if t.all { 303 idx = []int{0, int(sec.End - sec.Start)} 304 } else { 305 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 306 if idx == nil { 307 continue 308 } 309 } 310 311 cm := &candidateMatch{ 312 byteOffset: sec.Start + uint32(idx[0]), 313 byteMatchSz: uint32(idx[1] - idx[0]), 314 symbol: true, 315 symbolIdx: uint32(i), 316 } 317 found = append(found, cm) 318 } 319 t.found = found 320 t.reEvaluated = true 321 322 return matchesStateForSlice(t.found) 323} 324 325type symbolSubstrMatchTree struct { 326 *substrMatchTree 327 328 patternSize uint32 329 fileEndRunes []uint32 330 fileEndSymbol []uint32 331 332 doc uint32 333 sections []DocumentSection 334 335 secID uint32 336} 337 338func (t *symbolSubstrMatchTree) prepare(doc uint32) { 339 t.substrMatchTree.prepare(doc) 340 t.doc = doc 341 342 var fileStart uint32 343 if doc > 0 { 344 fileStart = t.fileEndRunes[doc-1] 345 } 346 347 var sections []DocumentSection 348 if len(t.sections) > 0 { 349 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 350 if most == uint32(len(t.sections)) { 351 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 352 } else { 353 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 354 t.secID++ 355 } 356 357 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 358 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 359 symbolEnd++ 360 } 361 362 sections = t.sections[t.secID:symbolEnd] 363 } 364 } 365 366 secIdx := 0 367 trimmed := t.current[:0] 368 for len(sections) > secIdx && len(t.current) > 0 { 369 start := fileStart + t.current[0].runeOffset 370 end := start + t.patternSize 371 if start >= sections[secIdx].End { 372 secIdx++ 373 continue 374 } 375 376 if start < sections[secIdx].Start { 377 t.current = t.current[1:] 378 continue 379 } 380 381 if end <= sections[secIdx].End { 382 t.current[0].symbol = true 383 t.current[0].symbolIdx = uint32(secIdx) 384 trimmed = append(trimmed, t.current[0]) 385 } 386 387 t.current = t.current[1:] 388 } 389 t.current = trimmed 390} 391 392// all prepare methods 393 394func (t *bruteForceMatchTree) prepare(doc uint32) { 395 t.docID = doc 396 t.firstDone = true 397} 398 399func (t *docMatchTree) prepare(doc uint32) { 400 t.docID = doc 401 t.firstDone = true 402} 403 404func (t *andMatchTree) prepare(doc uint32) { 405 for _, c := range t.children { 406 c.prepare(doc) 407 } 408} 409 410func (t *regexpMatchTree) prepare(doc uint32) { 411 t.found = t.found[:0] 412 t.reEvaluated = false 413 t.bruteForceMatchTree.prepare(doc) 414} 415 416func (t *wordMatchTree) prepare(doc uint32) { 417 t.found = t.found[:0] 418 t.evaluated = false 419 t.bruteForceMatchTree.prepare(doc) 420} 421 422func (t *orMatchTree) prepare(doc uint32) { 423 for _, c := range t.children { 424 c.prepare(doc) 425 } 426} 427 428func (t *notMatchTree) prepare(doc uint32) { 429 t.child.prepare(doc) 430} 431 432func (t *fileNameMatchTree) prepare(doc uint32) { 433 t.child.prepare(doc) 434} 435 436func (t *boostMatchTree) prepare(doc uint32) { 437 t.child.prepare(doc) 438} 439 440func (t *substrMatchTree) prepare(nextDoc uint32) { 441 t.matchIterator.prepare(nextDoc) 442 t.current = t.matchIterator.candidates() 443 t.contEvaluated = false 444} 445 446func (t *branchQueryMatchTree) prepare(doc uint32) { 447 t.firstDone = true 448 t.docID = doc 449} 450 451// nextDoc 452 453func (t *docMatchTree) nextDoc() uint32 { 454 var start uint32 455 if t.firstDone { 456 start = t.docID + 1 457 } 458 for i := start; i < t.numDocs; i++ { 459 if t.predicate(i) { 460 return i 461 } 462 } 463 return math.MaxUint32 464} 465 466func (t *bruteForceMatchTree) nextDoc() uint32 { 467 if !t.firstDone { 468 return 0 469 } 470 return t.docID + 1 471} 472 473func (t *andMatchTree) nextDoc() uint32 { 474 var max uint32 475 for _, c := range t.children { 476 m := c.nextDoc() 477 if m > max { 478 max = m 479 } 480 } 481 return max 482} 483 484func (t *orMatchTree) nextDoc() uint32 { 485 min := uint32(math.MaxUint32) 486 for _, c := range t.children { 487 m := c.nextDoc() 488 if m < min { 489 min = m 490 } 491 } 492 return min 493} 494 495func (t *notMatchTree) nextDoc() uint32 { 496 return 0 497} 498 499func (t *fileNameMatchTree) nextDoc() uint32 { 500 return t.child.nextDoc() 501} 502 503func (t *boostMatchTree) nextDoc() uint32 { 504 return t.child.nextDoc() 505} 506 507func (t *branchQueryMatchTree) nextDoc() uint32 { 508 var start uint32 509 if t.firstDone { 510 start = t.docID + 1 511 } 512 513 for i := start; i < uint32(len(t.fileMasks)); i++ { 514 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 515 return i 516 } 517 } 518 return math.MaxUint32 519} 520 521// all String methods 522 523func (t *bruteForceMatchTree) String() string { 524 return "all" 525} 526 527func (t *docMatchTree) String() string { 528 return fmt.Sprintf("doc(%s)", t.reason) 529} 530 531func (t *andMatchTree) String() string { 532 return fmt.Sprintf("and%v", t.children) 533} 534 535func (t *regexpMatchTree) String() string { 536 f := "" 537 if t.fileName { 538 f = "f" 539 } 540 return fmt.Sprintf("%sre(%s)", f, t.regexp) 541} 542 543func (t *wordMatchTree) String() string { 544 f := "" 545 if t.fileName { 546 f = "f" 547 } 548 return fmt.Sprintf("%sword(%s)", f, t.word) 549} 550 551func (t *orMatchTree) String() string { 552 return fmt.Sprintf("or%v", t.children) 553} 554 555func (t *notMatchTree) String() string { 556 return fmt.Sprintf("not(%v)", t.child) 557} 558 559func (t *noVisitMatchTree) String() string { 560 return fmt.Sprintf("novisit(%v)", t.matchTree) 561} 562 563func (t *fileNameMatchTree) String() string { 564 return fmt.Sprintf("f(%v)", t.child) 565} 566 567func (t *boostMatchTree) String() string { 568 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 569} 570 571func (t *substrMatchTree) String() string { 572 f := "" 573 if t.fileName { 574 f = "f" 575 } 576 577 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 578} 579 580func (t *branchQueryMatchTree) String() string { 581 return fmt.Sprintf("branch(%x)", t.masks) 582} 583 584func (t *symbolSubstrMatchTree) String() string { 585 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 586} 587 588func (t *symbolRegexpMatchTree) String() string { 589 return fmt.Sprintf("symbol(%v)", t.matchTree) 590} 591 592// visitMatches visits all atoms in matchTree. Note: This visits 593// noVisitMatchTree. For collecting matches use visitMatches. 594func visitMatchTree(t matchTree, f func(matchTree)) { 595 switch s := t.(type) { 596 case *andMatchTree: 597 for _, ch := range s.children { 598 visitMatchTree(ch, f) 599 } 600 case *orMatchTree: 601 for _, ch := range s.children { 602 visitMatchTree(ch, f) 603 } 604 case *andLineMatchTree: 605 visitMatchTree(&s.andMatchTree, f) 606 case *noVisitMatchTree: 607 visitMatchTree(s.matchTree, f) 608 case *notMatchTree: 609 visitMatchTree(s.child, f) 610 case *fileNameMatchTree: 611 visitMatchTree(s.child, f) 612 case *boostMatchTree: 613 visitMatchTree(s.child, f) 614 case *symbolSubstrMatchTree: 615 visitMatchTree(s.substrMatchTree, f) 616 case *symbolRegexpMatchTree: 617 visitMatchTree(s.matchTree, f) 618 default: 619 f(t) 620 } 621} 622 623// updateMatchTreeStats calls updateStats on all atoms in mt which have that 624// function defined. 625func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) { 626 visitMatchTree(mt, func(mt matchTree) { 627 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok { 628 atom.updateStats(stats) 629 } 630 }) 631} 632 633func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 634 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 635 f(mt) 636 }) 637} 638 639// visitMatches visits all atoms which can contribute matches. Note: This 640// skips noVisitMatchTree. 641func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 642 switch s := t.(type) { 643 case *andMatchTree: 644 for _, ch := range s.children { 645 if known[ch] { 646 visitMatches(ch, known, weight, f) 647 } 648 } 649 case *andLineMatchTree: 650 visitMatches(&s.andMatchTree, known, weight, f) 651 case *orMatchTree: 652 for _, ch := range s.children { 653 if known[ch] { 654 visitMatches(ch, known, weight, f) 655 } 656 } 657 case *boostMatchTree: 658 visitMatches(s.child, known, weight*s.boost, f) 659 case *symbolSubstrMatchTree: 660 visitMatches(s.substrMatchTree, known, weight, f) 661 case *notMatchTree: 662 case *noVisitMatchTree: 663 // don't collect into negative trees. 664 case *fileNameMatchTree: 665 // We will just gather the filename if we do not visit this tree. 666 default: 667 f(s, weight) 668 } 669} 670 671// all matches() methods. 672 673func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 674 return matchesStatePred(t.predicate(cp.idx)) 675} 676 677func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 678 return matchesFound 679} 680 681// andLineMatchTree is a performance optimization of andMatchTree. For content 682// searches we don't want to run the regex engine if there is no line that 683// contains matches from all terms. 684func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 685 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 686 return state 687 } 688 689 // Invariant: all children have matches. If any line contains all of them we 690 // can return MatchesFound. 691 692 // find child with fewest candidates 693 minCount := math.MaxInt 694 fewestChildren := 0 695 for ix, child := range t.children { 696 v, ok := child.(*substrMatchTree) 697 // make sure we are running a content search and that all candidates are a 698 // substrMatchTree 699 if !ok || v.fileName { 700 return matchesFound 701 } 702 if len(v.current) < minCount { 703 minCount = len(v.current) 704 fewestChildren = ix 705 } 706 } 707 708 type lineRange struct { 709 start int 710 end int 711 } 712 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 713 prev := -1 714 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 715 line := cp.newlines().atOffset(candidate.byteOffset) 716 if line == prev { 717 continue 718 } 719 prev = line 720 byteStart := int(cp.newlines().lineStart(line)) 721 byteEnd := int(cp.newlines().lineStart(line + 1)) 722 lines = append(lines, lineRange{byteStart, byteEnd}) 723 } 724 725 // children keeps track of the children's candidates we have already seen. 726 children := make([][]*candidateMatch, 0, len(t.children)-1) 727 for j, child := range t.children { 728 if j == fewestChildren { 729 continue 730 } 731 children = append(children, child.(*substrMatchTree).current) 732 } 733 734nextLine: 735 for i := 0; i < len(lines); i++ { 736 hits := 1 737 nextChild: 738 for j := range children { 739 nextCandidate: 740 for len(children[j]) > 0 { 741 candidate := children[j][0] 742 bo := int(cp.findOffset(false, candidate.runeOffset)) 743 if bo < lines[i].start { 744 children[j] = children[j][1:] 745 continue nextCandidate 746 } 747 if bo < lines[i].end { 748 hits++ 749 continue nextChild 750 } 751 // move the `lines` iterator forward until bo < line.end 752 for i < len(lines) && bo >= lines[i].end { 753 i++ 754 } 755 i-- 756 continue nextLine 757 } 758 } 759 // return early once we found any line that contains matches from all children 760 if hits == len(t.children) { 761 return matchesFound 762 } 763 } 764 return matchesNone 765} 766 767func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 768 // We have found matches unless a child needs to do more work or it hasn't 769 // found matches. 770 state := matchesFound 771 772 for _, ch := range t.children { 773 switch evalMatchTree(cp, cost, known, ch) { 774 case matchesRequiresHigherCost: 775 // keep evaluating other children incase we come across matchesNone 776 state = matchesRequiresHigherCost 777 case matchesFound: 778 // will return this if every child has this value 779 case matchesNone: 780 return matchesNone 781 } 782 } 783 784 return state 785} 786 787func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 788 // we could short-circuit, but we want to use the other possibilities as a 789 // ranking signal. So we always return the most conservative state. 790 state := matchesNone 791 for _, ch := range t.children { 792 switch evalMatchTree(cp, cost, known, ch) { 793 case matchesRequiresHigherCost: 794 state = matchesRequiresHigherCost 795 case matchesFound: 796 if state != matchesRequiresHigherCost { 797 state = matchesFound 798 } 799 case matchesNone: 800 // noop 801 } 802 } 803 return state 804} 805 806func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 807 return matchesStatePred(t.branchMask() != 0) 808} 809 810func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 811 if t.reEvaluated { 812 return matchesStateForSlice(t.found) 813 } 814 815 if cost < costRegexp { 816 return matchesRequiresHigherCost 817 } 818 819 cp.stats.RegexpsConsidered++ 820 data := cp.data(t.fileName) 821 // For file content, use hybridRegexp which dispatches to go-re2 when 822 // len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use 823 // grafana/regexp directly: filenames are always short, so the WASM 824 // call overhead of go-re2 outweighs any benefit. 825 var idxs [][]int 826 if t.fileName { 827 idxs = t.regexp.FindAllIndex(data, -1) 828 } else { 829 idxs = t.hybridRegexp.FindAllIndex(data, -1) 830 } 831 found := t.found[:0] 832 for _, idx := range idxs { 833 cm := &candidateMatch{ 834 byteOffset: uint32(idx[0]), 835 byteMatchSz: uint32(idx[1] - idx[0]), 836 fileName: t.fileName, 837 } 838 839 found = append(found, cm) 840 } 841 t.found = found 842 t.reEvaluated = true 843 844 return matchesStateForSlice(t.found) 845} 846 847func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 848 if t.evaluated { 849 return matchesStateForSlice(t.found) 850 } 851 852 if cost < costRegexp { 853 return matchesRequiresHigherCost 854 } 855 856 data := cp.data(t.fileName) 857 offset := 0 858 found := t.found[:0] 859 for { 860 idx := bytes.Index(data[offset:], []byte(t.word)) 861 if idx < 0 { 862 break 863 } 864 865 relStartOffset := offset + idx 866 relEndOffset := relStartOffset + len(t.word) 867 868 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 869 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 870 if startBoundary && endBoundary { 871 found = append(found, &candidateMatch{ 872 byteOffset: uint32(offset + idx), 873 byteMatchSz: uint32(len(t.word)), 874 fileName: t.fileName, 875 }) 876 } 877 offset += idx + len(t.word) 878 } 879 880 t.found = found 881 t.evaluated = true 882 883 return matchesStateForSlice(t.found) 884} 885 886// breakMatchesOnNewlines returns matches resulting from breaking each element 887// of cms on newlines within text. 888func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 889 var lineCMs []*candidateMatch 890 for _, cm := range cms { 891 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 892 } 893 return lineCMs 894} 895 896// breakOnNewlines returns matches resulting from breaking cm on newlines 897// within text. 898func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 899 var cms []*candidateMatch 900 addMe := &candidateMatch{} 901 *addMe = *cm 902 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 903 if text[i] == '\n' { 904 addMe.byteMatchSz = i - addMe.byteOffset 905 if addMe.byteMatchSz != 0 { 906 cms = append(cms, addMe) 907 } 908 909 addMe = &candidateMatch{} 910 *addMe = *cm 911 addMe.byteOffset = i + 1 912 } 913 } 914 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 915 if addMe.byteMatchSz != 0 { 916 cms = append(cms, addMe) 917 } 918 return cms 919} 920 921// evalMatchTree should be called instead of directly calling 922// matchTree.matches. It cache known values for future evaluation at higher 923// costs. 924func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 925 if v, ok := known[mt]; ok { 926 return matchesStatePred(v) 927 } 928 929 ms := mt.matches(cp, cost, known) 930 if ms != matchesRequiresHigherCost { 931 known[mt] = ms == matchesFound 932 } 933 934 return ms 935} 936 937func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 938 switch evalMatchTree(cp, cost, known, t.child) { 939 case matchesRequiresHigherCost: 940 return matchesRequiresHigherCost 941 case matchesFound: 942 return matchesNone 943 case matchesNone: 944 return matchesFound 945 default: 946 panic("unreachable") 947 } 948} 949 950func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 951 return evalMatchTree(cp, cost, known, t.child) 952} 953 954func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 955 return evalMatchTree(cp, cost, known, t.child) 956} 957 958func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 959 if t.contEvaluated { 960 return matchesStateForSlice(t.current) 961 } 962 963 if len(t.current) == 0 { 964 return matchesNone 965 } 966 967 if t.fileName && cost < costMemory { 968 return matchesRequiresHigherCost 969 } 970 971 if !t.fileName && cost < costContent { 972 return matchesRequiresHigherCost 973 } 974 975 pruned := t.current[:0] 976 for _, m := range t.current { 977 if m.byteOffset == 0 && m.runeOffset > 0 { 978 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 979 } 980 if m.matchContent(cp.data(m.fileName)) { 981 pruned = append(pruned, m) 982 } 983 } 984 t.current = pruned 985 t.contEvaluated = true 986 987 return matchesStateForSlice(t.current) 988} 989 990type matchTreeOpt struct { 991 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 992 // This was added since we do not support wordMatchTree with symbol search. 993 DisableWordMatchOptimization bool 994} 995 996func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 997 if q == nil { 998 return nil, fmt.Errorf("got nil (sub)query") 999 } 1000 1001 switch s := q.(type) { 1002 case *query.Regexp: 1003 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 1004 // superset of the regexp. If the returned matchTree is equivalent to the 1005 // original regexp, it returns true. An equivalent matchTree has the same 1006 // behaviour as the original regexp and can be used instead. 1007 // 1008 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 1009 if err != nil { 1010 return nil, err 1011 } 1012 // if the query can be used in place of the regexp 1013 // return the subtree 1014 if isEq { 1015 return subMT, nil 1016 } 1017 1018 var tr matchTree 1019 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 1020 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 1021 // provide something faster. 1022 tr = wmt 1023 } else { 1024 tr = newRegexpMatchTree(s) 1025 } 1026 1027 return &andMatchTree{ 1028 children: []matchTree{ 1029 tr, &noVisitMatchTree{subMT}, 1030 }, 1031 }, nil 1032 case *query.And: 1033 var r []matchTree 1034 for _, ch := range s.Children { 1035 ct, err := d.newMatchTree(ch, opt) 1036 if err != nil { 1037 return nil, err 1038 } 1039 r = append(r, ct) 1040 } 1041 return &andMatchTree{r}, nil 1042 case *query.Or: 1043 var r []matchTree 1044 for _, ch := range s.Children { 1045 ct, err := d.newMatchTree(ch, opt) 1046 if err != nil { 1047 return nil, err 1048 } 1049 r = append(r, ct) 1050 } 1051 return &orMatchTree{r}, nil 1052 case *query.Not: 1053 ct, err := d.newMatchTree(s.Child, opt) 1054 return &notMatchTree{ 1055 child: ct, 1056 }, err 1057 1058 case *query.Type: 1059 if s.Type != query.TypeFileName { 1060 break 1061 } 1062 1063 ct, err := d.newMatchTree(s.Child, opt) 1064 if err != nil { 1065 return nil, err 1066 } 1067 1068 return &fileNameMatchTree{ 1069 child: ct, 1070 }, nil 1071 1072 case *query.Boost: 1073 ct, err := d.newMatchTree(s.Child, opt) 1074 if err != nil { 1075 return nil, err 1076 } 1077 1078 return &boostMatchTree{ 1079 child: ct, 1080 boost: s.Boost, 1081 }, nil 1082 1083 case *query.Meta: 1084 checksum := queryMetaChecksum(s.Field, s.Value) 1085 cacheKeyField := "Meta" 1086 if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok { 1087 return cached, nil 1088 } 1089 1090 reposWant := make([]bool, len(d.repoMetaData)) 1091 for repoIdx, r := range d.repoMetaData { 1092 if r.Metadata != nil { 1093 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) { 1094 reposWant[repoIdx] = true 1095 } 1096 } 1097 } 1098 1099 mt := &docMatchTree{ 1100 reason: "Meta", 1101 numDocs: d.numDocs(), 1102 predicate: func(docID uint32) bool { 1103 repoIdx := d.repos[docID] 1104 if int(repoIdx) >= len(reposWant) { 1105 return false 1106 } 1107 return reposWant[repoIdx] 1108 }, 1109 } 1110 d.docMatchTreeCache.Add(cacheKeyField, checksum, mt) 1111 return mt, nil 1112 1113 case *query.Substring: 1114 return d.newSubstringMatchTree(s) 1115 1116 case *query.Branch: 1117 masks := make([]uint64, 0, len(d.repoMetaData)) 1118 if s.Pattern == "HEAD" { 1119 for range d.repoMetaData { 1120 masks = append(masks, 1) 1121 } 1122 } else { 1123 for _, branchIDs := range d.branchIDs { 1124 mask := uint64(0) 1125 for nm, m := range branchIDs { 1126 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1127 mask |= uint64(m) 1128 } 1129 } 1130 masks = append(masks, mask) 1131 } 1132 } 1133 return &branchQueryMatchTree{ 1134 masks: masks, 1135 fileMasks: d.fileBranchMasks, 1136 repos: d.repos, 1137 }, nil 1138 case *query.Const: 1139 if s.Value { 1140 return &bruteForceMatchTree{}, nil 1141 } else { 1142 return &noMatchTree{Why: "const"}, nil 1143 } 1144 case *query.Language: 1145 code, ok := d.metaData.LanguageMap[s.Language] 1146 if !ok { 1147 return &noMatchTree{Why: "lang"}, nil 1148 } 1149 return &docMatchTree{ 1150 reason: "language", 1151 numDocs: d.numDocs(), 1152 predicate: func(docID uint32) bool { 1153 return d.getLanguage(docID) == code 1154 }, 1155 }, nil 1156 1157 case *query.Symbol: 1158 // Disable WordMatchTree since we don't support it in symbols yet. 1159 optCopy := opt 1160 optCopy.DisableWordMatchOptimization = true 1161 1162 subMT, err := d.newMatchTree(s.Expr, optCopy) 1163 if err != nil { 1164 return nil, err 1165 } 1166 1167 if substr, ok := subMT.(*substrMatchTree); ok { 1168 return &symbolSubstrMatchTree{ 1169 substrMatchTree: substr, 1170 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1171 fileEndRunes: d.fileEndRunes, 1172 fileEndSymbol: d.fileEndSymbol, 1173 sections: d.runeDocSections, 1174 }, nil 1175 } 1176 1177 var regexpMT *regexpMatchTree 1178 visitMatchTree(subMT, func(mt matchTree) { 1179 if t, ok := mt.(*regexpMatchTree); ok { 1180 regexpMT = t 1181 } 1182 }) 1183 if regexpMT == nil { 1184 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1185 } 1186 1187 return &symbolRegexpMatchTree{ 1188 regexp: regexpMT.regexp, 1189 all: isRegexpAll(regexpMT.origRegexp), 1190 matchTree: subMT, 1191 }, nil 1192 1193 case *query.FileNameSet: 1194 return &docMatchTree{ 1195 reason: "FileNameSet", 1196 numDocs: d.numDocs(), 1197 predicate: func(docID uint32) bool { 1198 fileName := d.fileName(docID) 1199 _, ok := s.Set[string(fileName)] 1200 return ok 1201 }, 1202 }, nil 1203 1204 case *query.BranchesRepos: 1205 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1206 for repoIdx := range d.repoMetaData { 1207 var mask uint64 1208 for _, br := range s.List { 1209 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1210 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1211 } 1212 } 1213 reposBranchesWant[repoIdx] = mask 1214 } 1215 return &docMatchTree{ 1216 reason: "BranchesRepos", 1217 numDocs: d.numDocs(), 1218 predicate: func(docID uint32) bool { 1219 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1220 }, 1221 }, nil 1222 1223 case *query.RepoSet: 1224 reposWant := make([]bool, len(d.repoMetaData)) 1225 for repoIdx, r := range d.repoMetaData { 1226 if _, ok := s.Set[r.Name]; ok { 1227 reposWant[repoIdx] = true 1228 } 1229 } 1230 return &docMatchTree{ 1231 reason: "RepoSet", 1232 numDocs: d.numDocs(), 1233 predicate: func(docID uint32) bool { 1234 return reposWant[d.repos[docID]] 1235 }, 1236 }, nil 1237 1238 case *query.RepoIDs: 1239 reposWant := make([]bool, len(d.repoMetaData)) 1240 for repoIdx, r := range d.repoMetaData { 1241 if s.Repos.Contains(r.ID) { 1242 reposWant[repoIdx] = true 1243 } 1244 } 1245 return &docMatchTree{ 1246 reason: "RepoIDs", 1247 numDocs: d.numDocs(), 1248 predicate: func(docID uint32) bool { 1249 return reposWant[d.repos[docID]] 1250 }, 1251 }, nil 1252 1253 case *query.Repo: 1254 reposWant := make([]bool, len(d.repoMetaData)) 1255 for repoIdx, r := range d.repoMetaData { 1256 if s.Regexp.MatchString(r.Name) { 1257 reposWant[repoIdx] = true 1258 } 1259 } 1260 return &docMatchTree{ 1261 reason: "Repo", 1262 numDocs: d.numDocs(), 1263 predicate: func(docID uint32) bool { 1264 return reposWant[d.repos[docID]] 1265 }, 1266 }, nil 1267 1268 case *query.RepoRegexp: 1269 reposWant := make([]bool, len(d.repoMetaData)) 1270 for repoIdx, r := range d.repoMetaData { 1271 if s.Regexp.MatchString(r.Name) { 1272 reposWant[repoIdx] = true 1273 } 1274 } 1275 return &docMatchTree{ 1276 reason: "RepoRegexp", 1277 numDocs: d.numDocs(), 1278 predicate: func(docID uint32) bool { 1279 return reposWant[d.repos[docID]] 1280 }, 1281 }, nil 1282 1283 case query.RawConfig: 1284 return &docMatchTree{ 1285 reason: s.String(), 1286 numDocs: d.numDocs(), 1287 predicate: func(docID uint32) bool { 1288 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1289 }, 1290 }, nil 1291 } 1292 log.Panicf("type %T", q) 1293 return nil, nil 1294} 1295 1296func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1297 st := &substrMatchTree{ 1298 query: s, 1299 caseSensitive: s.CaseSensitive, 1300 fileName: s.FileName, 1301 } 1302 1303 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1304 return newRegexpMatchTree(&query.Regexp{ 1305 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1306 FileName: s.FileName, 1307 Content: s.Content, 1308 CaseSensitive: s.CaseSensitive, 1309 }), nil 1310 } 1311 1312 result, err := d.iterateNgrams(s) 1313 if err != nil { 1314 return nil, err 1315 } 1316 st.matchIterator = result 1317 return st, nil 1318} 1319 1320func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1321 if opt.DisableWordMatchOptimization { 1322 return nil, false 1323 } 1324 // Needs to be case sensitive 1325 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1326 return nil, false 1327 } 1328 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1329 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1330 return nil, false 1331 } 1332 sub := q.Regexp.Sub 1333 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1334 return nil, false 1335 } 1336 1337 return &wordMatchTree{ 1338 word: string(sub[1].Rune), 1339 fileName: q.FileName, 1340 }, true 1341} 1342 1343// pruneMatchTree removes impossible branches from the matchTree, as indicated 1344// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1345func pruneMatchTree(mt matchTree) (matchTree, error) { 1346 var err error 1347 switch mt := mt.(type) { 1348 // leaf nodes that we test with the filter: 1349 case *substrMatchTree: 1350 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1351 if _, ok := res.matchIterator.(*noMatchTree); ok { 1352 return nil, nil 1353 } 1354 } 1355 // recursive tree structures: 1356 case *andMatchTree: 1357 // Any branch of an and becoming impossible means the entire clause 1358 // is impossible. Otherwise, just handle rewrites. 1359 for i, child := range mt.children { 1360 newChild, err := pruneMatchTree(child) 1361 if err != nil { 1362 return nil, err 1363 } 1364 if newChild == nil { 1365 return nil, nil 1366 } 1367 mt.children[i] = newChild 1368 } 1369 case *orMatchTree: 1370 // *All* branches of an OR becoming impossible means the entire clause 1371 // is impossible. Otherwise, drop impossible subclauses and handle 1372 // rewrites, including simplifying to a singular resulting child branch. 1373 n := 0 1374 for _, child := range mt.children { 1375 newChild, err := pruneMatchTree(child) 1376 if err != nil { 1377 return nil, err 1378 } 1379 if newChild != nil { 1380 mt.children[n] = newChild 1381 n++ 1382 } 1383 } 1384 mt.children = mt.children[:n] 1385 if len(mt.children) == 1 { 1386 return mt.children[0], nil 1387 } else if len(mt.children) == 0 { 1388 return nil, nil 1389 } 1390 case *noVisitMatchTree: 1391 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1392 if err != nil { 1393 return nil, err 1394 } 1395 if mt.matchTree == nil { 1396 return nil, nil 1397 } 1398 case *fileNameMatchTree: 1399 mt.child, err = pruneMatchTree(mt.child) 1400 if err != nil { 1401 return nil, err 1402 } 1403 if mt.child == nil { 1404 return nil, nil 1405 } 1406 case *boostMatchTree: 1407 mt.child, err = pruneMatchTree(mt.child) 1408 if err != nil { 1409 return nil, err 1410 } 1411 if mt.child == nil { 1412 return nil, nil 1413 } 1414 case *andLineMatchTree: 1415 child, err := pruneMatchTree(&mt.andMatchTree) 1416 if err != nil { 1417 return nil, err 1418 } 1419 if child == nil { 1420 return nil, nil 1421 } 1422 if c, ok := child.(*andMatchTree); ok { 1423 mt.andMatchTree = *c 1424 } else { 1425 // the and was simplified to a single clause, 1426 // so the linematch portion is irrelevant. 1427 return mt, nil 1428 } 1429 case *notMatchTree: 1430 mt.child, err = pruneMatchTree(mt.child) 1431 if err != nil { 1432 return nil, err 1433 } 1434 if mt.child == nil { 1435 // not false => true 1436 return &bruteForceMatchTree{}, nil 1437 } 1438 // unhandled: 1439 case *docMatchTree: 1440 case *bruteForceMatchTree: 1441 case *regexpMatchTree: 1442 case *wordMatchTree: 1443 } 1444 return mt, err 1445} 1446 1447// isRegexpAll returns true if the query matches all possible lines. 1448// 1449// Note: it is possible for a funky regex to actually match all but this 1450// returns false. This returns true for normal looking regexes like ".*" or 1451// "(?-s:.*)". 1452func isRegexpAll(r *syntax.Regexp) bool { 1453 // Note: we don't care about any flags since we are looking for .* and we 1454 // don't mind if . matches all or everything but newline. 1455 1456 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1457 for { 1458 // Our main target: .* 1459 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1460 // (?s:.) or (?-s:.) 1461 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1462 } 1463 1464 // Strip away expressions being wrapped in paranthesis 1465 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1466 r = r.Sub[0] 1467 continue 1468 } 1469 1470 return false 1471 } 1472} 1473 1474func queryMetaChecksum(field string, value *regexp.Regexp) string { 1475 h := xxhash.New() 1476 h.Write([]byte(field)) 1477 h.Write([]byte{':'}) 1478 h.Write([]byte(value.String())) 1479 return fmt.Sprintf("%x", h.Sum64()) 1480}