fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/cespare/xxhash/v2" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt" 30 "github.com/sourcegraph/zoekt/internal/hybridre2" 31 "github.com/sourcegraph/zoekt/internal/syntaxutil" 32 "github.com/sourcegraph/zoekt/query" 33) 34 35// A docIterator iterates over documents in order. 36type docIterator interface { 37 // provide the next document where we may find something interesting. 38 // 39 // This is like a "peek" and shouldn't mutate state. prepare is what should 40 // change state. 41 nextDoc() uint32 42 43 // clears any per-document state of the docIterator, and 44 // prepares for evaluating the given doc. The argument is 45 // strictly increasing over time. 46 prepare(nextDoc uint32) 47} 48 49// costs are passed in increasing order to matchTree.matches until they do not 50// return matchesRequiresHigherCost. 51const ( 52 costConst = 0 53 costMemory = 1 54 costContent = 2 55 costRegexp = 3 56) 57 58const ( 59 costMin = costConst 60 costMax = costRegexp 61) 62 63// matchesState is an enum for the state of a matchTree after a call to 64// matchTree.matches. 65type matchesState uint8 66 67const ( 68 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 69 // a search yet since the cost value is not high enough. 70 matchesRequiresHigherCost matchesState = iota 71 72 // matchesFound is returned when matchTree.matches has done a search and 73 // found one or more matches. 74 matchesFound 75 76 // matchesNone is returned when matchTree.matches has done a search and 77 // found nothing. 78 matchesNone 79) 80 81// matchesStatePred is a helper which returns matchesFound if b is true 82// otherwise returns matchesNone. 83func matchesStatePred(b bool) matchesState { 84 if b { 85 return matchesFound 86 } 87 return matchesNone 88} 89 90// matchesStateForSlice is a helper which returns matchesFound if v is 91// non-empty otherwise returns matchesNone. 92func matchesStateForSlice[T any](v []T) matchesState { 93 return matchesStatePred(len(v) > 0) 94} 95 96// An expression tree coupled with matches. The matchtree has two 97// functions: 98// 99// * it implements boolean combinations (and, or, not) 100// 101// * it implements shortcuts, where we skip documents (for example: if 102// there are no trigram matches, we can be sure there are no substring 103// matches). The matchtree iterates over the documents as they are 104// ordered in the shard. 105// 106// The general process for a given (shard, query) is: 107// 108// - construct matchTree for the query 109// 110// - find all different leaf matchTrees (substring, regexp, etc.) 111// 112// in a loop: 113// 114// - find next doc to process using nextDoc 115// 116// - evaluate atoms (leaf expressions that match text) 117// 118// - evaluate the tree using matches(), storing the result in map. 119// 120// - if the complete tree returns (matches() != matchesRequiresHigherCost) 121// for the document, collect all text matches by looking at leaf 122// matchTrees. 123type matchTree interface { 124 docIterator 125 126 // matches if cost is high enough. See documentation for matchesState's 127 // values. 128 // 129 // Note: Do not call this directly, rather use evalMatchTree which uses 130 // known to cache responses once the state transitions away from 131 // matchesRequiresHigherCost. 132 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 133} 134 135// docMatchTree iterates over documents for which predicate(docID) returns true. 136type docMatchTree struct { 137 // the number of documents in a shard. 138 numDocs uint32 139 140 predicate func(docID uint32) bool 141 142 // provides additional information about the reason why the docMatchTree was 143 // created. 144 reason string 145 146 // mutable 147 firstDone bool 148 docID uint32 149} 150 151type bruteForceMatchTree struct { 152 // mutable 153 firstDone bool 154 docID uint32 155} 156 157type andLineMatchTree struct { 158 andMatchTree 159} 160 161type andMatchTree struct { 162 children []matchTree 163} 164 165type orMatchTree struct { 166 children []matchTree 167} 168 169type notMatchTree struct { 170 child matchTree 171} 172 173// Returns only the filename of child matches. 174type fileNameMatchTree struct { 175 child matchTree 176} 177 178type boostMatchTree struct { 179 child matchTree 180 boost float64 181} 182 183// Don't visit this subtree for collecting matches. 184type noVisitMatchTree struct { 185 matchTree 186} 187 188type regexpMatchTree struct { 189 regexp *regexp.Regexp 190 191 // hybridRegexp is a size-aware wrapper that dispatches to go-re2 for 192 // large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured. 193 // For small inputs and filename matches, regexp is used directly. 194 hybridRegexp *hybridre2.Regexp 195 196 // origRegexp is the original parsed regexp from the query structure. It 197 // does not include mutations such as case sensitivity. 198 origRegexp *syntax.Regexp 199 200 fileName bool 201 202 // mutable 203 reEvaluated bool 204 found []*candidateMatch 205 206 // nextDoc, prepare. 207 bruteForceMatchTree 208} 209 210func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 211 prefix := "" 212 if !s.CaseSensitive { 213 prefix = "(?i)" 214 } 215 216 pattern := prefix + syntaxutil.RegexpString(s.Regexp) 217 218 // hybridRegexp is only used for file content matching; skip the RE2 219 // compilation overhead for filename-only regexps. 220 var hr *hybridre2.Regexp 221 if !s.FileName { 222 hr = hybridre2.MustCompile(pattern) 223 } 224 return &regexpMatchTree{ 225 regexp: regexp.MustCompile(pattern), 226 hybridRegexp: hr, 227 origRegexp: s.Regexp, 228 fileName: s.FileName, 229 } 230} 231 232// \bLITERAL\b 233type wordMatchTree struct { 234 word string 235 236 fileName bool 237 238 // mutable 239 evaluated bool 240 found []*candidateMatch 241 242 // nextDoc, prepare. 243 bruteForceMatchTree 244} 245 246type substrMatchTree struct { 247 matchIterator 248 249 query *query.Substring 250 caseSensitive bool 251 fileName bool 252 253 // mutable 254 current []*candidateMatch 255 contEvaluated bool 256} 257 258type branchQueryMatchTree struct { 259 fileMasks []uint64 260 masks []uint64 261 repos []uint16 262 263 // mutable 264 firstDone bool 265 docID uint32 266} 267 268func (t *branchQueryMatchTree) branchMask() uint64 { 269 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 270} 271 272type symbolRegexpMatchTree struct { 273 matchTree 274 regexp *regexp.Regexp 275 all bool // skips regex match if .* 276 277 reEvaluated bool 278 found []*candidateMatch 279} 280 281func (t *symbolRegexpMatchTree) prepare(doc uint32) { 282 t.reEvaluated = false 283 t.found = t.found[:0] 284 t.matchTree.prepare(doc) 285} 286 287func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 288 if t.reEvaluated { 289 return matchesStateForSlice(t.found) 290 } 291 292 if cost < costRegexp { 293 return matchesRequiresHigherCost 294 } 295 296 sections := cp.docSections() 297 content := cp.data(false) 298 299 found := t.found[:0] 300 for i, sec := range sections { 301 var idx []int 302 if t.all { 303 idx = []int{0, int(sec.End - sec.Start)} 304 } else { 305 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 306 if idx == nil { 307 continue 308 } 309 } 310 311 cm := &candidateMatch{ 312 byteOffset: sec.Start + uint32(idx[0]), 313 byteMatchSz: uint32(idx[1] - idx[0]), 314 symbol: true, 315 symbolIdx: uint32(i), 316 } 317 found = append(found, cm) 318 } 319 t.found = found 320 t.reEvaluated = true 321 322 return matchesStateForSlice(t.found) 323} 324 325type symbolSubstrMatchTree struct { 326 *substrMatchTree 327 328 patternSize uint32 329 fileEndRunes []uint32 330 fileEndSymbol []uint32 331 332 doc uint32 333 sections []DocumentSection 334 335 secID uint32 336} 337 338func (t *symbolSubstrMatchTree) prepare(doc uint32) { 339 t.substrMatchTree.prepare(doc) 340 t.doc = doc 341 342 var fileStart uint32 343 if doc > 0 { 344 fileStart = t.fileEndRunes[doc-1] 345 } 346 347 var sections []DocumentSection 348 if len(t.sections) > 0 { 349 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 350 if most == uint32(len(t.sections)) { 351 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 352 } else { 353 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 354 t.secID++ 355 } 356 357 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 358 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 359 symbolEnd++ 360 } 361 362 sections = t.sections[t.secID:symbolEnd] 363 } 364 } 365 366 secIdx := 0 367 trimmed := t.current[:0] 368 for len(sections) > secIdx && len(t.current) > 0 { 369 start := fileStart + t.current[0].runeOffset 370 end := start + t.patternSize 371 if start >= sections[secIdx].End { 372 secIdx++ 373 continue 374 } 375 376 if start < sections[secIdx].Start { 377 t.current = t.current[1:] 378 continue 379 } 380 381 if end <= sections[secIdx].End { 382 t.current[0].symbol = true 383 t.current[0].symbolIdx = uint32(secIdx) 384 trimmed = append(trimmed, t.current[0]) 385 } 386 387 t.current = t.current[1:] 388 } 389 t.current = trimmed 390} 391 392// all prepare methods 393 394func (t *bruteForceMatchTree) prepare(doc uint32) { 395 t.docID = doc 396 t.firstDone = true 397} 398 399func (t *docMatchTree) prepare(doc uint32) { 400 t.docID = doc 401 t.firstDone = true 402} 403 404func (t *andMatchTree) prepare(doc uint32) { 405 for _, c := range t.children { 406 c.prepare(doc) 407 } 408} 409 410func (t *regexpMatchTree) prepare(doc uint32) { 411 t.found = t.found[:0] 412 t.reEvaluated = false 413 t.bruteForceMatchTree.prepare(doc) 414} 415 416func (t *wordMatchTree) prepare(doc uint32) { 417 t.found = t.found[:0] 418 t.evaluated = false 419 t.bruteForceMatchTree.prepare(doc) 420} 421 422func (t *orMatchTree) prepare(doc uint32) { 423 for _, c := range t.children { 424 c.prepare(doc) 425 } 426} 427 428func (t *notMatchTree) prepare(doc uint32) { 429 t.child.prepare(doc) 430} 431 432func (t *fileNameMatchTree) prepare(doc uint32) { 433 t.child.prepare(doc) 434} 435 436func (t *boostMatchTree) prepare(doc uint32) { 437 t.child.prepare(doc) 438} 439 440func (t *substrMatchTree) prepare(nextDoc uint32) { 441 t.matchIterator.prepare(nextDoc) 442 t.current = t.matchIterator.candidates() 443 t.contEvaluated = false 444} 445 446func (t *branchQueryMatchTree) prepare(doc uint32) { 447 t.firstDone = true 448 t.docID = doc 449} 450 451// nextDoc 452 453func (t *docMatchTree) nextDoc() uint32 { 454 var start uint32 455 if t.firstDone { 456 start = t.docID + 1 457 } 458 for i := start; i < t.numDocs; i++ { 459 if t.predicate(i) { 460 return i 461 } 462 } 463 return math.MaxUint32 464} 465 466func (t *bruteForceMatchTree) nextDoc() uint32 { 467 if !t.firstDone { 468 return 0 469 } 470 return t.docID + 1 471} 472 473func (t *andMatchTree) nextDoc() uint32 { 474 var max uint32 475 for _, c := range t.children { 476 m := c.nextDoc() 477 if m > max { 478 max = m 479 } 480 } 481 return max 482} 483 484func (t *orMatchTree) nextDoc() uint32 { 485 min := uint32(math.MaxUint32) 486 for _, c := range t.children { 487 m := c.nextDoc() 488 if m < min { 489 min = m 490 } 491 } 492 return min 493} 494 495func (t *notMatchTree) nextDoc() uint32 { 496 return 0 497} 498 499func (t *fileNameMatchTree) nextDoc() uint32 { 500 return t.child.nextDoc() 501} 502 503func (t *boostMatchTree) nextDoc() uint32 { 504 return t.child.nextDoc() 505} 506 507func (t *branchQueryMatchTree) nextDoc() uint32 { 508 var start uint32 509 if t.firstDone { 510 start = t.docID + 1 511 } 512 513 for i := start; i < uint32(len(t.fileMasks)); i++ { 514 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 515 return i 516 } 517 } 518 return math.MaxUint32 519} 520 521// all String methods 522 523func (t *bruteForceMatchTree) String() string { 524 return "all" 525} 526 527func (t *docMatchTree) String() string { 528 return fmt.Sprintf("doc(%s)", t.reason) 529} 530 531func (t *andMatchTree) String() string { 532 return fmt.Sprintf("and%v", t.children) 533} 534 535func (t *regexpMatchTree) String() string { 536 f := "" 537 if t.fileName { 538 f = "f" 539 } 540 return fmt.Sprintf("%sre(%s)", f, t.regexp) 541} 542 543func (t *wordMatchTree) String() string { 544 f := "" 545 if t.fileName { 546 f = "f" 547 } 548 return fmt.Sprintf("%sword(%s)", f, t.word) 549} 550 551func (t *orMatchTree) String() string { 552 return fmt.Sprintf("or%v", t.children) 553} 554 555func (t *notMatchTree) String() string { 556 return fmt.Sprintf("not(%v)", t.child) 557} 558 559func (t *noVisitMatchTree) String() string { 560 return fmt.Sprintf("novisit(%v)", t.matchTree) 561} 562 563func (t *fileNameMatchTree) String() string { 564 return fmt.Sprintf("f(%v)", t.child) 565} 566 567func (t *boostMatchTree) String() string { 568 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 569} 570 571func (t *substrMatchTree) String() string { 572 f := "" 573 if t.fileName { 574 f = "f" 575 } 576 577 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 578} 579 580func (t *branchQueryMatchTree) String() string { 581 return fmt.Sprintf("branch(%x)", t.masks) 582} 583 584func (t *symbolSubstrMatchTree) String() string { 585 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 586} 587 588func (t *symbolRegexpMatchTree) String() string { 589 return fmt.Sprintf("symbol(%v)", t.matchTree) 590} 591 592// visitMatches visits all atoms in matchTree. Note: This visits 593// noVisitMatchTree. For collecting matches use visitMatches. 594func visitMatchTree(t matchTree, f func(matchTree)) { 595 switch s := t.(type) { 596 case *andMatchTree: 597 for _, ch := range s.children { 598 visitMatchTree(ch, f) 599 } 600 case *orMatchTree: 601 for _, ch := range s.children { 602 visitMatchTree(ch, f) 603 } 604 case *andLineMatchTree: 605 visitMatchTree(&s.andMatchTree, f) 606 case *noVisitMatchTree: 607 visitMatchTree(s.matchTree, f) 608 case *notMatchTree: 609 visitMatchTree(s.child, f) 610 case *fileNameMatchTree: 611 visitMatchTree(s.child, f) 612 case *boostMatchTree: 613 visitMatchTree(s.child, f) 614 case *symbolSubstrMatchTree: 615 visitMatchTree(s.substrMatchTree, f) 616 case *symbolRegexpMatchTree: 617 visitMatchTree(s.matchTree, f) 618 default: 619 f(t) 620 } 621} 622 623// updateMatchTreeStats calls updateStats on all atoms in mt which have that 624// function defined. 625func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) { 626 visitMatchTree(mt, func(mt matchTree) { 627 if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok { 628 atom.updateStats(stats) 629 } 630 }) 631} 632 633func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 634 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 635 f(mt) 636 }) 637} 638 639// visitMatches visits all atoms which can contribute matches. Note: This 640// skips noVisitMatchTree. 641func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 642 switch s := t.(type) { 643 case *andMatchTree: 644 for _, ch := range s.children { 645 if known[ch] { 646 visitMatches(ch, known, weight, f) 647 } 648 } 649 case *andLineMatchTree: 650 visitMatches(&s.andMatchTree, known, weight, f) 651 case *orMatchTree: 652 for _, ch := range s.children { 653 if known[ch] { 654 visitMatches(ch, known, weight, f) 655 } 656 } 657 case *boostMatchTree: 658 visitMatches(s.child, known, weight*s.boost, f) 659 case *symbolSubstrMatchTree: 660 visitMatches(s.substrMatchTree, known, weight, f) 661 case *notMatchTree: 662 case *noVisitMatchTree: 663 // don't collect into negative trees. 664 case *fileNameMatchTree: 665 // We will just gather the filename if we do not visit this tree. 666 default: 667 f(s, weight) 668 } 669} 670 671// all matches() methods. 672 673func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 674 return matchesStatePred(t.predicate(cp.idx)) 675} 676 677func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 678 return matchesFound 679} 680 681// andLineMatchTree is a performance optimization of andMatchTree. For content 682// searches we don't want to run the regex engine if there is no line that 683// contains matches from all terms. 684func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 685 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 686 return state 687 } 688 689 // Invariant: all children have matches. If any line contains all of them we 690 // can return MatchesFound. 691 692 // find child with fewest candidates 693 minCount := math.MaxInt 694 fewestChildren := 0 695 for ix, child := range t.children { 696 v, ok := child.(*substrMatchTree) 697 // make sure we are running a content search and that all candidates are a 698 // substrMatchTree 699 if !ok || v.fileName { 700 return matchesFound 701 } 702 if len(v.current) < minCount { 703 minCount = len(v.current) 704 fewestChildren = ix 705 } 706 } 707 708 type lineRange struct { 709 start int 710 end int 711 } 712 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 713 prev := -1 714 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 715 line := cp.newlines().atOffset(candidate.byteOffset) 716 if line == prev { 717 continue 718 } 719 prev = line 720 byteStart := int(cp.newlines().lineStart(line)) 721 byteEnd := int(cp.newlines().lineStart(line + 1)) 722 lines = append(lines, lineRange{byteStart, byteEnd}) 723 } 724 725 // children keeps track of the children's candidates we have already seen. 726 children := make([][]*candidateMatch, 0, len(t.children)-1) 727 for j, child := range t.children { 728 if j == fewestChildren { 729 continue 730 } 731 children = append(children, child.(*substrMatchTree).current) 732 } 733 734nextLine: 735 for i := 0; i < len(lines); i++ { 736 hits := 1 737 nextChild: 738 for j := range children { 739 nextCandidate: 740 for len(children[j]) > 0 { 741 candidate := children[j][0] 742 bo := int(cp.findOffset(false, candidate.runeOffset)) 743 if bo < lines[i].start { 744 children[j] = children[j][1:] 745 continue nextCandidate 746 } 747 if bo < lines[i].end { 748 hits++ 749 continue nextChild 750 } 751 // move the `lines` iterator forward until bo < line.end 752 for i < len(lines) && bo >= lines[i].end { 753 i++ 754 } 755 i-- 756 continue nextLine 757 } 758 } 759 // return early once we found any line that contains matches from all children 760 if hits == len(t.children) { 761 return matchesFound 762 } 763 } 764 return matchesNone 765} 766 767func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 768 // We have found matches unless a child needs to do more work or it hasn't 769 // found matches. 770 state := matchesFound 771 772 for _, ch := range t.children { 773 switch evalMatchTree(cp, cost, known, ch) { 774 case matchesRequiresHigherCost: 775 // keep evaluating other children incase we come across matchesNone 776 state = matchesRequiresHigherCost 777 case matchesFound: 778 // will return this if every child has this value 779 case matchesNone: 780 return matchesNone 781 } 782 } 783 784 return state 785} 786 787func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 788 // we could short-circuit, but we want to use the other possibilities as a 789 // ranking signal. So we always return the most conservative state. 790 state := matchesNone 791 for _, ch := range t.children { 792 switch evalMatchTree(cp, cost, known, ch) { 793 case matchesRequiresHigherCost: 794 state = matchesRequiresHigherCost 795 case matchesFound: 796 if state != matchesRequiresHigherCost { 797 state = matchesFound 798 } 799 case matchesNone: 800 // noop 801 } 802 } 803 return state 804} 805 806func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 807 return matchesStatePred(t.branchMask() != 0) 808} 809 810func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 811 if t.reEvaluated { 812 return matchesStateForSlice(t.found) 813 } 814 815 if cost < costRegexp { 816 return matchesRequiresHigherCost 817 } 818 819 cp.stats.RegexpsConsidered++ 820 data := cp.data(t.fileName) 821 // For file content, use hybridRegexp which dispatches to go-re2 when 822 // len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use 823 // grafana/regexp directly: filenames are always short, so the WASM 824 // call overhead of go-re2 outweighs any benefit. 825 var idxs [][]int 826 if t.fileName { 827 idxs = t.regexp.FindAllIndex(data, -1) 828 } else { 829 idxs = t.hybridRegexp.FindAllIndex(data, -1) 830 } 831 found := t.found[:0] 832 for _, idx := range idxs { 833 cm := &candidateMatch{ 834 byteOffset: uint32(idx[0]), 835 byteMatchSz: uint32(idx[1] - idx[0]), 836 fileName: t.fileName, 837 } 838 839 found = append(found, cm) 840 } 841 t.found = found 842 t.reEvaluated = true 843 844 return matchesStateForSlice(t.found) 845} 846 847func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 848 if t.evaluated { 849 return matchesStateForSlice(t.found) 850 } 851 852 if cost < costRegexp { 853 return matchesRequiresHigherCost 854 } 855 856 data := cp.data(t.fileName) 857 offset := 0 858 found := t.found[:0] 859 for { 860 idx := bytes.Index(data[offset:], []byte(t.word)) 861 if idx < 0 { 862 break 863 } 864 865 relStartOffset := offset + idx 866 relEndOffset := relStartOffset + len(t.word) 867 868 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 869 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 870 if startBoundary && endBoundary { 871 found = append(found, &candidateMatch{ 872 byteOffset: uint32(offset + idx), 873 byteMatchSz: uint32(len(t.word)), 874 fileName: t.fileName, 875 }) 876 } 877 offset += idx + len(t.word) 878 } 879 880 t.found = found 881 t.evaluated = true 882 883 return matchesStateForSlice(t.found) 884} 885 886// breakMatchesOnNewlines returns matches resulting from breaking each element 887// of cms on newlines within text. 888func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 889 var lineCMs []*candidateMatch 890 for _, cm := range cms { 891 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 892 } 893 return lineCMs 894} 895 896// breakOnNewlines returns matches resulting from breaking cm on newlines 897// within text. 898func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 899 var cms []*candidateMatch 900 addMe := &candidateMatch{} 901 *addMe = *cm 902 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 903 if text[i] == '\n' { 904 addMe.byteMatchSz = i - addMe.byteOffset 905 if addMe.byteMatchSz != 0 { 906 cms = append(cms, addMe) 907 } 908 909 addMe = &candidateMatch{} 910 *addMe = *cm 911 addMe.byteOffset = i + 1 912 } 913 } 914 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 915 if addMe.byteMatchSz != 0 { 916 cms = append(cms, addMe) 917 } 918 return cms 919} 920 921// evalMatchTree should be called instead of directly calling 922// matchTree.matches. It cache known values for future evaluation at higher 923// costs. 924func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 925 if v, ok := known[mt]; ok { 926 return matchesStatePred(v) 927 } 928 929 ms := mt.matches(cp, cost, known) 930 if ms != matchesRequiresHigherCost { 931 known[mt] = ms == matchesFound 932 } 933 934 return ms 935} 936 937func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 938 switch evalMatchTree(cp, cost, known, t.child) { 939 case matchesRequiresHigherCost: 940 return matchesRequiresHigherCost 941 case matchesFound: 942 return matchesNone 943 case matchesNone: 944 return matchesFound 945 default: 946 panic("unreachable") 947 } 948} 949 950func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 951 return evalMatchTree(cp, cost, known, t.child) 952} 953 954func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 955 return evalMatchTree(cp, cost, known, t.child) 956} 957 958func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 959 if t.contEvaluated { 960 return matchesStateForSlice(t.current) 961 } 962 963 if len(t.current) == 0 { 964 return matchesNone 965 } 966 967 if t.fileName && cost < costMemory { 968 return matchesRequiresHigherCost 969 } 970 971 if !t.fileName && cost < costContent { 972 return matchesRequiresHigherCost 973 } 974 975 pruned := t.current[:0] 976 for _, m := range t.current { 977 if m.byteOffset == 0 && m.runeOffset > 0 { 978 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 979 } 980 if m.matchContent(cp.data(m.fileName)) { 981 pruned = append(pruned, m) 982 } 983 } 984 t.current = pruned 985 t.contEvaluated = true 986 987 return matchesStateForSlice(t.current) 988} 989 990type matchTreeOpt struct { 991 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 992 // This was added since we do not support wordMatchTree with symbol search. 993 DisableWordMatchOptimization bool 994} 995 996func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 997 if q == nil { 998 return nil, fmt.Errorf("got nil (sub)query") 999 } 1000 1001 switch s := q.(type) { 1002 case *query.Regexp: 1003 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 1004 // superset of the regexp. If the returned matchTree is equivalent to the 1005 // original regexp, it returns true. An equivalent matchTree has the same 1006 // behaviour as the original regexp and can be used instead. 1007 // 1008 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 1009 if err != nil { 1010 return nil, err 1011 } 1012 // if the query can be used in place of the regexp 1013 // return the subtree 1014 if isEq { 1015 return subMT, nil 1016 } 1017 1018 var tr matchTree 1019 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 1020 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 1021 // provide something faster. 1022 tr = wmt 1023 } else { 1024 tr = newRegexpMatchTree(s) 1025 } 1026 1027 return &andMatchTree{ 1028 children: []matchTree{ 1029 tr, &noVisitMatchTree{subMT}, 1030 }, 1031 }, nil 1032 case *query.And: 1033 var r []matchTree 1034 for _, ch := range s.Children { 1035 ct, err := d.newMatchTree(ch, opt) 1036 if err != nil { 1037 return nil, err 1038 } 1039 r = append(r, ct) 1040 } 1041 return &andMatchTree{r}, nil 1042 case *query.Or: 1043 var r []matchTree 1044 for _, ch := range s.Children { 1045 ct, err := d.newMatchTree(ch, opt) 1046 if err != nil { 1047 return nil, err 1048 } 1049 r = append(r, ct) 1050 } 1051 return &orMatchTree{r}, nil 1052 case *query.Not: 1053 ct, err := d.newMatchTree(s.Child, opt) 1054 return &notMatchTree{ 1055 child: ct, 1056 }, err 1057 1058 case *query.Type: 1059 if s.Type != query.TypeFileName { 1060 // TypeFileMatch is the default result granularity, so the 1061 // wrapper carries no extra match semantics: match the child 1062 // directly. TypeRepo is normally stripped before reaching a 1063 // shard, but handle it the same way to avoid a panic. 1064 return d.newMatchTree(s.Child, opt) 1065 } 1066 1067 ct, err := d.newMatchTree(s.Child, opt) 1068 if err != nil { 1069 return nil, err 1070 } 1071 1072 return &fileNameMatchTree{ 1073 child: ct, 1074 }, nil 1075 1076 case *query.Boost: 1077 ct, err := d.newMatchTree(s.Child, opt) 1078 if err != nil { 1079 return nil, err 1080 } 1081 1082 return &boostMatchTree{ 1083 child: ct, 1084 boost: s.Boost, 1085 }, nil 1086 1087 case *query.Meta: 1088 checksum := queryMetaChecksum(s.Field, s.Value) 1089 cacheKeyField := "Meta" 1090 if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok { 1091 return cached, nil 1092 } 1093 1094 reposWant := make([]bool, len(d.repoMetaData)) 1095 for repoIdx, r := range d.repoMetaData { 1096 if r.Metadata != nil { 1097 if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) { 1098 reposWant[repoIdx] = true 1099 } 1100 } 1101 } 1102 1103 mt := &docMatchTree{ 1104 reason: "Meta", 1105 numDocs: d.numDocs(), 1106 predicate: func(docID uint32) bool { 1107 repoIdx := d.repos[docID] 1108 if int(repoIdx) >= len(reposWant) { 1109 return false 1110 } 1111 return reposWant[repoIdx] 1112 }, 1113 } 1114 d.docMatchTreeCache.Add(cacheKeyField, checksum, mt) 1115 return mt, nil 1116 1117 case *query.Substring: 1118 return d.newSubstringMatchTree(s) 1119 1120 case *query.Branch: 1121 masks := make([]uint64, 0, len(d.repoMetaData)) 1122 if s.Pattern == "HEAD" { 1123 for range d.repoMetaData { 1124 masks = append(masks, 1) 1125 } 1126 } else { 1127 for _, branchIDs := range d.branchIDs { 1128 mask := uint64(0) 1129 for nm, m := range branchIDs { 1130 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1131 mask |= uint64(m) 1132 } 1133 } 1134 masks = append(masks, mask) 1135 } 1136 } 1137 return &branchQueryMatchTree{ 1138 masks: masks, 1139 fileMasks: d.fileBranchMasks, 1140 repos: d.repos, 1141 }, nil 1142 case *query.Const: 1143 if s.Value { 1144 return &bruteForceMatchTree{}, nil 1145 } else { 1146 return &noMatchTree{Why: "const"}, nil 1147 } 1148 case *query.Language: 1149 code, ok := d.metaData.LanguageMap[s.Language] 1150 if !ok { 1151 return &noMatchTree{Why: "lang"}, nil 1152 } 1153 return &docMatchTree{ 1154 reason: "language", 1155 numDocs: d.numDocs(), 1156 predicate: func(docID uint32) bool { 1157 return d.getLanguage(docID) == code 1158 }, 1159 }, nil 1160 1161 case *query.Symbol: 1162 // Disable WordMatchTree since we don't support it in symbols yet. 1163 optCopy := opt 1164 optCopy.DisableWordMatchOptimization = true 1165 1166 subMT, err := d.newMatchTree(s.Expr, optCopy) 1167 if err != nil { 1168 return nil, err 1169 } 1170 1171 if substr, ok := subMT.(*substrMatchTree); ok { 1172 return &symbolSubstrMatchTree{ 1173 substrMatchTree: substr, 1174 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1175 fileEndRunes: d.fileEndRunes, 1176 fileEndSymbol: d.fileEndSymbol, 1177 sections: d.runeDocSections, 1178 }, nil 1179 } 1180 1181 var regexpMT *regexpMatchTree 1182 visitMatchTree(subMT, func(mt matchTree) { 1183 if t, ok := mt.(*regexpMatchTree); ok { 1184 regexpMT = t 1185 } 1186 }) 1187 if regexpMT == nil { 1188 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1189 } 1190 1191 return &symbolRegexpMatchTree{ 1192 regexp: regexpMT.regexp, 1193 all: isRegexpAll(regexpMT.origRegexp), 1194 matchTree: subMT, 1195 }, nil 1196 1197 case *query.FileNameSet: 1198 return &docMatchTree{ 1199 reason: "FileNameSet", 1200 numDocs: d.numDocs(), 1201 predicate: func(docID uint32) bool { 1202 fileName := d.fileName(docID) 1203 _, ok := s.Set[string(fileName)] 1204 return ok 1205 }, 1206 }, nil 1207 1208 case *query.BranchesRepos: 1209 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1210 for repoIdx := range d.repoMetaData { 1211 var mask uint64 1212 for _, br := range s.List { 1213 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1214 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1215 } 1216 } 1217 reposBranchesWant[repoIdx] = mask 1218 } 1219 return &docMatchTree{ 1220 reason: "BranchesRepos", 1221 numDocs: d.numDocs(), 1222 predicate: func(docID uint32) bool { 1223 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1224 }, 1225 }, nil 1226 1227 case *query.RepoSet: 1228 reposWant := make([]bool, len(d.repoMetaData)) 1229 for repoIdx, r := range d.repoMetaData { 1230 if _, ok := s.Set[r.Name]; ok { 1231 reposWant[repoIdx] = true 1232 } 1233 } 1234 return &docMatchTree{ 1235 reason: "RepoSet", 1236 numDocs: d.numDocs(), 1237 predicate: func(docID uint32) bool { 1238 return reposWant[d.repos[docID]] 1239 }, 1240 }, nil 1241 1242 case *query.RepoIDs: 1243 reposWant := make([]bool, len(d.repoMetaData)) 1244 for repoIdx, r := range d.repoMetaData { 1245 if s.Repos.Contains(r.ID) { 1246 reposWant[repoIdx] = true 1247 } 1248 } 1249 return &docMatchTree{ 1250 reason: "RepoIDs", 1251 numDocs: d.numDocs(), 1252 predicate: func(docID uint32) bool { 1253 return reposWant[d.repos[docID]] 1254 }, 1255 }, nil 1256 1257 case *query.Repo: 1258 reposWant := make([]bool, len(d.repoMetaData)) 1259 for repoIdx, r := range d.repoMetaData { 1260 if s.Regexp.MatchString(r.Name) { 1261 reposWant[repoIdx] = true 1262 } 1263 } 1264 return &docMatchTree{ 1265 reason: "Repo", 1266 numDocs: d.numDocs(), 1267 predicate: func(docID uint32) bool { 1268 return reposWant[d.repos[docID]] 1269 }, 1270 }, nil 1271 1272 case *query.RepoRegexp: 1273 reposWant := make([]bool, len(d.repoMetaData)) 1274 for repoIdx, r := range d.repoMetaData { 1275 if s.Regexp.MatchString(r.Name) { 1276 reposWant[repoIdx] = true 1277 } 1278 } 1279 return &docMatchTree{ 1280 reason: "RepoRegexp", 1281 numDocs: d.numDocs(), 1282 predicate: func(docID uint32) bool { 1283 return reposWant[d.repos[docID]] 1284 }, 1285 }, nil 1286 1287 case query.RawConfig: 1288 return &docMatchTree{ 1289 reason: s.String(), 1290 numDocs: d.numDocs(), 1291 predicate: func(docID uint32) bool { 1292 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1293 }, 1294 }, nil 1295 } 1296 log.Panicf("type %T", q) 1297 return nil, nil 1298} 1299 1300func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1301 st := &substrMatchTree{ 1302 query: s, 1303 caseSensitive: s.CaseSensitive, 1304 fileName: s.FileName, 1305 } 1306 1307 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1308 return newRegexpMatchTree(&query.Regexp{ 1309 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1310 FileName: s.FileName, 1311 Content: s.Content, 1312 CaseSensitive: s.CaseSensitive, 1313 }), nil 1314 } 1315 1316 result, err := d.iterateNgrams(s) 1317 if err != nil { 1318 return nil, err 1319 } 1320 st.matchIterator = result 1321 return st, nil 1322} 1323 1324func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1325 if opt.DisableWordMatchOptimization { 1326 return nil, false 1327 } 1328 // Needs to be case sensitive 1329 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1330 return nil, false 1331 } 1332 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1333 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1334 return nil, false 1335 } 1336 sub := q.Regexp.Sub 1337 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1338 return nil, false 1339 } 1340 1341 return &wordMatchTree{ 1342 word: string(sub[1].Rune), 1343 fileName: q.FileName, 1344 }, true 1345} 1346 1347// pruneMatchTree removes impossible branches from the matchTree, as indicated 1348// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1349func pruneMatchTree(mt matchTree) (matchTree, error) { 1350 var err error 1351 switch mt := mt.(type) { 1352 // leaf nodes that we test with the filter: 1353 case *substrMatchTree: 1354 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1355 if _, ok := res.matchIterator.(*noMatchTree); ok { 1356 return nil, nil 1357 } 1358 } 1359 // recursive tree structures: 1360 case *andMatchTree: 1361 // Any branch of an and becoming impossible means the entire clause 1362 // is impossible. Otherwise, just handle rewrites. 1363 for i, child := range mt.children { 1364 newChild, err := pruneMatchTree(child) 1365 if err != nil { 1366 return nil, err 1367 } 1368 if newChild == nil { 1369 return nil, nil 1370 } 1371 mt.children[i] = newChild 1372 } 1373 case *orMatchTree: 1374 // *All* branches of an OR becoming impossible means the entire clause 1375 // is impossible. Otherwise, drop impossible subclauses and handle 1376 // rewrites, including simplifying to a singular resulting child branch. 1377 n := 0 1378 for _, child := range mt.children { 1379 newChild, err := pruneMatchTree(child) 1380 if err != nil { 1381 return nil, err 1382 } 1383 if newChild != nil { 1384 mt.children[n] = newChild 1385 n++ 1386 } 1387 } 1388 mt.children = mt.children[:n] 1389 if len(mt.children) == 1 { 1390 return mt.children[0], nil 1391 } else if len(mt.children) == 0 { 1392 return nil, nil 1393 } 1394 case *noVisitMatchTree: 1395 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1396 if err != nil { 1397 return nil, err 1398 } 1399 if mt.matchTree == nil { 1400 return nil, nil 1401 } 1402 case *fileNameMatchTree: 1403 mt.child, err = pruneMatchTree(mt.child) 1404 if err != nil { 1405 return nil, err 1406 } 1407 if mt.child == nil { 1408 return nil, nil 1409 } 1410 case *boostMatchTree: 1411 mt.child, err = pruneMatchTree(mt.child) 1412 if err != nil { 1413 return nil, err 1414 } 1415 if mt.child == nil { 1416 return nil, nil 1417 } 1418 case *andLineMatchTree: 1419 child, err := pruneMatchTree(&mt.andMatchTree) 1420 if err != nil { 1421 return nil, err 1422 } 1423 if child == nil { 1424 return nil, nil 1425 } 1426 if c, ok := child.(*andMatchTree); ok { 1427 mt.andMatchTree = *c 1428 } else { 1429 // the and was simplified to a single clause, 1430 // so the linematch portion is irrelevant. 1431 return mt, nil 1432 } 1433 case *notMatchTree: 1434 mt.child, err = pruneMatchTree(mt.child) 1435 if err != nil { 1436 return nil, err 1437 } 1438 if mt.child == nil { 1439 // not false => true 1440 return &bruteForceMatchTree{}, nil 1441 } 1442 // unhandled: 1443 case *docMatchTree: 1444 case *bruteForceMatchTree: 1445 case *regexpMatchTree: 1446 case *wordMatchTree: 1447 } 1448 return mt, err 1449} 1450 1451// isRegexpAll returns true if the query matches all possible lines. 1452// 1453// Note: it is possible for a funky regex to actually match all but this 1454// returns false. This returns true for normal looking regexes like ".*" or 1455// "(?-s:.*)". 1456func isRegexpAll(r *syntax.Regexp) bool { 1457 // Note: we don't care about any flags since we are looking for .* and we 1458 // don't mind if . matches all or everything but newline. 1459 1460 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1461 for { 1462 // Our main target: .* 1463 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1464 // (?s:.) or (?-s:.) 1465 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1466 } 1467 1468 // Strip away expressions being wrapped in paranthesis 1469 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1470 r = r.Sub[0] 1471 continue 1472 } 1473 1474 return false 1475 } 1476} 1477 1478func queryMetaChecksum(field string, value *regexp.Regexp) string { 1479 h := xxhash.New() 1480 h.Write([]byte(field)) 1481 h.Write([]byte{':'}) 1482 h.Write([]byte(value.String())) 1483 return fmt.Sprintf("%x", h.Sum64()) 1484}