fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30// A docIterator iterates over documents in order. 31type docIterator interface { 32 // provide the next document where we may find something interesting. 33 // 34 // This is like a "peek" and shouldn't mutate state. prepare is what should 35 // change state. 36 nextDoc() uint32 37 38 // clears any per-document state of the docIterator, and 39 // prepares for evaluating the given doc. The argument is 40 // strictly increasing over time. 41 prepare(nextDoc uint32) 42} 43 44// costs are passed in increasing order to matchTree.matches until they do not 45// return matchesRequiresHigherCost. 46const ( 47 costConst = 0 48 costMemory = 1 49 costContent = 2 50 costRegexp = 3 51) 52 53const ( 54 costMin = costConst 55 costMax = costRegexp 56) 57 58// matchesState is an enum for the state of a matchTree after a call to 59// matchTree.matches. 60type matchesState uint8 61 62const ( 63 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 64 // a search yet since the cost value is not high enough. 65 matchesRequiresHigherCost matchesState = iota 66 67 // matchesFound is returned when matchTree.matches has done a search and 68 // found one or more matches. 69 matchesFound 70 71 // matchesNone is returned when matchTree.matches has done a search and 72 // found nothing. 73 matchesNone 74) 75 76// matchesStatePred is a helper which returns matchesFound if b is true 77// otherwise returns matchesNone. 78func matchesStatePred(b bool) matchesState { 79 if b { 80 return matchesFound 81 } 82 return matchesNone 83} 84 85// matchesStateForSlice is a helper which returns matchesFound if v is 86// non-empty otherwise returns matchesNone. 87func matchesStateForSlice[T any](v []T) matchesState { 88 return matchesStatePred(len(v) > 0) 89} 90 91// An expression tree coupled with matches. The matchtree has two 92// functions: 93// 94// * it implements boolean combinations (and, or, not) 95// 96// * it implements shortcuts, where we skip documents (for example: if 97// there are no trigram matches, we can be sure there are no substring 98// matches). The matchtree iterates over the documents as they are 99// ordered in the shard. 100// 101// The general process for a given (shard, query) is: 102// 103// - construct matchTree for the query 104// 105// - find all different leaf matchTrees (substring, regexp, etc.) 106// 107// in a loop: 108// 109// - find next doc to process using nextDoc 110// 111// - evaluate atoms (leaf expressions that match text) 112// 113// - evaluate the tree using matches(), storing the result in map. 114// 115// - if the complete tree returns (matches() != matchesRequiresHigherCost) 116// for the document, collect all text matches by looking at leaf 117// matchTrees. 118type matchTree interface { 119 docIterator 120 121 // matches if cost is high enough. See documentation for matchesState's 122 // values. 123 // 124 // Note: Do not call this directly, rather use evalMatchTree which uses 125 // known to cache responses once the state transitions away from 126 // matchesRequiresHigherCost. 127 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 128} 129 130// docMatchTree iterates over documents for which predicate(docID) returns true. 131type docMatchTree struct { 132 // the number of documents in a shard. 133 numDocs uint32 134 135 predicate func(docID uint32) bool 136 137 // provides additional information about the reason why the docMatchTree was 138 // created. 139 reason string 140 141 // mutable 142 firstDone bool 143 docID uint32 144} 145 146type bruteForceMatchTree struct { 147 // mutable 148 firstDone bool 149 docID uint32 150} 151 152type andLineMatchTree struct { 153 andMatchTree 154} 155 156type andMatchTree struct { 157 children []matchTree 158} 159 160type orMatchTree struct { 161 children []matchTree 162} 163 164type notMatchTree struct { 165 child matchTree 166} 167 168// Returns only the filename of child matches. 169type fileNameMatchTree struct { 170 child matchTree 171} 172 173type boostMatchTree struct { 174 child matchTree 175 boost float64 176} 177 178// Don't visit this subtree for collecting matches. 179type noVisitMatchTree struct { 180 matchTree 181} 182 183type regexpMatchTree struct { 184 regexp *regexp.Regexp 185 186 fileName bool 187 188 // mutable 189 reEvaluated bool 190 found []*candidateMatch 191 192 // nextDoc, prepare. 193 bruteForceMatchTree 194} 195 196// \bLITERAL\b 197type wordMatchTree struct { 198 word string 199 200 fileName bool 201 202 // mutable 203 evaluated bool 204 found []*candidateMatch 205 206 // nextDoc, prepare. 207 bruteForceMatchTree 208} 209 210type substrMatchTree struct { 211 matchIterator 212 213 query *query.Substring 214 caseSensitive bool 215 fileName bool 216 217 // mutable 218 current []*candidateMatch 219 contEvaluated bool 220} 221 222type branchQueryMatchTree struct { 223 fileMasks []uint64 224 masks []uint64 225 repos []uint16 226 227 // mutable 228 firstDone bool 229 docID uint32 230} 231 232func (t *branchQueryMatchTree) branchMask() uint64 { 233 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 234} 235 236type symbolRegexpMatchTree struct { 237 matchTree 238 regexp *regexp.Regexp 239 all bool // skips regex match if .* 240 241 reEvaluated bool 242 found []*candidateMatch 243} 244 245func (t *symbolRegexpMatchTree) prepare(doc uint32) { 246 t.reEvaluated = false 247 t.found = t.found[:0] 248 t.matchTree.prepare(doc) 249} 250 251func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 252 if t.reEvaluated { 253 return matchesStateForSlice(t.found) 254 } 255 256 if cost < costRegexp { 257 return matchesRequiresHigherCost 258 } 259 260 sections := cp.docSections() 261 content := cp.data(false) 262 263 found := t.found[:0] 264 for i, sec := range sections { 265 var idx []int 266 if t.all { 267 idx = []int{0, int(sec.End - sec.Start)} 268 } else { 269 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 270 if idx == nil { 271 continue 272 } 273 } 274 275 cm := &candidateMatch{ 276 byteOffset: sec.Start + uint32(idx[0]), 277 byteMatchSz: uint32(idx[1] - idx[0]), 278 symbol: true, 279 symbolIdx: uint32(i), 280 } 281 found = append(found, cm) 282 } 283 t.found = found 284 t.reEvaluated = true 285 286 return matchesStateForSlice(t.found) 287} 288 289type symbolSubstrMatchTree struct { 290 *substrMatchTree 291 292 patternSize uint32 293 fileEndRunes []uint32 294 fileEndSymbol []uint32 295 296 doc uint32 297 sections []DocumentSection 298 299 secID uint32 300} 301 302func (t *symbolSubstrMatchTree) prepare(doc uint32) { 303 t.substrMatchTree.prepare(doc) 304 t.doc = doc 305 306 var fileStart uint32 307 if doc > 0 { 308 fileStart = t.fileEndRunes[doc-1] 309 } 310 311 var sections []DocumentSection 312 if len(t.sections) > 0 { 313 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 314 if most == uint32(len(t.sections)) { 315 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 316 } else { 317 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 318 t.secID++ 319 } 320 321 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 322 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 323 symbolEnd++ 324 } 325 326 sections = t.sections[t.secID:symbolEnd] 327 } 328 } 329 330 secIdx := 0 331 trimmed := t.current[:0] 332 for len(sections) > secIdx && len(t.current) > 0 { 333 start := fileStart + t.current[0].runeOffset 334 end := start + t.patternSize 335 if start >= sections[secIdx].End { 336 secIdx++ 337 continue 338 } 339 340 if start < sections[secIdx].Start { 341 t.current = t.current[1:] 342 continue 343 } 344 345 if end <= sections[secIdx].End { 346 t.current[0].symbol = true 347 t.current[0].symbolIdx = uint32(secIdx) 348 trimmed = append(trimmed, t.current[0]) 349 } 350 351 t.current = t.current[1:] 352 } 353 t.current = trimmed 354} 355 356// all prepare methods 357 358func (t *bruteForceMatchTree) prepare(doc uint32) { 359 t.docID = doc 360 t.firstDone = true 361} 362 363func (t *docMatchTree) prepare(doc uint32) { 364 t.docID = doc 365 t.firstDone = true 366} 367 368func (t *andMatchTree) prepare(doc uint32) { 369 for _, c := range t.children { 370 c.prepare(doc) 371 } 372} 373 374func (t *regexpMatchTree) prepare(doc uint32) { 375 t.found = t.found[:0] 376 t.reEvaluated = false 377 t.bruteForceMatchTree.prepare(doc) 378} 379 380func (t *wordMatchTree) prepare(doc uint32) { 381 t.found = t.found[:0] 382 t.evaluated = false 383 t.bruteForceMatchTree.prepare(doc) 384} 385 386func (t *orMatchTree) prepare(doc uint32) { 387 for _, c := range t.children { 388 c.prepare(doc) 389 } 390} 391 392func (t *notMatchTree) prepare(doc uint32) { 393 t.child.prepare(doc) 394} 395 396func (t *fileNameMatchTree) prepare(doc uint32) { 397 t.child.prepare(doc) 398} 399 400func (t *boostMatchTree) prepare(doc uint32) { 401 t.child.prepare(doc) 402} 403 404func (t *substrMatchTree) prepare(nextDoc uint32) { 405 t.matchIterator.prepare(nextDoc) 406 t.current = t.matchIterator.candidates() 407 t.contEvaluated = false 408} 409 410func (t *branchQueryMatchTree) prepare(doc uint32) { 411 t.firstDone = true 412 t.docID = doc 413} 414 415// nextDoc 416 417func (t *docMatchTree) nextDoc() uint32 { 418 var start uint32 419 if t.firstDone { 420 start = t.docID + 1 421 } 422 for i := start; i < t.numDocs; i++ { 423 if t.predicate(i) { 424 return i 425 } 426 } 427 return maxUInt32 428} 429 430func (t *bruteForceMatchTree) nextDoc() uint32 { 431 if !t.firstDone { 432 return 0 433 } 434 return t.docID + 1 435} 436 437func (t *andMatchTree) nextDoc() uint32 { 438 var max uint32 439 for _, c := range t.children { 440 m := c.nextDoc() 441 if m > max { 442 max = m 443 } 444 } 445 return max 446} 447 448func (t *orMatchTree) nextDoc() uint32 { 449 min := uint32(maxUInt32) 450 for _, c := range t.children { 451 m := c.nextDoc() 452 if m < min { 453 min = m 454 } 455 } 456 return min 457} 458 459func (t *notMatchTree) nextDoc() uint32 { 460 return 0 461} 462 463func (t *fileNameMatchTree) nextDoc() uint32 { 464 return t.child.nextDoc() 465} 466 467func (t *boostMatchTree) nextDoc() uint32 { 468 return t.child.nextDoc() 469} 470 471func (t *branchQueryMatchTree) nextDoc() uint32 { 472 var start uint32 473 if t.firstDone { 474 start = t.docID + 1 475 } 476 477 for i := start; i < uint32(len(t.fileMasks)); i++ { 478 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 479 return i 480 } 481 } 482 return maxUInt32 483} 484 485// all String methods 486 487func (t *bruteForceMatchTree) String() string { 488 return "all" 489} 490 491func (t *docMatchTree) String() string { 492 return fmt.Sprintf("doc(%s)", t.reason) 493} 494 495func (t *andMatchTree) String() string { 496 return fmt.Sprintf("and%v", t.children) 497} 498 499func (t *regexpMatchTree) String() string { 500 f := "" 501 if t.fileName { 502 f = "f" 503 } 504 return fmt.Sprintf("%sre(%s)", f, t.regexp) 505} 506 507func (t *wordMatchTree) String() string { 508 f := "" 509 if t.fileName { 510 f = "f" 511 } 512 return fmt.Sprintf("%sword(%s)", f, t.word) 513} 514 515func (t *orMatchTree) String() string { 516 return fmt.Sprintf("or%v", t.children) 517} 518 519func (t *notMatchTree) String() string { 520 return fmt.Sprintf("not(%v)", t.child) 521} 522 523func (t *noVisitMatchTree) String() string { 524 return fmt.Sprintf("novisit(%v)", t.matchTree) 525} 526 527func (t *fileNameMatchTree) String() string { 528 return fmt.Sprintf("f(%v)", t.child) 529} 530 531func (t *boostMatchTree) String() string { 532 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 533} 534 535func (t *substrMatchTree) String() string { 536 f := "" 537 if t.fileName { 538 f = "f" 539 } 540 541 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 542} 543 544func (t *branchQueryMatchTree) String() string { 545 return fmt.Sprintf("branch(%x)", t.masks) 546} 547 548func (t *symbolSubstrMatchTree) String() string { 549 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 550} 551 552func (t *symbolRegexpMatchTree) String() string { 553 return fmt.Sprintf("symbol(%v)", t.matchTree) 554} 555 556// visitMatches visits all atoms in matchTree. Note: This visits 557// noVisitMatchTree. For collecting matches use visitMatches. 558func visitMatchTree(t matchTree, f func(matchTree)) { 559 switch s := t.(type) { 560 case *andMatchTree: 561 for _, ch := range s.children { 562 visitMatchTree(ch, f) 563 } 564 case *orMatchTree: 565 for _, ch := range s.children { 566 visitMatchTree(ch, f) 567 } 568 case *andLineMatchTree: 569 visitMatchTree(&s.andMatchTree, f) 570 case *noVisitMatchTree: 571 visitMatchTree(s.matchTree, f) 572 case *notMatchTree: 573 visitMatchTree(s.child, f) 574 case *fileNameMatchTree: 575 visitMatchTree(s.child, f) 576 case *boostMatchTree: 577 visitMatchTree(s.child, f) 578 case *symbolSubstrMatchTree: 579 visitMatchTree(s.substrMatchTree, f) 580 case *symbolRegexpMatchTree: 581 visitMatchTree(s.matchTree, f) 582 default: 583 f(t) 584 } 585} 586 587// updateMatchTreeStats calls updateStats on all atoms in mt which have that 588// function defined. 589func updateMatchTreeStats(mt matchTree, stats *Stats) { 590 visitMatchTree(mt, func(mt matchTree) { 591 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 592 atom.updateStats(stats) 593 } 594 }) 595} 596 597func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 598 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 599 f(mt) 600 }) 601} 602 603// visitMatches visits all atoms which can contribute matches. Note: This 604// skips noVisitMatchTree. 605func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 606 switch s := t.(type) { 607 case *andMatchTree: 608 for _, ch := range s.children { 609 if known[ch] { 610 visitMatches(ch, known, weight, f) 611 } 612 } 613 case *andLineMatchTree: 614 visitMatches(&s.andMatchTree, known, weight, f) 615 case *orMatchTree: 616 for _, ch := range s.children { 617 if known[ch] { 618 visitMatches(ch, known, weight, f) 619 } 620 } 621 case *boostMatchTree: 622 visitMatches(s.child, known, weight*s.boost, f) 623 case *symbolSubstrMatchTree: 624 visitMatches(s.substrMatchTree, known, weight, f) 625 case *notMatchTree: 626 case *noVisitMatchTree: 627 // don't collect into negative trees. 628 case *fileNameMatchTree: 629 // We will just gather the filename if we do not visit this tree. 630 default: 631 f(s, weight) 632 } 633} 634 635// all matches() methods. 636 637func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 638 return matchesStatePred(t.predicate(cp.idx)) 639} 640 641func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 642 return matchesFound 643} 644 645// andLineMatchTree is a performance optimization of andMatchTree. For content 646// searches we don't want to run the regex engine if there is no line that 647// contains matches from all terms. 648func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 649 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 650 return state 651 } 652 653 // Invariant: all children have matches. If any line contains all of them we 654 // can return MatchesFound. 655 656 // find child with fewest candidates 657 min := maxUInt32 658 fewestChildren := 0 659 for ix, child := range t.children { 660 v, ok := child.(*substrMatchTree) 661 // make sure we are running a content search and that all candidates are a 662 // substrMatchTree 663 if !ok || v.fileName { 664 return matchesFound 665 } 666 if len(v.current) < min { 667 min = len(v.current) 668 fewestChildren = ix 669 } 670 } 671 672 type lineRange struct { 673 start int 674 end int 675 } 676 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 677 prev := -1 678 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 679 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 680 if line == prev { 681 continue 682 } 683 prev = line 684 lines = append(lines, lineRange{byteStart, byteEnd}) 685 } 686 687 // children keeps track of the children's candidates we have already seen. 688 children := make([][]*candidateMatch, 0, len(t.children)-1) 689 for j, child := range t.children { 690 if j == fewestChildren { 691 continue 692 } 693 children = append(children, child.(*substrMatchTree).current) 694 } 695 696nextLine: 697 for i := 0; i < len(lines); i++ { 698 hits := 1 699 nextChild: 700 for j := range children { 701 nextCandidate: 702 for len(children[j]) > 0 { 703 candidate := children[j][0] 704 bo := int(cp.findOffset(false, candidate.runeOffset)) 705 if bo < lines[i].start { 706 children[j] = children[j][1:] 707 continue nextCandidate 708 } 709 if bo <= lines[i].end { 710 hits++ 711 continue nextChild 712 } 713 // move the `lines` iterator forward until bo <= line.end 714 for i < len(lines) && bo > lines[i].end { 715 i++ 716 } 717 i-- 718 continue nextLine 719 } 720 } 721 // return early once we found any line that contains matches from all children 722 if hits == len(t.children) { 723 return matchesFound 724 } 725 } 726 return matchesNone 727} 728 729func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 730 // We have found matches unless a child needs to do more work or it hasn't 731 // found matches. 732 state := matchesFound 733 734 for _, ch := range t.children { 735 switch evalMatchTree(cp, cost, known, ch) { 736 case matchesRequiresHigherCost: 737 // keep evaluating other children incase we come across matchesNone 738 state = matchesRequiresHigherCost 739 case matchesFound: 740 // will return this if every child has this value 741 case matchesNone: 742 return matchesNone 743 } 744 } 745 746 return state 747} 748 749func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 750 // we could short-circuit, but we want to use the other possibilities as a 751 // ranking signal. So we always return the most conservative state. 752 state := matchesNone 753 for _, ch := range t.children { 754 switch evalMatchTree(cp, cost, known, ch) { 755 case matchesRequiresHigherCost: 756 state = matchesRequiresHigherCost 757 case matchesFound: 758 if state != matchesRequiresHigherCost { 759 state = matchesFound 760 } 761 case matchesNone: 762 // noop 763 } 764 } 765 return state 766} 767 768func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 769 return matchesStatePred(t.branchMask() != 0) 770} 771 772func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 773 if t.reEvaluated { 774 return matchesStateForSlice(t.found) 775 } 776 777 if cost < costRegexp { 778 return matchesRequiresHigherCost 779 } 780 781 cp.stats.RegexpsConsidered++ 782 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 783 found := t.found[:0] 784 for _, idx := range idxs { 785 cm := &candidateMatch{ 786 byteOffset: uint32(idx[0]), 787 byteMatchSz: uint32(idx[1] - idx[0]), 788 fileName: t.fileName, 789 } 790 791 found = append(found, cm) 792 } 793 t.found = found 794 t.reEvaluated = true 795 796 return matchesStateForSlice(t.found) 797} 798 799func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 800 if t.evaluated { 801 return matchesStateForSlice(t.found) 802 } 803 804 if cost < costRegexp { 805 return matchesRequiresHigherCost 806 } 807 808 data := cp.data(t.fileName) 809 offset := 0 810 found := t.found[:0] 811 for { 812 idx := bytes.Index(data[offset:], []byte(t.word)) 813 if idx < 0 { 814 break 815 } 816 817 relStartOffset := offset + idx 818 relEndOffset := relStartOffset + len(t.word) 819 820 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 821 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 822 if startBoundary && endBoundary { 823 found = append(found, &candidateMatch{ 824 byteOffset: uint32(offset + idx), 825 byteMatchSz: uint32(len(t.word)), 826 fileName: t.fileName, 827 }) 828 } 829 offset += idx + len(t.word) 830 } 831 832 t.found = found 833 t.evaluated = true 834 835 return matchesStateForSlice(t.found) 836} 837 838// breakMatchesOnNewlines returns matches resulting from breaking each element 839// of cms on newlines within text. 840func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 841 var lineCMs []*candidateMatch 842 for _, cm := range cms { 843 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 844 } 845 return lineCMs 846} 847 848// breakOnNewlines returns matches resulting from breaking cm on newlines 849// within text. 850func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 851 var cms []*candidateMatch 852 addMe := &candidateMatch{} 853 *addMe = *cm 854 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 855 if text[i] == '\n' { 856 addMe.byteMatchSz = i - addMe.byteOffset 857 if addMe.byteMatchSz != 0 { 858 cms = append(cms, addMe) 859 } 860 861 addMe = &candidateMatch{} 862 *addMe = *cm 863 addMe.byteOffset = i + 1 864 } 865 } 866 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 867 if addMe.byteMatchSz != 0 { 868 cms = append(cms, addMe) 869 } 870 return cms 871} 872 873// evalMatchTree should be called instead of directly calling 874// matchTree.matches. It cache known values for future evaluation at higher 875// costs. 876func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 877 if v, ok := known[mt]; ok { 878 return matchesStatePred(v) 879 } 880 881 ms := mt.matches(cp, cost, known) 882 if ms != matchesRequiresHigherCost { 883 known[mt] = ms == matchesFound 884 } 885 886 return ms 887} 888 889func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 890 switch evalMatchTree(cp, cost, known, t.child) { 891 case matchesRequiresHigherCost: 892 return matchesRequiresHigherCost 893 case matchesFound: 894 return matchesNone 895 case matchesNone: 896 return matchesFound 897 default: 898 panic("unreachable") 899 } 900} 901 902func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 903 return evalMatchTree(cp, cost, known, t.child) 904} 905 906func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 907 return evalMatchTree(cp, cost, known, t.child) 908} 909 910func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 911 if t.contEvaluated { 912 return matchesStateForSlice(t.current) 913 } 914 915 if len(t.current) == 0 { 916 return matchesNone 917 } 918 919 if t.fileName && cost < costMemory { 920 return matchesRequiresHigherCost 921 } 922 923 if !t.fileName && cost < costContent { 924 return matchesRequiresHigherCost 925 } 926 927 pruned := t.current[:0] 928 for _, m := range t.current { 929 if m.byteOffset == 0 && m.runeOffset > 0 { 930 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 931 } 932 if m.matchContent(cp.data(m.fileName)) { 933 pruned = append(pruned, m) 934 } 935 } 936 t.current = pruned 937 t.contEvaluated = true 938 939 return matchesStateForSlice(t.current) 940} 941 942type matchTreeOpt struct { 943 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 944 // This was added since we do not support wordMatchTree with symbol search. 945 DisableWordMatchOptimization bool 946} 947 948func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 949 if q == nil { 950 return nil, fmt.Errorf("got nil (sub)query") 951 } 952 switch s := q.(type) { 953 case *query.Regexp: 954 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 955 // superset of the regexp. If the returned matchTree is equivalent to the 956 // original regexp, it returns true. An equivalent matchTree has the same 957 // behaviour as the original regexp and can be used instead. 958 // 959 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 960 if err != nil { 961 return nil, err 962 } 963 // if the query can be used in place of the regexp 964 // return the subtree 965 if isEq { 966 return subMT, nil 967 } 968 969 var tr matchTree 970 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 971 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 972 // provide something faster. 973 tr = wmt 974 } else { 975 prefix := "" 976 if !s.CaseSensitive { 977 prefix = "(?i)" 978 } 979 980 tr = &regexpMatchTree{ 981 regexp: regexp.MustCompile(prefix + s.Regexp.String()), 982 fileName: s.FileName, 983 } 984 } 985 986 return &andMatchTree{ 987 children: []matchTree{ 988 tr, &noVisitMatchTree{subMT}, 989 }, 990 }, nil 991 case *query.And: 992 var r []matchTree 993 for _, ch := range s.Children { 994 ct, err := d.newMatchTree(ch, opt) 995 if err != nil { 996 return nil, err 997 } 998 r = append(r, ct) 999 } 1000 return &andMatchTree{r}, nil 1001 case *query.Or: 1002 var r []matchTree 1003 for _, ch := range s.Children { 1004 ct, err := d.newMatchTree(ch, opt) 1005 if err != nil { 1006 return nil, err 1007 } 1008 r = append(r, ct) 1009 } 1010 return &orMatchTree{r}, nil 1011 case *query.Not: 1012 ct, err := d.newMatchTree(s.Child, opt) 1013 return &notMatchTree{ 1014 child: ct, 1015 }, err 1016 1017 case *query.Type: 1018 if s.Type != query.TypeFileName { 1019 break 1020 } 1021 1022 ct, err := d.newMatchTree(s.Child, opt) 1023 if err != nil { 1024 return nil, err 1025 } 1026 1027 return &fileNameMatchTree{ 1028 child: ct, 1029 }, nil 1030 1031 case *query.Boost: 1032 ct, err := d.newMatchTree(s.Child, opt) 1033 if err != nil { 1034 return nil, err 1035 } 1036 1037 return &boostMatchTree{ 1038 child: ct, 1039 boost: s.Boost, 1040 }, nil 1041 1042 case *query.Substring: 1043 return d.newSubstringMatchTree(s) 1044 1045 case *query.Branch: 1046 masks := make([]uint64, 0, len(d.repoMetaData)) 1047 if s.Pattern == "HEAD" { 1048 for i := 0; i < len(d.repoMetaData); i++ { 1049 masks = append(masks, 1) 1050 } 1051 } else { 1052 for _, branchIDs := range d.branchIDs { 1053 mask := uint64(0) 1054 for nm, m := range branchIDs { 1055 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1056 mask |= uint64(m) 1057 } 1058 } 1059 masks = append(masks, mask) 1060 } 1061 1062 } 1063 return &branchQueryMatchTree{ 1064 masks: masks, 1065 fileMasks: d.fileBranchMasks, 1066 repos: d.repos, 1067 }, nil 1068 case *query.Const: 1069 if s.Value { 1070 return &bruteForceMatchTree{}, nil 1071 } else { 1072 return &noMatchTree{Why: "const"}, nil 1073 } 1074 case *query.Language: 1075 code, ok := d.metaData.LanguageMap[s.Language] 1076 if !ok { 1077 return &noMatchTree{Why: "lang"}, nil 1078 } 1079 return &docMatchTree{ 1080 reason: "language", 1081 numDocs: d.numDocs(), 1082 predicate: func(docID uint32) bool { 1083 return d.getLanguage(docID) == code 1084 }, 1085 }, nil 1086 1087 case *query.Symbol: 1088 // Disable WordMatchTree since we don't support it in symbols yet. 1089 optCopy := opt 1090 optCopy.DisableWordMatchOptimization = true 1091 1092 subMT, err := d.newMatchTree(s.Expr, optCopy) 1093 if err != nil { 1094 return nil, err 1095 } 1096 1097 if substr, ok := subMT.(*substrMatchTree); ok { 1098 return &symbolSubstrMatchTree{ 1099 substrMatchTree: substr, 1100 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1101 fileEndRunes: d.fileEndRunes, 1102 fileEndSymbol: d.fileEndSymbol, 1103 sections: d.runeDocSections, 1104 }, nil 1105 } 1106 1107 var regexp *regexp.Regexp 1108 visitMatchTree(subMT, func(mt matchTree) { 1109 if t, ok := mt.(*regexpMatchTree); ok { 1110 regexp = t.regexp 1111 } 1112 }) 1113 if regexp == nil { 1114 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1115 } 1116 1117 return &symbolRegexpMatchTree{ 1118 regexp: regexp, 1119 all: regexp.String() == "(?i)(?-s:.)*", 1120 matchTree: subMT, 1121 }, nil 1122 1123 case *query.FileNameSet: 1124 return &docMatchTree{ 1125 reason: "FileNameSet", 1126 numDocs: d.numDocs(), 1127 predicate: func(docID uint32) bool { 1128 fileName := d.fileName(docID) 1129 _, ok := s.Set[string(fileName)] 1130 return ok 1131 }, 1132 }, nil 1133 1134 case *query.BranchesRepos: 1135 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1136 for repoIdx := range d.repoMetaData { 1137 var mask uint64 1138 for _, br := range s.List { 1139 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1140 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1141 } 1142 } 1143 reposBranchesWant[repoIdx] = mask 1144 } 1145 return &docMatchTree{ 1146 reason: "BranchesRepos", 1147 numDocs: d.numDocs(), 1148 predicate: func(docID uint32) bool { 1149 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1150 }, 1151 }, nil 1152 1153 case *query.RepoSet: 1154 reposWant := make([]bool, len(d.repoMetaData)) 1155 for repoIdx, r := range d.repoMetaData { 1156 if _, ok := s.Set[r.Name]; ok { 1157 reposWant[repoIdx] = true 1158 } 1159 } 1160 return &docMatchTree{ 1161 reason: "RepoSet", 1162 numDocs: d.numDocs(), 1163 predicate: func(docID uint32) bool { 1164 return reposWant[d.repos[docID]] 1165 }, 1166 }, nil 1167 1168 case *query.RepoIDs: 1169 reposWant := make([]bool, len(d.repoMetaData)) 1170 for repoIdx, r := range d.repoMetaData { 1171 if s.Repos.Contains(r.ID) { 1172 reposWant[repoIdx] = true 1173 } 1174 } 1175 return &docMatchTree{ 1176 reason: "RepoIDs", 1177 numDocs: d.numDocs(), 1178 predicate: func(docID uint32) bool { 1179 return reposWant[d.repos[docID]] 1180 }, 1181 }, nil 1182 1183 case *query.Repo: 1184 reposWant := make([]bool, len(d.repoMetaData)) 1185 for repoIdx, r := range d.repoMetaData { 1186 if s.Regexp.MatchString(r.Name) { 1187 reposWant[repoIdx] = true 1188 } 1189 } 1190 return &docMatchTree{ 1191 reason: "Repo", 1192 numDocs: d.numDocs(), 1193 predicate: func(docID uint32) bool { 1194 return reposWant[d.repos[docID]] 1195 }, 1196 }, nil 1197 1198 case *query.RepoRegexp: 1199 reposWant := make([]bool, len(d.repoMetaData)) 1200 for repoIdx, r := range d.repoMetaData { 1201 if s.Regexp.MatchString(r.Name) { 1202 reposWant[repoIdx] = true 1203 } 1204 } 1205 return &docMatchTree{ 1206 reason: "RepoRegexp", 1207 numDocs: d.numDocs(), 1208 predicate: func(docID uint32) bool { 1209 return reposWant[d.repos[docID]] 1210 }, 1211 }, nil 1212 1213 case query.RawConfig: 1214 return &docMatchTree{ 1215 reason: s.String(), 1216 numDocs: d.numDocs(), 1217 predicate: func(docID uint32) bool { 1218 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1219 }, 1220 }, nil 1221 } 1222 log.Panicf("type %T", q) 1223 return nil, nil 1224} 1225 1226func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1227 st := &substrMatchTree{ 1228 query: s, 1229 caseSensitive: s.CaseSensitive, 1230 fileName: s.FileName, 1231 } 1232 1233 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1234 prefix := "" 1235 if !s.CaseSensitive { 1236 prefix = "(?i)" 1237 } 1238 t := &regexpMatchTree{ 1239 regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)), 1240 fileName: s.FileName, 1241 } 1242 return t, nil 1243 } 1244 1245 result, err := d.iterateNgrams(s) 1246 if err != nil { 1247 return nil, err 1248 } 1249 st.matchIterator = result 1250 return st, nil 1251} 1252 1253func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1254 if opt.DisableWordMatchOptimization { 1255 return nil, false 1256 } 1257 // Needs to be case sensitive 1258 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1259 return nil, false 1260 } 1261 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1262 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1263 return nil, false 1264 } 1265 sub := q.Regexp.Sub 1266 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1267 return nil, false 1268 } 1269 1270 return &wordMatchTree{ 1271 word: string(sub[1].Rune), 1272 fileName: q.FileName, 1273 }, true 1274} 1275 1276// pruneMatchTree removes impossible branches from the matchTree, as indicated 1277// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1278func pruneMatchTree(mt matchTree) (matchTree, error) { 1279 var err error 1280 switch mt := mt.(type) { 1281 // leaf nodes that we test with the filter: 1282 case *substrMatchTree: 1283 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1284 if _, ok := res.matchIterator.(*noMatchTree); ok { 1285 return nil, nil 1286 } 1287 } 1288 // recursive tree structures: 1289 case *andMatchTree: 1290 // Any branch of an and becoming impossible means the entire clause 1291 // is impossible. Otherwise, just handle rewrites. 1292 for i, child := range mt.children { 1293 newChild, err := pruneMatchTree(child) 1294 if err != nil { 1295 return nil, err 1296 } 1297 if newChild == nil { 1298 return nil, nil 1299 } 1300 mt.children[i] = newChild 1301 } 1302 case *orMatchTree: 1303 // *All* branches of an OR becoming impossible means the entire clause 1304 // is impossible. Otherwise, drop impossible subclauses and handle 1305 // rewrites, including simplifying to a singular resulting child branch. 1306 n := 0 1307 for _, child := range mt.children { 1308 newChild, err := pruneMatchTree(child) 1309 if err != nil { 1310 return nil, err 1311 } 1312 if newChild != nil { 1313 mt.children[n] = newChild 1314 n++ 1315 } 1316 } 1317 mt.children = mt.children[:n] 1318 if len(mt.children) == 1 { 1319 return mt.children[0], nil 1320 } else if len(mt.children) == 0 { 1321 return nil, nil 1322 } 1323 case *noVisitMatchTree: 1324 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1325 if err != nil { 1326 return nil, err 1327 } 1328 if mt.matchTree == nil { 1329 return nil, nil 1330 } 1331 case *fileNameMatchTree: 1332 mt.child, err = pruneMatchTree(mt.child) 1333 case *boostMatchTree: 1334 mt.child, err = pruneMatchTree(mt.child) 1335 if err != nil { 1336 return nil, err 1337 } 1338 if mt.child == nil { 1339 return nil, nil 1340 } 1341 case *andLineMatchTree: 1342 child, err := pruneMatchTree(&mt.andMatchTree) 1343 if err != nil { 1344 return nil, err 1345 } 1346 if child == nil { 1347 return nil, nil 1348 } 1349 if c, ok := child.(*andMatchTree); ok { 1350 mt.andMatchTree = *c 1351 } else { 1352 // the and was simplified to a single clause, 1353 // so the linematch portion is irrelevant. 1354 return mt, nil 1355 } 1356 case *notMatchTree: 1357 mt.child, err = pruneMatchTree(mt.child) 1358 if err != nil { 1359 return nil, err 1360 } 1361 if mt.child == nil { 1362 // not false => true 1363 return &bruteForceMatchTree{}, nil 1364 } 1365 // unhandled: 1366 case *docMatchTree: 1367 case *bruteForceMatchTree: 1368 case *regexpMatchTree: 1369 case *wordMatchTree: 1370 } 1371 return mt, err 1372}