fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/internal/syntaxutil" 28 "github.com/sourcegraph/zoekt/query" 29) 30 31// A docIterator iterates over documents in order. 32type docIterator interface { 33 // provide the next document where we may find something interesting. 34 // 35 // This is like a "peek" and shouldn't mutate state. prepare is what should 36 // change state. 37 nextDoc() uint32 38 39 // clears any per-document state of the docIterator, and 40 // prepares for evaluating the given doc. The argument is 41 // strictly increasing over time. 42 prepare(nextDoc uint32) 43} 44 45// costs are passed in increasing order to matchTree.matches until they do not 46// return matchesRequiresHigherCost. 47const ( 48 costConst = 0 49 costMemory = 1 50 costContent = 2 51 costRegexp = 3 52) 53 54const ( 55 costMin = costConst 56 costMax = costRegexp 57) 58 59// matchesState is an enum for the state of a matchTree after a call to 60// matchTree.matches. 61type matchesState uint8 62 63const ( 64 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 65 // a search yet since the cost value is not high enough. 66 matchesRequiresHigherCost matchesState = iota 67 68 // matchesFound is returned when matchTree.matches has done a search and 69 // found one or more matches. 70 matchesFound 71 72 // matchesNone is returned when matchTree.matches has done a search and 73 // found nothing. 74 matchesNone 75) 76 77// matchesStatePred is a helper which returns matchesFound if b is true 78// otherwise returns matchesNone. 79func matchesStatePred(b bool) matchesState { 80 if b { 81 return matchesFound 82 } 83 return matchesNone 84} 85 86// matchesStateForSlice is a helper which returns matchesFound if v is 87// non-empty otherwise returns matchesNone. 88func matchesStateForSlice[T any](v []T) matchesState { 89 return matchesStatePred(len(v) > 0) 90} 91 92// An expression tree coupled with matches. The matchtree has two 93// functions: 94// 95// * it implements boolean combinations (and, or, not) 96// 97// * it implements shortcuts, where we skip documents (for example: if 98// there are no trigram matches, we can be sure there are no substring 99// matches). The matchtree iterates over the documents as they are 100// ordered in the shard. 101// 102// The general process for a given (shard, query) is: 103// 104// - construct matchTree for the query 105// 106// - find all different leaf matchTrees (substring, regexp, etc.) 107// 108// in a loop: 109// 110// - find next doc to process using nextDoc 111// 112// - evaluate atoms (leaf expressions that match text) 113// 114// - evaluate the tree using matches(), storing the result in map. 115// 116// - if the complete tree returns (matches() != matchesRequiresHigherCost) 117// for the document, collect all text matches by looking at leaf 118// matchTrees. 119type matchTree interface { 120 docIterator 121 122 // matches if cost is high enough. See documentation for matchesState's 123 // values. 124 // 125 // Note: Do not call this directly, rather use evalMatchTree which uses 126 // known to cache responses once the state transitions away from 127 // matchesRequiresHigherCost. 128 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 129} 130 131// docMatchTree iterates over documents for which predicate(docID) returns true. 132type docMatchTree struct { 133 // the number of documents in a shard. 134 numDocs uint32 135 136 predicate func(docID uint32) bool 137 138 // provides additional information about the reason why the docMatchTree was 139 // created. 140 reason string 141 142 // mutable 143 firstDone bool 144 docID uint32 145} 146 147type bruteForceMatchTree struct { 148 // mutable 149 firstDone bool 150 docID uint32 151} 152 153type andLineMatchTree struct { 154 andMatchTree 155} 156 157type andMatchTree struct { 158 children []matchTree 159} 160 161type orMatchTree struct { 162 children []matchTree 163} 164 165type notMatchTree struct { 166 child matchTree 167} 168 169// Returns only the filename of child matches. 170type fileNameMatchTree struct { 171 child matchTree 172} 173 174type boostMatchTree struct { 175 child matchTree 176 boost float64 177} 178 179// Don't visit this subtree for collecting matches. 180type noVisitMatchTree struct { 181 matchTree 182} 183 184type regexpMatchTree struct { 185 regexp *regexp.Regexp 186 187 // origRegexp is the original parsed regexp from the query structure. It 188 // does not include mutations such as case sensitivity. 189 origRegexp *syntax.Regexp 190 191 fileName bool 192 193 // mutable 194 reEvaluated bool 195 found []*candidateMatch 196 197 // nextDoc, prepare. 198 bruteForceMatchTree 199} 200 201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 202 prefix := "" 203 if !s.CaseSensitive { 204 prefix = "(?i)" 205 } 206 207 return &regexpMatchTree{ 208 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 209 origRegexp: s.Regexp, 210 fileName: s.FileName, 211 } 212} 213 214// \bLITERAL\b 215type wordMatchTree struct { 216 word string 217 218 fileName bool 219 220 // mutable 221 evaluated bool 222 found []*candidateMatch 223 224 // nextDoc, prepare. 225 bruteForceMatchTree 226} 227 228type substrMatchTree struct { 229 matchIterator 230 231 query *query.Substring 232 caseSensitive bool 233 fileName bool 234 235 // mutable 236 current []*candidateMatch 237 contEvaluated bool 238} 239 240type branchQueryMatchTree struct { 241 fileMasks []uint64 242 masks []uint64 243 repos []uint16 244 245 // mutable 246 firstDone bool 247 docID uint32 248} 249 250func (t *branchQueryMatchTree) branchMask() uint64 { 251 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 252} 253 254type symbolRegexpMatchTree struct { 255 matchTree 256 regexp *regexp.Regexp 257 all bool // skips regex match if .* 258 259 reEvaluated bool 260 found []*candidateMatch 261} 262 263func (t *symbolRegexpMatchTree) prepare(doc uint32) { 264 t.reEvaluated = false 265 t.found = t.found[:0] 266 t.matchTree.prepare(doc) 267} 268 269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 270 if t.reEvaluated { 271 return matchesStateForSlice(t.found) 272 } 273 274 if cost < costRegexp { 275 return matchesRequiresHigherCost 276 } 277 278 sections := cp.docSections() 279 content := cp.data(false) 280 281 found := t.found[:0] 282 for i, sec := range sections { 283 var idx []int 284 if t.all { 285 idx = []int{0, int(sec.End - sec.Start)} 286 } else { 287 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 288 if idx == nil { 289 continue 290 } 291 } 292 293 cm := &candidateMatch{ 294 byteOffset: sec.Start + uint32(idx[0]), 295 byteMatchSz: uint32(idx[1] - idx[0]), 296 symbol: true, 297 symbolIdx: uint32(i), 298 } 299 found = append(found, cm) 300 } 301 t.found = found 302 t.reEvaluated = true 303 304 return matchesStateForSlice(t.found) 305} 306 307type symbolSubstrMatchTree struct { 308 *substrMatchTree 309 310 patternSize uint32 311 fileEndRunes []uint32 312 fileEndSymbol []uint32 313 314 doc uint32 315 sections []DocumentSection 316 317 secID uint32 318} 319 320func (t *symbolSubstrMatchTree) prepare(doc uint32) { 321 t.substrMatchTree.prepare(doc) 322 t.doc = doc 323 324 var fileStart uint32 325 if doc > 0 { 326 fileStart = t.fileEndRunes[doc-1] 327 } 328 329 var sections []DocumentSection 330 if len(t.sections) > 0 { 331 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 332 if most == uint32(len(t.sections)) { 333 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 334 } else { 335 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 336 t.secID++ 337 } 338 339 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 340 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 341 symbolEnd++ 342 } 343 344 sections = t.sections[t.secID:symbolEnd] 345 } 346 } 347 348 secIdx := 0 349 trimmed := t.current[:0] 350 for len(sections) > secIdx && len(t.current) > 0 { 351 start := fileStart + t.current[0].runeOffset 352 end := start + t.patternSize 353 if start >= sections[secIdx].End { 354 secIdx++ 355 continue 356 } 357 358 if start < sections[secIdx].Start { 359 t.current = t.current[1:] 360 continue 361 } 362 363 if end <= sections[secIdx].End { 364 t.current[0].symbol = true 365 t.current[0].symbolIdx = uint32(secIdx) 366 trimmed = append(trimmed, t.current[0]) 367 } 368 369 t.current = t.current[1:] 370 } 371 t.current = trimmed 372} 373 374// all prepare methods 375 376func (t *bruteForceMatchTree) prepare(doc uint32) { 377 t.docID = doc 378 t.firstDone = true 379} 380 381func (t *docMatchTree) prepare(doc uint32) { 382 t.docID = doc 383 t.firstDone = true 384} 385 386func (t *andMatchTree) prepare(doc uint32) { 387 for _, c := range t.children { 388 c.prepare(doc) 389 } 390} 391 392func (t *regexpMatchTree) prepare(doc uint32) { 393 t.found = t.found[:0] 394 t.reEvaluated = false 395 t.bruteForceMatchTree.prepare(doc) 396} 397 398func (t *wordMatchTree) prepare(doc uint32) { 399 t.found = t.found[:0] 400 t.evaluated = false 401 t.bruteForceMatchTree.prepare(doc) 402} 403 404func (t *orMatchTree) prepare(doc uint32) { 405 for _, c := range t.children { 406 c.prepare(doc) 407 } 408} 409 410func (t *notMatchTree) prepare(doc uint32) { 411 t.child.prepare(doc) 412} 413 414func (t *fileNameMatchTree) prepare(doc uint32) { 415 t.child.prepare(doc) 416} 417 418func (t *boostMatchTree) prepare(doc uint32) { 419 t.child.prepare(doc) 420} 421 422func (t *substrMatchTree) prepare(nextDoc uint32) { 423 t.matchIterator.prepare(nextDoc) 424 t.current = t.matchIterator.candidates() 425 t.contEvaluated = false 426} 427 428func (t *branchQueryMatchTree) prepare(doc uint32) { 429 t.firstDone = true 430 t.docID = doc 431} 432 433// nextDoc 434 435func (t *docMatchTree) nextDoc() uint32 { 436 var start uint32 437 if t.firstDone { 438 start = t.docID + 1 439 } 440 for i := start; i < t.numDocs; i++ { 441 if t.predicate(i) { 442 return i 443 } 444 } 445 return maxUInt32 446} 447 448func (t *bruteForceMatchTree) nextDoc() uint32 { 449 if !t.firstDone { 450 return 0 451 } 452 return t.docID + 1 453} 454 455func (t *andMatchTree) nextDoc() uint32 { 456 var max uint32 457 for _, c := range t.children { 458 m := c.nextDoc() 459 if m > max { 460 max = m 461 } 462 } 463 return max 464} 465 466func (t *orMatchTree) nextDoc() uint32 { 467 min := uint32(maxUInt32) 468 for _, c := range t.children { 469 m := c.nextDoc() 470 if m < min { 471 min = m 472 } 473 } 474 return min 475} 476 477func (t *notMatchTree) nextDoc() uint32 { 478 return 0 479} 480 481func (t *fileNameMatchTree) nextDoc() uint32 { 482 return t.child.nextDoc() 483} 484 485func (t *boostMatchTree) nextDoc() uint32 { 486 return t.child.nextDoc() 487} 488 489func (t *branchQueryMatchTree) nextDoc() uint32 { 490 var start uint32 491 if t.firstDone { 492 start = t.docID + 1 493 } 494 495 for i := start; i < uint32(len(t.fileMasks)); i++ { 496 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 497 return i 498 } 499 } 500 return maxUInt32 501} 502 503// all String methods 504 505func (t *bruteForceMatchTree) String() string { 506 return "all" 507} 508 509func (t *docMatchTree) String() string { 510 return fmt.Sprintf("doc(%s)", t.reason) 511} 512 513func (t *andMatchTree) String() string { 514 return fmt.Sprintf("and%v", t.children) 515} 516 517func (t *regexpMatchTree) String() string { 518 f := "" 519 if t.fileName { 520 f = "f" 521 } 522 return fmt.Sprintf("%sre(%s)", f, t.regexp) 523} 524 525func (t *wordMatchTree) String() string { 526 f := "" 527 if t.fileName { 528 f = "f" 529 } 530 return fmt.Sprintf("%sword(%s)", f, t.word) 531} 532 533func (t *orMatchTree) String() string { 534 return fmt.Sprintf("or%v", t.children) 535} 536 537func (t *notMatchTree) String() string { 538 return fmt.Sprintf("not(%v)", t.child) 539} 540 541func (t *noVisitMatchTree) String() string { 542 return fmt.Sprintf("novisit(%v)", t.matchTree) 543} 544 545func (t *fileNameMatchTree) String() string { 546 return fmt.Sprintf("f(%v)", t.child) 547} 548 549func (t *boostMatchTree) String() string { 550 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 551} 552 553func (t *substrMatchTree) String() string { 554 f := "" 555 if t.fileName { 556 f = "f" 557 } 558 559 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 560} 561 562func (t *branchQueryMatchTree) String() string { 563 return fmt.Sprintf("branch(%x)", t.masks) 564} 565 566func (t *symbolSubstrMatchTree) String() string { 567 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 568} 569 570func (t *symbolRegexpMatchTree) String() string { 571 return fmt.Sprintf("symbol(%v)", t.matchTree) 572} 573 574// visitMatches visits all atoms in matchTree. Note: This visits 575// noVisitMatchTree. For collecting matches use visitMatches. 576func visitMatchTree(t matchTree, f func(matchTree)) { 577 switch s := t.(type) { 578 case *andMatchTree: 579 for _, ch := range s.children { 580 visitMatchTree(ch, f) 581 } 582 case *orMatchTree: 583 for _, ch := range s.children { 584 visitMatchTree(ch, f) 585 } 586 case *andLineMatchTree: 587 visitMatchTree(&s.andMatchTree, f) 588 case *noVisitMatchTree: 589 visitMatchTree(s.matchTree, f) 590 case *notMatchTree: 591 visitMatchTree(s.child, f) 592 case *fileNameMatchTree: 593 visitMatchTree(s.child, f) 594 case *boostMatchTree: 595 visitMatchTree(s.child, f) 596 case *symbolSubstrMatchTree: 597 visitMatchTree(s.substrMatchTree, f) 598 case *symbolRegexpMatchTree: 599 visitMatchTree(s.matchTree, f) 600 default: 601 f(t) 602 } 603} 604 605// updateMatchTreeStats calls updateStats on all atoms in mt which have that 606// function defined. 607func updateMatchTreeStats(mt matchTree, stats *Stats) { 608 visitMatchTree(mt, func(mt matchTree) { 609 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 610 atom.updateStats(stats) 611 } 612 }) 613} 614 615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 616 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 617 f(mt) 618 }) 619} 620 621// visitMatches visits all atoms which can contribute matches. Note: This 622// skips noVisitMatchTree. 623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 624 switch s := t.(type) { 625 case *andMatchTree: 626 for _, ch := range s.children { 627 if known[ch] { 628 visitMatches(ch, known, weight, f) 629 } 630 } 631 case *andLineMatchTree: 632 visitMatches(&s.andMatchTree, known, weight, f) 633 case *orMatchTree: 634 for _, ch := range s.children { 635 if known[ch] { 636 visitMatches(ch, known, weight, f) 637 } 638 } 639 case *boostMatchTree: 640 visitMatches(s.child, known, weight*s.boost, f) 641 case *symbolSubstrMatchTree: 642 visitMatches(s.substrMatchTree, known, weight, f) 643 case *notMatchTree: 644 case *noVisitMatchTree: 645 // don't collect into negative trees. 646 case *fileNameMatchTree: 647 // We will just gather the filename if we do not visit this tree. 648 default: 649 f(s, weight) 650 } 651} 652 653// all matches() methods. 654 655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 656 return matchesStatePred(t.predicate(cp.idx)) 657} 658 659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 660 return matchesFound 661} 662 663// andLineMatchTree is a performance optimization of andMatchTree. For content 664// searches we don't want to run the regex engine if there is no line that 665// contains matches from all terms. 666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 667 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 668 return state 669 } 670 671 // Invariant: all children have matches. If any line contains all of them we 672 // can return MatchesFound. 673 674 // find child with fewest candidates 675 min := maxUInt32 676 fewestChildren := 0 677 for ix, child := range t.children { 678 v, ok := child.(*substrMatchTree) 679 // make sure we are running a content search and that all candidates are a 680 // substrMatchTree 681 if !ok || v.fileName { 682 return matchesFound 683 } 684 if len(v.current) < min { 685 min = len(v.current) 686 fewestChildren = ix 687 } 688 } 689 690 type lineRange struct { 691 start int 692 end int 693 } 694 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 695 prev := -1 696 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 697 line := cp.newlines().atOffset(candidate.byteOffset) 698 if line == prev { 699 continue 700 } 701 prev = line 702 byteStart := int(cp.newlines().lineStart(line)) 703 byteEnd := int(cp.newlines().lineStart(line + 1)) 704 lines = append(lines, lineRange{byteStart, byteEnd}) 705 } 706 707 // children keeps track of the children's candidates we have already seen. 708 children := make([][]*candidateMatch, 0, len(t.children)-1) 709 for j, child := range t.children { 710 if j == fewestChildren { 711 continue 712 } 713 children = append(children, child.(*substrMatchTree).current) 714 } 715 716nextLine: 717 for i := 0; i < len(lines); i++ { 718 hits := 1 719 nextChild: 720 for j := range children { 721 nextCandidate: 722 for len(children[j]) > 0 { 723 candidate := children[j][0] 724 bo := int(cp.findOffset(false, candidate.runeOffset)) 725 if bo < lines[i].start { 726 children[j] = children[j][1:] 727 continue nextCandidate 728 } 729 if bo < lines[i].end { 730 hits++ 731 continue nextChild 732 } 733 // move the `lines` iterator forward until bo < line.end 734 for i < len(lines) && bo >= lines[i].end { 735 i++ 736 } 737 i-- 738 continue nextLine 739 } 740 } 741 // return early once we found any line that contains matches from all children 742 if hits == len(t.children) { 743 return matchesFound 744 } 745 } 746 return matchesNone 747} 748 749func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 750 // We have found matches unless a child needs to do more work or it hasn't 751 // found matches. 752 state := matchesFound 753 754 for _, ch := range t.children { 755 switch evalMatchTree(cp, cost, known, ch) { 756 case matchesRequiresHigherCost: 757 // keep evaluating other children incase we come across matchesNone 758 state = matchesRequiresHigherCost 759 case matchesFound: 760 // will return this if every child has this value 761 case matchesNone: 762 return matchesNone 763 } 764 } 765 766 return state 767} 768 769func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 770 // we could short-circuit, but we want to use the other possibilities as a 771 // ranking signal. So we always return the most conservative state. 772 state := matchesNone 773 for _, ch := range t.children { 774 switch evalMatchTree(cp, cost, known, ch) { 775 case matchesRequiresHigherCost: 776 state = matchesRequiresHigherCost 777 case matchesFound: 778 if state != matchesRequiresHigherCost { 779 state = matchesFound 780 } 781 case matchesNone: 782 // noop 783 } 784 } 785 return state 786} 787 788func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 789 return matchesStatePred(t.branchMask() != 0) 790} 791 792func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 793 if t.reEvaluated { 794 return matchesStateForSlice(t.found) 795 } 796 797 if cost < costRegexp { 798 return matchesRequiresHigherCost 799 } 800 801 cp.stats.RegexpsConsidered++ 802 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 803 found := t.found[:0] 804 for _, idx := range idxs { 805 cm := &candidateMatch{ 806 byteOffset: uint32(idx[0]), 807 byteMatchSz: uint32(idx[1] - idx[0]), 808 fileName: t.fileName, 809 } 810 811 found = append(found, cm) 812 } 813 t.found = found 814 t.reEvaluated = true 815 816 return matchesStateForSlice(t.found) 817} 818 819func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 820 if t.evaluated { 821 return matchesStateForSlice(t.found) 822 } 823 824 if cost < costRegexp { 825 return matchesRequiresHigherCost 826 } 827 828 data := cp.data(t.fileName) 829 offset := 0 830 found := t.found[:0] 831 for { 832 idx := bytes.Index(data[offset:], []byte(t.word)) 833 if idx < 0 { 834 break 835 } 836 837 relStartOffset := offset + idx 838 relEndOffset := relStartOffset + len(t.word) 839 840 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 841 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 842 if startBoundary && endBoundary { 843 found = append(found, &candidateMatch{ 844 byteOffset: uint32(offset + idx), 845 byteMatchSz: uint32(len(t.word)), 846 fileName: t.fileName, 847 }) 848 } 849 offset += idx + len(t.word) 850 } 851 852 t.found = found 853 t.evaluated = true 854 855 return matchesStateForSlice(t.found) 856} 857 858// breakMatchesOnNewlines returns matches resulting from breaking each element 859// of cms on newlines within text. 860func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 861 var lineCMs []*candidateMatch 862 for _, cm := range cms { 863 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 864 } 865 return lineCMs 866} 867 868// breakOnNewlines returns matches resulting from breaking cm on newlines 869// within text. 870func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 871 var cms []*candidateMatch 872 addMe := &candidateMatch{} 873 *addMe = *cm 874 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 875 if text[i] == '\n' { 876 addMe.byteMatchSz = i - addMe.byteOffset 877 if addMe.byteMatchSz != 0 { 878 cms = append(cms, addMe) 879 } 880 881 addMe = &candidateMatch{} 882 *addMe = *cm 883 addMe.byteOffset = i + 1 884 } 885 } 886 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 887 if addMe.byteMatchSz != 0 { 888 cms = append(cms, addMe) 889 } 890 return cms 891} 892 893// evalMatchTree should be called instead of directly calling 894// matchTree.matches. It cache known values for future evaluation at higher 895// costs. 896func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 897 if v, ok := known[mt]; ok { 898 return matchesStatePred(v) 899 } 900 901 ms := mt.matches(cp, cost, known) 902 if ms != matchesRequiresHigherCost { 903 known[mt] = ms == matchesFound 904 } 905 906 return ms 907} 908 909func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 910 switch evalMatchTree(cp, cost, known, t.child) { 911 case matchesRequiresHigherCost: 912 return matchesRequiresHigherCost 913 case matchesFound: 914 return matchesNone 915 case matchesNone: 916 return matchesFound 917 default: 918 panic("unreachable") 919 } 920} 921 922func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 923 return evalMatchTree(cp, cost, known, t.child) 924} 925 926func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 927 return evalMatchTree(cp, cost, known, t.child) 928} 929 930func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 931 if t.contEvaluated { 932 return matchesStateForSlice(t.current) 933 } 934 935 if len(t.current) == 0 { 936 return matchesNone 937 } 938 939 if t.fileName && cost < costMemory { 940 return matchesRequiresHigherCost 941 } 942 943 if !t.fileName && cost < costContent { 944 return matchesRequiresHigherCost 945 } 946 947 pruned := t.current[:0] 948 for _, m := range t.current { 949 if m.byteOffset == 0 && m.runeOffset > 0 { 950 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 951 } 952 if m.matchContent(cp.data(m.fileName)) { 953 pruned = append(pruned, m) 954 } 955 } 956 t.current = pruned 957 t.contEvaluated = true 958 959 return matchesStateForSlice(t.current) 960} 961 962type matchTreeOpt struct { 963 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 964 // This was added since we do not support wordMatchTree with symbol search. 965 DisableWordMatchOptimization bool 966} 967 968func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 969 if q == nil { 970 return nil, fmt.Errorf("got nil (sub)query") 971 } 972 switch s := q.(type) { 973 case *query.Regexp: 974 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 975 // superset of the regexp. If the returned matchTree is equivalent to the 976 // original regexp, it returns true. An equivalent matchTree has the same 977 // behaviour as the original regexp and can be used instead. 978 // 979 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 980 if err != nil { 981 return nil, err 982 } 983 // if the query can be used in place of the regexp 984 // return the subtree 985 if isEq { 986 return subMT, nil 987 } 988 989 var tr matchTree 990 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 991 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 992 // provide something faster. 993 tr = wmt 994 } else { 995 tr = newRegexpMatchTree(s) 996 } 997 998 return &andMatchTree{ 999 children: []matchTree{ 1000 tr, &noVisitMatchTree{subMT}, 1001 }, 1002 }, nil 1003 case *query.And: 1004 var r []matchTree 1005 for _, ch := range s.Children { 1006 ct, err := d.newMatchTree(ch, opt) 1007 if err != nil { 1008 return nil, err 1009 } 1010 r = append(r, ct) 1011 } 1012 return &andMatchTree{r}, nil 1013 case *query.Or: 1014 var r []matchTree 1015 for _, ch := range s.Children { 1016 ct, err := d.newMatchTree(ch, opt) 1017 if err != nil { 1018 return nil, err 1019 } 1020 r = append(r, ct) 1021 } 1022 return &orMatchTree{r}, nil 1023 case *query.Not: 1024 ct, err := d.newMatchTree(s.Child, opt) 1025 return &notMatchTree{ 1026 child: ct, 1027 }, err 1028 1029 case *query.Type: 1030 if s.Type != query.TypeFileName { 1031 break 1032 } 1033 1034 ct, err := d.newMatchTree(s.Child, opt) 1035 if err != nil { 1036 return nil, err 1037 } 1038 1039 return &fileNameMatchTree{ 1040 child: ct, 1041 }, nil 1042 1043 case *query.Boost: 1044 ct, err := d.newMatchTree(s.Child, opt) 1045 if err != nil { 1046 return nil, err 1047 } 1048 1049 return &boostMatchTree{ 1050 child: ct, 1051 boost: s.Boost, 1052 }, nil 1053 1054 case *query.Substring: 1055 return d.newSubstringMatchTree(s) 1056 1057 case *query.Branch: 1058 masks := make([]uint64, 0, len(d.repoMetaData)) 1059 if s.Pattern == "HEAD" { 1060 for i := 0; i < len(d.repoMetaData); i++ { 1061 masks = append(masks, 1) 1062 } 1063 } else { 1064 for _, branchIDs := range d.branchIDs { 1065 mask := uint64(0) 1066 for nm, m := range branchIDs { 1067 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1068 mask |= uint64(m) 1069 } 1070 } 1071 masks = append(masks, mask) 1072 } 1073 } 1074 return &branchQueryMatchTree{ 1075 masks: masks, 1076 fileMasks: d.fileBranchMasks, 1077 repos: d.repos, 1078 }, nil 1079 case *query.Const: 1080 if s.Value { 1081 return &bruteForceMatchTree{}, nil 1082 } else { 1083 return &noMatchTree{Why: "const"}, nil 1084 } 1085 case *query.Language: 1086 code, ok := d.metaData.LanguageMap[s.Language] 1087 if !ok { 1088 return &noMatchTree{Why: "lang"}, nil 1089 } 1090 return &docMatchTree{ 1091 reason: "language", 1092 numDocs: d.numDocs(), 1093 predicate: func(docID uint32) bool { 1094 return d.getLanguage(docID) == code 1095 }, 1096 }, nil 1097 1098 case *query.Symbol: 1099 // Disable WordMatchTree since we don't support it in symbols yet. 1100 optCopy := opt 1101 optCopy.DisableWordMatchOptimization = true 1102 1103 subMT, err := d.newMatchTree(s.Expr, optCopy) 1104 if err != nil { 1105 return nil, err 1106 } 1107 1108 if substr, ok := subMT.(*substrMatchTree); ok { 1109 return &symbolSubstrMatchTree{ 1110 substrMatchTree: substr, 1111 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1112 fileEndRunes: d.fileEndRunes, 1113 fileEndSymbol: d.fileEndSymbol, 1114 sections: d.runeDocSections, 1115 }, nil 1116 } 1117 1118 var regexpMT *regexpMatchTree 1119 visitMatchTree(subMT, func(mt matchTree) { 1120 if t, ok := mt.(*regexpMatchTree); ok { 1121 regexpMT = t 1122 } 1123 }) 1124 if regexpMT == nil { 1125 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1126 } 1127 1128 return &symbolRegexpMatchTree{ 1129 regexp: regexpMT.regexp, 1130 all: isRegexpAll(regexpMT.origRegexp), 1131 matchTree: subMT, 1132 }, nil 1133 1134 case *query.FileNameSet: 1135 return &docMatchTree{ 1136 reason: "FileNameSet", 1137 numDocs: d.numDocs(), 1138 predicate: func(docID uint32) bool { 1139 fileName := d.fileName(docID) 1140 _, ok := s.Set[string(fileName)] 1141 return ok 1142 }, 1143 }, nil 1144 1145 case *query.BranchesRepos: 1146 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1147 for repoIdx := range d.repoMetaData { 1148 var mask uint64 1149 for _, br := range s.List { 1150 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1151 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1152 } 1153 } 1154 reposBranchesWant[repoIdx] = mask 1155 } 1156 return &docMatchTree{ 1157 reason: "BranchesRepos", 1158 numDocs: d.numDocs(), 1159 predicate: func(docID uint32) bool { 1160 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1161 }, 1162 }, nil 1163 1164 case *query.RepoSet: 1165 reposWant := make([]bool, len(d.repoMetaData)) 1166 for repoIdx, r := range d.repoMetaData { 1167 if _, ok := s.Set[r.Name]; ok { 1168 reposWant[repoIdx] = true 1169 } 1170 } 1171 return &docMatchTree{ 1172 reason: "RepoSet", 1173 numDocs: d.numDocs(), 1174 predicate: func(docID uint32) bool { 1175 return reposWant[d.repos[docID]] 1176 }, 1177 }, nil 1178 1179 case *query.RepoIDs: 1180 reposWant := make([]bool, len(d.repoMetaData)) 1181 for repoIdx, r := range d.repoMetaData { 1182 if s.Repos.Contains(r.ID) { 1183 reposWant[repoIdx] = true 1184 } 1185 } 1186 return &docMatchTree{ 1187 reason: "RepoIDs", 1188 numDocs: d.numDocs(), 1189 predicate: func(docID uint32) bool { 1190 return reposWant[d.repos[docID]] 1191 }, 1192 }, nil 1193 1194 case *query.Repo: 1195 reposWant := make([]bool, len(d.repoMetaData)) 1196 for repoIdx, r := range d.repoMetaData { 1197 if s.Regexp.MatchString(r.Name) { 1198 reposWant[repoIdx] = true 1199 } 1200 } 1201 return &docMatchTree{ 1202 reason: "Repo", 1203 numDocs: d.numDocs(), 1204 predicate: func(docID uint32) bool { 1205 return reposWant[d.repos[docID]] 1206 }, 1207 }, nil 1208 1209 case *query.RepoRegexp: 1210 reposWant := make([]bool, len(d.repoMetaData)) 1211 for repoIdx, r := range d.repoMetaData { 1212 if s.Regexp.MatchString(r.Name) { 1213 reposWant[repoIdx] = true 1214 } 1215 } 1216 return &docMatchTree{ 1217 reason: "RepoRegexp", 1218 numDocs: d.numDocs(), 1219 predicate: func(docID uint32) bool { 1220 return reposWant[d.repos[docID]] 1221 }, 1222 }, nil 1223 1224 case query.RawConfig: 1225 return &docMatchTree{ 1226 reason: s.String(), 1227 numDocs: d.numDocs(), 1228 predicate: func(docID uint32) bool { 1229 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1230 }, 1231 }, nil 1232 } 1233 log.Panicf("type %T", q) 1234 return nil, nil 1235} 1236 1237func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1238 st := &substrMatchTree{ 1239 query: s, 1240 caseSensitive: s.CaseSensitive, 1241 fileName: s.FileName, 1242 } 1243 1244 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1245 return newRegexpMatchTree(&query.Regexp{ 1246 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1247 FileName: s.FileName, 1248 Content: s.Content, 1249 CaseSensitive: s.CaseSensitive, 1250 }), nil 1251 } 1252 1253 result, err := d.iterateNgrams(s) 1254 if err != nil { 1255 return nil, err 1256 } 1257 st.matchIterator = result 1258 return st, nil 1259} 1260 1261func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1262 if opt.DisableWordMatchOptimization { 1263 return nil, false 1264 } 1265 // Needs to be case sensitive 1266 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1267 return nil, false 1268 } 1269 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1270 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1271 return nil, false 1272 } 1273 sub := q.Regexp.Sub 1274 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1275 return nil, false 1276 } 1277 1278 return &wordMatchTree{ 1279 word: string(sub[1].Rune), 1280 fileName: q.FileName, 1281 }, true 1282} 1283 1284// pruneMatchTree removes impossible branches from the matchTree, as indicated 1285// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1286func pruneMatchTree(mt matchTree) (matchTree, error) { 1287 var err error 1288 switch mt := mt.(type) { 1289 // leaf nodes that we test with the filter: 1290 case *substrMatchTree: 1291 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1292 if _, ok := res.matchIterator.(*noMatchTree); ok { 1293 return nil, nil 1294 } 1295 } 1296 // recursive tree structures: 1297 case *andMatchTree: 1298 // Any branch of an and becoming impossible means the entire clause 1299 // is impossible. Otherwise, just handle rewrites. 1300 for i, child := range mt.children { 1301 newChild, err := pruneMatchTree(child) 1302 if err != nil { 1303 return nil, err 1304 } 1305 if newChild == nil { 1306 return nil, nil 1307 } 1308 mt.children[i] = newChild 1309 } 1310 case *orMatchTree: 1311 // *All* branches of an OR becoming impossible means the entire clause 1312 // is impossible. Otherwise, drop impossible subclauses and handle 1313 // rewrites, including simplifying to a singular resulting child branch. 1314 n := 0 1315 for _, child := range mt.children { 1316 newChild, err := pruneMatchTree(child) 1317 if err != nil { 1318 return nil, err 1319 } 1320 if newChild != nil { 1321 mt.children[n] = newChild 1322 n++ 1323 } 1324 } 1325 mt.children = mt.children[:n] 1326 if len(mt.children) == 1 { 1327 return mt.children[0], nil 1328 } else if len(mt.children) == 0 { 1329 return nil, nil 1330 } 1331 case *noVisitMatchTree: 1332 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1333 if err != nil { 1334 return nil, err 1335 } 1336 if mt.matchTree == nil { 1337 return nil, nil 1338 } 1339 case *fileNameMatchTree: 1340 mt.child, err = pruneMatchTree(mt.child) 1341 if err != nil { 1342 return nil, err 1343 } 1344 if mt.child == nil { 1345 return nil, nil 1346 } 1347 case *boostMatchTree: 1348 mt.child, err = pruneMatchTree(mt.child) 1349 if err != nil { 1350 return nil, err 1351 } 1352 if mt.child == nil { 1353 return nil, nil 1354 } 1355 case *andLineMatchTree: 1356 child, err := pruneMatchTree(&mt.andMatchTree) 1357 if err != nil { 1358 return nil, err 1359 } 1360 if child == nil { 1361 return nil, nil 1362 } 1363 if c, ok := child.(*andMatchTree); ok { 1364 mt.andMatchTree = *c 1365 } else { 1366 // the and was simplified to a single clause, 1367 // so the linematch portion is irrelevant. 1368 return mt, nil 1369 } 1370 case *notMatchTree: 1371 mt.child, err = pruneMatchTree(mt.child) 1372 if err != nil { 1373 return nil, err 1374 } 1375 if mt.child == nil { 1376 // not false => true 1377 return &bruteForceMatchTree{}, nil 1378 } 1379 // unhandled: 1380 case *docMatchTree: 1381 case *bruteForceMatchTree: 1382 case *regexpMatchTree: 1383 case *wordMatchTree: 1384 } 1385 return mt, err 1386} 1387 1388// isRegexpAll returns true if the query matches all possible lines. 1389// 1390// Note: it is possible for a funky regex to actually match all but this 1391// returns false. This returns true for normal looking regexes like ".*" or 1392// "(?-s:.*)". 1393func isRegexpAll(r *syntax.Regexp) bool { 1394 // Note: we don't care about any flags since we are looking for .* and we 1395 // don't mind if . matches all or everything but newline. 1396 1397 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1398 for { 1399 // Our main target: .* 1400 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1401 // (?s:.) or (?-s:.) 1402 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1403 } 1404 1405 // Strip away expressions being wrapped in paranthesis 1406 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1407 r = r.Sub[0] 1408 continue 1409 } 1410 1411 return false 1412 } 1413}