fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2018 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "strings" 23 "unicode/utf8" 24 25 "github.com/grafana/regexp" 26 27 "github.com/sourcegraph/zoekt/internal/syntaxutil" 28 "github.com/sourcegraph/zoekt/query" 29) 30 31// A docIterator iterates over documents in order. 32type docIterator interface { 33 // provide the next document where we may find something interesting. 34 // 35 // This is like a "peek" and shouldn't mutate state. prepare is what should 36 // change state. 37 nextDoc() uint32 38 39 // clears any per-document state of the docIterator, and 40 // prepares for evaluating the given doc. The argument is 41 // strictly increasing over time. 42 prepare(nextDoc uint32) 43} 44 45// costs are passed in increasing order to matchTree.matches until they do not 46// return matchesRequiresHigherCost. 47const ( 48 costConst = 0 49 costMemory = 1 50 costContent = 2 51 costRegexp = 3 52) 53 54const ( 55 costMin = costConst 56 costMax = costRegexp 57) 58 59// matchesState is an enum for the state of a matchTree after a call to 60// matchTree.matches. 61type matchesState uint8 62 63const ( 64 // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 65 // a search yet since the cost value is not high enough. 66 matchesRequiresHigherCost matchesState = iota 67 68 // matchesFound is returned when matchTree.matches has done a search and 69 // found one or more matches. 70 matchesFound 71 72 // matchesNone is returned when matchTree.matches has done a search and 73 // found nothing. 74 matchesNone 75) 76 77// matchesStatePred is a helper which returns matchesFound if b is true 78// otherwise returns matchesNone. 79func matchesStatePred(b bool) matchesState { 80 if b { 81 return matchesFound 82 } 83 return matchesNone 84} 85 86// matchesStateForSlice is a helper which returns matchesFound if v is 87// non-empty otherwise returns matchesNone. 88func matchesStateForSlice[T any](v []T) matchesState { 89 return matchesStatePred(len(v) > 0) 90} 91 92// An expression tree coupled with matches. The matchtree has two 93// functions: 94// 95// * it implements boolean combinations (and, or, not) 96// 97// * it implements shortcuts, where we skip documents (for example: if 98// there are no trigram matches, we can be sure there are no substring 99// matches). The matchtree iterates over the documents as they are 100// ordered in the shard. 101// 102// The general process for a given (shard, query) is: 103// 104// - construct matchTree for the query 105// 106// - find all different leaf matchTrees (substring, regexp, etc.) 107// 108// in a loop: 109// 110// - find next doc to process using nextDoc 111// 112// - evaluate atoms (leaf expressions that match text) 113// 114// - evaluate the tree using matches(), storing the result in map. 115// 116// - if the complete tree returns (matches() != matchesRequiresHigherCost) 117// for the document, collect all text matches by looking at leaf 118// matchTrees. 119type matchTree interface { 120 docIterator 121 122 // matches if cost is high enough. See documentation for matchesState's 123 // values. 124 // 125 // Note: Do not call this directly, rather use evalMatchTree which uses 126 // known to cache responses once the state transitions away from 127 // matchesRequiresHigherCost. 128 matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 129} 130 131// docMatchTree iterates over documents for which predicate(docID) returns true. 132type docMatchTree struct { 133 // the number of documents in a shard. 134 numDocs uint32 135 136 predicate func(docID uint32) bool 137 138 // provides additional information about the reason why the docMatchTree was 139 // created. 140 reason string 141 142 // mutable 143 firstDone bool 144 docID uint32 145} 146 147type bruteForceMatchTree struct { 148 // mutable 149 firstDone bool 150 docID uint32 151} 152 153type andLineMatchTree struct { 154 andMatchTree 155} 156 157type andMatchTree struct { 158 children []matchTree 159} 160 161type orMatchTree struct { 162 children []matchTree 163} 164 165type notMatchTree struct { 166 child matchTree 167} 168 169// Returns only the filename of child matches. 170type fileNameMatchTree struct { 171 child matchTree 172} 173 174type boostMatchTree struct { 175 child matchTree 176 boost float64 177} 178 179// Don't visit this subtree for collecting matches. 180type noVisitMatchTree struct { 181 matchTree 182} 183 184type regexpMatchTree struct { 185 regexp *regexp.Regexp 186 187 // origRegexp is the original parsed regexp from the query structure. It 188 // does not include mutations such as case sensitivity. 189 origRegexp *syntax.Regexp 190 191 fileName bool 192 193 // mutable 194 reEvaluated bool 195 found []*candidateMatch 196 197 // nextDoc, prepare. 198 bruteForceMatchTree 199} 200 201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { 202 prefix := "" 203 if !s.CaseSensitive { 204 prefix = "(?i)" 205 } 206 207 return &regexpMatchTree{ 208 regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), 209 origRegexp: s.Regexp, 210 fileName: s.FileName, 211 } 212} 213 214// \bLITERAL\b 215type wordMatchTree struct { 216 word string 217 218 fileName bool 219 220 // mutable 221 evaluated bool 222 found []*candidateMatch 223 224 // nextDoc, prepare. 225 bruteForceMatchTree 226} 227 228type substrMatchTree struct { 229 matchIterator 230 231 query *query.Substring 232 caseSensitive bool 233 fileName bool 234 235 // mutable 236 current []*candidateMatch 237 contEvaluated bool 238} 239 240type branchQueryMatchTree struct { 241 fileMasks []uint64 242 masks []uint64 243 repos []uint16 244 245 // mutable 246 firstDone bool 247 docID uint32 248} 249 250func (t *branchQueryMatchTree) branchMask() uint64 { 251 return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]] 252} 253 254type symbolRegexpMatchTree struct { 255 matchTree 256 regexp *regexp.Regexp 257 all bool // skips regex match if .* 258 259 reEvaluated bool 260 found []*candidateMatch 261} 262 263func (t *symbolRegexpMatchTree) prepare(doc uint32) { 264 t.reEvaluated = false 265 t.found = t.found[:0] 266 t.matchTree.prepare(doc) 267} 268 269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 270 if t.reEvaluated { 271 return matchesStateForSlice(t.found) 272 } 273 274 if cost < costRegexp { 275 return matchesRequiresHigherCost 276 } 277 278 sections := cp.docSections() 279 content := cp.data(false) 280 281 found := t.found[:0] 282 for i, sec := range sections { 283 var idx []int 284 if t.all { 285 idx = []int{0, int(sec.End - sec.Start)} 286 } else { 287 idx = t.regexp.FindIndex(content[sec.Start:sec.End]) 288 if idx == nil { 289 continue 290 } 291 } 292 293 cm := &candidateMatch{ 294 byteOffset: sec.Start + uint32(idx[0]), 295 byteMatchSz: uint32(idx[1] - idx[0]), 296 symbol: true, 297 symbolIdx: uint32(i), 298 } 299 found = append(found, cm) 300 } 301 t.found = found 302 t.reEvaluated = true 303 304 return matchesStateForSlice(t.found) 305} 306 307type symbolSubstrMatchTree struct { 308 *substrMatchTree 309 310 patternSize uint32 311 fileEndRunes []uint32 312 fileEndSymbol []uint32 313 314 doc uint32 315 sections []DocumentSection 316 317 secID uint32 318} 319 320func (t *symbolSubstrMatchTree) prepare(doc uint32) { 321 t.substrMatchTree.prepare(doc) 322 t.doc = doc 323 324 var fileStart uint32 325 if doc > 0 { 326 fileStart = t.fileEndRunes[doc-1] 327 } 328 329 var sections []DocumentSection 330 if len(t.sections) > 0 { 331 most := t.fileEndSymbol[len(t.fileEndSymbol)-1] 332 if most == uint32(len(t.sections)) { 333 sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]] 334 } else { 335 for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart { 336 t.secID++ 337 } 338 339 fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID 340 for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd { 341 symbolEnd++ 342 } 343 344 sections = t.sections[t.secID:symbolEnd] 345 } 346 } 347 348 secIdx := 0 349 trimmed := t.current[:0] 350 for len(sections) > secIdx && len(t.current) > 0 { 351 start := fileStart + t.current[0].runeOffset 352 end := start + t.patternSize 353 if start >= sections[secIdx].End { 354 secIdx++ 355 continue 356 } 357 358 if start < sections[secIdx].Start { 359 t.current = t.current[1:] 360 continue 361 } 362 363 if end <= sections[secIdx].End { 364 t.current[0].symbol = true 365 t.current[0].symbolIdx = uint32(secIdx) 366 trimmed = append(trimmed, t.current[0]) 367 } 368 369 t.current = t.current[1:] 370 } 371 t.current = trimmed 372} 373 374// all prepare methods 375 376func (t *bruteForceMatchTree) prepare(doc uint32) { 377 t.docID = doc 378 t.firstDone = true 379} 380 381func (t *docMatchTree) prepare(doc uint32) { 382 t.docID = doc 383 t.firstDone = true 384} 385 386func (t *andMatchTree) prepare(doc uint32) { 387 for _, c := range t.children { 388 c.prepare(doc) 389 } 390} 391 392func (t *regexpMatchTree) prepare(doc uint32) { 393 t.found = t.found[:0] 394 t.reEvaluated = false 395 t.bruteForceMatchTree.prepare(doc) 396} 397 398func (t *wordMatchTree) prepare(doc uint32) { 399 t.found = t.found[:0] 400 t.evaluated = false 401 t.bruteForceMatchTree.prepare(doc) 402} 403 404func (t *orMatchTree) prepare(doc uint32) { 405 for _, c := range t.children { 406 c.prepare(doc) 407 } 408} 409 410func (t *notMatchTree) prepare(doc uint32) { 411 t.child.prepare(doc) 412} 413 414func (t *fileNameMatchTree) prepare(doc uint32) { 415 t.child.prepare(doc) 416} 417 418func (t *boostMatchTree) prepare(doc uint32) { 419 t.child.prepare(doc) 420} 421 422func (t *substrMatchTree) prepare(nextDoc uint32) { 423 t.matchIterator.prepare(nextDoc) 424 t.current = t.matchIterator.candidates() 425 t.contEvaluated = false 426} 427 428func (t *branchQueryMatchTree) prepare(doc uint32) { 429 t.firstDone = true 430 t.docID = doc 431} 432 433// nextDoc 434 435func (t *docMatchTree) nextDoc() uint32 { 436 var start uint32 437 if t.firstDone { 438 start = t.docID + 1 439 } 440 for i := start; i < t.numDocs; i++ { 441 if t.predicate(i) { 442 return i 443 } 444 } 445 return maxUInt32 446} 447 448func (t *bruteForceMatchTree) nextDoc() uint32 { 449 if !t.firstDone { 450 return 0 451 } 452 return t.docID + 1 453} 454 455func (t *andMatchTree) nextDoc() uint32 { 456 var max uint32 457 for _, c := range t.children { 458 m := c.nextDoc() 459 if m > max { 460 max = m 461 } 462 } 463 return max 464} 465 466func (t *orMatchTree) nextDoc() uint32 { 467 min := uint32(maxUInt32) 468 for _, c := range t.children { 469 m := c.nextDoc() 470 if m < min { 471 min = m 472 } 473 } 474 return min 475} 476 477func (t *notMatchTree) nextDoc() uint32 { 478 return 0 479} 480 481func (t *fileNameMatchTree) nextDoc() uint32 { 482 return t.child.nextDoc() 483} 484 485func (t *boostMatchTree) nextDoc() uint32 { 486 return t.child.nextDoc() 487} 488 489func (t *branchQueryMatchTree) nextDoc() uint32 { 490 var start uint32 491 if t.firstDone { 492 start = t.docID + 1 493 } 494 495 for i := start; i < uint32(len(t.fileMasks)); i++ { 496 if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 { 497 return i 498 } 499 } 500 return maxUInt32 501} 502 503// all String methods 504 505func (t *bruteForceMatchTree) String() string { 506 return "all" 507} 508 509func (t *docMatchTree) String() string { 510 return fmt.Sprintf("doc(%s)", t.reason) 511} 512 513func (t *andMatchTree) String() string { 514 return fmt.Sprintf("and%v", t.children) 515} 516 517func (t *regexpMatchTree) String() string { 518 f := "" 519 if t.fileName { 520 f = "f" 521 } 522 return fmt.Sprintf("%sre(%s)", f, t.regexp) 523} 524 525func (t *wordMatchTree) String() string { 526 f := "" 527 if t.fileName { 528 f = "f" 529 } 530 return fmt.Sprintf("%sword(%s)", f, t.word) 531} 532 533func (t *orMatchTree) String() string { 534 return fmt.Sprintf("or%v", t.children) 535} 536 537func (t *notMatchTree) String() string { 538 return fmt.Sprintf("not(%v)", t.child) 539} 540 541func (t *noVisitMatchTree) String() string { 542 return fmt.Sprintf("novisit(%v)", t.matchTree) 543} 544 545func (t *fileNameMatchTree) String() string { 546 return fmt.Sprintf("f(%v)", t.child) 547} 548 549func (t *boostMatchTree) String() string { 550 return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 551} 552 553func (t *substrMatchTree) String() string { 554 f := "" 555 if t.fileName { 556 f = "f" 557 } 558 559 return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator) 560} 561 562func (t *branchQueryMatchTree) String() string { 563 return fmt.Sprintf("branch(%x)", t.masks) 564} 565 566func (t *symbolSubstrMatchTree) String() string { 567 return fmt.Sprintf("symbol(%v)", t.substrMatchTree) 568} 569 570func (t *symbolRegexpMatchTree) String() string { 571 return fmt.Sprintf("symbol(%v)", t.matchTree) 572} 573 574// visitMatches visits all atoms in matchTree. Note: This visits 575// noVisitMatchTree. For collecting matches use visitMatches. 576func visitMatchTree(t matchTree, f func(matchTree)) { 577 switch s := t.(type) { 578 case *andMatchTree: 579 for _, ch := range s.children { 580 visitMatchTree(ch, f) 581 } 582 case *orMatchTree: 583 for _, ch := range s.children { 584 visitMatchTree(ch, f) 585 } 586 case *andLineMatchTree: 587 visitMatchTree(&s.andMatchTree, f) 588 case *noVisitMatchTree: 589 visitMatchTree(s.matchTree, f) 590 case *notMatchTree: 591 visitMatchTree(s.child, f) 592 case *fileNameMatchTree: 593 visitMatchTree(s.child, f) 594 case *boostMatchTree: 595 visitMatchTree(s.child, f) 596 case *symbolSubstrMatchTree: 597 visitMatchTree(s.substrMatchTree, f) 598 case *symbolRegexpMatchTree: 599 visitMatchTree(s.matchTree, f) 600 default: 601 f(t) 602 } 603} 604 605// updateMatchTreeStats calls updateStats on all atoms in mt which have that 606// function defined. 607func updateMatchTreeStats(mt matchTree, stats *Stats) { 608 visitMatchTree(mt, func(mt matchTree) { 609 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 610 atom.updateStats(stats) 611 } 612 }) 613} 614 615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 616 visitMatches(t, known, 1, func(mt matchTree, _ float64) { 617 f(mt) 618 }) 619} 620 621// visitMatches visits all atoms which can contribute matches. Note: This 622// skips noVisitMatchTree. 623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 624 switch s := t.(type) { 625 case *andMatchTree: 626 for _, ch := range s.children { 627 if known[ch] { 628 visitMatches(ch, known, weight, f) 629 } 630 } 631 case *andLineMatchTree: 632 visitMatches(&s.andMatchTree, known, weight, f) 633 case *orMatchTree: 634 for _, ch := range s.children { 635 if known[ch] { 636 visitMatches(ch, known, weight, f) 637 } 638 } 639 case *boostMatchTree: 640 visitMatches(s.child, known, weight*s.boost, f) 641 case *symbolSubstrMatchTree: 642 visitMatches(s.substrMatchTree, known, weight, f) 643 case *notMatchTree: 644 case *noVisitMatchTree: 645 // don't collect into negative trees. 646 case *fileNameMatchTree: 647 // We will just gather the filename if we do not visit this tree. 648 default: 649 f(s, weight) 650 } 651} 652 653// all matches() methods. 654 655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 656 return matchesStatePred(t.predicate(cp.idx)) 657} 658 659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 660 return matchesFound 661} 662 663// andLineMatchTree is a performance optimization of andMatchTree. For content 664// searches we don't want to run the regex engine if there is no line that 665// contains matches from all terms. 666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 667 if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound { 668 return state 669 } 670 671 // Invariant: all children have matches. If any line contains all of them we 672 // can return MatchesFound. 673 674 // find child with fewest candidates 675 min := maxUInt32 676 fewestChildren := 0 677 for ix, child := range t.children { 678 v, ok := child.(*substrMatchTree) 679 // make sure we are running a content search and that all candidates are a 680 // substrMatchTree 681 if !ok || v.fileName { 682 return matchesFound 683 } 684 if len(v.current) < min { 685 min = len(v.current) 686 fewestChildren = ix 687 } 688 } 689 690 type lineRange struct { 691 start int 692 end int 693 } 694 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 695 prev := -1 696 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 697 line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 698 if line == prev { 699 continue 700 } 701 prev = line 702 lines = append(lines, lineRange{byteStart, byteEnd}) 703 } 704 705 // children keeps track of the children's candidates we have already seen. 706 children := make([][]*candidateMatch, 0, len(t.children)-1) 707 for j, child := range t.children { 708 if j == fewestChildren { 709 continue 710 } 711 children = append(children, child.(*substrMatchTree).current) 712 } 713 714nextLine: 715 for i := 0; i < len(lines); i++ { 716 hits := 1 717 nextChild: 718 for j := range children { 719 nextCandidate: 720 for len(children[j]) > 0 { 721 candidate := children[j][0] 722 bo := int(cp.findOffset(false, candidate.runeOffset)) 723 if bo < lines[i].start { 724 children[j] = children[j][1:] 725 continue nextCandidate 726 } 727 if bo <= lines[i].end { 728 hits++ 729 continue nextChild 730 } 731 // move the `lines` iterator forward until bo <= line.end 732 for i < len(lines) && bo > lines[i].end { 733 i++ 734 } 735 i-- 736 continue nextLine 737 } 738 } 739 // return early once we found any line that contains matches from all children 740 if hits == len(t.children) { 741 return matchesFound 742 } 743 } 744 return matchesNone 745} 746 747func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 748 // We have found matches unless a child needs to do more work or it hasn't 749 // found matches. 750 state := matchesFound 751 752 for _, ch := range t.children { 753 switch evalMatchTree(cp, cost, known, ch) { 754 case matchesRequiresHigherCost: 755 // keep evaluating other children incase we come across matchesNone 756 state = matchesRequiresHigherCost 757 case matchesFound: 758 // will return this if every child has this value 759 case matchesNone: 760 return matchesNone 761 } 762 } 763 764 return state 765} 766 767func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 768 // we could short-circuit, but we want to use the other possibilities as a 769 // ranking signal. So we always return the most conservative state. 770 state := matchesNone 771 for _, ch := range t.children { 772 switch evalMatchTree(cp, cost, known, ch) { 773 case matchesRequiresHigherCost: 774 state = matchesRequiresHigherCost 775 case matchesFound: 776 if state != matchesRequiresHigherCost { 777 state = matchesFound 778 } 779 case matchesNone: 780 // noop 781 } 782 } 783 return state 784} 785 786func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 787 return matchesStatePred(t.branchMask() != 0) 788} 789 790func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 791 if t.reEvaluated { 792 return matchesStateForSlice(t.found) 793 } 794 795 if cost < costRegexp { 796 return matchesRequiresHigherCost 797 } 798 799 cp.stats.RegexpsConsidered++ 800 idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) 801 found := t.found[:0] 802 for _, idx := range idxs { 803 cm := &candidateMatch{ 804 byteOffset: uint32(idx[0]), 805 byteMatchSz: uint32(idx[1] - idx[0]), 806 fileName: t.fileName, 807 } 808 809 found = append(found, cm) 810 } 811 t.found = found 812 t.reEvaluated = true 813 814 return matchesStateForSlice(t.found) 815} 816 817func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 818 if t.evaluated { 819 return matchesStateForSlice(t.found) 820 } 821 822 if cost < costRegexp { 823 return matchesRequiresHigherCost 824 } 825 826 data := cp.data(t.fileName) 827 offset := 0 828 found := t.found[:0] 829 for { 830 idx := bytes.Index(data[offset:], []byte(t.word)) 831 if idx < 0 { 832 break 833 } 834 835 relStartOffset := offset + idx 836 relEndOffset := relStartOffset + len(t.word) 837 838 startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1])) 839 endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 840 if startBoundary && endBoundary { 841 found = append(found, &candidateMatch{ 842 byteOffset: uint32(offset + idx), 843 byteMatchSz: uint32(len(t.word)), 844 fileName: t.fileName, 845 }) 846 } 847 offset += idx + len(t.word) 848 } 849 850 t.found = found 851 t.evaluated = true 852 853 return matchesStateForSlice(t.found) 854} 855 856// breakMatchesOnNewlines returns matches resulting from breaking each element 857// of cms on newlines within text. 858func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { 859 var lineCMs []*candidateMatch 860 for _, cm := range cms { 861 lineCMs = append(lineCMs, breakOnNewlines(cm, text)...) 862 } 863 return lineCMs 864} 865 866// breakOnNewlines returns matches resulting from breaking cm on newlines 867// within text. 868func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch { 869 var cms []*candidateMatch 870 addMe := &candidateMatch{} 871 *addMe = *cm 872 for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ { 873 if text[i] == '\n' { 874 addMe.byteMatchSz = i - addMe.byteOffset 875 if addMe.byteMatchSz != 0 { 876 cms = append(cms, addMe) 877 } 878 879 addMe = &candidateMatch{} 880 *addMe = *cm 881 addMe.byteOffset = i + 1 882 } 883 } 884 addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset 885 if addMe.byteMatchSz != 0 { 886 cms = append(cms, addMe) 887 } 888 return cms 889} 890 891// evalMatchTree should be called instead of directly calling 892// matchTree.matches. It cache known values for future evaluation at higher 893// costs. 894func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 895 if v, ok := known[mt]; ok { 896 return matchesStatePred(v) 897 } 898 899 ms := mt.matches(cp, cost, known) 900 if ms != matchesRequiresHigherCost { 901 known[mt] = ms == matchesFound 902 } 903 904 return ms 905} 906 907func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 908 switch evalMatchTree(cp, cost, known, t.child) { 909 case matchesRequiresHigherCost: 910 return matchesRequiresHigherCost 911 case matchesFound: 912 return matchesNone 913 case matchesNone: 914 return matchesFound 915 default: 916 panic("unreachable") 917 } 918} 919 920func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 921 return evalMatchTree(cp, cost, known, t.child) 922} 923 924func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 925 return evalMatchTree(cp, cost, known, t.child) 926} 927 928func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 929 if t.contEvaluated { 930 return matchesStateForSlice(t.current) 931 } 932 933 if len(t.current) == 0 { 934 return matchesNone 935 } 936 937 if t.fileName && cost < costMemory { 938 return matchesRequiresHigherCost 939 } 940 941 if !t.fileName && cost < costContent { 942 return matchesRequiresHigherCost 943 } 944 945 pruned := t.current[:0] 946 for _, m := range t.current { 947 if m.byteOffset == 0 && m.runeOffset > 0 { 948 m.byteOffset = cp.findOffset(m.fileName, m.runeOffset) 949 } 950 if m.matchContent(cp.data(m.fileName)) { 951 pruned = append(pruned, m) 952 } 953 } 954 t.current = pruned 955 t.contEvaluated = true 956 957 return matchesStateForSlice(t.current) 958} 959 960type matchTreeOpt struct { 961 // DisableWordMatchOptimization is used to disable the use of wordMatchTree. 962 // This was added since we do not support wordMatchTree with symbol search. 963 DisableWordMatchOptimization bool 964} 965 966func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) { 967 if q == nil { 968 return nil, fmt.Errorf("got nil (sub)query") 969 } 970 switch s := q.(type) { 971 case *query.Regexp: 972 // RegexpToMatchTreeRecursive tries to distill a matchTree that matches a 973 // superset of the regexp. If the returned matchTree is equivalent to the 974 // original regexp, it returns true. An equivalent matchTree has the same 975 // behaviour as the original regexp and can be used instead. 976 // 977 subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive) 978 if err != nil { 979 return nil, err 980 } 981 // if the query can be used in place of the regexp 982 // return the subtree 983 if isEq { 984 return subMT, nil 985 } 986 987 var tr matchTree 988 if wmt, ok := regexpToWordMatchTree(s, opt); ok { 989 // A common search we get is "\bLITERAL\b". Avoid the regex engine and 990 // provide something faster. 991 tr = wmt 992 } else { 993 tr = newRegexpMatchTree(s) 994 } 995 996 return &andMatchTree{ 997 children: []matchTree{ 998 tr, &noVisitMatchTree{subMT}, 999 }, 1000 }, nil 1001 case *query.And: 1002 var r []matchTree 1003 for _, ch := range s.Children { 1004 ct, err := d.newMatchTree(ch, opt) 1005 if err != nil { 1006 return nil, err 1007 } 1008 r = append(r, ct) 1009 } 1010 return &andMatchTree{r}, nil 1011 case *query.Or: 1012 var r []matchTree 1013 for _, ch := range s.Children { 1014 ct, err := d.newMatchTree(ch, opt) 1015 if err != nil { 1016 return nil, err 1017 } 1018 r = append(r, ct) 1019 } 1020 return &orMatchTree{r}, nil 1021 case *query.Not: 1022 ct, err := d.newMatchTree(s.Child, opt) 1023 return &notMatchTree{ 1024 child: ct, 1025 }, err 1026 1027 case *query.Type: 1028 if s.Type != query.TypeFileName { 1029 break 1030 } 1031 1032 ct, err := d.newMatchTree(s.Child, opt) 1033 if err != nil { 1034 return nil, err 1035 } 1036 1037 return &fileNameMatchTree{ 1038 child: ct, 1039 }, nil 1040 1041 case *query.Boost: 1042 ct, err := d.newMatchTree(s.Child, opt) 1043 if err != nil { 1044 return nil, err 1045 } 1046 1047 return &boostMatchTree{ 1048 child: ct, 1049 boost: s.Boost, 1050 }, nil 1051 1052 case *query.Substring: 1053 return d.newSubstringMatchTree(s) 1054 1055 case *query.Branch: 1056 masks := make([]uint64, 0, len(d.repoMetaData)) 1057 if s.Pattern == "HEAD" { 1058 for i := 0; i < len(d.repoMetaData); i++ { 1059 masks = append(masks, 1) 1060 } 1061 } else { 1062 for _, branchIDs := range d.branchIDs { 1063 mask := uint64(0) 1064 for nm, m := range branchIDs { 1065 if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) { 1066 mask |= uint64(m) 1067 } 1068 } 1069 masks = append(masks, mask) 1070 } 1071 } 1072 return &branchQueryMatchTree{ 1073 masks: masks, 1074 fileMasks: d.fileBranchMasks, 1075 repos: d.repos, 1076 }, nil 1077 case *query.Const: 1078 if s.Value { 1079 return &bruteForceMatchTree{}, nil 1080 } else { 1081 return &noMatchTree{Why: "const"}, nil 1082 } 1083 case *query.Language: 1084 code, ok := d.metaData.LanguageMap[s.Language] 1085 if !ok { 1086 return &noMatchTree{Why: "lang"}, nil 1087 } 1088 return &docMatchTree{ 1089 reason: "language", 1090 numDocs: d.numDocs(), 1091 predicate: func(docID uint32) bool { 1092 return d.getLanguage(docID) == code 1093 }, 1094 }, nil 1095 1096 case *query.Symbol: 1097 // Disable WordMatchTree since we don't support it in symbols yet. 1098 optCopy := opt 1099 optCopy.DisableWordMatchOptimization = true 1100 1101 subMT, err := d.newMatchTree(s.Expr, optCopy) 1102 if err != nil { 1103 return nil, err 1104 } 1105 1106 if substr, ok := subMT.(*substrMatchTree); ok { 1107 return &symbolSubstrMatchTree{ 1108 substrMatchTree: substr, 1109 patternSize: uint32(utf8.RuneCountInString(substr.query.Pattern)), 1110 fileEndRunes: d.fileEndRunes, 1111 fileEndSymbol: d.fileEndSymbol, 1112 sections: d.runeDocSections, 1113 }, nil 1114 } 1115 1116 var regexpMT *regexpMatchTree 1117 visitMatchTree(subMT, func(mt matchTree) { 1118 if t, ok := mt.(*regexpMatchTree); ok { 1119 regexpMT = t 1120 } 1121 }) 1122 if regexpMT == nil { 1123 return nil, fmt.Errorf("found %T inside query.Symbol", subMT) 1124 } 1125 1126 return &symbolRegexpMatchTree{ 1127 regexp: regexpMT.regexp, 1128 all: isRegexpAll(regexpMT.origRegexp), 1129 matchTree: subMT, 1130 }, nil 1131 1132 case *query.FileNameSet: 1133 return &docMatchTree{ 1134 reason: "FileNameSet", 1135 numDocs: d.numDocs(), 1136 predicate: func(docID uint32) bool { 1137 fileName := d.fileName(docID) 1138 _, ok := s.Set[string(fileName)] 1139 return ok 1140 }, 1141 }, nil 1142 1143 case *query.BranchesRepos: 1144 reposBranchesWant := make([]uint64, len(d.repoMetaData)) 1145 for repoIdx := range d.repoMetaData { 1146 var mask uint64 1147 for _, br := range s.List { 1148 if br.Repos.Contains(d.repoMetaData[repoIdx].ID) { 1149 mask |= uint64(d.branchIDs[repoIdx][br.Branch]) 1150 } 1151 } 1152 reposBranchesWant[repoIdx] = mask 1153 } 1154 return &docMatchTree{ 1155 reason: "BranchesRepos", 1156 numDocs: d.numDocs(), 1157 predicate: func(docID uint32) bool { 1158 return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0 1159 }, 1160 }, nil 1161 1162 case *query.RepoSet: 1163 reposWant := make([]bool, len(d.repoMetaData)) 1164 for repoIdx, r := range d.repoMetaData { 1165 if _, ok := s.Set[r.Name]; ok { 1166 reposWant[repoIdx] = true 1167 } 1168 } 1169 return &docMatchTree{ 1170 reason: "RepoSet", 1171 numDocs: d.numDocs(), 1172 predicate: func(docID uint32) bool { 1173 return reposWant[d.repos[docID]] 1174 }, 1175 }, nil 1176 1177 case *query.RepoIDs: 1178 reposWant := make([]bool, len(d.repoMetaData)) 1179 for repoIdx, r := range d.repoMetaData { 1180 if s.Repos.Contains(r.ID) { 1181 reposWant[repoIdx] = true 1182 } 1183 } 1184 return &docMatchTree{ 1185 reason: "RepoIDs", 1186 numDocs: d.numDocs(), 1187 predicate: func(docID uint32) bool { 1188 return reposWant[d.repos[docID]] 1189 }, 1190 }, nil 1191 1192 case *query.Repo: 1193 reposWant := make([]bool, len(d.repoMetaData)) 1194 for repoIdx, r := range d.repoMetaData { 1195 if s.Regexp.MatchString(r.Name) { 1196 reposWant[repoIdx] = true 1197 } 1198 } 1199 return &docMatchTree{ 1200 reason: "Repo", 1201 numDocs: d.numDocs(), 1202 predicate: func(docID uint32) bool { 1203 return reposWant[d.repos[docID]] 1204 }, 1205 }, nil 1206 1207 case *query.RepoRegexp: 1208 reposWant := make([]bool, len(d.repoMetaData)) 1209 for repoIdx, r := range d.repoMetaData { 1210 if s.Regexp.MatchString(r.Name) { 1211 reposWant[repoIdx] = true 1212 } 1213 } 1214 return &docMatchTree{ 1215 reason: "RepoRegexp", 1216 numDocs: d.numDocs(), 1217 predicate: func(docID uint32) bool { 1218 return reposWant[d.repos[docID]] 1219 }, 1220 }, nil 1221 1222 case query.RawConfig: 1223 return &docMatchTree{ 1224 reason: s.String(), 1225 numDocs: d.numDocs(), 1226 predicate: func(docID uint32) bool { 1227 return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s) 1228 }, 1229 }, nil 1230 } 1231 log.Panicf("type %T", q) 1232 return nil, nil 1233} 1234 1235func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) { 1236 st := &substrMatchTree{ 1237 query: s, 1238 caseSensitive: s.CaseSensitive, 1239 fileName: s.FileName, 1240 } 1241 1242 if utf8.RuneCountInString(s.Pattern) < ngramSize { 1243 return newRegexpMatchTree(&query.Regexp{ 1244 Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)}, 1245 FileName: s.FileName, 1246 Content: s.Content, 1247 CaseSensitive: s.CaseSensitive, 1248 }), nil 1249 } 1250 1251 result, err := d.iterateNgrams(s) 1252 if err != nil { 1253 return nil, err 1254 } 1255 st.matchIterator = result 1256 return st, nil 1257} 1258 1259func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) { 1260 if opt.DisableWordMatchOptimization { 1261 return nil, false 1262 } 1263 // Needs to be case sensitive 1264 if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1265 return nil, false 1266 } 1267 // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1268 if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1269 return nil, false 1270 } 1271 sub := q.Regexp.Sub 1272 if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1273 return nil, false 1274 } 1275 1276 return &wordMatchTree{ 1277 word: string(sub[1].Rune), 1278 fileName: q.FileName, 1279 }, true 1280} 1281 1282// pruneMatchTree removes impossible branches from the matchTree, as indicated 1283// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1284func pruneMatchTree(mt matchTree) (matchTree, error) { 1285 var err error 1286 switch mt := mt.(type) { 1287 // leaf nodes that we test with the filter: 1288 case *substrMatchTree: 1289 if res, ok := mt.matchIterator.(*ngramIterationResults); ok { 1290 if _, ok := res.matchIterator.(*noMatchTree); ok { 1291 return nil, nil 1292 } 1293 } 1294 // recursive tree structures: 1295 case *andMatchTree: 1296 // Any branch of an and becoming impossible means the entire clause 1297 // is impossible. Otherwise, just handle rewrites. 1298 for i, child := range mt.children { 1299 newChild, err := pruneMatchTree(child) 1300 if err != nil { 1301 return nil, err 1302 } 1303 if newChild == nil { 1304 return nil, nil 1305 } 1306 mt.children[i] = newChild 1307 } 1308 case *orMatchTree: 1309 // *All* branches of an OR becoming impossible means the entire clause 1310 // is impossible. Otherwise, drop impossible subclauses and handle 1311 // rewrites, including simplifying to a singular resulting child branch. 1312 n := 0 1313 for _, child := range mt.children { 1314 newChild, err := pruneMatchTree(child) 1315 if err != nil { 1316 return nil, err 1317 } 1318 if newChild != nil { 1319 mt.children[n] = newChild 1320 n++ 1321 } 1322 } 1323 mt.children = mt.children[:n] 1324 if len(mt.children) == 1 { 1325 return mt.children[0], nil 1326 } else if len(mt.children) == 0 { 1327 return nil, nil 1328 } 1329 case *noVisitMatchTree: 1330 mt.matchTree, err = pruneMatchTree(mt.matchTree) 1331 if err != nil { 1332 return nil, err 1333 } 1334 if mt.matchTree == nil { 1335 return nil, nil 1336 } 1337 case *fileNameMatchTree: 1338 mt.child, err = pruneMatchTree(mt.child) 1339 if err != nil { 1340 return nil, err 1341 } 1342 if mt.child == nil { 1343 return nil, nil 1344 } 1345 case *boostMatchTree: 1346 mt.child, err = pruneMatchTree(mt.child) 1347 if err != nil { 1348 return nil, err 1349 } 1350 if mt.child == nil { 1351 return nil, nil 1352 } 1353 case *andLineMatchTree: 1354 child, err := pruneMatchTree(&mt.andMatchTree) 1355 if err != nil { 1356 return nil, err 1357 } 1358 if child == nil { 1359 return nil, nil 1360 } 1361 if c, ok := child.(*andMatchTree); ok { 1362 mt.andMatchTree = *c 1363 } else { 1364 // the and was simplified to a single clause, 1365 // so the linematch portion is irrelevant. 1366 return mt, nil 1367 } 1368 case *notMatchTree: 1369 mt.child, err = pruneMatchTree(mt.child) 1370 if err != nil { 1371 return nil, err 1372 } 1373 if mt.child == nil { 1374 // not false => true 1375 return &bruteForceMatchTree{}, nil 1376 } 1377 // unhandled: 1378 case *docMatchTree: 1379 case *bruteForceMatchTree: 1380 case *regexpMatchTree: 1381 case *wordMatchTree: 1382 } 1383 return mt, err 1384} 1385 1386// isRegexpAll returns true if the query matches all possible lines. 1387// 1388// Note: it is possible for a funky regex to actually match all but this 1389// returns false. This returns true for normal looking regexes like ".*" or 1390// "(?-s:.*)". 1391func isRegexpAll(r *syntax.Regexp) bool { 1392 // Note: we don't care about any flags since we are looking for .* and we 1393 // don't mind if . matches all or everything but newline. 1394 1395 // for loop is for visiting children of OpCapture/OpConcat until we hit .* 1396 for { 1397 // Our main target: .* 1398 if r.Op == syntax.OpStar && len(r.Sub) == 1 { // * 1399 // (?s:.) or (?-s:.) 1400 return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL 1401 } 1402 1403 // Strip away expressions being wrapped in paranthesis 1404 if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 { 1405 r = r.Sub[0] 1406 continue 1407 } 1408 1409 return false 1410 } 1411}