fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

matchtree: switch to enum for matches signature (#691)

The pair of bools (matches, sure) often was quite hard to reason about.
I think this came down to them often not being named at return sites,
the names itself being too concise and that two booleans represents 4
possible states, but we only had two possible states.

This commit uses a more verbose enum instead of booleans. From reading
the diff back I find the code easier to reason about, so I think this is
a good change.

Test Plan: go test ./...

+135 -77
+10 -7
eval.go
··· 291 291 md := d.repoMetaData[d.repos[nextDoc]] 292 292 293 293 for cost := costMin; cost <= costMax; cost++ { 294 - v, ok := mt.matches(cp, cost, known) 295 - if ok && !v { 294 + switch mt.matches(cp, cost, known) { 295 + case matchesRequiresHigherCost: 296 + if cost == costMax { 297 + log.Panicf("did not decide. Repo %s, doc %d, known %v", 298 + md.Name, nextDoc, known) 299 + } 300 + case matchesFound: 301 + // could short-circuit now, but we want to run higher costs to 302 + // potentially find higher ranked matches. 303 + case matchesNone: 296 304 continue nextFileMatch 297 - } 298 - 299 - if cost == costMax && !ok { 300 - log.Panicf("did not decide. Repo %s, doc %d, known %v", 301 - md.Name, nextDoc, known) 302 305 } 303 306 } 304 307
+2 -2
matchiter.go
··· 92 92 93 93 func (t *noMatchTree) prepare(uint32) {} 94 94 95 - func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 96 - return false, true 95 + func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 96 + return matchesNone 97 97 } 98 98 99 99 func (t *noMatchTree) updateStats(s *Stats) {
+123 -68
matchtree.go
··· 29 29 30 30 // A docIterator iterates over documents in order. 31 31 type docIterator interface { 32 - // provide the next document where we can may find something 33 - // interesting. 32 + // provide the next document where we may find something interesting. 33 + // 34 + // This is like a "peek" and shouldn't mutate state. prepare is what should 35 + // change state. 34 36 nextDoc() uint32 35 37 36 38 // clears any per-document state of the docIterator, and ··· 39 41 prepare(nextDoc uint32) 40 42 } 41 43 44 + // costs are passed in increasing order to matchTree.matches until they do not 45 + // return matchesRequiresHigherCost. 42 46 const ( 43 47 costConst = 0 44 48 costMemory = 1 ··· 51 55 costMax = costRegexp 52 56 ) 53 57 58 + // matchesState is an enum for the state of a matchTree after a call to 59 + // matchTree.matches. 60 + type matchesState uint8 61 + 62 + const ( 63 + // matchesRequiresHigherCost is returned when matchTree.matches hasn't done 64 + // a search yet since the cost value is not high enough. 65 + matchesRequiresHigherCost matchesState = iota 66 + 67 + // matchesFound is returned when matchTree.matches has done a search and 68 + // found one or more matches. 69 + matchesFound 70 + 71 + // matchesNone is returned when matchTree.matches has done a search and 72 + // found nothing. 73 + matchesNone 74 + ) 75 + 76 + // matchesStatePred is a helper which returns matchesFound if b is true 77 + // otherwise returns matchesNone. 78 + func matchesStatePred(b bool) matchesState { 79 + if b { 80 + return matchesFound 81 + } 82 + return matchesNone 83 + } 84 + 85 + // matchesStateForSlice is a helper which returns matchesFound if v is 86 + // non-empty otherwise returns matchesNone. 87 + func matchesStateForSlice[T any](v []T) matchesState { 88 + return matchesStatePred(len(v) > 0) 89 + } 90 + 54 91 // An expression tree coupled with matches. The matchtree has two 55 92 // functions: 56 93 // ··· 75 112 // 76 113 // - evaluate the tree using matches(), storing the result in map. 77 114 // 78 - // - if the complete tree returns (matches() == true) for the document, 79 - // collect all text matches by looking at leaf matchTrees 115 + // - if the complete tree returns (matches() != matchesRequiresHigherCost) 116 + // for the document, collect all text matches by looking at leaf 117 + // matchTrees. 80 118 type matchTree interface { 81 119 docIterator 82 120 83 - // returns whether this matches, and if we are sure. 84 - matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool) 121 + // matches if cost is high enough, caching known values for future 122 + // evaluation at higher costs. See documentation for matchesState's values. 123 + matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState 85 124 } 86 125 87 126 // docMatchTree iterates over documents for which predicate(docID) returns true. ··· 200 239 t.matchTree.prepare(doc) 201 240 } 202 241 203 - func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 242 + func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 204 243 if t.reEvaluated { 205 - return len(t.found) > 0, true 244 + return matchesStateForSlice(t.found) 206 245 } 207 246 208 247 if cost < costRegexp { 209 - return false, false 248 + return matchesRequiresHigherCost 210 249 } 211 250 212 251 sections := cp.docSections() ··· 235 274 t.found = found 236 275 t.reEvaluated = true 237 276 238 - return len(t.found) > 0, true 277 + return matchesStateForSlice(t.found) 239 278 } 240 279 241 280 type symbolSubstrMatchTree struct { ··· 564 603 565 604 // all matches() methods. 566 605 567 - func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 568 - return t.predicate(cp.idx), true 606 + func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 607 + return matchesStatePred(t.predicate(cp.idx)) 569 608 } 570 609 571 - func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 572 - return true, true 610 + func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 611 + return matchesFound 573 612 } 574 613 575 614 // andLineMatchTree is a performance optimization of andMatchTree. For content 576 615 // searches we don't want to run the regex engine if there is no line that 577 616 // contains matches from all terms. 578 - func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 579 - matches, sure := t.andMatchTree.matches(cp, cost, known) 580 - if !(sure && matches) { 581 - return matches, sure 617 + func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 618 + if state := t.andMatchTree.matches(cp, cost, known); state != matchesFound { 619 + return state 582 620 } 621 + 622 + // Invariant: all children have matches. If any line contains all of them we 623 + // can return MatchesFound. 583 624 584 625 // find child with fewest candidates 585 626 min := maxUInt32 ··· 589 630 // make sure we are running a content search and that all candidates are a 590 631 // substrMatchTree 591 632 if !ok || v.fileName { 592 - return matches, sure 633 + return matchesFound 593 634 } 594 635 if len(v.current) < min { 595 636 min = len(v.current) ··· 648 689 } 649 690 // return early once we found any line that contains matches from all children 650 691 if hits == len(t.children) { 651 - return matches, true 692 + return matchesFound 652 693 } 653 694 } 654 - return false, true 695 + return matchesNone 655 696 } 656 697 657 - func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 658 - sure := true 698 + func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 699 + // We have found matches unless a child needs to do more work or it hasn't 700 + // found matches. 701 + state := matchesFound 659 702 660 703 for _, ch := range t.children { 661 - v, ok := evalMatchTree(cp, cost, known, ch) 662 - if ok && !v { 663 - return false, true 664 - } 665 - if !ok { 666 - sure = false 704 + switch evalMatchTree(cp, cost, known, ch) { 705 + case matchesRequiresHigherCost: 706 + // keep evaluating other children incase we come across matchesNone 707 + state = matchesRequiresHigherCost 708 + case matchesFound: 709 + // will return this if every child has this value 710 + case matchesNone: 711 + return matchesNone 667 712 } 668 713 } 669 714 670 - return true, sure 715 + return state 671 716 } 672 717 673 - func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 674 - matches := false 675 - sure := true 718 + func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 719 + // we could short-circuit, but we want to use the other possibilities as a 720 + // ranking signal. So we always return the most conservative state. 721 + state := matchesNone 676 722 for _, ch := range t.children { 677 - v, ok := evalMatchTree(cp, cost, known, ch) 678 - if ok { 679 - // we could short-circuit, but we want to use 680 - // the other possibilities as a ranking 681 - // signal. 682 - matches = matches || v 683 - } else { 684 - sure = false 723 + switch evalMatchTree(cp, cost, known, ch) { 724 + case matchesRequiresHigherCost: 725 + state = matchesRequiresHigherCost 726 + case matchesFound: 727 + if state != matchesRequiresHigherCost { 728 + state = matchesFound 729 + } 730 + case matchesNone: 731 + // noop 685 732 } 686 733 } 687 - return matches, sure 734 + return state 688 735 } 689 736 690 - func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 691 - return t.branchMask() != 0, true 737 + func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 738 + return matchesStatePred(t.branchMask() != 0) 692 739 } 693 740 694 - func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 741 + func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 695 742 if t.reEvaluated { 696 - return len(t.found) > 0, true 743 + return matchesStateForSlice(t.found) 697 744 } 698 745 699 746 if cost < costRegexp { 700 - return false, false 747 + return matchesRequiresHigherCost 701 748 } 702 749 703 750 cp.stats.RegexpsConsidered++ ··· 715 762 t.found = found 716 763 t.reEvaluated = true 717 764 718 - return len(t.found) > 0, true 765 + return matchesStateForSlice(t.found) 719 766 } 720 767 721 - func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 768 + func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 722 769 if t.evaluated { 723 - return len(t.found) > 0, true 770 + return matchesStateForSlice(t.found) 724 771 } 725 772 726 773 if cost < costRegexp { 727 - return false, false 774 + return matchesRequiresHigherCost 728 775 } 729 776 730 777 data := cp.data(t.fileName) ··· 754 801 t.found = found 755 802 t.evaluated = true 756 803 757 - return len(t.found) > 0, true 804 + return matchesStateForSlice(t.found) 758 805 } 759 806 760 807 // breakMatchesOnNewlines returns matches resulting from breaking each element ··· 792 839 return cms 793 840 } 794 841 795 - func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) { 842 + func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState { 796 843 if v, ok := known[mt]; ok { 797 - return v, true 844 + return matchesStatePred(v) 798 845 } 799 846 800 - v, ok := mt.matches(cp, cost, known) 801 - if ok { 802 - known[mt] = v 847 + ms := mt.matches(cp, cost, known) 848 + if ms != matchesRequiresHigherCost { 849 + known[mt] = ms == matchesFound 803 850 } 804 851 805 - return v, ok 852 + return ms 806 853 } 807 854 808 - func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 809 - v, ok := evalMatchTree(cp, cost, known, t.child) 810 - return !v, ok 855 + func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 856 + switch evalMatchTree(cp, cost, known, t.child) { 857 + case matchesRequiresHigherCost: 858 + return matchesRequiresHigherCost 859 + case matchesFound: 860 + return matchesNone 861 + case matchesNone: 862 + return matchesFound 863 + default: 864 + panic("unreachable") 865 + } 811 866 } 812 867 813 - func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 868 + func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 814 869 return evalMatchTree(cp, cost, known, t.child) 815 870 } 816 871 817 - func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 872 + func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 818 873 if t.contEvaluated { 819 - return len(t.current) > 0, true 874 + return matchesStateForSlice(t.current) 820 875 } 821 876 822 877 if len(t.current) == 0 { 823 - return false, true 878 + return matchesNone 824 879 } 825 880 826 881 if t.fileName && cost < costMemory { 827 - return false, false 882 + return matchesRequiresHigherCost 828 883 } 829 884 830 885 if !t.fileName && cost < costContent { 831 - return false, false 886 + return matchesRequiresHigherCost 832 887 } 833 888 834 889 pruned := t.current[:0] ··· 843 898 t.current = pruned 844 899 t.contEvaluated = true 845 900 846 - return len(t.current) > 0, true 901 + return matchesStateForSlice(t.current) 847 902 } 848 903 849 904 type matchTreeOpt struct {