matchtree: special case word search (#526) · boltless.me/zoekt@ea5ebff

+4

bits.go

// characterClass reports whether c is an ASCII word character: a letter, a
// digit or an underscore. This is the same set as the regexp class \w
// ([0-9A-Za-z_]), which is what the \b word-boundary assertion is defined in
// terms of. Bytes outside the ASCII range are never word characters.
func characterClass(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	}
	return c == '_'
}

+3

eval.go

··· 478 478 if rmt, ok := mt.(*regexpMatchTree); ok { 479 479 cands = append(cands, rmt.found...) 480 480 } 481 + if rmt, ok := mt.(*wordMatchTree); ok { 482 + cands = append(cands, rmt.found...) 483 + } 481 484 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 482 485 cands = append(cands, smt.found...) 483 486 }

+78 -1

index_test.go

··· 1116 1116 } 1117 1117 1118 1118 func mustParseRE(s string) *syntax.Regexp { 1119 - r, err := syntax.Parse(s, 0) 1119 + r, err := syntax.Parse(s, syntax.Perl) 1120 1120 if err != nil { 1121 1121 panic(err) 1122 1122 } ··· 3536 3536 3537 3537 }) 3538 3538 } 3539 + 3540 + // This tests the frequent pattern "\bLITERAL\b". 3541 + func TestWordSearch(t *testing.T) { 3542 + content := []byte("needle the bla") 3543 + // ----------------01234567890123 3544 + 3545 + b := testIndexBuilder(t, nil, 3546 + Document{ 3547 + Name: "f1", 3548 + Content: content, 3549 + }) 3550 + 3551 + t.Run("LineMatches", func(t *testing.T) { 3552 + sres := searchForTest(t, b, 3553 + &query.Regexp{ 3554 + Regexp: mustParseRE("\\bthe\\b"), 3555 + CaseSensitive: true, 3556 + Content: true, 3557 + }) 3558 + 3559 + if len(sres.Files) != 1 || len(sres.Files[0].LineMatches) != 1 { 3560 + t.Fatalf("got %v, want 1 match in 1 file", sres.Files) 3561 + } 3562 + 3563 + if sres.Stats.RegexpsConsidered != 0 { 3564 + t.Fatal("expected regexp to be skipped") 3565 + } 3566 + 3567 + got := sres.Files[0].LineMatches[0] 3568 + want := LineMatch{ 3569 + LineFragments: []LineFragmentMatch{{ 3570 + LineOffset: 7, 3571 + Offset: 7, 3572 + MatchLength: 3, 3573 + }}, 3574 + Line: content, 3575 + FileName: false, 3576 + LineNumber: 1, 3577 + LineStart: 0, 3578 + LineEnd: 14, 3579 + } 3580 + 3581 + if !reflect.DeepEqual(got, want) { 3582 + t.Errorf("got %#v, want %#v", got, want) 3583 + } 3584 + }) 3585 + 3586 + t.Run("ChunkMatches", func(t *testing.T) { 3587 + sres := searchForTest(t, b, 3588 + &query.Regexp{ 3589 + Regexp: mustParseRE("\\bthe\\b"), 3590 + CaseSensitive: true, 3591 + }, chunkOpts) 3592 + 3593 + if len(sres.Files) != 1 || len(sres.Files[0].ChunkMatches) != 1 { 3594 + t.Fatalf("got %v, want 1 match in 1 file", sres.Files) 3595 + } 3596 + 3597 + if sres.Stats.RegexpsConsidered != 0 { 3598 + t.Fatal("expected regexp to be skipped") 3599 + } 3600 + 3601 + got := sres.Files[0].ChunkMatches[0] 3602 + want := 
ChunkMatch{ 3603 + Content: content, 3604 + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 3605 + Ranges: []Range{{ 3606 + Start: Location{ByteOffset: 7, LineNumber: 1, Column: 8}, 3607 + End: Location{ByteOffset: 10, LineNumber: 1, Column: 11}, 3608 + }}, 3609 + } 3610 + 3611 + if diff := cmp.Diff(want, got); diff != "" { 3612 + t.Fatal(diff) 3613 + } 3614 + }) 3615 + }

+105 -7

matchtree.go

··· 15 15 package zoekt 16 16 17 17 import ( 18 + "bytes" 18 19 "fmt" 19 20 "log" 21 + "regexp/syntax" 20 22 "strings" 21 23 "unicode/utf8" 22 24 23 25 "github.com/grafana/regexp" 26 + 24 27 "github.com/sourcegraph/zoekt/query" 25 28 ) 26 29 ··· 142 145 bruteForceMatchTree 143 146 } 144 147 148 + // \bLITERAL\b 149 + type wordMatchTree struct { 150 + word string 151 + 152 + fileName bool 153 + 154 + // mutable 155 + evaluated bool 156 + found []*candidateMatch 157 + 158 + // nextDoc, prepare. 159 + bruteForceMatchTree 160 + } 161 + 145 162 type substrMatchTree struct { 146 163 matchIterator 147 164 ··· 306 323 t.bruteForceMatchTree.prepare(doc) 307 324 } 308 325 326 + func (t *wordMatchTree) prepare(doc uint32) { 327 + t.found = t.found[:0] 328 + t.evaluated = false 329 + t.bruteForceMatchTree.prepare(doc) 330 + } 331 + 309 332 func (t *orMatchTree) prepare(doc uint32) { 310 333 for _, c := range t.children { 311 334 c.prepare(doc) ··· 419 442 return fmt.Sprintf("%sre(%s)", f, t.regexp) 420 443 } 421 444 445 + func (t *wordMatchTree) String() string { 446 + f := "" 447 + if t.fileName { 448 + f = "f" 449 + } 450 + return fmt.Sprintf("%sword(%s)", f, t.word) 451 + } 452 + 422 453 func (t *orMatchTree) String() string { 423 454 return fmt.Sprintf("or%v", t.children) 424 455 } ··· 671 702 return len(t.found) > 0, true 672 703 } 673 704 705 + func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 706 + if t.evaluated { 707 + return len(t.found) > 0, true 708 + } 709 + 710 + if cost < costRegexp { 711 + return false, false 712 + } 713 + 714 + data := cp.data(t.fileName) 715 + offset := 0 716 + found := t.found[:0] 717 + for { 718 + idx := bytes.Index(data[offset:], []byte(t.word)) 719 + if idx < 0 { 720 + break 721 + } 722 + 723 + relStartOffset := offset + idx 724 + relEndOffset := relStartOffset + len(t.word) 725 + 726 + startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || 
!characterClass(data[relStartOffset-1])) 727 + endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 728 + if startBoundary && endBoundary { 729 + found = append(found, &candidateMatch{ 730 + byteOffset: uint32(offset + idx), 731 + byteMatchSz: uint32(len(t.word)), 732 + fileName: t.fileName, 733 + }) 734 + } 735 + offset += idx + len(t.word) 736 + } 737 + 738 + t.found = found 739 + t.evaluated = true 740 + 741 + return len(t.found) > 0, true 742 + } 743 + 674 744 // breakMatchesOnNewlines returns matches resulting from breaking each element 675 745 // of cms on newlines within text. 676 746 func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { ··· 781 851 return subMT, nil 782 852 } 783 853 784 - prefix := "" 785 - if !s.CaseSensitive { 786 - prefix = "(?i)" 787 - } 854 + var tr matchTree 855 + if wmt, ok := regexpToWordMatchTree(s); ok { 856 + // A common search we get is "\bLITERAL\b". Avoid the regex engine and 857 + // provide something faster. 
858 + tr = wmt 859 + } else { 860 + prefix := "" 861 + if !s.CaseSensitive { 862 + prefix = "(?i)" 863 + } 788 864 789 - tr := &regexpMatchTree{ 790 - regexp: regexp.MustCompile(prefix + s.Regexp.String()), 791 - fileName: s.FileName, 865 + tr = &regexpMatchTree{ 866 + regexp: regexp.MustCompile(prefix + s.Regexp.String()), 867 + fileName: s.FileName, 868 + } 792 869 } 793 870 794 871 return &andMatchTree{ ··· 1050 1127 return st, nil 1051 1128 } 1052 1129 1130 + func regexpToWordMatchTree(q *query.Regexp) (_ *wordMatchTree, ok bool) { 1131 + // Needs to be case sensitive 1132 + if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1133 + return nil, false 1134 + } 1135 + // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1136 + if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1137 + return nil, false 1138 + } 1139 + sub := q.Regexp.Sub 1140 + if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1141 + return nil, false 1142 + } 1143 + 1144 + return &wordMatchTree{ 1145 + word: string(sub[1].Rune), 1146 + fileName: q.FileName, 1147 + }, true 1148 + } 1149 + 1053 1150 // pruneMatchTree removes impossible branches from the matchTree, as indicated 1054 1151 // by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1055 1152 func pruneMatchTree(mt matchTree) (matchTree, error) { ··· 1135 1232 case *docMatchTree: 1136 1233 case *bruteForceMatchTree: 1137 1234 case *regexpMatchTree: 1235 + case *wordMatchTree: 1138 1236 } 1139 1237 return mt, err 1140 1238 }

+35

matchtree_test.go

··· 20 20 21 21 "github.com/RoaringBitmap/roaring" 22 22 "github.com/grafana/regexp" 23 + 23 24 "github.com/sourcegraph/zoekt/query" 24 25 ) 25 26 ··· 189 190 t.Errorf("Expected regexpMatchTree to be skipped for query: %s", q) 190 191 } 191 192 }) 193 + } 194 + } 195 + 196 + // Test whether we skip the regexp engine for queries like "\bLITERAL\b 197 + // case:yes" 198 + func TestWordSearchSkipRegexpTree(t *testing.T) { 199 + qStr := "\\bfoo\\b case:yes" 200 + q, err := query.Parse(qStr) 201 + if err != nil { 202 + t.Fatalf("Error parsing query: %s", "sym:"+qStr) 203 + } 204 + 205 + d := &indexData{} 206 + mt, err := d.newMatchTree(q) 207 + if err != nil { 208 + t.Fatalf("Error creating match tree from query: %s", q) 209 + } 210 + 211 + countRegexMatchTree, countWordMatchTree := 0, 0 212 + visitMatchTree(mt, func(m matchTree) { 213 + switch m.(type) { 214 + case *regexpMatchTree: 215 + countRegexMatchTree++ 216 + case *wordMatchTree: 217 + countWordMatchTree++ 218 + } 219 + }) 220 + 221 + if countRegexMatchTree != 0 { 222 + t.Fatalf("expected to find 0 regexMatchTree, found %d", countRegexMatchTree) 223 + } 224 + 225 + if countWordMatchTree != 1 { 226 + t.Fatalf("expected to find 1 wordMatchTree, found %d", countWordMatchTree) 192 227 } 193 228 } 194 229

Configure Feed

Configure Feed