fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

matchtree: special case word search (#526)

A common search Zoekt gets from Sourcegraph is "\bLITERAL\b". With this PR, we avoid the regex engine for these types of queries and provide something faster.

Local benchmarks show that the new code runs 4.8x faster for select queries.

Co-authored-by: Stefan Hengl <stefan@sourcegraph.com>

+225 -8
+4
bits.go
··· 178 178 } 179 179 } 180 180 181 + func characterClass(c byte) bool { 182 + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' 183 + } 184 + 181 185 func marshalDocSections(secs []DocumentSection) []byte { 182 186 ints := make([]uint32, 0, len(secs)*2) 183 187 for _, s := range secs {
+3
eval.go
··· 478 478 if rmt, ok := mt.(*regexpMatchTree); ok { 479 479 cands = append(cands, rmt.found...) 480 480 } 481 + if rmt, ok := mt.(*wordMatchTree); ok { 482 + cands = append(cands, rmt.found...) 483 + } 481 484 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 482 485 cands = append(cands, smt.found...) 483 486 }
+78 -1
index_test.go
··· 1116 1116 } 1117 1117 1118 1118 func mustParseRE(s string) *syntax.Regexp { 1119 - r, err := syntax.Parse(s, 0) 1119 + r, err := syntax.Parse(s, syntax.Perl) 1120 1120 if err != nil { 1121 1121 panic(err) 1122 1122 } ··· 3536 3536 3537 3537 }) 3538 3538 } 3539 + 3540 + // This tests the frequent pattern "\bLITERAL\b". 3541 + func TestWordSearch(t *testing.T) { 3542 + content := []byte("needle the bla") 3543 + // ----------------01234567890123 3544 + 3545 + b := testIndexBuilder(t, nil, 3546 + Document{ 3547 + Name: "f1", 3548 + Content: content, 3549 + }) 3550 + 3551 + t.Run("LineMatches", func(t *testing.T) { 3552 + sres := searchForTest(t, b, 3553 + &query.Regexp{ 3554 + Regexp: mustParseRE("\\bthe\\b"), 3555 + CaseSensitive: true, 3556 + Content: true, 3557 + }) 3558 + 3559 + if len(sres.Files) != 1 || len(sres.Files[0].LineMatches) != 1 { 3560 + t.Fatalf("got %v, want 1 match in 1 file", sres.Files) 3561 + } 3562 + 3563 + if sres.Stats.RegexpsConsidered != 0 { 3564 + t.Fatal("expected regexp to be skipped") 3565 + } 3566 + 3567 + got := sres.Files[0].LineMatches[0] 3568 + want := LineMatch{ 3569 + LineFragments: []LineFragmentMatch{{ 3570 + LineOffset: 7, 3571 + Offset: 7, 3572 + MatchLength: 3, 3573 + }}, 3574 + Line: content, 3575 + FileName: false, 3576 + LineNumber: 1, 3577 + LineStart: 0, 3578 + LineEnd: 14, 3579 + } 3580 + 3581 + if !reflect.DeepEqual(got, want) { 3582 + t.Errorf("got %#v, want %#v", got, want) 3583 + } 3584 + }) 3585 + 3586 + t.Run("ChunkMatches", func(t *testing.T) { 3587 + sres := searchForTest(t, b, 3588 + &query.Regexp{ 3589 + Regexp: mustParseRE("\\bthe\\b"), 3590 + CaseSensitive: true, 3591 + }, chunkOpts) 3592 + 3593 + if len(sres.Files) != 1 || len(sres.Files[0].ChunkMatches) != 1 { 3594 + t.Fatalf("got %v, want 1 match in 1 file", sres.Files) 3595 + } 3596 + 3597 + if sres.Stats.RegexpsConsidered != 0 { 3598 + t.Fatal("expected regexp to be skipped") 3599 + } 3600 + 3601 + got := sres.Files[0].ChunkMatches[0] 3602 + want := 
ChunkMatch{ 3603 + Content: content, 3604 + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, 3605 + Ranges: []Range{{ 3606 + Start: Location{ByteOffset: 7, LineNumber: 1, Column: 8}, 3607 + End: Location{ByteOffset: 10, LineNumber: 1, Column: 11}, 3608 + }}, 3609 + } 3610 + 3611 + if diff := cmp.Diff(want, got); diff != "" { 3612 + t.Fatal(diff) 3613 + } 3614 + }) 3615 + }
+105 -7
matchtree.go
··· 15 15 package zoekt 16 16 17 17 import ( 18 + "bytes" 18 19 "fmt" 19 20 "log" 21 + "regexp/syntax" 20 22 "strings" 21 23 "unicode/utf8" 22 24 23 25 "github.com/grafana/regexp" 26 + 24 27 "github.com/sourcegraph/zoekt/query" 25 28 ) 26 29 ··· 142 145 bruteForceMatchTree 143 146 } 144 147 148 + // \bLITERAL\b 149 + type wordMatchTree struct { 150 + word string 151 + 152 + fileName bool 153 + 154 + // mutable 155 + evaluated bool 156 + found []*candidateMatch 157 + 158 + // nextDoc, prepare. 159 + bruteForceMatchTree 160 + } 161 + 145 162 type substrMatchTree struct { 146 163 matchIterator 147 164 ··· 306 323 t.bruteForceMatchTree.prepare(doc) 307 324 } 308 325 326 + func (t *wordMatchTree) prepare(doc uint32) { 327 + t.found = t.found[:0] 328 + t.evaluated = false 329 + t.bruteForceMatchTree.prepare(doc) 330 + } 331 + 309 332 func (t *orMatchTree) prepare(doc uint32) { 310 333 for _, c := range t.children { 311 334 c.prepare(doc) ··· 419 442 return fmt.Sprintf("%sre(%s)", f, t.regexp) 420 443 } 421 444 445 + func (t *wordMatchTree) String() string { 446 + f := "" 447 + if t.fileName { 448 + f = "f" 449 + } 450 + return fmt.Sprintf("%sword(%s)", f, t.word) 451 + } 452 + 422 453 func (t *orMatchTree) String() string { 423 454 return fmt.Sprintf("or%v", t.children) 424 455 } ··· 671 702 return len(t.found) > 0, true 672 703 } 673 704 705 + func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { 706 + if t.evaluated { 707 + return len(t.found) > 0, true 708 + } 709 + 710 + if cost < costRegexp { 711 + return false, false 712 + } 713 + 714 + data := cp.data(t.fileName) 715 + offset := 0 716 + found := t.found[:0] 717 + for { 718 + idx := bytes.Index(data[offset:], []byte(t.word)) 719 + if idx < 0 { 720 + break 721 + } 722 + 723 + relStartOffset := offset + idx 724 + relEndOffset := relStartOffset + len(t.word) 725 + 726 + startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || 
!characterClass(data[relStartOffset-1])) 727 + endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset])) 728 + if startBoundary && endBoundary { 729 + found = append(found, &candidateMatch{ 730 + byteOffset: uint32(offset + idx), 731 + byteMatchSz: uint32(len(t.word)), 732 + fileName: t.fileName, 733 + }) 734 + } 735 + offset += idx + len(t.word) 736 + } 737 + 738 + t.found = found 739 + t.evaluated = true 740 + 741 + return len(t.found) > 0, true 742 + } 743 + 674 744 // breakMatchesOnNewlines returns matches resulting from breaking each element 675 745 // of cms on newlines within text. 676 746 func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch { ··· 781 851 return subMT, nil 782 852 } 783 853 784 - prefix := "" 785 - if !s.CaseSensitive { 786 - prefix = "(?i)" 787 - } 854 + var tr matchTree 855 + if wmt, ok := regexpToWordMatchTree(s); ok { 856 + // A common search we get is "\bLITERAL\b". Avoid the regex engine and 857 + // provide something faster. 
858 + tr = wmt 859 + } else { 860 + prefix := "" 861 + if !s.CaseSensitive { 862 + prefix = "(?i)" 863 + } 788 864 789 - tr := &regexpMatchTree{ 790 - regexp: regexp.MustCompile(prefix + s.Regexp.String()), 791 - fileName: s.FileName, 865 + tr = &regexpMatchTree{ 866 + regexp: regexp.MustCompile(prefix + s.Regexp.String()), 867 + fileName: s.FileName, 868 + } 792 869 } 793 870 794 871 return &andMatchTree{ ··· 1050 1127 return st, nil 1051 1128 } 1052 1129 1130 + func regexpToWordMatchTree(q *query.Regexp) (_ *wordMatchTree, ok bool) { 1131 + // Needs to be case sensitive 1132 + if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 { 1133 + return nil, false 1134 + } 1135 + // We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary] 1136 + if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 { 1137 + return nil, false 1138 + } 1139 + sub := q.Regexp.Sub 1140 + if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary { 1141 + return nil, false 1142 + } 1143 + 1144 + return &wordMatchTree{ 1145 + word: string(sub[1].Rune), 1146 + fileName: q.FileName, 1147 + }, true 1148 + } 1149 + 1053 1150 // pruneMatchTree removes impossible branches from the matchTree, as indicated 1054 1151 // by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth. 1055 1152 func pruneMatchTree(mt matchTree) (matchTree, error) { ··· 1135 1232 case *docMatchTree: 1136 1233 case *bruteForceMatchTree: 1137 1234 case *regexpMatchTree: 1235 + case *wordMatchTree: 1138 1236 } 1139 1237 return mt, err 1140 1238 }
+35
matchtree_test.go
··· 20 20 21 21 "github.com/RoaringBitmap/roaring" 22 22 "github.com/grafana/regexp" 23 + 23 24 "github.com/sourcegraph/zoekt/query" 24 25 ) 25 26 ··· 189 190 t.Errorf("Expected regexpMatchTree to be skipped for query: %s", q) 190 191 } 191 192 }) 193 + } 194 + } 195 + 196 + // Test whether we skip the regexp engine for queries like "\bLITERAL\b 197 + // case:yes" 198 + func TestWordSearchSkipRegexpTree(t *testing.T) { 199 + qStr := "\\bfoo\\b case:yes" 200 + q, err := query.Parse(qStr) 201 + if err != nil { 202 + t.Fatalf("Error parsing query: %s", "sym:"+qStr) 203 + } 204 + 205 + d := &indexData{} 206 + mt, err := d.newMatchTree(q) 207 + if err != nil { 208 + t.Fatalf("Error creating match tree from query: %s", q) 209 + } 210 + 211 + countRegexMatchTree, countWordMatchTree := 0, 0 212 + visitMatchTree(mt, func(m matchTree) { 213 + switch m.(type) { 214 + case *regexpMatchTree: 215 + countRegexMatchTree++ 216 + case *wordMatchTree: 217 + countWordMatchTree++ 218 + } 219 + }) 220 + 221 + if countRegexMatchTree != 0 { 222 + t.Fatalf("expected to find 0 regexMatchTree, found %d", countRegexMatchTree) 223 + } 224 + 225 + if countWordMatchTree != 1 { 226 + t.Fatalf("expected to find 1 wordMatchTree, found %d", countWordMatchTree) 192 227 } 193 228 } 194 229