sort ngrams before looking them up (#617) · boltless.me/zoekt@45f608f

+1

BUILD.bazel

··· 55 55 "@com_github_rs_xid//:xid", 56 56 "@org_golang_google_protobuf//types/known/durationpb", 57 57 "@org_golang_google_protobuf//types/known/timestamppb", 58 + "@org_golang_x_exp//slices", 58 59 ] + select({ 59 60 "@io_bazel_rules_go//go/platform:aix": [ 60 61 "@org_golang_x_sys//unix",

+5 -10

bits.go

··· 106 106 } 107 107 108 108 type runeNgramOff struct { 109 - ngram ngram 110 - byteSize uint32 // size of ngram 111 - byteOff uint32 112 - runeOff uint32 109 + ngram ngram 110 + // index is the original index inside of the returned array of splitNGrams 111 + index uint32 113 112 } 114 113 115 114 func splitNGrams(str []byte) []runeNgramOff { ··· 120 119 result := make([]runeNgramOff, 0, len(str)) 121 120 var i uint32 122 121 123 - chars := -1 124 122 for len(str) > 0 { 125 - chars++ 126 123 r, sz := utf8.DecodeRune(str) 127 124 str = str[sz:] 128 125 runeGram[0] = runeGram[1] ··· 139 136 140 137 ng := runesToNGram(runeGram) 141 138 result = append(result, runeNgramOff{ 142 - ngram: ng, 143 - byteSize: i - off[0], 144 - byteOff: off[0], 145 - runeOff: uint32(chars), 139 + ngram: ng, 140 + index: uint32(len(result)), 146 141 }) 147 142 } 148 143 return result

+2 -2

deps.bzl

··· 2655 2655 name = "org_golang_x_exp", 2656 2656 build_file_proto_mode = "disable_global", 2657 2657 importpath = "golang.org/x/exp", 2658 - sum = "h1:A1gGSx58LAGVHUUsOf7IiR0u8Xb6W51gRwfDBhkdcaw=", 2659 - version = "v0.0.0-20191030013958-a1ab85dbe136", 2658 + sum = "h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=", 2659 + version = "v0.0.0-20230713183714-613f0c0eb8a1", 2660 2660 ) 2661 2661 go_repository( 2662 2662 name = "org_golang_x_image",

+1

go.mod

··· 46 46 go.opentelemetry.io/otel/trace v1.16.0 47 47 go.uber.org/atomic v1.11.0 48 48 go.uber.org/automaxprocs v1.5.2 49 + golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 49 50 golang.org/x/net v0.11.0 50 51 golang.org/x/oauth2 v0.9.0 51 52 golang.org/x/sync v0.3.0

+2

go.sum

··· 368 368 golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 369 369 golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 370 370 golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= 371 + golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw= 372 + golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= 371 373 golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= 372 374 golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= 373 375 golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=

+23 -12

indexdata.go

··· 23 23 "unicode/utf8" 24 24 25 25 "github.com/sourcegraph/zoekt/query" 26 + "golang.org/x/exp/slices" 26 27 ) 27 28 28 29 // indexData holds the pattern-independent data that we have to have ··· 381 382 382 383 // Find the 2 least common ngrams from the string. 383 384 ngramOffs := splitNGrams([]byte(query.Pattern)) 385 + // PERF: Sort to increase the chances adjacent checks are in the same btree 386 + // bucket (which can cause disk IO). 387 + slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool { 388 + return a.ngram < b.ngram 389 + }) 384 390 frequencies := make([]uint32, 0, len(ngramOffs)) 385 391 ngramLookups := 0 386 392 for _, o := range ngramOffs { ··· 408 414 409 415 frequencies = append(frequencies, freq) 410 416 } 411 - firstI := firstMinarg(frequencies) 412 - frequencies[firstI] = maxUInt32 413 - lastI := lastMinarg(frequencies) 414 - if firstI > lastI { 415 - lastI, firstI = firstI, lastI 417 + 418 + var first, last runeNgramOff 419 + { 420 + firstI := firstMinarg(frequencies) 421 + frequencies[firstI] = maxUInt32 422 + lastI := lastMinarg(frequencies) 423 + first = ngramOffs[firstI] 424 + last = ngramOffs[lastI] 425 + if first.index > last.index { 426 + last, first = first, last 427 + } 416 428 } 417 429 418 - firstNG := ngramOffs[firstI].ngram 419 - lastNG := ngramOffs[lastI].ngram 420 430 iter := &ngramDocIterator{ 421 - leftPad: firstI, 422 - rightPad: uint32(utf8.RuneCountInString(str)) - firstI, 431 + leftPad: first.index, 432 + rightPad: uint32(utf8.RuneCountInString(str)) - first.index, 423 433 ngramLookups: ngramLookups, 424 434 } 425 435 if query.FileName { ··· 428 438 iter.ends = d.fileEndRunes 429 439 } 430 440 431 - if firstI != lastI { 432 - i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName) 441 + if first != last { 442 + runeDist := last.index - first.index 443 + i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName) 433 444 if err != nil { 434 445 return nil, err 435 446 } 436 447 437 448 iter.iter = i 438 449 } else { 439 - hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName) 450 + hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName) 440 451 if err != nil { 441 452 return nil, err 442 453 }

Configure Feed

Configure Feed