fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

chunkmatches: reuse last calculated column when filling (#711)

This change uses the fact that candidate matches should be increasing in byte
offset, to avoid recounting runes on a line. Before this change if you have
many matches on the same line we would call `utf8.RuneCount` for each match,
which is a `O(nm)` algorithm where `n` is your line length and `m` is the
number of matches. After this change the complexity is `O(n)`.

I came across this while investigating slow performance for searching the
string "dev" on s2 taking 2s if the match limits were 100k instead of 10k.
With 10k it would take 0.04s. It turns out with the larger limit we ended up
searching a file where the word dev appeared many times on one line. Running a
profiler against the service came up with 96% of CPU time in `utf8.RuneCount`.

This commit adds a benchmark for the helper introduced to reuse RuneCounts.
Unsurprisingly the difference is massive between `O(nm)` and `O(n)` :)

name old time/op new time/op delta
ColumnHelper-32 299ms ± 2% 0ms ± 2% -99.97% (p=0.000 n=10+10)

Test Plan: Added tests and benchmarks.

+142 -3
+59 -3
contentprovider.go
··· 169 169 return result 170 170 } 171 171 172 + // fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 173 + // 174 + // Performance invariant: ms is sorted and non-overlapping. 175 + // 176 + // Note: the byte slices may be backed by mmapped data, so before being 177 + // returned by the API it needs to be copied. 172 178 func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 173 179 var result []ChunkMatch 174 180 if ms[0].fileName { ··· 290 296 291 297 func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 292 298 newlines := p.newlines() 299 + data := p.data(false) 300 + 301 + // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 302 + // However, it depends on ms being sorted by byteOffset and non-overlapping. 303 + // This invariant is true at the time of writing, but we conservatively 304 + // enforce this. Note: chunkCandidates preserves the sorting so safe to 305 + // transform now. 306 + columnHelper := columnHelper{data: data} 307 + if !sort.IsSorted((sortByOffsetSlice)(ms)) { 308 + log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. 
Report to developers.") 309 + sort.Sort((sortByOffsetSlice)(ms)) 310 + } 311 + 293 312 chunks := chunkCandidates(ms, newlines, numContextLines) 294 - data := p.data(false) 295 313 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 296 314 for _, chunk := range chunks { 297 315 ranges := make([]Range, 0, len(chunk.candidates)) ··· 306 324 Start: Location{ 307 325 ByteOffset: startOffset, 308 326 LineNumber: uint32(startLine), 309 - Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), 327 + Column: columnHelper.get(startLineOffset, startOffset), 310 328 }, 311 329 End: Location{ 312 330 ByteOffset: endOffset, 313 331 LineNumber: uint32(endLine), 314 - Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), 332 + Column: columnHelper.get(endLineOffset, endOffset), 315 333 }, 316 334 }) 317 335 ··· 361 379 // chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 362 380 // chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 363 381 // it to overlap with an adjacent chunk. 382 + // 383 + // input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 384 + // output invariants: if you flatten candidates the input invariant is retained. 364 385 func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 365 386 var chunks []candidateChunk 366 387 for _, m := range ms { ··· 390 411 } 391 412 } 392 413 return chunks 414 + } 415 + 416 + // columnHelper is a helper struct which caches the number of runes last 417 + // counted. If we naively use utf8.RuneCount for each match on a line, this 418 + // leads to an O(nm) algorithm where m is the number of matches and n is the 419 + // length of the line. Aassuming we our candidates are increasing in offset 420 + // makes this operation O(n) instead. 
421 + type columnHelper struct { 422 + data []byte 423 + 424 + // 0 values for all these are valid values 425 + lastLineOffset int 426 + lastOffset uint32 427 + lastRuneCount uint32 428 + } 429 + 430 + // get returns the line column for offset. offset is the byte offset of the 431 + // rune in data. lineOffset is the byte offset inside of data for the line 432 + // containing offset. 433 + func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 434 + var runeCount uint32 435 + 436 + if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 437 + // Can count from last calculation 438 + runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 439 + } else { 440 + // Need to count from the beginning of line 441 + runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 442 + } 443 + 444 + c.lastLineOffset = lineOffset 445 + c.lastOffset = offset 446 + c.lastRuneCount = runeCount 447 + 448 + return runeCount + 1 393 449 } 394 450 395 451 type newlines struct {
+80
contentprovider_test.go
··· 4 4 "bytes" 5 5 "fmt" 6 6 "testing" 7 + "testing/quick" 8 + "unicode/utf8" 7 9 8 10 "github.com/google/go-cmp/cmp" 9 11 ) ··· 327 329 }) 328 330 } 329 331 } 332 + 333 + func BenchmarkColumnHelper(b *testing.B) { 334 + // We simulate looking up columns of evenly spaced matches 335 + const matches = 10_000 336 + const match = "match" 337 + const space = " " 338 + const dist = uint32(len(match) + len(space)) 339 + data := bytes.Repeat([]byte(match+space), matches) 340 + 341 + b.ResetTimer() 342 + 343 + for i := 0; i < b.N; i++ { 344 + columnHelper := columnHelper{data: data} 345 + 346 + lineOffset := 0 347 + offset := uint32(0) 348 + for offset < uint32(len(data)) { 349 + col := columnHelper.get(lineOffset, offset) 350 + if col != offset+1 { 351 + b.Fatal("column is not offset even though data is ASCII") 352 + } 353 + offset += dist 354 + } 355 + } 356 + } 357 + 358 + func TestColumnHelper(t *testing.T) { 359 + f := func(line0, line1 string) bool { 360 + data := []byte(line0 + line1) 361 + lineOffset := len(line0) 362 + 363 + columnHelper := columnHelper{data: data} 364 + 365 + // We check every second rune returns the correct answer 366 + offset := lineOffset 367 + column := 1 368 + for offset < len(data) { 369 + if column%2 == 0 { 370 + got := columnHelper.get(lineOffset, uint32(offset)) 371 + if got != uint32(column) { 372 + return false 373 + } 374 + } 375 + _, size := utf8.DecodeRune(data[offset:]) 376 + offset += size 377 + column++ 378 + } 379 + 380 + return true 381 + } 382 + 383 + if err := quick.Check(f, nil); err != nil { 384 + t.Fatal(err) 385 + } 386 + 387 + // Corner cases 388 + 389 + // empty data, shouldn't happen but just in case it slips through 390 + ch := columnHelper{data: nil} 391 + if got := ch.get(0, 0); got != 1 { 392 + t.Fatal("empty data didn't return 1", got) 393 + } 394 + 395 + // Repeating a call to get should return the same value 396 + // empty data, shouldn't happen but just in case it slips through 397 + ch = columnHelper{data: 
[]byte("hello\nworld")} 398 + if got := ch.get(6, 8); got != 3 { 399 + t.Fatal("unexpected value for third column on second line", got) 400 + } 401 + if got := ch.get(6, 8); got != 3 { 402 + t.Fatal("unexpected value for repeated call for third column on second line", got) 403 + } 404 + 405 + // Now make sure if we go backwards we do not incorrectly use the cache 406 + if got := ch.get(6, 6); got != 1 { 407 + t.Fatal("unexpected value for backwards call for first column on second line", got) 408 + } 409 + }
+3
eval.go
··· 332 332 } 333 333 } 334 334 335 + // Important invariant for performance: finalCands is sorted by offset and 336 + // non-overlapping. gatherMatches respects this invariant and all later 337 + // transformations respect this. 335 338 shouldMergeMatches := !opts.ChunkMatches 336 339 finalCands := gatherMatches(mt, known, shouldMergeMatches) 337 340