chunkmatches: reuse last calculated column when filling (#711) · boltless.me/zoekt@7487a0d

+59 -3

contentprovider.go

··· 169 169 return result 170 170 } 171 171 172 + // fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. 173 + // 174 + // Performance invariant: ms is sorted and non-overlapping. 175 + // 176 + // Note: the byte slices may be backed by mmapped data, so before being 177 + // returned by the API it needs to be copied. 172 178 func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { 173 179 var result []ChunkMatch 174 180 if ms[0].fileName { ··· 290 296 291 297 func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { 292 298 newlines := p.newlines() 299 + data := p.data(false) 300 + 301 + // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. 302 + // However, it depends on ms being sorted by byteOffset and non-overlapping. 303 + // This invariant is true at the time of writing, but we conservatively 304 + // enforce this. Note: chunkCandidates preserves the sorting so safe to 305 + // transform now. 306 + columnHelper := columnHelper{data: data} 307 + if !sort.IsSorted((sortByOffsetSlice)(ms)) { 308 + log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. 
Report to developers.") 309 + sort.Sort((sortByOffsetSlice)(ms)) 310 + } 311 + 293 312 chunks := chunkCandidates(ms, newlines, numContextLines) 294 - data := p.data(false) 295 313 chunkMatches := make([]ChunkMatch, 0, len(chunks)) 296 314 for _, chunk := range chunks { 297 315 ranges := make([]Range, 0, len(chunk.candidates)) ··· 306 324 Start: Location{ 307 325 ByteOffset: startOffset, 308 326 LineNumber: uint32(startLine), 309 - Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), 327 + Column: columnHelper.get(startLineOffset, startOffset), 310 328 }, 311 329 End: Location{ 312 330 ByteOffset: endOffset, 313 331 LineNumber: uint32(endLine), 314 - Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), 332 + Column: columnHelper.get(endLineOffset, endOffset), 315 333 }, 316 334 }) 317 335 ··· 361 379 // chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent 362 380 // chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause 363 381 // it to overlap with an adjacent chunk. 382 + // 383 + // input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. 384 + // output invariants: if you flatten candidates the input invariant is retained. 364 385 func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { 365 386 var chunks []candidateChunk 366 387 for _, m := range ms { ··· 390 411 } 391 412 } 392 413 return chunks 414 + } 415 + 416 + // columnHelper is a helper struct which caches the number of runes last 417 + // counted. If we naively use utf8.RuneCount for each match on a line, this 418 + // leads to an O(nm) algorithm where m is the number of matches and n is the 419 + // length of the line. Assuming our candidates are increasing in offset 420 + // makes this operation O(n) instead. 
421 + type columnHelper struct { 422 + data []byte 423 + 424 + // 0 values for all these are valid values 425 + lastLineOffset int 426 + lastOffset uint32 427 + lastRuneCount uint32 428 + } 429 + 430 + // get returns the line column for offset. offset is the byte offset of the 431 + // rune in data. lineOffset is the byte offset inside of data for the line 432 + // containing offset. 433 + func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { 434 + var runeCount uint32 435 + 436 + if lineOffset == c.lastLineOffset && offset >= c.lastOffset { 437 + // Can count from last calculation 438 + runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) 439 + } else { 440 + // Need to count from the beginning of line 441 + runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) 442 + } 443 + 444 + c.lastLineOffset = lineOffset 445 + c.lastOffset = offset 446 + c.lastRuneCount = runeCount 447 + 448 + return runeCount + 1 393 449 } 394 450 395 451 type newlines struct {

+80

contentprovider_test.go

··· 4 4 "bytes" 5 5 "fmt" 6 6 "testing" 7 + "testing/quick" 8 + "unicode/utf8" 7 9 8 10 "github.com/google/go-cmp/cmp" 9 11 ) ··· 327 329 }) 328 330 } 329 331 } 332 + 333 + func BenchmarkColumnHelper(b *testing.B) { 334 + // We simulate looking up columns of evenly spaced matches 335 + const matches = 10_000 336 + const match = "match" 337 + const space = " " 338 + const dist = uint32(len(match) + len(space)) 339 + data := bytes.Repeat([]byte(match+space), matches) 340 + 341 + b.ResetTimer() 342 + 343 + for i := 0; i < b.N; i++ { 344 + columnHelper := columnHelper{data: data} 345 + 346 + lineOffset := 0 347 + offset := uint32(0) 348 + for offset < uint32(len(data)) { 349 + col := columnHelper.get(lineOffset, offset) 350 + if col != offset+1 { 351 + b.Fatal("column is not offset even though data is ASCII") 352 + } 353 + offset += dist 354 + } 355 + } 356 + } 357 + 358 + func TestColumnHelper(t *testing.T) { 359 + f := func(line0, line1 string) bool { 360 + data := []byte(line0 + line1) 361 + lineOffset := len(line0) 362 + 363 + columnHelper := columnHelper{data: data} 364 + 365 + // We check every second rune returns the correct answer 366 + offset := lineOffset 367 + column := 1 368 + for offset < len(data) { 369 + if column%2 == 0 { 370 + got := columnHelper.get(lineOffset, uint32(offset)) 371 + if got != uint32(column) { 372 + return false 373 + } 374 + } 375 + _, size := utf8.DecodeRune(data[offset:]) 376 + offset += size 377 + column++ 378 + } 379 + 380 + return true 381 + } 382 + 383 + if err := quick.Check(f, nil); err != nil { 384 + t.Fatal(err) 385 + } 386 + 387 + // Corner cases 388 + 389 + // empty data, shouldn't happen but just in case it slips through 390 + ch := columnHelper{data: nil} 391 + if got := ch.get(0, 0); got != 1 { 392 + t.Fatal("empty data didn't return 1", got) 393 + } 394 + 395 + // Repeating a call to get should return the same value 396 + // for the same (lineOffset, offset) pair; this case uses non-empty, two-line data 397 + ch = columnHelper{data: 
[]byte("hello\nworld")} 398 + if got := ch.get(6, 8); got != 3 { 399 + t.Fatal("unexpected value for third column on second line", got) 400 + } 401 + if got := ch.get(6, 8); got != 3 { 402 + t.Fatal("unexpected value for repeated call for third column on second line", got) 403 + } 404 + 405 + // Now make sure if we go backwards we do not incorrectly use the cache 406 + if got := ch.get(6, 6); got != 1 { 407 + t.Fatal("unexpected value for backwards call for first column on second line", got) 408 + } 409 + }

+3

eval.go

··· 332 332 } 333 333 } 334 334 335 + // Important invariant for performance: finalCands is sorted by offset and 336 + // non-overlapping. gatherMatches respects this invariant and all later 337 + // transformations respect this. 335 338 shouldMergeMatches := !opts.ChunkMatches 336 339 finalCands := gatherMatches(mt, known, shouldMergeMatches) 337 340

Configure Feed

Configure Feed