Create `newlines` type (#363) · boltless.me/zoekt@f846f01

+43 -10

contentprovider.go

··· 64 64 return p._sects 65 65 } 66 66 67 - func (p *contentProvider) newlines() []uint32 { 67 + func (p *contentProvider) newlines() newlines { 68 68 if p._nl == nil { 69 69 var sz uint32 70 70 p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf) 71 71 p._nlBuf = p._nl 72 72 p.stats.ContentBytesLoaded += int64(sz) 73 73 } 74 - return p._nl 74 + return newlines{locs: p._nl, fileSize: p.fileSize} 75 75 } 76 76 77 77 func (p *contentProvider) data(fileName bool) []byte { ··· 167 167 var result []LineMatch 168 168 for len(ms) > 0 { 169 169 m := ms[0] 170 - num, lineStart, lineEnd := m.line(p.newlines(), p.fileSize) 170 + num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) 171 171 172 172 var lineCands []*candidateMatch 173 173 ··· 215 215 finalMatch.Line = data[lineStart:lineEnd] 216 216 217 217 if numContextLines > 0 { 218 - finalMatch.Before = getLines(data, p.newlines(), num-numContextLines, num) 219 - finalMatch.After = getLines(data, p.newlines(), num+1, num+1+numContextLines) 218 + finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) 219 + finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) 220 220 } 221 221 222 222 for _, m := range lineCands { ··· 241 241 return result 242 242 } 243 243 244 + type newlines struct { 245 + // locs is the sorted set of byte offsets of the newlines in the file 246 + locs []uint32 247 + 248 + // fileSize is just the number of bytes in the file. It is stored 249 + // on this struct so we can safely know the length of the last line 250 + // in the file since not all files end in a newline. 251 + fileSize uint32 252 + } 253 + 254 + // atOffset returns the line containing the offset. If the offset lands on 255 + // the newline ending line M, we return M. The line is characterized 256 + // by its linenumber (base-1, byte index of line start, byte index of 257 + // line end). The line end is the index of a newline, or the filesize 258 + // (if matching the last line of the file.) 259 + func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { 260 + idx := sort.Search(len(nls.locs), func(n int) bool { 261 + return nls.locs[n] >= offset 262 + }) 263 + 264 + end := int(nls.fileSize) 265 + if idx < len(nls.locs) { 266 + end = int(nls.locs[idx]) 267 + } 268 + 269 + start := 0 270 + if idx > 0 { 271 + start = int(nls.locs[idx-1] + 1) 272 + } 273 + 274 + return idx + 1, start, end 275 + } 276 + 244 277 // getLines returns a slice of data containing the lines [low, high). 245 278 // low is 1-based and inclusive. high is exclusive. 246 - func getLines(data []byte, newLines []uint32, low, high int) []byte { 279 + func (nls newlines) getLines(data []byte, low, high int) []byte { 247 280 // newlines[0] is the start of the 2nd line in data. 248 281 // So adjust low and high to be based on newLines. 249 282 low -= 2 250 283 high -= 2 251 - if low >= high || high < 0 || low >= len(newLines) || len(newLines) == 0 { 284 + if low >= high || high < 0 || low >= len(nls.locs) || len(nls.locs) == 0 { 252 285 return nil 253 286 } 254 287 ··· 256 289 if low < 0 { 257 290 startIndex = 0 258 291 } else { 259 - startIndex = newLines[low] + 1 292 + startIndex = nls.locs[low] + 1 260 293 } 261 294 262 - if high >= len(newLines) { 295 + if high >= len(nls.locs) { 263 296 return data[startIndex:] 264 297 } 265 - return data[startIndex:newLines[high]] 298 + return data[startIndex:nls.locs[high]] 266 299 } 267 300 268 301 const (

+85 -7

contentprovider_test.go

··· 7 7 "github.com/google/go-cmp/cmp" 8 8 ) 9 9 10 + func getNewlines(data []byte) newlines { 11 + var locs []uint32 12 + for i, c := range data { 13 + if c == '\n' { 14 + locs = append(locs, uint32(i)) 15 + } 16 + } 17 + return newlines{ 18 + locs: locs, 19 + fileSize: uint32(len(data)), 20 + } 21 + } 22 + 10 23 func TestGetLines(t *testing.T) { 11 24 data := []byte(`one 12 25 two 13 26 three 14 27 four`) 15 28 16 - var newLines []uint32 17 - for i, c := range data { 18 - if c == '\n' { 19 - newLines = append(newLines, uint32(i)) 20 - } 21 - } 29 + newLines := getNewlines(data) 22 30 lines := bytes.Split(data, []byte{'\n'}) // TODO does split group consecutive sep? 23 31 wantGetLines := func(low, high int) []byte { 24 32 low-- ··· 41 49 for low := -1; low <= len(lines)+2; low++ { 42 50 for high := low; high <= len(lines)+2; high++ { 43 51 want := wantGetLines(low, high) 44 - got := getLines(data, newLines, low, high) 52 + got := newLines.getLines(data, low, high) 45 53 if d := cmp.Diff(string(want), string(got)); d != "" { 46 54 t.Fatal(d) 47 55 } 48 56 } 49 57 } 50 58 } 59 + 60 + func TestAtOffset(t *testing.T) { 61 + cases := []struct { 62 + data []byte 63 + offset uint32 64 + lineNumber int 65 + lineStart int 66 + lineEnd int 67 + }{{ 68 + data: []byte("0.2.4.\n7.9.11.\n"), 69 + offset: 0, 70 + lineNumber: 1, lineStart: 0, lineEnd: 6, 71 + }, { 72 + data: []byte("0.2.4.\n7.9.11.\n"), 73 + offset: 6, 74 + lineNumber: 1, lineStart: 0, lineEnd: 6, 75 + }, { 76 + data: []byte("0.2.4.\n7.9.11.\n"), 77 + offset: 2, 78 + lineNumber: 1, lineStart: 0, lineEnd: 6, 79 + }, { 80 + data: []byte("0.2.4.\n7.9.11.\n"), 81 + offset: 2, 82 + lineNumber: 1, lineStart: 0, lineEnd: 6, 83 + }, { 84 + data: []byte("0.2.4.\n7.9.11.\n"), 85 + offset: 7, 86 + lineNumber: 2, lineStart: 7, lineEnd: 14, 87 + }, { 88 + data: []byte("0.2.4.\n7.9.11.\n"), 89 + offset: 11, 90 + lineNumber: 2, lineStart: 7, lineEnd: 14, 91 + }, { 92 + data: []byte("0.2.4.\n7.9.11.\n"), 93 + offset: 15, 94 + lineNumber: 3, lineStart: 15, lineEnd: 15, 95 + }, { 96 + data: []byte("0.2.4.\n7.9.11."), 97 + offset: 7, 98 + lineNumber: 2, lineStart: 7, lineEnd: 14, 99 + }, { 100 + data: []byte("\n\n"), 101 + offset: 0, 102 + lineNumber: 1, lineStart: 0, lineEnd: 0, 103 + }, { 104 + data: []byte("\n\n"), 105 + offset: 1, 106 + lineNumber: 2, lineStart: 1, lineEnd: 1, 107 + }, { 108 + data: []byte("\n\n"), 109 + offset: 3, 110 + lineNumber: 3, lineStart: 2, lineEnd: 2, 111 + }} 112 + 113 + for _, tt := range cases { 114 + t.Run("", func(t *testing.T) { 115 + nls := getNewlines(tt.data) 116 + gotLineNumber, gotLineStart, gotLineEnd := nls.atOffset(tt.offset) 117 + if gotLineNumber != tt.lineNumber { 118 + t.Fatalf("expected line number %d, got %d", tt.lineNumber, gotLineNumber) 119 + } 120 + if gotLineStart != tt.lineStart { 121 + t.Fatalf("expected line start %d, got %d", tt.lineStart, gotLineStart) 122 + } 123 + if gotLineEnd != tt.lineEnd { 124 + t.Fatalf("expected line end %d, got %d", tt.lineEnd, gotLineEnd) 125 + } 126 + }) 127 + } 128 + }

-24

matchiter.go

··· 17 17 import ( 18 18 "bytes" 19 19 "fmt" 20 - "sort" 21 20 ) 22 21 23 22 // candidateMatch is a candidate match for a substring. ··· 57 56 m.byteMatchSz = uint32(sz) 58 57 return ok 59 58 } 60 - } 61 - 62 - // line returns the line holding the match. If the match starts with 63 - // the newline ending line M, we return M. The line is characterized 64 - // by its linenumber (base-1, byte index of line start, byte index of 65 - // line end). The line end is the index of a newline, or the filesize 66 - // (if matching the last line of the file.) 67 - func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) { 68 - idx := sort.Search(len(newlines), func(n int) bool { 69 - return newlines[n] >= m.byteOffset 70 - }) 71 - 72 - end := int(fileSize) 73 - if idx < len(newlines) { 74 - end = int(newlines[idx]) 75 - } 76 - 77 - start := 0 78 - if idx > 0 { 79 - start = int(newlines[idx-1] + 1) 80 - } 81 - 82 - return idx + 1, start, end 83 59 } 84 60 85 61 // matchIterator is a docIterator that produces candidateMatches for a given document

+1 -1

matchtree.go

··· 558 558 lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) 559 559 prev := -1 560 560 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { 561 - line, byteStart, byteEnd := candidate.line(cp.newlines(), cp.fileSize) 561 + line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) 562 562 if line == prev { 563 563 continue 564 564 }

Configure Feed

Configure Feed