codesearch: initial check-in for library. · boltless.me/zoekt@c1c38a5

+94

index.go

··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package codesearch 16 + 17 + import ( 18 + "fmt" 19 + "log" 20 + ) 21 + 22 + var _ = log.Println 23 + 24 + const NGRAM = 3 25 + 26 + type fileEntry struct { 27 + content []byte 28 + name string 29 + offset uint32 30 + } 31 + 32 + func (e *fileEntry) end() uint32 { 33 + return e.offset + uint32(len(e.content)) 34 + } 35 + 36 + type IndexBuilder struct { 37 + contentEnd uint32 38 + files []fileEntry 39 + 40 + // ngram => posting. 41 + postings map[string][]uint32 42 + } 43 + 44 + func (m *candidateMatch) String() string { 45 + return fmt.Sprintf("%d:%d", m.file, m.offset) 46 + } 47 + 48 + func NewIndexBuilder() *IndexBuilder { 49 + return &IndexBuilder{postings: make(map[string][]uint32)} 50 + } 51 + 52 + func (b *IndexBuilder) AddFile(name string, content []byte) { 53 + off := b.contentEnd 54 + for i := range content { 55 + if i+NGRAM > len(content) { 56 + break 57 + } 58 + ngram := string(content[i : i+NGRAM]) 59 + b.postings[ngram] = append(b.postings[ngram], off+uint32(i)) 60 + } 61 + b.files = append(b.files, 62 + fileEntry{ 63 + name: name, 64 + content: content, 65 + offset: b.contentEnd, 66 + }) 67 + b.contentEnd += uint32(len(content)) 68 + } 69 + 70 + func (b *IndexBuilder) search(str string) ([]candidateMatch, error) { 71 + if len(str) < NGRAM { 72 + return nil, fmt.Errorf("too short") 73 + } 74 + if len(b.files) == 0 { 75 + return nil, fmt.Errorf("no files") 76 + } 77 + 78 + first := str[:NGRAM] 79 + last := str[len(str)-NGRAM:] 80 + 81 + input := searchInput{ 82 + first: b.postings[first], 83 + last: b.postings[last], 84 + pat: str, 85 + } 86 + 87 + for _, f := range b.files { 88 + input.ends = append(input.ends, f.end()) 89 + } 90 + 91 + input.ends = append(input.ends, b.files[len(b.files)-1].end()) 92 + 93 + return input.search(), nil 94 + }

+274

index_test.go

··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package codesearch 16 + 17 + import ( 18 + "bytes" 19 + "fmt" 20 + "io" 21 + "log" 22 + "reflect" 23 + "testing" 24 + ) 25 + 26 + func TestBoundary(t *testing.T) { 27 + b := NewIndexBuilder() 28 + 29 + b.AddFile("f1", []byte("x the")) 30 + b.AddFile("f1", []byte("reader")) 31 + 32 + matches, err := b.search("there") 33 + if err != nil { 34 + t.Errorf("search: %v", err) 35 + } 36 + if len(matches) > 0 { 37 + t.Fatalf("got %v, want no matches", matches) 38 + } 39 + } 40 + 41 + var _ = log.Println 42 + 43 + func TestBasic(t *testing.T) { 44 + b := NewIndexBuilder() 45 + 46 + b.AddFile("f1", []byte("there is no water in the well")) 47 + // -------------------- 0123456789012345678901234567890123456789 48 + b.AddFile("f2", []byte("to carry water in the no later bla")) 49 + // -------------------- 0123456789012345678901234567890123456789 50 + 51 + matches, err := b.search("water") 52 + if err != nil { 53 + t.Errorf("search: %v", err) 54 + } 55 + if len(matches) != 2 { 56 + t.Fatalf("got %v, want 2 matches", matches) 57 + } 58 + 59 + got := matches[0].String() 60 + want := "0:12" 61 + if got != want { 62 + t.Errorf("0: got %s, want %s", got, want) 63 + } 64 + 65 + got = matches[1].String() 66 + want = "1:9" 67 + if got != want { 68 + t.Errorf("1: got %s, want %s", got, want) 69 + } 70 + } 71 + 72 + type memSeeker struct { 73 + data []byte 74 + off int64 75 + } 76 + 77 + func (s *memSeeker) Close() error { return nil } 78 + func (s *memSeeker) Read(b []byte) (int, error) { 79 + var err error 80 + n := int64(len(b)) + s.off 81 + if n > int64(len(s.data)) { 82 + err = io.EOF 83 + n = int64(len(s.data)) 84 + } 85 + 86 + m := copy(b, s.data[s.off:n]) 87 + s.off = n 88 + return m, err 89 + } 90 + 91 + func (s *memSeeker) Seek(off int64, whence int) (int64, error) { 92 + var n int64 93 + switch whence { 94 + case 0: 95 + n = off 96 + case 1: 97 + n = s.off + off 98 + case 2: 99 + n = int64(len(s.data)) + off 100 + } 101 + 102 + if n > int64(len(s.data)) || n < 0 { 103 + return s.off, fmt.Errorf("out of range") 104 + } 105 + s.off = n 106 + return s.off, nil 107 + } 108 + 109 + func TestNewlines(t *testing.T) { 110 + b := NewIndexBuilder() 111 + b.AddFile("filename", []byte("line1\nline2\nbla")) 112 + //----------------------------012345 678901 23456 113 + 114 + var buf bytes.Buffer 115 + b.Write(&buf) 116 + f := &memSeeker{buf.Bytes(), 0} 117 + 118 + r := reader{r: f} 119 + 120 + var toc indexTOC 121 + r.readTOC(&toc) 122 + data := r.readIndexData(&toc) 123 + nls := r.readNewlines(data, 0) 124 + 125 + if want := []uint32{5, 11}; !reflect.DeepEqual(nls, want) { 126 + t.Errorf("got newlines %v, want %v", nls, want) 127 + } 128 + 129 + f = &memSeeker{buf.Bytes(), 0} 130 + 131 + searcher, err := NewSearcher(f) 132 + if err != nil { 133 + t.Fatalf("NewSearcher: %v", err) 134 + } 135 + matches, err := searcher.Search("ne2") 136 + 137 + want := []Match{{ 138 + Rank: 0, 139 + Name: "filename", 140 + Offset: 8, 141 + Line: "line2", 142 + LineNum: 2, 143 + LineOff: 2, 144 + MatchLength: 3, 145 + }} 146 + if !reflect.DeepEqual(matches, want) { 147 + t.Errorf("got %v, want %v", matches, want) 148 + } 149 + } 150 + 151 + func TestReadWrite(t *testing.T) { 152 + b := NewIndexBuilder() 153 + b.AddFile("filename", []byte("abcde")) 154 + 155 + var buf bytes.Buffer 156 + b.Write(&buf) 157 + f := &memSeeker{buf.Bytes(), 0} 158 + 159 + r := reader{r: f} 160 + 161 + var toc indexTOC 162 + r.readTOC(&toc) 163 + 164 + if r.err != nil { 165 + t.Errorf("got read error %v", r.err) 166 + } 167 + if toc.contents.sz != 5 { 168 + t.Errorf("got contents size %d, want 5", toc.contents.sz) 169 + } 170 + 171 + data := r.readIndexData(&toc) 172 + if want := []string{"filename"}; !reflect.DeepEqual(data.fileNames, want) { 173 + t.Errorf("got filenames %s, want %v", data.fileNames, want) 174 + } 175 + 176 + if want := "abcbcdcde"; want != string(data.ngramText) { 177 + t.Fatalf("got ngram text %q, want %q", data.ngramText, want) 178 + } 179 + 180 + if want := []uint32{5}; !reflect.DeepEqual(data.fileEnds, want) { 181 + t.Fatalf("got fileEnds %v, want %v", data.fileEnds, want) 182 + } 183 + 184 + if _, ok := data.findNgramIdx("bcq"); ok { 185 + t.Errorf("found nonexistent ngram") 186 + } 187 + if idx, ok := data.findNgramIdx("bcd"); !ok || idx != 1 { 188 + t.Errorf("got %v,%v want true,1", ok, idx) 189 + } 190 + 191 + got, err := r.readPostingData(data, 1) 192 + if err != nil { 193 + t.Errorf("readPostingData: %V", err) 194 + } 195 + 196 + if want := []uint32{1}; !reflect.DeepEqual(got, want) { 197 + t.Errorf("got posting data %v, want %v", got, want) 198 + } 199 + } 200 + 201 + func TestDelta(t *testing.T) { 202 + b := NewIndexBuilder() 203 + 204 + b.AddFile("f1", []byte("abc abc")) 205 + // ---------------------0123456 206 + var buf bytes.Buffer 207 + b.Write(&buf) 208 + f := &memSeeker{buf.Bytes(), 0} 209 + 210 + r := reader{r: f} 211 + 212 + var toc indexTOC 213 + r.readTOC(&toc) 214 + data := r.readIndexData(&toc) 215 + 216 + idx, ok := data.findNgramIdx("abc") 217 + if !ok { 218 + t.Errorf("did not find ngram") 219 + } 220 + 221 + got, err := r.readPostingData(data, idx) 222 + if err != nil { 223 + t.Errorf("readPostingData: %V", err) 224 + } 225 + 226 + if want := []uint32{0, 4}; !reflect.DeepEqual(got, want) { 227 + t.Errorf("got posting data %v, want %v", got, want) 228 + } 229 + } 230 + 231 + func TestFileBasedSearch(t *testing.T) { 232 + b := NewIndexBuilder() 233 + 234 + c1 := []byte("I love bananas without skin") 235 + // -----------0123456789012345678901234567890123456789 236 + b.AddFile("f1", c1) 237 + c2 := []byte("In Dutch, ananas means pineapple") 238 + // -----------0123456789012345678901234567890123456789 239 + b.AddFile("f2", c2) 240 + 241 + var buf bytes.Buffer 242 + b.Write(&buf) 243 + f := &memSeeker{buf.Bytes(), 0} 244 + 245 + searcher, err := NewSearcher(f) 246 + if err != nil { 247 + t.Fatalf("NewSearcher: %v", err) 248 + } 249 + matches, err := searcher.Search("ananas") 250 + if err != nil { 251 + t.Fatalf("Search: %v", err) 252 + } 253 + 254 + want := []Match{{ 255 + Rank: 0, 256 + Name: "f1", 257 + Offset: 8, 258 + Line: string(c1), 259 + LineNum: 1, 260 + LineOff: 8, 261 + MatchLength: 6, 262 + }, { 263 + Rank: 1, 264 + Name: "f2", 265 + Line: string(c2), 266 + LineNum: 1, 267 + LineOff: 10, 268 + Offset: 10, 269 + MatchLength: 6, 270 + }} 271 + if !reflect.DeepEqual(matches, want) { 272 + t.Errorf("got matches %#v, want %#v", matches, want) 273 + } 274 + }

+393

read.go

··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package codesearch 16 + 17 + import ( 18 + "bytes" 19 + "encoding/binary" 20 + "fmt" 21 + "io" 22 + "log" 23 + "os" 24 + "path/filepath" 25 + "sort" 26 + ) 27 + 28 + var _ = log.Println 29 + 30 + type reader struct { 31 + r ReadSeekCloser 32 + err error 33 + } 34 + 35 + func (r *reader) readSection(s *section) { 36 + s.off = r.U32() 37 + s.sz = r.U32() 38 + } 39 + 40 + func (r *reader) U32() uint32 { 41 + if r.err != nil { 42 + return 0 43 + } 44 + var b [4]byte 45 + _, r.err = r.r.Read(b[:]) 46 + return binary.BigEndian.Uint32(b[:]) 47 + } 48 + 49 + func (r *reader) readTOC(toc *indexTOC) { 50 + if r.err != nil { 51 + return 52 + } 53 + 54 + r.r.Seek(-8, 2) 55 + var tocSection section 56 + r.readSection(&tocSection) 57 + _, r.err = r.r.Seek(int64(tocSection.off), 0) 58 + for _, s := range toc.sections() { 59 + r.readSection(s) 60 + } 61 + } 62 + 63 + type ngramText []byte 64 + 65 + func (t ngramText) get(i int) []byte { 66 + return t[i*NGRAM : (i+1)*NGRAM] 67 + } 68 + func (t ngramText) length() int { 69 + return len(t) / NGRAM 70 + } 71 + 72 + // indexData holds the pattern independent data that we have to have 73 + // in memory to search. 74 + type indexData struct { 75 + ngramText ngramText 76 + ngramFrequencies []uint32 77 + postingIndex []uint32 78 + newlinesIndex []uint32 79 + 80 + // offsets of file contents. Includes end of last file. 81 + boundaries []uint32 82 + fileEnds []uint32 83 + fileNames []string 84 + } 85 + 86 + func (d *indexData) findNgramIdx(ngram string) (uint32, bool) { 87 + asBytes := []byte(ngram) 88 + idx := sort.Search(d.ngramText.length(), func(j int) bool { 89 + return bytes.Compare(d.ngramText.get(j), asBytes) >= 0 90 + }) 91 + if idx == d.ngramText.length() { 92 + return 0, false 93 + } 94 + if bytes.Compare(asBytes, d.ngramText.get(idx)) != 0 { 95 + return 0, false 96 + } 97 + return uint32(idx), true 98 + } 99 + 100 + func (r *reader) readSectionBlob(sec section) []byte { 101 + d := make([]byte, sec.sz) 102 + r.r.Seek(int64(sec.off), 0) 103 + _, r.err = r.r.Read(d) 104 + return d 105 + } 106 + 107 + func (r *reader) readSectionU32(sec section) []uint32 { 108 + blob := r.readSectionBlob(sec) 109 + arr := make([]uint32, 0, len(blob)/4) 110 + for len(blob) > 0 { 111 + arr = append(arr, binary.BigEndian.Uint32(blob)) 112 + blob = blob[4:] 113 + } 114 + return arr 115 + } 116 + 117 + func (r *reader) readIndexData(toc *indexTOC) *indexData { 118 + if r.err != nil { 119 + return nil 120 + } 121 + 122 + textContent := r.readSectionBlob(toc.ngramText) 123 + d := indexData{ 124 + ngramText: ngramText(textContent), 125 + ngramFrequencies: r.readSectionU32(toc.ngramFrequencies), 126 + postingIndex: r.readSectionU32(toc.postingsIndex), 127 + boundaries: r.readSectionU32(toc.contentBoundaries), 128 + newlinesIndex: r.readSectionU32(toc.newlinesIndex), 129 + } 130 + 131 + d.boundaries = append(d.boundaries, d.boundaries[0]+toc.contents.sz) 132 + d.postingIndex = append(d.postingIndex, toc.postings.off+toc.postings.sz) 133 + d.fileEnds = make([]uint32, 0, len(d.boundaries)) 134 + d.newlinesIndex = append(d.newlinesIndex, toc.newlines.off+toc.newlines.sz) 135 + for _, b := range d.boundaries[1:] { 136 + d.fileEnds = append(d.fileEnds, b-d.boundaries[0]) 137 + } 138 + 139 + fnBlob := r.readSectionBlob(toc.names) 140 + fnIndex := r.readSectionU32(toc.nameIndex) 141 + for i, n := range fnIndex { 142 + end := toc.names.sz 143 + if i < len(fnIndex)-1 { 144 + end = fnIndex[i+1] - fnIndex[0] 145 + } 146 + n -= fnIndex[0] 147 + d.fileNames = append(d.fileNames, string(fnBlob[n:end])) 148 + } 149 + return &d 150 + } 151 + 152 + func (r *reader) readContents(d *indexData, i uint32) []byte { 153 + return r.readSectionBlob(section{ 154 + off: d.boundaries[i], 155 + sz: d.boundaries[i+1] - d.boundaries[i], 156 + }) 157 + } 158 + 159 + func (r *reader) readNewlines(d *indexData, i uint32) []uint32 { 160 + blob := r.readSectionBlob(section{ 161 + off: d.newlinesIndex[i], 162 + sz: d.newlinesIndex[i+1] - d.newlinesIndex[i], 163 + }) 164 + last := -1 165 + 166 + var res []uint32 167 + for len(blob) > 0 { 168 + delta, m := binary.Uvarint(blob) 169 + next := int(delta) + last 170 + res = append(res, uint32(next)) 171 + last = next 172 + blob = blob[m:] 173 + } 174 + 175 + return res 176 + } 177 + 178 + func (r *reader) readPostingData(d *indexData, idx uint32) ([]uint32, error) { 179 + sec := section{ 180 + off: d.postingIndex[idx], 181 + sz: d.postingIndex[idx+1] - d.postingIndex[idx], 182 + } 183 + 184 + data := r.readSectionBlob(sec) 185 + if r.err != nil { 186 + return nil, r.err 187 + } 188 + var ps []uint32 189 + var last uint32 190 + for len(data) > 0 { 191 + delta, m := binary.Uvarint(data) 192 + offset := last + uint32(delta) 193 + last = offset 194 + data = data[m:] 195 + ps = append(ps, offset) 196 + } 197 + return ps, nil 198 + } 199 + 200 + func (r *reader) readSearch(data *indexData, str string) (*searchInput, error) { 201 + if len(str) < NGRAM { 202 + return nil, fmt.Errorf("patter must be at least %d bytes", NGRAM) 203 + } 204 + 205 + input := &searchInput{ 206 + pat: str, 207 + } 208 + 209 + firstIdx, ok := data.findNgramIdx(str[:NGRAM]) 210 + if !ok { 211 + return input, nil 212 + } 213 + lastIdx, ok := data.findNgramIdx(str[len(str)-NGRAM:]) 214 + if !ok { 215 + return input, nil 216 + } 217 + 218 + var err error 219 + input.first, err = r.readPostingData(data, firstIdx) 220 + if err != nil { 221 + return nil, err 222 + } 223 + input.last, err = r.readPostingData(data, lastIdx) 224 + if err != nil { 225 + return nil, err 226 + } 227 + input.ends = data.fileEnds 228 + return input, nil 229 + } 230 + 231 + type Searcher interface { 232 + Search(pat string) ([]Match, error) 233 + Close() error 234 + } 235 + 236 + type searcher struct { 237 + reader reader 238 + indexData *indexData 239 + } 240 + 241 + func (s *searcher) Close() error { 242 + return s.reader.r.Close() 243 + } 244 + 245 + type ReadSeekCloser interface { 246 + io.ReadSeeker 247 + io.Closer 248 + } 249 + 250 + func NewSearcher(r ReadSeekCloser) (Searcher, error) { 251 + s := &searcher{ 252 + reader: reader{r: r}, 253 + } 254 + var toc indexTOC 255 + s.reader.readTOC(&toc) 256 + s.indexData = s.reader.readIndexData(&toc) 257 + if s.reader.err != nil { 258 + return nil, s.reader.err 259 + } 260 + return s, nil 261 + } 262 + 263 + type Match struct { 264 + // Ranking; the lower, the better. 265 + Rank int 266 + Line string 267 + LineNum int 268 + LineOff int 269 + 270 + Name string 271 + Offset uint32 272 + MatchLength int 273 + } 274 + 275 + func (s *searcher) Search(pat string) ([]Match, error) { 276 + input, err := s.reader.readSearch(s.indexData, pat) 277 + if err != nil { 278 + return nil, err 279 + } 280 + cands := input.search() 281 + 282 + asBytes := []byte(pat) 283 + 284 + var matches []Match 285 + lastFile := uint32(0xFFFFFFFF) 286 + var content []byte 287 + var newlines []uint32 288 + for _, c := range cands { 289 + if lastFile != c.file { 290 + content = s.reader.readContents(s.indexData, c.file) 291 + newlines = s.reader.readNewlines(s.indexData, c.file) 292 + lastFile = c.file 293 + } 294 + 295 + if bytes.Compare(content[c.offset:c.offset+uint32(len(pat))], asBytes) == 0 { 296 + idx := sort.Search(len(newlines), func(n int) bool { 297 + return newlines[n] >= c.offset 298 + }) 299 + 300 + end := uint32(len(content)) 301 + if idx < len(newlines) { 302 + end = newlines[idx] 303 + } 304 + 305 + start := 0 306 + if idx > 0 { 307 + start = int(newlines[idx-1] + 1) 308 + } 309 + 310 + matches = append(matches, Match{ 311 + Rank: int(c.file), 312 + Offset: c.offset, 313 + Line: string(content[start:end]), 314 + LineNum: idx + 1, 315 + LineOff: int(c.offset) - start, 316 + Name: s.indexData.fileNames[c.file], 317 + MatchLength: len(pat), 318 + }) 319 + } 320 + } 321 + 322 + return matches, nil 323 + } 324 + 325 + type shardedSearcher struct { 326 + searchers []Searcher 327 + } 328 + 329 + func NewShardedSearcher(indexGlob string) (Searcher, error) { 330 + fs, err := filepath.Glob(indexGlob) 331 + if err != nil { 332 + return nil, err 333 + } 334 + 335 + if len(fs) == 0 { 336 + return nil, fmt.Errorf("glob %q does not match anything.", indexGlob) 337 + } 338 + 339 + ss := shardedSearcher{} 340 + 341 + for _, fn := range fs { 342 + f, err := os.Open(fn) 343 + if err != nil { 344 + return nil, err 345 + } 346 + 347 + s, err := NewSearcher(f) 348 + if err != nil { 349 + return nil, fmt.Errorf("NewSearcher(%s): %v", f, err) 350 + } 351 + ss.searchers = append(ss.searchers, s) 352 + } 353 + 354 + return &ss, nil 355 + } 356 + 357 + type matchSlice []Match 358 + 359 + func (m matchSlice) Len() int { return len(m) } 360 + func (m matchSlice) Less(i, j int) bool { return m[i].Rank < m[j].Rank } 361 + func (m matchSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 362 + 363 + func (ss *shardedSearcher) Close() error { 364 + for _, s := range ss.searchers { 365 + s.Close() 366 + } 367 + return nil 368 + } 369 + 370 + func (ss *shardedSearcher) Search(pat string) ([]Match, error) { 371 + type res struct { 372 + m []Match 373 + err error 374 + } 375 + all := make(chan res, len(ss.searchers)) 376 + for _, s := range ss.searchers { 377 + go func(s Searcher) { 378 + ms, err := s.Search(pat) 379 + all <- res{ms, err} 380 + }(s) 381 + } 382 + 383 + var aggregate []Match 384 + for _ = range ss.searchers { 385 + r := <-all 386 + if r.err != nil { 387 + return nil, r.err 388 + } 389 + aggregate = append(aggregate, r.m...) 390 + } 391 + sort.Sort((matchSlice)(aggregate)) 392 + return aggregate, nil 393 + }

+71

search.go

··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package codesearch 16 + 17 + type searchInput struct { 18 + pat string 19 + 20 + first []uint32 21 + last []uint32 22 + ends []uint32 23 + } 24 + 25 + type candidateMatch struct { 26 + file uint32 27 + offset uint32 28 + } 29 + 30 + func (s *searchInput) search() []candidateMatch { 31 + fileIdx := 0 32 + diff := uint32(len(s.pat) - NGRAM) 33 + 34 + var candidates []candidateMatch 35 + for { 36 + if len(s.first) == 0 || len(s.last) == 0 { 37 + break 38 + } 39 + p1 := s.first[0] 40 + p2 := s.last[0] 41 + 42 + for fileIdx < len(s.ends) && s.ends[fileIdx] <= p1 { 43 + fileIdx++ 44 + } 45 + 46 + if p1+diff < p2 { 47 + s.first = s.first[1:] 48 + } else if p1+diff > p2 { 49 + s.last = s.last[1:] 50 + } else { 51 + s.first = s.first[1:] 52 + s.last = s.last[1:] 53 + 54 + if p1+uint32(len(s.pat)) >= s.ends[fileIdx] { 55 + continue 56 + } 57 + 58 + fileStart := uint32(0) 59 + if fileIdx > 0 { 60 + fileStart += s.ends[fileIdx-1] 61 + } 62 + candidates = append(candidates, 63 + candidateMatch{ 64 + uint32(fileIdx), 65 + p1 - fileStart, 66 + }) 67 + } 68 + } 69 + 70 + return candidates 71 + }

+215

write.go

··· 1 + // Copyright 2016 Google Inc. All rights reserved. 2 + // 3 + // Licensed under the Apache License, Version 2.0 (the "License"); 4 + // you may not use this file except in compliance with the License. 5 + // You may obtain a copy of the License at 6 + // 7 + // http://www.apache.org/licenses/LICENSE-2.0 8 + // 9 + // Unless required by applicable law or agreed to in writing, software 10 + // distributed under the License is distributed on an "AS IS" BASIS, 11 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 + // See the License for the specific language governing permissions and 13 + // limitations under the License. 14 + 15 + package codesearch 16 + 17 + import ( 18 + "bufio" 19 + "encoding/binary" 20 + "io" 21 + "log" 22 + "sort" 23 + ) 24 + 25 + var _ = log.Println 26 + 27 + type writer struct { 28 + err error 29 + w io.Writer 30 + off uint32 31 + } 32 + 33 + func (w *writer) Write(b []byte) error { 34 + if w.err != nil { 35 + return w.err 36 + } 37 + 38 + var n int 39 + n, w.err = w.w.Write(b) 40 + w.off += uint32(n) 41 + return w.err 42 + } 43 + 44 + func (w *writer) Off() uint32 { return w.off } 45 + 46 + func (w *writer) B(b byte) { 47 + s := []byte{b} 48 + w.Write(s) 49 + } 50 + 51 + func (w *writer) U32(n uint32) { 52 + var enc [4]byte 53 + binary.BigEndian.PutUint32(enc[:], n) 54 + w.Write(enc[:]) 55 + } 56 + 57 + func (w *writer) Varint(n uint32) { 58 + var enc [8]byte 59 + m := binary.PutUvarint(enc[:], uint64(n)) 60 + w.Write(enc[:m]) 61 + } 62 + 63 + func (w *writer) startSection(s *section) { 64 + s.off = w.Off() 65 + } 66 + 67 + func (w *writer) endSection(s *section) { 68 + s.sz = w.Off() - s.off 69 + } 70 + 71 + type section struct { 72 + off uint32 73 + sz uint32 74 + } 75 + 76 + func (w *writer) writeSection(s *section) { 77 + w.U32(s.off) 78 + w.U32(s.sz) 79 + } 80 + 81 + type indexTOC struct { 82 + contents section 83 + contentBoundaries section 84 + newlines section 85 + newlinesIndex section 86 + ngramText section 87 + ngramFrequencies section 88 + postings section 89 + postingsIndex section 90 + names section 91 + nameIndex section 92 + } 93 + 94 + func (t *indexTOC) sections() []*section { 95 + return []*section{ 96 + &t.contents, 97 + &t.contentBoundaries, 98 + &t.newlines, 99 + &t.newlinesIndex, 100 + &t.ngramText, 101 + &t.ngramFrequencies, 102 + &t.postings, 103 + &t.postingsIndex, 104 + &t.names, 105 + &t.nameIndex, 106 + } 107 + } 108 + 109 + func (w *writer) writeTOC(toc *indexTOC) { 110 + for _, s := range toc.sections() { 111 + w.writeSection(s) 112 + } 113 + } 114 + 115 + func (b *IndexBuilder) Write(out io.Writer) error { 116 + buffered := bufio.NewWriterSize(out, 1<<20) 117 + defer buffered.Flush() 118 + 119 + w := &writer{w: buffered} 120 + toc := indexTOC{} 121 + var items []uint32 122 + w.startSection(&toc.contents) 123 + for _, f := range b.files { 124 + items = append(items, w.Off()) 125 + w.Write(f.content) 126 + } 127 + w.endSection(&toc.contents) 128 + 129 + w.startSection(&toc.contentBoundaries) 130 + for _, off := range items { 131 + w.U32(off) 132 + } 133 + w.endSection(&toc.contentBoundaries) 134 + 135 + w.startSection(&toc.newlines) 136 + items = items[:0] 137 + for _, f := range b.files { 138 + items = append(items, w.Off()) 139 + last := -1 140 + for i, c := range f.content { 141 + if c == '\n' { 142 + w.Varint(uint32(i - last)) 143 + last = i 144 + } 145 + } 146 + } 147 + w.endSection(&toc.newlines) 148 + 149 + w.startSection(&toc.newlinesIndex) 150 + for _, off := range items { 151 + w.U32(off) 152 + } 153 + w.endSection(&toc.newlinesIndex) 154 + 155 + var keys []string 156 + for k := range b.postings { 157 + keys = append(keys, k) 158 + } 159 + sort.Strings(keys) 160 + 161 + w.startSection(&toc.ngramText) 162 + for _, k := range keys { 163 + w.Write([]byte(k)) 164 + } 165 + w.endSection(&toc.ngramText) 166 + 167 + w.startSection(&toc.postings) 168 + items = items[:0] 169 + for _, k := range keys { 170 + var last uint32 171 + items = append(items, w.Off()) 172 + for _, p := range b.postings[k] { 173 + delta := p - last 174 + w.Varint(delta) 175 + last = p 176 + } 177 + } 178 + w.endSection(&toc.postings) 179 + 180 + w.startSection(&toc.ngramFrequencies) 181 + for _, k := range keys { 182 + n := uint32(len(b.postings[k])) 183 + w.U32(n) 184 + } 185 + w.endSection(&toc.ngramFrequencies) 186 + 187 + w.startSection(&toc.postingsIndex) 188 + for _, off := range items { 189 + w.U32(off) 190 + } 191 + w.endSection(&toc.postingsIndex) 192 + 193 + w.startSection(&toc.names) 194 + items = items[:0] 195 + for _, f := range b.files { 196 + items = append(items, w.Off()) 197 + w.Write([]byte(f.name)) 198 + } 199 + w.endSection(&toc.names) 200 + 201 + w.startSection(&toc.nameIndex) 202 + for _, off := range items { 203 + w.U32(off) 204 + } 205 + w.endSection(&toc.nameIndex) 206 + 207 + var tocSection section 208 + w.startSection(&tocSection) 209 + w.writeTOC(&toc) 210 + 211 + w.endSection(&tocSection) 212 + w.writeSection(&tocSection) 213 + 214 + return w.err 215 + }

Configure Feed

Configure Feed