···11+// Copyright 2016 Google Inc. All rights reserved.
22+//
33+// Licensed under the Apache License, Version 2.0 (the "License");
44+// you may not use this file except in compliance with the License.
55+// You may obtain a copy of the License at
66+//
77+// http://www.apache.org/licenses/LICENSE-2.0
88+//
99+// Unless required by applicable law or agreed to in writing, software
1010+// distributed under the License is distributed on an "AS IS" BASIS,
1111+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212+// See the License for the specific language governing permissions and
1313+// limitations under the License.
1414+1515+package codesearch
1616+1717+import (
1818+ "fmt"
1919+ "log"
2020+)
// Reference the log package so temporary debug logging can be added without
// editing the import block.
var _ = log.Println

// NGRAM is the length, in bytes, of the n-grams that are indexed and searched.
const (
	NGRAM = 3
)
// fileEntry describes one file that was added to an IndexBuilder.
type fileEntry struct {
	// content is the raw file data as handed to AddFile.
	content []byte
	// name is the file name as handed to AddFile.
	name string
	// offset is the position of the file's first byte in the concatenation
	// of all file contents.
	offset uint32
}

// end returns the offset just past the last byte of the file, i.e. the
// file's start offset plus its length.
func (e *fileEntry) end() uint32 {
	size := uint32(len(e.content))
	return e.offset + size
}
3535+3636+type IndexBuilder struct {
3737+ contentEnd uint32
3838+ files []fileEntry
3939+4040+ // ngram => posting.
4141+ postings map[string][]uint32
4242+}
4343+4444+func (m *candidateMatch) String() string {
4545+ return fmt.Sprintf("%d:%d", m.file, m.offset)
4646+}
4747+4848+func NewIndexBuilder() *IndexBuilder {
4949+ return &IndexBuilder{postings: make(map[string][]uint32)}
5050+}
5151+5252+func (b *IndexBuilder) AddFile(name string, content []byte) {
5353+ off := b.contentEnd
5454+ for i := range content {
5555+ if i+NGRAM > len(content) {
5656+ break
5757+ }
5858+ ngram := string(content[i : i+NGRAM])
5959+ b.postings[ngram] = append(b.postings[ngram], off+uint32(i))
6060+ }
6161+ b.files = append(b.files,
6262+ fileEntry{
6363+ name: name,
6464+ content: content,
6565+ offset: b.contentEnd,
6666+ })
6767+ b.contentEnd += uint32(len(content))
6868+}
6969+7070+func (b *IndexBuilder) search(str string) ([]candidateMatch, error) {
7171+ if len(str) < NGRAM {
7272+ return nil, fmt.Errorf("too short")
7373+ }
7474+ if len(b.files) == 0 {
7575+ return nil, fmt.Errorf("no files")
7676+ }
7777+7878+ first := str[:NGRAM]
7979+ last := str[len(str)-NGRAM:]
8080+8181+ input := searchInput{
8282+ first: b.postings[first],
8383+ last: b.postings[last],
8484+ pat: str,
8585+ }
8686+8787+ for _, f := range b.files {
8888+ input.ends = append(input.ends, f.end())
8989+ }
9090+9191+ input.ends = append(input.ends, b.files[len(b.files)-1].end())
9292+9393+ return input.search(), nil
9494+}
+274
index_test.go
···11+// Copyright 2016 Google Inc. All rights reserved.
22+//
33+// Licensed under the Apache License, Version 2.0 (the "License");
44+// you may not use this file except in compliance with the License.
55+// You may obtain a copy of the License at
66+//
77+// http://www.apache.org/licenses/LICENSE-2.0
88+//
99+// Unless required by applicable law or agreed to in writing, software
1010+// distributed under the License is distributed on an "AS IS" BASIS,
1111+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212+// See the License for the specific language governing permissions and
1313+// limitations under the License.
1414+1515+package codesearch
1616+1717+import (
1818+ "bytes"
1919+ "fmt"
2020+ "io"
2121+ "log"
2222+ "reflect"
2323+ "testing"
2424+)
2525+2626+func TestBoundary(t *testing.T) {
2727+ b := NewIndexBuilder()
2828+2929+ b.AddFile("f1", []byte("x the"))
3030+ b.AddFile("f1", []byte("reader"))
3131+3232+ matches, err := b.search("there")
3333+ if err != nil {
3434+ t.Errorf("search: %v", err)
3535+ }
3636+ if len(matches) > 0 {
3737+ t.Fatalf("got %v, want no matches", matches)
3838+ }
3939+}
4040+4141+var _ = log.Println
4242+4343+func TestBasic(t *testing.T) {
4444+ b := NewIndexBuilder()
4545+4646+ b.AddFile("f1", []byte("there is no water in the well"))
4747+ // -------------------- 0123456789012345678901234567890123456789
4848+ b.AddFile("f2", []byte("to carry water in the no later bla"))
4949+ // -------------------- 0123456789012345678901234567890123456789
5050+5151+ matches, err := b.search("water")
5252+ if err != nil {
5353+ t.Errorf("search: %v", err)
5454+ }
5555+ if len(matches) != 2 {
5656+ t.Fatalf("got %v, want 2 matches", matches)
5757+ }
5858+5959+ got := matches[0].String()
6060+ want := "0:12"
6161+ if got != want {
6262+ t.Errorf("0: got %s, want %s", got, want)
6363+ }
6464+6565+ got = matches[1].String()
6666+ want = "1:9"
6767+ if got != want {
6868+ t.Errorf("1: got %s, want %s", got, want)
6969+ }
7070+}
// memSeeker is an in-memory reader with Read, Seek and Close methods over a
// byte slice; the tests use it in place of an index file.
type memSeeker struct {
	data []byte
	off  int64
}

// Close does nothing and always succeeds.
func (s *memSeeker) Close() error { return nil }

// Read copies data from the current offset into b and advances the offset.
// io.EOF is returned, possibly together with the final bytes, when the
// request extends past the end of the data.
func (s *memSeeker) Read(b []byte) (int, error) {
	var err error
	n := int64(len(b)) + s.off
	if n > int64(len(s.data)) {
		err = io.EOF
		n = int64(len(s.data))
	}

	m := copy(b, s.data[s.off:n])
	s.off = n
	return m, err
}

// Seek moves the read offset following the io.Seeker convention for whence.
// An unknown whence or a target outside [0, len(data)] leaves the offset
// untouched and returns an error.
func (s *memSeeker) Seek(off int64, whence int) (int64, error) {
	var n int64
	switch whence {
	case io.SeekStart:
		n = off
	case io.SeekCurrent:
		n = s.off + off
	case io.SeekEnd:
		n = int64(len(s.data)) + off
	default:
		// Previously an unknown whence silently seeked to offset 0.
		return s.off, fmt.Errorf("invalid whence %d", whence)
	}

	if n > int64(len(s.data)) || n < 0 {
		return s.off, fmt.Errorf("out of range")
	}
	s.off = n
	return s.off, nil
}
108108+109109+func TestNewlines(t *testing.T) {
110110+ b := NewIndexBuilder()
111111+ b.AddFile("filename", []byte("line1\nline2\nbla"))
112112+ //----------------------------012345 678901 23456
113113+114114+ var buf bytes.Buffer
115115+ b.Write(&buf)
116116+ f := &memSeeker{buf.Bytes(), 0}
117117+118118+ r := reader{r: f}
119119+120120+ var toc indexTOC
121121+ r.readTOC(&toc)
122122+ data := r.readIndexData(&toc)
123123+ nls := r.readNewlines(data, 0)
124124+125125+ if want := []uint32{5, 11}; !reflect.DeepEqual(nls, want) {
126126+ t.Errorf("got newlines %v, want %v", nls, want)
127127+ }
128128+129129+ f = &memSeeker{buf.Bytes(), 0}
130130+131131+ searcher, err := NewSearcher(f)
132132+ if err != nil {
133133+ t.Fatalf("NewSearcher: %v", err)
134134+ }
135135+ matches, err := searcher.Search("ne2")
136136+137137+ want := []Match{{
138138+ Rank: 0,
139139+ Name: "filename",
140140+ Offset: 8,
141141+ Line: "line2",
142142+ LineNum: 2,
143143+ LineOff: 2,
144144+ MatchLength: 3,
145145+ }}
146146+ if !reflect.DeepEqual(matches, want) {
147147+ t.Errorf("got %v, want %v", matches, want)
148148+ }
149149+}
150150+151151+func TestReadWrite(t *testing.T) {
152152+ b := NewIndexBuilder()
153153+ b.AddFile("filename", []byte("abcde"))
154154+155155+ var buf bytes.Buffer
156156+ b.Write(&buf)
157157+ f := &memSeeker{buf.Bytes(), 0}
158158+159159+ r := reader{r: f}
160160+161161+ var toc indexTOC
162162+ r.readTOC(&toc)
163163+164164+ if r.err != nil {
165165+ t.Errorf("got read error %v", r.err)
166166+ }
167167+ if toc.contents.sz != 5 {
168168+ t.Errorf("got contents size %d, want 5", toc.contents.sz)
169169+ }
170170+171171+ data := r.readIndexData(&toc)
172172+ if want := []string{"filename"}; !reflect.DeepEqual(data.fileNames, want) {
173173+ t.Errorf("got filenames %s, want %v", data.fileNames, want)
174174+ }
175175+176176+ if want := "abcbcdcde"; want != string(data.ngramText) {
177177+ t.Fatalf("got ngram text %q, want %q", data.ngramText, want)
178178+ }
179179+180180+ if want := []uint32{5}; !reflect.DeepEqual(data.fileEnds, want) {
181181+ t.Fatalf("got fileEnds %v, want %v", data.fileEnds, want)
182182+ }
183183+184184+ if _, ok := data.findNgramIdx("bcq"); ok {
185185+ t.Errorf("found nonexistent ngram")
186186+ }
187187+ if idx, ok := data.findNgramIdx("bcd"); !ok || idx != 1 {
188188+ t.Errorf("got %v,%v want true,1", ok, idx)
189189+ }
190190+191191+ got, err := r.readPostingData(data, 1)
192192+ if err != nil {
193193+ t.Errorf("readPostingData: %V", err)
194194+ }
195195+196196+ if want := []uint32{1}; !reflect.DeepEqual(got, want) {
197197+ t.Errorf("got posting data %v, want %v", got, want)
198198+ }
199199+}
200200+201201+func TestDelta(t *testing.T) {
202202+ b := NewIndexBuilder()
203203+204204+ b.AddFile("f1", []byte("abc abc"))
205205+ // ---------------------0123456
206206+ var buf bytes.Buffer
207207+ b.Write(&buf)
208208+ f := &memSeeker{buf.Bytes(), 0}
209209+210210+ r := reader{r: f}
211211+212212+ var toc indexTOC
213213+ r.readTOC(&toc)
214214+ data := r.readIndexData(&toc)
215215+216216+ idx, ok := data.findNgramIdx("abc")
217217+ if !ok {
218218+ t.Errorf("did not find ngram")
219219+ }
220220+221221+ got, err := r.readPostingData(data, idx)
222222+ if err != nil {
223223+ t.Errorf("readPostingData: %V", err)
224224+ }
225225+226226+ if want := []uint32{0, 4}; !reflect.DeepEqual(got, want) {
227227+ t.Errorf("got posting data %v, want %v", got, want)
228228+ }
229229+}
230230+231231+func TestFileBasedSearch(t *testing.T) {
232232+ b := NewIndexBuilder()
233233+234234+ c1 := []byte("I love bananas without skin")
235235+ // -----------0123456789012345678901234567890123456789
236236+ b.AddFile("f1", c1)
237237+ c2 := []byte("In Dutch, ananas means pineapple")
238238+ // -----------0123456789012345678901234567890123456789
239239+ b.AddFile("f2", c2)
240240+241241+ var buf bytes.Buffer
242242+ b.Write(&buf)
243243+ f := &memSeeker{buf.Bytes(), 0}
244244+245245+ searcher, err := NewSearcher(f)
246246+ if err != nil {
247247+ t.Fatalf("NewSearcher: %v", err)
248248+ }
249249+ matches, err := searcher.Search("ananas")
250250+ if err != nil {
251251+ t.Fatalf("Search: %v", err)
252252+ }
253253+254254+ want := []Match{{
255255+ Rank: 0,
256256+ Name: "f1",
257257+ Offset: 8,
258258+ Line: string(c1),
259259+ LineNum: 1,
260260+ LineOff: 8,
261261+ MatchLength: 6,
262262+ }, {
263263+ Rank: 1,
264264+ Name: "f2",
265265+ Line: string(c2),
266266+ LineNum: 1,
267267+ LineOff: 10,
268268+ Offset: 10,
269269+ MatchLength: 6,
270270+ }}
271271+ if !reflect.DeepEqual(matches, want) {
272272+ t.Errorf("got matches %#v, want %#v", matches, want)
273273+ }
274274+}
+393
read.go
···11+// Copyright 2016 Google Inc. All rights reserved.
22+//
33+// Licensed under the Apache License, Version 2.0 (the "License");
44+// you may not use this file except in compliance with the License.
55+// You may obtain a copy of the License at
66+//
77+// http://www.apache.org/licenses/LICENSE-2.0
88+//
99+// Unless required by applicable law or agreed to in writing, software
1010+// distributed under the License is distributed on an "AS IS" BASIS,
1111+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212+// See the License for the specific language governing permissions and
1313+// limitations under the License.
1414+1515+package codesearch
1616+1717+import (
1818+ "bytes"
1919+ "encoding/binary"
2020+ "fmt"
2121+ "io"
2222+ "log"
2323+ "os"
2424+ "path/filepath"
2525+ "sort"
2626+)
2727+2828+var _ = log.Println
2929+3030+type reader struct {
3131+ r ReadSeekCloser
3232+ err error
3333+}
3434+3535+func (r *reader) readSection(s *section) {
3636+ s.off = r.U32()
3737+ s.sz = r.U32()
3838+}
3939+4040+func (r *reader) U32() uint32 {
4141+ if r.err != nil {
4242+ return 0
4343+ }
4444+ var b [4]byte
4545+ _, r.err = r.r.Read(b[:])
4646+ return binary.BigEndian.Uint32(b[:])
4747+}
4848+4949+func (r *reader) readTOC(toc *indexTOC) {
5050+ if r.err != nil {
5151+ return
5252+ }
5353+5454+ r.r.Seek(-8, 2)
5555+ var tocSection section
5656+ r.readSection(&tocSection)
5757+ _, r.err = r.r.Seek(int64(tocSection.off), 0)
5858+ for _, s := range toc.sections() {
5959+ r.readSection(s)
6060+ }
6161+}
6262+6363+type ngramText []byte
6464+6565+func (t ngramText) get(i int) []byte {
6666+ return t[i*NGRAM : (i+1)*NGRAM]
6767+}
6868+func (t ngramText) length() int {
6969+ return len(t) / NGRAM
7070+}
7171+7272+// indexData holds the pattern independent data that we have to have
7373+// in memory to search.
7474+type indexData struct {
7575+ ngramText ngramText
7676+ ngramFrequencies []uint32
7777+ postingIndex []uint32
7878+ newlinesIndex []uint32
7979+8080+ // offsets of file contents. Includes end of last file.
8181+ boundaries []uint32
8282+ fileEnds []uint32
8383+ fileNames []string
8484+}
8585+8686+func (d *indexData) findNgramIdx(ngram string) (uint32, bool) {
8787+ asBytes := []byte(ngram)
8888+ idx := sort.Search(d.ngramText.length(), func(j int) bool {
8989+ return bytes.Compare(d.ngramText.get(j), asBytes) >= 0
9090+ })
9191+ if idx == d.ngramText.length() {
9292+ return 0, false
9393+ }
9494+ if bytes.Compare(asBytes, d.ngramText.get(idx)) != 0 {
9595+ return 0, false
9696+ }
9797+ return uint32(idx), true
9898+}
9999+100100+func (r *reader) readSectionBlob(sec section) []byte {
101101+ d := make([]byte, sec.sz)
102102+ r.r.Seek(int64(sec.off), 0)
103103+ _, r.err = r.r.Read(d)
104104+ return d
105105+}
106106+107107+func (r *reader) readSectionU32(sec section) []uint32 {
108108+ blob := r.readSectionBlob(sec)
109109+ arr := make([]uint32, 0, len(blob)/4)
110110+ for len(blob) > 0 {
111111+ arr = append(arr, binary.BigEndian.Uint32(blob))
112112+ blob = blob[4:]
113113+ }
114114+ return arr
115115+}
116116+117117+func (r *reader) readIndexData(toc *indexTOC) *indexData {
118118+ if r.err != nil {
119119+ return nil
120120+ }
121121+122122+ textContent := r.readSectionBlob(toc.ngramText)
123123+ d := indexData{
124124+ ngramText: ngramText(textContent),
125125+ ngramFrequencies: r.readSectionU32(toc.ngramFrequencies),
126126+ postingIndex: r.readSectionU32(toc.postingsIndex),
127127+ boundaries: r.readSectionU32(toc.contentBoundaries),
128128+ newlinesIndex: r.readSectionU32(toc.newlinesIndex),
129129+ }
130130+131131+ d.boundaries = append(d.boundaries, d.boundaries[0]+toc.contents.sz)
132132+ d.postingIndex = append(d.postingIndex, toc.postings.off+toc.postings.sz)
133133+ d.fileEnds = make([]uint32, 0, len(d.boundaries))
134134+ d.newlinesIndex = append(d.newlinesIndex, toc.newlines.off+toc.newlines.sz)
135135+ for _, b := range d.boundaries[1:] {
136136+ d.fileEnds = append(d.fileEnds, b-d.boundaries[0])
137137+ }
138138+139139+ fnBlob := r.readSectionBlob(toc.names)
140140+ fnIndex := r.readSectionU32(toc.nameIndex)
141141+ for i, n := range fnIndex {
142142+ end := toc.names.sz
143143+ if i < len(fnIndex)-1 {
144144+ end = fnIndex[i+1] - fnIndex[0]
145145+ }
146146+ n -= fnIndex[0]
147147+ d.fileNames = append(d.fileNames, string(fnBlob[n:end]))
148148+ }
149149+ return &d
150150+}
151151+152152+func (r *reader) readContents(d *indexData, i uint32) []byte {
153153+ return r.readSectionBlob(section{
154154+ off: d.boundaries[i],
155155+ sz: d.boundaries[i+1] - d.boundaries[i],
156156+ })
157157+}
158158+159159+func (r *reader) readNewlines(d *indexData, i uint32) []uint32 {
160160+ blob := r.readSectionBlob(section{
161161+ off: d.newlinesIndex[i],
162162+ sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
163163+ })
164164+ last := -1
165165+166166+ var res []uint32
167167+ for len(blob) > 0 {
168168+ delta, m := binary.Uvarint(blob)
169169+ next := int(delta) + last
170170+ res = append(res, uint32(next))
171171+ last = next
172172+ blob = blob[m:]
173173+ }
174174+175175+ return res
176176+}
177177+178178+func (r *reader) readPostingData(d *indexData, idx uint32) ([]uint32, error) {
179179+ sec := section{
180180+ off: d.postingIndex[idx],
181181+ sz: d.postingIndex[idx+1] - d.postingIndex[idx],
182182+ }
183183+184184+ data := r.readSectionBlob(sec)
185185+ if r.err != nil {
186186+ return nil, r.err
187187+ }
188188+ var ps []uint32
189189+ var last uint32
190190+ for len(data) > 0 {
191191+ delta, m := binary.Uvarint(data)
192192+ offset := last + uint32(delta)
193193+ last = offset
194194+ data = data[m:]
195195+ ps = append(ps, offset)
196196+ }
197197+ return ps, nil
198198+}
199199+200200+func (r *reader) readSearch(data *indexData, str string) (*searchInput, error) {
201201+ if len(str) < NGRAM {
202202+ return nil, fmt.Errorf("patter must be at least %d bytes", NGRAM)
203203+ }
204204+205205+ input := &searchInput{
206206+ pat: str,
207207+ }
208208+209209+ firstIdx, ok := data.findNgramIdx(str[:NGRAM])
210210+ if !ok {
211211+ return input, nil
212212+ }
213213+ lastIdx, ok := data.findNgramIdx(str[len(str)-NGRAM:])
214214+ if !ok {
215215+ return input, nil
216216+ }
217217+218218+ var err error
219219+ input.first, err = r.readPostingData(data, firstIdx)
220220+ if err != nil {
221221+ return nil, err
222222+ }
223223+ input.last, err = r.readPostingData(data, lastIdx)
224224+ if err != nil {
225225+ return nil, err
226226+ }
227227+ input.ends = data.fileEnds
228228+ return input, nil
229229+}
230230+231231+type Searcher interface {
232232+ Search(pat string) ([]Match, error)
233233+ Close() error
234234+}
235235+236236+type searcher struct {
237237+ reader reader
238238+ indexData *indexData
239239+}
240240+241241+func (s *searcher) Close() error {
242242+ return s.reader.r.Close()
243243+}
// ReadSeekCloser groups the Read, Seek and Close methods an index source
// must provide.
type ReadSeekCloser interface {
	io.Reader
	io.Seeker
	io.Closer
}
249249+250250+func NewSearcher(r ReadSeekCloser) (Searcher, error) {
251251+ s := &searcher{
252252+ reader: reader{r: r},
253253+ }
254254+ var toc indexTOC
255255+ s.reader.readTOC(&toc)
256256+ s.indexData = s.reader.readIndexData(&toc)
257257+ if s.reader.err != nil {
258258+ return nil, s.reader.err
259259+ }
260260+ return s, nil
261261+}
// Match describes a single occurrence of a search pattern.
type Match struct {
	// Rank orders results; the lower, the better.
	Rank int
	// Line is the text of the line on which the match starts.
	Line string
	// LineNum is the 1-based number of that line.
	LineNum int
	// LineOff is the offset of the match from the start of the line.
	LineOff int

	// Name is the name of the file containing the match.
	Name string
	// Offset is the offset of the match from the start of the file.
	Offset uint32
	// MatchLength is the length of the match in bytes.
	MatchLength int
}
274274+275275+func (s *searcher) Search(pat string) ([]Match, error) {
276276+ input, err := s.reader.readSearch(s.indexData, pat)
277277+ if err != nil {
278278+ return nil, err
279279+ }
280280+ cands := input.search()
281281+282282+ asBytes := []byte(pat)
283283+284284+ var matches []Match
285285+ lastFile := uint32(0xFFFFFFFF)
286286+ var content []byte
287287+ var newlines []uint32
288288+ for _, c := range cands {
289289+ if lastFile != c.file {
290290+ content = s.reader.readContents(s.indexData, c.file)
291291+ newlines = s.reader.readNewlines(s.indexData, c.file)
292292+ lastFile = c.file
293293+ }
294294+295295+ if bytes.Compare(content[c.offset:c.offset+uint32(len(pat))], asBytes) == 0 {
296296+ idx := sort.Search(len(newlines), func(n int) bool {
297297+ return newlines[n] >= c.offset
298298+ })
299299+300300+ end := uint32(len(content))
301301+ if idx < len(newlines) {
302302+ end = newlines[idx]
303303+ }
304304+305305+ start := 0
306306+ if idx > 0 {
307307+ start = int(newlines[idx-1] + 1)
308308+ }
309309+310310+ matches = append(matches, Match{
311311+ Rank: int(c.file),
312312+ Offset: c.offset,
313313+ Line: string(content[start:end]),
314314+ LineNum: idx + 1,
315315+ LineOff: int(c.offset) - start,
316316+ Name: s.indexData.fileNames[c.file],
317317+ MatchLength: len(pat),
318318+ })
319319+ }
320320+ }
321321+322322+ return matches, nil
323323+}
324324+325325+type shardedSearcher struct {
326326+ searchers []Searcher
327327+}
328328+329329+func NewShardedSearcher(indexGlob string) (Searcher, error) {
330330+ fs, err := filepath.Glob(indexGlob)
331331+ if err != nil {
332332+ return nil, err
333333+ }
334334+335335+ if len(fs) == 0 {
336336+ return nil, fmt.Errorf("glob %q does not match anything.", indexGlob)
337337+ }
338338+339339+ ss := shardedSearcher{}
340340+341341+ for _, fn := range fs {
342342+ f, err := os.Open(fn)
343343+ if err != nil {
344344+ return nil, err
345345+ }
346346+347347+ s, err := NewSearcher(f)
348348+ if err != nil {
349349+ return nil, fmt.Errorf("NewSearcher(%s): %v", f, err)
350350+ }
351351+ ss.searchers = append(ss.searchers, s)
352352+ }
353353+354354+ return &ss, nil
355355+}
356356+357357+type matchSlice []Match
358358+359359+func (m matchSlice) Len() int { return len(m) }
360360+func (m matchSlice) Less(i, j int) bool { return m[i].Rank < m[j].Rank }
361361+func (m matchSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
362362+363363+func (ss *shardedSearcher) Close() error {
364364+ for _, s := range ss.searchers {
365365+ s.Close()
366366+ }
367367+ return nil
368368+}
369369+370370+func (ss *shardedSearcher) Search(pat string) ([]Match, error) {
371371+ type res struct {
372372+ m []Match
373373+ err error
374374+ }
375375+ all := make(chan res, len(ss.searchers))
376376+ for _, s := range ss.searchers {
377377+ go func(s Searcher) {
378378+ ms, err := s.Search(pat)
379379+ all <- res{ms, err}
380380+ }(s)
381381+ }
382382+383383+ var aggregate []Match
384384+ for _ = range ss.searchers {
385385+ r := <-all
386386+ if r.err != nil {
387387+ return nil, r.err
388388+ }
389389+ aggregate = append(aggregate, r.m...)
390390+ }
391391+ sort.Sort((matchSlice)(aggregate))
392392+ return aggregate, nil
393393+}
+71
search.go
···11+// Copyright 2016 Google Inc. All rights reserved.
22+//
33+// Licensed under the Apache License, Version 2.0 (the "License");
44+// you may not use this file except in compliance with the License.
55+// You may obtain a copy of the License at
66+//
77+// http://www.apache.org/licenses/LICENSE-2.0
88+//
99+// Unless required by applicable law or agreed to in writing, software
1010+// distributed under the License is distributed on an "AS IS" BASIS,
1111+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212+// See the License for the specific language governing permissions and
1313+// limitations under the License.
1414+1515+package codesearch
// searchInput bundles everything needed to find candidate matches for one
// pattern.
type searchInput struct {
	// pat is the pattern being searched for.
	pat string

	// first holds the postings of the pattern's first n-gram.
	first []uint32
	// last holds the postings of the pattern's last n-gram.
	last []uint32
	// ends holds the end offset of each file, in the same coordinate space
	// as the postings.
	ends []uint32
}
// candidateMatch is a possible match position reported by the n-gram search.
type candidateMatch struct {
	// file is the index of the file containing the candidate.
	file uint32
	// offset is the candidate's position relative to the start of that file.
	offset uint32
}
2929+3030+func (s *searchInput) search() []candidateMatch {
3131+ fileIdx := 0
3232+ diff := uint32(len(s.pat) - NGRAM)
3333+3434+ var candidates []candidateMatch
3535+ for {
3636+ if len(s.first) == 0 || len(s.last) == 0 {
3737+ break
3838+ }
3939+ p1 := s.first[0]
4040+ p2 := s.last[0]
4141+4242+ for fileIdx < len(s.ends) && s.ends[fileIdx] <= p1 {
4343+ fileIdx++
4444+ }
4545+4646+ if p1+diff < p2 {
4747+ s.first = s.first[1:]
4848+ } else if p1+diff > p2 {
4949+ s.last = s.last[1:]
5050+ } else {
5151+ s.first = s.first[1:]
5252+ s.last = s.last[1:]
5353+5454+ if p1+uint32(len(s.pat)) >= s.ends[fileIdx] {
5555+ continue
5656+ }
5757+5858+ fileStart := uint32(0)
5959+ if fileIdx > 0 {
6060+ fileStart += s.ends[fileIdx-1]
6161+ }
6262+ candidates = append(candidates,
6363+ candidateMatch{
6464+ uint32(fileIdx),
6565+ p1 - fileStart,
6666+ })
6767+ }
6868+ }
6969+7070+ return candidates
7171+}
+215
write.go
···11+// Copyright 2016 Google Inc. All rights reserved.
22+//
33+// Licensed under the Apache License, Version 2.0 (the "License");
44+// you may not use this file except in compliance with the License.
55+// You may obtain a copy of the License at
66+//
77+// http://www.apache.org/licenses/LICENSE-2.0
88+//
99+// Unless required by applicable law or agreed to in writing, software
1010+// distributed under the License is distributed on an "AS IS" BASIS,
1111+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212+// See the License for the specific language governing permissions and
1313+// limitations under the License.
1414+1515+package codesearch
1616+1717+import (
1818+ "bufio"
1919+ "encoding/binary"
2020+ "io"
2121+ "log"
2222+ "sort"
2323+)
2424+2525+var _ = log.Println
// writer wraps an io.Writer, tracking the number of bytes written and
// remembering the first error.
type writer struct {
	err error
	w   io.Writer
	off uint32
}

// Write forwards b to the underlying writer and advances the offset by the
// number of bytes written. Once an error has occurred, further calls write
// nothing and return that error.
func (w *writer) Write(b []byte) error {
	if w.err == nil {
		n, err := w.w.Write(b)
		w.off += uint32(n)
		w.err = err
	}
	return w.err
}

// Off returns the number of bytes written so far.
func (w *writer) Off() uint32 {
	return w.off
}

// B writes a single byte.
func (w *writer) B(b byte) {
	w.Write([]byte{b})
}

// U32 writes n as a 4-byte big-endian integer.
func (w *writer) U32(n uint32) {
	enc := make([]byte, 4)
	binary.BigEndian.PutUint32(enc, n)
	w.Write(enc)
}

// Varint writes n in unsigned varint encoding.
func (w *writer) Varint(n uint32) {
	var enc [8]byte
	used := binary.PutUvarint(enc[:], uint64(n))
	w.Write(enc[:used])
}

// startSection records the current offset as the start of s.
func (w *writer) startSection(s *section) {
	s.off = w.Off()
}

// endSection sets the size of s to the number of bytes written since its
// start offset.
func (w *writer) endSection(s *section) {
	s.sz = w.Off() - s.off
}

// section identifies a byte range of the index by offset and size.
type section struct {
	off uint32
	sz  uint32
}

// writeSection writes the offset and then the size of s, each as a
// big-endian uint32.
func (w *writer) writeSection(s *section) {
	w.U32(s.off)
	w.U32(s.sz)
}
8080+8181+type indexTOC struct {
8282+ contents section
8383+ contentBoundaries section
8484+ newlines section
8585+ newlinesIndex section
8686+ ngramText section
8787+ ngramFrequencies section
8888+ postings section
8989+ postingsIndex section
9090+ names section
9191+ nameIndex section
9292+}
9393+9494+func (t *indexTOC) sections() []*section {
9595+ return []*section{
9696+ &t.contents,
9797+ &t.contentBoundaries,
9898+ &t.newlines,
9999+ &t.newlinesIndex,
100100+ &t.ngramText,
101101+ &t.ngramFrequencies,
102102+ &t.postings,
103103+ &t.postingsIndex,
104104+ &t.names,
105105+ &t.nameIndex,
106106+ }
107107+}
108108+109109+func (w *writer) writeTOC(toc *indexTOC) {
110110+ for _, s := range toc.sections() {
111111+ w.writeSection(s)
112112+ }
113113+}
114114+115115+func (b *IndexBuilder) Write(out io.Writer) error {
116116+ buffered := bufio.NewWriterSize(out, 1<<20)
117117+ defer buffered.Flush()
118118+119119+ w := &writer{w: buffered}
120120+ toc := indexTOC{}
121121+ var items []uint32
122122+ w.startSection(&toc.contents)
123123+ for _, f := range b.files {
124124+ items = append(items, w.Off())
125125+ w.Write(f.content)
126126+ }
127127+ w.endSection(&toc.contents)
128128+129129+ w.startSection(&toc.contentBoundaries)
130130+ for _, off := range items {
131131+ w.U32(off)
132132+ }
133133+ w.endSection(&toc.contentBoundaries)
134134+135135+ w.startSection(&toc.newlines)
136136+ items = items[:0]
137137+ for _, f := range b.files {
138138+ items = append(items, w.Off())
139139+ last := -1
140140+ for i, c := range f.content {
141141+ if c == '\n' {
142142+ w.Varint(uint32(i - last))
143143+ last = i
144144+ }
145145+ }
146146+ }
147147+ w.endSection(&toc.newlines)
148148+149149+ w.startSection(&toc.newlinesIndex)
150150+ for _, off := range items {
151151+ w.U32(off)
152152+ }
153153+ w.endSection(&toc.newlinesIndex)
154154+155155+ var keys []string
156156+ for k := range b.postings {
157157+ keys = append(keys, k)
158158+ }
159159+ sort.Strings(keys)
160160+161161+ w.startSection(&toc.ngramText)
162162+ for _, k := range keys {
163163+ w.Write([]byte(k))
164164+ }
165165+ w.endSection(&toc.ngramText)
166166+167167+ w.startSection(&toc.postings)
168168+ items = items[:0]
169169+ for _, k := range keys {
170170+ var last uint32
171171+ items = append(items, w.Off())
172172+ for _, p := range b.postings[k] {
173173+ delta := p - last
174174+ w.Varint(delta)
175175+ last = p
176176+ }
177177+ }
178178+ w.endSection(&toc.postings)
179179+180180+ w.startSection(&toc.ngramFrequencies)
181181+ for _, k := range keys {
182182+ n := uint32(len(b.postings[k]))
183183+ w.U32(n)
184184+ }
185185+ w.endSection(&toc.ngramFrequencies)
186186+187187+ w.startSection(&toc.postingsIndex)
188188+ for _, off := range items {
189189+ w.U32(off)
190190+ }
191191+ w.endSection(&toc.postingsIndex)
192192+193193+ w.startSection(&toc.names)
194194+ items = items[:0]
195195+ for _, f := range b.files {
196196+ items = append(items, w.Off())
197197+ w.Write([]byte(f.name))
198198+ }
199199+ w.endSection(&toc.names)
200200+201201+ w.startSection(&toc.nameIndex)
202202+ for _, off := range items {
203203+ w.U32(off)
204204+ }
205205+ w.endSection(&toc.nameIndex)
206206+207207+ var tocSection section
208208+ w.startSection(&tocSection)
209209+ w.writeTOC(&toc)
210210+211211+ w.endSection(&tocSection)
212212+ w.writeSection(&tocSection)
213213+214214+ return w.err
215215+}