fork of https://github.com/sourcegraph/zoekt
1// Copyright 2018 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "encoding/binary"
19 "fmt"
20 "math"
21
22 "github.com/sourcegraph/zoekt"
23)
24
25// hitIterator finds potential search matches, measured in offsets of
26// the concatenation of all documents.
27type hitIterator interface {
28 // Return the first hit, or math.MaxUint32 if none.
29 first() uint32
30
31 // Skip until past limit. The argument math.MaxUint32 should be
32 // treated specially.
33 next(limit uint32)
34
35 // Return how many bytes were read.
36 updateStats(s *zoekt.Stats)
37}
38
39// distanceHitIterator looks for hits at a fixed distance apart.
40type distanceHitIterator struct {
41 i1 hitIterator
42 i2 hitIterator
43 distance uint32
44 started bool
45}
46
47func (i *distanceHitIterator) String() string {
48 return fmt.Sprintf("dist(%d, %v, %v)", i.distance, i.i1, i.i2)
49}
50
51func (i *distanceHitIterator) findNext() {
52 for {
53 var p1, p2 uint32
54 p1 = i.i1.first()
55 p2 = i.i2.first()
56 if p1 == math.MaxUint32 || p2 == math.MaxUint32 {
57 i.i1.next(math.MaxUint32)
58 break
59 }
60
61 if p1+i.distance < p2 {
62 i.i1.next(p2 - i.distance - 1)
63 } else if p1+i.distance > p2 {
64 i.i2.next(p1 + i.distance - 1)
65 } else {
66 break
67 }
68 }
69}
70
71func (i *distanceHitIterator) first() uint32 {
72 if !i.started {
73 i.findNext()
74 i.started = true
75 }
76 return i.i1.first()
77}
78
79func (i *distanceHitIterator) updateStats(s *zoekt.Stats) {
80 i.i1.updateStats(s)
81 i.i2.updateStats(s)
82}
83
84func (i *distanceHitIterator) next(limit uint32) {
85 i.i1.next(limit)
86 l2 := limit + i.distance
87
88 if l2 < limit { // overflow.
89 l2 = math.MaxUint32
90 }
91 i.i2.next(l2)
92 i.findNext()
93}
94
95func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSensitive, fileName bool) (hitIterator, error) {
96 if dist == 0 {
97 return nil, fmt.Errorf("d == 0")
98 }
99
100 i1, err := d.trigramHitIterator(ng1, caseSensitive, fileName)
101 if err != nil {
102 return nil, err
103 }
104 i2, err := d.trigramHitIterator(ng2, caseSensitive, fileName)
105 if err != nil {
106 return nil, err
107 }
108 return &distanceHitIterator{
109 i1: i1,
110 i2: i2,
111 distance: dist,
112 }, nil
113}
114
115func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) {
116 variants := []ngram{ng}
117 if !caseSensitive {
118 variants = generateCaseNgrams(ng)
119 }
120
121 iters := make([]hitIterator, 0, len(variants))
122 ngramLookups := 0
123 ngrams := d.ngrams(fileName)
124 for _, v := range variants {
125 sec := ngrams.Get(v)
126 ngramLookups++
127 blob, err := d.readSectionBlob(sec)
128 if err != nil {
129 return nil, err
130 }
131 if len(blob) > 0 {
132 iters = append(iters, newCompressedPostingIterator(blob, v))
133 }
134 }
135
136 if len(iters) == 1 {
137 // if we only return 1 then we need to include our ngramLookups stats
138 iter := (iters[0]).(*compressedPostingIterator)
139 iter.ngramLookups = ngramLookups
140 return iter, nil
141 }
142 return &mergingIterator{
143 ngramLookups: ngramLookups,
144 iters: iters,
145 }, nil
146}
147
148// inMemoryIterator is hitIterator that goes over an in-memory uint32 posting list.
149type inMemoryIterator struct {
150 postings []uint32
151 what ngram
152}
153
154func (i *inMemoryIterator) String() string {
155 return fmt.Sprintf("mem(%s):%v", i.what, i.postings)
156}
157
158func (i *inMemoryIterator) first() uint32 {
159 if len(i.postings) > 0 {
160 return i.postings[0]
161 }
162 return math.MaxUint32
163}
164
165func (i *inMemoryIterator) updateStats(_ *zoekt.Stats) {
166}
167
168func (i *inMemoryIterator) next(limit uint32) {
169 if limit == math.MaxUint32 {
170 i.postings = nil
171 }
172
173 for len(i.postings) > 0 && i.postings[0] <= limit {
174 i.postings = i.postings[1:]
175 }
176}
177
178// compressedPostingIterator goes over a delta varint encoded posting
179// list.
180type compressedPostingIterator struct {
181 blob []byte
182 indexBytesLoaded int
183 ngramLookups int
184 _first uint32
185 what ngram
186}
187
188func newCompressedPostingIterator(b []byte, w ngram) *compressedPostingIterator {
189 d, sz := binary.Uvarint(b)
190 return &compressedPostingIterator{
191 _first: uint32(d),
192 blob: b[sz:],
193 indexBytesLoaded: sz,
194 what: w,
195 }
196}
197
198func (i *compressedPostingIterator) String() string {
199 return fmt.Sprintf("compressed(%s, %d, [%d bytes])", i.what, i._first, len(i.blob))
200}
201
202func (i *compressedPostingIterator) first() uint32 {
203 return i._first
204}
205
206func (i *compressedPostingIterator) next(limit uint32) {
207 if limit == math.MaxUint32 {
208 i.blob = nil
209 i._first = math.MaxUint32
210 return
211 }
212
213 for i._first <= limit && len(i.blob) > 0 {
214 delta, sz := binary.Uvarint(i.blob)
215 i._first += uint32(delta)
216 i.indexBytesLoaded += sz
217 i.blob = i.blob[sz:]
218 }
219
220 if i._first <= limit && len(i.blob) == 0 {
221 i._first = math.MaxUint32
222 }
223}
224
225func (i *compressedPostingIterator) updateStats(s *zoekt.Stats) {
226 s.IndexBytesLoaded += int64(i.indexBytesLoaded)
227 s.NgramLookups += i.ngramLookups
228 i.indexBytesLoaded = 0
229 i.ngramLookups = 0
230}
231
232// mergingIterator forms the merge of a set of hitIterators, to
233// implement an OR operation at the hit level.
234type mergingIterator struct {
235 iters []hitIterator
236 ngramLookups int
237}
238
239func (i *mergingIterator) String() string {
240 return fmt.Sprintf("merge:%v", i.iters)
241}
242
243func (i *mergingIterator) updateStats(s *zoekt.Stats) {
244 s.NgramLookups += i.ngramLookups
245 i.ngramLookups = 0
246 for _, j := range i.iters {
247 j.updateStats(s)
248 }
249}
250
251func (i *mergingIterator) first() uint32 {
252 r := uint32(math.MaxUint32)
253 for _, j := range i.iters {
254 f := j.first()
255 if f < r {
256 r = f
257 }
258 }
259
260 return r
261}
262
263func (i *mergingIterator) next(limit uint32) {
264 for _, j := range i.iters {
265 j.next(limit)
266 }
267}