fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20)
21
22// candidateMatch is a candidate match for a substring.
23type candidateMatch struct {
24 caseSensitive bool
25 fileName bool
26 symbol bool
27 symbolIdx uint32
28
29 substrBytes []byte
30 substrLowered []byte
31
32 file uint32
33
34 // Offsets are relative to the start of the filename or file contents.
35 runeOffset uint32
36 byteOffset uint32
37 byteMatchSz uint32
38}
39
40// Matches content against the substring, and populates byteMatchSz on success
41func (m *candidateMatch) matchContent(content []byte) bool {
42 if m.caseSensitive {
43 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
44
45 m.byteMatchSz = uint32(len(m.substrBytes))
46 return comp
47 } else {
48 // It is tempting to try a simple ASCII based
49 // comparison if possible, but we need more
50 // information. Simple ASCII chars have unicode upper
51 // case variants (the ASCII 'k' has the Kelvin symbol
52 // as upper case variant). We can only degrade to
53 // ASCII if we are sure that both the corpus and the
54 // query is ASCII only
55 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
56 m.byteMatchSz = uint32(sz)
57 return ok
58 }
59}
60
61// matchIterator is a docIterator that produces candidateMatches for a given document
62type matchIterator interface {
63 docIterator
64
65 candidates() []*candidateMatch
66
67 // updateStats is called twice. After matchtree construction and after
68 // searching is done. Implementations must take care to not report
69 // statistics twice.
70 updateStats(*Stats)
71}
72
73// noMatchTree is both matchIterator and matchTree that matches nothing.
74type noMatchTree struct {
75 Why string
76
77 // Stats captures the work done to create the noMatchTree.
78 Stats Stats
79}
80
81func (t *noMatchTree) String() string {
82 return fmt.Sprintf("not(%q)", t.Why)
83}
84
85func (t *noMatchTree) candidates() []*candidateMatch {
86 return nil
87}
88
89func (t *noMatchTree) nextDoc() uint32 {
90 return maxUInt32
91}
92
93func (t *noMatchTree) prepare(uint32) {}
94
95func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
96 return false, true
97}
98
99func (t *noMatchTree) updateStats(s *Stats) {
100 s.Add(t.Stats)
101 t.Stats = Stats{}
102}
103
104func (m *candidateMatch) String() string {
105 return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
106}
107
108type ngramDocIterator struct {
109 leftPad uint32
110 rightPad uint32
111
112 iter hitIterator
113 ends []uint32
114
115 // ngramLookups is how many lookups we did to create this iterator.
116 ngramLookups int
117
118 // mutable
119 fileIdx uint32
120 matchCount int
121}
122
123// nextFileIndex returns the smallest index j of ends such that
124// ends[j] > offset, assuming ends[f] <= offset.
125func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
126 d := uint32(1)
127 for f < uint32(len(ends)) && ends[f] <= offset {
128 if f+d < uint32(len(ends)) && ends[f+d] <= offset {
129 f += d
130 d *= 2
131 } else if d > 1 {
132 d = d/4 + 1
133 } else {
134 f++
135 }
136 }
137 return f
138}
139
140func (i *ngramDocIterator) nextDoc() uint32 {
141 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
142 if i.fileIdx >= uint32(len(i.ends)) {
143 return maxUInt32
144 }
145 return i.fileIdx
146}
147
148func (i *ngramDocIterator) String() string {
149 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
150}
151
152func (i *ngramDocIterator) prepare(nextDoc uint32) {
153 var start uint32
154 if nextDoc > 0 {
155 start = i.ends[nextDoc-1]
156 }
157 if start > 0 {
158 i.iter.next(start + i.leftPad - 1)
159 }
160 i.fileIdx = nextDoc
161}
162
163func (i *ngramDocIterator) updateStats(s *Stats) {
164 i.iter.updateStats(s)
165 s.NgramMatches += i.matchCount
166 s.NgramLookups += i.ngramLookups
167 i.matchCount = 0
168 i.ngramLookups = 0
169}
170
171func (i *ngramDocIterator) candidates() []*candidateMatch {
172 if i.fileIdx >= uint32(len(i.ends)) {
173 return nil
174 }
175
176 var fileStart uint32
177 if i.fileIdx > 0 {
178 fileStart = i.ends[i.fileIdx-1]
179 }
180 fileEnd := i.ends[i.fileIdx]
181
182 var candidates []*candidateMatch
183 for {
184 p1 := i.iter.first()
185 if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
186 break
187 }
188 i.iter.next(p1)
189
190 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
191 continue
192 }
193
194 candidates = append(candidates, &candidateMatch{
195 file: uint32(i.fileIdx),
196 runeOffset: p1 - fileStart - i.leftPad,
197 })
198 }
199 i.matchCount += len(candidates)
200 return candidates
201}