fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20)
21
22// candidateMatch is a candidate match for a substring.
23//
24// Note: a lot of these can be in memory, so think about fieldalignment when
25// modify the fields of this structure.
26type candidateMatch struct {
27 substrBytes []byte
28 substrLowered []byte
29
30 file uint32
31 symbolIdx uint32
32
33 // Offsets are relative to the start of the filename or file contents.
34 runeOffset uint32
35 byteOffset uint32
36 byteMatchSz uint32
37
38 // bools at end for struct field alignment
39 caseSensitive bool
40 fileName bool
41 symbol bool
42}
43
44// Matches content against the substring, and populates byteMatchSz on success
45func (m *candidateMatch) matchContent(content []byte) bool {
46 if m.caseSensitive {
47 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
48
49 m.byteMatchSz = uint32(len(m.substrBytes))
50 return comp
51 } else {
52 // It is tempting to try a simple ASCII based
53 // comparison if possible, but we need more
54 // information. Simple ASCII chars have unicode upper
55 // case variants (the ASCII 'k' has the Kelvin symbol
56 // as upper case variant). We can only degrade to
57 // ASCII if we are sure that both the corpus and the
58 // query is ASCII only
59 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
60 m.byteMatchSz = uint32(sz)
61 return ok
62 }
63}
64
65// matchIterator is a docIterator that produces candidateMatches for a given document
66type matchIterator interface {
67 docIterator
68
69 candidates() []*candidateMatch
70
71 // updateStats is called twice. After matchtree construction and after
72 // searching is done. Implementations must take care to not report
73 // statistics twice.
74 updateStats(*Stats)
75}
76
77// noMatchTree is both matchIterator and matchTree that matches nothing.
78type noMatchTree struct {
79 Why string
80
81 // Stats captures the work done to create the noMatchTree.
82 Stats Stats
83}
84
85func (t *noMatchTree) String() string {
86 return fmt.Sprintf("not(%q)", t.Why)
87}
88
89func (t *noMatchTree) candidates() []*candidateMatch {
90 return nil
91}
92
93func (t *noMatchTree) nextDoc() uint32 {
94 return maxUInt32
95}
96
97func (t *noMatchTree) prepare(uint32) {}
98
99func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
100 return matchesNone
101}
102
103func (t *noMatchTree) updateStats(s *Stats) {
104 s.Add(t.Stats)
105 t.Stats = Stats{}
106}
107
108func (m *candidateMatch) String() string {
109 return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
110}
111
112type ngramDocIterator struct {
113 leftPad uint32
114 rightPad uint32
115
116 iter hitIterator
117 ends []uint32
118
119 // ngramLookups is how many lookups we did to create this iterator.
120 ngramLookups int
121
122 // mutable
123 fileIdx uint32
124 matchCount int
125}
126
127// nextFileIndex returns the smallest index j of ends such that
128// ends[j] > offset, assuming ends[f] <= offset.
129func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
130 d := uint32(1)
131 for f < uint32(len(ends)) && ends[f] <= offset {
132 if f+d < uint32(len(ends)) && ends[f+d] <= offset {
133 f += d
134 d *= 2
135 } else if d > 1 {
136 d = d/4 + 1
137 } else {
138 f++
139 }
140 }
141 return f
142}
143
144func (i *ngramDocIterator) nextDoc() uint32 {
145 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
146 if i.fileIdx >= uint32(len(i.ends)) {
147 return maxUInt32
148 }
149 return i.fileIdx
150}
151
152func (i *ngramDocIterator) String() string {
153 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
154}
155
156func (i *ngramDocIterator) prepare(nextDoc uint32) {
157 var start uint32
158 if nextDoc > 0 {
159 start = i.ends[nextDoc-1]
160 }
161 if start > 0 {
162 i.iter.next(start + i.leftPad - 1)
163 }
164 i.fileIdx = nextDoc
165}
166
167func (i *ngramDocIterator) updateStats(s *Stats) {
168 i.iter.updateStats(s)
169 s.NgramMatches += i.matchCount
170 s.NgramLookups += i.ngramLookups
171 i.matchCount = 0
172 i.ngramLookups = 0
173}
174
175func (i *ngramDocIterator) candidates() []*candidateMatch {
176 if i.fileIdx >= uint32(len(i.ends)) {
177 return nil
178 }
179
180 var fileStart uint32
181 if i.fileIdx > 0 {
182 fileStart = i.ends[i.fileIdx-1]
183 }
184 fileEnd := i.ends[i.fileIdx]
185
186 var candidates []*candidateMatch
187 for {
188 p1 := i.iter.first()
189 if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
190 break
191 }
192 i.iter.next(p1)
193
194 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
195 continue
196 }
197
198 candidates = append(candidates, &candidateMatch{
199 file: uint32(i.fileIdx),
200 runeOffset: p1 - fileStart - i.leftPad,
201 })
202 }
203 i.matchCount += len(candidates)
204 return candidates
205}