fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "bytes"
19 "fmt"
20)
21
22// candidateMatch is a candidate match for a substring.
23//
24// Note: a lot of these can be in memory, so think about fieldalignment when
25// modify the fields of this structure.
26type candidateMatch struct {
27 substrBytes []byte
28 substrLowered []byte
29
30 scoreWeight float64
31
32 file uint32
33 symbolIdx uint32
34
35 // Offsets are relative to the start of the filename or file contents.
36 runeOffset uint32
37 byteOffset uint32
38 byteMatchSz uint32
39
40 // bools at end for struct field alignment
41 caseSensitive bool
42 fileName bool
43 symbol bool
44}
45
46// Matches content against the substring, and populates byteMatchSz on success
47func (m *candidateMatch) matchContent(content []byte) bool {
48 if m.caseSensitive {
49 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
50
51 m.byteMatchSz = uint32(len(m.substrBytes))
52 return comp
53 } else {
54 // It is tempting to try a simple ASCII based
55 // comparison if possible, but we need more
56 // information. Simple ASCII chars have unicode upper
57 // case variants (the ASCII 'k' has the Kelvin symbol
58 // as upper case variant). We can only degrade to
59 // ASCII if we are sure that both the corpus and the
60 // query is ASCII only
61 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
62 m.byteMatchSz = uint32(sz)
63 return ok
64 }
65}
66
67// matchIterator is a docIterator that produces candidateMatches for a given document
68type matchIterator interface {
69 docIterator
70
71 candidates() []*candidateMatch
72
73 // updateStats is called twice. After matchtree construction and after
74 // searching is done. Implementations must take care to not report
75 // statistics twice.
76 updateStats(*Stats)
77}
78
79// noMatchTree is both matchIterator and matchTree that matches nothing.
80type noMatchTree struct {
81 Why string
82
83 // Stats captures the work done to create the noMatchTree.
84 Stats Stats
85}
86
87func (t *noMatchTree) String() string {
88 return fmt.Sprintf("not(%q)", t.Why)
89}
90
91func (t *noMatchTree) candidates() []*candidateMatch {
92 return nil
93}
94
95func (t *noMatchTree) nextDoc() uint32 {
96 return maxUInt32
97}
98
99func (t *noMatchTree) prepare(uint32) {}
100
101func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
102 return matchesNone
103}
104
105func (t *noMatchTree) updateStats(s *Stats) {
106 s.Add(t.Stats)
107 t.Stats = Stats{}
108}
109
110func (m *candidateMatch) String() string {
111 return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
112}
113
114type ngramDocIterator struct {
115 leftPad uint32
116 rightPad uint32
117
118 iter hitIterator
119 ends []uint32
120
121 // ngramLookups is how many lookups we did to create this iterator.
122 ngramLookups int
123
124 // mutable
125 fileIdx uint32
126 matchCount int
127}
128
129// nextFileIndex returns the smallest index j of ends such that
130// ends[j] > offset, assuming ends[f] <= offset.
131func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
132 d := uint32(1)
133 for f < uint32(len(ends)) && ends[f] <= offset {
134 if f+d < uint32(len(ends)) && ends[f+d] <= offset {
135 f += d
136 d *= 2
137 } else if d > 1 {
138 d = d/4 + 1
139 } else {
140 f++
141 }
142 }
143 return f
144}
145
146func (i *ngramDocIterator) nextDoc() uint32 {
147 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
148 if i.fileIdx >= uint32(len(i.ends)) {
149 return maxUInt32
150 }
151 return i.fileIdx
152}
153
154func (i *ngramDocIterator) String() string {
155 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
156}
157
158func (i *ngramDocIterator) prepare(nextDoc uint32) {
159 var start uint32
160 if nextDoc > 0 {
161 start = i.ends[nextDoc-1]
162 }
163 if start > 0 {
164 i.iter.next(start + i.leftPad - 1)
165 }
166 i.fileIdx = nextDoc
167}
168
169func (i *ngramDocIterator) updateStats(s *Stats) {
170 i.iter.updateStats(s)
171 s.NgramMatches += i.matchCount
172 s.NgramLookups += i.ngramLookups
173 i.matchCount = 0
174 i.ngramLookups = 0
175}
176
177func (i *ngramDocIterator) candidates() []*candidateMatch {
178 if i.fileIdx >= uint32(len(i.ends)) {
179 return nil
180 }
181
182 var fileStart uint32
183 if i.fileIdx > 0 {
184 fileStart = i.ends[i.fileIdx-1]
185 }
186 fileEnd := i.ends[i.fileIdx]
187
188 var candidates []*candidateMatch
189 for {
190 p1 := i.iter.first()
191 if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
192 break
193 }
194 i.iter.next(p1)
195
196 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
197 continue
198 }
199
200 candidates = append(candidates, &candidateMatch{
201 file: uint32(i.fileIdx),
202 runeOffset: p1 - fileStart - i.leftPad,
203 })
204 }
205 i.matchCount += len(candidates)
206 return candidates
207}