fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20
21 "github.com/sourcegraph/zoekt"
22)
23
24// candidateMatch is a candidate match for a substring.
25//
26// Note: a lot of these can be in memory, so think about fieldalignment when
27// modify the fields of this structure.
28type candidateMatch struct {
29 substrBytes []byte
30 substrLowered []byte
31
32 scoreWeight float64
33
34 file uint32
35 symbolIdx uint32
36
37 // Offsets are relative to the start of the filename or file contents.
38 runeOffset uint32
39 byteOffset uint32
40 byteMatchSz uint32
41
42 // bools at end for struct field alignment
43 caseSensitive bool
44 fileName bool
45 symbol bool
46}
47
48// Matches content against the substring, and populates byteMatchSz on success
49func (m *candidateMatch) matchContent(content []byte) bool {
50 if m.caseSensitive {
51 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
52
53 m.byteMatchSz = uint32(len(m.substrBytes))
54 return comp
55 } else {
56 // It is tempting to try a simple ASCII based
57 // comparison if possible, but we need more
58 // information. Simple ASCII chars have unicode upper
59 // case variants (the ASCII 'k' has the Kelvin symbol
60 // as upper case variant). We can only degrade to
61 // ASCII if we are sure that both the corpus and the
62 // query is ASCII only
63 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
64 m.byteMatchSz = uint32(sz)
65 return ok
66 }
67}
68
69// matchIterator is a docIterator that produces candidateMatches for a given document
70type matchIterator interface {
71 docIterator
72
73 candidates() []*candidateMatch
74
75 // updateStats is called twice. After matchtree construction and after
76 // searching is done. Implementations must take care to not report
77 // statistics twice.
78 updateStats(*zoekt.Stats)
79}
80
81// noMatchTree is both matchIterator and matchTree that matches nothing.
82type noMatchTree struct {
83 Why string
84
85 // Stats captures the work done to create the noMatchTree.
86 Stats zoekt.Stats
87}
88
89func (t *noMatchTree) String() string {
90 return fmt.Sprintf("not(%q)", t.Why)
91}
92
93func (t *noMatchTree) candidates() []*candidateMatch {
94 return nil
95}
96
97func (t *noMatchTree) nextDoc() uint32 {
98 return maxUInt32
99}
100
101func (t *noMatchTree) prepare(uint32) {}
102
103func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
104 return matchesNone
105}
106
107func (t *noMatchTree) updateStats(s *zoekt.Stats) {
108 s.Add(t.Stats)
109 t.Stats = zoekt.Stats{}
110}
111
112func (m *candidateMatch) String() string {
113 return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
114}
115
116type ngramDocIterator struct {
117 leftPad uint32
118 rightPad uint32
119
120 iter hitIterator
121 ends []uint32
122
123 // ngramLookups is how many lookups we did to create this iterator.
124 ngramLookups int
125
126 // mutable
127 fileIdx uint32
128 matchCount int
129}
130
131// nextFileIndex returns the smallest index j of ends such that
132// ends[j] > offset, assuming ends[f] <= offset.
133func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
134 d := uint32(1)
135 for f < uint32(len(ends)) && ends[f] <= offset {
136 if f+d < uint32(len(ends)) && ends[f+d] <= offset {
137 f += d
138 d *= 2
139 } else if d > 1 {
140 d = d/4 + 1
141 } else {
142 f++
143 }
144 }
145 return f
146}
147
148func (i *ngramDocIterator) nextDoc() uint32 {
149 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
150 if i.fileIdx >= uint32(len(i.ends)) {
151 return maxUInt32
152 }
153 return i.fileIdx
154}
155
156func (i *ngramDocIterator) String() string {
157 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
158}
159
160func (i *ngramDocIterator) prepare(nextDoc uint32) {
161 var start uint32
162 if nextDoc > 0 {
163 start = i.ends[nextDoc-1]
164 }
165 if start > 0 {
166 i.iter.next(start + i.leftPad - 1)
167 }
168 i.fileIdx = nextDoc
169}
170
171func (i *ngramDocIterator) updateStats(s *zoekt.Stats) {
172 i.iter.updateStats(s)
173 s.NgramMatches += i.matchCount
174 s.NgramLookups += i.ngramLookups
175 i.matchCount = 0
176 i.ngramLookups = 0
177}
178
179func (i *ngramDocIterator) candidates() []*candidateMatch {
180 if i.fileIdx >= uint32(len(i.ends)) {
181 return nil
182 }
183
184 var fileStart uint32
185 if i.fileIdx > 0 {
186 fileStart = i.ends[i.fileIdx-1]
187 }
188 fileEnd := i.ends[i.fileIdx]
189
190 var candidates []*candidateMatch
191 for {
192 p1 := i.iter.first()
193 if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
194 break
195 }
196 i.iter.next(p1)
197
198 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
199 continue
200 }
201
202 candidates = append(candidates, &candidateMatch{
203 file: uint32(i.fileIdx),
204 runeOffset: p1 - fileStart - i.leftPad,
205 })
206 }
207 i.matchCount += len(candidates)
208 return candidates
209}