fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "bytes"
19 "fmt"
20 "math"
21
22 "github.com/sourcegraph/zoekt"
23)
24
25// candidateMatch is a candidate match for a substring.
26//
27// Note: a lot of these can be in memory, so think about fieldalignment when
28// modify the fields of this structure.
29type candidateMatch struct {
30 substrBytes []byte
31 substrLowered []byte
32
33 scoreWeight float64
34
35 file uint32
36 symbolIdx uint32
37
38 // Offsets are relative to the start of the filename or file contents.
39 runeOffset uint32
40 byteOffset uint32
41 byteMatchSz uint32
42
43 // bools at end for struct field alignment
44 caseSensitive bool
45 fileName bool
46 symbol bool
47}
48
49// Matches content against the substring, and populates byteMatchSz on success
50func (m *candidateMatch) matchContent(content []byte) bool {
51 if m.caseSensitive {
52 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
53
54 m.byteMatchSz = uint32(len(m.substrBytes))
55 return comp
56 } else {
57 // It is tempting to try a simple ASCII based
58 // comparison if possible, but we need more
59 // information. Simple ASCII chars have unicode upper
60 // case variants (the ASCII 'k' has the Kelvin symbol
61 // as upper case variant). We can only degrade to
62 // ASCII if we are sure that both the corpus and the
63 // query is ASCII only
64 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
65 m.byteMatchSz = uint32(sz)
66 return ok
67 }
68}
69
70// matchIterator is a docIterator that produces candidateMatches for a given document
71type matchIterator interface {
72 docIterator
73
74 candidates() []*candidateMatch
75
76 // updateStats is called twice. After matchtree construction and after
77 // searching is done. Implementations must take care to not report
78 // statistics twice.
79 updateStats(*zoekt.Stats)
80}
81
82// noMatchTree is both matchIterator and matchTree that matches nothing.
83type noMatchTree struct {
84 Why string
85
86 // Stats captures the work done to create the noMatchTree.
87 Stats zoekt.Stats
88}
89
90func (t *noMatchTree) String() string {
91 return fmt.Sprintf("not(%q)", t.Why)
92}
93
94func (t *noMatchTree) candidates() []*candidateMatch {
95 return nil
96}
97
98func (t *noMatchTree) nextDoc() uint32 {
99 return math.MaxUint32
100}
101
102func (t *noMatchTree) prepare(uint32) {}
103
104func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
105 return matchesNone
106}
107
108func (t *noMatchTree) updateStats(s *zoekt.Stats) {
109 s.Add(t.Stats)
110 t.Stats = zoekt.Stats{}
111}
112
113func (m *candidateMatch) String() string {
114 return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
115}
116
117type ngramDocIterator struct {
118 leftPad uint32
119 rightPad uint32
120
121 iter hitIterator
122 ends []uint32
123
124 // ngramLookups is how many lookups we did to create this iterator.
125 ngramLookups int
126
127 // mutable
128 fileIdx uint32
129 matchCount int
130}
131
132// nextFileIndex returns the smallest index j of ends such that
133// ends[j] > offset, assuming ends[f] <= offset.
134func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
135 d := uint32(1)
136 for f < uint32(len(ends)) && ends[f] <= offset {
137 if f+d < uint32(len(ends)) && ends[f+d] <= offset {
138 f += d
139 d *= 2
140 } else if d > 1 {
141 d = d/4 + 1
142 } else {
143 f++
144 }
145 }
146 return f
147}
148
149func (i *ngramDocIterator) nextDoc() uint32 {
150 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
151 if i.fileIdx >= uint32(len(i.ends)) {
152 return math.MaxUint32
153 }
154 return i.fileIdx
155}
156
157func (i *ngramDocIterator) String() string {
158 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
159}
160
161func (i *ngramDocIterator) prepare(nextDoc uint32) {
162 var start uint32
163 if nextDoc > 0 {
164 start = i.ends[nextDoc-1]
165 }
166 if start > 0 {
167 i.iter.next(start + i.leftPad - 1)
168 }
169 i.fileIdx = nextDoc
170}
171
172func (i *ngramDocIterator) updateStats(s *zoekt.Stats) {
173 i.iter.updateStats(s)
174 s.NgramMatches += i.matchCount
175 s.NgramLookups += i.ngramLookups
176 i.matchCount = 0
177 i.ngramLookups = 0
178}
179
180func (i *ngramDocIterator) candidates() []*candidateMatch {
181 if i.fileIdx >= uint32(len(i.ends)) {
182 return nil
183 }
184
185 var fileStart uint32
186 if i.fileIdx > 0 {
187 fileStart = i.ends[i.fileIdx-1]
188 }
189 fileEnd := i.ends[i.fileIdx]
190
191 var candidates []*candidateMatch
192 for {
193 p1 := i.iter.first()
194 if p1 == math.MaxUint32 || p1 >= i.ends[i.fileIdx] {
195 break
196 }
197 i.iter.next(p1)
198
199 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
200 continue
201 }
202
203 candidates = append(candidates, &candidateMatch{
204 file: uint32(i.fileIdx),
205 runeOffset: p1 - fileStart - i.leftPad,
206 })
207 }
208 i.matchCount += len(candidates)
209 return candidates
210}