index/matchiter.go at 70647bad55921645285ce3776d8c29928eb205bc · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchiter.go
at 70647bad55921645285ce3776d8c29928eb205bc 5.1 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20
 21	"github.com/sourcegraph/zoekt"
 22)
 23
 24// candidateMatch is a candidate match for a substring.
 25//
 26// Note: a lot of these can be in memory, so think about fieldalignment when
 27// modify the fields of this structure.
 28type candidateMatch struct {
 29	substrBytes   []byte
 30	substrLowered []byte
 31
 32	scoreWeight float64
 33
 34	file      uint32
 35	symbolIdx uint32
 36
 37	// Offsets are relative to the start of the filename or file contents.
 38	runeOffset  uint32
 39	byteOffset  uint32
 40	byteMatchSz uint32
 41
 42	// bools at end for struct field alignment
 43	caseSensitive bool
 44	fileName      bool
 45	symbol        bool
 46}
 47
 48// Matches content against the substring, and populates byteMatchSz on success
 49func (m *candidateMatch) matchContent(content []byte) bool {
 50	if m.caseSensitive {
 51		comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
 52
 53		m.byteMatchSz = uint32(len(m.substrBytes))
 54		return comp
 55	} else {
 56		// It is tempting to try a simple ASCII based
 57		// comparison if possible, but we need more
 58		// information. Simple ASCII chars have unicode upper
 59		// case variants (the ASCII 'k' has the Kelvin symbol
 60		// as upper case variant). We can only degrade to
 61		// ASCII if we are sure that both the corpus and the
 62		// query is ASCII only
 63		sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 64		m.byteMatchSz = uint32(sz)
 65		return ok
 66	}
 67}
 68
 69// matchIterator is a docIterator that produces candidateMatches for a given document
 70type matchIterator interface {
 71	docIterator
 72
 73	candidates() []*candidateMatch
 74
 75	// updateStats is called twice. After matchtree construction and after
 76	// searching is done. Implementations must take care to not report
 77	// statistics twice.
 78	updateStats(*zoekt.Stats)
 79}
 80
 81// noMatchTree is both matchIterator and matchTree that matches nothing.
 82type noMatchTree struct {
 83	Why string
 84
 85	// Stats captures the work done to create the noMatchTree.
 86	Stats zoekt.Stats
 87}
 88
 89func (t *noMatchTree) String() string {
 90	return fmt.Sprintf("not(%q)", t.Why)
 91}
 92
 93func (t *noMatchTree) candidates() []*candidateMatch {
 94	return nil
 95}
 96
 97func (t *noMatchTree) nextDoc() uint32 {
 98	return maxUInt32
 99}
100
101func (t *noMatchTree) prepare(uint32) {}
102
103func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
104	return matchesNone
105}
106
107func (t *noMatchTree) updateStats(s *zoekt.Stats) {
108	s.Add(t.Stats)
109	t.Stats = zoekt.Stats{}
110}
111
112func (m *candidateMatch) String() string {
113	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
114}
115
116type ngramDocIterator struct {
117	leftPad  uint32
118	rightPad uint32
119
120	iter hitIterator
121	ends []uint32
122
123	// ngramLookups is how many lookups we did to create this iterator.
124	ngramLookups int
125
126	// mutable
127	fileIdx    uint32
128	matchCount int
129}
130
131// nextFileIndex returns the smallest index j of ends such that
132// ends[j] > offset, assuming ends[f] <= offset.
133func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
134	d := uint32(1)
135	for f < uint32(len(ends)) && ends[f] <= offset {
136		if f+d < uint32(len(ends)) && ends[f+d] <= offset {
137			f += d
138			d *= 2
139		} else if d > 1 {
140			d = d/4 + 1
141		} else {
142			f++
143		}
144	}
145	return f
146}
147
148func (i *ngramDocIterator) nextDoc() uint32 {
149	i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
150	if i.fileIdx >= uint32(len(i.ends)) {
151		return maxUInt32
152	}
153	return i.fileIdx
154}
155
156func (i *ngramDocIterator) String() string {
157	return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
158}
159
160func (i *ngramDocIterator) prepare(nextDoc uint32) {
161	var start uint32
162	if nextDoc > 0 {
163		start = i.ends[nextDoc-1]
164	}
165	if start > 0 {
166		i.iter.next(start + i.leftPad - 1)
167	}
168	i.fileIdx = nextDoc
169}
170
171func (i *ngramDocIterator) updateStats(s *zoekt.Stats) {
172	i.iter.updateStats(s)
173	s.NgramMatches += i.matchCount
174	s.NgramLookups += i.ngramLookups
175	i.matchCount = 0
176	i.ngramLookups = 0
177}
178
179func (i *ngramDocIterator) candidates() []*candidateMatch {
180	if i.fileIdx >= uint32(len(i.ends)) {
181		return nil
182	}
183
184	var fileStart uint32
185	if i.fileIdx > 0 {
186		fileStart = i.ends[i.fileIdx-1]
187	}
188	fileEnd := i.ends[i.fileIdx]
189
190	var candidates []*candidateMatch
191	for {
192		p1 := i.iter.first()
193		if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
194			break
195		}
196		i.iter.next(p1)
197
198		if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
199			continue
200		}
201
202		candidates = append(candidates, &candidateMatch{
203			file:       uint32(i.fileIdx),
204			runeOffset: p1 - fileStart - i.leftPad,
205		})
206	}
207	i.matchCount += len(candidates)
208	return candidates
209}
Configure Feed

Configure Feed