index/matchiter.go at cfc4aa0d9ea620143fef099bfed61a88e434c9cd · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchiter.go
at cfc4aa0d9ea620143fef099bfed61a88e434c9cd 5.1 kB View raw
Julie Tibshirani Fix compilation on 32 bit architectures (#936) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"bytes"
 19	"fmt"
 20	"math"
 21
 22	"github.com/sourcegraph/zoekt"
 23)
 24
 25// candidateMatch is a candidate match for a substring.
 26//
 27// Note: a lot of these can be in memory, so think about fieldalignment when
 28// modify the fields of this structure.
 29type candidateMatch struct {
 30	substrBytes   []byte
 31	substrLowered []byte
 32
 33	scoreWeight float64
 34
 35	file      uint32
 36	symbolIdx uint32
 37
 38	// Offsets are relative to the start of the filename or file contents.
 39	runeOffset  uint32
 40	byteOffset  uint32
 41	byteMatchSz uint32
 42
 43	// bools at end for struct field alignment
 44	caseSensitive bool
 45	fileName      bool
 46	symbol        bool
 47}
 48
 49// Matches content against the substring, and populates byteMatchSz on success
 50func (m *candidateMatch) matchContent(content []byte) bool {
 51	if m.caseSensitive {
 52		comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
 53
 54		m.byteMatchSz = uint32(len(m.substrBytes))
 55		return comp
 56	} else {
 57		// It is tempting to try a simple ASCII based
 58		// comparison if possible, but we need more
 59		// information. Simple ASCII chars have unicode upper
 60		// case variants (the ASCII 'k' has the Kelvin symbol
 61		// as upper case variant). We can only degrade to
 62		// ASCII if we are sure that both the corpus and the
 63		// query is ASCII only
 64		sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 65		m.byteMatchSz = uint32(sz)
 66		return ok
 67	}
 68}
 69
 70// matchIterator is a docIterator that produces candidateMatches for a given document
 71type matchIterator interface {
 72	docIterator
 73
 74	candidates() []*candidateMatch
 75
 76	// updateStats is called twice. After matchtree construction and after
 77	// searching is done. Implementations must take care to not report
 78	// statistics twice.
 79	updateStats(*zoekt.Stats)
 80}
 81
 82// noMatchTree is both matchIterator and matchTree that matches nothing.
 83type noMatchTree struct {
 84	Why string
 85
 86	// Stats captures the work done to create the noMatchTree.
 87	Stats zoekt.Stats
 88}
 89
 90func (t *noMatchTree) String() string {
 91	return fmt.Sprintf("not(%q)", t.Why)
 92}
 93
 94func (t *noMatchTree) candidates() []*candidateMatch {
 95	return nil
 96}
 97
 98func (t *noMatchTree) nextDoc() uint32 {
 99	return math.MaxUint32
100}
101
102func (t *noMatchTree) prepare(uint32) {}
103
104func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
105	return matchesNone
106}
107
108func (t *noMatchTree) updateStats(s *zoekt.Stats) {
109	s.Add(t.Stats)
110	t.Stats = zoekt.Stats{}
111}
112
113func (m *candidateMatch) String() string {
114	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
115}
116
117type ngramDocIterator struct {
118	leftPad  uint32
119	rightPad uint32
120
121	iter hitIterator
122	ends []uint32
123
124	// ngramLookups is how many lookups we did to create this iterator.
125	ngramLookups int
126
127	// mutable
128	fileIdx    uint32
129	matchCount int
130}
131
132// nextFileIndex returns the smallest index j of ends such that
133// ends[j] > offset, assuming ends[f] <= offset.
134func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
135	d := uint32(1)
136	for f < uint32(len(ends)) && ends[f] <= offset {
137		if f+d < uint32(len(ends)) && ends[f+d] <= offset {
138			f += d
139			d *= 2
140		} else if d > 1 {
141			d = d/4 + 1
142		} else {
143			f++
144		}
145	}
146	return f
147}
148
149func (i *ngramDocIterator) nextDoc() uint32 {
150	i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
151	if i.fileIdx >= uint32(len(i.ends)) {
152		return math.MaxUint32
153	}
154	return i.fileIdx
155}
156
157func (i *ngramDocIterator) String() string {
158	return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
159}
160
161func (i *ngramDocIterator) prepare(nextDoc uint32) {
162	var start uint32
163	if nextDoc > 0 {
164		start = i.ends[nextDoc-1]
165	}
166	if start > 0 {
167		i.iter.next(start + i.leftPad - 1)
168	}
169	i.fileIdx = nextDoc
170}
171
172func (i *ngramDocIterator) updateStats(s *zoekt.Stats) {
173	i.iter.updateStats(s)
174	s.NgramMatches += i.matchCount
175	s.NgramLookups += i.ngramLookups
176	i.matchCount = 0
177	i.ngramLookups = 0
178}
179
180func (i *ngramDocIterator) candidates() []*candidateMatch {
181	if i.fileIdx >= uint32(len(i.ends)) {
182		return nil
183	}
184
185	var fileStart uint32
186	if i.fileIdx > 0 {
187		fileStart = i.ends[i.fileIdx-1]
188	}
189	fileEnd := i.ends[i.fileIdx]
190
191	var candidates []*candidateMatch
192	for {
193		p1 := i.iter.first()
194		if p1 == math.MaxUint32 || p1 >= i.ends[i.fileIdx] {
195			break
196		}
197		i.iter.next(p1)
198
199		if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
200			continue
201		}
202
203		candidates = append(candidates, &candidateMatch{
204			file:       uint32(i.fileIdx),
205			runeOffset: p1 - fileStart - i.leftPad,
206		})
207	}
208	i.matchCount += len(candidates)
209	return candidates
210}
Configure Feed

Configure Feed