matchiter.go at 92cb8852b4d41e0e7dd00ce2d3522a4e7164ca22 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchiter.go
at 92cb8852b4d41e0e7dd00ce2d3522a4e7164ca22 5.0 kB View raw
Keegan Carruthers-Smith all: adjust field order for match structs (#716) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20)
 21
 22// candidateMatch is a candidate match for a substring.
 23//
 24// Note: a lot of these can be in memory, so think about fieldalignment when
 25// modify the fields of this structure.
 26type candidateMatch struct {
 27	substrBytes   []byte
 28	substrLowered []byte
 29
 30	file      uint32
 31	symbolIdx uint32
 32
 33	// Offsets are relative to the start of the filename or file contents.
 34	runeOffset  uint32
 35	byteOffset  uint32
 36	byteMatchSz uint32
 37
 38	// bools at end for struct field alignment
 39	caseSensitive bool
 40	fileName      bool
 41	symbol        bool
 42}
 43
 44// Matches content against the substring, and populates byteMatchSz on success
 45func (m *candidateMatch) matchContent(content []byte) bool {
 46	if m.caseSensitive {
 47		comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
 48
 49		m.byteMatchSz = uint32(len(m.substrBytes))
 50		return comp
 51	} else {
 52		// It is tempting to try a simple ASCII based
 53		// comparison if possible, but we need more
 54		// information. Simple ASCII chars have unicode upper
 55		// case variants (the ASCII 'k' has the Kelvin symbol
 56		// as upper case variant). We can only degrade to
 57		// ASCII if we are sure that both the corpus and the
 58		// query is ASCII only
 59		sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 60		m.byteMatchSz = uint32(sz)
 61		return ok
 62	}
 63}
 64
 65// matchIterator is a docIterator that produces candidateMatches for a given document
 66type matchIterator interface {
 67	docIterator
 68
 69	candidates() []*candidateMatch
 70
 71	// updateStats is called twice. After matchtree construction and after
 72	// searching is done. Implementations must take care to not report
 73	// statistics twice.
 74	updateStats(*Stats)
 75}
 76
 77// noMatchTree is both matchIterator and matchTree that matches nothing.
 78type noMatchTree struct {
 79	Why string
 80
 81	// Stats captures the work done to create the noMatchTree.
 82	Stats Stats
 83}
 84
 85func (t *noMatchTree) String() string {
 86	return fmt.Sprintf("not(%q)", t.Why)
 87}
 88
 89func (t *noMatchTree) candidates() []*candidateMatch {
 90	return nil
 91}
 92
 93func (t *noMatchTree) nextDoc() uint32 {
 94	return maxUInt32
 95}
 96
 97func (t *noMatchTree) prepare(uint32) {}
 98
 99func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
100	return matchesNone
101}
102
103func (t *noMatchTree) updateStats(s *Stats) {
104	s.Add(t.Stats)
105	t.Stats = Stats{}
106}
107
108func (m *candidateMatch) String() string {
109	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
110}
111
112type ngramDocIterator struct {
113	leftPad  uint32
114	rightPad uint32
115
116	iter hitIterator
117	ends []uint32
118
119	// ngramLookups is how many lookups we did to create this iterator.
120	ngramLookups int
121
122	// mutable
123	fileIdx    uint32
124	matchCount int
125}
126
127// nextFileIndex returns the smallest index j of ends such that
128// ends[j] > offset, assuming ends[f] <= offset.
129func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
130	d := uint32(1)
131	for f < uint32(len(ends)) && ends[f] <= offset {
132		if f+d < uint32(len(ends)) && ends[f+d] <= offset {
133			f += d
134			d *= 2
135		} else if d > 1 {
136			d = d/4 + 1
137		} else {
138			f++
139		}
140	}
141	return f
142}
143
144func (i *ngramDocIterator) nextDoc() uint32 {
145	i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
146	if i.fileIdx >= uint32(len(i.ends)) {
147		return maxUInt32
148	}
149	return i.fileIdx
150}
151
152func (i *ngramDocIterator) String() string {
153	return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
154}
155
156func (i *ngramDocIterator) prepare(nextDoc uint32) {
157	var start uint32
158	if nextDoc > 0 {
159		start = i.ends[nextDoc-1]
160	}
161	if start > 0 {
162		i.iter.next(start + i.leftPad - 1)
163	}
164	i.fileIdx = nextDoc
165}
166
167func (i *ngramDocIterator) updateStats(s *Stats) {
168	i.iter.updateStats(s)
169	s.NgramMatches += i.matchCount
170	s.NgramLookups += i.ngramLookups
171	i.matchCount = 0
172	i.ngramLookups = 0
173}
174
175func (i *ngramDocIterator) candidates() []*candidateMatch {
176	if i.fileIdx >= uint32(len(i.ends)) {
177		return nil
178	}
179
180	var fileStart uint32
181	if i.fileIdx > 0 {
182		fileStart = i.ends[i.fileIdx-1]
183	}
184	fileEnd := i.ends[i.fileIdx]
185
186	var candidates []*candidateMatch
187	for {
188		p1 := i.iter.first()
189		if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
190			break
191		}
192		i.iter.next(p1)
193
194		if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
195			continue
196		}
197
198		candidates = append(candidates, &candidateMatch{
199			file:       uint32(i.fileIdx),
200			runeOffset: p1 - fileStart - i.leftPad,
201		})
202	}
203	i.matchCount += len(candidates)
204	return candidates
205}
Configure Feed

Configure Feed