matchiter.go at db067d1294f3ac13d77c19c8259baa3a8c6c1b6a · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchiter.go
at db067d1294f3ac13d77c19c8259baa3a8c6c1b6a 4.9 kB View raw
Keegan Carruthers-Smith matchiter: capture metric NgramLookups (#608) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20)
 21
 22// candidateMatch is a candidate match for a substring.
 23type candidateMatch struct {
 24	caseSensitive bool
 25	fileName      bool
 26	symbol        bool
 27	symbolIdx     uint32
 28
 29	substrBytes   []byte
 30	substrLowered []byte
 31
 32	file uint32
 33
 34	// Offsets are relative to the start of the filename or file contents.
 35	runeOffset  uint32
 36	byteOffset  uint32
 37	byteMatchSz uint32
 38}
 39
 40// Matches content against the substring, and populates byteMatchSz on success
 41func (m *candidateMatch) matchContent(content []byte) bool {
 42	if m.caseSensitive {
 43		comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
 44
 45		m.byteMatchSz = uint32(len(m.substrBytes))
 46		return comp
 47	} else {
 48		// It is tempting to try a simple ASCII based
 49		// comparison if possible, but we need more
 50		// information. Simple ASCII chars have unicode upper
 51		// case variants (the ASCII 'k' has the Kelvin symbol
 52		// as upper case variant). We can only degrade to
 53		// ASCII if we are sure that both the corpus and the
 54		// query is ASCII only
 55		sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 56		m.byteMatchSz = uint32(sz)
 57		return ok
 58	}
 59}
 60
 61// matchIterator is a docIterator that produces candidateMatches for a given document
 62type matchIterator interface {
 63	docIterator
 64
 65	candidates() []*candidateMatch
 66
 67	// updateStats is called twice. After matchtree construction and after
 68	// searching is done. Implementations must take care to not report
 69	// statistics twice.
 70	updateStats(*Stats)
 71}
 72
 73// noMatchTree is both matchIterator and matchTree that matches nothing.
 74type noMatchTree struct {
 75	Why string
 76
 77	// Stats captures the work done to create the noMatchTree.
 78	Stats Stats
 79}
 80
 81func (t *noMatchTree) String() string {
 82	return fmt.Sprintf("not(%q)", t.Why)
 83}
 84
 85func (t *noMatchTree) candidates() []*candidateMatch {
 86	return nil
 87}
 88
 89func (t *noMatchTree) nextDoc() uint32 {
 90	return maxUInt32
 91}
 92
 93func (t *noMatchTree) prepare(uint32) {}
 94
 95func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 96	return false, true
 97}
 98
 99func (t *noMatchTree) updateStats(s *Stats) {
100	s.Add(t.Stats)
101	t.Stats = Stats{}
102}
103
104func (m *candidateMatch) String() string {
105	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
106}
107
108type ngramDocIterator struct {
109	leftPad  uint32
110	rightPad uint32
111
112	iter hitIterator
113	ends []uint32
114
115	// ngramLookups is how many lookups we did to create this iterator.
116	ngramLookups int
117
118	// mutable
119	fileIdx    uint32
120	matchCount int
121}
122
123// nextFileIndex returns the smallest index j of ends such that
124// ends[j] > offset, assuming ends[f] <= offset.
125func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
126	d := uint32(1)
127	for f < uint32(len(ends)) && ends[f] <= offset {
128		if f+d < uint32(len(ends)) && ends[f+d] <= offset {
129			f += d
130			d *= 2
131		} else if d > 1 {
132			d = d/4 + 1
133		} else {
134			f++
135		}
136	}
137	return f
138}
139
140func (i *ngramDocIterator) nextDoc() uint32 {
141	i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
142	if i.fileIdx >= uint32(len(i.ends)) {
143		return maxUInt32
144	}
145	return i.fileIdx
146}
147
148func (i *ngramDocIterator) String() string {
149	return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
150}
151
152func (i *ngramDocIterator) prepare(nextDoc uint32) {
153	var start uint32
154	if nextDoc > 0 {
155		start = i.ends[nextDoc-1]
156	}
157	if start > 0 {
158		i.iter.next(start + i.leftPad - 1)
159	}
160	i.fileIdx = nextDoc
161}
162
163func (i *ngramDocIterator) updateStats(s *Stats) {
164	i.iter.updateStats(s)
165	s.NgramMatches += i.matchCount
166	s.NgramLookups += i.ngramLookups
167	i.matchCount = 0
168	i.ngramLookups = 0
169}
170
171func (i *ngramDocIterator) candidates() []*candidateMatch {
172	if i.fileIdx >= uint32(len(i.ends)) {
173		return nil
174	}
175
176	var fileStart uint32
177	if i.fileIdx > 0 {
178		fileStart = i.ends[i.fileIdx-1]
179	}
180	fileEnd := i.ends[i.fileIdx]
181
182	var candidates []*candidateMatch
183	for {
184		p1 := i.iter.first()
185		if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
186			break
187		}
188		i.iter.next(p1)
189
190		if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
191			continue
192		}
193
194		candidates = append(candidates, &candidateMatch{
195			file:       uint32(i.fileIdx),
196			runeOffset: p1 - fileStart - i.leftPad,
197		})
198	}
199	i.matchCount += len(candidates)
200	return candidates
201}
Configure Feed

Configure Feed