matchiter.go at 4448a4594e21989eddd12df08a44c82b22f80c74 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchiter.go
at 4448a4594e21989eddd12df08a44c82b22f80c74 5.0 kB View raw
Keegan Carruthers-Smith score: introduce query.Boost to scale score (#728) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"bytes"
 19	"fmt"
 20)
 21
 22// candidateMatch is a candidate match for a substring.
 23//
 24// Note: a lot of these can be in memory, so think about fieldalignment when
 25// modify the fields of this structure.
 26type candidateMatch struct {
 27	substrBytes   []byte
 28	substrLowered []byte
 29
 30	scoreWeight float64
 31
 32	file      uint32
 33	symbolIdx uint32
 34
 35	// Offsets are relative to the start of the filename or file contents.
 36	runeOffset  uint32
 37	byteOffset  uint32
 38	byteMatchSz uint32
 39
 40	// bools at end for struct field alignment
 41	caseSensitive bool
 42	fileName      bool
 43	symbol        bool
 44}
 45
 46// Matches content against the substring, and populates byteMatchSz on success
 47func (m *candidateMatch) matchContent(content []byte) bool {
 48	if m.caseSensitive {
 49		comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])
 50
 51		m.byteMatchSz = uint32(len(m.substrBytes))
 52		return comp
 53	} else {
 54		// It is tempting to try a simple ASCII based
 55		// comparison if possible, but we need more
 56		// information. Simple ASCII chars have unicode upper
 57		// case variants (the ASCII 'k' has the Kelvin symbol
 58		// as upper case variant). We can only degrade to
 59		// ASCII if we are sure that both the corpus and the
 60		// query is ASCII only
 61		sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:])
 62		m.byteMatchSz = uint32(sz)
 63		return ok
 64	}
 65}
 66
 67// matchIterator is a docIterator that produces candidateMatches for a given document
 68type matchIterator interface {
 69	docIterator
 70
 71	candidates() []*candidateMatch
 72
 73	// updateStats is called twice. After matchtree construction and after
 74	// searching is done. Implementations must take care to not report
 75	// statistics twice.
 76	updateStats(*Stats)
 77}
 78
 79// noMatchTree is both matchIterator and matchTree that matches nothing.
 80type noMatchTree struct {
 81	Why string
 82
 83	// Stats captures the work done to create the noMatchTree.
 84	Stats Stats
 85}
 86
 87func (t *noMatchTree) String() string {
 88	return fmt.Sprintf("not(%q)", t.Why)
 89}
 90
 91func (t *noMatchTree) candidates() []*candidateMatch {
 92	return nil
 93}
 94
 95func (t *noMatchTree) nextDoc() uint32 {
 96	return maxUInt32
 97}
 98
 99func (t *noMatchTree) prepare(uint32) {}
100
101func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
102	return matchesNone
103}
104
105func (t *noMatchTree) updateStats(s *Stats) {
106	s.Add(t.Stats)
107	t.Stats = Stats{}
108}
109
110func (m *candidateMatch) String() string {
111	return fmt.Sprintf("%d:%d", m.file, m.runeOffset)
112}
113
114type ngramDocIterator struct {
115	leftPad  uint32
116	rightPad uint32
117
118	iter hitIterator
119	ends []uint32
120
121	// ngramLookups is how many lookups we did to create this iterator.
122	ngramLookups int
123
124	// mutable
125	fileIdx    uint32
126	matchCount int
127}
128
129// nextFileIndex returns the smallest index j of ends such that
130// ends[j] > offset, assuming ends[f] <= offset.
131func nextFileIndex(offset, f uint32, ends []uint32) uint32 {
132	d := uint32(1)
133	for f < uint32(len(ends)) && ends[f] <= offset {
134		if f+d < uint32(len(ends)) && ends[f+d] <= offset {
135			f += d
136			d *= 2
137		} else if d > 1 {
138			d = d/4 + 1
139		} else {
140			f++
141		}
142	}
143	return f
144}
145
146func (i *ngramDocIterator) nextDoc() uint32 {
147	i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends)
148	if i.fileIdx >= uint32(len(i.ends)) {
149		return maxUInt32
150	}
151	return i.fileIdx
152}
153
154func (i *ngramDocIterator) String() string {
155	return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter)
156}
157
158func (i *ngramDocIterator) prepare(nextDoc uint32) {
159	var start uint32
160	if nextDoc > 0 {
161		start = i.ends[nextDoc-1]
162	}
163	if start > 0 {
164		i.iter.next(start + i.leftPad - 1)
165	}
166	i.fileIdx = nextDoc
167}
168
169func (i *ngramDocIterator) updateStats(s *Stats) {
170	i.iter.updateStats(s)
171	s.NgramMatches += i.matchCount
172	s.NgramLookups += i.ngramLookups
173	i.matchCount = 0
174	i.ngramLookups = 0
175}
176
177func (i *ngramDocIterator) candidates() []*candidateMatch {
178	if i.fileIdx >= uint32(len(i.ends)) {
179		return nil
180	}
181
182	var fileStart uint32
183	if i.fileIdx > 0 {
184		fileStart = i.ends[i.fileIdx-1]
185	}
186	fileEnd := i.ends[i.fileIdx]
187
188	var candidates []*candidateMatch
189	for {
190		p1 := i.iter.first()
191		if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] {
192			break
193		}
194		i.iter.next(p1)
195
196		if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd {
197			continue
198		}
199
200		candidates = append(candidates, &candidateMatch{
201			file:       uint32(i.fileIdx),
202			runeOffset: p1 - fileStart - i.leftPad,
203		})
204	}
205	i.matchCount += len(candidates)
206	return candidates
207}
Configure Feed

Configure Feed