indexdata.go at 1d71fd02269063322b1636affdbe8d9d45ffce68 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexdata.go
at 1d71fd02269063322b1636affdbe8d9d45ffce68 14 kB View raw
Keegan Carruthers-Smith maximise distance between ngrams (#618) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"unicode/utf8"
 25
 26	"github.com/sourcegraph/zoekt/query"
 27	"golang.org/x/exp/slices"
 28)
 29
 30// indexData holds the pattern-independent data that we have to have
 31// in memory to search. Most of the memory is taken up by the ngram =>
 32// offset index.
 33type indexData struct {
 34	symbols symbolData
 35
 36	file IndexFile
 37
 38	ngrams btreeIndex
 39
 40	newlinesStart uint32
 41	newlinesIndex []uint32
 42
 43	docSectionsStart uint32
 44	docSectionsIndex []uint32
 45
 46	runeDocSections    []DocumentSection
 47	runeDocSectionsRaw []byte
 48
 49	// rune offset=>byte offset mapping, relative to the start of the content corpus
 50	runeOffsets runeOffsetMap
 51
 52	// offsets of file contents; includes end of last file
 53	boundariesStart uint32
 54	boundaries      []uint32
 55
 56	// rune offsets for the file content boundaries
 57	fileEndRunes []uint32
 58
 59	fileNameContent []byte
 60	fileNameIndex   []uint32
 61	fileNameNgrams  btreeIndex
 62
 63	// fileEndSymbol[i] is the index of the first symbol for document i.
 64	fileEndSymbol []uint32
 65
 66	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 67	fileNameRuneOffsets runeOffsetMap
 68
 69	// rune offsets for the file name boundaries
 70	fileNameEndRunes []uint32
 71
 72	fileBranchMasks []uint64
 73
 74	// mask (power of 2) => name
 75	branchNames []map[uint]string
 76
 77	// name => mask (power of 2)
 78	branchIDs []map[string]uint
 79
 80	metaData     IndexMetadata
 81	repoMetaData []Repository
 82
 83	subRepos     []uint32
 84	subRepoPaths [][]string
 85
 86	// Checksums for all the files, at 8-byte intervals
 87	checksums []byte
 88
 89	// languages for all the files.
 90	languages []byte
 91
 92	// inverse of LanguageMap in metaData
 93	languageMap map[uint16]string
 94
 95	repoListEntry []RepoListEntry
 96
 97	// repository indexes for all the files
 98	repos []uint16
 99
100	// Experimental: docID => rank vec
101	ranks [][]float64
102
103	// rawConfigMasks contains the encoded RawConfig for each repository
104	rawConfigMasks []uint8
105}
106
107type symbolData struct {
108	// symContent stores Symbol.Sym and Symbol.Parent.
109	// TODO we don't need to store Symbol.Sym.
110	symContent []byte
111	symIndex   []byte
112	// symKindContent is an enum of sym.Kind and sym.ParentKind
113	symKindContent []byte
114	symKindIndex   []uint32
115	// symMetadata is [4]uint32 0 Kind Parent ParentKind
116	symMetaData []byte
117}
118
119func uint32SliceAt(a []byte, n uint32) uint32 {
120	return binary.BigEndian.Uint32(a[n*4:])
121}
122
123func uint32SliceLen(a []byte) uint32 {
124	return uint32(len(a) / 4)
125}
126
127// parent returns index i of the parent enum
128func (d *symbolData) parent(i uint32) []byte {
129	delta := uint32SliceAt(d.symIndex, 0)
130	start := uint32SliceAt(d.symIndex, i) - delta
131	var end uint32
132	if i+1 == uint32SliceLen(d.symIndex) {
133		end = uint32(len(d.symContent))
134	} else {
135		end = uint32SliceAt(d.symIndex, i+1) - delta
136	}
137	return d.symContent[start:end]
138}
139
140// kind returns index i of the kind enum
141func (d *symbolData) kind(i uint32) []byte {
142	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
143}
144
145// data returns the symbol at index i
146func (d *symbolData) data(i uint32) *Symbol {
147	size := uint32(4 * 4) // 4 uint32s
148	offset := i * size
149	if offset >= uint32(len(d.symMetaData)) {
150		return nil
151	}
152
153	metadata := d.symMetaData[offset : offset+size]
154	sym := &Symbol{}
155	key := uint32SliceAt(metadata, 1)
156	sym.Kind = string(d.kind(key))
157	key = uint32SliceAt(metadata, 2)
158	sym.Parent = string(d.parent(key))
159	key = uint32SliceAt(metadata, 3)
160	sym.ParentKind = string(d.kind(key))
161	return sym
162}
163
164func (d *indexData) getChecksum(idx uint32) []byte {
165	start := crc64.Size * idx
166	return d.checksums[start : start+crc64.Size]
167}
168
169func (d *indexData) getLanguage(idx uint32) uint16 {
170	if d.metaData.IndexFeatureVersion < 12 {
171		// older zoekt files had 8-bit language entries
172		return uint16(d.languages[idx])
173	}
174	// newer zoekt files have 16-bit language entries
175	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
176}
177
178// calculates stats for files in the range [start, end).
179func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
180	if start >= end {
181		// An empty shard for an empty repository.
182		return RepoStats{
183			Shards: 1,
184		}
185	}
186
187	bytesContent := d.boundaries[end] - d.boundaries[start]
188	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
189	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
190
191	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
192	// that we don't change anything, except we have
193	// []Repository. Alternatively, things we can divide up we do (like
194	// here). Right now I don't like that these numbers are not true, especially
195	// after aggregation. For now I will move forward with this until we can
196	// chat more.
197	return RepoStats{
198		ContentBytes: int64(bytesContent) + int64(bytesFN),
199		Documents:    int(end - start),
200		// CR keegan for stefan: our shard count is going to go out of whack,
201		// since we will aggregate these. So we will report more shards than are
202		// present on disk. What should we do?
203		Shards: 1,
204
205		// Sourcegraph specific
206		NewLinesCount:              count,
207		DefaultBranchNewLinesCount: defaultCount,
208		OtherBranchesNewLinesCount: otherCount,
209	}
210}
211
212func (d *indexData) calculateStats() error {
213	d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
214	var start, end uint32
215
216	for repoID, md := range d.repoMetaData {
217		// determine the file range for repo i
218		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
219			end++
220		}
221		if start < end && d.repos[start] != uint16(repoID) {
222			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
223		}
224
225		d.repoListEntry = append(d.repoListEntry, RepoListEntry{
226			Repository:    md,
227			IndexMetadata: d.metaData,
228			Stats:         d.calculateStatsForFileRange(start, end),
229		})
230		start = end
231	}
232
233	// All repos in a compound shard share memoryUse. So we average out the
234	// memoryUse per shard in our reporting. This has the benefit that when you
235	// aggregate the IndexBytes you get back the actual memoryUse.
236	//
237	// TODO take into account tombstones for aggregation. Even better, adjust
238	// API to be shard centric not repo centric.
239	if len(d.repoListEntry) > 0 {
240		indexBytes := d.memoryUse()
241		indexBytesChunk := indexBytes / len(d.repoListEntry)
242		for i := range d.repoListEntry {
243			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
244			indexBytes -= indexBytesChunk
245		}
246		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
247	}
248
249	return nil
250}
251
252// calculateNewLinesStats computes some Sourcegraph specific statistics for files
253// in the range [start, end). These are not as efficient to calculate as the
254// normal statistics. We experimentally measured about a 10% slower shard load
255// time. However, we find these values very useful to track and computing them
256// outside of load time introduces a lot of complexity.
257func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
258	for i := start; i < end; i++ {
259		// branchMask is a bitmask of the branches for a document. Zoekt by
260		// convention represents the default branch as the lowest bit.
261		branchMask := d.fileBranchMasks[i]
262		isDefault := (branchMask & 1) == 1
263		others := uint64(bits.OnesCount64(branchMask >> 1))
264
265		// this is readNewlines but only reading the size of each section which
266		// corresponds to the number of newlines.
267		sec := simpleSection{
268			off: d.newlinesStart + d.newlinesIndex[i],
269			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
270		}
271		// We are only reading the first varint which is the size. So we don't
272		// need to read more than MaxVarintLen64 bytes.
273		if sec.sz > binary.MaxVarintLen64 {
274			sec.sz = binary.MaxVarintLen64
275		}
276		blob, err := d.readSectionBlob(sec)
277		if err != nil {
278			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
279			continue
280		}
281		sz, _ := binary.Uvarint(blob)
282
283		count += sz
284		if isDefault {
285			defaultCount += sz
286		}
287		otherCount += (others * sz)
288	}
289
290	return
291}
292
293func (d *indexData) String() string {
294	return fmt.Sprintf("shard(%s)", d.file.Name())
295}
296
297// calculates an approximate size of indexData in memory in bytes.
298func (d *indexData) memoryUse() int {
299	sz := 0
300	for _, a := range [][]uint32{
301		d.newlinesIndex, d.docSectionsIndex,
302		d.boundaries, d.fileNameIndex,
303		d.fileEndRunes, d.fileNameEndRunes,
304		d.fileEndSymbol, d.symbols.symKindIndex,
305		d.subRepos,
306	} {
307		sz += 4 * len(a)
308	}
309	sz += d.runeOffsets.sizeBytes()
310	sz += d.fileNameRuneOffsets.sizeBytes()
311	sz += len(d.languages)
312	sz += len(d.checksums)
313	sz += 2 * len(d.repos)
314	if len(d.ranks) > 0 {
315		sz += 8 * len(d.ranks) * len(d.ranks[0])
316	}
317	sz += 8 * len(d.runeDocSections)
318	sz += 8 * len(d.fileBranchMasks)
319	sz += d.ngrams.SizeBytes()
320	sz += d.fileNameNgrams.SizeBytes()
321	return sz
322}
323
324const maxUInt32 = 0xffffffff
325
326func min2Index(xs []uint32) (idx0, idx1 int) {
327	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
328	for i, x := range xs {
329		if x <= min0 {
330			idx0, idx1 = i, idx0
331			min0, min1 = x, min0
332		} else if x <= min1 {
333			idx1 = i
334			min1 = x
335		}
336	}
337	return
338}
339
340// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
341// the distance iterator. If they have the same frequency, we maximise the
342// distance between them. first will always have a smaller index than last.
343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
344	firstI, lastI := min2Index(frequencies)
345	// If the frequencies are equal lets maximise distance in the query
346	// string. This optimization normally triggers for long repeated trigrams
347	// in a string, eg a query like "AAAAA..."
348	if frequencies[firstI] == frequencies[lastI] {
349		for i, freq := range frequencies {
350			if freq != frequencies[firstI] {
351				continue
352			}
353			if ngramOffs[i].index < ngramOffs[firstI].index {
354				firstI = i
355			}
356			if ngramOffs[i].index > ngramOffs[lastI].index {
357				lastI = i
358			}
359		}
360	}
361	first = ngramOffs[firstI]
362	last = ngramOffs[lastI]
363	// Ensure first appears before last to make distance logic below clean.
364	if first.index > last.index {
365		last, first = first, last
366	}
367	return first, last
368}
369
370func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
371	if filename {
372		return data.fileNameNgrams.Get(ng).sz
373	}
374	return data.ngrams.Get(ng).sz
375}
376
377type ngramIterationResults struct {
378	matchIterator
379
380	caseSensitive bool
381	fileName      bool
382	substrBytes   []byte
383	substrLowered []byte
384}
385
386func (r *ngramIterationResults) String() string {
387	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
388}
389
390func (r *ngramIterationResults) candidates() []*candidateMatch {
391	cs := r.matchIterator.candidates()
392	for _, c := range cs {
393		c.caseSensitive = r.caseSensitive
394		c.fileName = r.fileName
395		c.substrBytes = r.substrBytes
396		c.substrLowered = r.substrLowered
397	}
398	return cs
399}
400
401func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
402	str := query.Pattern
403
404	// Find the 2 least common ngrams from the string.
405	ngramOffs := splitNGrams([]byte(query.Pattern))
406
407	// protect against accidental searching of empty strings
408	if len(ngramOffs) == 0 {
409		return nil, errors.New("iterateNgrams needs non empty string")
410	}
411
412	// PERF: Sort to increase the chances adjacent checks are in the same btree
413	// bucket (which can cause disk IO).
414	slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
415		return a.ngram < b.ngram
416	})
417	frequencies := make([]uint32, 0, len(ngramOffs))
418	ngramLookups := 0
419	for _, o := range ngramOffs {
420		var freq uint32
421		if query.CaseSensitive {
422			freq = d.ngramFrequency(o.ngram, query.FileName)
423			ngramLookups++
424		} else {
425			for _, v := range generateCaseNgrams(o.ngram) {
426				freq += d.ngramFrequency(v, query.FileName)
427				ngramLookups++
428			}
429		}
430
431		if freq == 0 {
432			return &ngramIterationResults{
433				matchIterator: &noMatchTree{
434					Why: "freq=0",
435					Stats: Stats{
436						NgramLookups: ngramLookups,
437					},
438				},
439			}, nil
440		}
441
442		frequencies = append(frequencies, freq)
443	}
444
445	// first and last are now the smallest trigram posting lists to iterate
446	// through.
447	first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
448
449	iter := &ngramDocIterator{
450		leftPad:      first.index,
451		rightPad:     uint32(utf8.RuneCountInString(str)) - first.index,
452		ngramLookups: ngramLookups,
453	}
454	if query.FileName {
455		iter.ends = d.fileNameEndRunes
456	} else {
457		iter.ends = d.fileEndRunes
458	}
459
460	if first != last {
461		runeDist := last.index - first.index
462		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
463		if err != nil {
464			return nil, err
465		}
466
467		iter.iter = i
468	} else {
469		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
470		if err != nil {
471			return nil, err
472		}
473		iter.iter = hitIter
474	}
475
476	patBytes := []byte(query.Pattern)
477	lowerPatBytes := toLower(patBytes)
478
479	return &ngramIterationResults{
480		matchIterator: iter,
481		caseSensitive: query.CaseSensitive,
482		fileName:      query.FileName,
483		substrBytes:   patBytes,
484		substrLowered: lowerPatBytes,
485	}, nil
486}
487
488func (d *indexData) fileName(i uint32) []byte {
489	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
490}
491
492func (d *indexData) numDocs() uint32 {
493	return uint32(len(d.fileBranchMasks))
494}
495
496func (s *indexData) Close() {
497	s.file.Close()
498}
499
500const (
501	rawConfigYes = 1
502	rawConfigNo  = 2
503)
504
505// encodeRawConfig encodes a rawConfig map into a uint8 mask.
506func encodeRawConfig(rawConfig map[string]string) uint8 {
507	var encoded uint8
508	for i, f := range []string{"public", "fork", "archived"} {
509		var e uint8
510		v, ok := rawConfig[f]
511		if ok && v == "1" {
512			e |= rawConfigYes
513		} else {
514			e |= rawConfigNo
515		}
516		encoded = encoded | e<<(2*i)
517	}
518	return encoded
519}
Configure Feed

Configure Feed