index/indexdata.go at cd59bed0f02585a1aaacae74e4f3c2b7cc7de65c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / indexdata.go
at cd59bed0f02585a1aaacae74e4f3c2b7cc7de65c 15 kB View raw
John Mason Cache docMatchTree results for Meta conditions (#982) 9mo ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math"
 24	"math/bits"
 25	"slices"
 26	"unicode/utf8"
 27
 28	"github.com/sourcegraph/zoekt"
 29	"github.com/sourcegraph/zoekt/query"
 30)
 31
 32// indexData holds the pattern-independent data that we have to have
 33// in memory to search. Most of the memory is taken up by the ngram =>
 34// offset index.
 35type indexData struct {
 36	symbols symbolData
 37
 38	file IndexFile
 39
 40	contentNgrams btreeIndex
 41
 42	newlinesStart uint32
 43	newlinesIndex []uint32
 44
 45	docSectionsStart uint32
 46	docSectionsIndex []uint32
 47
 48	runeDocSections []DocumentSection
 49
 50	// rune offset=>byte offset mapping, relative to the start of the content corpus
 51	runeOffsets runeOffsetMap
 52
 53	// offsets of file contents; includes end of last file
 54	boundariesStart uint32
 55	boundaries      []uint32
 56
 57	// rune offsets for the file content boundaries
 58	fileEndRunes []uint32
 59
 60	fileNameContent []byte
 61	fileNameIndex   []uint32
 62	fileNameNgrams  btreeIndex
 63
 64	// fileEndSymbol[i] is the index of the first symbol for document i.
 65	fileEndSymbol []uint32
 66
 67	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 68	fileNameRuneOffsets runeOffsetMap
 69
 70	// rune offsets for the file name boundaries
 71	fileNameEndRunes []uint32
 72
 73	fileBranchMasks []uint64
 74
 75	// mask (power of 2) => name
 76	branchNames []map[uint]string
 77
 78	// name => mask (power of 2)
 79	branchIDs []map[string]uint
 80
 81	metaData     zoekt.IndexMetadata
 82	repoMetaData []zoekt.Repository
 83
 84	subRepos     []uint32
 85	subRepoPaths [][]string
 86
 87	// Checksums for all the files, at 8-byte intervals
 88	checksums []byte
 89
 90	// languages for all the files.
 91	languages []byte
 92
 93	// inverse of LanguageMap in metaData
 94	languageMap map[uint16]string
 95
 96	// file categories for all the files.
 97	categories []byte
 98
 99	repoListEntry []zoekt.RepoListEntry
100
101	// repository indexes for all the files
102	repos []uint16
103
104	// rawConfigMasks contains the encoded RawConfig for each repository
105	rawConfigMasks []uint8
106
107	// Cache for docMatchTree objects
108	docMatchTreeCache *docMatchTreeCache
109}
110
111type symbolData struct {
112	// symContent stores Symbol.Sym and Symbol.Parent.
113	// TODO we don't need to store Symbol.Sym.
114	symContent []byte
115	symIndex   []byte
116	// symKindContent is an enum of sym.Kind and sym.ParentKind
117	symKindContent []byte
118	symKindIndex   []uint32
119	// symMetadata is [4]uint32 0 Kind Parent ParentKind
120	symMetaData []byte
121}
122
123func uint32SliceAt(a []byte, n uint32) uint32 {
124	return binary.BigEndian.Uint32(a[n*4:])
125}
126
127func uint32SliceLen(a []byte) uint32 {
128	return uint32(len(a) / 4)
129}
130
131// parent returns index i of the parent enum
132func (d *symbolData) parent(i uint32) []byte {
133	delta := uint32SliceAt(d.symIndex, 0)
134	start := uint32SliceAt(d.symIndex, i) - delta
135	var end uint32
136	if i+1 == uint32SliceLen(d.symIndex) {
137		end = uint32(len(d.symContent))
138	} else {
139		end = uint32SliceAt(d.symIndex, i+1) - delta
140	}
141	return d.symContent[start:end]
142}
143
144// kind returns index i of the kind enum
145func (d *symbolData) kind(i uint32) []byte {
146	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
147}
148
149// data returns the symbol at index i
150func (d *symbolData) data(i uint32) *zoekt.Symbol {
151	size := uint32(4 * 4) // 4 uint32s
152	offset := i * size
153	if offset >= uint32(len(d.symMetaData)) {
154		return nil
155	}
156
157	metadata := d.symMetaData[offset : offset+size]
158	sym := &zoekt.Symbol{}
159	key := uint32SliceAt(metadata, 1)
160	sym.Kind = string(d.kind(key))
161	key = uint32SliceAt(metadata, 2)
162	sym.Parent = string(d.parent(key))
163	key = uint32SliceAt(metadata, 3)
164	sym.ParentKind = string(d.kind(key))
165	return sym
166}
167
168func (d *indexData) getChecksum(idx uint32) []byte {
169	start := crc64.Size * idx
170	return d.checksums[start : start+crc64.Size]
171}
172
173func (d *indexData) getLanguage(idx uint32) uint16 {
174	if d.metaData.IndexFeatureVersion < 12 {
175		// older zoekt files had 8-bit language entries
176		return uint16(d.languages[idx])
177	}
178	// newer zoekt files have 16-bit language entries
179	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
180}
181
182func (d *indexData) getCategory(idx uint32) FileCategory {
183	if len(d.categories) == 0 {
184		// This means we're reading an older index, so return 'missing'
185		return FileCategoryMissing
186	}
187	category, err := decodeCategory(d.categories[idx])
188	if err != nil {
189		return FileCategoryMissing
190	}
191	return category
192}
193
194// calculates stats for files in the range [start, end).
195func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
196	if start >= end {
197		// An empty shard for an empty repository.
198		return zoekt.RepoStats{
199			Shards: 1,
200		}
201	}
202
203	bytesContent := d.boundaries[end] - d.boundaries[start]
204	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
205	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
206
207	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
208	// that we don't change anything, except we have
209	// []Repository. Alternatively, things we can divide up we do (like
210	// here). Right now I don't like that these numbers are not true, especially
211	// after aggregation. For now I will move forward with this until we can
212	// chat more.
213	return zoekt.RepoStats{
214		ContentBytes: int64(bytesContent) + int64(bytesFN),
215		Documents:    int(end - start),
216		// CR keegan for stefan: our shard count is going to go out of whack,
217		// since we will aggregate these. So we will report more shards than are
218		// present on disk. What should we do?
219		Shards: 1,
220
221		// Sourcegraph specific
222		NewLinesCount:              count,
223		DefaultBranchNewLinesCount: defaultCount,
224		OtherBranchesNewLinesCount: otherCount,
225	}
226}
227
228func (d *indexData) calculateStats() error {
229	d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
230	var start, end uint32
231
232	for repoID, md := range d.repoMetaData {
233		// determine the file range for repo i
234		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
235			end++
236		}
237		if start < end && d.repos[start] != uint16(repoID) {
238			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
239		}
240
241		d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
242			Repository:    md,
243			IndexMetadata: d.metaData,
244			Stats:         d.calculateStatsForFileRange(start, end),
245		})
246		start = end
247	}
248
249	// All repos in a compound shard share memoryUse. So we average out the
250	// memoryUse per shard in our reporting. This has the benefit that when you
251	// aggregate the IndexBytes you get back the actual memoryUse.
252	//
253	// TODO take into account tombstones for aggregation. Even better, adjust
254	// API to be shard centric not repo centric.
255	if len(d.repoListEntry) > 0 {
256		indexBytes := d.memoryUse()
257		indexBytesChunk := indexBytes / len(d.repoListEntry)
258		for i := range d.repoListEntry {
259			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
260			indexBytes -= indexBytesChunk
261		}
262		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
263	}
264
265	return nil
266}
267
268// calculateNewLinesStats computes some Sourcegraph specific statistics for files
269// in the range [start, end). These are not as efficient to calculate as the
270// normal statistics. We experimentally measured about a 10% slower shard load
271// time. However, we find these values very useful to track and computing them
272// outside of load time introduces a lot of complexity.
273func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
274	for i := start; i < end; i++ {
275		// branchMask is a bitmask of the branches for a document. Zoekt by
276		// convention represents the default branch as the lowest bit.
277		branchMask := d.fileBranchMasks[i]
278		isDefault := (branchMask & 1) == 1
279		others := uint64(bits.OnesCount64(branchMask >> 1))
280
281		// this is readNewlines but only reading the size of each section which
282		// corresponds to the number of newlines.
283		sec := simpleSection{
284			off: d.newlinesStart + d.newlinesIndex[i],
285			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
286		}
287		// We are only reading the first varint which is the size. So we don't
288		// need to read more than MaxVarintLen64 bytes.
289		if sec.sz > binary.MaxVarintLen64 {
290			sec.sz = binary.MaxVarintLen64
291		}
292		blob, err := d.readSectionBlob(sec)
293		if err != nil {
294			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
295			continue
296		}
297		sz, _ := binary.Uvarint(blob)
298
299		count += sz
300		if isDefault {
301			defaultCount += sz
302		}
303		otherCount += (others * sz)
304	}
305
306	return
307}
308
309func (d *indexData) String() string {
310	return fmt.Sprintf("shard(%s)", d.file.Name())
311}
312
313// calculates an approximate size of indexData in memory in bytes.
314func (d *indexData) memoryUse() int {
315	sz := 0
316	for _, a := range [][]uint32{
317		d.newlinesIndex, d.docSectionsIndex,
318		d.boundaries, d.fileNameIndex,
319		d.fileEndRunes, d.fileNameEndRunes,
320		d.fileEndSymbol, d.symbols.symKindIndex,
321		d.subRepos,
322	} {
323		sz += 4 * len(a)
324	}
325	sz += d.runeOffsets.sizeBytes()
326	sz += d.fileNameRuneOffsets.sizeBytes()
327	sz += len(d.languages)
328	sz += len(d.checksums)
329	sz += 2 * len(d.repos)
330	sz += 8 * len(d.runeDocSections)
331	sz += 8 * len(d.fileBranchMasks)
332	sz += d.contentNgrams.SizeBytes()
333	sz += d.fileNameNgrams.SizeBytes()
334	return sz
335}
336
337// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
338// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
339// overlapping trigrams to keep their intersection as small as possible.
340//
341// Invariant: first will always have a smaller index than last.
342func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
343	first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
344
345	// If the trigrams are overlapping, then try to shift one to reduce overlap.
346	// This is guaranteed to produce a smaller intersection.
347	if last.index-first.index < ngramSize {
348		newFirstIndex := max(last.index-ngramSize, 0)
349		if newFirstIndex != first.index {
350			first = ngramOffs[indexMap[newFirstIndex]]
351		}
352
353		newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
354		if newLastIndex != last.index {
355			last = ngramOffs[indexMap[newLastIndex]]
356		}
357	}
358	return
359}
360
361func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
362	// Find the two lowest frequency ngrams.
363	idx0, idx1 := 0, 0
364	min0, min1 := uint32(math.MaxUint32), uint32(math.MaxUint32)
365	for i, x := range frequencies {
366		if x <= min0 {
367			idx0, idx1 = i, idx0
368			min0, min1 = x, min0
369		} else if x <= min1 {
370			idx1 = i
371			min1 = x
372		}
373	}
374
375	first = ngramOffs[idx0]
376	last = ngramOffs[idx1]
377
378	// Ensure first appears before last as a helpful invariant.
379	if first.index > last.index {
380		last, first = first, last
381	}
382	return
383}
384
385func (data *indexData) ngrams(filename bool) btreeIndex {
386	if filename {
387		return data.fileNameNgrams
388	}
389	return data.contentNgrams
390}
391
392type ngramIterationResults struct {
393	matchIterator
394
395	caseSensitive bool
396	fileName      bool
397	substrBytes   []byte
398	substrLowered []byte
399}
400
401func (r *ngramIterationResults) String() string {
402	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
403}
404
405func (r *ngramIterationResults) candidates() []*candidateMatch {
406	cs := r.matchIterator.candidates()
407	for _, c := range cs {
408		c.caseSensitive = r.caseSensitive
409		c.fileName = r.fileName
410		c.substrBytes = r.substrBytes
411		c.substrLowered = r.substrLowered
412	}
413	return cs
414}
415
416func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
417	str := query.Pattern
418
419	// Find the 2 least common ngrams from the string.
420	ngramOffs := splitNGrams([]byte(str))
421
422	// protect against accidental searching of empty strings
423	if len(ngramOffs) == 0 {
424		return nil, errors.New("iterateNgrams needs non empty string")
425	}
426
427	// PERF: Sort to increase the chances adjacent checks are in the same btree
428	// bucket (which can cause disk IO).
429	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
430	frequencies := make([]uint32, 0, len(ngramOffs))
431	indexMap := make([]int, len(ngramOffs))
432	ngramLookups := 0
433	ngrams := d.ngrams(query.FileName)
434	for i, o := range ngramOffs {
435		var freq uint32
436		if query.CaseSensitive {
437			freq = ngrams.Get(o.ngram).sz
438			ngramLookups++
439		} else {
440			for _, v := range generateCaseNgrams(o.ngram) {
441				freq += ngrams.Get(v).sz
442				ngramLookups++
443			}
444		}
445
446		if freq == 0 {
447			return &ngramIterationResults{
448				matchIterator: &noMatchTree{
449					Why: "freq=0",
450					Stats: zoekt.Stats{
451						NgramLookups: ngramLookups,
452					},
453				},
454			}, nil
455		}
456
457		frequencies = append(frequencies, freq)
458		indexMap[o.index] = i
459	}
460
461	first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
462
463	iter := &ngramDocIterator{
464		leftPad:      uint32(first.index),
465		rightPad:     uint32(utf8.RuneCountInString(str) - first.index),
466		ngramLookups: ngramLookups,
467	}
468	if query.FileName {
469		iter.ends = d.fileNameEndRunes
470	} else {
471		iter.ends = d.fileEndRunes
472	}
473
474	if first != last {
475		runeDist := uint32(last.index - first.index)
476		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
477		if err != nil {
478			return nil, err
479		}
480
481		iter.iter = i
482	} else {
483		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
484		if err != nil {
485			return nil, err
486		}
487		iter.iter = hitIter
488	}
489
490	patBytes := []byte(query.Pattern)
491	lowerPatBytes := toLower(patBytes)
492
493	return &ngramIterationResults{
494		matchIterator: iter,
495		caseSensitive: query.CaseSensitive,
496		fileName:      query.FileName,
497		substrBytes:   patBytes,
498		substrLowered: lowerPatBytes,
499	}, nil
500}
501
502func (d *indexData) fileName(i uint32) []byte {
503	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
504}
505
506func (d *indexData) numDocs() uint32 {
507	return uint32(len(d.fileBranchMasks))
508}
509
510func (s *indexData) Close() {
511	s.file.Close()
512}
513
514const (
515	rawConfigYes = 1
516	rawConfigNo  = 2
517)
518
519// encodeRawConfig encodes a rawConfig map into a uint8 mask.
520func encodeRawConfig(rawConfig map[string]string) uint8 {
521	var encoded uint8
522	for i, f := range []string{"public", "fork", "archived"} {
523		var e uint8
524		v, ok := rawConfig[f]
525		if ok && v == "1" {
526			e |= rawConfigYes
527		} else {
528			e |= rawConfigNo
529		}
530		encoded = encoded | e<<(2*i)
531	}
532	return encoded
533}
Configure Feed

Configure Feed