index/indexdata.go at 261aae37dce6a46cdf1eb669d95d314adc0758e7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / indexdata.go
at 261aae37dce6a46cdf1eb669d95d314adc0758e7 14 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"slices"
 25	"unicode/utf8"
 26
 27	"github.com/sourcegraph/zoekt"
 28	"github.com/sourcegraph/zoekt/query"
 29)
 30
 31// indexData holds the pattern-independent data that we have to have
 32// in memory to search. Most of the memory is taken up by the ngram =>
 33// offset index.
 34type indexData struct {
 35	symbols symbolData
 36
 37	file IndexFile
 38
 39	contentNgrams btreeIndex
 40
 41	newlinesStart uint32
 42	newlinesIndex []uint32
 43
 44	docSectionsStart uint32
 45	docSectionsIndex []uint32
 46
 47	runeDocSections []DocumentSection
 48
 49	// rune offset=>byte offset mapping, relative to the start of the content corpus
 50	runeOffsets runeOffsetMap
 51
 52	// offsets of file contents; includes end of last file
 53	boundariesStart uint32
 54	boundaries      []uint32
 55
 56	// rune offsets for the file content boundaries
 57	fileEndRunes []uint32
 58
 59	fileNameContent []byte
 60	fileNameIndex   []uint32
 61	fileNameNgrams  btreeIndex
 62
 63	// fileEndSymbol[i] is the index of the first symbol for document i.
 64	fileEndSymbol []uint32
 65
 66	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 67	fileNameRuneOffsets runeOffsetMap
 68
 69	// rune offsets for the file name boundaries
 70	fileNameEndRunes []uint32
 71
 72	fileBranchMasks []uint64
 73
 74	// mask (power of 2) => name
 75	branchNames []map[uint]string
 76
 77	// name => mask (power of 2)
 78	branchIDs []map[string]uint
 79
 80	metaData     zoekt.IndexMetadata
 81	repoMetaData []zoekt.Repository
 82
 83	subRepos     []uint32
 84	subRepoPaths [][]string
 85
 86	// Checksums for all the files, at 8-byte intervals
 87	checksums []byte
 88
 89	// languages for all the files.
 90	languages []byte
 91
 92	// inverse of LanguageMap in metaData
 93	languageMap map[uint16]string
 94
 95	repoListEntry []zoekt.RepoListEntry
 96
 97	// repository indexes for all the files
 98	repos []uint16
 99
100	// rawConfigMasks contains the encoded RawConfig for each repository
101	rawConfigMasks []uint8
102}
103
104type symbolData struct {
105	// symContent stores Symbol.Sym and Symbol.Parent.
106	// TODO we don't need to store Symbol.Sym.
107	symContent []byte
108	symIndex   []byte
109	// symKindContent is an enum of sym.Kind and sym.ParentKind
110	symKindContent []byte
111	symKindIndex   []uint32
112	// symMetadata is [4]uint32 0 Kind Parent ParentKind
113	symMetaData []byte
114}
115
116func uint32SliceAt(a []byte, n uint32) uint32 {
117	return binary.BigEndian.Uint32(a[n*4:])
118}
119
120func uint32SliceLen(a []byte) uint32 {
121	return uint32(len(a) / 4)
122}
123
124// parent returns index i of the parent enum
125func (d *symbolData) parent(i uint32) []byte {
126	delta := uint32SliceAt(d.symIndex, 0)
127	start := uint32SliceAt(d.symIndex, i) - delta
128	var end uint32
129	if i+1 == uint32SliceLen(d.symIndex) {
130		end = uint32(len(d.symContent))
131	} else {
132		end = uint32SliceAt(d.symIndex, i+1) - delta
133	}
134	return d.symContent[start:end]
135}
136
137// kind returns index i of the kind enum
138func (d *symbolData) kind(i uint32) []byte {
139	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
140}
141
142// data returns the symbol at index i
143func (d *symbolData) data(i uint32) *zoekt.Symbol {
144	size := uint32(4 * 4) // 4 uint32s
145	offset := i * size
146	if offset >= uint32(len(d.symMetaData)) {
147		return nil
148	}
149
150	metadata := d.symMetaData[offset : offset+size]
151	sym := &zoekt.Symbol{}
152	key := uint32SliceAt(metadata, 1)
153	sym.Kind = string(d.kind(key))
154	key = uint32SliceAt(metadata, 2)
155	sym.Parent = string(d.parent(key))
156	key = uint32SliceAt(metadata, 3)
157	sym.ParentKind = string(d.kind(key))
158	return sym
159}
160
161func (d *indexData) getChecksum(idx uint32) []byte {
162	start := crc64.Size * idx
163	return d.checksums[start : start+crc64.Size]
164}
165
166func (d *indexData) getLanguage(idx uint32) uint16 {
167	if d.metaData.IndexFeatureVersion < 12 {
168		// older zoekt files had 8-bit language entries
169		return uint16(d.languages[idx])
170	}
171	// newer zoekt files have 16-bit language entries
172	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
173}
174
175// calculates stats for files in the range [start, end).
176func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
177	if start >= end {
178		// An empty shard for an empty repository.
179		return zoekt.RepoStats{
180			Shards: 1,
181		}
182	}
183
184	bytesContent := d.boundaries[end] - d.boundaries[start]
185	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
186	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
187
188	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
189	// that we don't change anything, except we have
190	// []Repository. Alternatively, things we can divide up we do (like
191	// here). Right now I don't like that these numbers are not true, especially
192	// after aggregation. For now I will move forward with this until we can
193	// chat more.
194	return zoekt.RepoStats{
195		ContentBytes: int64(bytesContent) + int64(bytesFN),
196		Documents:    int(end - start),
197		// CR keegan for stefan: our shard count is going to go out of whack,
198		// since we will aggregate these. So we will report more shards than are
199		// present on disk. What should we do?
200		Shards: 1,
201
202		// Sourcegraph specific
203		NewLinesCount:              count,
204		DefaultBranchNewLinesCount: defaultCount,
205		OtherBranchesNewLinesCount: otherCount,
206	}
207}
208
209func (d *indexData) calculateStats() error {
210	d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
211	var start, end uint32
212
213	for repoID, md := range d.repoMetaData {
214		// determine the file range for repo i
215		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
216			end++
217		}
218		if start < end && d.repos[start] != uint16(repoID) {
219			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
220		}
221
222		d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
223			Repository:    md,
224			IndexMetadata: d.metaData,
225			Stats:         d.calculateStatsForFileRange(start, end),
226		})
227		start = end
228	}
229
230	// All repos in a compound shard share memoryUse. So we average out the
231	// memoryUse per shard in our reporting. This has the benefit that when you
232	// aggregate the IndexBytes you get back the actual memoryUse.
233	//
234	// TODO take into account tombstones for aggregation. Even better, adjust
235	// API to be shard centric not repo centric.
236	if len(d.repoListEntry) > 0 {
237		indexBytes := d.memoryUse()
238		indexBytesChunk := indexBytes / len(d.repoListEntry)
239		for i := range d.repoListEntry {
240			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
241			indexBytes -= indexBytesChunk
242		}
243		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
244	}
245
246	return nil
247}
248
249// calculateNewLinesStats computes some Sourcegraph specific statistics for files
250// in the range [start, end). These are not as efficient to calculate as the
251// normal statistics. We experimentally measured about a 10% slower shard load
252// time. However, we find these values very useful to track and computing them
253// outside of load time introduces a lot of complexity.
254func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
255	for i := start; i < end; i++ {
256		// branchMask is a bitmask of the branches for a document. Zoekt by
257		// convention represents the default branch as the lowest bit.
258		branchMask := d.fileBranchMasks[i]
259		isDefault := (branchMask & 1) == 1
260		others := uint64(bits.OnesCount64(branchMask >> 1))
261
262		// this is readNewlines but only reading the size of each section which
263		// corresponds to the number of newlines.
264		sec := simpleSection{
265			off: d.newlinesStart + d.newlinesIndex[i],
266			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
267		}
268		// We are only reading the first varint which is the size. So we don't
269		// need to read more than MaxVarintLen64 bytes.
270		if sec.sz > binary.MaxVarintLen64 {
271			sec.sz = binary.MaxVarintLen64
272		}
273		blob, err := d.readSectionBlob(sec)
274		if err != nil {
275			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
276			continue
277		}
278		sz, _ := binary.Uvarint(blob)
279
280		count += sz
281		if isDefault {
282			defaultCount += sz
283		}
284		otherCount += (others * sz)
285	}
286
287	return
288}
289
290func (d *indexData) String() string {
291	return fmt.Sprintf("shard(%s)", d.file.Name())
292}
293
294// calculates an approximate size of indexData in memory in bytes.
295func (d *indexData) memoryUse() int {
296	sz := 0
297	for _, a := range [][]uint32{
298		d.newlinesIndex, d.docSectionsIndex,
299		d.boundaries, d.fileNameIndex,
300		d.fileEndRunes, d.fileNameEndRunes,
301		d.fileEndSymbol, d.symbols.symKindIndex,
302		d.subRepos,
303	} {
304		sz += 4 * len(a)
305	}
306	sz += d.runeOffsets.sizeBytes()
307	sz += d.fileNameRuneOffsets.sizeBytes()
308	sz += len(d.languages)
309	sz += len(d.checksums)
310	sz += 2 * len(d.repos)
311	sz += 8 * len(d.runeDocSections)
312	sz += 8 * len(d.fileBranchMasks)
313	sz += d.contentNgrams.SizeBytes()
314	sz += d.fileNameNgrams.SizeBytes()
315	return sz
316}
317
318// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
319// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
320// overlapping trigrams to keep their intersection as small as possible.
321//
322// Invariant: first will always have a smaller index than last.
323func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
324	first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
325
326	// If the trigrams are overlapping, then try to shift one to reduce overlap.
327	// This is guaranteed to produce a smaller intersection.
328	if last.index-first.index < ngramSize {
329		newFirstIndex := max(last.index-ngramSize, 0)
330		if newFirstIndex != first.index {
331			first = ngramOffs[indexMap[newFirstIndex]]
332		}
333
334		newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
335		if newLastIndex != last.index {
336			last = ngramOffs[indexMap[newLastIndex]]
337		}
338	}
339	return
340}
341
342const maxUInt32 = 0xffffffff
343
344func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
345	// Find the two lowest frequency ngrams.
346	idx0, idx1 := 0, 0
347	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
348	for i, x := range frequencies {
349		if x <= min0 {
350			idx0, idx1 = i, idx0
351			min0, min1 = x, min0
352		} else if x <= min1 {
353			idx1 = i
354			min1 = x
355		}
356	}
357
358	first = ngramOffs[idx0]
359	last = ngramOffs[idx1]
360
361	// Ensure first appears before last as a helpful invariant.
362	if first.index > last.index {
363		last, first = first, last
364	}
365	return
366}
367
368func (data *indexData) ngrams(filename bool) btreeIndex {
369	if filename {
370		return data.fileNameNgrams
371	}
372	return data.contentNgrams
373}
374
375type ngramIterationResults struct {
376	matchIterator
377
378	caseSensitive bool
379	fileName      bool
380	substrBytes   []byte
381	substrLowered []byte
382}
383
384func (r *ngramIterationResults) String() string {
385	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
386}
387
388func (r *ngramIterationResults) candidates() []*candidateMatch {
389	cs := r.matchIterator.candidates()
390	for _, c := range cs {
391		c.caseSensitive = r.caseSensitive
392		c.fileName = r.fileName
393		c.substrBytes = r.substrBytes
394		c.substrLowered = r.substrLowered
395	}
396	return cs
397}
398
399func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
400	str := query.Pattern
401
402	// Find the 2 least common ngrams from the string.
403	ngramOffs := splitNGrams([]byte(str))
404
405	// protect against accidental searching of empty strings
406	if len(ngramOffs) == 0 {
407		return nil, errors.New("iterateNgrams needs non empty string")
408	}
409
410	// PERF: Sort to increase the chances adjacent checks are in the same btree
411	// bucket (which can cause disk IO).
412	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
413	frequencies := make([]uint32, 0, len(ngramOffs))
414	indexMap := make([]int, len(ngramOffs))
415	ngramLookups := 0
416	ngrams := d.ngrams(query.FileName)
417	for i, o := range ngramOffs {
418		var freq uint32
419		if query.CaseSensitive {
420			freq = ngrams.Get(o.ngram).sz
421			ngramLookups++
422		} else {
423			for _, v := range generateCaseNgrams(o.ngram) {
424				freq += ngrams.Get(v).sz
425				ngramLookups++
426			}
427		}
428
429		if freq == 0 {
430			return &ngramIterationResults{
431				matchIterator: &noMatchTree{
432					Why: "freq=0",
433					Stats: zoekt.Stats{
434						NgramLookups: ngramLookups,
435					},
436				},
437			}, nil
438		}
439
440		frequencies = append(frequencies, freq)
441		indexMap[o.index] = i
442	}
443
444	first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
445
446	iter := &ngramDocIterator{
447		leftPad:      uint32(first.index),
448		rightPad:     uint32(utf8.RuneCountInString(str) - first.index),
449		ngramLookups: ngramLookups,
450	}
451	if query.FileName {
452		iter.ends = d.fileNameEndRunes
453	} else {
454		iter.ends = d.fileEndRunes
455	}
456
457	if first != last {
458		runeDist := uint32(last.index - first.index)
459		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
460		if err != nil {
461			return nil, err
462		}
463
464		iter.iter = i
465	} else {
466		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
467		if err != nil {
468			return nil, err
469		}
470		iter.iter = hitIter
471	}
472
473	patBytes := []byte(query.Pattern)
474	lowerPatBytes := toLower(patBytes)
475
476	return &ngramIterationResults{
477		matchIterator: iter,
478		caseSensitive: query.CaseSensitive,
479		fileName:      query.FileName,
480		substrBytes:   patBytes,
481		substrLowered: lowerPatBytes,
482	}, nil
483}
484
485func (d *indexData) fileName(i uint32) []byte {
486	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
487}
488
489func (d *indexData) numDocs() uint32 {
490	return uint32(len(d.fileBranchMasks))
491}
492
493func (s *indexData) Close() {
494	s.file.Close()
495}
496
497const (
498	rawConfigYes = 1
499	rawConfigNo  = 2
500)
501
502// encodeRawConfig encodes a rawConfig map into a uint8 mask.
503func encodeRawConfig(rawConfig map[string]string) uint8 {
504	var encoded uint8
505	for i, f := range []string{"public", "fork", "archived"} {
506		var e uint8
507		v, ok := rawConfig[f]
508		if ok && v == "1" {
509			e |= rawConfigYes
510		} else {
511			e |= rawConfigNo
512		}
513		encoded = encoded | e<<(2*i)
514	}
515	return encoded
516}
Configure Feed

Configure Feed