indexdata.go at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexdata.go
at ebb3ca24249dfa7d3be2bb48899b86e68bed2f6c 16 kB View raw
Keegan Carruthers-Smith index: experiment to limit ngram lookups for large snippets (#795) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"os"
 25	"slices"
 26	"strconv"
 27	"unicode/utf8"
 28
 29	"github.com/sourcegraph/zoekt/query"
 30)
 31
 32// indexData holds the pattern-independent data that we have to have
 33// in memory to search. Most of the memory is taken up by the ngram =>
 34// offset index.
 35type indexData struct {
 36	symbols symbolData
 37
 38	file IndexFile
 39
 40	contentNgrams btreeIndex
 41
 42	newlinesStart uint32
 43	newlinesIndex []uint32
 44
 45	docSectionsStart uint32
 46	docSectionsIndex []uint32
 47
 48	runeDocSections []DocumentSection
 49
 50	// rune offset=>byte offset mapping, relative to the start of the content corpus
 51	runeOffsets runeOffsetMap
 52
 53	// offsets of file contents; includes end of last file
 54	boundariesStart uint32
 55	boundaries      []uint32
 56
 57	// rune offsets for the file content boundaries
 58	fileEndRunes []uint32
 59
 60	fileNameContent []byte
 61	fileNameIndex   []uint32
 62	fileNameNgrams  btreeIndex
 63
 64	// fileEndSymbol[i] is the index of the first symbol for document i.
 65	fileEndSymbol []uint32
 66
 67	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 68	fileNameRuneOffsets runeOffsetMap
 69
 70	// rune offsets for the file name boundaries
 71	fileNameEndRunes []uint32
 72
 73	fileBranchMasks []uint64
 74
 75	// mask (power of 2) => name
 76	branchNames []map[uint]string
 77
 78	// name => mask (power of 2)
 79	branchIDs []map[string]uint
 80
 81	metaData     IndexMetadata
 82	repoMetaData []Repository
 83
 84	subRepos     []uint32
 85	subRepoPaths [][]string
 86
 87	// Checksums for all the files, at 8-byte intervals
 88	checksums []byte
 89
 90	// languages for all the files.
 91	languages []byte
 92
 93	// inverse of LanguageMap in metaData
 94	languageMap map[uint16]string
 95
 96	repoListEntry []RepoListEntry
 97
 98	// repository indexes for all the files
 99	repos []uint16
100
101	// Experimental: docID => rank vec
102	ranks [][]float64
103
104	// rawConfigMasks contains the encoded RawConfig for each repository
105	rawConfigMasks []uint8
106}
107
108type symbolData struct {
109	// symContent stores Symbol.Sym and Symbol.Parent.
110	// TODO we don't need to store Symbol.Sym.
111	symContent []byte
112	symIndex   []byte
113	// symKindContent is an enum of sym.Kind and sym.ParentKind
114	symKindContent []byte
115	symKindIndex   []uint32
116	// symMetadata is [4]uint32 0 Kind Parent ParentKind
117	symMetaData []byte
118}
119
120func uint32SliceAt(a []byte, n uint32) uint32 {
121	return binary.BigEndian.Uint32(a[n*4:])
122}
123
124func uint32SliceLen(a []byte) uint32 {
125	return uint32(len(a) / 4)
126}
127
128// parent returns index i of the parent enum
129func (d *symbolData) parent(i uint32) []byte {
130	delta := uint32SliceAt(d.symIndex, 0)
131	start := uint32SliceAt(d.symIndex, i) - delta
132	var end uint32
133	if i+1 == uint32SliceLen(d.symIndex) {
134		end = uint32(len(d.symContent))
135	} else {
136		end = uint32SliceAt(d.symIndex, i+1) - delta
137	}
138	return d.symContent[start:end]
139}
140
141// kind returns index i of the kind enum
142func (d *symbolData) kind(i uint32) []byte {
143	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
144}
145
146// data returns the symbol at index i
147func (d *symbolData) data(i uint32) *Symbol {
148	size := uint32(4 * 4) // 4 uint32s
149	offset := i * size
150	if offset >= uint32(len(d.symMetaData)) {
151		return nil
152	}
153
154	metadata := d.symMetaData[offset : offset+size]
155	sym := &Symbol{}
156	key := uint32SliceAt(metadata, 1)
157	sym.Kind = string(d.kind(key))
158	key = uint32SliceAt(metadata, 2)
159	sym.Parent = string(d.parent(key))
160	key = uint32SliceAt(metadata, 3)
161	sym.ParentKind = string(d.kind(key))
162	return sym
163}
164
165func (d *indexData) getChecksum(idx uint32) []byte {
166	start := crc64.Size * idx
167	return d.checksums[start : start+crc64.Size]
168}
169
170func (d *indexData) getLanguage(idx uint32) uint16 {
171	if d.metaData.IndexFeatureVersion < 12 {
172		// older zoekt files had 8-bit language entries
173		return uint16(d.languages[idx])
174	}
175	// newer zoekt files have 16-bit language entries
176	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
177}
178
179// calculates stats for files in the range [start, end).
180func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
181	if start >= end {
182		// An empty shard for an empty repository.
183		return RepoStats{
184			Shards: 1,
185		}
186	}
187
188	bytesContent := d.boundaries[end] - d.boundaries[start]
189	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
190	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
191
192	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
193	// that we don't change anything, except we have
194	// []Repository. Alternatively, things we can divide up we do (like
195	// here). Right now I don't like that these numbers are not true, especially
196	// after aggregation. For now I will move forward with this until we can
197	// chat more.
198	return RepoStats{
199		ContentBytes: int64(bytesContent) + int64(bytesFN),
200		Documents:    int(end - start),
201		// CR keegan for stefan: our shard count is going to go out of whack,
202		// since we will aggregate these. So we will report more shards than are
203		// present on disk. What should we do?
204		Shards: 1,
205
206		// Sourcegraph specific
207		NewLinesCount:              count,
208		DefaultBranchNewLinesCount: defaultCount,
209		OtherBranchesNewLinesCount: otherCount,
210	}
211}
212
213func (d *indexData) calculateStats() error {
214	d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
215	var start, end uint32
216
217	for repoID, md := range d.repoMetaData {
218		// determine the file range for repo i
219		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
220			end++
221		}
222		if start < end && d.repos[start] != uint16(repoID) {
223			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
224		}
225
226		d.repoListEntry = append(d.repoListEntry, RepoListEntry{
227			Repository:    md,
228			IndexMetadata: d.metaData,
229			Stats:         d.calculateStatsForFileRange(start, end),
230		})
231		start = end
232	}
233
234	// All repos in a compound shard share memoryUse. So we average out the
235	// memoryUse per shard in our reporting. This has the benefit that when you
236	// aggregate the IndexBytes you get back the actual memoryUse.
237	//
238	// TODO take into account tombstones for aggregation. Even better, adjust
239	// API to be shard centric not repo centric.
240	if len(d.repoListEntry) > 0 {
241		indexBytes := d.memoryUse()
242		indexBytesChunk := indexBytes / len(d.repoListEntry)
243		for i := range d.repoListEntry {
244			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
245			indexBytes -= indexBytesChunk
246		}
247		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
248	}
249
250	return nil
251}
252
253// calculateNewLinesStats computes some Sourcegraph specific statistics for files
254// in the range [start, end). These are not as efficient to calculate as the
255// normal statistics. We experimentally measured about a 10% slower shard load
256// time. However, we find these values very useful to track and computing them
257// outside of load time introduces a lot of complexity.
258func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
259	for i := start; i < end; i++ {
260		// branchMask is a bitmask of the branches for a document. Zoekt by
261		// convention represents the default branch as the lowest bit.
262		branchMask := d.fileBranchMasks[i]
263		isDefault := (branchMask & 1) == 1
264		others := uint64(bits.OnesCount64(branchMask >> 1))
265
266		// this is readNewlines but only reading the size of each section which
267		// corresponds to the number of newlines.
268		sec := simpleSection{
269			off: d.newlinesStart + d.newlinesIndex[i],
270			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
271		}
272		// We are only reading the first varint which is the size. So we don't
273		// need to read more than MaxVarintLen64 bytes.
274		if sec.sz > binary.MaxVarintLen64 {
275			sec.sz = binary.MaxVarintLen64
276		}
277		blob, err := d.readSectionBlob(sec)
278		if err != nil {
279			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
280			continue
281		}
282		sz, _ := binary.Uvarint(blob)
283
284		count += sz
285		if isDefault {
286			defaultCount += sz
287		}
288		otherCount += (others * sz)
289	}
290
291	return
292}
293
294func (d *indexData) String() string {
295	return fmt.Sprintf("shard(%s)", d.file.Name())
296}
297
298// calculates an approximate size of indexData in memory in bytes.
299func (d *indexData) memoryUse() int {
300	sz := 0
301	for _, a := range [][]uint32{
302		d.newlinesIndex, d.docSectionsIndex,
303		d.boundaries, d.fileNameIndex,
304		d.fileEndRunes, d.fileNameEndRunes,
305		d.fileEndSymbol, d.symbols.symKindIndex,
306		d.subRepos,
307	} {
308		sz += 4 * len(a)
309	}
310	sz += d.runeOffsets.sizeBytes()
311	sz += d.fileNameRuneOffsets.sizeBytes()
312	sz += len(d.languages)
313	sz += len(d.checksums)
314	sz += 2 * len(d.repos)
315	if len(d.ranks) > 0 {
316		sz += 8 * len(d.ranks) * len(d.ranks[0])
317	}
318	sz += 8 * len(d.runeDocSections)
319	sz += 8 * len(d.fileBranchMasks)
320	sz += d.contentNgrams.SizeBytes()
321	sz += d.fileNameNgrams.SizeBytes()
322	return sz
323}
324
325// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
326// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
327// overlapping trigrams to keep their intersection as small as possible.
328//
329// Invariant: first will always have a smaller index than last.
330func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
331	first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
332
333	// If the trigrams are overlapping, then try to shift one to reduce overlap.
334	// This is guaranteed to produce a smaller intersection.
335	if last.index-first.index < ngramSize {
336		newFirstIndex := max(last.index-ngramSize, 0)
337		if newFirstIndex != first.index {
338			first = ngramOffs[indexMap[newFirstIndex]]
339		}
340
341		newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
342		if newLastIndex != last.index {
343			last = ngramOffs[indexMap[newLastIndex]]
344		}
345	}
346	return
347}
348
349const maxUInt32 = 0xffffffff
350
351func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
352	// Find the two lowest frequency ngrams.
353	idx0, idx1 := 0, 0
354	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
355	for i, x := range frequencies {
356		if x <= min0 {
357			idx0, idx1 = i, idx0
358			min0, min1 = x, min0
359		} else if x <= min1 {
360			idx1 = i
361			min1 = x
362		}
363	}
364
365	first = ngramOffs[idx0]
366	last = ngramOffs[idx1]
367
368	// Ensure first appears before last as a helpful invariant.
369	if first.index > last.index {
370		last, first = first, last
371	}
372	return
373}
374
375func (data *indexData) ngrams(filename bool) btreeIndex {
376	if filename {
377		return data.fileNameNgrams
378	}
379	return data.contentNgrams
380}
381
382type ngramIterationResults struct {
383	matchIterator
384
385	caseSensitive bool
386	fileName      bool
387	substrBytes   []byte
388	substrLowered []byte
389}
390
391func (r *ngramIterationResults) String() string {
392	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
393}
394
395func (r *ngramIterationResults) candidates() []*candidateMatch {
396	cs := r.matchIterator.candidates()
397	for _, c := range cs {
398		c.caseSensitive = r.caseSensitive
399		c.fileName = r.fileName
400		c.substrBytes = r.substrBytes
401		c.substrLowered = r.substrLowered
402	}
403	return cs
404}
405
406// experimentIterateNgramLookupLimit when non-zero will only lookup this many
407// ngrams from a query string. Note: that if case-insensitive, this only
408// limits the input. So we will still lookup the case folding.
409//
410// This experiment is targetting looking up large snippets. If it is
411// successful, we will likely hardcode the value we use in production.
412//
413// Future note: if we find cases where this works badly, we can consider only
414// searching a random subset of the query string to avoid bad strings.
415var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
416
417func getEnvInt(k string) int {
418	v, _ := strconv.Atoi(os.Getenv(k))
419	if v != 0 {
420		log.Printf("%s = %d\n", k, v)
421	}
422	return v
423}
424
425func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
426	str := query.Pattern
427
428	// Find the 2 least common ngrams from the string.
429	var ngramOffs []runeNgramOff
430	if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
431		// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
432		// length is asked later on for other optimizations.
433		ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
434	} else {
435		ngramOffs = splitNGrams([]byte(str))
436	}
437
438	// protect against accidental searching of empty strings
439	if len(ngramOffs) == 0 {
440		return nil, errors.New("iterateNgrams needs non empty string")
441	}
442
443	// PERF: Sort to increase the chances adjacent checks are in the same btree
444	// bucket (which can cause disk IO).
445	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
446	frequencies := make([]uint32, 0, len(ngramOffs))
447	indexMap := make([]int, len(ngramOffs))
448	ngramLookups := 0
449	ngrams := d.ngrams(query.FileName)
450	for i, o := range ngramOffs {
451		var freq uint32
452		if query.CaseSensitive {
453			freq = ngrams.Get(o.ngram).sz
454			ngramLookups++
455		} else {
456			for _, v := range generateCaseNgrams(o.ngram) {
457				freq += ngrams.Get(v).sz
458				ngramLookups++
459			}
460		}
461
462		if freq == 0 {
463			return &ngramIterationResults{
464				matchIterator: &noMatchTree{
465					Why: "freq=0",
466					Stats: Stats{
467						NgramLookups: ngramLookups,
468					},
469				},
470			}, nil
471		}
472
473		frequencies = append(frequencies, freq)
474		indexMap[o.index] = i
475	}
476
477	first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
478
479	iter := &ngramDocIterator{
480		leftPad:      uint32(first.index),
481		rightPad:     uint32(utf8.RuneCountInString(str) - first.index),
482		ngramLookups: ngramLookups,
483	}
484	if query.FileName {
485		iter.ends = d.fileNameEndRunes
486	} else {
487		iter.ends = d.fileEndRunes
488	}
489
490	if first != last {
491		runeDist := uint32(last.index - first.index)
492		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
493		if err != nil {
494			return nil, err
495		}
496
497		iter.iter = i
498	} else {
499		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
500		if err != nil {
501			return nil, err
502		}
503		iter.iter = hitIter
504	}
505
506	patBytes := []byte(query.Pattern)
507	lowerPatBytes := toLower(patBytes)
508
509	return &ngramIterationResults{
510		matchIterator: iter,
511		caseSensitive: query.CaseSensitive,
512		fileName:      query.FileName,
513		substrBytes:   patBytes,
514		substrLowered: lowerPatBytes,
515	}, nil
516}
517
518func (d *indexData) fileName(i uint32) []byte {
519	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
520}
521
522func (d *indexData) numDocs() uint32 {
523	return uint32(len(d.fileBranchMasks))
524}
525
526func (s *indexData) Close() {
527	s.file.Close()
528}
529
530const (
531	rawConfigYes = 1
532	rawConfigNo  = 2
533)
534
535// encodeRawConfig encodes a rawConfig map into a uint8 mask.
536func encodeRawConfig(rawConfig map[string]string) uint8 {
537	var encoded uint8
538	for i, f := range []string{"public", "fork", "archived"} {
539		var e uint8
540		v, ok := rawConfig[f]
541		if ok && v == "1" {
542			e |= rawConfigYes
543		} else {
544			e |= rawConfigNo
545		}
546		encoded = encoded | e<<(2*i)
547	}
548	return encoded
549}
Configure Feed

Configure Feed