indexdata.go at 72423e17625007e0576cd37f4026dad75d51dd7e · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexdata.go
at 72423e17625007e0576cd37f4026dad75d51dd7e 14 kB View raw
Keegan Carruthers-Smith all: use stdlib slices package (#735) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"slices"
 25	"unicode/utf8"
 26
 27	"github.com/sourcegraph/zoekt/query"
 28)
 29
 30// indexData holds the pattern-independent data that we have to have
 31// in memory to search. Most of the memory is taken up by the ngram =>
 32// offset index.
 33type indexData struct {
 34	symbols symbolData
 35
 36	file IndexFile
 37
 38	contentNgrams btreeIndex
 39
 40	newlinesStart uint32
 41	newlinesIndex []uint32
 42
 43	docSectionsStart uint32
 44	docSectionsIndex []uint32
 45
 46	runeDocSections []DocumentSection
 47
 48	// rune offset=>byte offset mapping, relative to the start of the content corpus
 49	runeOffsets runeOffsetMap
 50
 51	// offsets of file contents; includes end of last file
 52	boundariesStart uint32
 53	boundaries      []uint32
 54
 55	// rune offsets for the file content boundaries
 56	fileEndRunes []uint32
 57
 58	fileNameContent []byte
 59	fileNameIndex   []uint32
 60	fileNameNgrams  btreeIndex
 61
 62	// fileEndSymbol[i] is the index of the first symbol for document i.
 63	fileEndSymbol []uint32
 64
 65	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 66	fileNameRuneOffsets runeOffsetMap
 67
 68	// rune offsets for the file name boundaries
 69	fileNameEndRunes []uint32
 70
 71	fileBranchMasks []uint64
 72
 73	// mask (power of 2) => name
 74	branchNames []map[uint]string
 75
 76	// name => mask (power of 2)
 77	branchIDs []map[string]uint
 78
 79	metaData     IndexMetadata
 80	repoMetaData []Repository
 81
 82	subRepos     []uint32
 83	subRepoPaths [][]string
 84
 85	// Checksums for all the files, at 8-byte intervals
 86	checksums []byte
 87
 88	// languages for all the files.
 89	languages []byte
 90
 91	// inverse of LanguageMap in metaData
 92	languageMap map[uint16]string
 93
 94	repoListEntry []RepoListEntry
 95
 96	// repository indexes for all the files
 97	repos []uint16
 98
 99	// Experimental: docID => rank vec
100	ranks [][]float64
101
102	// rawConfigMasks contains the encoded RawConfig for each repository
103	rawConfigMasks []uint8
104}
105
106type symbolData struct {
107	// symContent stores Symbol.Sym and Symbol.Parent.
108	// TODO we don't need to store Symbol.Sym.
109	symContent []byte
110	symIndex   []byte
111	// symKindContent is an enum of sym.Kind and sym.ParentKind
112	symKindContent []byte
113	symKindIndex   []uint32
114	// symMetadata is [4]uint32 0 Kind Parent ParentKind
115	symMetaData []byte
116}
117
118func uint32SliceAt(a []byte, n uint32) uint32 {
119	return binary.BigEndian.Uint32(a[n*4:])
120}
121
122func uint32SliceLen(a []byte) uint32 {
123	return uint32(len(a) / 4)
124}
125
126// parent returns index i of the parent enum
127func (d *symbolData) parent(i uint32) []byte {
128	delta := uint32SliceAt(d.symIndex, 0)
129	start := uint32SliceAt(d.symIndex, i) - delta
130	var end uint32
131	if i+1 == uint32SliceLen(d.symIndex) {
132		end = uint32(len(d.symContent))
133	} else {
134		end = uint32SliceAt(d.symIndex, i+1) - delta
135	}
136	return d.symContent[start:end]
137}
138
139// kind returns index i of the kind enum
140func (d *symbolData) kind(i uint32) []byte {
141	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
142}
143
144// data returns the symbol at index i
145func (d *symbolData) data(i uint32) *Symbol {
146	size := uint32(4 * 4) // 4 uint32s
147	offset := i * size
148	if offset >= uint32(len(d.symMetaData)) {
149		return nil
150	}
151
152	metadata := d.symMetaData[offset : offset+size]
153	sym := &Symbol{}
154	key := uint32SliceAt(metadata, 1)
155	sym.Kind = string(d.kind(key))
156	key = uint32SliceAt(metadata, 2)
157	sym.Parent = string(d.parent(key))
158	key = uint32SliceAt(metadata, 3)
159	sym.ParentKind = string(d.kind(key))
160	return sym
161}
162
163func (d *indexData) getChecksum(idx uint32) []byte {
164	start := crc64.Size * idx
165	return d.checksums[start : start+crc64.Size]
166}
167
168func (d *indexData) getLanguage(idx uint32) uint16 {
169	if d.metaData.IndexFeatureVersion < 12 {
170		// older zoekt files had 8-bit language entries
171		return uint16(d.languages[idx])
172	}
173	// newer zoekt files have 16-bit language entries
174	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
175}
176
177// calculates stats for files in the range [start, end).
178func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
179	if start >= end {
180		// An empty shard for an empty repository.
181		return RepoStats{
182			Shards: 1,
183		}
184	}
185
186	bytesContent := d.boundaries[end] - d.boundaries[start]
187	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
188	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
189
190	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
191	// that we don't change anything, except we have
192	// []Repository. Alternatively, things we can divide up we do (like
193	// here). Right now I don't like that these numbers are not true, especially
194	// after aggregation. For now I will move forward with this until we can
195	// chat more.
196	return RepoStats{
197		ContentBytes: int64(bytesContent) + int64(bytesFN),
198		Documents:    int(end - start),
199		// CR keegan for stefan: our shard count is going to go out of whack,
200		// since we will aggregate these. So we will report more shards than are
201		// present on disk. What should we do?
202		Shards: 1,
203
204		// Sourcegraph specific
205		NewLinesCount:              count,
206		DefaultBranchNewLinesCount: defaultCount,
207		OtherBranchesNewLinesCount: otherCount,
208	}
209}
210
211func (d *indexData) calculateStats() error {
212	d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
213	var start, end uint32
214
215	for repoID, md := range d.repoMetaData {
216		// determine the file range for repo i
217		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
218			end++
219		}
220		if start < end && d.repos[start] != uint16(repoID) {
221			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
222		}
223
224		d.repoListEntry = append(d.repoListEntry, RepoListEntry{
225			Repository:    md,
226			IndexMetadata: d.metaData,
227			Stats:         d.calculateStatsForFileRange(start, end),
228		})
229		start = end
230	}
231
232	// All repos in a compound shard share memoryUse. So we average out the
233	// memoryUse per shard in our reporting. This has the benefit that when you
234	// aggregate the IndexBytes you get back the actual memoryUse.
235	//
236	// TODO take into account tombstones for aggregation. Even better, adjust
237	// API to be shard centric not repo centric.
238	if len(d.repoListEntry) > 0 {
239		indexBytes := d.memoryUse()
240		indexBytesChunk := indexBytes / len(d.repoListEntry)
241		for i := range d.repoListEntry {
242			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
243			indexBytes -= indexBytesChunk
244		}
245		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
246	}
247
248	return nil
249}
250
251// calculateNewLinesStats computes some Sourcegraph specific statistics for files
252// in the range [start, end). These are not as efficient to calculate as the
253// normal statistics. We experimentally measured about a 10% slower shard load
254// time. However, we find these values very useful to track and computing them
255// outside of load time introduces a lot of complexity.
256func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
257	for i := start; i < end; i++ {
258		// branchMask is a bitmask of the branches for a document. Zoekt by
259		// convention represents the default branch as the lowest bit.
260		branchMask := d.fileBranchMasks[i]
261		isDefault := (branchMask & 1) == 1
262		others := uint64(bits.OnesCount64(branchMask >> 1))
263
264		// this is readNewlines but only reading the size of each section which
265		// corresponds to the number of newlines.
266		sec := simpleSection{
267			off: d.newlinesStart + d.newlinesIndex[i],
268			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
269		}
270		// We are only reading the first varint which is the size. So we don't
271		// need to read more than MaxVarintLen64 bytes.
272		if sec.sz > binary.MaxVarintLen64 {
273			sec.sz = binary.MaxVarintLen64
274		}
275		blob, err := d.readSectionBlob(sec)
276		if err != nil {
277			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
278			continue
279		}
280		sz, _ := binary.Uvarint(blob)
281
282		count += sz
283		if isDefault {
284			defaultCount += sz
285		}
286		otherCount += (others * sz)
287	}
288
289	return
290}
291
292func (d *indexData) String() string {
293	return fmt.Sprintf("shard(%s)", d.file.Name())
294}
295
296// calculates an approximate size of indexData in memory in bytes.
297func (d *indexData) memoryUse() int {
298	sz := 0
299	for _, a := range [][]uint32{
300		d.newlinesIndex, d.docSectionsIndex,
301		d.boundaries, d.fileNameIndex,
302		d.fileEndRunes, d.fileNameEndRunes,
303		d.fileEndSymbol, d.symbols.symKindIndex,
304		d.subRepos,
305	} {
306		sz += 4 * len(a)
307	}
308	sz += d.runeOffsets.sizeBytes()
309	sz += d.fileNameRuneOffsets.sizeBytes()
310	sz += len(d.languages)
311	sz += len(d.checksums)
312	sz += 2 * len(d.repos)
313	if len(d.ranks) > 0 {
314		sz += 8 * len(d.ranks) * len(d.ranks[0])
315	}
316	sz += 8 * len(d.runeDocSections)
317	sz += 8 * len(d.fileBranchMasks)
318	sz += d.contentNgrams.SizeBytes()
319	sz += d.fileNameNgrams.SizeBytes()
320	return sz
321}
322
323const maxUInt32 = 0xffffffff
324
325func min2Index(xs []uint32) (idx0, idx1 int) {
326	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
327	for i, x := range xs {
328		if x <= min0 {
329			idx0, idx1 = i, idx0
330			min0, min1 = x, min0
331		} else if x <= min1 {
332			idx1 = i
333			min1 = x
334		}
335	}
336	return
337}
338
339// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
340// the distance iterator. If they have the same frequency, we maximise the
341// distance between them. first will always have a smaller index than last.
342func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
343	firstI, lastI := min2Index(frequencies)
344	// If the frequencies are equal lets maximise distance in the query
345	// string. This optimization normally triggers for long repeated trigrams
346	// in a string, eg a query like "AAAAA..."
347	if frequencies[firstI] == frequencies[lastI] {
348		for i, freq := range frequencies {
349			if freq != frequencies[firstI] {
350				continue
351			}
352			if ngramOffs[i].index < ngramOffs[firstI].index {
353				firstI = i
354			}
355			if ngramOffs[i].index > ngramOffs[lastI].index {
356				lastI = i
357			}
358		}
359	}
360	first = ngramOffs[firstI]
361	last = ngramOffs[lastI]
362	// Ensure first appears before last to make distance logic below clean.
363	if first.index > last.index {
364		last, first = first, last
365	}
366	return first, last
367}
368
369func (data *indexData) ngrams(filename bool) btreeIndex {
370	if filename {
371		return data.fileNameNgrams
372	}
373	return data.contentNgrams
374}
375
376type ngramIterationResults struct {
377	matchIterator
378
379	caseSensitive bool
380	fileName      bool
381	substrBytes   []byte
382	substrLowered []byte
383}
384
385func (r *ngramIterationResults) String() string {
386	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
387}
388
389func (r *ngramIterationResults) candidates() []*candidateMatch {
390	cs := r.matchIterator.candidates()
391	for _, c := range cs {
392		c.caseSensitive = r.caseSensitive
393		c.fileName = r.fileName
394		c.substrBytes = r.substrBytes
395		c.substrLowered = r.substrLowered
396	}
397	return cs
398}
399
400func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
401	str := query.Pattern
402
403	// Find the 2 least common ngrams from the string.
404	ngramOffs := splitNGrams([]byte(query.Pattern))
405
406	// protect against accidental searching of empty strings
407	if len(ngramOffs) == 0 {
408		return nil, errors.New("iterateNgrams needs non empty string")
409	}
410
411	// PERF: Sort to increase the chances adjacent checks are in the same btree
412	// bucket (which can cause disk IO).
413	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
414	frequencies := make([]uint32, 0, len(ngramOffs))
415	ngramLookups := 0
416	ngrams := d.ngrams(query.FileName)
417	for _, o := range ngramOffs {
418		var freq uint32
419		if query.CaseSensitive {
420			freq = ngrams.Get(o.ngram).sz
421			ngramLookups++
422		} else {
423			for _, v := range generateCaseNgrams(o.ngram) {
424				freq += ngrams.Get(v).sz
425				ngramLookups++
426			}
427		}
428
429		if freq == 0 {
430			return &ngramIterationResults{
431				matchIterator: &noMatchTree{
432					Why: "freq=0",
433					Stats: Stats{
434						NgramLookups: ngramLookups,
435					},
436				},
437			}, nil
438		}
439
440		frequencies = append(frequencies, freq)
441	}
442
443	// first and last are now the smallest trigram posting lists to iterate
444	// through.
445	first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
446
447	iter := &ngramDocIterator{
448		leftPad:      first.index,
449		rightPad:     uint32(utf8.RuneCountInString(str)) - first.index,
450		ngramLookups: ngramLookups,
451	}
452	if query.FileName {
453		iter.ends = d.fileNameEndRunes
454	} else {
455		iter.ends = d.fileEndRunes
456	}
457
458	if first != last {
459		runeDist := last.index - first.index
460		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
461		if err != nil {
462			return nil, err
463		}
464
465		iter.iter = i
466	} else {
467		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
468		if err != nil {
469			return nil, err
470		}
471		iter.iter = hitIter
472	}
473
474	patBytes := []byte(query.Pattern)
475	lowerPatBytes := toLower(patBytes)
476
477	return &ngramIterationResults{
478		matchIterator: iter,
479		caseSensitive: query.CaseSensitive,
480		fileName:      query.FileName,
481		substrBytes:   patBytes,
482		substrLowered: lowerPatBytes,
483	}, nil
484}
485
486func (d *indexData) fileName(i uint32) []byte {
487	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
488}
489
490func (d *indexData) numDocs() uint32 {
491	return uint32(len(d.fileBranchMasks))
492}
493
494func (s *indexData) Close() {
495	s.file.Close()
496}
497
498const (
499	rawConfigYes = 1
500	rawConfigNo  = 2
501)
502
503// encodeRawConfig encodes a rawConfig map into a uint8 mask.
504func encodeRawConfig(rawConfig map[string]string) uint8 {
505	var encoded uint8
506	for i, f := range []string{"public", "fork", "archived"} {
507		var e uint8
508		v, ok := rawConfig[f]
509		if ok && v == "1" {
510			e |= rawConfigYes
511		} else {
512			e |= rawConfigNo
513		}
514		encoded = encoded | e<<(2*i)
515	}
516	return encoded
517}
Configure Feed

Configure Feed