indexdata.go at 8e309eb624d2cc3ba264d67f3ae8d694ad68b00c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexdata.go
at 8e309eb624d2cc3ba264d67f3ae8d694ad68b00c 13 kB View raw
Stefan Hengl zoekt: use b-tree for filenames (#530) 3y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"fmt"
 20	"hash/crc64"
 21	"log"
 22	"math/bits"
 23	"unicode/utf8"
 24
 25	"github.com/sourcegraph/zoekt/query"
 26)
 27
 28// indexData holds the pattern-independent data that we have to have
 29// in memory to search. Most of the memory is taken up by the ngram =>
 30// offset index.
 31type indexData struct {
 32	symbols symbolData
 33
 34	file IndexFile
 35
 36	ngrams ngramIndex
 37
 38	newlinesStart uint32
 39	newlinesIndex []uint32
 40
 41	docSectionsStart uint32
 42	docSectionsIndex []uint32
 43
 44	runeDocSections    []DocumentSection
 45	runeDocSectionsRaw []byte
 46
 47	// rune offset=>byte offset mapping, relative to the start of the content corpus
 48	runeOffsets runeOffsetMap
 49
 50	// offsets of file contents; includes end of last file
 51	boundariesStart uint32
 52	boundaries      []uint32
 53
 54	// rune offsets for the file content boundaries
 55	fileEndRunes []uint32
 56
 57	fileNameContent []byte
 58	fileNameIndex   []uint32
 59	fileNameNgrams  fileNameNgrams
 60
 61	// fileEndSymbol[i] is the index of the first symbol for document i.
 62	fileEndSymbol []uint32
 63
 64	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 65	fileNameRuneOffsets runeOffsetMap
 66
 67	// rune offsets for the file name boundaries
 68	fileNameEndRunes []uint32
 69
 70	fileBranchMasks []uint64
 71
 72	// mask (power of 2) => name
 73	branchNames []map[uint]string
 74
 75	// name => mask (power of 2)
 76	branchIDs []map[string]uint
 77
 78	metaData     IndexMetadata
 79	repoMetaData []Repository
 80
 81	subRepos     []uint32
 82	subRepoPaths [][]string
 83
 84	// Checksums for all the files, at 8-byte intervals
 85	checksums []byte
 86
 87	// languages for all the files.
 88	languages []byte
 89
 90	// inverse of LanguageMap in metaData
 91	languageMap map[uint16]string
 92
 93	repoListEntry []RepoListEntry
 94
 95	// repository indexes for all the files
 96	repos []uint16
 97
 98	// Experimental: docID => rank vec
 99	ranks [][]float64
100
101	// rawConfigMasks contains the encoded RawConfig for each repository
102	rawConfigMasks []uint8
103}
104
105type symbolData struct {
106	// symContent stores Symbol.Sym and Symbol.Parent.
107	// TODO we don't need to store Symbol.Sym.
108	symContent []byte
109	symIndex   []byte
110	// symKindContent is an enum of sym.Kind and sym.ParentKind
111	symKindContent []byte
112	symKindIndex   []uint32
113	// symMetadata is [4]uint32 0 Kind Parent ParentKind
114	symMetaData []byte
115}
116
117func uint32SliceAt(a []byte, n uint32) uint32 {
118	return binary.BigEndian.Uint32(a[n*4:])
119}
120
121func uint32SliceLen(a []byte) uint32 {
122	return uint32(len(a) / 4)
123}
124
125// parent returns index i of the parent enum
126func (d *symbolData) parent(i uint32) []byte {
127	delta := uint32SliceAt(d.symIndex, 0)
128	start := uint32SliceAt(d.symIndex, i) - delta
129	var end uint32
130	if i+1 == uint32SliceLen(d.symIndex) {
131		end = uint32(len(d.symContent))
132	} else {
133		end = uint32SliceAt(d.symIndex, i+1) - delta
134	}
135	return d.symContent[start:end]
136}
137
138// kind returns index i of the kind enum
139func (d *symbolData) kind(i uint32) []byte {
140	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
141}
142
143// data returns the symbol at index i
144func (d *symbolData) data(i uint32) *Symbol {
145	size := uint32(4 * 4) // 4 uint32s
146	offset := i * size
147	if offset >= uint32(len(d.symMetaData)) {
148		return nil
149	}
150
151	metadata := d.symMetaData[offset : offset+size]
152	sym := &Symbol{}
153	key := uint32SliceAt(metadata, 1)
154	sym.Kind = string(d.kind(key))
155	key = uint32SliceAt(metadata, 2)
156	sym.Parent = string(d.parent(key))
157	key = uint32SliceAt(metadata, 3)
158	sym.ParentKind = string(d.kind(key))
159	return sym
160}
161
162func (d *indexData) getChecksum(idx uint32) []byte {
163	start := crc64.Size * idx
164	return d.checksums[start : start+crc64.Size]
165}
166
167func (d *indexData) getLanguage(idx uint32) uint16 {
168	if d.metaData.IndexFeatureVersion < 12 {
169		// older zoekt files had 8-bit language entries
170		return uint16(d.languages[idx])
171	}
172	// newer zoekt files have 16-bit language entries
173	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
174}
175
176// calculates stats for files in the range [start, end).
177func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
178	if start >= end {
179		// An empty shard for an empty repository.
180		return RepoStats{
181			Shards: 1,
182		}
183	}
184
185	bytesContent := d.boundaries[end] - d.boundaries[start]
186	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
187	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
188
189	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
190	// that we don't change anything, except we have
191	// []Repository. Alternatively, things we can divide up we do (like
192	// here). Right now I don't like that these numbers are not true, especially
193	// after aggregation. For now I will move forward with this until we can
194	// chat more.
195	return RepoStats{
196		ContentBytes: int64(bytesContent) + int64(bytesFN),
197		Documents:    int(end - start),
198		// CR keegan for stefan: our shard count is going to go out of whack,
199		// since we will aggregate these. So we will report more shards than are
200		// present on disk. What should we do?
201		Shards: 1,
202
203		// Sourcegraph specific
204		NewLinesCount:              count,
205		DefaultBranchNewLinesCount: defaultCount,
206		OtherBranchesNewLinesCount: otherCount,
207	}
208}
209
210func (d *indexData) calculateStats() error {
211	d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
212	var start, end uint32
213
214	for repoID, md := range d.repoMetaData {
215		// determine the file range for repo i
216		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
217			end++
218		}
219		if start < end && d.repos[start] != uint16(repoID) {
220			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
221		}
222
223		d.repoListEntry = append(d.repoListEntry, RepoListEntry{
224			Repository:    md,
225			IndexMetadata: d.metaData,
226			Stats:         d.calculateStatsForFileRange(start, end),
227		})
228		start = end
229	}
230
231	// All repos in a compound shard share memoryUse. So we average out the
232	// memoryUse per shard in our reporting. This has the benefit that when you
233	// aggregate the IndexBytes you get back the actual memoryUse.
234	//
235	// TODO take into account tombstones for aggregation. Even better, adjust
236	// API to be shard centric not repo centric.
237	if len(d.repoListEntry) > 0 {
238		indexBytes := d.memoryUse()
239		indexBytesChunk := indexBytes / len(d.repoListEntry)
240		for i := range d.repoListEntry {
241			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
242			indexBytes -= indexBytesChunk
243		}
244		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
245	}
246
247	return nil
248}
249
250// calculateNewLinesStats computes some Sourcegraph specific statistics for files
251// in the range [start, end). These are not as efficient to calculate as the
252// normal statistics. We experimentally measured about a 10% slower shard load
253// time. However, we find these values very useful to track and computing them
254// outside of load time introduces a lot of complexity.
255func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
256	for i := start; i < end; i++ {
257		// branchMask is a bitmask of the branches for a document. Zoekt by
258		// convention represents the default branch as the lowest bit.
259		branchMask := d.fileBranchMasks[i]
260		isDefault := (branchMask & 1) == 1
261		others := uint64(bits.OnesCount64(branchMask >> 1))
262
263		// this is readNewlines but only reading the size of each section which
264		// corresponds to the number of newlines.
265		sec := simpleSection{
266			off: d.newlinesStart + d.newlinesIndex[i],
267			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
268		}
269		// We are only reading the first varint which is the size. So we don't
270		// need to read more than MaxVarintLen64 bytes.
271		if sec.sz > binary.MaxVarintLen64 {
272			sec.sz = binary.MaxVarintLen64
273		}
274		blob, err := d.readSectionBlob(sec)
275		if err != nil {
276			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
277			continue
278		}
279		sz, _ := binary.Uvarint(blob)
280
281		count += sz
282		if isDefault {
283			defaultCount += sz
284		}
285		otherCount += (others * sz)
286	}
287
288	return
289}
290
291func (d *indexData) String() string {
292	return fmt.Sprintf("shard(%s)", d.file.Name())
293}
294
295// calculates an approximate size of indexData in memory in bytes.
296func (d *indexData) memoryUse() int {
297	sz := 0
298	for _, a := range [][]uint32{
299		d.newlinesIndex, d.docSectionsIndex,
300		d.boundaries, d.fileNameIndex,
301		d.fileEndRunes, d.fileNameEndRunes,
302		d.fileEndSymbol, d.symbols.symKindIndex,
303		d.subRepos,
304	} {
305		sz += 4 * len(a)
306	}
307	sz += d.runeOffsets.sizeBytes()
308	sz += d.fileNameRuneOffsets.sizeBytes()
309	sz += len(d.languages)
310	sz += len(d.checksums)
311	sz += 2 * len(d.repos)
312	if len(d.ranks) > 0 {
313		sz += 8 * len(d.ranks) * len(d.ranks[0])
314	}
315	sz += 8 * len(d.runeDocSections)
316	sz += 8 * len(d.fileBranchMasks)
317	if d.ngrams != nil {
318		sz += d.ngrams.SizeBytes()
319	}
320	sz += d.fileNameNgrams.SizeBytes()
321	return sz
322}
323
324const maxUInt32 = 0xffffffff
325
326func firstMinarg(xs []uint32) uint32 {
327	m := uint32(maxUInt32)
328	j := len(xs)
329	for i, x := range xs {
330		if x < m {
331			m = x
332			j = i
333		}
334	}
335	return uint32(j)
336}
337
338func lastMinarg(xs []uint32) uint32 {
339	m := uint32(maxUInt32)
340	j := len(xs)
341	for i, x := range xs {
342		if x <= m {
343			m = x
344			j = i
345		}
346	}
347	return uint32(j)
348}
349
350func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
351	if filename {
352		return data.fileNameNgrams.Frequency(ng)
353	}
354
355	if data.ngrams == nil {
356		return 0
357	}
358
359	return data.ngrams.Get(ng).sz
360}
361
362type ngramIterationResults struct {
363	matchIterator
364
365	caseSensitive bool
366	fileName      bool
367	substrBytes   []byte
368	substrLowered []byte
369}
370
371func (r *ngramIterationResults) String() string {
372	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
373}
374
375func (r *ngramIterationResults) candidates() []*candidateMatch {
376	cs := r.matchIterator.candidates()
377	for _, c := range cs {
378		c.caseSensitive = r.caseSensitive
379		c.fileName = r.fileName
380		c.substrBytes = r.substrBytes
381		c.substrLowered = r.substrLowered
382	}
383	return cs
384}
385
386func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
387	str := query.Pattern
388
389	// Find the 2 least common ngrams from the string.
390	ngramOffs := splitNGrams([]byte(query.Pattern))
391	frequencies := make([]uint32, 0, len(ngramOffs))
392	for _, o := range ngramOffs {
393		var freq uint32
394		if query.CaseSensitive {
395			freq = d.ngramFrequency(o.ngram, query.FileName)
396		} else {
397			for _, v := range generateCaseNgrams(o.ngram) {
398				freq += d.ngramFrequency(v, query.FileName)
399			}
400		}
401
402		if freq == 0 {
403			return &ngramIterationResults{
404				matchIterator: &noMatchTree{
405					Why: "freq=0",
406				},
407			}, nil
408		}
409
410		frequencies = append(frequencies, freq)
411	}
412	firstI := firstMinarg(frequencies)
413	frequencies[firstI] = maxUInt32
414	lastI := lastMinarg(frequencies)
415	if firstI > lastI {
416		lastI, firstI = firstI, lastI
417	}
418
419	firstNG := ngramOffs[firstI].ngram
420	lastNG := ngramOffs[lastI].ngram
421	iter := &ngramDocIterator{
422		leftPad:  firstI,
423		rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
424	}
425	if query.FileName {
426		iter.ends = d.fileNameEndRunes
427	} else {
428		iter.ends = d.fileEndRunes
429	}
430
431	if firstI != lastI {
432		i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)
433		if err != nil {
434			return nil, err
435		}
436
437		iter.iter = i
438	} else {
439		hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
440		if err != nil {
441			return nil, err
442		}
443		iter.iter = hitIter
444	}
445
446	patBytes := []byte(query.Pattern)
447	lowerPatBytes := toLower(patBytes)
448
449	return &ngramIterationResults{
450		matchIterator: iter,
451		caseSensitive: query.CaseSensitive,
452		fileName:      query.FileName,
453		substrBytes:   patBytes,
454		substrLowered: lowerPatBytes,
455	}, nil
456}
457
458func (d *indexData) fileName(i uint32) []byte {
459	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
460}
461
462func (d *indexData) numDocs() uint32 {
463	return uint32(len(d.fileBranchMasks))
464}
465
466func (s *indexData) Close() {
467	s.file.Close()
468}
469
470const (
471	rawConfigYes = 1
472	rawConfigNo  = 2
473)
474
475// encodeRawConfig encodes a rawConfig map into a uint8 mask.
476func encodeRawConfig(rawConfig map[string]string) uint8 {
477	var encoded uint8
478	for i, f := range []string{"public", "fork", "archived"} {
479		var e uint8
480		v, ok := rawConfig[f]
481		if ok && v == "1" {
482			e |= rawConfigYes
483		} else {
484			e |= rawConfigNo
485		}
486		encoded = encoded | e<<(2*i)
487	}
488	return encoded
489}
Configure Feed

Configure Feed