indexdata.go at db940cfc367df55a0e88677096d139bc9aa5f5af · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / indexdata.go
at db940cfc367df55a0e88677096d139bc9aa5f5af 14 kB View raw
Stefan Hengl ranking: removing document ranks (#853) 2y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package zoekt
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"slices"
 25	"unicode/utf8"
 26
 27	"github.com/sourcegraph/zoekt/query"
 28)
 29
 30// indexData holds the pattern-independent data that we have to have
 31// in memory to search. Most of the memory is taken up by the ngram =>
 32// offset index.
 33type indexData struct {
 34	symbols symbolData
 35
 36	file IndexFile
 37
 38	contentNgrams btreeIndex
 39
 40	newlinesStart uint32
 41	newlinesIndex []uint32
 42
 43	docSectionsStart uint32
 44	docSectionsIndex []uint32
 45
 46	runeDocSections []DocumentSection
 47
 48	// rune offset=>byte offset mapping, relative to the start of the content corpus
 49	runeOffsets runeOffsetMap
 50
 51	// offsets of file contents; includes end of last file
 52	boundariesStart uint32
 53	boundaries      []uint32
 54
 55	// rune offsets for the file content boundaries
 56	fileEndRunes []uint32
 57
 58	fileNameContent []byte
 59	fileNameIndex   []uint32
 60	fileNameNgrams  btreeIndex
 61
 62	// fileEndSymbol[i] is the index of the first symbol for document i.
 63	fileEndSymbol []uint32
 64
 65	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 66	fileNameRuneOffsets runeOffsetMap
 67
 68	// rune offsets for the file name boundaries
 69	fileNameEndRunes []uint32
 70
 71	fileBranchMasks []uint64
 72
 73	// mask (power of 2) => name
 74	branchNames []map[uint]string
 75
 76	// name => mask (power of 2)
 77	branchIDs []map[string]uint
 78
 79	metaData     IndexMetadata
 80	repoMetaData []Repository
 81
 82	subRepos     []uint32
 83	subRepoPaths [][]string
 84
 85	// Checksums for all the files, at 8-byte intervals
 86	checksums []byte
 87
 88	// languages for all the files.
 89	languages []byte
 90
 91	// inverse of LanguageMap in metaData
 92	languageMap map[uint16]string
 93
 94	repoListEntry []RepoListEntry
 95
 96	// repository indexes for all the files
 97	repos []uint16
 98
 99	// rawConfigMasks contains the encoded RawConfig for each repository
100	rawConfigMasks []uint8
101}
102
103type symbolData struct {
104	// symContent stores Symbol.Sym and Symbol.Parent.
105	// TODO we don't need to store Symbol.Sym.
106	symContent []byte
107	symIndex   []byte
108	// symKindContent is an enum of sym.Kind and sym.ParentKind
109	symKindContent []byte
110	symKindIndex   []uint32
111	// symMetadata is [4]uint32 0 Kind Parent ParentKind
112	symMetaData []byte
113}
114
115func uint32SliceAt(a []byte, n uint32) uint32 {
116	return binary.BigEndian.Uint32(a[n*4:])
117}
118
119func uint32SliceLen(a []byte) uint32 {
120	return uint32(len(a) / 4)
121}
122
123// parent returns index i of the parent enum
124func (d *symbolData) parent(i uint32) []byte {
125	delta := uint32SliceAt(d.symIndex, 0)
126	start := uint32SliceAt(d.symIndex, i) - delta
127	var end uint32
128	if i+1 == uint32SliceLen(d.symIndex) {
129		end = uint32(len(d.symContent))
130	} else {
131		end = uint32SliceAt(d.symIndex, i+1) - delta
132	}
133	return d.symContent[start:end]
134}
135
136// kind returns index i of the kind enum
137func (d *symbolData) kind(i uint32) []byte {
138	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
139}
140
141// data returns the symbol at index i
142func (d *symbolData) data(i uint32) *Symbol {
143	size := uint32(4 * 4) // 4 uint32s
144	offset := i * size
145	if offset >= uint32(len(d.symMetaData)) {
146		return nil
147	}
148
149	metadata := d.symMetaData[offset : offset+size]
150	sym := &Symbol{}
151	key := uint32SliceAt(metadata, 1)
152	sym.Kind = string(d.kind(key))
153	key = uint32SliceAt(metadata, 2)
154	sym.Parent = string(d.parent(key))
155	key = uint32SliceAt(metadata, 3)
156	sym.ParentKind = string(d.kind(key))
157	return sym
158}
159
160func (d *indexData) getChecksum(idx uint32) []byte {
161	start := crc64.Size * idx
162	return d.checksums[start : start+crc64.Size]
163}
164
165func (d *indexData) getLanguage(idx uint32) uint16 {
166	if d.metaData.IndexFeatureVersion < 12 {
167		// older zoekt files had 8-bit language entries
168		return uint16(d.languages[idx])
169	}
170	// newer zoekt files have 16-bit language entries
171	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
172}
173
174// calculates stats for files in the range [start, end).
175func (d *indexData) calculateStatsForFileRange(start, end uint32) RepoStats {
176	if start >= end {
177		// An empty shard for an empty repository.
178		return RepoStats{
179			Shards: 1,
180		}
181	}
182
183	bytesContent := d.boundaries[end] - d.boundaries[start]
184	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
185	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
186
187	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
188	// that we don't change anything, except we have
189	// []Repository. Alternatively, things we can divide up we do (like
190	// here). Right now I don't like that these numbers are not true, especially
191	// after aggregation. For now I will move forward with this until we can
192	// chat more.
193	return RepoStats{
194		ContentBytes: int64(bytesContent) + int64(bytesFN),
195		Documents:    int(end - start),
196		// CR keegan for stefan: our shard count is going to go out of whack,
197		// since we will aggregate these. So we will report more shards than are
198		// present on disk. What should we do?
199		Shards: 1,
200
201		// Sourcegraph specific
202		NewLinesCount:              count,
203		DefaultBranchNewLinesCount: defaultCount,
204		OtherBranchesNewLinesCount: otherCount,
205	}
206}
207
208func (d *indexData) calculateStats() error {
209	d.repoListEntry = make([]RepoListEntry, 0, len(d.repoMetaData))
210	var start, end uint32
211
212	for repoID, md := range d.repoMetaData {
213		// determine the file range for repo i
214		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
215			end++
216		}
217		if start < end && d.repos[start] != uint16(repoID) {
218			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
219		}
220
221		d.repoListEntry = append(d.repoListEntry, RepoListEntry{
222			Repository:    md,
223			IndexMetadata: d.metaData,
224			Stats:         d.calculateStatsForFileRange(start, end),
225		})
226		start = end
227	}
228
229	// All repos in a compound shard share memoryUse. So we average out the
230	// memoryUse per shard in our reporting. This has the benefit that when you
231	// aggregate the IndexBytes you get back the actual memoryUse.
232	//
233	// TODO take into account tombstones for aggregation. Even better, adjust
234	// API to be shard centric not repo centric.
235	if len(d.repoListEntry) > 0 {
236		indexBytes := d.memoryUse()
237		indexBytesChunk := indexBytes / len(d.repoListEntry)
238		for i := range d.repoListEntry {
239			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
240			indexBytes -= indexBytesChunk
241		}
242		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
243	}
244
245	return nil
246}
247
248// calculateNewLinesStats computes some Sourcegraph specific statistics for files
249// in the range [start, end). These are not as efficient to calculate as the
250// normal statistics. We experimentally measured about a 10% slower shard load
251// time. However, we find these values very useful to track and computing them
252// outside of load time introduces a lot of complexity.
253func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
254	for i := start; i < end; i++ {
255		// branchMask is a bitmask of the branches for a document. Zoekt by
256		// convention represents the default branch as the lowest bit.
257		branchMask := d.fileBranchMasks[i]
258		isDefault := (branchMask & 1) == 1
259		others := uint64(bits.OnesCount64(branchMask >> 1))
260
261		// this is readNewlines but only reading the size of each section which
262		// corresponds to the number of newlines.
263		sec := simpleSection{
264			off: d.newlinesStart + d.newlinesIndex[i],
265			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
266		}
267		// We are only reading the first varint which is the size. So we don't
268		// need to read more than MaxVarintLen64 bytes.
269		if sec.sz > binary.MaxVarintLen64 {
270			sec.sz = binary.MaxVarintLen64
271		}
272		blob, err := d.readSectionBlob(sec)
273		if err != nil {
274			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
275			continue
276		}
277		sz, _ := binary.Uvarint(blob)
278
279		count += sz
280		if isDefault {
281			defaultCount += sz
282		}
283		otherCount += (others * sz)
284	}
285
286	return
287}
288
289func (d *indexData) String() string {
290	return fmt.Sprintf("shard(%s)", d.file.Name())
291}
292
293// calculates an approximate size of indexData in memory in bytes.
294func (d *indexData) memoryUse() int {
295	sz := 0
296	for _, a := range [][]uint32{
297		d.newlinesIndex, d.docSectionsIndex,
298		d.boundaries, d.fileNameIndex,
299		d.fileEndRunes, d.fileNameEndRunes,
300		d.fileEndSymbol, d.symbols.symKindIndex,
301		d.subRepos,
302	} {
303		sz += 4 * len(a)
304	}
305	sz += d.runeOffsets.sizeBytes()
306	sz += d.fileNameRuneOffsets.sizeBytes()
307	sz += len(d.languages)
308	sz += len(d.checksums)
309	sz += 2 * len(d.repos)
310	sz += 8 * len(d.runeDocSections)
311	sz += 8 * len(d.fileBranchMasks)
312	sz += d.contentNgrams.SizeBytes()
313	sz += d.fileNameNgrams.SizeBytes()
314	return sz
315}
316
317// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
318// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
319// overlapping trigrams to keep their intersection as small as possible.
320//
321// Invariant: first will always have a smaller index than last.
322func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
323	first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
324
325	// If the trigrams are overlapping, then try to shift one to reduce overlap.
326	// This is guaranteed to produce a smaller intersection.
327	if last.index-first.index < ngramSize {
328		newFirstIndex := max(last.index-ngramSize, 0)
329		if newFirstIndex != first.index {
330			first = ngramOffs[indexMap[newFirstIndex]]
331		}
332
333		newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
334		if newLastIndex != last.index {
335			last = ngramOffs[indexMap[newLastIndex]]
336		}
337	}
338	return
339}
340
341const maxUInt32 = 0xffffffff
342
343func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
344	// Find the two lowest frequency ngrams.
345	idx0, idx1 := 0, 0
346	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
347	for i, x := range frequencies {
348		if x <= min0 {
349			idx0, idx1 = i, idx0
350			min0, min1 = x, min0
351		} else if x <= min1 {
352			idx1 = i
353			min1 = x
354		}
355	}
356
357	first = ngramOffs[idx0]
358	last = ngramOffs[idx1]
359
360	// Ensure first appears before last as a helpful invariant.
361	if first.index > last.index {
362		last, first = first, last
363	}
364	return
365}
366
367func (data *indexData) ngrams(filename bool) btreeIndex {
368	if filename {
369		return data.fileNameNgrams
370	}
371	return data.contentNgrams
372}
373
374type ngramIterationResults struct {
375	matchIterator
376
377	caseSensitive bool
378	fileName      bool
379	substrBytes   []byte
380	substrLowered []byte
381}
382
383func (r *ngramIterationResults) String() string {
384	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
385}
386
387func (r *ngramIterationResults) candidates() []*candidateMatch {
388	cs := r.matchIterator.candidates()
389	for _, c := range cs {
390		c.caseSensitive = r.caseSensitive
391		c.fileName = r.fileName
392		c.substrBytes = r.substrBytes
393		c.substrLowered = r.substrLowered
394	}
395	return cs
396}
397
398func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
399	str := query.Pattern
400
401	// Find the 2 least common ngrams from the string.
402	ngramOffs := splitNGrams([]byte(str))
403
404	// protect against accidental searching of empty strings
405	if len(ngramOffs) == 0 {
406		return nil, errors.New("iterateNgrams needs non empty string")
407	}
408
409	// PERF: Sort to increase the chances adjacent checks are in the same btree
410	// bucket (which can cause disk IO).
411	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
412	frequencies := make([]uint32, 0, len(ngramOffs))
413	indexMap := make([]int, len(ngramOffs))
414	ngramLookups := 0
415	ngrams := d.ngrams(query.FileName)
416	for i, o := range ngramOffs {
417		var freq uint32
418		if query.CaseSensitive {
419			freq = ngrams.Get(o.ngram).sz
420			ngramLookups++
421		} else {
422			for _, v := range generateCaseNgrams(o.ngram) {
423				freq += ngrams.Get(v).sz
424				ngramLookups++
425			}
426		}
427
428		if freq == 0 {
429			return &ngramIterationResults{
430				matchIterator: &noMatchTree{
431					Why: "freq=0",
432					Stats: Stats{
433						NgramLookups: ngramLookups,
434					},
435				},
436			}, nil
437		}
438
439		frequencies = append(frequencies, freq)
440		indexMap[o.index] = i
441	}
442
443	first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
444
445	iter := &ngramDocIterator{
446		leftPad:      uint32(first.index),
447		rightPad:     uint32(utf8.RuneCountInString(str) - first.index),
448		ngramLookups: ngramLookups,
449	}
450	if query.FileName {
451		iter.ends = d.fileNameEndRunes
452	} else {
453		iter.ends = d.fileEndRunes
454	}
455
456	if first != last {
457		runeDist := uint32(last.index - first.index)
458		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
459		if err != nil {
460			return nil, err
461		}
462
463		iter.iter = i
464	} else {
465		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
466		if err != nil {
467			return nil, err
468		}
469		iter.iter = hitIter
470	}
471
472	patBytes := []byte(query.Pattern)
473	lowerPatBytes := toLower(patBytes)
474
475	return &ngramIterationResults{
476		matchIterator: iter,
477		caseSensitive: query.CaseSensitive,
478		fileName:      query.FileName,
479		substrBytes:   patBytes,
480		substrLowered: lowerPatBytes,
481	}, nil
482}
483
484func (d *indexData) fileName(i uint32) []byte {
485	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
486}
487
488func (d *indexData) numDocs() uint32 {
489	return uint32(len(d.fileBranchMasks))
490}
491
492func (s *indexData) Close() {
493	s.file.Close()
494}
495
496const (
497	rawConfigYes = 1
498	rawConfigNo  = 2
499)
500
501// encodeRawConfig encodes a rawConfig map into a uint8 mask.
502func encodeRawConfig(rawConfig map[string]string) uint8 {
503	var encoded uint8
504	for i, f := range []string{"public", "fork", "archived"} {
505		var e uint8
506		v, ok := rawConfig[f]
507		if ok && v == "1" {
508			e |= rawConfigYes
509		} else {
510			e |= rawConfigNo
511		}
512		encoded = encoded | e<<(2*i)
513	}
514	return encoded
515}
Configure Feed

Configure Feed