index/indexdata.go at 05b2b92c2ebd0927249db50155dfe7375e0ed525 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / indexdata.go
at 05b2b92c2ebd0927249db50155dfe7375e0ed525 15 kB View raw
Julie Tibshirani ranking: incorporate file signals into BM25F (#922) 1y ago
  1// Copyright 2016 Google Inc. All rights reserved.
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//    http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15package index
 16
 17import (
 18	"encoding/binary"
 19	"errors"
 20	"fmt"
 21	"hash/crc64"
 22	"log"
 23	"math/bits"
 24	"slices"
 25	"unicode/utf8"
 26
 27	"github.com/sourcegraph/zoekt"
 28	"github.com/sourcegraph/zoekt/query"
 29)
 30
 31// indexData holds the pattern-independent data that we have to have
 32// in memory to search. Most of the memory is taken up by the ngram =>
 33// offset index.
 34type indexData struct {
 35	symbols symbolData
 36
 37	file IndexFile
 38
 39	contentNgrams btreeIndex
 40
 41	newlinesStart uint32
 42	newlinesIndex []uint32
 43
 44	docSectionsStart uint32
 45	docSectionsIndex []uint32
 46
 47	runeDocSections []DocumentSection
 48
 49	// rune offset=>byte offset mapping, relative to the start of the content corpus
 50	runeOffsets runeOffsetMap
 51
 52	// offsets of file contents; includes end of last file
 53	boundariesStart uint32
 54	boundaries      []uint32
 55
 56	// rune offsets for the file content boundaries
 57	fileEndRunes []uint32
 58
 59	fileNameContent []byte
 60	fileNameIndex   []uint32
 61	fileNameNgrams  btreeIndex
 62
 63	// fileEndSymbol[i] is the index of the first symbol for document i.
 64	fileEndSymbol []uint32
 65
 66	// rune offset=>byte offset mapping, relative to the start of the filename corpus
 67	fileNameRuneOffsets runeOffsetMap
 68
 69	// rune offsets for the file name boundaries
 70	fileNameEndRunes []uint32
 71
 72	fileBranchMasks []uint64
 73
 74	// mask (power of 2) => name
 75	branchNames []map[uint]string
 76
 77	// name => mask (power of 2)
 78	branchIDs []map[string]uint
 79
 80	metaData     zoekt.IndexMetadata
 81	repoMetaData []zoekt.Repository
 82
 83	subRepos     []uint32
 84	subRepoPaths [][]string
 85
 86	// Checksums for all the files, at 8-byte intervals
 87	checksums []byte
 88
 89	// languages for all the files.
 90	languages []byte
 91
 92	// inverse of LanguageMap in metaData
 93	languageMap map[uint16]string
 94
 95	// file categories for all the files.
 96	categories []byte
 97
 98	repoListEntry []zoekt.RepoListEntry
 99
100	// repository indexes for all the files
101	repos []uint16
102
103	// rawConfigMasks contains the encoded RawConfig for each repository
104	rawConfigMasks []uint8
105}
106
107type symbolData struct {
108	// symContent stores Symbol.Sym and Symbol.Parent.
109	// TODO we don't need to store Symbol.Sym.
110	symContent []byte
111	symIndex   []byte
112	// symKindContent is an enum of sym.Kind and sym.ParentKind
113	symKindContent []byte
114	symKindIndex   []uint32
115	// symMetadata is [4]uint32 0 Kind Parent ParentKind
116	symMetaData []byte
117}
118
119func uint32SliceAt(a []byte, n uint32) uint32 {
120	return binary.BigEndian.Uint32(a[n*4:])
121}
122
123func uint32SliceLen(a []byte) uint32 {
124	return uint32(len(a) / 4)
125}
126
127// parent returns index i of the parent enum
128func (d *symbolData) parent(i uint32) []byte {
129	delta := uint32SliceAt(d.symIndex, 0)
130	start := uint32SliceAt(d.symIndex, i) - delta
131	var end uint32
132	if i+1 == uint32SliceLen(d.symIndex) {
133		end = uint32(len(d.symContent))
134	} else {
135		end = uint32SliceAt(d.symIndex, i+1) - delta
136	}
137	return d.symContent[start:end]
138}
139
140// kind returns index i of the kind enum
141func (d *symbolData) kind(i uint32) []byte {
142	return d.symKindContent[d.symKindIndex[i]:d.symKindIndex[i+1]]
143}
144
145// data returns the symbol at index i
146func (d *symbolData) data(i uint32) *zoekt.Symbol {
147	size := uint32(4 * 4) // 4 uint32s
148	offset := i * size
149	if offset >= uint32(len(d.symMetaData)) {
150		return nil
151	}
152
153	metadata := d.symMetaData[offset : offset+size]
154	sym := &zoekt.Symbol{}
155	key := uint32SliceAt(metadata, 1)
156	sym.Kind = string(d.kind(key))
157	key = uint32SliceAt(metadata, 2)
158	sym.Parent = string(d.parent(key))
159	key = uint32SliceAt(metadata, 3)
160	sym.ParentKind = string(d.kind(key))
161	return sym
162}
163
164func (d *indexData) getChecksum(idx uint32) []byte {
165	start := crc64.Size * idx
166	return d.checksums[start : start+crc64.Size]
167}
168
169func (d *indexData) getLanguage(idx uint32) uint16 {
170	if d.metaData.IndexFeatureVersion < 12 {
171		// older zoekt files had 8-bit language entries
172		return uint16(d.languages[idx])
173	}
174	// newer zoekt files have 16-bit language entries
175	return uint16(d.languages[idx*2]) | uint16(d.languages[idx*2+1])<<8
176}
177
178func (d *indexData) getCategory(idx uint32) FileCategory {
179	if len(d.categories) == 0 {
180		// This means we're reading an older index, so return 'missing'
181		return FileCategoryMissing
182	}
183	category, err := decodeCategory(d.categories[idx])
184	if err != nil {
185		return FileCategoryMissing
186	}
187	return category
188}
189
190// calculates stats for files in the range [start, end).
191func (d *indexData) calculateStatsForFileRange(start, end uint32) zoekt.RepoStats {
192	if start >= end {
193		// An empty shard for an empty repository.
194		return zoekt.RepoStats{
195			Shards: 1,
196		}
197	}
198
199	bytesContent := d.boundaries[end] - d.boundaries[start]
200	bytesFN := d.fileNameIndex[end] - d.fileNameIndex[start]
201	count, defaultCount, otherCount := d.calculateNewLinesStats(start, end)
202
203	// CR keegan for stefan: I think we may want to restructure RepoListEntry so
204	// that we don't change anything, except we have
205	// []Repository. Alternatively, things we can divide up we do (like
206	// here). Right now I don't like that these numbers are not true, especially
207	// after aggregation. For now I will move forward with this until we can
208	// chat more.
209	return zoekt.RepoStats{
210		ContentBytes: int64(bytesContent) + int64(bytesFN),
211		Documents:    int(end - start),
212		// CR keegan for stefan: our shard count is going to go out of whack,
213		// since we will aggregate these. So we will report more shards than are
214		// present on disk. What should we do?
215		Shards: 1,
216
217		// Sourcegraph specific
218		NewLinesCount:              count,
219		DefaultBranchNewLinesCount: defaultCount,
220		OtherBranchesNewLinesCount: otherCount,
221	}
222}
223
224func (d *indexData) calculateStats() error {
225	d.repoListEntry = make([]zoekt.RepoListEntry, 0, len(d.repoMetaData))
226	var start, end uint32
227
228	for repoID, md := range d.repoMetaData {
229		// determine the file range for repo i
230		for end < uint32(len(d.repos)) && d.repos[end] == uint16(repoID) {
231			end++
232		}
233		if start < end && d.repos[start] != uint16(repoID) {
234			return fmt.Errorf("shard documents out of order with respect to repositories: expected document %d to be part of repo %d", start, repoID)
235		}
236
237		d.repoListEntry = append(d.repoListEntry, zoekt.RepoListEntry{
238			Repository:    md,
239			IndexMetadata: d.metaData,
240			Stats:         d.calculateStatsForFileRange(start, end),
241		})
242		start = end
243	}
244
245	// All repos in a compound shard share memoryUse. So we average out the
246	// memoryUse per shard in our reporting. This has the benefit that when you
247	// aggregate the IndexBytes you get back the actual memoryUse.
248	//
249	// TODO take into account tombstones for aggregation. Even better, adjust
250	// API to be shard centric not repo centric.
251	if len(d.repoListEntry) > 0 {
252		indexBytes := d.memoryUse()
253		indexBytesChunk := indexBytes / len(d.repoListEntry)
254		for i := range d.repoListEntry {
255			d.repoListEntry[i].Stats.IndexBytes = int64(indexBytesChunk)
256			indexBytes -= indexBytesChunk
257		}
258		d.repoListEntry[0].Stats.IndexBytes += int64(indexBytes)
259	}
260
261	return nil
262}
263
264// calculateNewLinesStats computes some Sourcegraph specific statistics for files
265// in the range [start, end). These are not as efficient to calculate as the
266// normal statistics. We experimentally measured about a 10% slower shard load
267// time. However, we find these values very useful to track and computing them
268// outside of load time introduces a lot of complexity.
269func (d *indexData) calculateNewLinesStats(start, end uint32) (count, defaultCount, otherCount uint64) {
270	for i := start; i < end; i++ {
271		// branchMask is a bitmask of the branches for a document. Zoekt by
272		// convention represents the default branch as the lowest bit.
273		branchMask := d.fileBranchMasks[i]
274		isDefault := (branchMask & 1) == 1
275		others := uint64(bits.OnesCount64(branchMask >> 1))
276
277		// this is readNewlines but only reading the size of each section which
278		// corresponds to the number of newlines.
279		sec := simpleSection{
280			off: d.newlinesStart + d.newlinesIndex[i],
281			sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
282		}
283		// We are only reading the first varint which is the size. So we don't
284		// need to read more than MaxVarintLen64 bytes.
285		if sec.sz > binary.MaxVarintLen64 {
286			sec.sz = binary.MaxVarintLen64
287		}
288		blob, err := d.readSectionBlob(sec)
289		if err != nil {
290			log.Printf("error reading newline index for document %d on shard %s: %v", i, d.file.Name(), err)
291			continue
292		}
293		sz, _ := binary.Uvarint(blob)
294
295		count += sz
296		if isDefault {
297			defaultCount += sz
298		}
299		otherCount += (others * sz)
300	}
301
302	return
303}
304
305func (d *indexData) String() string {
306	return fmt.Sprintf("shard(%s)", d.file.Name())
307}
308
309// calculates an approximate size of indexData in memory in bytes.
310func (d *indexData) memoryUse() int {
311	sz := 0
312	for _, a := range [][]uint32{
313		d.newlinesIndex, d.docSectionsIndex,
314		d.boundaries, d.fileNameIndex,
315		d.fileEndRunes, d.fileNameEndRunes,
316		d.fileEndSymbol, d.symbols.symKindIndex,
317		d.subRepos,
318	} {
319		sz += 4 * len(a)
320	}
321	sz += d.runeOffsets.sizeBytes()
322	sz += d.fileNameRuneOffsets.sizeBytes()
323	sz += len(d.languages)
324	sz += len(d.checksums)
325	sz += 2 * len(d.repos)
326	sz += 8 * len(d.runeDocSections)
327	sz += 8 * len(d.fileBranchMasks)
328	sz += d.contentNgrams.SizeBytes()
329	sz += d.fileNameNgrams.SizeBytes()
330	return sz
331}
332
333// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
334// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
335// overlapping trigrams to keep their intersection as small as possible.
336//
337// Invariant: first will always have a smaller index than last.
338func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
339	first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
340
341	// If the trigrams are overlapping, then try to shift one to reduce overlap.
342	// This is guaranteed to produce a smaller intersection.
343	if last.index-first.index < ngramSize {
344		newFirstIndex := max(last.index-ngramSize, 0)
345		if newFirstIndex != first.index {
346			first = ngramOffs[indexMap[newFirstIndex]]
347		}
348
349		newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
350		if newLastIndex != last.index {
351			last = ngramOffs[indexMap[newLastIndex]]
352		}
353	}
354	return
355}
356
357const maxUInt32 = 0xffffffff
358
359func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
360	// Find the two lowest frequency ngrams.
361	idx0, idx1 := 0, 0
362	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
363	for i, x := range frequencies {
364		if x <= min0 {
365			idx0, idx1 = i, idx0
366			min0, min1 = x, min0
367		} else if x <= min1 {
368			idx1 = i
369			min1 = x
370		}
371	}
372
373	first = ngramOffs[idx0]
374	last = ngramOffs[idx1]
375
376	// Ensure first appears before last as a helpful invariant.
377	if first.index > last.index {
378		last, first = first, last
379	}
380	return
381}
382
383func (data *indexData) ngrams(filename bool) btreeIndex {
384	if filename {
385		return data.fileNameNgrams
386	}
387	return data.contentNgrams
388}
389
390type ngramIterationResults struct {
391	matchIterator
392
393	caseSensitive bool
394	fileName      bool
395	substrBytes   []byte
396	substrLowered []byte
397}
398
399func (r *ngramIterationResults) String() string {
400	return fmt.Sprintf("wrapper(%v)", r.matchIterator)
401}
402
403func (r *ngramIterationResults) candidates() []*candidateMatch {
404	cs := r.matchIterator.candidates()
405	for _, c := range cs {
406		c.caseSensitive = r.caseSensitive
407		c.fileName = r.fileName
408		c.substrBytes = r.substrBytes
409		c.substrLowered = r.substrLowered
410	}
411	return cs
412}
413
414func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
415	str := query.Pattern
416
417	// Find the 2 least common ngrams from the string.
418	ngramOffs := splitNGrams([]byte(str))
419
420	// protect against accidental searching of empty strings
421	if len(ngramOffs) == 0 {
422		return nil, errors.New("iterateNgrams needs non empty string")
423	}
424
425	// PERF: Sort to increase the chances adjacent checks are in the same btree
426	// bucket (which can cause disk IO).
427	slices.SortFunc(ngramOffs, runeNgramOff.Compare)
428	frequencies := make([]uint32, 0, len(ngramOffs))
429	indexMap := make([]int, len(ngramOffs))
430	ngramLookups := 0
431	ngrams := d.ngrams(query.FileName)
432	for i, o := range ngramOffs {
433		var freq uint32
434		if query.CaseSensitive {
435			freq = ngrams.Get(o.ngram).sz
436			ngramLookups++
437		} else {
438			for _, v := range generateCaseNgrams(o.ngram) {
439				freq += ngrams.Get(v).sz
440				ngramLookups++
441			}
442		}
443
444		if freq == 0 {
445			return &ngramIterationResults{
446				matchIterator: &noMatchTree{
447					Why: "freq=0",
448					Stats: zoekt.Stats{
449						NgramLookups: ngramLookups,
450					},
451				},
452			}, nil
453		}
454
455		frequencies = append(frequencies, freq)
456		indexMap[o.index] = i
457	}
458
459	first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
460
461	iter := &ngramDocIterator{
462		leftPad:      uint32(first.index),
463		rightPad:     uint32(utf8.RuneCountInString(str) - first.index),
464		ngramLookups: ngramLookups,
465	}
466	if query.FileName {
467		iter.ends = d.fileNameEndRunes
468	} else {
469		iter.ends = d.fileEndRunes
470	}
471
472	if first != last {
473		runeDist := uint32(last.index - first.index)
474		i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
475		if err != nil {
476			return nil, err
477		}
478
479		iter.iter = i
480	} else {
481		hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
482		if err != nil {
483			return nil, err
484		}
485		iter.iter = hitIter
486	}
487
488	patBytes := []byte(query.Pattern)
489	lowerPatBytes := toLower(patBytes)
490
491	return &ngramIterationResults{
492		matchIterator: iter,
493		caseSensitive: query.CaseSensitive,
494		fileName:      query.FileName,
495		substrBytes:   patBytes,
496		substrLowered: lowerPatBytes,
497	}, nil
498}
499
500func (d *indexData) fileName(i uint32) []byte {
501	return d.fileNameContent[d.fileNameIndex[i]:d.fileNameIndex[i+1]]
502}
503
504func (d *indexData) numDocs() uint32 {
505	return uint32(len(d.fileBranchMasks))
506}
507
508func (s *indexData) Close() {
509	s.file.Close()
510}
511
512const (
513	rawConfigYes = 1
514	rawConfigNo  = 2
515)
516
517// encodeRawConfig encodes a rawConfig map into a uint8 mask.
518func encodeRawConfig(rawConfig map[string]string) uint8 {
519	var encoded uint8
520	for i, f := range []string{"public", "fork", "archived"} {
521		var e uint8
522		v, ok := rawConfig[f]
523		if ok && v == "1" {
524			e |= rawConfigYes
525		} else {
526			e |= rawConfigNo
527		}
528		encoded = encoded | e<<(2*i)
529	}
530	return encoded
531}
Configure Feed

Configure Feed