fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at tngl 6.5 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bufio" 19 "bytes" 20 "cmp" 21 "encoding/binary" 22 "encoding/json" 23 "fmt" 24 "io" 25 "slices" 26 "sort" 27 "time" 28 29 "github.com/sourcegraph/zoekt" 30 31 "github.com/RoaringBitmap/roaring" 32) 33 34func (w *writer) writeTOC(toc *indexTOC) { 35 // Tagged sections are indicated with a 0 section count. 36 // Tagged sections allow easier forwards and backwards 37 // compatibility when evolving zoekt index files with new 38 // sections. 39 // 40 // A tagged section is: 41 // Varint TagLen, Tag String, Varint Kind, Section 42 // 43 // Section kind is indicated because simpleSections and 44 // compoundSections have different lengths. 45 w.U32(0) 46 secs := toc.sectionsTaggedList() 47 for _, s := range secs { 48 w.String(s.tag) 49 w.Varint(uint32(s.sec.kind())) 50 s.sec.write(w) 51 } 52} 53 54func (s *compoundSection) writeStrings(w *writer, strs []*searchableString) { 55 s.start(w) 56 for _, f := range strs { 57 s.addItem(w, f.data) 58 } 59 s.end(w) 60} 61 62func (s *compoundSection) writeMap(w *writer, m map[string]uint32) { 63 keys := make([]*searchableString, 0, len(m)) 64 for k := range m { 65 keys = append(keys, &searchableString{ 66 data: []byte(k), 67 }) 68 } 69 sort.Slice(keys, func(i, j int) bool { 70 return m[string(keys[i].data)] < m[string(keys[j].data)] 71 }) 72 s.writeStrings(w, keys) 73} 74 75func writeUint32Bitmap(w *writer, dat []uint32) { 76 rb := roaring.BitmapOf(dat...) 77 rb.RunOptimize() 78 rb.WriteTo(w) 79} 80 81func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, 82 charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection, 83) { 84 // Collect ngrams from both the ASCII direct-indexed array and the 85 // non-ASCII map, then sort by ngram value. 86 type ngramPosting struct { 87 ng ngram 88 pl *postingList 89 } 90 all := make([]ngramPosting, 0, len(s.asciiPopulated)+len(s.postings)) 91 for _, idx := range s.asciiPopulated { 92 pl := s.asciiPostings[idx] 93 if len(pl.data) > 0 { 94 all = append(all, ngramPosting{asciiIndexToNgram(idx), pl}) 95 } 96 } 97 for k, pl := range s.postings { 98 if len(pl.data) > 0 { 99 all = append(all, ngramPosting{k, pl}) 100 } 101 } 102 slices.SortFunc(all, func(a, b ngramPosting) int { return cmp.Compare(a.ng, b.ng) }) 103 104 ngramText.start(w) 105 for _, np := range all { 106 var buf [8]byte 107 binary.BigEndian.PutUint64(buf[:], uint64(np.ng)) 108 w.Write(buf[:]) 109 } 110 ngramText.end(w) 111 112 postings.start(w) 113 for _, np := range all { 114 postings.addItem(w, np.pl.data) 115 } 116 postings.end(w) 117 118 charOffsets.start(w) 119 w.Write(toSizedDeltas(s.runeOffsets)) 120 charOffsets.end(w) 121 122 endRunes.start(w) 123 w.Write(toSizedDeltas(s.endRunes)) 124 endRunes.end(w) 125} 126 127func (b *ShardBuilder) Write(out io.Writer) error { 128 next := b.indexFormatVersion == NextIndexFormatVersion 129 130 buffered := bufio.NewWriterSize(out, 1<<20) 131 defer buffered.Flush() 132 133 w := &writer{w: buffered} 134 toc := indexTOC{} 135 136 toc.fileContents.writeStrings(w, b.contentStrings) 137 toc.newlines.start(w) 138 for _, f := range b.contentStrings { 139 toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data))) 140 } 141 toc.newlines.end(w) 142 143 toc.fileEndSymbol.start(w) 144 for _, m := range b.fileEndSymbol { 145 w.U32(m) 146 } 147 toc.fileEndSymbol.end(w) 148 149 toc.symbolMap.writeMap(w, b.symIndex) 150 toc.symbolKindMap.writeMap(w, b.symKindIndex) 151 toc.symbolMetaData.start(w) 152 for _, m := range b.symMetaData { 153 w.U32(m) 154 } 155 toc.symbolMetaData.end(w) 156 157 toc.branchMasks.start(w) 158 for _, m := range b.branchMasks { 159 w.U64(m) 160 } 161 toc.branchMasks.end(w) 162 163 toc.fileSections.start(w) 164 for _, s := range b.docSections { 165 toc.fileSections.addItem(w, marshalDocSections(s)) 166 } 167 toc.fileSections.end(w) 168 169 writePostings(w, b.contentPostings, &toc.ngramText, &toc.runeOffsets, &toc.postings, &toc.fileEndRunes) 170 171 // names. 172 toc.fileNames.writeStrings(w, b.nameStrings) 173 174 writePostings(w, b.namePostings, &toc.nameNgramText, &toc.nameRuneOffsets, &toc.namePostings, &toc.nameEndRunes) 175 176 toc.subRepos.start(w) 177 w.Write(toSizedDeltas(b.subRepos)) 178 toc.subRepos.end(w) 179 180 toc.contentChecksums.start(w) 181 w.Write(b.checksums) 182 toc.contentChecksums.end(w) 183 184 toc.languages.start(w) 185 w.Write(b.languages) 186 toc.languages.end(w) 187 188 toc.categories.start(w) 189 w.Write(b.categories) 190 toc.categories.end(w) 191 192 toc.runeDocSections.start(w) 193 w.Write(marshalDocSections(b.runeDocSections)) 194 toc.runeDocSections.end(w) 195 196 if next { 197 toc.repos.start(w) 198 w.Write(toSizedDeltas16(b.repos)) 199 toc.repos.end(w) 200 } 201 202 if repoIDs, ok := b.repoIDs(); ok && next { 203 toc.reposIDsBitmap.start(w) 204 writeUint32Bitmap(w, repoIDs) 205 toc.reposIDsBitmap.end(w) 206 } 207 208 indexTime := b.IndexTime 209 if indexTime.IsZero() { 210 indexTime = time.Now().UTC() 211 } 212 213 if err := b.writeJSON(&zoekt.IndexMetadata{ 214 IndexFormatVersion: b.indexFormatVersion, 215 IndexTime: indexTime, 216 IndexFeatureVersion: b.featureVersion, 217 IndexMinReaderVersion: WriteMinFeatureVersion, 218 PlainASCII: b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII, 219 LanguageMap: b.languageMap, 220 ZoektVersion: Version, 221 ID: b.ID, 222 }, &toc.metaData, w); err != nil { 223 return err 224 } 225 226 if next { 227 if err := b.writeJSON(b.repoList, &toc.repoMetaData, w); err != nil { 228 return err 229 } 230 } else { 231 if len(b.repoList) != 1 { 232 return fmt.Errorf("have %d repos, but only support 1 in index format version %d", len(b.repoList), b.indexFormatVersion) 233 } 234 if err := b.writeJSON(b.repoList[0], &toc.repoMetaData, w); err != nil { 235 return err 236 } 237 } 238 239 var tocSection simpleSection 240 241 tocSection.start(w) 242 w.writeTOC(&toc) 243 tocSection.end(w) 244 tocSection.write(w) 245 return w.err 246} 247 248func (b *ShardBuilder) writeJSON(data any, sec *simpleSection, w *writer) error { 249 blob, err := json.Marshal(data) 250 if err != nil { 251 return err 252 } 253 sec.start(w) 254 w.Write(blob) 255 sec.end(w) 256 return nil 257} 258 259func newLinesIndices(in []byte) []uint32 { 260 out := make([]uint32, 0, bytes.Count(in, []byte{'\n'})) 261 for i, c := range in { 262 if c == '\n' { 263 out = append(out, uint32(i)) 264 } 265 } 266 return out 267}