fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "text/template" 29 "time" 30 "unicode/utf8" 31 32 "github.com/sourcegraph/zoekt" 33 "github.com/sourcegraph/zoekt/internal/languages" 34) 35 36var _ = log.Println 37 38const ngramSize = 3 39 40type searchableString struct { 41 data []byte 42} 43 44// Filled by the linker 45var Version string 46 47func HostnameBestEffort() string { 48 if h := os.Getenv("NODE_NAME"); h != "" { 49 return h 50 } 51 if h := os.Getenv("HOSTNAME"); h != "" { 52 return h 53 } 54 hostname, _ := os.Hostname() 55 return hostname 56} 57 58// Store character (unicode codepoint) offset (in bytes) this often. 59const runeOffsetFrequency = 100 60 61type postingsBuilder struct { 62 postings map[ngram][]byte 63 lastOffsets map[ngram]uint32 64 65 // To support UTF-8 searching, we must map back runes to byte 66 // offsets. As a first attempt, we sample regularly. The 67 // precise offset can be found by walking from the recorded 68 // offset to the desired rune. 69 runeOffsets []uint32 70 runeCount uint32 71 72 isPlainASCII bool 73 74 endRunes []uint32 75 endByte uint32 76} 77 78func newPostingsBuilder() *postingsBuilder { 79 return &postingsBuilder{ 80 postings: map[ngram][]byte{}, 81 lastOffsets: map[ngram]uint32{}, 82 isPlainASCII: true, 83 } 84} 85 86// Store trigram offsets for the given UTF-8 data. The 87// DocumentSections must correspond to rune boundaries in the UTF-8 88// data. 89func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 90 dest := searchableString{ 91 data: data, 92 } 93 var buf [8]byte 94 var runeGram [3]rune 95 96 var runeIndex uint32 97 byteCount := 0 98 dataSz := uint32(len(data)) 99 100 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 101 for _, s := range byteSections { 102 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 103 } 104 var runeSectionBoundaries []uint32 105 106 endRune := s.runeCount 107 for ; len(data) > 0; runeIndex++ { 108 c, sz := utf8.DecodeRune(data) 109 if sz > 1 { 110 s.isPlainASCII = false 111 } 112 data = data[sz:] 113 114 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 115 116 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 117 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 118 } 119 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 120 runeSectionBoundaries = append(runeSectionBoundaries, 121 endRune+uint32(runeIndex)) 122 byteSectionBoundaries = byteSectionBoundaries[1:] 123 } 124 125 byteCount += sz 126 127 if runeIndex < 2 { 128 continue 129 } 130 131 ng := runesToNGram(runeGram) 132 lastOff := s.lastOffsets[ng] 133 newOff := endRune + uint32(runeIndex) - 2 134 135 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 136 s.postings[ng] = append(s.postings[ng], buf[:m]...) 137 s.lastOffsets[ng] = newOff 138 } 139 s.runeCount += runeIndex 140 141 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 142 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 143 } 144 145 // Handle symbol definition that ends at file end. This can 146 // happen for labels at the end of .bat files. 147 148 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 149 runeSectionBoundaries = append(runeSectionBoundaries, 150 endRune+runeIndex) 151 byteSectionBoundaries = byteSectionBoundaries[1:] 152 } 153 runeSecs := make([]DocumentSection, 0, len(byteSections)) 154 for i := 0; i < len(runeSectionBoundaries); i += 2 { 155 runeSecs = append(runeSecs, DocumentSection{ 156 Start: runeSectionBoundaries[i], 157 End: runeSectionBoundaries[i+1], 158 }) 159 } 160 161 s.endRunes = append(s.endRunes, s.runeCount) 162 s.endByte += dataSz 163 return &dest, runeSecs, nil 164} 165 166// ShardBuilder builds a single index shard. 167type ShardBuilder struct { 168 // The version we will write to disk. Sourcegraph Specific. This is to 169 // enable feature flagging new format versions. 170 indexFormatVersion int 171 featureVersion int 172 173 contentStrings []*searchableString 174 nameStrings []*searchableString 175 docSections [][]DocumentSection 176 runeDocSections []DocumentSection 177 178 symID uint32 179 symIndex map[string]uint32 180 symKindID uint32 181 symKindIndex map[string]uint32 182 symMetaData []uint32 183 184 fileEndSymbol []uint32 185 186 checksums []byte 187 188 branchMasks []uint64 189 subRepos []uint32 190 191 // docID => repoID 192 repos []uint16 193 194 contentPostings *postingsBuilder 195 namePostings *postingsBuilder 196 197 // root repositories 198 repoList []zoekt.Repository 199 200 // name to index. 201 subRepoIndices []map[string]uint32 202 203 // language => language code 204 languageMap map[string]uint16 205 206 // language codes, uint16 encoded as little-endian 207 languages []uint8 208 209 // IndexTime will be used as the time if non-zero. Otherwise 210 // time.Now(). This is useful for doing reproducible builds in tests. 211 IndexTime time.Time 212 213 // a sortable 20 chars long id. 214 ID string 215} 216 217func verify(repo *zoekt.Repository) error { 218 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 219 if _, err := ParseTemplate(t); err != nil { 220 return err 221 } 222 } 223 return nil 224} 225 226func urlJoinPath(base string, elem ...string) string { 227 // golangs html/template always escapes "+" appearing in an HTML attribute 228 // [1]. We may even want to treat more characters, differently but this 229 // atleast makes it possible to visit URLs like [2]. 230 // 231 // We only do this to elem since base will normally be a hardcoded string. 232 // 233 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 234 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 235 elem = append([]string{}, elem...) // copy to mutate 236 for i := range elem { 237 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 238 } 239 u, err := url.JoinPath(base, elem...) 240 if err != nil { 241 return "#!error: " + err.Error() 242 } 243 return u 244} 245 246// ParseTemplate will parse the templates for FileURLTemplate, 247// LineFragmentTemplate and CommitURLTemplate. 248// 249// It makes available the extra function UrlJoinPath. 250func ParseTemplate(text string) (*template.Template, error) { 251 return template.New("").Funcs(template.FuncMap{ 252 "URLJoinPath": urlJoinPath, 253 }).Parse(text) 254} 255 256// ContentSize returns the number of content bytes so far ingested. 257func (b *ShardBuilder) ContentSize() uint32 { 258 // Add the name too so we don't skip building index if we have 259 // lots of empty files. 260 return b.contentPostings.endByte + b.namePostings.endByte 261} 262 263// NumFiles returns the number of files added to this builder 264func (b *ShardBuilder) NumFiles() int { 265 return len(b.contentStrings) 266} 267 268// NewShardBuilder creates a fresh ShardBuilder. The passed in 269// Repository contains repo metadata, and may be set to nil. 270func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 271 b := newShardBuilder() 272 273 if r == nil { 274 r = &zoekt.Repository{} 275 } 276 if err := b.setRepository(r); err != nil { 277 return nil, err 278 } 279 return b, nil 280} 281 282func newShardBuilder() *ShardBuilder { 283 return &ShardBuilder{ 284 indexFormatVersion: IndexFormatVersion, 285 featureVersion: FeatureVersion, 286 287 contentPostings: newPostingsBuilder(), 288 namePostings: newPostingsBuilder(), 289 fileEndSymbol: []uint32{0}, 290 symIndex: make(map[string]uint32), 291 symKindIndex: make(map[string]uint32), 292 languageMap: make(map[string]uint16), 293 } 294} 295 296func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 297 if err := verify(desc); err != nil { 298 return err 299 } 300 301 if len(desc.Branches) > 64 { 302 return fmt.Errorf("too many branches") 303 } 304 305 repo := *desc 306 307 // copy subrepomap without root 308 repo.SubRepoMap = map[string]*zoekt.Repository{} 309 for k, v := range desc.SubRepoMap { 310 if k != "" { 311 repo.SubRepoMap[k] = v 312 } 313 } 314 315 b.repoList = append(b.repoList, repo) 316 317 return b.populateSubRepoIndices() 318} 319 320type symbolSlice struct { 321 symbols []DocumentSection 322 metaData []*zoekt.Symbol 323} 324 325func (s symbolSlice) Len() int { return len(s.symbols) } 326 327func (s symbolSlice) Swap(i, j int) { 328 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 329 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 330} 331 332func (s symbolSlice) Less(i, j int) bool { 333 return s.symbols[i].Start < s.symbols[j].Start 334} 335 336// AddFile is a convenience wrapper for Add 337func (b *ShardBuilder) AddFile(name string, content []byte) error { 338 return b.Add(Document{Name: name, Content: content}) 339} 340 341func (b *ShardBuilder) populateSubRepoIndices() error { 342 if len(b.subRepoIndices) == len(b.repoList) { 343 return nil 344 } 345 if len(b.subRepoIndices) != len(b.repoList)-1 { 346 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 347 } 348 repo := b.repoList[len(b.repoList)-1] 349 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 350 return nil 351} 352 353func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 354 paths := []string{""} 355 for k := range repo.SubRepoMap { 356 paths = append(paths, k) 357 } 358 sort.Strings(paths) 359 subRepoIndices := make(map[string]uint32, len(paths)) 360 for i, p := range paths { 361 subRepoIndices[p] = uint32(i) 362 } 363 return subRepoIndices 364} 365 366const notIndexedMarker = "NOT-INDEXED: " 367 368func (b *ShardBuilder) symbolID(sym string) uint32 { 369 if _, ok := b.symIndex[sym]; !ok { 370 b.symIndex[sym] = b.symID 371 b.symID++ 372 } 373 return b.symIndex[sym] 374} 375 376func (b *ShardBuilder) symbolKindID(t string) uint32 { 377 if _, ok := b.symKindIndex[t]; !ok { 378 b.symKindIndex[t] = b.symKindID 379 b.symKindID++ 380 } 381 return b.symKindIndex[t] 382} 383 384func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 385 for _, sym := range symbols { 386 b.symMetaData = append(b.symMetaData, 387 // This field was removed due to redundancy. To avoid 388 // needing to reindex, it is set to zero for now. In the 389 // future, this field will be completely removed. It 390 // will require incrementing the feature version. 391 0, 392 b.symbolKindID(sym.Kind), 393 b.symbolID(sym.Parent), 394 b.symbolKindID(sym.ParentKind)) 395 } 396} 397 398func DetermineLanguageIfUnknown(doc *Document) { 399 if doc.Language != "" { 400 return 401 } 402 403 if doc.SkipReason != "" { 404 // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 405 // In this case, we just guess the language based on file name to avoid examining the contents. 406 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). 407 doc.Language = languages.GetLanguage(doc.Name, nil) 408 } else { 409 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 410 } 411} 412 413// Add a file which only occurs in certain branches. 414func (b *ShardBuilder) Add(doc Document) error { 415 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 416 417 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 418 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 419 } 420 421 if doc.SkipReason != "" { 422 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 423 doc.Symbols = nil 424 doc.SymbolsMetaData = nil 425 } 426 427 DetermineLanguageIfUnknown(&doc) 428 429 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 430 var last DocumentSection 431 for i, s := range doc.Symbols { 432 if i > 0 { 433 if last.End > s.Start { 434 return fmt.Errorf("sections overlap") 435 } 436 } 437 last = s 438 } 439 if last.End > uint32(len(doc.Content)) { 440 return fmt.Errorf("section goes past end of content") 441 } 442 443 if doc.SubRepositoryPath != "" { 444 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 445 if err != nil || rel == doc.Name { 446 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 447 } 448 } 449 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 450 if err != nil { 451 return err 452 } 453 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 454 if err != nil { 455 return err 456 } 457 b.addSymbols(doc.SymbolsMetaData) 458 459 repoIdx := len(b.repoList) - 1 460 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 461 if !ok { 462 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 463 } 464 465 var mask uint64 466 for _, br := range doc.Branches { 467 m := b.branchMask(br) 468 if m == 0 { 469 return fmt.Errorf("no branch found for %s", br) 470 } 471 mask |= m 472 } 473 474 if repoIdx > 1<<16 { 475 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 476 } 477 478 b.subRepos = append(b.subRepos, subRepoIdx) 479 b.repos = append(b.repos, uint16(repoIdx)) 480 481 hasher.Write(doc.Content) 482 483 b.contentStrings = append(b.contentStrings, docStr) 484 b.runeDocSections = append(b.runeDocSections, runeSecs...) 485 486 b.nameStrings = append(b.nameStrings, nameStr) 487 b.docSections = append(b.docSections, doc.Symbols) 488 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 489 b.branchMasks = append(b.branchMasks, mask) 490 b.checksums = append(b.checksums, hasher.Sum(nil)...) 491 492 langCode, ok := b.languageMap[doc.Language] 493 if !ok { 494 if len(b.languageMap) >= 65535 { 495 return fmt.Errorf("too many languages") 496 } 497 langCode = uint16(len(b.languageMap)) 498 b.languageMap[doc.Language] = langCode 499 } 500 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 501 502 return nil 503} 504 505func (b *ShardBuilder) branchMask(br string) uint64 { 506 for i, b := range b.repoList[len(b.repoList)-1].Branches { 507 if b.Name == br { 508 return uint64(1) << uint(i) 509 } 510 } 511 return 0 512} 513 514type DocChecker struct { 515 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 516 trigrams map[ngram]struct{} 517} 518 519// Check returns a reason why the given contents are probably not source texts. 520func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 521 if len(content) == 0 { 522 return nil 523 } 524 525 if len(content) < ngramSize { 526 return fmt.Errorf("file size smaller than %d", ngramSize) 527 } 528 529 if index := bytes.IndexByte(content, 0); index > 0 { 530 return fmt.Errorf("binary data at byte offset %d", index) 531 } 532 533 // PERF: we only need to do the trigram check if the upperbound on content is greater than 534 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 535 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 536 return nil 537 } 538 539 var cur [3]rune 540 byteCount := 0 541 t.clearTrigrams(maxTrigramCount) 542 543 for len(content) > 0 { 544 r, sz := utf8.DecodeRune(content) 545 content = content[sz:] 546 byteCount += sz 547 548 cur[0], cur[1], cur[2] = cur[1], cur[2], r 549 if cur[0] == 0 { 550 // start of file. 551 continue 552 } 553 554 t.trigrams[runesToNGram(cur)] = struct{}{} 555 if len(t.trigrams) > maxTrigramCount { 556 // probably not text. 557 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 558 } 559 } 560 return nil 561} 562 563func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 564 if t.trigrams == nil { 565 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 566 } 567 for key := range t.trigrams { 568 delete(t.trigrams, key) 569 } 570} 571 572// ShardName returns the name of the shard for the given prefix, version, and 573// shard number. 574func ShardName(indexDir string, prefix string, version, n int) string { 575 prefix = url.QueryEscape(prefix) 576 if len(prefix) > 200 { 577 prefix = prefix[:200] + hashString(prefix)[:8] 578 } 579 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 580}