fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "slices" 27 "sort" 28 "strings" 29 "text/template" 30 "time" 31 "unicode/utf8" 32 33 "github.com/sourcegraph/zoekt" 34 "github.com/sourcegraph/zoekt/languages" 35) 36 37var _ = log.Println 38 39const ngramSize = 3 40 41type searchableString struct { 42 data []byte 43} 44 45// Filled by the linker 46var Version string 47 48func HostnameBestEffort() string { 49 if h := os.Getenv("NODE_NAME"); h != "" { 50 return h 51 } 52 if h := os.Getenv("HOSTNAME"); h != "" { 53 return h 54 } 55 hostname, _ := os.Hostname() 56 return hostname 57} 58 59// Store character (unicode codepoint) offset (in bytes) this often. 60const runeOffsetFrequency = 100 61 62type postingsBuilder struct { 63 postings map[ngram][]byte 64 lastOffsets map[ngram]uint32 65 66 // To support UTF-8 searching, we must map back runes to byte 67 // offsets. As a first attempt, we sample regularly. The 68 // precise offset can be found by walking from the recorded 69 // offset to the desired rune. 70 runeOffsets []uint32 71 runeCount uint32 72 73 isPlainASCII bool 74 75 endRunes []uint32 76 endByte uint32 77} 78 79func newPostingsBuilder() *postingsBuilder { 80 return &postingsBuilder{ 81 postings: map[ngram][]byte{}, 82 lastOffsets: map[ngram]uint32{}, 83 isPlainASCII: true, 84 } 85} 86 87// Store trigram offsets for the given UTF-8 data. The 88// DocumentSections must correspond to rune boundaries in the UTF-8 89// data. 90func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 91 dest := searchableString{ 92 data: data, 93 } 94 var buf [8]byte 95 var runeGram [3]rune 96 97 var runeIndex uint32 98 byteCount := 0 99 dataSz := uint32(len(data)) 100 101 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 102 for _, s := range byteSections { 103 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 104 } 105 var runeSectionBoundaries []uint32 106 107 endRune := s.runeCount 108 for ; len(data) > 0; runeIndex++ { 109 c, sz := utf8.DecodeRune(data) 110 if sz > 1 { 111 s.isPlainASCII = false 112 } 113 data = data[sz:] 114 115 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 116 117 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 118 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 119 } 120 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 121 runeSectionBoundaries = append(runeSectionBoundaries, 122 endRune+uint32(runeIndex)) 123 byteSectionBoundaries = byteSectionBoundaries[1:] 124 } 125 126 byteCount += sz 127 128 if runeIndex < 2 { 129 continue 130 } 131 132 ng := runesToNGram(runeGram) 133 lastOff := s.lastOffsets[ng] 134 newOff := endRune + uint32(runeIndex) - 2 135 136 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 137 s.postings[ng] = append(s.postings[ng], buf[:m]...) 138 s.lastOffsets[ng] = newOff 139 } 140 s.runeCount += runeIndex 141 142 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 143 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 144 } 145 146 // Handle symbol definition that ends at file end. This can 147 // happen for labels at the end of .bat files. 148 149 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 150 runeSectionBoundaries = append(runeSectionBoundaries, 151 endRune+runeIndex) 152 byteSectionBoundaries = byteSectionBoundaries[1:] 153 } 154 runeSecs := make([]DocumentSection, 0, len(byteSections)) 155 for i := 0; i < len(runeSectionBoundaries); i += 2 { 156 runeSecs = append(runeSecs, DocumentSection{ 157 Start: runeSectionBoundaries[i], 158 End: runeSectionBoundaries[i+1], 159 }) 160 } 161 162 s.endRunes = append(s.endRunes, s.runeCount) 163 s.endByte += dataSz 164 return &dest, runeSecs, nil 165} 166 167// ShardBuilder builds a single index shard. 168type ShardBuilder struct { 169 // The version we will write to disk. Sourcegraph Specific. This is to 170 // enable feature flagging new format versions. 171 indexFormatVersion int 172 featureVersion int 173 174 contentStrings []*searchableString 175 nameStrings []*searchableString 176 docSections [][]DocumentSection 177 runeDocSections []DocumentSection 178 179 symID uint32 180 symIndex map[string]uint32 181 symKindID uint32 182 symKindIndex map[string]uint32 183 symMetaData []uint32 184 185 fileEndSymbol []uint32 186 187 checksums []byte 188 189 branchMasks []uint64 190 subRepos []uint32 191 192 // docID => repoID 193 repos []uint16 194 195 contentPostings *postingsBuilder 196 namePostings *postingsBuilder 197 198 // root repositories 199 repoList []zoekt.Repository 200 201 // name to index. 202 subRepoIndices []map[string]uint32 203 204 // language => language code 205 languageMap map[string]uint16 206 207 // language codes, uint16 encoded as little-endian 208 languages []uint8 209 210 categories []byte 211 212 // IndexTime will be used as the time if non-zero. Otherwise 213 // time.Now(). This is useful for doing reproducible builds in tests. 214 IndexTime time.Time 215 216 // a sortable 20 chars long id. 217 ID string 218} 219 220func verify(repo *zoekt.Repository) error { 221 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 222 if _, err := ParseTemplate(t); err != nil { 223 return err 224 } 225 } 226 return nil 227} 228 229func urlJoinPath(base string, elem ...string) string { 230 // golangs html/template always escapes "+" appearing in an HTML attribute 231 // [1]. We may even want to treat more characters, differently but this 232 // atleast makes it possible to visit URLs like [2]. 233 // 234 // We only do this to elem since base will normally be a hardcoded string. 235 // 236 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 237 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 238 elem = slices.Clone(elem) // copy to mutate 239 for i := range elem { 240 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 241 } 242 u, err := url.JoinPath(base, elem...) 243 if err != nil { 244 return "#!error: " + err.Error() 245 } 246 return u 247} 248 249// ParseTemplate will parse the templates for FileURLTemplate, 250// LineFragmentTemplate and CommitURLTemplate. 251// 252// It makes available the extra function UrlJoinPath. 253func ParseTemplate(text string) (*template.Template, error) { 254 return template.New("").Funcs(template.FuncMap{ 255 "URLJoinPath": urlJoinPath, 256 }).Parse(text) 257} 258 259// ContentSize returns the number of content bytes so far ingested. 260func (b *ShardBuilder) ContentSize() uint32 { 261 // Add the name too so we don't skip building index if we have 262 // lots of empty files. 263 return b.contentPostings.endByte + b.namePostings.endByte 264} 265 266// NumFiles returns the number of files added to this builder 267func (b *ShardBuilder) NumFiles() int { 268 return len(b.contentStrings) 269} 270 271// NewShardBuilder creates a fresh ShardBuilder. The passed in 272// Repository contains repo metadata, and may be set to nil. 273func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 274 b := newShardBuilder() 275 276 if r == nil { 277 r = &zoekt.Repository{} 278 } 279 if err := b.setRepository(r); err != nil { 280 return nil, err 281 } 282 return b, nil 283} 284 285func newShardBuilder() *ShardBuilder { 286 return &ShardBuilder{ 287 indexFormatVersion: IndexFormatVersion, 288 featureVersion: FeatureVersion, 289 290 contentPostings: newPostingsBuilder(), 291 namePostings: newPostingsBuilder(), 292 fileEndSymbol: []uint32{0}, 293 symIndex: make(map[string]uint32), 294 symKindIndex: make(map[string]uint32), 295 languageMap: make(map[string]uint16), 296 } 297} 298 299func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 300 if err := verify(desc); err != nil { 301 return err 302 } 303 304 if len(desc.Branches) > 64 { 305 return fmt.Errorf("too many branches") 306 } 307 308 repo := *desc 309 310 // copy subrepomap without root 311 repo.SubRepoMap = map[string]*zoekt.Repository{} 312 for k, v := range desc.SubRepoMap { 313 if k != "" { 314 repo.SubRepoMap[k] = v 315 } 316 } 317 318 b.repoList = append(b.repoList, repo) 319 320 return b.populateSubRepoIndices() 321} 322 323type symbolSlice struct { 324 symbols []DocumentSection 325 metaData []*zoekt.Symbol 326} 327 328func (s symbolSlice) Len() int { return len(s.symbols) } 329 330func (s symbolSlice) Swap(i, j int) { 331 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 332 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 333} 334 335func (s symbolSlice) Less(i, j int) bool { 336 return s.symbols[i].Start < s.symbols[j].Start 337} 338 339// AddFile is a convenience wrapper for Add 340func (b *ShardBuilder) AddFile(name string, content []byte) error { 341 return b.Add(Document{Name: name, Content: content}) 342} 343 344func (b *ShardBuilder) populateSubRepoIndices() error { 345 if len(b.subRepoIndices) == len(b.repoList) { 346 return nil 347 } 348 if len(b.subRepoIndices) != len(b.repoList)-1 { 349 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 350 } 351 repo := b.repoList[len(b.repoList)-1] 352 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 353 return nil 354} 355 356func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 357 paths := []string{""} 358 for k := range repo.SubRepoMap { 359 paths = append(paths, k) 360 } 361 sort.Strings(paths) 362 subRepoIndices := make(map[string]uint32, len(paths)) 363 for i, p := range paths { 364 subRepoIndices[p] = uint32(i) 365 } 366 return subRepoIndices 367} 368 369const notIndexedMarker = "NOT-INDEXED: " 370 371func (b *ShardBuilder) symbolID(sym string) uint32 { 372 if _, ok := b.symIndex[sym]; !ok { 373 b.symIndex[sym] = b.symID 374 b.symID++ 375 } 376 return b.symIndex[sym] 377} 378 379func (b *ShardBuilder) symbolKindID(t string) uint32 { 380 if _, ok := b.symKindIndex[t]; !ok { 381 b.symKindIndex[t] = b.symKindID 382 b.symKindID++ 383 } 384 return b.symKindIndex[t] 385} 386 387func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 388 for _, sym := range symbols { 389 b.symMetaData = append(b.symMetaData, 390 // This field was removed due to redundancy. To avoid 391 // needing to reindex, it is set to zero for now. In the 392 // future, this field will be completely removed. It 393 // will require incrementing the feature version. 394 0, 395 b.symbolKindID(sym.Kind), 396 b.symbolID(sym.Parent), 397 b.symbolKindID(sym.ParentKind)) 398 } 399} 400 401func DetermineLanguageIfUnknown(doc *Document) { 402 if doc.Language != "" { 403 return 404 } 405 406 // If this document has been skipped (doc.SkipReason != SkipReasonNone), it's 407 // likely very large, or it's a non-code file like binary. In this case, we just 408 // guess the language based on the file name to avoid examining the contents. 409 // Note: passing nil content is allowed by the go-enry contract (the underlying 410 // library we use here). 411 var content []byte 412 if doc.SkipReason == SkipReasonNone { 413 content = doc.Content 414 } 415 langs := languages.GetLanguagesFromContent(doc.Name, content) 416 if len(langs) > 0 { 417 doc.Language = langs[0] 418 } 419} 420 421// Add a file which only occurs in certain branches. 422func (b *ShardBuilder) Add(doc Document) error { 423 if index := bytes.IndexByte(doc.Content, 0); index > 0 { 424 doc.SkipReason = SkipReasonBinary 425 } 426 427 if doc.SkipReason != SkipReasonNone { 428 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 429 doc.Symbols = nil 430 doc.SymbolsMetaData = nil 431 } 432 433 DetermineLanguageIfUnknown(&doc) 434 DetermineFileCategory(&doc) 435 436 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 437 var last DocumentSection 438 for i, s := range doc.Symbols { 439 if i > 0 { 440 if last.End > s.Start { 441 return fmt.Errorf("sections overlap") 442 } 443 } 444 last = s 445 } 446 if last.End > uint32(len(doc.Content)) { 447 return fmt.Errorf("section goes past end of content") 448 } 449 450 if doc.SubRepositoryPath != "" { 451 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 452 if err != nil || rel == doc.Name { 453 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 454 } 455 } 456 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 457 if err != nil { 458 return err 459 } 460 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 461 if err != nil { 462 return err 463 } 464 b.addSymbols(doc.SymbolsMetaData) 465 466 repoIdx := len(b.repoList) - 1 467 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 468 if !ok { 469 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 470 } 471 472 var mask uint64 473 for _, br := range doc.Branches { 474 m := b.branchMask(br) 475 if m == 0 { 476 return fmt.Errorf("no branch found for %s", br) 477 } 478 mask |= m 479 } 480 481 if repoIdx > 1<<16 { 482 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 483 } 484 485 b.subRepos = append(b.subRepos, subRepoIdx) 486 b.repos = append(b.repos, uint16(repoIdx)) 487 488 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 489 hasher.Write(doc.Content) 490 491 b.contentStrings = append(b.contentStrings, docStr) 492 b.runeDocSections = append(b.runeDocSections, runeSecs...) 493 494 b.nameStrings = append(b.nameStrings, nameStr) 495 b.docSections = append(b.docSections, doc.Symbols) 496 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 497 b.branchMasks = append(b.branchMasks, mask) 498 b.checksums = append(b.checksums, hasher.Sum(nil)...) 499 500 langCode, ok := b.languageMap[doc.Language] 501 if !ok { 502 if len(b.languageMap) >= 65535 { 503 return fmt.Errorf("too many languages") 504 } 505 langCode = uint16(len(b.languageMap)) 506 b.languageMap[doc.Language] = langCode 507 } 508 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 509 510 category, err := doc.Category.encode() 511 if err != nil { 512 return err 513 } 514 b.categories = append(b.categories, category) 515 516 return nil 517} 518 519func (b *ShardBuilder) branchMask(br string) uint64 { 520 for i, b := range b.repoList[len(b.repoList)-1].Branches { 521 if b.Name == br { 522 return uint64(1) << uint(i) 523 } 524 } 525 return 0 526} 527 528// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 529// is missing or there are no repos, this returns false. 530func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 531 if len(b.repoList) == 0 { 532 return nil, false 533 } 534 535 ids := make([]uint32, 0, len(b.repoList)) 536 for _, repo := range b.repoList { 537 if repo.ID == 0 { 538 return nil, false 539 } 540 ids = append(ids, repo.ID) 541 } 542 return ids, true 543} 544 545type DocChecker struct { 546 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 547 trigrams map[ngram]struct{} 548} 549 550// Check returns a reason why the given contents are probably not source texts. 551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 552 if len(content) == 0 { 553 return SkipReasonNone 554 } 555 556 if len(content) < ngramSize { 557 return SkipReasonTooSmall 558 } 559 560 if index := bytes.IndexByte(content, 0); index > 0 { 561 return SkipReasonBinary 562 } 563 564 // PERF: we only need to do the trigram check if the upperbound on content is greater than 565 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 566 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 567 return SkipReasonNone 568 } 569 570 var cur [3]rune 571 byteCount := 0 572 t.clearTrigrams(maxTrigramCount) 573 574 for len(content) > 0 { 575 r, sz := utf8.DecodeRune(content) 576 content = content[sz:] 577 byteCount += sz 578 579 cur[0], cur[1], cur[2] = cur[1], cur[2], r 580 if cur[0] == 0 { 581 // start of file. 582 continue 583 } 584 585 t.trigrams[runesToNGram(cur)] = struct{}{} 586 if len(t.trigrams) > maxTrigramCount { 587 // probably not text. 588 return SkipReasonTooManyTrigrams 589 } 590 } 591 return SkipReasonNone 592} 593 594func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 595 if t.trigrams == nil { 596 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 597 } 598 for key := range t.trigrams { 599 delete(t.trigrams, key) 600 } 601} 602 603// shardName returns the name of the shard for the given prefix, version, and 604// shard number. 605func shardName(indexDir string, prefix string, version, n int) string { 606 prefix = url.QueryEscape(prefix) 607 if len(prefix) > 200 { 608 prefix = prefix[:200] + hashString(prefix)[:8] 609 } 610 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 611}