fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "text/template" 29 "time" 30 "unicode/utf8" 31 32 "slices" 33 34 "github.com/sourcegraph/zoekt" 35 "github.com/sourcegraph/zoekt/internal/languages" 36) 37 38var _ = log.Println 39 40const ngramSize = 3 41 42type searchableString struct { 43 data []byte 44} 45 46// Filled by the linker 47var Version string 48 49func HostnameBestEffort() string { 50 if h := os.Getenv("NODE_NAME"); h != "" { 51 return h 52 } 53 if h := os.Getenv("HOSTNAME"); h != "" { 54 return h 55 } 56 hostname, _ := os.Hostname() 57 return hostname 58} 59 60// Store character (unicode codepoint) offset (in bytes) this often. 61const runeOffsetFrequency = 100 62 63type postingsBuilder struct { 64 postings map[ngram][]byte 65 lastOffsets map[ngram]uint32 66 67 // To support UTF-8 searching, we must map back runes to byte 68 // offsets. As a first attempt, we sample regularly. The 69 // precise offset can be found by walking from the recorded 70 // offset to the desired rune. 71 runeOffsets []uint32 72 runeCount uint32 73 74 isPlainASCII bool 75 76 endRunes []uint32 77 endByte uint32 78} 79 80func newPostingsBuilder() *postingsBuilder { 81 return &postingsBuilder{ 82 postings: map[ngram][]byte{}, 83 lastOffsets: map[ngram]uint32{}, 84 isPlainASCII: true, 85 } 86} 87 88// Store trigram offsets for the given UTF-8 data. The 89// DocumentSections must correspond to rune boundaries in the UTF-8 90// data. 91func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 92 dest := searchableString{ 93 data: data, 94 } 95 var buf [8]byte 96 var runeGram [3]rune 97 98 var runeIndex uint32 99 byteCount := 0 100 dataSz := uint32(len(data)) 101 102 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 103 for _, s := range byteSections { 104 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 105 } 106 var runeSectionBoundaries []uint32 107 108 endRune := s.runeCount 109 for ; len(data) > 0; runeIndex++ { 110 c, sz := utf8.DecodeRune(data) 111 if sz > 1 { 112 s.isPlainASCII = false 113 } 114 data = data[sz:] 115 116 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 117 118 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 119 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 120 } 121 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 122 runeSectionBoundaries = append(runeSectionBoundaries, 123 endRune+uint32(runeIndex)) 124 byteSectionBoundaries = byteSectionBoundaries[1:] 125 } 126 127 byteCount += sz 128 129 if runeIndex < 2 { 130 continue 131 } 132 133 ng := runesToNGram(runeGram) 134 lastOff := s.lastOffsets[ng] 135 newOff := endRune + uint32(runeIndex) - 2 136 137 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 138 s.postings[ng] = append(s.postings[ng], buf[:m]...) 139 s.lastOffsets[ng] = newOff 140 } 141 s.runeCount += runeIndex 142 143 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 144 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 145 } 146 147 // Handle symbol definition that ends at file end. This can 148 // happen for labels at the end of .bat files. 149 150 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 151 runeSectionBoundaries = append(runeSectionBoundaries, 152 endRune+runeIndex) 153 byteSectionBoundaries = byteSectionBoundaries[1:] 154 } 155 runeSecs := make([]DocumentSection, 0, len(byteSections)) 156 for i := 0; i < len(runeSectionBoundaries); i += 2 { 157 runeSecs = append(runeSecs, DocumentSection{ 158 Start: runeSectionBoundaries[i], 159 End: runeSectionBoundaries[i+1], 160 }) 161 } 162 163 s.endRunes = append(s.endRunes, s.runeCount) 164 s.endByte += dataSz 165 return &dest, runeSecs, nil 166} 167 168// ShardBuilder builds a single index shard. 169type ShardBuilder struct { 170 // The version we will write to disk. Sourcegraph Specific. This is to 171 // enable feature flagging new format versions. 172 indexFormatVersion int 173 featureVersion int 174 175 contentStrings []*searchableString 176 nameStrings []*searchableString 177 docSections [][]DocumentSection 178 runeDocSections []DocumentSection 179 180 symID uint32 181 symIndex map[string]uint32 182 symKindID uint32 183 symKindIndex map[string]uint32 184 symMetaData []uint32 185 186 fileEndSymbol []uint32 187 188 checksums []byte 189 190 branchMasks []uint64 191 subRepos []uint32 192 193 // docID => repoID 194 repos []uint16 195 196 contentPostings *postingsBuilder 197 namePostings *postingsBuilder 198 199 // root repositories 200 repoList []zoekt.Repository 201 202 // name to index. 203 subRepoIndices []map[string]uint32 204 205 // language => language code 206 languageMap map[string]uint16 207 208 // language codes, uint16 encoded as little-endian 209 languages []uint8 210 211 categories []byte 212 213 // IndexTime will be used as the time if non-zero. Otherwise 214 // time.Now(). This is useful for doing reproducible builds in tests. 215 IndexTime time.Time 216 217 // a sortable 20 chars long id. 218 ID string 219} 220 221func verify(repo *zoekt.Repository) error { 222 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 223 if _, err := ParseTemplate(t); err != nil { 224 return err 225 } 226 } 227 return nil 228} 229 230func urlJoinPath(base string, elem ...string) string { 231 // golangs html/template always escapes "+" appearing in an HTML attribute 232 // [1]. We may even want to treat more characters, differently but this 233 // atleast makes it possible to visit URLs like [2]. 234 // 235 // We only do this to elem since base will normally be a hardcoded string. 236 // 237 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 238 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 239 elem = slices.Clone(elem) // copy to mutate 240 for i := range elem { 241 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 242 } 243 u, err := url.JoinPath(base, elem...) 244 if err != nil { 245 return "#!error: " + err.Error() 246 } 247 return u 248} 249 250// ParseTemplate will parse the templates for FileURLTemplate, 251// LineFragmentTemplate and CommitURLTemplate. 252// 253// It makes available the extra function UrlJoinPath. 254func ParseTemplate(text string) (*template.Template, error) { 255 return template.New("").Funcs(template.FuncMap{ 256 "URLJoinPath": urlJoinPath, 257 }).Parse(text) 258} 259 260// ContentSize returns the number of content bytes so far ingested. 261func (b *ShardBuilder) ContentSize() uint32 { 262 // Add the name too so we don't skip building index if we have 263 // lots of empty files. 264 return b.contentPostings.endByte + b.namePostings.endByte 265} 266 267// NumFiles returns the number of files added to this builder 268func (b *ShardBuilder) NumFiles() int { 269 return len(b.contentStrings) 270} 271 272// NewShardBuilder creates a fresh ShardBuilder. The passed in 273// Repository contains repo metadata, and may be set to nil. 274func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 275 b := newShardBuilder() 276 277 if r == nil { 278 r = &zoekt.Repository{} 279 } 280 if err := b.setRepository(r); err != nil { 281 return nil, err 282 } 283 return b, nil 284} 285 286func newShardBuilder() *ShardBuilder { 287 return &ShardBuilder{ 288 indexFormatVersion: IndexFormatVersion, 289 featureVersion: FeatureVersion, 290 291 contentPostings: newPostingsBuilder(), 292 namePostings: newPostingsBuilder(), 293 fileEndSymbol: []uint32{0}, 294 symIndex: make(map[string]uint32), 295 symKindIndex: make(map[string]uint32), 296 languageMap: make(map[string]uint16), 297 } 298} 299 300func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 301 if err := verify(desc); err != nil { 302 return err 303 } 304 305 if len(desc.Branches) > 64 { 306 return fmt.Errorf("too many branches") 307 } 308 309 repo := *desc 310 311 // copy subrepomap without root 312 repo.SubRepoMap = map[string]*zoekt.Repository{} 313 for k, v := range desc.SubRepoMap { 314 if k != "" { 315 repo.SubRepoMap[k] = v 316 } 317 } 318 319 b.repoList = append(b.repoList, repo) 320 321 return b.populateSubRepoIndices() 322} 323 324type symbolSlice struct { 325 symbols []DocumentSection 326 metaData []*zoekt.Symbol 327} 328 329func (s symbolSlice) Len() int { return len(s.symbols) } 330 331func (s symbolSlice) Swap(i, j int) { 332 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 333 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 334} 335 336func (s symbolSlice) Less(i, j int) bool { 337 return s.symbols[i].Start < s.symbols[j].Start 338} 339 340// AddFile is a convenience wrapper for Add 341func (b *ShardBuilder) AddFile(name string, content []byte) error { 342 return b.Add(Document{Name: name, Content: content}) 343} 344 345func (b *ShardBuilder) populateSubRepoIndices() error { 346 if len(b.subRepoIndices) == len(b.repoList) { 347 return nil 348 } 349 if len(b.subRepoIndices) != len(b.repoList)-1 { 350 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 351 } 352 repo := b.repoList[len(b.repoList)-1] 353 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 354 return nil 355} 356 357func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 358 paths := []string{""} 359 for k := range repo.SubRepoMap { 360 paths = append(paths, k) 361 } 362 sort.Strings(paths) 363 subRepoIndices := make(map[string]uint32, len(paths)) 364 for i, p := range paths { 365 subRepoIndices[p] = uint32(i) 366 } 367 return subRepoIndices 368} 369 370const notIndexedMarker = "NOT-INDEXED: " 371 372func (b *ShardBuilder) symbolID(sym string) uint32 { 373 if _, ok := b.symIndex[sym]; !ok { 374 b.symIndex[sym] = b.symID 375 b.symID++ 376 } 377 return b.symIndex[sym] 378} 379 380func (b *ShardBuilder) symbolKindID(t string) uint32 { 381 if _, ok := b.symKindIndex[t]; !ok { 382 b.symKindIndex[t] = b.symKindID 383 b.symKindID++ 384 } 385 return b.symKindIndex[t] 386} 387 388func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 389 for _, sym := range symbols { 390 b.symMetaData = append(b.symMetaData, 391 // This field was removed due to redundancy. To avoid 392 // needing to reindex, it is set to zero for now. In the 393 // future, this field will be completely removed. It 394 // will require incrementing the feature version. 395 0, 396 b.symbolKindID(sym.Kind), 397 b.symbolID(sym.Parent), 398 b.symbolKindID(sym.ParentKind)) 399 } 400} 401 402func DetermineLanguageIfUnknown(doc *Document) { 403 if doc.Language != "" { 404 return 405 } 406 407 if doc.SkipReason != SkipReasonNone { 408 // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 409 // In this case, we just guess the language based on file name to avoid examining the contents. 410 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). 411 doc.Language = languages.GetLanguage(doc.Name, nil) 412 } else { 413 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 414 } 415} 416 417// Add a file which only occurs in certain branches. 418func (b *ShardBuilder) Add(doc Document) error { 419 if index := bytes.IndexByte(doc.Content, 0); index > 0 { 420 doc.SkipReason = SkipReasonBinary 421 } 422 423 if doc.SkipReason != SkipReasonNone { 424 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 425 doc.Symbols = nil 426 doc.SymbolsMetaData = nil 427 } 428 429 DetermineLanguageIfUnknown(&doc) 430 DetermineFileCategory(&doc) 431 432 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 433 var last DocumentSection 434 for i, s := range doc.Symbols { 435 if i > 0 { 436 if last.End > s.Start { 437 return fmt.Errorf("sections overlap") 438 } 439 } 440 last = s 441 } 442 if last.End > uint32(len(doc.Content)) { 443 return fmt.Errorf("section goes past end of content") 444 } 445 446 if doc.SubRepositoryPath != "" { 447 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 448 if err != nil || rel == doc.Name { 449 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 450 } 451 } 452 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 453 if err != nil { 454 return err 455 } 456 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 457 if err != nil { 458 return err 459 } 460 b.addSymbols(doc.SymbolsMetaData) 461 462 repoIdx := len(b.repoList) - 1 463 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 464 if !ok { 465 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 466 } 467 468 var mask uint64 469 for _, br := range doc.Branches { 470 m := b.branchMask(br) 471 if m == 0 { 472 return fmt.Errorf("no branch found for %s", br) 473 } 474 mask |= m 475 } 476 477 if repoIdx > 1<<16 { 478 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 479 } 480 481 b.subRepos = append(b.subRepos, subRepoIdx) 482 b.repos = append(b.repos, uint16(repoIdx)) 483 484 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 485 hasher.Write(doc.Content) 486 487 b.contentStrings = append(b.contentStrings, docStr) 488 b.runeDocSections = append(b.runeDocSections, runeSecs...) 489 490 b.nameStrings = append(b.nameStrings, nameStr) 491 b.docSections = append(b.docSections, doc.Symbols) 492 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 493 b.branchMasks = append(b.branchMasks, mask) 494 b.checksums = append(b.checksums, hasher.Sum(nil)...) 495 496 langCode, ok := b.languageMap[doc.Language] 497 if !ok { 498 if len(b.languageMap) >= 65535 { 499 return fmt.Errorf("too many languages") 500 } 501 langCode = uint16(len(b.languageMap)) 502 b.languageMap[doc.Language] = langCode 503 } 504 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 505 506 category, err := doc.Category.encode() 507 if err != nil { 508 return err 509 } 510 b.categories = append(b.categories, category) 511 512 return nil 513} 514 515func (b *ShardBuilder) branchMask(br string) uint64 { 516 for i, b := range b.repoList[len(b.repoList)-1].Branches { 517 if b.Name == br { 518 return uint64(1) << uint(i) 519 } 520 } 521 return 0 522} 523 524// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 525// is missing or there are no repos, this returns false. 526func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 527 if len(b.repoList) == 0 { 528 return nil, false 529 } 530 531 ids := make([]uint32, 0, len(b.repoList)) 532 for _, repo := range b.repoList { 533 if repo.ID == 0 { 534 return nil, false 535 } 536 ids = append(ids, repo.ID) 537 } 538 return ids, true 539} 540 541type DocChecker struct { 542 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 543 trigrams map[ngram]struct{} 544} 545 546// Check returns a reason why the given contents are probably not source texts. 547func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 548 if len(content) == 0 { 549 return SkipReasonNone 550 } 551 552 if len(content) < ngramSize { 553 return SkipReasonTooSmall 554 } 555 556 if index := bytes.IndexByte(content, 0); index > 0 { 557 return SkipReasonBinary 558 } 559 560 // PERF: we only need to do the trigram check if the upperbound on content is greater than 561 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 562 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 563 return SkipReasonNone 564 } 565 566 var cur [3]rune 567 byteCount := 0 568 t.clearTrigrams(maxTrigramCount) 569 570 for len(content) > 0 { 571 r, sz := utf8.DecodeRune(content) 572 content = content[sz:] 573 byteCount += sz 574 575 cur[0], cur[1], cur[2] = cur[1], cur[2], r 576 if cur[0] == 0 { 577 // start of file. 578 continue 579 } 580 581 t.trigrams[runesToNGram(cur)] = struct{}{} 582 if len(t.trigrams) > maxTrigramCount { 583 // probably not text. 584 return SkipReasonTooManyTrigrams 585 } 586 } 587 return SkipReasonNone 588} 589 590func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 591 if t.trigrams == nil { 592 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 593 } 594 for key := range t.trigrams { 595 delete(t.trigrams, key) 596 } 597} 598 599// shardName returns the name of the shard for the given prefix, version, and 600// shard number. 601func shardName(indexDir string, prefix string, version, n int) string { 602 prefix = url.QueryEscape(prefix) 603 if len(prefix) > 200 { 604 prefix = prefix[:200] + hashString(prefix)[:8] 605 } 606 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 607}