fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "text/template" 29 "time" 30 "unicode/utf8" 31 32 "github.com/sourcegraph/zoekt" 33 "github.com/sourcegraph/zoekt/internal/languages" 34) 35 36var _ = log.Println 37 38const ngramSize = 3 39 40type searchableString struct { 41 data []byte 42} 43 44// Filled by the linker 45var Version string 46 47func HostnameBestEffort() string { 48 if h := os.Getenv("NODE_NAME"); h != "" { 49 return h 50 } 51 if h := os.Getenv("HOSTNAME"); h != "" { 52 return h 53 } 54 hostname, _ := os.Hostname() 55 return hostname 56} 57 58// Store character (unicode codepoint) offset (in bytes) this often. 59const runeOffsetFrequency = 100 60 61type postingsBuilder struct { 62 postings map[ngram][]byte 63 lastOffsets map[ngram]uint32 64 65 // To support UTF-8 searching, we must map back runes to byte 66 // offsets. As a first attempt, we sample regularly. The 67 // precise offset can be found by walking from the recorded 68 // offset to the desired rune. 69 runeOffsets []uint32 70 runeCount uint32 71 72 isPlainASCII bool 73 74 endRunes []uint32 75 endByte uint32 76} 77 78func newPostingsBuilder() *postingsBuilder { 79 return &postingsBuilder{ 80 postings: map[ngram][]byte{}, 81 lastOffsets: map[ngram]uint32{}, 82 isPlainASCII: true, 83 } 84} 85 86// Store trigram offsets for the given UTF-8 data. The 87// DocumentSections must correspond to rune boundaries in the UTF-8 88// data. 89func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 90 dest := searchableString{ 91 data: data, 92 } 93 var buf [8]byte 94 var runeGram [3]rune 95 96 var runeIndex uint32 97 byteCount := 0 98 dataSz := uint32(len(data)) 99 100 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 101 for _, s := range byteSections { 102 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 103 } 104 var runeSectionBoundaries []uint32 105 106 endRune := s.runeCount 107 for ; len(data) > 0; runeIndex++ { 108 c, sz := utf8.DecodeRune(data) 109 if sz > 1 { 110 s.isPlainASCII = false 111 } 112 data = data[sz:] 113 114 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 115 116 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 117 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 118 } 119 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 120 runeSectionBoundaries = append(runeSectionBoundaries, 121 endRune+uint32(runeIndex)) 122 byteSectionBoundaries = byteSectionBoundaries[1:] 123 } 124 125 byteCount += sz 126 127 if runeIndex < 2 { 128 continue 129 } 130 131 ng := runesToNGram(runeGram) 132 lastOff := s.lastOffsets[ng] 133 newOff := endRune + uint32(runeIndex) - 2 134 135 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 136 s.postings[ng] = append(s.postings[ng], buf[:m]...) 137 s.lastOffsets[ng] = newOff 138 } 139 s.runeCount += runeIndex 140 141 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 142 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 143 } 144 145 // Handle symbol definition that ends at file end. This can 146 // happen for labels at the end of .bat files. 147 148 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 149 runeSectionBoundaries = append(runeSectionBoundaries, 150 endRune+runeIndex) 151 byteSectionBoundaries = byteSectionBoundaries[1:] 152 } 153 runeSecs := make([]DocumentSection, 0, len(byteSections)) 154 for i := 0; i < len(runeSectionBoundaries); i += 2 { 155 runeSecs = append(runeSecs, DocumentSection{ 156 Start: runeSectionBoundaries[i], 157 End: runeSectionBoundaries[i+1], 158 }) 159 } 160 161 s.endRunes = append(s.endRunes, s.runeCount) 162 s.endByte += dataSz 163 return &dest, runeSecs, nil 164} 165 166// ShardBuilder builds a single index shard. 167type ShardBuilder struct { 168 // The version we will write to disk. Sourcegraph Specific. This is to 169 // enable feature flagging new format versions. 170 indexFormatVersion int 171 featureVersion int 172 173 contentStrings []*searchableString 174 nameStrings []*searchableString 175 docSections [][]DocumentSection 176 runeDocSections []DocumentSection 177 178 symID uint32 179 symIndex map[string]uint32 180 symKindID uint32 181 symKindIndex map[string]uint32 182 symMetaData []uint32 183 184 fileEndSymbol []uint32 185 186 checksums []byte 187 188 branchMasks []uint64 189 subRepos []uint32 190 191 // docID => repoID 192 repos []uint16 193 194 contentPostings *postingsBuilder 195 namePostings *postingsBuilder 196 197 // root repositories 198 repoList []zoekt.Repository 199 200 // name to index. 201 subRepoIndices []map[string]uint32 202 203 // language => language code 204 languageMap map[string]uint16 205 206 // language codes, uint16 encoded as little-endian 207 languages []uint8 208 209 categories []byte 210 211 // IndexTime will be used as the time if non-zero. Otherwise 212 // time.Now(). This is useful for doing reproducible builds in tests. 213 IndexTime time.Time 214 215 // a sortable 20 chars long id. 216 ID string 217} 218 219func verify(repo *zoekt.Repository) error { 220 for _, t := range []string{repo.FileURLTemplate, repo.LineFragmentTemplate, repo.CommitURLTemplate} { 221 if _, err := ParseTemplate(t); err != nil { 222 return err 223 } 224 } 225 return nil 226} 227 228func urlJoinPath(base string, elem ...string) string { 229 // golangs html/template always escapes "+" appearing in an HTML attribute 230 // [1]. We may even want to treat more characters, differently but this 231 // atleast makes it possible to visit URLs like [2]. 232 // 233 // We only do this to elem since base will normally be a hardcoded string. 234 // 235 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 236 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 237 elem = append([]string{}, elem...) // copy to mutate 238 for i := range elem { 239 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 240 } 241 u, err := url.JoinPath(base, elem...) 242 if err != nil { 243 return "#!error: " + err.Error() 244 } 245 return u 246} 247 248// ParseTemplate will parse the templates for FileURLTemplate, 249// LineFragmentTemplate and CommitURLTemplate. 250// 251// It makes available the extra function UrlJoinPath. 252func ParseTemplate(text string) (*template.Template, error) { 253 return template.New("").Funcs(template.FuncMap{ 254 "URLJoinPath": urlJoinPath, 255 }).Parse(text) 256} 257 258// ContentSize returns the number of content bytes so far ingested. 259func (b *ShardBuilder) ContentSize() uint32 { 260 // Add the name too so we don't skip building index if we have 261 // lots of empty files. 262 return b.contentPostings.endByte + b.namePostings.endByte 263} 264 265// NumFiles returns the number of files added to this builder 266func (b *ShardBuilder) NumFiles() int { 267 return len(b.contentStrings) 268} 269 270// NewShardBuilder creates a fresh ShardBuilder. The passed in 271// Repository contains repo metadata, and may be set to nil. 272func NewShardBuilder(r *zoekt.Repository) (*ShardBuilder, error) { 273 b := newShardBuilder() 274 275 if r == nil { 276 r = &zoekt.Repository{} 277 } 278 if err := b.setRepository(r); err != nil { 279 return nil, err 280 } 281 return b, nil 282} 283 284func newShardBuilder() *ShardBuilder { 285 return &ShardBuilder{ 286 indexFormatVersion: IndexFormatVersion, 287 featureVersion: FeatureVersion, 288 289 contentPostings: newPostingsBuilder(), 290 namePostings: newPostingsBuilder(), 291 fileEndSymbol: []uint32{0}, 292 symIndex: make(map[string]uint32), 293 symKindIndex: make(map[string]uint32), 294 languageMap: make(map[string]uint16), 295 } 296} 297 298func (b *ShardBuilder) setRepository(desc *zoekt.Repository) error { 299 if err := verify(desc); err != nil { 300 return err 301 } 302 303 if len(desc.Branches) > 64 { 304 return fmt.Errorf("too many branches") 305 } 306 307 repo := *desc 308 309 // copy subrepomap without root 310 repo.SubRepoMap = map[string]*zoekt.Repository{} 311 for k, v := range desc.SubRepoMap { 312 if k != "" { 313 repo.SubRepoMap[k] = v 314 } 315 } 316 317 b.repoList = append(b.repoList, repo) 318 319 return b.populateSubRepoIndices() 320} 321 322type symbolSlice struct { 323 symbols []DocumentSection 324 metaData []*zoekt.Symbol 325} 326 327func (s symbolSlice) Len() int { return len(s.symbols) } 328 329func (s symbolSlice) Swap(i, j int) { 330 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 331 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 332} 333 334func (s symbolSlice) Less(i, j int) bool { 335 return s.symbols[i].Start < s.symbols[j].Start 336} 337 338// AddFile is a convenience wrapper for Add 339func (b *ShardBuilder) AddFile(name string, content []byte) error { 340 return b.Add(Document{Name: name, Content: content}) 341} 342 343func (b *ShardBuilder) populateSubRepoIndices() error { 344 if len(b.subRepoIndices) == len(b.repoList) { 345 return nil 346 } 347 if len(b.subRepoIndices) != len(b.repoList)-1 { 348 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 349 } 350 repo := b.repoList[len(b.repoList)-1] 351 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 352 return nil 353} 354 355func mkSubRepoIndices(repo zoekt.Repository) map[string]uint32 { 356 paths := []string{""} 357 for k := range repo.SubRepoMap { 358 paths = append(paths, k) 359 } 360 sort.Strings(paths) 361 subRepoIndices := make(map[string]uint32, len(paths)) 362 for i, p := range paths { 363 subRepoIndices[p] = uint32(i) 364 } 365 return subRepoIndices 366} 367 368const notIndexedMarker = "NOT-INDEXED: " 369 370func (b *ShardBuilder) symbolID(sym string) uint32 { 371 if _, ok := b.symIndex[sym]; !ok { 372 b.symIndex[sym] = b.symID 373 b.symID++ 374 } 375 return b.symIndex[sym] 376} 377 378func (b *ShardBuilder) symbolKindID(t string) uint32 { 379 if _, ok := b.symKindIndex[t]; !ok { 380 b.symKindIndex[t] = b.symKindID 381 b.symKindID++ 382 } 383 return b.symKindIndex[t] 384} 385 386func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) { 387 for _, sym := range symbols { 388 b.symMetaData = append(b.symMetaData, 389 // This field was removed due to redundancy. To avoid 390 // needing to reindex, it is set to zero for now. In the 391 // future, this field will be completely removed. It 392 // will require incrementing the feature version. 393 0, 394 b.symbolKindID(sym.Kind), 395 b.symbolID(sym.Parent), 396 b.symbolKindID(sym.ParentKind)) 397 } 398} 399 400func DetermineLanguageIfUnknown(doc *Document) { 401 if doc.Language != "" { 402 return 403 } 404 405 if doc.SkipReason != SkipReasonNone { 406 // If this document has been skipped, it's likely very large, or it's a non-code file like binary. 407 // In this case, we just guess the language based on file name to avoid examining the contents. 408 // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). 409 doc.Language = languages.GetLanguage(doc.Name, nil) 410 } else { 411 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 412 } 413} 414 415// Add a file which only occurs in certain branches. 416func (b *ShardBuilder) Add(doc Document) error { 417 if index := bytes.IndexByte(doc.Content, 0); index > 0 { 418 doc.SkipReason = SkipReasonBinary 419 } 420 421 if doc.SkipReason != SkipReasonNone { 422 doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) 423 doc.Symbols = nil 424 doc.SymbolsMetaData = nil 425 } 426 427 DetermineLanguageIfUnknown(&doc) 428 DetermineFileCategory(&doc) 429 430 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 431 var last DocumentSection 432 for i, s := range doc.Symbols { 433 if i > 0 { 434 if last.End > s.Start { 435 return fmt.Errorf("sections overlap") 436 } 437 } 438 last = s 439 } 440 if last.End > uint32(len(doc.Content)) { 441 return fmt.Errorf("section goes past end of content") 442 } 443 444 if doc.SubRepositoryPath != "" { 445 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 446 if err != nil || rel == doc.Name { 447 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 448 } 449 } 450 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 451 if err != nil { 452 return err 453 } 454 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 455 if err != nil { 456 return err 457 } 458 b.addSymbols(doc.SymbolsMetaData) 459 460 repoIdx := len(b.repoList) - 1 461 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 462 if !ok { 463 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 464 } 465 466 var mask uint64 467 for _, br := range doc.Branches { 468 m := b.branchMask(br) 469 if m == 0 { 470 return fmt.Errorf("no branch found for %s", br) 471 } 472 mask |= m 473 } 474 475 if repoIdx > 1<<16 { 476 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 477 } 478 479 b.subRepos = append(b.subRepos, subRepoIdx) 480 b.repos = append(b.repos, uint16(repoIdx)) 481 482 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 483 hasher.Write(doc.Content) 484 485 b.contentStrings = append(b.contentStrings, docStr) 486 b.runeDocSections = append(b.runeDocSections, runeSecs...) 487 488 b.nameStrings = append(b.nameStrings, nameStr) 489 b.docSections = append(b.docSections, doc.Symbols) 490 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 491 b.branchMasks = append(b.branchMasks, mask) 492 b.checksums = append(b.checksums, hasher.Sum(nil)...) 493 494 langCode, ok := b.languageMap[doc.Language] 495 if !ok { 496 if len(b.languageMap) >= 65535 { 497 return fmt.Errorf("too many languages") 498 } 499 langCode = uint16(len(b.languageMap)) 500 b.languageMap[doc.Language] = langCode 501 } 502 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 503 504 category, err := doc.Category.encode() 505 if err != nil { 506 return err 507 } 508 b.categories = append(b.categories, category) 509 510 return nil 511} 512 513func (b *ShardBuilder) branchMask(br string) uint64 { 514 for i, b := range b.repoList[len(b.repoList)-1].Branches { 515 if b.Name == br { 516 return uint64(1) << uint(i) 517 } 518 } 519 return 0 520} 521 522// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID 523// is missing or there are no repos, this returns false. 524func (b *ShardBuilder) repoIDs() ([]uint32, bool) { 525 if len(b.repoList) == 0 { 526 return nil, false 527 } 528 529 ids := make([]uint32, 0, len(b.repoList)) 530 for _, repo := range b.repoList { 531 if repo.ID == 0 { 532 return nil, false 533 } 534 ids = append(ids, repo.ID) 535 } 536 return ids, true 537} 538 539type DocChecker struct { 540 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 541 trigrams map[ngram]struct{} 542} 543 544// Check returns a reason why the given contents are probably not source texts. 545func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { 546 if len(content) == 0 { 547 return SkipReasonNone 548 } 549 550 if len(content) < ngramSize { 551 return SkipReasonTooSmall 552 } 553 554 if index := bytes.IndexByte(content, 0); index > 0 { 555 return SkipReasonBinary 556 } 557 558 // PERF: we only need to do the trigram check if the upperbound on content is greater than 559 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 560 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 561 return SkipReasonNone 562 } 563 564 var cur [3]rune 565 byteCount := 0 566 t.clearTrigrams(maxTrigramCount) 567 568 for len(content) > 0 { 569 r, sz := utf8.DecodeRune(content) 570 content = content[sz:] 571 byteCount += sz 572 573 cur[0], cur[1], cur[2] = cur[1], cur[2], r 574 if cur[0] == 0 { 575 // start of file. 576 continue 577 } 578 579 t.trigrams[runesToNGram(cur)] = struct{}{} 580 if len(t.trigrams) > maxTrigramCount { 581 // probably not text. 582 return SkipReasonTooManyTrigrams 583 } 584 } 585 return SkipReasonNone 586} 587 588func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 589 if t.trigrams == nil { 590 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 591 } 592 for key := range t.trigrams { 593 delete(t.trigrams, key) 594 } 595} 596 597// ShardName returns the name of the shard for the given prefix, version, and 598// shard number. 599func ShardName(indexDir string, prefix string, version, n int) string { 600 prefix = url.QueryEscape(prefix) 601 if len(prefix) > 200 { 602 prefix = prefix[:200] + hashString(prefix)[:8] 603 } 604 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 605}