fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "text/template" 29 "time" 30 "unicode/utf8" 31 32 "github.com/sourcegraph/zoekt/internal/languages" 33) 34 35var _ = log.Println 36 37const ngramSize = 3 38 39type searchableString struct { 40 data []byte 41} 42 43// Filled by the linker 44var Version string 45 46func HostnameBestEffort() string { 47 if h := os.Getenv("NODE_NAME"); h != "" { 48 return h 49 } 50 if h := os.Getenv("HOSTNAME"); h != "" { 51 return h 52 } 53 hostname, _ := os.Hostname() 54 return hostname 55} 56 57// Store character (unicode codepoint) offset (in bytes) this often. 58const runeOffsetFrequency = 100 59 60type postingsBuilder struct { 61 postings map[ngram][]byte 62 lastOffsets map[ngram]uint32 63 64 // To support UTF-8 searching, we must map back runes to byte 65 // offsets. As a first attempt, we sample regularly. The 66 // precise offset can be found by walking from the recorded 67 // offset to the desired rune. 68 runeOffsets []uint32 69 runeCount uint32 70 71 isPlainASCII bool 72 73 endRunes []uint32 74 endByte uint32 75} 76 77func newPostingsBuilder() *postingsBuilder { 78 return &postingsBuilder{ 79 postings: map[ngram][]byte{}, 80 lastOffsets: map[ngram]uint32{}, 81 isPlainASCII: true, 82 } 83} 84 85// Store trigram offsets for the given UTF-8 data. The 86// DocumentSections must correspond to rune boundaries in the UTF-8 87// data. 88func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 89 dest := searchableString{ 90 data: data, 91 } 92 var buf [8]byte 93 var runeGram [3]rune 94 95 var runeIndex uint32 96 byteCount := 0 97 dataSz := uint32(len(data)) 98 99 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 100 for _, s := range byteSections { 101 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 102 } 103 var runeSectionBoundaries []uint32 104 105 endRune := s.runeCount 106 for ; len(data) > 0; runeIndex++ { 107 c, sz := utf8.DecodeRune(data) 108 if sz > 1 { 109 s.isPlainASCII = false 110 } 111 data = data[sz:] 112 113 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 114 115 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 116 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 117 } 118 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 119 runeSectionBoundaries = append(runeSectionBoundaries, 120 endRune+uint32(runeIndex)) 121 byteSectionBoundaries = byteSectionBoundaries[1:] 122 } 123 124 byteCount += sz 125 126 if runeIndex < 2 { 127 continue 128 } 129 130 ng := runesToNGram(runeGram) 131 lastOff := s.lastOffsets[ng] 132 newOff := endRune + uint32(runeIndex) - 2 133 134 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 135 s.postings[ng] = append(s.postings[ng], buf[:m]...) 136 s.lastOffsets[ng] = newOff 137 } 138 s.runeCount += runeIndex 139 140 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 141 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 142 } 143 144 // Handle symbol definition that ends at file end. This can 145 // happen for labels at the end of .bat files. 146 147 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 148 runeSectionBoundaries = append(runeSectionBoundaries, 149 endRune+runeIndex) 150 byteSectionBoundaries = byteSectionBoundaries[1:] 151 } 152 runeSecs := make([]DocumentSection, 0, len(byteSections)) 153 for i := 0; i < len(runeSectionBoundaries); i += 2 { 154 runeSecs = append(runeSecs, DocumentSection{ 155 Start: runeSectionBoundaries[i], 156 End: runeSectionBoundaries[i+1], 157 }) 158 } 159 160 s.endRunes = append(s.endRunes, s.runeCount) 161 s.endByte += dataSz 162 return &dest, runeSecs, nil 163} 164 165// IndexBuilder builds a single index shard. 166type IndexBuilder struct { 167 // The version we will write to disk. Sourcegraph Specific. This is to 168 // enable feature flagging new format versions. 169 indexFormatVersion int 170 featureVersion int 171 172 contentStrings []*searchableString 173 nameStrings []*searchableString 174 docSections [][]DocumentSection 175 runeDocSections []DocumentSection 176 177 symID uint32 178 symIndex map[string]uint32 179 symKindID uint32 180 symKindIndex map[string]uint32 181 symMetaData []uint32 182 183 fileEndSymbol []uint32 184 185 checksums []byte 186 187 branchMasks []uint64 188 subRepos []uint32 189 190 // docID => repoID 191 repos []uint16 192 193 contentPostings *postingsBuilder 194 namePostings *postingsBuilder 195 196 // root repositories 197 repoList []Repository 198 199 // name to index. 200 subRepoIndices []map[string]uint32 201 202 // language => language code 203 languageMap map[string]uint16 204 205 // language codes, uint16 encoded as little-endian 206 languages []uint8 207 208 // IndexTime will be used as the time if non-zero. Otherwise 209 // time.Now(). This is useful for doing reproducible builds in tests. 210 IndexTime time.Time 211 212 // a sortable 20 chars long id. 213 ID string 214} 215 216func (d *Repository) verify() error { 217 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { 218 if _, err := ParseTemplate(t); err != nil { 219 return err 220 } 221 } 222 return nil 223} 224 225func urlJoinPath(base string, elem ...string) string { 226 // golangs html/template always escapes "+" appearing in an HTML attribute 227 // [1]. We may even want to treat more characters, differently but this 228 // atleast makes it possible to visit URLs like [2]. 229 // 230 // We only do this to elem since base will normally be a hardcoded string. 231 // 232 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 233 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 234 elem = append([]string{}, elem...) // copy to mutate 235 for i := range elem { 236 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 237 } 238 u, err := url.JoinPath(base, elem...) 239 if err != nil { 240 return "#!error: " + err.Error() 241 } 242 return u 243} 244 245// ParseTemplate will parse the templates for FileURLTemplate, 246// LineFragmentTemplate and CommitURLTemplate. 247// 248// It makes available the extra function UrlJoinPath. 249func ParseTemplate(text string) (*template.Template, error) { 250 return template.New("").Funcs(template.FuncMap{ 251 "URLJoinPath": urlJoinPath, 252 }).Parse(text) 253} 254 255// ContentSize returns the number of content bytes so far ingested. 256func (b *IndexBuilder) ContentSize() uint32 { 257 // Add the name too so we don't skip building index if we have 258 // lots of empty files. 259 return b.contentPostings.endByte + b.namePostings.endByte 260} 261 262// NumFiles returns the number of files added to this builder 263func (b *IndexBuilder) NumFiles() int { 264 return len(b.contentStrings) 265} 266 267// NewIndexBuilder creates a fresh IndexBuilder. The passed in 268// Repository contains repo metadata, and may be set to nil. 269func NewIndexBuilder(r *Repository) (*IndexBuilder, error) { 270 b := newIndexBuilder() 271 272 if r == nil { 273 r = &Repository{} 274 } 275 if err := b.setRepository(r); err != nil { 276 return nil, err 277 } 278 return b, nil 279} 280 281func newIndexBuilder() *IndexBuilder { 282 return &IndexBuilder{ 283 indexFormatVersion: IndexFormatVersion, 284 featureVersion: FeatureVersion, 285 286 contentPostings: newPostingsBuilder(), 287 namePostings: newPostingsBuilder(), 288 fileEndSymbol: []uint32{0}, 289 symIndex: make(map[string]uint32), 290 symKindIndex: make(map[string]uint32), 291 languageMap: make(map[string]uint16), 292 } 293} 294 295func (b *IndexBuilder) setRepository(desc *Repository) error { 296 if err := desc.verify(); err != nil { 297 return err 298 } 299 300 if len(desc.Branches) > 64 { 301 return fmt.Errorf("too many branches") 302 } 303 304 repo := *desc 305 306 // copy subrepomap without root 307 repo.SubRepoMap = map[string]*Repository{} 308 for k, v := range desc.SubRepoMap { 309 if k != "" { 310 repo.SubRepoMap[k] = v 311 } 312 } 313 314 b.repoList = append(b.repoList, repo) 315 316 return b.populateSubRepoIndices() 317} 318 319type DocumentSection struct { 320 Start, End uint32 321} 322 323// Document holds a document (file) to index. 324type Document struct { 325 Name string 326 Content []byte 327 Branches []string 328 SubRepositoryPath string 329 Language string 330 331 // If set, something is wrong with the file contents, and this 332 // is the reason it wasn't indexed. 333 SkipReason string 334 335 // Document sections for symbols. Offsets should use bytes. 336 Symbols []DocumentSection 337 SymbolsMetaData []*Symbol 338} 339 340type symbolSlice struct { 341 symbols []DocumentSection 342 metaData []*Symbol 343} 344 345func (s symbolSlice) Len() int { return len(s.symbols) } 346 347func (s symbolSlice) Swap(i, j int) { 348 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 349 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 350} 351 352func (s symbolSlice) Less(i, j int) bool { 353 return s.symbols[i].Start < s.symbols[j].Start 354} 355 356// AddFile is a convenience wrapper for Add 357func (b *IndexBuilder) AddFile(name string, content []byte) error { 358 return b.Add(Document{Name: name, Content: content}) 359} 360 361func (b *IndexBuilder) populateSubRepoIndices() error { 362 if len(b.subRepoIndices) == len(b.repoList) { 363 return nil 364 } 365 if len(b.subRepoIndices) != len(b.repoList)-1 { 366 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 367 } 368 repo := b.repoList[len(b.repoList)-1] 369 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 370 return nil 371} 372 373func mkSubRepoIndices(repo Repository) map[string]uint32 { 374 paths := []string{""} 375 for k := range repo.SubRepoMap { 376 paths = append(paths, k) 377 } 378 sort.Strings(paths) 379 subRepoIndices := make(map[string]uint32, len(paths)) 380 for i, p := range paths { 381 subRepoIndices[p] = uint32(i) 382 } 383 return subRepoIndices 384} 385 386const notIndexedMarker = "NOT-INDEXED: " 387 388func (b *IndexBuilder) symbolID(sym string) uint32 { 389 if _, ok := b.symIndex[sym]; !ok { 390 b.symIndex[sym] = b.symID 391 b.symID++ 392 } 393 return b.symIndex[sym] 394} 395 396func (b *IndexBuilder) symbolKindID(t string) uint32 { 397 if _, ok := b.symKindIndex[t]; !ok { 398 b.symKindIndex[t] = b.symKindID 399 b.symKindID++ 400 } 401 return b.symKindIndex[t] 402} 403 404func (b *IndexBuilder) addSymbols(symbols []*Symbol) { 405 for _, sym := range symbols { 406 b.symMetaData = append(b.symMetaData, 407 // This field was removed due to redundancy. To avoid 408 // needing to reindex, it is set to zero for now. In the 409 // future, this field will be completely removed. It 410 // will require incrementing the feature version. 411 0, 412 b.symbolKindID(sym.Kind), 413 b.symbolID(sym.Parent), 414 b.symbolKindID(sym.ParentKind)) 415 } 416} 417 418func DetermineLanguageIfUnknown(doc *Document) { 419 if doc.Language == "" { 420 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 421 } 422} 423 424// Add a file which only occurs in certain branches. 425func (b *IndexBuilder) Add(doc Document) error { 426 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 427 428 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 429 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 430 doc.Language = "binary" 431 } 432 433 if doc.SkipReason != "" { 434 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 435 doc.Symbols = nil 436 doc.SymbolsMetaData = nil 437 if doc.Language == "" { 438 doc.Language = "skipped" 439 } 440 } 441 442 DetermineLanguageIfUnknown(&doc) 443 444 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 445 var last DocumentSection 446 for i, s := range doc.Symbols { 447 if i > 0 { 448 if last.End > s.Start { 449 return fmt.Errorf("sections overlap") 450 } 451 } 452 last = s 453 } 454 if last.End > uint32(len(doc.Content)) { 455 return fmt.Errorf("section goes past end of content") 456 } 457 458 if doc.SubRepositoryPath != "" { 459 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 460 if err != nil || rel == doc.Name { 461 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 462 } 463 } 464 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 465 if err != nil { 466 return err 467 } 468 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 469 if err != nil { 470 return err 471 } 472 b.addSymbols(doc.SymbolsMetaData) 473 474 repoIdx := len(b.repoList) - 1 475 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 476 if !ok { 477 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 478 } 479 480 var mask uint64 481 for _, br := range doc.Branches { 482 m := b.branchMask(br) 483 if m == 0 { 484 return fmt.Errorf("no branch found for %s", br) 485 } 486 mask |= m 487 } 488 489 if repoIdx > 1<<16 { 490 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 491 } 492 493 b.subRepos = append(b.subRepos, subRepoIdx) 494 b.repos = append(b.repos, uint16(repoIdx)) 495 496 hasher.Write(doc.Content) 497 498 b.contentStrings = append(b.contentStrings, docStr) 499 b.runeDocSections = append(b.runeDocSections, runeSecs...) 500 501 b.nameStrings = append(b.nameStrings, nameStr) 502 b.docSections = append(b.docSections, doc.Symbols) 503 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 504 b.branchMasks = append(b.branchMasks, mask) 505 b.checksums = append(b.checksums, hasher.Sum(nil)...) 506 507 langCode, ok := b.languageMap[doc.Language] 508 if !ok { 509 if len(b.languageMap) >= 65535 { 510 return fmt.Errorf("too many languages") 511 } 512 langCode = uint16(len(b.languageMap)) 513 b.languageMap[doc.Language] = langCode 514 } 515 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 516 517 return nil 518} 519 520func (b *IndexBuilder) branchMask(br string) uint64 { 521 for i, b := range b.repoList[len(b.repoList)-1].Branches { 522 if b.Name == br { 523 return uint64(1) << uint(i) 524 } 525 } 526 return 0 527} 528 529type DocChecker struct { 530 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 531 trigrams map[ngram]struct{} 532} 533 534// Check returns a reason why the given contents are probably not source texts. 535func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 536 if len(content) == 0 { 537 return nil 538 } 539 540 if len(content) < ngramSize { 541 return fmt.Errorf("file size smaller than %d", ngramSize) 542 } 543 544 if index := bytes.IndexByte(content, 0); index > 0 { 545 return fmt.Errorf("binary data at byte offset %d", index) 546 } 547 548 // PERF: we only need to do the trigram check if the upperbound on content is greater than 549 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 550 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 551 return nil 552 } 553 554 var cur [3]rune 555 byteCount := 0 556 t.clearTrigrams(maxTrigramCount) 557 558 for len(content) > 0 { 559 r, sz := utf8.DecodeRune(content) 560 content = content[sz:] 561 byteCount += sz 562 563 cur[0], cur[1], cur[2] = cur[1], cur[2], r 564 if cur[0] == 0 { 565 // start of file. 566 continue 567 } 568 569 t.trigrams[runesToNGram(cur)] = struct{}{} 570 if len(t.trigrams) > maxTrigramCount { 571 // probably not text. 572 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 573 } 574 } 575 return nil 576} 577 578func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 579 if t.trigrams == nil { 580 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 581 } 582 for key := range t.trigrams { 583 delete(t.trigrams, key) 584 } 585} 586 587// ShardName returns the name of the shard for the given prefix, version, and 588// shard number. 589func ShardName(indexDir string, prefix string, version, n int) string { 590 prefix = url.QueryEscape(prefix) 591 if len(prefix) > 200 { 592 prefix = prefix[:200] + hashString(prefix)[:8] 593 } 594 return filepath.Join(indexDir, fmt.Sprintf("%s_v%d.%05d.zoekt", prefix, version, n)) 595}