fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "log" 23 "net/url" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "text/template" 29 "time" 30 "unicode/utf8" 31 32 "github.com/sourcegraph/zoekt/internal/languages" 33) 34 35var _ = log.Println 36 37const ngramSize = 3 38 39type searchableString struct { 40 data []byte 41} 42 43// Filled by the linker 44var Version string 45 46func HostnameBestEffort() string { 47 if h := os.Getenv("NODE_NAME"); h != "" { 48 return h 49 } 50 if h := os.Getenv("HOSTNAME"); h != "" { 51 return h 52 } 53 hostname, _ := os.Hostname() 54 return hostname 55} 56 57// Store character (unicode codepoint) offset (in bytes) this often. 58const runeOffsetFrequency = 100 59 60type postingsBuilder struct { 61 postings map[ngram][]byte 62 lastOffsets map[ngram]uint32 63 64 // To support UTF-8 searching, we must map back runes to byte 65 // offsets. As a first attempt, we sample regularly. The 66 // precise offset can be found by walking from the recorded 67 // offset to the desired rune. 68 runeOffsets []uint32 69 runeCount uint32 70 71 isPlainASCII bool 72 73 endRunes []uint32 74 endByte uint32 75} 76 77func newPostingsBuilder() *postingsBuilder { 78 return &postingsBuilder{ 79 postings: map[ngram][]byte{}, 80 lastOffsets: map[ngram]uint32{}, 81 isPlainASCII: true, 82 } 83} 84 85// Store trigram offsets for the given UTF-8 data. The 86// DocumentSections must correspond to rune boundaries in the UTF-8 87// data. 88func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 89 dest := searchableString{ 90 data: data, 91 } 92 var buf [8]byte 93 var runeGram [3]rune 94 95 var runeIndex uint32 96 byteCount := 0 97 dataSz := uint32(len(data)) 98 99 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 100 for _, s := range byteSections { 101 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 102 } 103 var runeSectionBoundaries []uint32 104 105 endRune := s.runeCount 106 for ; len(data) > 0; runeIndex++ { 107 c, sz := utf8.DecodeRune(data) 108 if sz > 1 { 109 s.isPlainASCII = false 110 } 111 data = data[sz:] 112 113 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 114 115 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 116 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 117 } 118 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 119 runeSectionBoundaries = append(runeSectionBoundaries, 120 endRune+uint32(runeIndex)) 121 byteSectionBoundaries = byteSectionBoundaries[1:] 122 } 123 124 byteCount += sz 125 126 if runeIndex < 2 { 127 continue 128 } 129 130 ng := runesToNGram(runeGram) 131 lastOff := s.lastOffsets[ng] 132 newOff := endRune + uint32(runeIndex) - 2 133 134 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 135 s.postings[ng] = append(s.postings[ng], buf[:m]...) 136 s.lastOffsets[ng] = newOff 137 } 138 s.runeCount += runeIndex 139 140 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 141 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 142 } 143 144 // Handle symbol definition that ends at file end. This can 145 // happen for labels at the end of .bat files. 146 147 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 148 runeSectionBoundaries = append(runeSectionBoundaries, 149 endRune+runeIndex) 150 byteSectionBoundaries = byteSectionBoundaries[1:] 151 } 152 runeSecs := make([]DocumentSection, 0, len(byteSections)) 153 for i := 0; i < len(runeSectionBoundaries); i += 2 { 154 runeSecs = append(runeSecs, DocumentSection{ 155 Start: runeSectionBoundaries[i], 156 End: runeSectionBoundaries[i+1], 157 }) 158 } 159 160 s.endRunes = append(s.endRunes, s.runeCount) 161 s.endByte += dataSz 162 return &dest, runeSecs, nil 163} 164 165// IndexBuilder builds a single index shard. 166type IndexBuilder struct { 167 // The version we will write to disk. Sourcegraph Specific. This is to 168 // enable feature flagging new format versions. 169 indexFormatVersion int 170 featureVersion int 171 172 contentStrings []*searchableString 173 nameStrings []*searchableString 174 docSections [][]DocumentSection 175 runeDocSections []DocumentSection 176 177 symID uint32 178 symIndex map[string]uint32 179 symKindID uint32 180 symKindIndex map[string]uint32 181 symMetaData []uint32 182 183 fileEndSymbol []uint32 184 185 checksums []byte 186 187 branchMasks []uint64 188 subRepos []uint32 189 190 // docID => repoID 191 repos []uint16 192 193 // Experimental: docID => rank vec 194 ranks [][]float64 195 196 contentPostings *postingsBuilder 197 namePostings *postingsBuilder 198 199 // root repositories 200 repoList []Repository 201 202 // name to index. 203 subRepoIndices []map[string]uint32 204 205 // language => language code 206 languageMap map[string]uint16 207 208 // language codes, uint16 encoded as little-endian 209 languages []uint8 210 211 // IndexTime will be used as the time if non-zero. Otherwise 212 // time.Now(). This is useful for doing reproducible builds in tests. 213 IndexTime time.Time 214 215 // a sortable 20 chars long id. 216 ID string 217} 218 219func (d *Repository) verify() error { 220 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { 221 if _, err := ParseTemplate(t); err != nil { 222 return err 223 } 224 } 225 return nil 226} 227 228func urlJoinPath(base string, elem ...string) string { 229 // golangs html/template always escapes "+" appearing in an HTML attribute 230 // [1]. We may even want to treat more characters, differently but this 231 // atleast makes it possible to visit URLs like [2]. 232 // 233 // We only do this to elem since base will normally be a hardcoded string. 234 // 235 // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 236 // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift 237 elem = append([]string{}, elem...) // copy to mutate 238 for i := range elem { 239 elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") 240 } 241 u, err := url.JoinPath(base, elem...) 242 if err != nil { 243 return "#!error: " + err.Error() 244 } 245 return u 246} 247 248// ParseTemplate will parse the templates for FileURLTemplate, 249// LineFragmentTemplate and CommitURLTemplate. 250// 251// It makes available the extra function UrlJoinPath. 252func ParseTemplate(text string) (*template.Template, error) { 253 return template.New("").Funcs(template.FuncMap{ 254 "URLJoinPath": urlJoinPath, 255 }).Parse(text) 256} 257 258// ContentSize returns the number of content bytes so far ingested. 259func (b *IndexBuilder) ContentSize() uint32 { 260 // Add the name too so we don't skip building index if we have 261 // lots of empty files. 262 return b.contentPostings.endByte + b.namePostings.endByte 263} 264 265// NumFiles returns the number of files added to this builder 266func (b *IndexBuilder) NumFiles() int { 267 return len(b.contentStrings) 268} 269 270// NewIndexBuilder creates a fresh IndexBuilder. The passed in 271// Repository contains repo metadata, and may be set to nil. 272func NewIndexBuilder(r *Repository) (*IndexBuilder, error) { 273 b := newIndexBuilder() 274 275 if r == nil { 276 r = &Repository{} 277 } 278 if err := b.setRepository(r); err != nil { 279 return nil, err 280 } 281 return b, nil 282} 283 284func newIndexBuilder() *IndexBuilder { 285 return &IndexBuilder{ 286 indexFormatVersion: IndexFormatVersion, 287 featureVersion: FeatureVersion, 288 289 contentPostings: newPostingsBuilder(), 290 namePostings: newPostingsBuilder(), 291 fileEndSymbol: []uint32{0}, 292 symIndex: make(map[string]uint32), 293 symKindIndex: make(map[string]uint32), 294 languageMap: make(map[string]uint16), 295 } 296} 297 298func (b *IndexBuilder) setRepository(desc *Repository) error { 299 if err := desc.verify(); err != nil { 300 return err 301 } 302 303 if len(desc.Branches) > 64 { 304 return fmt.Errorf("too many branches") 305 } 306 307 repo := *desc 308 309 // copy subrepomap without root 310 repo.SubRepoMap = map[string]*Repository{} 311 for k, v := range desc.SubRepoMap { 312 if k != "" { 313 repo.SubRepoMap[k] = v 314 } 315 } 316 317 b.repoList = append(b.repoList, repo) 318 319 return b.populateSubRepoIndices() 320} 321 322type DocumentSection struct { 323 Start, End uint32 324} 325 326// Document holds a document (file) to index. 327type Document struct { 328 Name string 329 Content []byte 330 Branches []string 331 SubRepositoryPath string 332 Language string 333 334 // If set, something is wrong with the file contents, and this 335 // is the reason it wasn't indexed. 336 SkipReason string 337 338 // Document sections for symbols. Offsets should use bytes. 339 Symbols []DocumentSection 340 SymbolsMetaData []*Symbol 341 342 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile 343 // file in the git repo. 344 // 345 // Two documents can be ordered by comparing the components of their rank 346 // vectors. Bigger entries are better, as are longer vectors. 347 // 348 // This field is experimental and may change at any time without warning. 349 Ranks []float64 350} 351 352type symbolSlice struct { 353 symbols []DocumentSection 354 metaData []*Symbol 355} 356 357func (s symbolSlice) Len() int { return len(s.symbols) } 358 359func (s symbolSlice) Swap(i, j int) { 360 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 361 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 362} 363 364func (s symbolSlice) Less(i, j int) bool { 365 return s.symbols[i].Start < s.symbols[j].Start 366} 367 368// AddFile is a convenience wrapper for Add 369func (b *IndexBuilder) AddFile(name string, content []byte) error { 370 return b.Add(Document{Name: name, Content: content}) 371} 372 373func (b *IndexBuilder) populateSubRepoIndices() error { 374 if len(b.subRepoIndices) == len(b.repoList) { 375 return nil 376 } 377 if len(b.subRepoIndices) != len(b.repoList)-1 { 378 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 379 } 380 repo := b.repoList[len(b.repoList)-1] 381 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 382 return nil 383} 384 385func mkSubRepoIndices(repo Repository) map[string]uint32 { 386 paths := []string{""} 387 for k := range repo.SubRepoMap { 388 paths = append(paths, k) 389 } 390 sort.Strings(paths) 391 subRepoIndices := make(map[string]uint32, len(paths)) 392 for i, p := range paths { 393 subRepoIndices[p] = uint32(i) 394 } 395 return subRepoIndices 396} 397 398const notIndexedMarker = "NOT-INDEXED: " 399 400func (b *IndexBuilder) symbolID(sym string) uint32 { 401 if _, ok := b.symIndex[sym]; !ok { 402 b.symIndex[sym] = b.symID 403 b.symID++ 404 } 405 return b.symIndex[sym] 406} 407 408func (b *IndexBuilder) symbolKindID(t string) uint32 { 409 if _, ok := b.symKindIndex[t]; !ok { 410 b.symKindIndex[t] = b.symKindID 411 b.symKindID++ 412 } 413 return b.symKindIndex[t] 414} 415 416func (b *IndexBuilder) addSymbols(symbols []*Symbol) { 417 for _, sym := range symbols { 418 b.symMetaData = append(b.symMetaData, 419 // This field was removed due to redundancy. To avoid 420 // needing to reindex, it is set to zero for now. In the 421 // future, this field will be completely removed. It 422 // will require incrementing the feature version. 423 0, 424 b.symbolKindID(sym.Kind), 425 b.symbolID(sym.Parent), 426 b.symbolKindID(sym.ParentKind)) 427 } 428} 429 430func DetermineLanguageIfUnknown(doc *Document) { 431 if doc.Language == "" { 432 doc.Language = languages.GetLanguage(doc.Name, doc.Content) 433 } 434} 435 436// Add a file which only occurs in certain branches. 437func (b *IndexBuilder) Add(doc Document) error { 438 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 439 440 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 441 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 442 doc.Language = "binary" 443 } 444 445 if doc.SkipReason != "" { 446 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 447 doc.Symbols = nil 448 doc.SymbolsMetaData = nil 449 if doc.Language == "" { 450 doc.Language = "skipped" 451 } 452 } 453 454 DetermineLanguageIfUnknown(&doc) 455 456 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 457 var last DocumentSection 458 for i, s := range doc.Symbols { 459 if i > 0 { 460 if last.End > s.Start { 461 return fmt.Errorf("sections overlap") 462 } 463 } 464 last = s 465 } 466 if last.End > uint32(len(doc.Content)) { 467 return fmt.Errorf("section goes past end of content") 468 } 469 470 if doc.SubRepositoryPath != "" { 471 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 472 if err != nil || rel == doc.Name { 473 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 474 } 475 } 476 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 477 if err != nil { 478 return err 479 } 480 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 481 if err != nil { 482 return err 483 } 484 b.addSymbols(doc.SymbolsMetaData) 485 486 repoIdx := len(b.repoList) - 1 487 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 488 if !ok { 489 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 490 } 491 492 var mask uint64 493 for _, br := range doc.Branches { 494 m := b.branchMask(br) 495 if m == 0 { 496 return fmt.Errorf("no branch found for %s", br) 497 } 498 mask |= m 499 } 500 501 if repoIdx > 1<<16 { 502 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 503 } 504 505 b.subRepos = append(b.subRepos, subRepoIdx) 506 b.repos = append(b.repos, uint16(repoIdx)) 507 508 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is 509 // always nil. 510 b.ranks = append(b.ranks, doc.Ranks) 511 512 hasher.Write(doc.Content) 513 514 b.contentStrings = append(b.contentStrings, docStr) 515 b.runeDocSections = append(b.runeDocSections, runeSecs...) 516 517 b.nameStrings = append(b.nameStrings, nameStr) 518 b.docSections = append(b.docSections, doc.Symbols) 519 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 520 b.branchMasks = append(b.branchMasks, mask) 521 b.checksums = append(b.checksums, hasher.Sum(nil)...) 522 523 langCode, ok := b.languageMap[doc.Language] 524 if !ok { 525 if len(b.languageMap) >= 65535 { 526 return fmt.Errorf("too many languages") 527 } 528 langCode = uint16(len(b.languageMap)) 529 b.languageMap[doc.Language] = langCode 530 } 531 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 532 533 return nil 534} 535 536func (b *IndexBuilder) branchMask(br string) uint64 { 537 for i, b := range b.repoList[len(b.repoList)-1].Branches { 538 if b.Name == br { 539 return uint64(1) << uint(i) 540 } 541 } 542 return 0 543} 544 545type DocChecker struct { 546 // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 547 trigrams map[ngram]struct{} 548} 549 550// Check returns a reason why the given contents are probably not source texts. 551func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { 552 if len(content) == 0 { 553 return nil 554 } 555 556 if len(content) < ngramSize { 557 return fmt.Errorf("file size smaller than %d", ngramSize) 558 } 559 560 if index := bytes.IndexByte(content, 0); index > 0 { 561 return fmt.Errorf("binary data at byte offset %d", index) 562 } 563 564 // PERF: we only need to do the trigram check if the upperbound on content is greater than 565 // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. 566 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { 567 return nil 568 } 569 570 var cur [3]rune 571 byteCount := 0 572 t.clearTrigrams(maxTrigramCount) 573 574 for len(content) > 0 { 575 r, sz := utf8.DecodeRune(content) 576 content = content[sz:] 577 byteCount += sz 578 579 cur[0], cur[1], cur[2] = cur[1], cur[2], r 580 if cur[0] == 0 { 581 // start of file. 582 continue 583 } 584 585 t.trigrams[runesToNGram(cur)] = struct{}{} 586 if len(t.trigrams) > maxTrigramCount { 587 // probably not text. 588 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 589 } 590 } 591 return nil 592} 593 594func (t *DocChecker) clearTrigrams(maxTrigramCount int) { 595 if t.trigrams == nil { 596 t.trigrams = make(map[ngram]struct{}, maxTrigramCount) 597 } 598 for key := range t.trigrams { 599 delete(t.trigrams, key) 600 } 601}