fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "html/template" 23 "log" 24 "os" 25 "path/filepath" 26 "sort" 27 "time" 28 "unicode/utf8" 29 30 "github.com/go-enry/go-enry/v2" 31) 32 33var _ = log.Println 34 35const ngramSize = 3 36 37type searchableString struct { 38 data []byte 39} 40 41// Filled by the linker 42var Version string 43 44func HostnameBestEffort() string { 45 if h := os.Getenv("NODE_NAME"); h != "" { 46 return h 47 } 48 if h := os.Getenv("HOSTNAME"); h != "" { 49 return h 50 } 51 hostname, _ := os.Hostname() 52 return hostname 53} 54 55// Store character (unicode codepoint) offset (in bytes) this often. 56const runeOffsetFrequency = 100 57 58type postingsBuilder struct { 59 postings map[ngram][]byte 60 lastOffsets map[ngram]uint32 61 62 // To support UTF-8 searching, we must map back runes to byte 63 // offsets. As a first attempt, we sample regularly. The 64 // precise offset can be found by walking from the recorded 65 // offset to the desired rune. 66 runeOffsets []uint32 67 runeCount uint32 68 69 isPlainASCII bool 70 71 endRunes []uint32 72 endByte uint32 73} 74 75func newPostingsBuilder() *postingsBuilder { 76 return &postingsBuilder{ 77 postings: map[ngram][]byte{}, 78 lastOffsets: map[ngram]uint32{}, 79 isPlainASCII: true, 80 } 81} 82 83// Store trigram offsets for the given UTF-8 data. The 84// DocumentSections must correspond to rune boundaries in the UTF-8 85// data. 86func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 87 dest := searchableString{ 88 data: data, 89 } 90 var buf [8]byte 91 var runeGram [3]rune 92 93 var runeIndex uint32 94 byteCount := 0 95 dataSz := uint32(len(data)) 96 97 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 98 for _, s := range byteSections { 99 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 100 } 101 var runeSectionBoundaries []uint32 102 103 endRune := s.runeCount 104 for ; len(data) > 0; runeIndex++ { 105 c, sz := utf8.DecodeRune(data) 106 if sz > 1 { 107 s.isPlainASCII = false 108 } 109 data = data[sz:] 110 111 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 112 113 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 114 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 115 } 116 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 117 runeSectionBoundaries = append(runeSectionBoundaries, 118 endRune+uint32(runeIndex)) 119 byteSectionBoundaries = byteSectionBoundaries[1:] 120 } 121 122 byteCount += sz 123 124 if runeIndex < 2 { 125 continue 126 } 127 128 ng := runesToNGram(runeGram) 129 lastOff := s.lastOffsets[ng] 130 newOff := endRune + uint32(runeIndex) - 2 131 132 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 133 s.postings[ng] = append(s.postings[ng], buf[:m]...) 134 s.lastOffsets[ng] = newOff 135 } 136 s.runeCount += runeIndex 137 138 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 139 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 140 } 141 142 // Handle symbol definition that ends at file end. This can 143 // happen for labels at the end of .bat files. 144 145 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 146 runeSectionBoundaries = append(runeSectionBoundaries, 147 endRune+runeIndex) 148 byteSectionBoundaries = byteSectionBoundaries[1:] 149 } 150 runeSecs := make([]DocumentSection, 0, len(byteSections)) 151 for i := 0; i < len(runeSectionBoundaries); i += 2 { 152 runeSecs = append(runeSecs, DocumentSection{ 153 Start: runeSectionBoundaries[i], 154 End: runeSectionBoundaries[i+1], 155 }) 156 } 157 158 s.endRunes = append(s.endRunes, s.runeCount) 159 s.endByte += dataSz 160 return &dest, runeSecs, nil 161} 162 163// IndexBuilder builds a single index shard. 164type IndexBuilder struct { 165 // The version we will write to disk. Sourcegraph Specific. This is to 166 // enable feature flagging new format versions. 167 indexFormatVersion int 168 featureVersion int 169 170 contentStrings []*searchableString 171 nameStrings []*searchableString 172 docSections [][]DocumentSection 173 runeDocSections []DocumentSection 174 175 symID uint32 176 symIndex map[string]uint32 177 symKindID uint32 178 symKindIndex map[string]uint32 179 symMetaData []uint32 180 181 fileEndSymbol []uint32 182 183 checksums []byte 184 185 branchMasks []uint64 186 subRepos []uint32 187 188 // docID => repoID 189 repos []uint16 190 191 // Experimental: docID => rank vec 192 ranks [][]float64 193 194 contentPostings *postingsBuilder 195 namePostings *postingsBuilder 196 197 // root repositories 198 repoList []Repository 199 200 // name to index. 201 subRepoIndices []map[string]uint32 202 203 // language => language code 204 languageMap map[string]uint16 205 206 // language codes, uint16 encoded as little-endian 207 languages []uint8 208 209 // IndexTime will be used as the time if non-zero. Otherwise 210 // time.Now(). This is useful for doing reproducible builds in tests. 211 IndexTime time.Time 212 213 // a sortable 20 chars long id. 214 ID string 215} 216 217func (d *Repository) verify() error { 218 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { 219 if _, err := template.New("").Parse(t); err != nil { 220 return err 221 } 222 } 223 return nil 224} 225 226// ContentSize returns the number of content bytes so far ingested. 227func (b *IndexBuilder) ContentSize() uint32 { 228 // Add the name too so we don't skip building index if we have 229 // lots of empty files. 230 return b.contentPostings.endByte + b.namePostings.endByte 231} 232 233// NumFiles returns the number of files added to this builder 234func (b *IndexBuilder) NumFiles() int { 235 return len(b.contentStrings) 236} 237 238// NewIndexBuilder creates a fresh IndexBuilder. The passed in 239// Repository contains repo metadata, and may be set to nil. 240func NewIndexBuilder(r *Repository) (*IndexBuilder, error) { 241 b := newIndexBuilder() 242 243 if r == nil { 244 r = &Repository{} 245 } 246 if err := b.setRepository(r); err != nil { 247 return nil, err 248 } 249 return b, nil 250} 251 252func newIndexBuilder() *IndexBuilder { 253 return &IndexBuilder{ 254 indexFormatVersion: IndexFormatVersion, 255 featureVersion: FeatureVersion, 256 257 contentPostings: newPostingsBuilder(), 258 namePostings: newPostingsBuilder(), 259 fileEndSymbol: []uint32{0}, 260 symIndex: make(map[string]uint32), 261 symKindIndex: make(map[string]uint32), 262 languageMap: make(map[string]uint16), 263 } 264} 265 266func (b *IndexBuilder) setRepository(desc *Repository) error { 267 if err := desc.verify(); err != nil { 268 return err 269 } 270 271 if len(desc.Branches) > 64 { 272 return fmt.Errorf("too many branches") 273 } 274 275 repo := *desc 276 277 // copy subrepomap without root 278 repo.SubRepoMap = map[string]*Repository{} 279 for k, v := range desc.SubRepoMap { 280 if k != "" { 281 repo.SubRepoMap[k] = v 282 } 283 } 284 285 b.repoList = append(b.repoList, repo) 286 287 return b.populateSubRepoIndices() 288} 289 290type DocumentSection struct { 291 Start, End uint32 292} 293 294// Document holds a document (file) to index. 295type Document struct { 296 Name string 297 Content []byte 298 Branches []string 299 SubRepositoryPath string 300 Language string 301 302 // If set, something is wrong with the file contents, and this 303 // is the reason it wasn't indexed. 304 SkipReason string 305 306 // Document sections for symbols. Offsets should use bytes. 307 Symbols []DocumentSection 308 SymbolsMetaData []*Symbol 309 310 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile 311 // file in the git repo. 312 // 313 // Two documents can be ordered by comparing the components of their rank 314 // vectors. Bigger entries are better, as are longer vectors. 315 // 316 // This field is experimental and may change at any time without warning. 317 Ranks []float64 318} 319 320type symbolSlice struct { 321 symbols []DocumentSection 322 metaData []*Symbol 323} 324 325func (s symbolSlice) Len() int { return len(s.symbols) } 326 327func (s symbolSlice) Swap(i, j int) { 328 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 329 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 330} 331 332func (s symbolSlice) Less(i, j int) bool { 333 return s.symbols[i].Start < s.symbols[j].Start 334} 335 336// AddFile is a convenience wrapper for Add 337func (b *IndexBuilder) AddFile(name string, content []byte) error { 338 return b.Add(Document{Name: name, Content: content}) 339} 340 341// CheckText returns a reason why the given contents are probably not source texts. 342func CheckText(content []byte, maxTrigramCount int) error { 343 if len(content) == 0 { 344 return nil 345 } 346 347 if len(content) < ngramSize { 348 return fmt.Errorf("file size smaller than %d", ngramSize) 349 } 350 351 // PERF: we only need to do the trigram check if the upperbound on content 352 // is greater than our threshold. 353 var trigrams map[ngram]struct{} 354 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount { 355 trigrams = make(map[ngram]struct{}, maxTrigramCount+1) 356 } 357 358 var cur [3]rune 359 byteCount := 0 360 for len(content) > 0 { 361 if content[0] == 0 { 362 return fmt.Errorf("binary data at byte offset %d", byteCount) 363 } 364 365 r, sz := utf8.DecodeRune(content) 366 content = content[sz:] 367 byteCount += sz 368 369 cur[0], cur[1], cur[2] = cur[1], cur[2], r 370 if cur[0] == 0 { 371 // start of file. 372 continue 373 } 374 375 if trigrams != nil { 376 trigrams[runesToNGram(cur)] = struct{}{} 377 if len(trigrams) > maxTrigramCount { 378 // probably not text. 379 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 380 } 381 } 382 } 383 return nil 384} 385 386func (b *IndexBuilder) populateSubRepoIndices() error { 387 if len(b.subRepoIndices) == len(b.repoList) { 388 return nil 389 } 390 if len(b.subRepoIndices) != len(b.repoList)-1 { 391 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 392 } 393 repo := b.repoList[len(b.repoList)-1] 394 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 395 return nil 396} 397 398func mkSubRepoIndices(repo Repository) map[string]uint32 { 399 paths := []string{""} 400 for k := range repo.SubRepoMap { 401 paths = append(paths, k) 402 } 403 sort.Strings(paths) 404 subRepoIndices := make(map[string]uint32, len(paths)) 405 for i, p := range paths { 406 subRepoIndices[p] = uint32(i) 407 } 408 return subRepoIndices 409} 410 411const notIndexedMarker = "NOT-INDEXED: " 412 413func (b *IndexBuilder) symbolID(sym string) uint32 { 414 if _, ok := b.symIndex[sym]; !ok { 415 b.symIndex[sym] = b.symID 416 b.symID++ 417 } 418 return b.symIndex[sym] 419} 420 421func (b *IndexBuilder) symbolKindID(t string) uint32 { 422 if _, ok := b.symKindIndex[t]; !ok { 423 b.symKindIndex[t] = b.symKindID 424 b.symKindID++ 425 } 426 return b.symKindIndex[t] 427} 428 429func (b *IndexBuilder) addSymbols(symbols []*Symbol) { 430 for _, sym := range symbols { 431 b.symMetaData = append(b.symMetaData, 432 // This field was removed due to redundancy. To avoid 433 // needing to reindex, it is set to zero for now. In the 434 // future, this field will be completely removed. It 435 // will require incrementing the feature version. 436 0, 437 b.symbolKindID(sym.Kind), 438 b.symbolID(sym.Parent), 439 b.symbolKindID(sym.ParentKind)) 440 } 441} 442 443func DetermineLanguageIfUnknown(doc *Document) { 444 if doc.Language == "" { 445 c := doc.Content 446 // classifier is faster on small files without losing much accuracy 447 if len(c) > 2048 { 448 c = c[:2048] 449 } 450 doc.Language = enry.GetLanguage(doc.Name, c) 451 } 452} 453 454// Add a file which only occurs in certain branches. 455func (b *IndexBuilder) Add(doc Document) error { 456 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 457 458 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 459 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 460 doc.Language = "binary" 461 } 462 463 if doc.SkipReason != "" { 464 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 465 doc.Symbols = nil 466 doc.SymbolsMetaData = nil 467 if doc.Language == "" { 468 doc.Language = "skipped" 469 } 470 } 471 472 DetermineLanguageIfUnknown(&doc) 473 474 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 475 var last DocumentSection 476 for i, s := range doc.Symbols { 477 if i > 0 { 478 if last.End > s.Start { 479 return fmt.Errorf("sections overlap") 480 } 481 } 482 last = s 483 } 484 if last.End > uint32(len(doc.Content)) { 485 return fmt.Errorf("section goes past end of content") 486 } 487 488 if doc.SubRepositoryPath != "" { 489 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 490 if err != nil || rel == doc.Name { 491 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 492 } 493 } 494 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 495 if err != nil { 496 return err 497 } 498 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 499 if err != nil { 500 return err 501 } 502 b.addSymbols(doc.SymbolsMetaData) 503 504 repoIdx := len(b.repoList) - 1 505 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 506 if !ok { 507 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 508 } 509 510 var mask uint64 511 for _, br := range doc.Branches { 512 m := b.branchMask(br) 513 if m == 0 { 514 return fmt.Errorf("no branch found for %s", br) 515 } 516 mask |= m 517 } 518 519 if repoIdx > 1<<16 { 520 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 521 } 522 523 b.subRepos = append(b.subRepos, subRepoIdx) 524 b.repos = append(b.repos, uint16(repoIdx)) 525 526 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is 527 // always nil. 528 b.ranks = append(b.ranks, doc.Ranks) 529 530 hasher.Write(doc.Content) 531 532 b.contentStrings = append(b.contentStrings, docStr) 533 b.runeDocSections = append(b.runeDocSections, runeSecs...) 534 535 b.nameStrings = append(b.nameStrings, nameStr) 536 b.docSections = append(b.docSections, doc.Symbols) 537 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 538 b.branchMasks = append(b.branchMasks, mask) 539 b.checksums = append(b.checksums, hasher.Sum(nil)...) 540 541 langCode, ok := b.languageMap[doc.Language] 542 if !ok { 543 if len(b.languageMap) >= 65535 { 544 return fmt.Errorf("too many languages") 545 } 546 langCode = uint16(len(b.languageMap)) 547 b.languageMap[doc.Language] = langCode 548 } 549 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 550 551 return nil 552} 553 554func (b *IndexBuilder) branchMask(br string) uint64 { 555 for i, b := range b.repoList[len(b.repoList)-1].Branches { 556 if b.Name == br { 557 return uint64(1) << uint(i) 558 } 559 } 560 return 0 561}