fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "bytes" 19 "encoding/binary" 20 "fmt" 21 "hash/crc64" 22 "html/template" 23 "log" 24 "path/filepath" 25 "sort" 26 "time" 27 "unicode/utf8" 28 29 "github.com/go-enry/go-enry/v2" 30) 31 32var _ = log.Println 33 34const ngramSize = 3 35 36type searchableString struct { 37 data []byte 38} 39 40// Filled by the linker 41var Version string 42 43// Store character (unicode codepoint) offset (in bytes) this often. 44const runeOffsetFrequency = 100 45 46type postingsBuilder struct { 47 postings map[ngram][]byte 48 lastOffsets map[ngram]uint32 49 50 // To support UTF-8 searching, we must map back runes to byte 51 // offsets. As a first attempt, we sample regularly. The 52 // precise offset can be found by walking from the recorded 53 // offset to the desired rune. 54 runeOffsets []uint32 55 runeCount uint32 56 57 isPlainASCII bool 58 59 endRunes []uint32 60 endByte uint32 61} 62 63func newPostingsBuilder() *postingsBuilder { 64 return &postingsBuilder{ 65 postings: map[ngram][]byte{}, 66 lastOffsets: map[ngram]uint32{}, 67 isPlainASCII: true, 68 } 69} 70 71// Store trigram offsets for the given UTF-8 data. The 72// DocumentSections must correspond to rune boundaries in the UTF-8 73// data. 74func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) { 75 dest := searchableString{ 76 data: data, 77 } 78 var buf [8]byte 79 var runeGram [3]rune 80 81 var runeIndex uint32 82 byteCount := 0 83 dataSz := uint32(len(data)) 84 85 byteSectionBoundaries := make([]uint32, 0, 2*len(byteSections)) 86 for _, s := range byteSections { 87 byteSectionBoundaries = append(byteSectionBoundaries, s.Start, s.End) 88 } 89 var runeSectionBoundaries []uint32 90 91 endRune := s.runeCount 92 for ; len(data) > 0; runeIndex++ { 93 c, sz := utf8.DecodeRune(data) 94 if sz > 1 { 95 s.isPlainASCII = false 96 } 97 data = data[sz:] 98 99 runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c 100 101 if idx := s.runeCount + runeIndex; idx%runeOffsetFrequency == 0 { 102 s.runeOffsets = append(s.runeOffsets, s.endByte+uint32(byteCount)) 103 } 104 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 105 runeSectionBoundaries = append(runeSectionBoundaries, 106 endRune+uint32(runeIndex)) 107 byteSectionBoundaries = byteSectionBoundaries[1:] 108 } 109 110 byteCount += sz 111 112 if runeIndex < 2 { 113 continue 114 } 115 116 ng := runesToNGram(runeGram) 117 lastOff := s.lastOffsets[ng] 118 newOff := endRune + uint32(runeIndex) - 2 119 120 m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) 121 s.postings[ng] = append(s.postings[ng], buf[:m]...) 122 s.lastOffsets[ng] = newOff 123 } 124 s.runeCount += runeIndex 125 126 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] < uint32(byteCount) { 127 return nil, nil, fmt.Errorf("no rune for section boundary at byte %d", byteSectionBoundaries[0]) 128 } 129 130 // Handle symbol definition that ends at file end. This can 131 // happen for labels at the end of .bat files. 132 133 for len(byteSectionBoundaries) > 0 && byteSectionBoundaries[0] == uint32(byteCount) { 134 runeSectionBoundaries = append(runeSectionBoundaries, 135 endRune+runeIndex) 136 byteSectionBoundaries = byteSectionBoundaries[1:] 137 } 138 runeSecs := make([]DocumentSection, 0, len(byteSections)) 139 for i := 0; i < len(runeSectionBoundaries); i += 2 { 140 runeSecs = append(runeSecs, DocumentSection{ 141 Start: runeSectionBoundaries[i], 142 End: runeSectionBoundaries[i+1], 143 }) 144 } 145 146 s.endRunes = append(s.endRunes, s.runeCount) 147 s.endByte += dataSz 148 return &dest, runeSecs, nil 149} 150 151// IndexBuilder builds a single index shard. 152type IndexBuilder struct { 153 // The version we will write to disk. Sourcegraph Specific. This is to 154 // enable feature flagging new format versions. 155 indexFormatVersion int 156 featureVersion int 157 158 contentStrings []*searchableString 159 nameStrings []*searchableString 160 docSections [][]DocumentSection 161 runeDocSections []DocumentSection 162 163 symID uint32 164 symIndex map[string]uint32 165 symKindID uint32 166 symKindIndex map[string]uint32 167 symMetaData []uint32 168 169 fileEndSymbol []uint32 170 171 checksums []byte 172 173 branchMasks []uint64 174 subRepos []uint32 175 176 // docID => repoID 177 repos []uint16 178 179 // Experimental: docID => rank vec 180 ranks [][]float64 181 182 contentPostings *postingsBuilder 183 namePostings *postingsBuilder 184 185 // root repositories 186 repoList []Repository 187 188 // name to index. 189 subRepoIndices []map[string]uint32 190 191 // language => language code 192 languageMap map[string]uint16 193 194 // language codes, uint16 encoded as little-endian 195 languages []uint8 196 197 // IndexTime will be used as the time if non-zero. Otherwise 198 // time.Now(). This is useful for doing reproducible builds in tests. 199 IndexTime time.Time 200 201 // a sortable 20 chars long id. 202 ID string 203} 204 205func (d *Repository) verify() error { 206 for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { 207 if _, err := template.New("").Parse(t); err != nil { 208 return err 209 } 210 } 211 return nil 212} 213 214// ContentSize returns the number of content bytes so far ingested. 215func (b *IndexBuilder) ContentSize() uint32 { 216 // Add the name too so we don't skip building index if we have 217 // lots of empty files. 218 return b.contentPostings.endByte + b.namePostings.endByte 219} 220 221// NewIndexBuilder creates a fresh IndexBuilder. The passed in 222// Repository contains repo metadata, and may be set to nil. 223func NewIndexBuilder(r *Repository) (*IndexBuilder, error) { 224 b := newIndexBuilder() 225 226 if r == nil { 227 r = &Repository{} 228 } 229 if err := b.setRepository(r); err != nil { 230 return nil, err 231 } 232 return b, nil 233} 234 235func newIndexBuilder() *IndexBuilder { 236 return &IndexBuilder{ 237 indexFormatVersion: IndexFormatVersion, 238 featureVersion: FeatureVersion, 239 240 contentPostings: newPostingsBuilder(), 241 namePostings: newPostingsBuilder(), 242 fileEndSymbol: []uint32{0}, 243 symIndex: make(map[string]uint32), 244 symKindIndex: make(map[string]uint32), 245 languageMap: map[string]uint16{}, 246 } 247} 248 249func (b *IndexBuilder) setRepository(desc *Repository) error { 250 if err := desc.verify(); err != nil { 251 return err 252 } 253 254 if len(desc.Branches) > 64 { 255 return fmt.Errorf("too many branches") 256 } 257 258 repo := *desc 259 260 // copy subrepomap without root 261 repo.SubRepoMap = map[string]*Repository{} 262 for k, v := range desc.SubRepoMap { 263 if k != "" { 264 repo.SubRepoMap[k] = v 265 } 266 } 267 268 b.repoList = append(b.repoList, repo) 269 270 return b.populateSubRepoIndices() 271} 272 273type DocumentSection struct { 274 Start, End uint32 275} 276 277// Document holds a document (file) to index. 278type Document struct { 279 Name string 280 Content []byte 281 Branches []string 282 SubRepositoryPath string 283 Language string 284 285 // If set, something is wrong with the file contents, and this 286 // is the reason it wasn't indexed. 287 SkipReason string 288 289 // Document sections for symbols. Offsets should use bytes. 290 Symbols []DocumentSection 291 SymbolsMetaData []*Symbol 292 293 // Ranks is a vector of ranks for a document as provided by a DocumentRanksFile 294 // file in the git repo. 295 // 296 // Two documents can be ordered by comparing the components of their rank 297 // vectors. Bigger entries are better, as are longer vectors. 298 // 299 // This field is experimental and may change at any time without warning. 300 Ranks []float64 301} 302 303type symbolSlice struct { 304 symbols []DocumentSection 305 metaData []*Symbol 306} 307 308func (s symbolSlice) Len() int { return len(s.symbols) } 309 310func (s symbolSlice) Swap(i, j int) { 311 s.symbols[i], s.symbols[j] = s.symbols[j], s.symbols[i] 312 s.metaData[i], s.metaData[j] = s.metaData[j], s.metaData[i] 313} 314 315func (s symbolSlice) Less(i, j int) bool { 316 return s.symbols[i].Start < s.symbols[j].Start 317} 318 319// AddFile is a convenience wrapper for Add 320func (b *IndexBuilder) AddFile(name string, content []byte) error { 321 return b.Add(Document{Name: name, Content: content}) 322} 323 324// CheckText returns a reason why the given contents are probably not source texts. 325func CheckText(content []byte, maxTrigramCount int) error { 326 if len(content) == 0 { 327 return nil 328 } 329 330 if len(content) < ngramSize { 331 return fmt.Errorf("file size smaller than %d", ngramSize) 332 } 333 334 // PERF: we only need to do the trigram check if the upperbound on content 335 // is greater than our threshold. 336 var trigrams map[ngram]struct{} 337 if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount { 338 trigrams = make(map[ngram]struct{}, maxTrigramCount+1) 339 } 340 341 var cur [3]rune 342 byteCount := 0 343 for len(content) > 0 { 344 if content[0] == 0 { 345 return fmt.Errorf("binary data at byte offset %d", byteCount) 346 } 347 348 r, sz := utf8.DecodeRune(content) 349 content = content[sz:] 350 byteCount += sz 351 352 cur[0], cur[1], cur[2] = cur[1], cur[2], r 353 if cur[0] == 0 { 354 // start of file. 355 continue 356 } 357 358 if trigrams != nil { 359 trigrams[runesToNGram(cur)] = struct{}{} 360 if len(trigrams) > maxTrigramCount { 361 // probably not text. 362 return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) 363 } 364 } 365 } 366 return nil 367} 368 369func (b *IndexBuilder) populateSubRepoIndices() error { 370 if len(b.subRepoIndices) == len(b.repoList) { 371 return nil 372 } 373 if len(b.subRepoIndices) != len(b.repoList)-1 { 374 return fmt.Errorf("populateSubRepoIndices not called for a repo: %d != %d - 1", len(b.subRepoIndices), len(b.repoList)) 375 } 376 repo := b.repoList[len(b.repoList)-1] 377 b.subRepoIndices = append(b.subRepoIndices, mkSubRepoIndices(repo)) 378 return nil 379} 380 381func mkSubRepoIndices(repo Repository) map[string]uint32 { 382 paths := []string{""} 383 for k := range repo.SubRepoMap { 384 paths = append(paths, k) 385 } 386 sort.Strings(paths) 387 subRepoIndices := make(map[string]uint32, len(paths)) 388 for i, p := range paths { 389 subRepoIndices[p] = uint32(i) 390 } 391 return subRepoIndices 392} 393 394const notIndexedMarker = "NOT-INDEXED: " 395 396func (b *IndexBuilder) symbolID(sym string) uint32 { 397 if _, ok := b.symIndex[sym]; !ok { 398 b.symIndex[sym] = b.symID 399 b.symID++ 400 } 401 return b.symIndex[sym] 402} 403 404func (b *IndexBuilder) symbolKindID(t string) uint32 { 405 if _, ok := b.symKindIndex[t]; !ok { 406 b.symKindIndex[t] = b.symKindID 407 b.symKindID++ 408 } 409 return b.symKindIndex[t] 410} 411 412func (b *IndexBuilder) addSymbols(symbols []*Symbol) { 413 for _, sym := range symbols { 414 b.symMetaData = append(b.symMetaData, 415 // This field was removed due to redundancy. To avoid 416 // needing to reindex, it is set to zero for now. In the 417 // future, this field will be completely removed. It 418 // will require incrementing the feature version. 419 0, 420 b.symbolKindID(sym.Kind), 421 b.symbolID(sym.Parent), 422 b.symbolKindID(sym.ParentKind)) 423 } 424} 425 426// Add a file which only occurs in certain branches. 427func (b *IndexBuilder) Add(doc Document) error { 428 hasher := crc64.New(crc64.MakeTable(crc64.ISO)) 429 430 if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { 431 doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) 432 doc.Language = "binary" 433 } 434 435 if doc.SkipReason != "" { 436 doc.Content = []byte(notIndexedMarker + doc.SkipReason) 437 doc.Symbols = nil 438 doc.SymbolsMetaData = nil 439 if doc.Language == "" { 440 doc.Language = "skipped" 441 } 442 } 443 444 if doc.Language == "" { 445 c := doc.Content 446 // classifier is faster on small files without losing much accuracy 447 if len(c) > 2048 { 448 c = c[:2048] 449 } 450 doc.Language = enry.GetLanguage(doc.Name, c) 451 } 452 453 sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData}) 454 var last DocumentSection 455 for i, s := range doc.Symbols { 456 if i > 0 { 457 if last.End > s.Start { 458 return fmt.Errorf("sections overlap") 459 } 460 } 461 last = s 462 } 463 if last.End > uint32(len(doc.Content)) { 464 return fmt.Errorf("section goes past end of content") 465 } 466 467 if doc.SubRepositoryPath != "" { 468 rel, err := filepath.Rel(doc.SubRepositoryPath, doc.Name) 469 if err != nil || rel == doc.Name { 470 return fmt.Errorf("path %q must start subrepo path %q", doc.Name, doc.SubRepositoryPath) 471 } 472 } 473 docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols) 474 if err != nil { 475 return err 476 } 477 nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil) 478 if err != nil { 479 return err 480 } 481 b.addSymbols(doc.SymbolsMetaData) 482 483 repoIdx := len(b.repoList) - 1 484 subRepoIdx, ok := b.subRepoIndices[repoIdx][doc.SubRepositoryPath] 485 if !ok { 486 return fmt.Errorf("unknown subrepo path %q", doc.SubRepositoryPath) 487 } 488 489 var mask uint64 490 for _, br := range doc.Branches { 491 m := b.branchMask(br) 492 if m == 0 { 493 return fmt.Errorf("no branch found for %s", br) 494 } 495 mask |= m 496 } 497 498 if repoIdx > 1<<16 { 499 return fmt.Errorf("too many repos in shard: max is %d", 1<<16) 500 } 501 502 b.subRepos = append(b.subRepos, subRepoIdx) 503 b.repos = append(b.repos, uint16(repoIdx)) 504 505 // doc.Ranks might be nil. In case we don't use offline ranking, doc.Ranks is 506 // always nil. 507 b.ranks = append(b.ranks, doc.Ranks) 508 509 hasher.Write(doc.Content) 510 511 b.contentStrings = append(b.contentStrings, docStr) 512 b.runeDocSections = append(b.runeDocSections, runeSecs...) 513 514 b.nameStrings = append(b.nameStrings, nameStr) 515 b.docSections = append(b.docSections, doc.Symbols) 516 b.fileEndSymbol = append(b.fileEndSymbol, uint32(len(b.runeDocSections))) 517 b.branchMasks = append(b.branchMasks, mask) 518 b.checksums = append(b.checksums, hasher.Sum(nil)...) 519 520 langCode, ok := b.languageMap[doc.Language] 521 if !ok { 522 if len(b.languageMap) >= 65535 { 523 return fmt.Errorf("too many languages") 524 } 525 langCode = uint16(len(b.languageMap)) 526 b.languageMap[doc.Language] = langCode 527 } 528 b.languages = append(b.languages, uint8(langCode), uint8(langCode>>8)) 529 530 return nil 531} 532 533func (b *IndexBuilder) branchMask(br string) uint64 { 534 for i, b := range b.repoList[len(b.repoList)-1].Branches { 535 if b.Name == br { 536 return uint64(1) << uint(i) 537 } 538 } 539 return 0 540}