fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "sort" 23 "strings" 24 "time" 25 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt" 30 "github.com/sourcegraph/zoekt/internal/tenant" 31 "github.com/sourcegraph/zoekt/query" 32) 33 34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 35// repository names fulfill the predicate, Const(false) if none of them do, and q 36// otherwise. 37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q { 38 count := 0 39 alive := len(d.repoMetaData) 40 for i := range d.repoMetaData { 41 if d.repoMetaData[i].Tombstone { 42 alive-- 43 } else if predicate(&d.repoMetaData[i]) { 44 count++ 45 } 46 } 47 if count == alive { 48 return &query.Const{Value: true} 49 } 50 if count > 0 { 51 return q 52 } 53 return &query.Const{Value: false} 54} 55 56func (d *indexData) simplify(in query.Q) query.Q { 57 eval := query.Map(in, func(q query.Q) query.Q { 58 switch r := q.(type) { 59 case *query.Repo: 60 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 61 return r.Regexp.MatchString(repo.Name) 62 }) 63 case *query.RepoRegexp: 64 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 65 return r.Regexp.MatchString(repo.Name) 66 }) 67 case *query.BranchesRepos: 68 for i := range d.repoMetaData { 69 for _, br := range r.List { 70 if br.Repos.Contains(d.repoMetaData[i].ID) { 71 return q 72 } 73 } 74 } 75 return &query.Const{Value: false} 76 case *query.RepoSet: 77 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 78 return r.Set[repo.Name] 79 }) 80 case query.RawConfig: 81 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) }) 82 case *query.RepoIDs: 83 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 84 return r.Repos.Contains(repo.ID) 85 }) 86 case *query.Language: 87 _, has := d.metaData.LanguageMap[r.Language] 88 if !has && d.metaData.IndexFeatureVersion < 12 { 89 // For index files that haven't been re-indexed by go-enry, 90 // fall back to file-based matching and continue even if this 91 // repo doesn't have the specific language present. 92 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 93 if extsForLang != nil { 94 extFrags := make([]string, 0, len(extsForLang)) 95 for _, ext := range extsForLang { 96 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 97 } 98 if len(extFrags) > 0 { 99 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 100 // inlined copy of query.regexpQuery 101 re, err := syntax.Parse(pattern, syntax.Perl) 102 if err != nil { 103 return &query.Const{Value: false} 104 } 105 if re.Op == syntax.OpLiteral { 106 return &query.Substring{ 107 Pattern: string(re.Rune), 108 FileName: true, 109 } 110 } 111 return &query.Regexp{ 112 Regexp: re, 113 FileName: true, 114 } 115 } 116 } 117 } 118 if !has { 119 return &query.Const{Value: false} 120 } 121 } 122 return q 123 }) 124 return query.Simplify(eval) 125} 126 127func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) { 128 timer := newTimer() 129 130 copyOpts := *opts 131 opts = &copyOpts 132 opts.SetDefaults() 133 134 var res zoekt.SearchResult 135 if len(d.fileNameIndex) == 0 { 136 return &res, nil 137 } 138 139 select { 140 case <-ctx.Done(): 141 res.Stats.ShardsSkipped++ 142 return &res, nil 143 default: 144 } 145 146 q = d.simplify(q) 147 if c, ok := q.(*query.Const); ok && !c.Value { 148 return &res, nil 149 } 150 151 if opts.EstimateDocCount { 152 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 153 return &res, nil 154 } 155 156 q = query.Map(q, query.ExpandFileContent) 157 158 mt, err := d.newMatchTree(q, matchTreeOpt{}) 159 if err != nil { 160 return nil, err 161 } 162 163 // Capture the costs of construction before pruning 164 updateMatchTreeStats(mt, &res.Stats) 165 166 mt, err = pruneMatchTree(mt) 167 if err != nil { 168 return nil, err 169 } 170 res.Stats.MatchTreeConstruction = timer.Elapsed() 171 if mt == nil { 172 res.Stats.ShardsSkippedFilter++ 173 return &res, nil 174 } 175 176 res.Stats.ShardsScanned++ 177 178 cp := &contentProvider{ 179 id: d, 180 stats: &res.Stats, 181 } 182 183 // Track the number of documents found in a repository for 184 // ShardRepoMaxMatchCount 185 var ( 186 lastRepoID uint16 187 repoMatchCount int 188 ) 189 190 docCount := uint32(len(d.fileBranchMasks)) 191 lastDoc := int(-1) 192 193nextFileMatch: 194 for { 195 canceled := false 196 select { 197 case <-ctx.Done(): 198 canceled = true 199 default: 200 } 201 202 nextDoc := mt.nextDoc() 203 if int(nextDoc) <= lastDoc { 204 nextDoc = uint32(lastDoc + 1) 205 } 206 207 for ; nextDoc < docCount; nextDoc++ { 208 repoID := d.repos[nextDoc] 209 repoMetadata := &d.repoMetaData[repoID] 210 211 // Skip tombstoned repositories 212 if repoMetadata.Tombstone { 213 continue 214 } 215 216 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is 217 // necessary to prevent leaking data across tenants. 218 if !tenant.HasAccess(ctx, repoMetadata.TenantID) { 219 continue 220 } 221 222 // Skip documents that are tombstoned 223 if len(repoMetadata.FileTombstones) > 0 { 224 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 225 continue 226 } 227 } 228 229 // Skip documents over ShardRepoMaxMatchCount if specified. 230 if opts.ShardRepoMaxMatchCount > 0 { 231 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 232 res.Stats.FilesSkipped++ 233 continue 234 } 235 } 236 237 break 238 } 239 240 if nextDoc >= docCount { 241 break 242 } 243 244 lastDoc = int(nextDoc) 245 246 // We track lastRepoID for ShardRepoMaxMatchCount 247 if lastRepoID != d.repos[nextDoc] { 248 lastRepoID = d.repos[nextDoc] 249 repoMatchCount = 0 250 } 251 252 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 253 res.Stats.FilesSkipped += int(docCount - nextDoc) 254 break 255 } 256 257 res.Stats.FilesConsidered++ 258 mt.prepare(nextDoc) 259 260 cp.setDocument(nextDoc) 261 262 known := make(map[matchTree]bool) 263 md := d.repoMetaData[d.repos[nextDoc]] 264 265 for cost := costMin; cost <= costMax; cost++ { 266 switch evalMatchTree(cp, cost, known, mt) { 267 case matchesRequiresHigherCost: 268 if cost == costMax { 269 log.Panicf("did not decide. Repo %s, doc %d, known %v", 270 md.Name, nextDoc, known) 271 } 272 case matchesFound: 273 // could short-circuit now, but we want to run higher costs to 274 // potentially find higher ranked matches. 275 case matchesNone: 276 continue nextFileMatch 277 } 278 } 279 280 fileMatch := zoekt.FileMatch{ 281 Repository: md.Name, 282 RepositoryID: md.ID, 283 RepositoryPriority: md.GetPriority(), 284 FileName: string(d.fileName(nextDoc)), 285 Checksum: d.getChecksum(nextDoc), 286 Language: d.languageMap[d.getLanguage(nextDoc)], 287 } 288 289 if s := d.subRepos[nextDoc]; s > 0 { 290 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 291 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 292 } 293 path := d.subRepoPaths[d.repos[nextDoc]][s] 294 fileMatch.SubRepositoryPath = path 295 sr := md.SubRepoMap[path] 296 fileMatch.SubRepositoryName = sr.Name 297 if idx := d.branchIndex(nextDoc); idx >= 0 { 298 fileMatch.Version = sr.Branches[idx].Version 299 } 300 } else { 301 idx := d.branchIndex(nextDoc) 302 if idx >= 0 { 303 fileMatch.Version = md.Branches[idx].Version 304 } 305 } 306 307 // Important invariant for performance: finalCands is sorted by offset and 308 // non-overlapping. gatherMatches respects this invariant and all later 309 // transformations respect this. 310 finalCands := d.gatherMatches(nextDoc, mt, known) 311 312 if opts.ChunkMatches { 313 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 314 } else { 315 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 316 } 317 318 if opts.UseBM25Scoring { 319 d.scoreFileBM25(&fileMatch, nextDoc, finalCands, cp, opts) 320 } else { 321 // Use the standard, non-experimental scoring method by default 322 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 323 } 324 325 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 326 sortMatchesByScore(fileMatch.LineMatches) 327 sortChunkMatchesByScore(fileMatch.ChunkMatches) 328 if opts.Whole { 329 fileMatch.Content = cp.data(false) 330 } 331 332 matchedChunkRanges := 0 333 for _, cm := range fileMatch.ChunkMatches { 334 matchedChunkRanges += len(cm.Ranges) 335 } 336 337 repoMatchCount += len(fileMatch.LineMatches) 338 repoMatchCount += matchedChunkRanges 339 340 res.Files = append(res.Files, fileMatch) 341 342 res.Stats.MatchCount += len(fileMatch.LineMatches) 343 res.Stats.MatchCount += matchedChunkRanges 344 res.Stats.FileCount++ 345 } 346 347 for _, md := range d.repoMetaData { 348 r := md 349 addRepo(&res, &r) 350 for _, v := range r.SubRepoMap { 351 addRepo(&res, v) 352 } 353 } 354 355 // Update stats based on work done during document search. 356 updateMatchTreeStats(mt, &res.Stats) 357 358 res.Stats.MatchTreeSearch = timer.Elapsed() 359 360 return &res, nil 361} 362 363func addRepo(res *zoekt.SearchResult, repo *zoekt.Repository) { 364 if res.RepoURLs == nil { 365 res.RepoURLs = map[string]string{} 366 } 367 res.RepoURLs[repo.Name] = repo.FileURLTemplate 368 369 if res.LineFragments == nil { 370 res.LineFragments = map[string]string{} 371 } 372 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 373} 374 375// Gather matches from this document. The matches are returned in document 376// order and are non-overlapping. All filename and content matches are 377// returned, with filename matches first. 378// 379// If `merge` is set, overlapping and adjacent matches will be merged 380// into a single index. Otherwise, overlapping matches will be removed, 381// but adjacent matches will remain. 382func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch { 383 var cands []*candidateMatch 384 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 385 if smt, ok := mt.(*substrMatchTree); ok { 386 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) 387 } 388 if rmt, ok := mt.(*regexpMatchTree); ok { 389 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 390 } 391 if rmt, ok := mt.(*wordMatchTree); ok { 392 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 393 } 394 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 395 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) 396 } 397 }) 398 399 // If we found no candidate matches at all, assume there must have been a match on filename. 400 if len(cands) == 0 { 401 nm := d.fileName(nextDoc) 402 return []*candidateMatch{{ 403 caseSensitive: false, 404 fileName: true, 405 substrBytes: nm, 406 substrLowered: nm, 407 file: nextDoc, 408 runeOffset: 0, 409 byteOffset: 0, 410 byteMatchSz: uint32(len(nm)), 411 }} 412 } 413 414 // Remove overlapping candidates. This guarantees that the matches 415 // are non-overlapping, but also preserves expected match counts. 416 sort.Sort((sortByOffsetSlice)(cands)) 417 res := cands[:0] 418 for i, c := range cands { 419 if i == 0 { 420 res = append(res, c) 421 continue 422 } 423 424 last := res[len(res)-1] 425 426 // Never compare filename and content matches 427 if last.fileName != c.fileName { 428 res = append(res, c) 429 continue 430 } 431 432 // Only add the match if its range doesn't overlap 433 lastEnd := last.byteOffset + last.byteMatchSz 434 if lastEnd <= c.byteOffset { 435 res = append(res, c) 436 continue 437 } 438 } 439 return res 440} 441 442type sortByOffsetSlice []*candidateMatch 443 444func (m sortByOffsetSlice) Len() int { return len(m) } 445func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 446func (m sortByOffsetSlice) Less(i, j int) bool { 447 // Sort all filename matches to the start 448 if m[i].fileName != m[j].fileName { 449 return m[i].fileName 450 } 451 452 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 453 // Prefer longer candidates if starting at same position 454 return m[i].byteMatchSz > m[j].byteMatchSz 455 } 456 return m[i].byteOffset < m[j].byteOffset 457} 458 459// setScoreWeight is a helper used by gatherMatches to set the weight based on 460// the score weight of the matchTree. 461func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 462 for _, m := range cm { 463 m.scoreWeight = scoreWeight 464 } 465 return cm 466} 467 468func (d *indexData) branchIndex(docID uint32) int { 469 mask := d.fileBranchMasks[docID] 470 idx := 0 471 for mask != 0 { 472 if mask&0x1 != 0 { 473 return idx 474 } 475 idx++ 476 mask >>= 1 477 } 478 return -1 479} 480 481// gatherBranches returns a list of branch names taking into account any branch 482// filters in the query. If the query contains a branch filter, it returns all 483// branches containing the docID and matching the branch filter. Otherwise, it 484// returns all branches containing docID. 485func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 486 var mask uint64 487 visitMatchAtoms(mt, known, func(mt matchTree) { 488 bq, ok := mt.(*branchQueryMatchTree) 489 if !ok { 490 return 491 } 492 493 mask = mask | bq.branchMask() 494 }) 495 496 if mask == 0 { 497 mask = d.fileBranchMasks[docID] 498 } 499 500 var branches []string 501 id := uint32(1) 502 branchNames := d.branchNames[d.repos[docID]] 503 for mask != 0 { 504 if mask&0x1 != 0 { 505 branches = append(branches, branchNames[uint(id)]) 506 } 507 id <<= 1 508 mask >>= 1 509 } 510 511 return branches 512} 513 514func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) { 515 var include func(rle *zoekt.RepoListEntry) bool 516 517 q = d.simplify(q) 518 if c, ok := q.(*query.Const); ok { 519 if !c.Value { 520 return &zoekt.RepoList{}, nil 521 } 522 include = func(rle *zoekt.RepoListEntry) bool { 523 return true 524 } 525 } else { 526 sr, err := d.Search(ctx, q, &zoekt.SearchOptions{ 527 ShardRepoMaxMatchCount: 1, 528 }) 529 if err != nil { 530 return nil, err 531 } 532 533 foundRepos := make(map[string]struct{}, len(sr.Files)) 534 for _, file := range sr.Files { 535 foundRepos[file.Repository] = struct{}{} 536 } 537 538 include = func(rle *zoekt.RepoListEntry) bool { 539 _, ok := foundRepos[rle.Repository.Name] 540 return ok 541 } 542 } 543 544 var l zoekt.RepoList 545 546 field, err := opts.GetField() 547 if err != nil { 548 return nil, err 549 } 550 switch field { 551 case zoekt.RepoListFieldRepos: 552 l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry)) 553 case zoekt.RepoListFieldReposMap: 554 l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry)) 555 } 556 557 for i := range d.repoListEntry { 558 if d.repoMetaData[i].Tombstone { 559 continue 560 } 561 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is 562 // necessary to prevent leaking data across tenants. 563 if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) { 564 continue 565 } 566 rle := &d.repoListEntry[i] 567 if !include(rle) { 568 continue 569 } 570 571 l.Stats.Add(&rle.Stats) 572 573 // Backwards compat for when ID is missing 574 if rle.Repository.ID == 0 { 575 l.Repos = append(l.Repos, rle) 576 continue 577 } 578 579 switch field { 580 case zoekt.RepoListFieldRepos: 581 l.Repos = append(l.Repos, rle) 582 case zoekt.RepoListFieldReposMap: 583 l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{ 584 HasSymbols: rle.Repository.HasSymbols, 585 Branches: rle.Repository.Branches, 586 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 587 } 588 } 589 590 } 591 592 // Only one of these fields is populated and in all cases the size of that 593 // field is the number of Repos in this shard. 594 l.Stats.Repos = len(l.Repos) + len(l.ReposMap) 595 596 return &l, nil 597} 598 599// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 600// mt is equivalent to the input r, isEqual = true and the matchTree can be used 601// in place of the regex r. If singleLine = true, then the matchTree and all 602// its children only match terms on the same line. singleLine is used during 603// recursion to decide whether to return an andLineMatchTree (singleLine = true) 604// or a andMatchTree (singleLine = false). 605func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 606 // TODO - we could perhaps transform Begin/EndText in '\n'? 607 // TODO - we could perhaps transform CharClass in (OrQuery ) 608 // if there are just a few runes, and part of a OpConcat? 609 switch r.Op { 610 case syntax.OpLiteral: 611 s := string(r.Rune) 612 if len(s) >= minTextSize { 613 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase) 614 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive}) 615 return mt, true, !strings.Contains(s, "\n"), err 616 } 617 case syntax.OpCapture: 618 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 619 620 case syntax.OpPlus: 621 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 622 623 case syntax.OpRepeat: 624 if r.Min == 1 { 625 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 626 } else if r.Min > 1 { 627 // (x){2,} can't be expressed precisely by the matchTree 628 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 629 return mt, false, singleLine, err 630 } 631 case syntax.OpConcat, syntax.OpAlternate: 632 var qs []matchTree 633 isEq := true 634 singleLine = true 635 for _, sr := range r.Sub { 636 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 637 if err != nil { 638 return nil, false, false, err 639 } 640 isEq = isEq && subIsEq 641 singleLine = singleLine && subSingleLine 642 qs = append(qs, sq) 643 } 644 } 645 if r.Op == syntax.OpConcat { 646 if len(qs) > 1 { 647 isEq = false 648 } 649 newQs := make([]matchTree, 0, len(qs)) 650 for _, q := range qs { 651 if _, ok := q.(*bruteForceMatchTree); ok { 652 continue 653 } 654 newQs = append(newQs, q) 655 } 656 if len(newQs) == 1 { 657 return newQs[0], isEq, singleLine, nil 658 } 659 if len(newQs) == 0 { 660 return &bruteForceMatchTree{}, isEq, singleLine, nil 661 } 662 if singleLine { 663 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 664 } 665 return &andMatchTree{newQs}, isEq, singleLine, nil 666 } 667 for _, q := range qs { 668 if _, ok := q.(*bruteForceMatchTree); ok { 669 return q, isEq, false, nil 670 } 671 } 672 if len(qs) == 0 { 673 return &noMatchTree{Why: "const"}, isEq, false, nil 674 } 675 return &orMatchTree{qs}, isEq, false, nil 676 case syntax.OpStar: 677 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 678 return &bruteForceMatchTree{}, false, true, nil 679 } 680 } 681 return &bruteForceMatchTree{}, false, false, nil 682} 683 684type timer struct { 685 last time.Time 686} 687 688func newTimer() *timer { 689 return &timer{ 690 last: time.Now(), 691 } 692} 693 694func (t *timer) Elapsed() time.Duration { 695 now := time.Now() 696 d := now.Sub(t.last) 697 t.last = now 698 return d 699}