fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at tngl 20 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "regexp/syntax" 22 "sort" 23 "strings" 24 "time" 25 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt" 30 "github.com/sourcegraph/zoekt/internal/tenant" 31 "github.com/sourcegraph/zoekt/query" 32) 33 34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 35// repository names fulfill the predicate, Const(false) if none of them do, and q 36// otherwise. 37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q { 38 count := 0 39 alive := len(d.repoMetaData) 40 for i := range d.repoMetaData { 41 if d.repoMetaData[i].Tombstone { 42 alive-- 43 } else if predicate(&d.repoMetaData[i]) { 44 count++ 45 } 46 } 47 if count == alive { 48 return &query.Const{Value: true} 49 } 50 if count > 0 { 51 return q 52 } 53 return &query.Const{Value: false} 54} 55 56func (d *indexData) simplify(in query.Q) query.Q { 57 eval := query.Map(in, func(q query.Q) query.Q { 58 switch r := q.(type) { 59 case *query.Repo: 60 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 61 return r.Regexp.MatchString(repo.Name) 62 }) 63 case *query.RepoRegexp: 64 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 65 return r.Regexp.MatchString(repo.Name) 66 }) 67 case *query.BranchesRepos: 68 for i := range d.repoMetaData { 69 for _, br := range r.List { 70 if br.Repos.Contains(d.repoMetaData[i].ID) { 71 return q 72 } 73 } 74 } 75 return &query.Const{Value: false} 76 case *query.RepoSet: 77 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 78 return r.Set[repo.Name] 79 }) 80 case query.RawConfig: 81 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) }) 82 case *query.RepoIDs: 83 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 84 return r.Repos.Contains(repo.ID) 85 }) 86 case *query.Language: 87 _, has := d.metaData.LanguageMap[r.Language] 88 if !has && d.metaData.IndexFeatureVersion < 12 { 89 // For index files that haven't been re-indexed by go-enry, 90 // fall back to file-based matching and continue even if this 91 // repo doesn't have the specific language present. 92 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 93 if extsForLang != nil { 94 extFrags := make([]string, 0, len(extsForLang)) 95 for _, ext := range extsForLang { 96 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 97 } 98 if len(extFrags) > 0 { 99 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 100 // inlined copy of query.regexpQuery 101 re, err := syntax.Parse(pattern, syntax.Perl) 102 if err != nil { 103 return &query.Const{Value: false} 104 } 105 if re.Op == syntax.OpLiteral { 106 return &query.Substring{ 107 Pattern: string(re.Rune), 108 FileName: true, 109 } 110 } 111 return &query.Regexp{ 112 Regexp: re, 113 FileName: true, 114 } 115 } 116 } 117 } 118 if !has { 119 return &query.Const{Value: false} 120 } 121 case *query.Meta: 122 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { 123 if repo.Metadata == nil { 124 return false 125 } 126 v, ok := repo.Metadata[r.Field] 127 if !ok { 128 return false 129 } 130 return r.Value.MatchString(v) 131 }) 132 } 133 return q 134 }) 135 return query.Simplify(eval) 136} 137 138func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) { 139 timer := newTimer() 140 141 copyOpts := *opts 142 opts = &copyOpts 143 opts.SetDefaults() 144 145 var res zoekt.SearchResult 146 if len(d.fileNameIndex) == 0 { 147 return &res, nil 148 } 149 150 select { 151 case <-ctx.Done(): 152 res.Stats.ShardsSkipped++ 153 return &res, nil 154 default: 155 } 156 157 q = d.simplify(q) 158 if c, ok := q.(*query.Const); ok && !c.Value { 159 return &res, nil 160 } 161 162 if opts.EstimateDocCount { 163 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 164 return &res, nil 165 } 166 167 q = query.Map(q, query.ExpandFileContent) 168 169 mt, err := d.newMatchTree(q, matchTreeOpt{}) 170 if err != nil { 171 return nil, err 172 } 173 174 // Capture the costs of construction before pruning 175 updateMatchTreeStats(mt, &res.Stats) 176 177 mt, err = pruneMatchTree(mt) 178 if err != nil { 179 return nil, err 180 } 181 res.Stats.MatchTreeConstruction = timer.Elapsed() 182 if mt == nil { 183 res.Stats.ShardsSkippedFilter++ 184 return &res, nil 185 } 186 187 res.Stats.ShardsScanned++ 188 189 cp := &contentProvider{ 190 id: d, 191 stats: &res.Stats, 192 } 193 194 // Track the number of documents found in a repository for 195 // ShardRepoMaxMatchCount 196 var ( 197 lastRepoID uint16 198 repoMatchCount int 199 ) 200 201 docCount := uint32(len(d.fileBranchMasks)) 202 lastDoc := int(-1) 203 204nextFileMatch: 205 for { 206 canceled := false 207 select { 208 case <-ctx.Done(): 209 canceled = true 210 default: 211 } 212 213 nextDoc := mt.nextDoc() 214 if int(nextDoc) <= lastDoc { 215 nextDoc = uint32(lastDoc + 1) 216 } 217 218 for ; nextDoc < docCount; nextDoc++ { 219 repoID := d.repos[nextDoc] 220 repoMetadata := &d.repoMetaData[repoID] 221 222 // Skip tombstoned repositories 223 if repoMetadata.Tombstone { 224 continue 225 } 226 227 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is 228 // necessary to prevent leaking data across tenants. 229 if !tenant.HasAccess(ctx, repoMetadata.TenantID) { 230 continue 231 } 232 233 // Skip documents that are tombstoned 234 if len(repoMetadata.FileTombstones) > 0 { 235 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 236 continue 237 } 238 } 239 240 // Skip documents over ShardRepoMaxMatchCount if specified. 241 if opts.ShardRepoMaxMatchCount > 0 { 242 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 243 res.Stats.FilesSkipped++ 244 continue 245 } 246 } 247 248 break 249 } 250 251 if nextDoc >= docCount { 252 break 253 } 254 255 lastDoc = int(nextDoc) 256 257 // We track lastRepoID for ShardRepoMaxMatchCount 258 if lastRepoID != d.repos[nextDoc] { 259 lastRepoID = d.repos[nextDoc] 260 repoMatchCount = 0 261 } 262 263 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 264 res.Stats.FilesSkipped += int(docCount - nextDoc) 265 break 266 } 267 268 res.Stats.FilesConsidered++ 269 mt.prepare(nextDoc) 270 271 cp.setDocument(nextDoc) 272 273 known := make(map[matchTree]bool) 274 md := d.repoMetaData[d.repos[nextDoc]] 275 276 for cost := costMin; cost <= costMax; cost++ { 277 switch evalMatchTree(cp, cost, known, mt) { 278 case matchesRequiresHigherCost: 279 if cost == costMax { 280 log.Panicf("did not decide. Repo %s, doc %d, known %v", 281 md.Name, nextDoc, known) 282 } 283 case matchesFound: 284 // could short-circuit now, but we want to run higher costs to 285 // potentially find higher ranked matches. 286 case matchesNone: 287 continue nextFileMatch 288 } 289 } 290 291 fileMatch := zoekt.FileMatch{ 292 Repository: md.URL, 293 RepositoryID: md.ID, 294 RepositoryPriority: md.GetPriority(), 295 FileName: string(d.fileName(nextDoc)), 296 Checksum: d.getChecksum(nextDoc), 297 Language: d.languageMap[d.getLanguage(nextDoc)], 298 } 299 300 if s := d.subRepos[nextDoc]; s > 0 { 301 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 302 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 303 } 304 path := d.subRepoPaths[d.repos[nextDoc]][s] 305 fileMatch.SubRepositoryPath = path 306 sr := md.SubRepoMap[path] 307 fileMatch.SubRepositoryName = sr.Name 308 if idx := d.branchIndex(nextDoc); idx >= 0 { 309 fileMatch.Version = sr.Branches[idx].Version 310 } 311 } else { 312 idx := d.branchIndex(nextDoc) 313 if idx >= 0 { 314 fileMatch.Version = md.Branches[idx].Version 315 } 316 } 317 318 // Important invariant for performance: finalCands is sorted by offset and 319 // non-overlapping. gatherMatches respects this invariant and all later 320 // transformations respect this. 321 finalCands := d.gatherMatches(nextDoc, mt, known) 322 323 if opts.ChunkMatches { 324 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 325 } else { 326 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) 327 } 328 329 if opts.UseBM25Scoring { 330 d.scoreFileBM25(&fileMatch, nextDoc, finalCands, cp, opts) 331 } else { 332 // Use the standard, non-experimental scoring method by default 333 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 334 } 335 336 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 337 sortMatchesByScore(fileMatch.LineMatches) 338 sortChunkMatchesByScore(fileMatch.ChunkMatches) 339 if opts.Whole { 340 fileMatch.Content = cp.data(false) 341 } 342 343 matchedChunkRanges := 0 344 for _, cm := range fileMatch.ChunkMatches { 345 matchedChunkRanges += len(cm.Ranges) 346 } 347 348 repoMatchCount += len(fileMatch.LineMatches) 349 repoMatchCount += matchedChunkRanges 350 351 res.Files = append(res.Files, fileMatch) 352 353 res.Stats.MatchCount += len(fileMatch.LineMatches) 354 res.Stats.MatchCount += matchedChunkRanges 355 res.Stats.FileCount++ 356 } 357 358 for _, md := range d.repoMetaData { 359 r := md 360 // Key by URL: it is the unique repo identity and matches 361 // FileMatch.Repository (also set to the URL). Subrepos are keyed by 362 // their Name, matching FileMatch.SubRepositoryName. 363 addRepo(&res, r.URL, &r) 364 for _, v := range r.SubRepoMap { 365 addRepo(&res, v.Name, v) 366 } 367 } 368 369 // Update stats based on work done during document search. 370 updateMatchTreeStats(mt, &res.Stats) 371 372 res.Stats.MatchTreeSearch = timer.Elapsed() 373 374 return &res, nil 375} 376 377func addRepo(res *zoekt.SearchResult, key string, repo *zoekt.Repository) { 378 if res.RepoURLs == nil { 379 res.RepoURLs = map[string]string{} 380 } 381 res.RepoURLs[key] = repo.FileURLTemplate 382 383 if res.LineFragments == nil { 384 res.LineFragments = map[string]string{} 385 } 386 res.LineFragments[key] = repo.LineFragmentTemplate 387} 388 389// Gather matches from this document. The matches are returned in document 390// order and are non-overlapping. All filename and content matches are 391// returned, with filename matches first. 392// 393// If `merge` is set, overlapping and adjacent matches will be merged 394// into a single index. Otherwise, overlapping matches will be removed, 395// but adjacent matches will remain. 396func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch { 397 var cands []*candidateMatch 398 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 399 if smt, ok := mt.(*substrMatchTree); ok { 400 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) 401 } 402 if rmt, ok := mt.(*regexpMatchTree); ok { 403 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 404 } 405 if rmt, ok := mt.(*wordMatchTree); ok { 406 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 407 } 408 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 409 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) 410 } 411 }) 412 413 // If we found no candidate matches at all, assume there must have been a match on filename. 414 if len(cands) == 0 { 415 nm := d.fileName(nextDoc) 416 return []*candidateMatch{{ 417 caseSensitive: false, 418 fileName: true, 419 substrBytes: nm, 420 substrLowered: nm, 421 file: nextDoc, 422 runeOffset: 0, 423 byteOffset: 0, 424 byteMatchSz: uint32(len(nm)), 425 }} 426 } 427 428 // Remove overlapping candidates. This guarantees that the matches 429 // are non-overlapping, but also preserves expected match counts. 430 sort.Sort((sortByOffsetSlice)(cands)) 431 res := cands[:0] 432 for i, c := range cands { 433 if i == 0 { 434 res = append(res, c) 435 continue 436 } 437 438 last := res[len(res)-1] 439 440 // Never compare filename and content matches 441 if last.fileName != c.fileName { 442 res = append(res, c) 443 continue 444 } 445 446 // Only add the match if its range doesn't overlap 447 lastEnd := last.byteOffset + last.byteMatchSz 448 if lastEnd <= c.byteOffset { 449 res = append(res, c) 450 continue 451 } 452 } 453 return res 454} 455 456type sortByOffsetSlice []*candidateMatch 457 458func (m sortByOffsetSlice) Len() int { return len(m) } 459func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 460func (m sortByOffsetSlice) Less(i, j int) bool { 461 // Sort all filename matches to the start 462 if m[i].fileName != m[j].fileName { 463 return m[i].fileName 464 } 465 466 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 467 // Prefer longer candidates if starting at same position 468 return m[i].byteMatchSz > m[j].byteMatchSz 469 } 470 return m[i].byteOffset < m[j].byteOffset 471} 472 473// setScoreWeight is a helper used by gatherMatches to set the weight based on 474// the score weight of the matchTree. 475func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 476 for _, m := range cm { 477 m.scoreWeight = scoreWeight 478 } 479 return cm 480} 481 482func (d *indexData) branchIndex(docID uint32) int { 483 mask := d.fileBranchMasks[docID] 484 idx := 0 485 for mask != 0 { 486 if mask&0x1 != 0 { 487 return idx 488 } 489 idx++ 490 mask >>= 1 491 } 492 return -1 493} 494 495// gatherBranches returns a list of branch names taking into account any branch 496// filters in the query. If the query contains a branch filter, it returns all 497// branches containing the docID and matching the branch filter. Otherwise, it 498// returns all branches containing docID. 499func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 500 var mask uint64 501 visitMatchAtoms(mt, known, func(mt matchTree) { 502 bq, ok := mt.(*branchQueryMatchTree) 503 if !ok { 504 return 505 } 506 507 mask = mask | bq.branchMask() 508 }) 509 510 if mask == 0 { 511 mask = d.fileBranchMasks[docID] 512 } 513 514 var branches []string 515 id := uint64(1) 516 branchNames := d.branchNames[d.repos[docID]] 517 for mask != 0 { 518 if mask&0x1 != 0 { 519 branches = append(branches, branchNames[uint(id)]) 520 } 521 id <<= 1 522 mask >>= 1 523 } 524 525 return branches 526} 527 528func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) { 529 var include func(rle *zoekt.RepoListEntry) bool 530 531 q = d.simplify(q) 532 if c, ok := q.(*query.Const); ok { 533 if !c.Value { 534 return &zoekt.RepoList{}, nil 535 } 536 include = func(rle *zoekt.RepoListEntry) bool { 537 return true 538 } 539 } else { 540 sr, err := d.Search(ctx, q, &zoekt.SearchOptions{ 541 ShardRepoMaxMatchCount: 1, 542 }) 543 if err != nil { 544 return nil, err 545 } 546 547 foundRepos := make(map[string]struct{}, len(sr.Files)) 548 for _, file := range sr.Files { 549 foundRepos[file.Repository] = struct{}{} 550 } 551 552 include = func(rle *zoekt.RepoListEntry) bool { 553 _, ok := foundRepos[rle.Repository.URL] 554 return ok 555 } 556 } 557 558 var l zoekt.RepoList 559 560 field, err := opts.GetField() 561 if err != nil { 562 return nil, err 563 } 564 switch field { 565 case zoekt.RepoListFieldRepos: 566 l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry)) 567 case zoekt.RepoListFieldReposMap: 568 l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry)) 569 } 570 571 for i := range d.repoListEntry { 572 if d.repoMetaData[i].Tombstone { 573 continue 574 } 575 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is 576 // necessary to prevent leaking data across tenants. 577 if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) { 578 continue 579 } 580 rle := &d.repoListEntry[i] 581 if !include(rle) { 582 continue 583 } 584 585 l.Stats.Add(&rle.Stats) 586 587 // Backwards compat for when ID is missing 588 if rle.Repository.ID == 0 { 589 l.Repos = append(l.Repos, rle) 590 continue 591 } 592 593 switch field { 594 case zoekt.RepoListFieldRepos: 595 l.Repos = append(l.Repos, rle) 596 case zoekt.RepoListFieldReposMap: 597 l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{ 598 HasSymbols: rle.Repository.HasSymbols, 599 Branches: rle.Repository.Branches, 600 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 601 } 602 } 603 604 } 605 606 // Only one of these fields is populated and in all cases the size of that 607 // field is the number of Repos in this shard. 608 l.Stats.Repos = len(l.Repos) + len(l.ReposMap) 609 610 return &l, nil 611} 612 613// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 614// mt is equivalent to the input r, isEqual = true and the matchTree can be used 615// in place of the regex r. If singleLine = true, then the matchTree and all 616// its children only match terms on the same line. singleLine is used during 617// recursion to decide whether to return an andLineMatchTree (singleLine = true) 618// or a andMatchTree (singleLine = false). 619func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 620 // TODO - we could perhaps transform Begin/EndText in '\n'? 621 // TODO - we could perhaps transform CharClass in (OrQuery ) 622 // if there are just a few runes, and part of a OpConcat? 623 switch r.Op { 624 case syntax.OpLiteral: 625 s := string(r.Rune) 626 if len(s) >= minTextSize { 627 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase) 628 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive}) 629 return mt, true, !strings.Contains(s, "\n"), err 630 } 631 case syntax.OpCapture: 632 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 633 634 case syntax.OpPlus: 635 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 636 637 case syntax.OpRepeat: 638 if r.Min == 1 { 639 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 640 } else if r.Min > 1 { 641 // (x){2,} can't be expressed precisely by the matchTree 642 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 643 return mt, false, singleLine, err 644 } 645 case syntax.OpConcat, syntax.OpAlternate: 646 var qs []matchTree 647 isEq := true 648 singleLine = true 649 for _, sr := range r.Sub { 650 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 651 if err != nil { 652 return nil, false, false, err 653 } 654 isEq = isEq && subIsEq 655 singleLine = singleLine && subSingleLine 656 qs = append(qs, sq) 657 } 658 } 659 if r.Op == syntax.OpConcat { 660 if len(qs) > 1 { 661 isEq = false 662 } 663 newQs := make([]matchTree, 0, len(qs)) 664 for _, q := range qs { 665 if _, ok := q.(*bruteForceMatchTree); ok { 666 continue 667 } 668 newQs = append(newQs, q) 669 } 670 if len(newQs) == 1 { 671 return newQs[0], isEq, singleLine, nil 672 } 673 if len(newQs) == 0 { 674 return &bruteForceMatchTree{}, isEq, singleLine, nil 675 } 676 if singleLine { 677 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 678 } 679 return &andMatchTree{newQs}, isEq, singleLine, nil 680 } 681 for _, q := range qs { 682 if _, ok := q.(*bruteForceMatchTree); ok { 683 return q, isEq, false, nil 684 } 685 } 686 if len(qs) == 0 { 687 return &noMatchTree{Why: "const"}, isEq, false, nil 688 } 689 return &orMatchTree{qs}, isEq, false, nil 690 case syntax.OpStar: 691 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 692 return &bruteForceMatchTree{}, false, true, nil 693 } 694 } 695 return &bruteForceMatchTree{}, false, false, nil 696} 697 698type timer struct { 699 last time.Time 700} 701 702func newTimer() *timer { 703 return &timer{ 704 last: time.Now(), 705 } 706} 707 708func (t *timer) Elapsed() time.Duration { 709 now := time.Now() 710 d := now.Sub(t.last) 711 t.last = now 712 return d 713}