fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "sort" 24 "strconv" 25 "strings" 26 "time" 27 28 enry_data "github.com/go-enry/go-enry/v2/data" 29 "github.com/grafana/regexp" 30 31 "github.com/sourcegraph/zoekt/query" 32) 33 34const maxUInt16 = 0xffff 35 36// addScore increments the score of the FileMatch by the computed score. If 37// debugScore is true, it also adds a debug string to the FileMatch. If raw is 38// -1, it is ignored. Otherwise, it is added to the debug string. 39func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 40 if computed != 0 && debugScore { 41 var b strings.Builder 42 fmt.Fprintf(&b, "%s", what) 43 if raw != -1 { 44 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 45 } 46 fmt.Fprintf(&b, ":%.2f, ", computed) 47 m.Debug += b.String() 48 } 49 m.Score += computed 50} 51 52func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 53 if debugScore { 54 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 55 } 56 m.Score += score 57} 58 59// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 60// repository names fulfill the predicate, Const(false) if none of them do, and q 61// otherwise. 62func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 63 count := 0 64 alive := len(d.repoMetaData) 65 for i := range d.repoMetaData { 66 if d.repoMetaData[i].Tombstone { 67 alive-- 68 } else if predicate(&d.repoMetaData[i]) { 69 count++ 70 } 71 } 72 if count == alive { 73 return &query.Const{Value: true} 74 } 75 if count > 0 { 76 return q 77 } 78 return &query.Const{Value: false} 79} 80 81func (d *indexData) simplify(in query.Q) query.Q { 82 eval := query.Map(in, func(q query.Q) query.Q { 83 switch r := q.(type) { 84 case *query.Repo: 85 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 86 return r.Regexp.MatchString(repo.Name) 87 }) 88 case *query.RepoRegexp: 89 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 90 return r.Regexp.MatchString(repo.Name) 91 }) 92 case *query.BranchesRepos: 93 for i := range d.repoMetaData { 94 for _, br := range r.List { 95 if br.Repos.Contains(d.repoMetaData[i].ID) { 96 return q 97 } 98 } 99 } 100 return &query.Const{Value: false} 101 case *query.RepoSet: 102 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 103 return r.Set[repo.Name] 104 }) 105 case *query.RepoIDs: 106 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 107 return r.Repos.Contains(repo.ID) 108 }) 109 case *query.Language: 110 _, has := d.metaData.LanguageMap[r.Language] 111 if !has && d.metaData.IndexFeatureVersion < 12 { 112 // For index files that haven't been re-indexed by go-enry, 113 // fall back to file-based matching and continue even if this 114 // repo doesn't have the specific language present. 115 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 116 if extsForLang != nil { 117 extFrags := make([]string, 0, len(extsForLang)) 118 for _, ext := range extsForLang { 119 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 120 } 121 if len(extFrags) > 0 { 122 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 123 // inlined copy of query.regexpQuery 124 re, err := syntax.Parse(pattern, syntax.Perl) 125 if err != nil { 126 return &query.Const{Value: false} 127 } 128 if re.Op == syntax.OpLiteral { 129 return &query.Substring{ 130 Pattern: string(re.Rune), 131 FileName: true, 132 } 133 } 134 return &query.Regexp{ 135 Regexp: re, 136 FileName: true, 137 } 138 } 139 } 140 } 141 if !has { 142 return &query.Const{Value: false} 143 } 144 } 145 return q 146 }) 147 return query.Simplify(eval) 148} 149 150func (o *SearchOptions) SetDefaults() { 151 if o.ShardMaxMatchCount == 0 { 152 // We cap the total number of matches, so overly broad 153 // searches don't crash the machine. 154 o.ShardMaxMatchCount = 100000 155 } 156 if o.TotalMaxMatchCount == 0 { 157 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 158 } 159} 160 161func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { 162 timer := newTimer() 163 164 copyOpts := *opts 165 opts = &copyOpts 166 opts.SetDefaults() 167 168 var res SearchResult 169 if len(d.fileNameIndex) == 0 { 170 return &res, nil 171 } 172 173 select { 174 case <-ctx.Done(): 175 res.Stats.ShardsSkipped++ 176 return &res, nil 177 default: 178 } 179 180 q = d.simplify(q) 181 if c, ok := q.(*query.Const); ok && !c.Value { 182 return &res, nil 183 } 184 185 if opts.EstimateDocCount { 186 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 187 return &res, nil 188 } 189 190 q = query.Map(q, query.ExpandFileContent) 191 192 mt, err := d.newMatchTree(q, matchTreeOpt{}) 193 if err != nil { 194 return nil, err 195 } 196 197 // Capture the costs of construction before pruning 198 updateMatchTreeStats(mt, &res.Stats) 199 200 mt, err = pruneMatchTree(mt) 201 if err != nil { 202 return nil, err 203 } 204 res.Stats.MatchTreeConstruction = timer.Elapsed() 205 if mt == nil { 206 res.Stats.ShardsSkippedFilter++ 207 return &res, nil 208 } 209 210 res.Stats.ShardsScanned++ 211 212 cp := &contentProvider{ 213 id: d, 214 stats: &res.Stats, 215 } 216 217 // Track the number of documents found in a repository for 218 // ShardRepoMaxMatchCount 219 var ( 220 lastRepoID uint16 221 repoMatchCount int 222 ) 223 224 docCount := uint32(len(d.fileBranchMasks)) 225 lastDoc := int(-1) 226 227nextFileMatch: 228 for { 229 canceled := false 230 select { 231 case <-ctx.Done(): 232 canceled = true 233 default: 234 } 235 236 nextDoc := mt.nextDoc() 237 if int(nextDoc) <= lastDoc { 238 nextDoc = uint32(lastDoc + 1) 239 } 240 241 for ; nextDoc < docCount; nextDoc++ { 242 repoID := d.repos[nextDoc] 243 repoMetadata := &d.repoMetaData[repoID] 244 245 // Skip tombstoned repositories 246 if repoMetadata.Tombstone { 247 continue 248 } 249 250 // Skip documents that are tombstoned 251 if len(repoMetadata.FileTombstones) > 0 { 252 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 253 continue 254 } 255 } 256 257 // Skip documents over ShardRepoMaxMatchCount if specified. 258 if opts.ShardRepoMaxMatchCount > 0 { 259 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 260 res.Stats.FilesSkipped++ 261 continue 262 } 263 } 264 265 break 266 } 267 268 if nextDoc >= docCount { 269 break 270 } 271 272 lastDoc = int(nextDoc) 273 274 // We track lastRepoID for ShardRepoMaxMatchCount 275 if lastRepoID != d.repos[nextDoc] { 276 lastRepoID = d.repos[nextDoc] 277 repoMatchCount = 0 278 } 279 280 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 281 res.Stats.FilesSkipped += int(docCount - nextDoc) 282 break 283 } 284 285 res.Stats.FilesConsidered++ 286 mt.prepare(nextDoc) 287 288 cp.setDocument(nextDoc) 289 290 known := make(map[matchTree]bool) 291 md := d.repoMetaData[d.repos[nextDoc]] 292 293 for cost := costMin; cost <= costMax; cost++ { 294 switch evalMatchTree(cp, cost, known, mt) { 295 case matchesRequiresHigherCost: 296 if cost == costMax { 297 log.Panicf("did not decide. Repo %s, doc %d, known %v", 298 md.Name, nextDoc, known) 299 } 300 case matchesFound: 301 // could short-circuit now, but we want to run higher costs to 302 // potentially find higher ranked matches. 303 case matchesNone: 304 continue nextFileMatch 305 } 306 } 307 308 fileMatch := FileMatch{ 309 Repository: md.Name, 310 RepositoryID: md.ID, 311 RepositoryPriority: md.priority, 312 FileName: string(d.fileName(nextDoc)), 313 Checksum: d.getChecksum(nextDoc), 314 Language: d.languageMap[d.getLanguage(nextDoc)], 315 } 316 317 if s := d.subRepos[nextDoc]; s > 0 { 318 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 319 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 320 } 321 path := d.subRepoPaths[d.repos[nextDoc]][s] 322 fileMatch.SubRepositoryPath = path 323 sr := md.SubRepoMap[path] 324 fileMatch.SubRepositoryName = sr.Name 325 if idx := d.branchIndex(nextDoc); idx >= 0 { 326 fileMatch.Version = sr.Branches[idx].Version 327 } 328 } else { 329 idx := d.branchIndex(nextDoc) 330 if idx >= 0 { 331 fileMatch.Version = md.Branches[idx].Version 332 } 333 } 334 335 // Important invariant for performance: finalCands is sorted by offset and 336 // non-overlapping. gatherMatches respects this invariant and all later 337 // transformations respect this. 338 shouldMergeMatches := !opts.ChunkMatches 339 finalCands := gatherMatches(mt, known, shouldMergeMatches) 340 341 if len(finalCands) == 0 { 342 nm := d.fileName(nextDoc) 343 finalCands = append(finalCands, 344 &candidateMatch{ 345 caseSensitive: false, 346 fileName: true, 347 substrBytes: nm, 348 substrLowered: nm, 349 file: nextDoc, 350 runeOffset: 0, 351 byteOffset: 0, 352 byteMatchSz: uint32(len(nm)), 353 }) 354 } 355 356 if opts.ChunkMatches { 357 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 358 } else { 359 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 360 } 361 362 if opts.UseKeywordScoring { 363 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 364 } else { 365 // Use the standard, non-experimental scoring method by default 366 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 367 } 368 369 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 370 sortMatchesByScore(fileMatch.LineMatches) 371 sortChunkMatchesByScore(fileMatch.ChunkMatches) 372 if opts.Whole { 373 fileMatch.Content = cp.data(false) 374 } 375 376 matchedChunkRanges := 0 377 for _, cm := range fileMatch.ChunkMatches { 378 matchedChunkRanges += len(cm.Ranges) 379 } 380 381 repoMatchCount += len(fileMatch.LineMatches) 382 repoMatchCount += matchedChunkRanges 383 384 if opts.DebugScore { 385 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug) 386 } 387 388 res.Files = append(res.Files, fileMatch) 389 res.Stats.MatchCount += len(fileMatch.LineMatches) 390 res.Stats.MatchCount += matchedChunkRanges 391 res.Stats.FileCount++ 392 } 393 394 // We do not sort Files here, instead we rely on the shards pkg to do file 395 // ranking. If we sorted now, we would break the assumption that results 396 // from the same repo in a shard appear next to each other. 397 398 for _, md := range d.repoMetaData { 399 r := md 400 addRepo(&res, &r) 401 for _, v := range r.SubRepoMap { 402 addRepo(&res, v) 403 } 404 } 405 406 // Update stats based on work done during document search. 407 updateMatchTreeStats(mt, &res.Stats) 408 409 // If document ranking is enabled, then we can rank and truncate the files to save memory. 410 if opts.UseDocumentRanks { 411 res.Files = SortAndTruncateFiles(res.Files, opts) 412 } 413 414 res.Stats.MatchTreeSearch = timer.Elapsed() 415 416 return &res, nil 417} 418 419// scoreFile computes a score for the file match using various scoring signals, like 420// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 421func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 422 atomMatchCount := 0 423 visitMatchAtoms(mt, known, func(mt matchTree) { 424 atomMatchCount++ 425 }) 426 427 addScore := func(what string, computed float64) { 428 fileMatch.addScore(what, computed, -1, opts.DebugScore) 429 } 430 431 // atom-count boosts files with matches from more than 1 atom. The 432 // maximum boost is scoreFactorAtomMatch. 433 if atomMatchCount > 0 { 434 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore) 435 } 436 437 maxFileScore := 0.0 438 for i := range fileMatch.LineMatches { 439 if maxFileScore < fileMatch.LineMatches[i].Score { 440 maxFileScore = fileMatch.LineMatches[i].Score 441 } 442 443 // Order by ordering in file. 444 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 445 } 446 447 for i := range fileMatch.ChunkMatches { 448 if maxFileScore < fileMatch.ChunkMatches[i].Score { 449 maxFileScore = fileMatch.ChunkMatches[i].Score 450 } 451 452 // Order by ordering in file. 453 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 454 } 455 456 // Maintain ordering of input files. This 457 // strictly dominates the in-file ordering of 458 // the matches. 459 addScore("fragment", maxFileScore) 460 461 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 462 weight := scoreFileRankFactor 463 if opts.DocumentRanksWeight > 0.0 { 464 weight = opts.DocumentRanksWeight 465 } 466 467 ranks := d.ranks[doc] 468 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 469 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 470 // and we plan to clean this up. 471 if len(ranks) > 0 { 472 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 473 // cap it just in case to ensure it falls in the range [0, 1]. 474 normalized := math.Min(1.0, ranks[0]/32.0) 475 addScore("file-rank", weight*normalized) 476 } 477 } 478 479 md := d.repoMetaData[d.repos[doc]] 480 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) 481 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) 482 483 if opts.DebugScore { 484 fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ") 485 } 486} 487 488// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 489// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 490// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 491// 492// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 493// since BM25 is not normalized and can be tricky to combine with other scoring signals. 494func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 495 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 496 // sensitivity and treat filenames and symbols the same as content. 497 termFreqs := map[string]int{} 498 for _, cand := range cands { 499 term := string(cand.substrLowered) 500 termFreqs[term]++ 501 } 502 503 // Compute the file length ratio. Usually the calculation would be based on terms, but using 504 // bytes should work fine, as we're just computing a ratio. 505 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 506 numFiles := len(d.boundaries) 507 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 508 L := fileLength / averageFileLength 509 510 // Use standard parameter defaults (used in Lucene and academic papers) 511 k, b := 1.2, 0.75 512 sumTf := 0.0 // Just for debugging 513 score := 0.0 514 for _, freq := range termFreqs { 515 tf := float64(freq) 516 sumTf += tf 517 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 518 } 519 520 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 521} 522 523func addRepo(res *SearchResult, repo *Repository) { 524 if res.RepoURLs == nil { 525 res.RepoURLs = map[string]string{} 526 } 527 res.RepoURLs[repo.Name] = repo.FileURLTemplate 528 529 if res.LineFragments == nil { 530 res.LineFragments = map[string]string{} 531 } 532 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 533} 534 535type sortByOffsetSlice []*candidateMatch 536 537func (m sortByOffsetSlice) Len() int { return len(m) } 538func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 539func (m sortByOffsetSlice) Less(i, j int) bool { 540 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset 541 // Prefer longer candidates if starting at same position 542 return m[i].byteMatchSz > m[j].byteMatchSz 543 } 544 return m[i].byteOffset < m[j].byteOffset 545} 546 547// setScoreWeight is a helper used by gatherMatches to set the weight based on 548// the score weight of the matchTree. 549func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 550 for _, m := range cm { 551 m.scoreWeight = scoreWeight 552 } 553 return cm 554} 555 556// Gather matches from this document. This never returns a mixture of 557// filename/content matches: if there are content matches, all 558// filename matches are trimmed from the result. The matches are 559// returned in document order and are non-overlapping. 560// 561// If `merge` is set, overlapping and adjacent matches will be merged 562// into a single match. Otherwise, overlapping matches will be removed, 563// but adjacent matches will remain. 564func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 565 var cands []*candidateMatch 566 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 567 if smt, ok := mt.(*substrMatchTree); ok { 568 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) 569 } 570 if rmt, ok := mt.(*regexpMatchTree); ok { 571 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 572 } 573 if rmt, ok := mt.(*wordMatchTree); ok { 574 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 575 } 576 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 577 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) 578 } 579 }) 580 581 foundContentMatch := false 582 for _, c := range cands { 583 if !c.fileName { 584 foundContentMatch = true 585 break 586 } 587 } 588 589 res := cands[:0] 590 for _, c := range cands { 591 if !foundContentMatch || !c.fileName { 592 res = append(res, c) 593 } 594 } 595 cands = res 596 597 if merge { 598 // Merge adjacent candidates. This guarantees that the matches 599 // are non-overlapping. 600 sort.Sort((sortByOffsetSlice)(cands)) 601 res = cands[:0] 602 mergeRun := 1 603 for i, c := range cands { 604 if i == 0 { 605 res = append(res, c) 606 continue 607 } 608 last := res[len(res)-1] 609 lastEnd := last.byteOffset + last.byteMatchSz 610 end := c.byteOffset + c.byteMatchSz 611 if lastEnd >= c.byteOffset { 612 mergeRun++ 613 614 // Average out the score across the merged candidates. Only do it if 615 // we are boosting to avoid floating point funkiness in the normal 616 // case. 617 if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) { 618 last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun) 619 } 620 621 // latest candidate goes further, update our end 622 if end > lastEnd { 623 last.byteMatchSz = end - last.byteOffset 624 } 625 626 continue 627 } else { 628 mergeRun = 1 629 } 630 631 res = append(res, c) 632 } 633 } else { 634 // Remove overlapping candidates. This guarantees that the matches 635 // are non-overlapping, but also preserves expected match counts. 636 sort.Sort((sortByOffsetSlice)(cands)) 637 res = cands[:0] 638 for i, c := range cands { 639 if i == 0 { 640 res = append(res, c) 641 continue 642 } 643 last := res[len(res)-1] 644 lastEnd := last.byteOffset + last.byteMatchSz 645 if lastEnd > c.byteOffset { 646 continue 647 } 648 649 res = append(res, c) 650 } 651 } 652 653 return res 654} 655 656func (d *indexData) branchIndex(docID uint32) int { 657 mask := d.fileBranchMasks[docID] 658 idx := 0 659 for mask != 0 { 660 if mask&0x1 != 0 { 661 return idx 662 } 663 idx++ 664 mask >>= 1 665 } 666 return -1 667} 668 669// gatherBranches returns a list of branch names taking into account any branch 670// filters in the query. If the query contains a branch filter, it returns all 671// branches containing the docID and matching the branch filter. Otherwise, it 672// returns all branches containing docID. 673func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 674 var mask uint64 675 visitMatchAtoms(mt, known, func(mt matchTree) { 676 bq, ok := mt.(*branchQueryMatchTree) 677 if !ok { 678 return 679 } 680 681 mask = mask | bq.branchMask() 682 }) 683 684 if mask == 0 { 685 mask = d.fileBranchMasks[docID] 686 } 687 688 var branches []string 689 id := uint32(1) 690 branchNames := d.branchNames[d.repos[docID]] 691 for mask != 0 { 692 if mask&0x1 != 0 { 693 branches = append(branches, branchNames[uint(id)]) 694 } 695 id <<= 1 696 mask >>= 1 697 } 698 699 return branches 700} 701 702func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) { 703 var include func(rle *RepoListEntry) bool 704 705 q = d.simplify(q) 706 if c, ok := q.(*query.Const); ok { 707 if !c.Value { 708 return &RepoList{}, nil 709 } 710 include = func(rle *RepoListEntry) bool { 711 return true 712 } 713 } else { 714 sr, err := d.Search(ctx, q, &SearchOptions{ 715 ShardRepoMaxMatchCount: 1, 716 }) 717 if err != nil { 718 return nil, err 719 } 720 721 foundRepos := make(map[string]struct{}, len(sr.Files)) 722 for _, file := range sr.Files { 723 foundRepos[file.Repository] = struct{}{} 724 } 725 726 include = func(rle *RepoListEntry) bool { 727 _, ok := foundRepos[rle.Repository.Name] 728 return ok 729 } 730 } 731 732 var l RepoList 733 734 field, err := opts.GetField() 735 if err != nil { 736 return nil, err 737 } 738 switch field { 739 case RepoListFieldRepos: 740 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry)) 741 case RepoListFieldReposMap: 742 l.ReposMap = make(ReposMap, len(d.repoListEntry)) 743 } 744 745 for i := range d.repoListEntry { 746 if d.repoMetaData[i].Tombstone { 747 continue 748 } 749 rle := &d.repoListEntry[i] 750 if !include(rle) { 751 continue 752 } 753 754 l.Stats.Add(&rle.Stats) 755 756 // Backwards compat for when ID is missing 757 if rle.Repository.ID == 0 { 758 l.Repos = append(l.Repos, rle) 759 continue 760 } 761 762 switch field { 763 case RepoListFieldRepos: 764 l.Repos = append(l.Repos, rle) 765 case RepoListFieldReposMap: 766 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{ 767 HasSymbols: rle.Repository.HasSymbols, 768 Branches: rle.Repository.Branches, 769 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 770 } 771 } 772 773 } 774 775 // Only one of these fields is populated and in all cases the size of that 776 // field is the number of Repos in this shard. 777 l.Stats.Repos = len(l.Repos) + len(l.ReposMap) 778 779 return &l, nil 780} 781 782// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 783// mt is equivalent to the input r, isEqual = true and the matchTree can be used 784// in place of the regex r. If singleLine = true, then the matchTree and all 785// its children only match terms on the same line. singleLine is used during 786// recursion to decide whether to return an andLineMatchTree (singleLine = true) 787// or a andMatchTree (singleLine = false). 788func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 789 // TODO - we could perhaps transform Begin/EndText in '\n'? 790 // TODO - we could perhaps transform CharClass in (OrQuery ) 791 // if there are just a few runes, and part of a OpConcat? 792 switch r.Op { 793 case syntax.OpLiteral: 794 s := string(r.Rune) 795 if len(s) >= minTextSize { 796 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) 797 return mt, true, !strings.Contains(s, "\n"), err 798 } 799 case syntax.OpCapture: 800 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 801 802 case syntax.OpPlus: 803 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 804 805 case syntax.OpRepeat: 806 if r.Min == 1 { 807 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 808 } else if r.Min > 1 { 809 // (x){2,} can't be expressed precisely by the matchTree 810 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 811 return mt, false, singleLine, err 812 } 813 case syntax.OpConcat, syntax.OpAlternate: 814 var qs []matchTree 815 isEq := true 816 singleLine = true 817 for _, sr := range r.Sub { 818 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 819 if err != nil { 820 return nil, false, false, err 821 } 822 isEq = isEq && subIsEq 823 singleLine = singleLine && subSingleLine 824 qs = append(qs, sq) 825 } 826 } 827 if r.Op == syntax.OpConcat { 828 if len(qs) > 1 { 829 isEq = false 830 } 831 newQs := make([]matchTree, 0, len(qs)) 832 for _, q := range qs { 833 if _, ok := q.(*bruteForceMatchTree); ok { 834 continue 835 } 836 newQs = append(newQs, q) 837 } 838 if len(newQs) == 1 { 839 return newQs[0], isEq, singleLine, nil 840 } 841 if len(newQs) == 0 { 842 return &bruteForceMatchTree{}, isEq, singleLine, nil 843 } 844 if singleLine { 845 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 846 } 847 return &andMatchTree{newQs}, isEq, singleLine, nil 848 } 849 for _, q := range qs { 850 if _, ok := q.(*bruteForceMatchTree); ok { 851 return q, isEq, false, nil 852 } 853 } 854 if len(qs) == 0 { 855 return &noMatchTree{Why: "const"}, isEq, false, nil 856 } 857 return &orMatchTree{qs}, isEq, false, nil 858 case syntax.OpStar: 859 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 860 return &bruteForceMatchTree{}, false, true, nil 861 } 862 } 863 return &bruteForceMatchTree{}, false, false, nil 864} 865 866type timer struct { 867 last time.Time 868} 869 870func newTimer() *timer { 871 return &timer{ 872 last: time.Now(), 873 } 874} 875 876func (t *timer) Elapsed() time.Duration { 877 now := time.Now() 878 d := now.Sub(t.last) 879 t.last = now 880 return d 881}