fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "sort" 24 "strings" 25 "time" 26 27 enry_data "github.com/go-enry/go-enry/v2/data" 28 "github.com/grafana/regexp" 29 30 "github.com/sourcegraph/zoekt/query" 31) 32 33const maxUInt16 = 0xffff 34 35func (m *FileMatch) addScore(what string, s float64, debugScore bool) { 36 if debugScore { 37 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s) 38 } 39 m.Score += s 40} 41 42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 43 if debugScore { 44 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 45 } 46 m.Score += score 47} 48 49// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 50// repository names fulfill the predicate, Const(false) if none of them do, and q 51// otherwise. 52func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 53 count := 0 54 alive := len(d.repoMetaData) 55 for i := range d.repoMetaData { 56 if d.repoMetaData[i].Tombstone { 57 alive-- 58 } else if predicate(&d.repoMetaData[i]) { 59 count++ 60 } 61 } 62 if count == alive { 63 return &query.Const{Value: true} 64 } 65 if count > 0 { 66 return q 67 } 68 return &query.Const{Value: false} 69} 70 71func (d *indexData) simplify(in query.Q) query.Q { 72 eval := query.Map(in, func(q query.Q) query.Q { 73 switch r := q.(type) { 74 case *query.Repo: 75 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 76 return r.Regexp.MatchString(repo.Name) 77 }) 78 case *query.RepoRegexp: 79 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 80 return r.Regexp.MatchString(repo.Name) 81 }) 82 case *query.BranchesRepos: 83 for i := range d.repoMetaData { 84 for _, br := range r.List { 85 if br.Repos.Contains(d.repoMetaData[i].ID) { 86 return q 87 } 88 } 89 } 90 return &query.Const{Value: false} 91 case *query.RepoSet: 92 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 93 return r.Set[repo.Name] 94 }) 95 case *query.RepoIDs: 96 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 97 return r.Repos.Contains(repo.ID) 98 }) 99 case *query.Language: 100 _, has := d.metaData.LanguageMap[r.Language] 101 if !has && d.metaData.IndexFeatureVersion < 12 { 102 // For index files that haven't been re-indexed by go-enry, 103 // fall back to file-based matching and continue even if this 104 // repo doesn't have the specific language present. 105 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 106 if extsForLang != nil { 107 extFrags := make([]string, 0, len(extsForLang)) 108 for _, ext := range extsForLang { 109 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 110 } 111 if len(extFrags) > 0 { 112 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 113 // inlined copy of query.regexpQuery 114 re, err := syntax.Parse(pattern, syntax.Perl) 115 if err != nil { 116 return &query.Const{Value: false} 117 } 118 if re.Op == syntax.OpLiteral { 119 return &query.Substring{ 120 Pattern: string(re.Rune), 121 FileName: true, 122 } 123 } 124 return &query.Regexp{ 125 Regexp: re, 126 FileName: true, 127 } 128 } 129 } 130 } 131 if !has { 132 return &query.Const{Value: false} 133 } 134 } 135 return q 136 }) 137 return query.Simplify(eval) 138} 139 140func (o *SearchOptions) SetDefaults() { 141 if o.ShardMaxMatchCount == 0 { 142 // We cap the total number of matches, so overly broad 143 // searches don't crash the machine. 144 o.ShardMaxMatchCount = 100000 145 } 146 if o.TotalMaxMatchCount == 0 { 147 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 148 } 149} 150 151func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { 152 timer := newTimer() 153 154 copyOpts := *opts 155 opts = &copyOpts 156 opts.SetDefaults() 157 158 var res SearchResult 159 if len(d.fileNameIndex) == 0 { 160 return &res, nil 161 } 162 163 select { 164 case <-ctx.Done(): 165 res.Stats.ShardsSkipped++ 166 return &res, nil 167 default: 168 } 169 170 q = d.simplify(q) 171 if c, ok := q.(*query.Const); ok && !c.Value { 172 return &res, nil 173 } 174 175 if opts.EstimateDocCount { 176 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 177 return &res, nil 178 } 179 180 q = query.Map(q, query.ExpandFileContent) 181 182 mt, err := d.newMatchTree(q, matchTreeOpt{}) 183 if err != nil { 184 return nil, err 185 } 186 187 // Capture the costs of construction before pruning 188 updateMatchTreeStats(mt, &res.Stats) 189 190 mt, err = pruneMatchTree(mt) 191 if err != nil { 192 return nil, err 193 } 194 res.Stats.MatchTreeConstruction = timer.Elapsed() 195 if mt == nil { 196 res.Stats.ShardsSkippedFilter++ 197 return &res, nil 198 } 199 200 res.Stats.ShardsScanned++ 201 202 cp := &contentProvider{ 203 id: d, 204 stats: &res.Stats, 205 } 206 207 // Track the number of documents found in a repository for 208 // ShardRepoMaxMatchCount 209 var ( 210 lastRepoID uint16 211 repoMatchCount int 212 ) 213 214 docCount := uint32(len(d.fileBranchMasks)) 215 lastDoc := int(-1) 216 217nextFileMatch: 218 for { 219 canceled := false 220 select { 221 case <-ctx.Done(): 222 canceled = true 223 default: 224 } 225 226 nextDoc := mt.nextDoc() 227 if int(nextDoc) <= lastDoc { 228 nextDoc = uint32(lastDoc + 1) 229 } 230 231 for ; nextDoc < docCount; nextDoc++ { 232 repoID := d.repos[nextDoc] 233 repoMetadata := &d.repoMetaData[repoID] 234 235 // Skip tombstoned repositories 236 if repoMetadata.Tombstone { 237 continue 238 } 239 240 // Skip documents that are tombstoned 241 if len(repoMetadata.FileTombstones) > 0 { 242 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 243 continue 244 } 245 } 246 247 // Skip documents over ShardRepoMaxMatchCount if specified. 248 if opts.ShardRepoMaxMatchCount > 0 { 249 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 250 res.Stats.FilesSkipped++ 251 continue 252 } 253 } 254 255 break 256 } 257 258 if nextDoc >= docCount { 259 break 260 } 261 262 lastDoc = int(nextDoc) 263 264 // We track lastRepoID for ShardRepoMaxMatchCount 265 if lastRepoID != d.repos[nextDoc] { 266 lastRepoID = d.repos[nextDoc] 267 repoMatchCount = 0 268 } 269 270 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 271 res.Stats.FilesSkipped += int(docCount - nextDoc) 272 break 273 } 274 275 res.Stats.FilesConsidered++ 276 mt.prepare(nextDoc) 277 278 cp.setDocument(nextDoc) 279 280 known := make(map[matchTree]bool) 281 md := d.repoMetaData[d.repos[nextDoc]] 282 283 for cost := costMin; cost <= costMax; cost++ { 284 v, ok := mt.matches(cp, cost, known) 285 if ok && !v { 286 continue nextFileMatch 287 } 288 289 if cost == costMax && !ok { 290 log.Panicf("did not decide. Repo %s, doc %d, known %v", 291 md.Name, nextDoc, known) 292 } 293 } 294 295 fileMatch := FileMatch{ 296 Repository: md.Name, 297 RepositoryID: md.ID, 298 RepositoryPriority: md.priority, 299 FileName: string(d.fileName(nextDoc)), 300 Checksum: d.getChecksum(nextDoc), 301 Language: d.languageMap[d.getLanguage(nextDoc)], 302 } 303 304 if s := d.subRepos[nextDoc]; s > 0 { 305 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 306 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 307 } 308 path := d.subRepoPaths[d.repos[nextDoc]][s] 309 fileMatch.SubRepositoryPath = path 310 sr := md.SubRepoMap[path] 311 fileMatch.SubRepositoryName = sr.Name 312 if idx := d.branchIndex(nextDoc); idx >= 0 { 313 fileMatch.Version = sr.Branches[idx].Version 314 } 315 } else { 316 idx := d.branchIndex(nextDoc) 317 if idx >= 0 { 318 fileMatch.Version = md.Branches[idx].Version 319 } 320 } 321 322 shouldMergeMatches := !opts.ChunkMatches 323 finalCands := gatherMatches(mt, known, shouldMergeMatches) 324 325 if len(finalCands) == 0 { 326 nm := d.fileName(nextDoc) 327 finalCands = append(finalCands, 328 &candidateMatch{ 329 caseSensitive: false, 330 fileName: true, 331 substrBytes: nm, 332 substrLowered: nm, 333 file: nextDoc, 334 runeOffset: 0, 335 byteOffset: 0, 336 byteMatchSz: uint32(len(nm)), 337 }) 338 } 339 340 if opts.ChunkMatches { 341 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 342 } else { 343 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 344 } 345 346 if opts.UseKeywordScoring { 347 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 348 } else { 349 // Use the standard, non-experimental scoring method by default 350 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 351 } 352 353 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 354 sortMatchesByScore(fileMatch.LineMatches) 355 sortChunkMatchesByScore(fileMatch.ChunkMatches) 356 if opts.Whole { 357 fileMatch.Content = cp.data(false) 358 } 359 360 matchedChunkRanges := 0 361 for _, cm := range fileMatch.ChunkMatches { 362 matchedChunkRanges += len(cm.Ranges) 363 } 364 365 repoMatchCount += len(fileMatch.LineMatches) 366 repoMatchCount += matchedChunkRanges 367 368 if opts.DebugScore { 369 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug) 370 } 371 372 res.Files = append(res.Files, fileMatch) 373 res.Stats.MatchCount += len(fileMatch.LineMatches) 374 res.Stats.MatchCount += matchedChunkRanges 375 res.Stats.FileCount++ 376 } 377 378 // We do not sort Files here, instead we rely on the shards pkg to do file 379 // ranking. If we sorted now, we would break the assumption that results 380 // from the same repo in a shard appear next to each other. 381 382 for _, md := range d.repoMetaData { 383 r := md 384 addRepo(&res, &r) 385 for _, v := range r.SubRepoMap { 386 addRepo(&res, v) 387 } 388 } 389 390 // Update stats based on work done during document search. 391 updateMatchTreeStats(mt, &res.Stats) 392 393 // If document ranking is enabled, then we can rank and truncate the files to save memory. 394 if opts.UseDocumentRanks { 395 res.Files = SortAndTruncateFiles(res.Files, opts) 396 } 397 398 res.Stats.MatchTreeSearch = timer.Elapsed() 399 400 return &res, nil 401} 402 403// scoreFile computes a score for the file match using various scoring signals, like 404// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 405func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 406 atomMatchCount := 0 407 visitMatches(mt, known, func(mt matchTree) { 408 atomMatchCount++ 409 }) 410 411 // atom-count boosts files with matches from more than 1 atom. The 412 // maximum boost is scoreFactorAtomMatch. 413 if atomMatchCount > 0 { 414 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 415 } 416 417 maxFileScore := 0.0 418 repetitions := 0 419 for i := range fileMatch.LineMatches { 420 if maxFileScore < fileMatch.LineMatches[i].Score { 421 maxFileScore = fileMatch.LineMatches[i].Score 422 repetitions = 0 423 } else if maxFileScore == fileMatch.LineMatches[i].Score { 424 repetitions += 1 425 } 426 427 // Order by ordering in file. 428 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 429 } 430 431 for i := range fileMatch.ChunkMatches { 432 if maxFileScore < fileMatch.ChunkMatches[i].Score { 433 maxFileScore = fileMatch.ChunkMatches[i].Score 434 } 435 436 // Order by ordering in file. 437 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 438 } 439 440 // Maintain ordering of input files. This 441 // strictly dominates the in-file ordering of 442 // the matches. 443 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 444 445 // Prefer docs with several top-scored matches. 446 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 447 448 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 449 weight := scoreFileRankFactor 450 if opts.DocumentRanksWeight > 0.0 { 451 weight = opts.DocumentRanksWeight 452 } 453 454 ranks := d.ranks[doc] 455 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 456 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 457 // and we plan to clean this up. 458 if len(ranks) > 0 { 459 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 460 // cap it just in case to ensure it falls in the range [0, 1]. 461 normalized := math.Min(1.0, ranks[0]/32.0) 462 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 463 } 464 } 465 466 md := d.repoMetaData[d.repos[doc]] 467 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 468 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 469} 470 471// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 472// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 473// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 474// 475// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 476// since BM25 is not normalized and can be tricky to combine with other scoring signals. 477func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 478 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 479 // sensitivity and treat filenames and symbols the same as content. 480 termFreqs := map[string]int{} 481 for _, cand := range cands { 482 term := string(cand.substrLowered) 483 termFreqs[term]++ 484 } 485 486 // Compute the file length ratio. Usually the calculation would be based on terms, but using 487 // bytes should work fine, as we're just computing a ratio. 488 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 489 numFiles := len(d.boundaries) 490 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 491 L := fileLength / averageFileLength 492 493 // Use standard parameter defaults (used in Lucene and academic papers) 494 k, b := 1.2, 0.75 495 sumTf := 0.0 // Just for debugging 496 score := 0.0 497 for _, freq := range termFreqs { 498 tf := float64(freq) 499 sumTf += tf 500 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 501 } 502 503 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 504} 505 506func addRepo(res *SearchResult, repo *Repository) { 507 if res.RepoURLs == nil { 508 res.RepoURLs = map[string]string{} 509 } 510 res.RepoURLs[repo.Name] = repo.FileURLTemplate 511 512 if res.LineFragments == nil { 513 res.LineFragments = map[string]string{} 514 } 515 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 516} 517 518type sortByOffsetSlice []*candidateMatch 519 520func (m sortByOffsetSlice) Len() int { return len(m) } 521func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 522func (m sortByOffsetSlice) Less(i, j int) bool { 523 return m[i].byteOffset < m[j].byteOffset 524} 525 526// Gather matches from this document. This never returns a mixture of 527// filename/content matches: if there are content matches, all 528// filename matches are trimmed from the result. The matches are 529// returned in document order and are non-overlapping. 530// 531// If `merge` is set, overlapping and adjacent matches will be merged 532// into a single match. Otherwise, overlapping matches will be removed, 533// but adjacent matches will remain. 534func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 535 var cands []*candidateMatch 536 visitMatches(mt, known, func(mt matchTree) { 537 if smt, ok := mt.(*substrMatchTree); ok { 538 cands = append(cands, smt.current...) 539 } 540 if rmt, ok := mt.(*regexpMatchTree); ok { 541 cands = append(cands, rmt.found...) 542 } 543 if rmt, ok := mt.(*wordMatchTree); ok { 544 cands = append(cands, rmt.found...) 545 } 546 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 547 cands = append(cands, smt.found...) 548 } 549 }) 550 551 foundContentMatch := false 552 for _, c := range cands { 553 if !c.fileName { 554 foundContentMatch = true 555 break 556 } 557 } 558 559 res := cands[:0] 560 for _, c := range cands { 561 if !foundContentMatch || !c.fileName { 562 res = append(res, c) 563 } 564 } 565 cands = res 566 567 if merge { 568 // Merge adjacent candidates. This guarantees that the matches 569 // are non-overlapping. 570 sort.Sort((sortByOffsetSlice)(cands)) 571 res = cands[:0] 572 for i, c := range cands { 573 if i == 0 { 574 res = append(res, c) 575 continue 576 } 577 last := res[len(res)-1] 578 lastEnd := last.byteOffset + last.byteMatchSz 579 end := c.byteOffset + c.byteMatchSz 580 if lastEnd >= c.byteOffset { 581 if end > lastEnd { 582 last.byteMatchSz = end - last.byteOffset 583 } 584 continue 585 } 586 587 res = append(res, c) 588 } 589 } else { 590 // Remove overlapping candidates. This guarantees that the matches 591 // are non-overlapping, but also preserves expected match counts. 592 sort.Sort((sortByOffsetSlice)(cands)) 593 res = cands[:0] 594 for i, c := range cands { 595 if i == 0 { 596 res = append(res, c) 597 continue 598 } 599 last := res[len(res)-1] 600 lastEnd := last.byteOffset + last.byteMatchSz 601 if lastEnd > c.byteOffset { 602 continue 603 } 604 605 res = append(res, c) 606 } 607 } 608 609 return res 610} 611 612func (d *indexData) branchIndex(docID uint32) int { 613 mask := d.fileBranchMasks[docID] 614 idx := 0 615 for mask != 0 { 616 if mask&0x1 != 0 { 617 return idx 618 } 619 idx++ 620 mask >>= 1 621 } 622 return -1 623} 624 625// gatherBranches returns a list of branch names taking into account any branch 626// filters in the query. If the query contains a branch filter, it returns all 627// branches containing the docID and matching the branch filter. Otherwise, it 628// returns all branches containing docID. 629func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 630 var mask uint64 631 visitMatches(mt, known, func(mt matchTree) { 632 bq, ok := mt.(*branchQueryMatchTree) 633 if !ok { 634 return 635 } 636 637 mask = mask | bq.branchMask() 638 }) 639 640 if mask == 0 { 641 mask = d.fileBranchMasks[docID] 642 } 643 644 var branches []string 645 id := uint32(1) 646 branchNames := d.branchNames[d.repos[docID]] 647 for mask != 0 { 648 if mask&0x1 != 0 { 649 branches = append(branches, branchNames[uint(id)]) 650 } 651 id <<= 1 652 mask >>= 1 653 } 654 655 return branches 656} 657 658func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) { 659 var include func(rle *RepoListEntry) bool 660 661 q = d.simplify(q) 662 if c, ok := q.(*query.Const); ok { 663 if !c.Value { 664 return &RepoList{}, nil 665 } 666 include = func(rle *RepoListEntry) bool { 667 return true 668 } 669 } else { 670 sr, err := d.Search(ctx, q, &SearchOptions{ 671 ShardRepoMaxMatchCount: 1, 672 }) 673 if err != nil { 674 return nil, err 675 } 676 677 foundRepos := make(map[string]struct{}, len(sr.Files)) 678 for _, file := range sr.Files { 679 foundRepos[file.Repository] = struct{}{} 680 } 681 682 include = func(rle *RepoListEntry) bool { 683 _, ok := foundRepos[rle.Repository.Name] 684 return ok 685 } 686 } 687 688 var l RepoList 689 690 field, err := opts.GetField() 691 if err != nil { 692 return nil, err 693 } 694 switch field { 695 case RepoListFieldRepos: 696 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry)) 697 case RepoListFieldMinimal: 698 l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry)) 699 case RepoListFieldReposMap: 700 l.ReposMap = make(ReposMap, len(d.repoListEntry)) 701 } 702 703 for i := range d.repoListEntry { 704 if d.repoMetaData[i].Tombstone { 705 continue 706 } 707 rle := &d.repoListEntry[i] 708 if !include(rle) { 709 continue 710 } 711 712 l.Stats.Add(&rle.Stats) 713 714 // Backwards compat for when ID is missing 715 if rle.Repository.ID == 0 { 716 l.Repos = append(l.Repos, rle) 717 continue 718 } 719 720 switch field { 721 case RepoListFieldRepos: 722 l.Repos = append(l.Repos, rle) 723 case RepoListFieldMinimal: 724 l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{ 725 HasSymbols: rle.Repository.HasSymbols, 726 Branches: rle.Repository.Branches, 727 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 728 } 729 case RepoListFieldReposMap: 730 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{ 731 HasSymbols: rle.Repository.HasSymbols, 732 Branches: rle.Repository.Branches, 733 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 734 } 735 } 736 737 } 738 739 // Only one of these fields is populated and in all cases the size of that 740 // field is the number of Repos in this shard. 741 l.Stats.Repos = len(l.Repos) + len(l.Minimal) + len(l.ReposMap) 742 743 return &l, nil 744} 745 746// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 747// mt is equivalent to the input r, isEqual = true and the matchTree can be used 748// in place of the regex r. If singleLine = true, then the matchTree and all 749// its children only match terms on the same line. singleLine is used during 750// recursion to decide whether to return an andLineMatchTree (singleLine = true) 751// or a andMatchTree (singleLine = false). 752func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 753 // TODO - we could perhaps transform Begin/EndText in '\n'? 754 // TODO - we could perhaps transform CharClass in (OrQuery ) 755 // if there are just a few runes, and part of a OpConcat? 756 switch r.Op { 757 case syntax.OpLiteral: 758 s := string(r.Rune) 759 if len(s) >= minTextSize { 760 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) 761 return mt, true, !strings.Contains(s, "\n"), err 762 } 763 case syntax.OpCapture: 764 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 765 766 case syntax.OpPlus: 767 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 768 769 case syntax.OpRepeat: 770 if r.Min == 1 { 771 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 772 } else if r.Min > 1 { 773 // (x){2,} can't be expressed precisely by the matchTree 774 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 775 return mt, false, singleLine, err 776 } 777 case syntax.OpConcat, syntax.OpAlternate: 778 var qs []matchTree 779 isEq := true 780 singleLine = true 781 for _, sr := range r.Sub { 782 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 783 if err != nil { 784 return nil, false, false, err 785 } 786 isEq = isEq && subIsEq 787 singleLine = singleLine && subSingleLine 788 qs = append(qs, sq) 789 } 790 } 791 if r.Op == syntax.OpConcat { 792 if len(qs) > 1 { 793 isEq = false 794 } 795 newQs := make([]matchTree, 0, len(qs)) 796 for _, q := range qs { 797 if _, ok := q.(*bruteForceMatchTree); ok { 798 continue 799 } 800 newQs = append(newQs, q) 801 } 802 if len(newQs) == 1 { 803 return newQs[0], isEq, singleLine, nil 804 } 805 if len(newQs) == 0 { 806 return &bruteForceMatchTree{}, isEq, singleLine, nil 807 } 808 if singleLine { 809 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 810 } 811 return &andMatchTree{newQs}, isEq, singleLine, nil 812 } 813 for _, q := range qs { 814 if _, ok := q.(*bruteForceMatchTree); ok { 815 return q, isEq, false, nil 816 } 817 } 818 if len(qs) == 0 { 819 return &noMatchTree{Why: "const"}, isEq, false, nil 820 } 821 return &orMatchTree{qs}, isEq, false, nil 822 case syntax.OpStar: 823 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 824 return &bruteForceMatchTree{}, false, true, nil 825 } 826 } 827 return &bruteForceMatchTree{}, false, false, nil 828} 829 830type timer struct { 831 last time.Time 832} 833 834func newTimer() *timer { 835 return &timer{ 836 last: time.Now(), 837 } 838} 839 840func (t *timer) Elapsed() time.Duration { 841 now := time.Now() 842 d := now.Sub(t.last) 843 t.last = now 844 return d 845}