fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "sort" 24 "strings" 25 "time" 26 27 enry_data "github.com/go-enry/go-enry/v2/data" 28 "github.com/grafana/regexp" 29 30 "github.com/sourcegraph/zoekt/query" 31) 32 33const maxUInt16 = 0xffff 34 35func (m *FileMatch) addScore(what string, s float64, debugScore bool) { 36 if s != 0 && debugScore { 37 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s) 38 } 39 m.Score += s 40} 41 42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 43 if debugScore { 44 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 45 } 46 m.Score += score 47} 48 49// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 50// repository names fulfill the predicate, Const(false) if none of them do, and q 51// otherwise. 52func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 53 count := 0 54 alive := len(d.repoMetaData) 55 for i := range d.repoMetaData { 56 if d.repoMetaData[i].Tombstone { 57 alive-- 58 } else if predicate(&d.repoMetaData[i]) { 59 count++ 60 } 61 } 62 if count == alive { 63 return &query.Const{Value: true} 64 } 65 if count > 0 { 66 return q 67 } 68 return &query.Const{Value: false} 69} 70 71func (d *indexData) simplify(in query.Q) query.Q { 72 eval := query.Map(in, func(q query.Q) query.Q { 73 switch r := q.(type) { 74 case *query.Repo: 75 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 76 return r.Regexp.MatchString(repo.Name) 77 }) 78 case *query.RepoRegexp: 79 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 80 return r.Regexp.MatchString(repo.Name) 81 }) 82 case *query.BranchesRepos: 83 for i := range d.repoMetaData { 84 for _, br := range r.List { 85 if br.Repos.Contains(d.repoMetaData[i].ID) { 86 return q 87 } 88 } 89 } 90 return &query.Const{Value: false} 91 case *query.RepoSet: 92 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 93 return r.Set[repo.Name] 94 }) 95 case *query.RepoIDs: 96 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 97 return r.Repos.Contains(repo.ID) 98 }) 99 case *query.Language: 100 _, has := d.metaData.LanguageMap[r.Language] 101 if !has && d.metaData.IndexFeatureVersion < 12 { 102 // For index files that haven't been re-indexed by go-enry, 103 // fall back to file-based matching and continue even if this 104 // repo doesn't have the specific language present. 105 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 106 if extsForLang != nil { 107 extFrags := make([]string, 0, len(extsForLang)) 108 for _, ext := range extsForLang { 109 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 110 } 111 if len(extFrags) > 0 { 112 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 113 // inlined copy of query.regexpQuery 114 re, err := syntax.Parse(pattern, syntax.Perl) 115 if err != nil { 116 return &query.Const{Value: false} 117 } 118 if re.Op == syntax.OpLiteral { 119 return &query.Substring{ 120 Pattern: string(re.Rune), 121 FileName: true, 122 } 123 } 124 return &query.Regexp{ 125 Regexp: re, 126 FileName: true, 127 } 128 } 129 } 130 } 131 if !has { 132 return &query.Const{Value: false} 133 } 134 } 135 return q 136 }) 137 return query.Simplify(eval) 138} 139 140func (o *SearchOptions) SetDefaults() { 141 if o.ShardMaxMatchCount == 0 { 142 // We cap the total number of matches, so overly broad 143 // searches don't crash the machine. 144 o.ShardMaxMatchCount = 100000 145 } 146 if o.TotalMaxMatchCount == 0 { 147 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 148 } 149} 150 151func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { 152 timer := newTimer() 153 154 copyOpts := *opts 155 opts = &copyOpts 156 opts.SetDefaults() 157 158 var res SearchResult 159 if len(d.fileNameIndex) == 0 { 160 return &res, nil 161 } 162 163 select { 164 case <-ctx.Done(): 165 res.Stats.ShardsSkipped++ 166 return &res, nil 167 default: 168 } 169 170 q = d.simplify(q) 171 if c, ok := q.(*query.Const); ok && !c.Value { 172 return &res, nil 173 } 174 175 if opts.EstimateDocCount { 176 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 177 return &res, nil 178 } 179 180 q = query.Map(q, query.ExpandFileContent) 181 182 mt, err := d.newMatchTree(q, matchTreeOpt{}) 183 if err != nil { 184 return nil, err 185 } 186 187 // Capture the costs of construction before pruning 188 updateMatchTreeStats(mt, &res.Stats) 189 190 mt, err = pruneMatchTree(mt) 191 if err != nil { 192 return nil, err 193 } 194 res.Stats.MatchTreeConstruction = timer.Elapsed() 195 if mt == nil { 196 res.Stats.ShardsSkippedFilter++ 197 return &res, nil 198 } 199 200 res.Stats.ShardsScanned++ 201 202 cp := &contentProvider{ 203 id: d, 204 stats: &res.Stats, 205 } 206 207 // Track the number of documents found in a repository for 208 // ShardRepoMaxMatchCount 209 var ( 210 lastRepoID uint16 211 repoMatchCount int 212 ) 213 214 docCount := uint32(len(d.fileBranchMasks)) 215 lastDoc := int(-1) 216 217nextFileMatch: 218 for { 219 canceled := false 220 select { 221 case <-ctx.Done(): 222 canceled = true 223 default: 224 } 225 226 nextDoc := mt.nextDoc() 227 if int(nextDoc) <= lastDoc { 228 nextDoc = uint32(lastDoc + 1) 229 } 230 231 for ; nextDoc < docCount; nextDoc++ { 232 repoID := d.repos[nextDoc] 233 repoMetadata := &d.repoMetaData[repoID] 234 235 // Skip tombstoned repositories 236 if repoMetadata.Tombstone { 237 continue 238 } 239 240 // Skip documents that are tombstoned 241 if len(repoMetadata.FileTombstones) > 0 { 242 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 243 continue 244 } 245 } 246 247 // Skip documents over ShardRepoMaxMatchCount if specified. 248 if opts.ShardRepoMaxMatchCount > 0 { 249 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 250 res.Stats.FilesSkipped++ 251 continue 252 } 253 } 254 255 break 256 } 257 258 if nextDoc >= docCount { 259 break 260 } 261 262 lastDoc = int(nextDoc) 263 264 // We track lastRepoID for ShardRepoMaxMatchCount 265 if lastRepoID != d.repos[nextDoc] { 266 lastRepoID = d.repos[nextDoc] 267 repoMatchCount = 0 268 } 269 270 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 271 res.Stats.FilesSkipped += int(docCount - nextDoc) 272 break 273 } 274 275 res.Stats.FilesConsidered++ 276 mt.prepare(nextDoc) 277 278 cp.setDocument(nextDoc) 279 280 known := make(map[matchTree]bool) 281 md := d.repoMetaData[d.repos[nextDoc]] 282 283 for cost := costMin; cost <= costMax; cost++ { 284 v, ok := mt.matches(cp, cost, known) 285 if ok && !v { 286 continue nextFileMatch 287 } 288 289 if cost == costMax && !ok { 290 log.Panicf("did not decide. Repo %s, doc %d, known %v", 291 md.Name, nextDoc, known) 292 } 293 } 294 295 fileMatch := FileMatch{ 296 Repository: md.Name, 297 RepositoryID: md.ID, 298 RepositoryPriority: md.priority, 299 FileName: string(d.fileName(nextDoc)), 300 Checksum: d.getChecksum(nextDoc), 301 Language: d.languageMap[d.getLanguage(nextDoc)], 302 } 303 304 if s := d.subRepos[nextDoc]; s > 0 { 305 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 306 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 307 } 308 path := d.subRepoPaths[d.repos[nextDoc]][s] 309 fileMatch.SubRepositoryPath = path 310 sr := md.SubRepoMap[path] 311 fileMatch.SubRepositoryName = sr.Name 312 if idx := d.branchIndex(nextDoc); idx >= 0 { 313 fileMatch.Version = sr.Branches[idx].Version 314 } 315 } else { 316 idx := d.branchIndex(nextDoc) 317 if idx >= 0 { 318 fileMatch.Version = md.Branches[idx].Version 319 } 320 } 321 322 shouldMergeMatches := !opts.ChunkMatches 323 finalCands := gatherMatches(mt, known, shouldMergeMatches) 324 325 if len(finalCands) == 0 { 326 nm := d.fileName(nextDoc) 327 finalCands = append(finalCands, 328 &candidateMatch{ 329 caseSensitive: false, 330 fileName: true, 331 substrBytes: nm, 332 substrLowered: nm, 333 file: nextDoc, 334 runeOffset: 0, 335 byteOffset: 0, 336 byteMatchSz: uint32(len(nm)), 337 }) 338 } 339 340 if opts.ChunkMatches { 341 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 342 } else { 343 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 344 } 345 346 if opts.UseKeywordScoring { 347 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 348 } else { 349 // Use the standard, non-experimental scoring method by default 350 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 351 } 352 353 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 354 sortMatchesByScore(fileMatch.LineMatches) 355 sortChunkMatchesByScore(fileMatch.ChunkMatches) 356 if opts.Whole { 357 fileMatch.Content = cp.data(false) 358 } 359 360 matchedChunkRanges := 0 361 for _, cm := range fileMatch.ChunkMatches { 362 matchedChunkRanges += len(cm.Ranges) 363 } 364 365 repoMatchCount += len(fileMatch.LineMatches) 366 repoMatchCount += matchedChunkRanges 367 368 if opts.DebugScore { 369 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug) 370 } 371 372 res.Files = append(res.Files, fileMatch) 373 res.Stats.MatchCount += len(fileMatch.LineMatches) 374 res.Stats.MatchCount += matchedChunkRanges 375 res.Stats.FileCount++ 376 } 377 378 // We do not sort Files here, instead we rely on the shards pkg to do file 379 // ranking. If we sorted now, we would break the assumption that results 380 // from the same repo in a shard appear next to each other. 381 382 for _, md := range d.repoMetaData { 383 r := md 384 addRepo(&res, &r) 385 for _, v := range r.SubRepoMap { 386 addRepo(&res, v) 387 } 388 } 389 390 // Update stats based on work done during document search. 391 updateMatchTreeStats(mt, &res.Stats) 392 393 // If document ranking is enabled, then we can rank and truncate the files to save memory. 394 if opts.UseDocumentRanks { 395 res.Files = SortAndTruncateFiles(res.Files, opts) 396 } 397 398 res.Stats.MatchTreeSearch = timer.Elapsed() 399 400 return &res, nil 401} 402 403// scoreFile computes a score for the file match using various scoring signals, like 404// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 405func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 406 atomMatchCount := 0 407 visitMatches(mt, known, func(mt matchTree) { 408 atomMatchCount++ 409 }) 410 411 // atom-count boosts files with matches from more than 1 atom. The 412 // maximum boost is scoreFactorAtomMatch. 413 if atomMatchCount > 0 { 414 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 415 } 416 417 maxFileScore := 0.0 418 for i := range fileMatch.LineMatches { 419 if maxFileScore < fileMatch.LineMatches[i].Score { 420 maxFileScore = fileMatch.LineMatches[i].Score 421 } 422 423 // Order by ordering in file. 424 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 425 } 426 427 for i := range fileMatch.ChunkMatches { 428 if maxFileScore < fileMatch.ChunkMatches[i].Score { 429 maxFileScore = fileMatch.ChunkMatches[i].Score 430 } 431 432 // Order by ordering in file. 433 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 434 } 435 436 // Maintain ordering of input files. This 437 // strictly dominates the in-file ordering of 438 // the matches. 439 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 440 441 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 442 weight := scoreFileRankFactor 443 if opts.DocumentRanksWeight > 0.0 { 444 weight = opts.DocumentRanksWeight 445 } 446 447 ranks := d.ranks[doc] 448 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 449 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 450 // and we plan to clean this up. 451 if len(ranks) > 0 { 452 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 453 // cap it just in case to ensure it falls in the range [0, 1]. 454 normalized := math.Min(1.0, ranks[0]/32.0) 455 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 456 } 457 } 458 459 md := d.repoMetaData[d.repos[doc]] 460 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 461 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 462 463 if opts.DebugScore { 464 fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ") 465 } 466} 467 468// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 469// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 470// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 471// 472// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 473// since BM25 is not normalized and can be tricky to combine with other scoring signals. 474func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 475 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 476 // sensitivity and treat filenames and symbols the same as content. 477 termFreqs := map[string]int{} 478 for _, cand := range cands { 479 term := string(cand.substrLowered) 480 termFreqs[term]++ 481 } 482 483 // Compute the file length ratio. Usually the calculation would be based on terms, but using 484 // bytes should work fine, as we're just computing a ratio. 485 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 486 numFiles := len(d.boundaries) 487 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) 488 L := fileLength / averageFileLength 489 490 // Use standard parameter defaults (used in Lucene and academic papers) 491 k, b := 1.2, 0.75 492 sumTf := 0.0 // Just for debugging 493 score := 0.0 494 for _, freq := range termFreqs { 495 tf := float64(freq) 496 sumTf += tf 497 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) 498 } 499 500 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 501} 502 503func addRepo(res *SearchResult, repo *Repository) { 504 if res.RepoURLs == nil { 505 res.RepoURLs = map[string]string{} 506 } 507 res.RepoURLs[repo.Name] = repo.FileURLTemplate 508 509 if res.LineFragments == nil { 510 res.LineFragments = map[string]string{} 511 } 512 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 513} 514 515type sortByOffsetSlice []*candidateMatch 516 517func (m sortByOffsetSlice) Len() int { return len(m) } 518func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 519func (m sortByOffsetSlice) Less(i, j int) bool { 520 return m[i].byteOffset < m[j].byteOffset 521} 522 523// Gather matches from this document. This never returns a mixture of 524// filename/content matches: if there are content matches, all 525// filename matches are trimmed from the result. The matches are 526// returned in document order and are non-overlapping. 527// 528// If `merge` is set, overlapping and adjacent matches will be merged 529// into a single match. Otherwise, overlapping matches will be removed, 530// but adjacent matches will remain. 531func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 532 var cands []*candidateMatch 533 visitMatches(mt, known, func(mt matchTree) { 534 if smt, ok := mt.(*substrMatchTree); ok { 535 cands = append(cands, smt.current...) 536 } 537 if rmt, ok := mt.(*regexpMatchTree); ok { 538 cands = append(cands, rmt.found...) 539 } 540 if rmt, ok := mt.(*wordMatchTree); ok { 541 cands = append(cands, rmt.found...) 542 } 543 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 544 cands = append(cands, smt.found...) 545 } 546 }) 547 548 foundContentMatch := false 549 for _, c := range cands { 550 if !c.fileName { 551 foundContentMatch = true 552 break 553 } 554 } 555 556 res := cands[:0] 557 for _, c := range cands { 558 if !foundContentMatch || !c.fileName { 559 res = append(res, c) 560 } 561 } 562 cands = res 563 564 if merge { 565 // Merge adjacent candidates. This guarantees that the matches 566 // are non-overlapping. 567 sort.Sort((sortByOffsetSlice)(cands)) 568 res = cands[:0] 569 for i, c := range cands { 570 if i == 0 { 571 res = append(res, c) 572 continue 573 } 574 last := res[len(res)-1] 575 lastEnd := last.byteOffset + last.byteMatchSz 576 end := c.byteOffset + c.byteMatchSz 577 if lastEnd >= c.byteOffset { 578 if end > lastEnd { 579 last.byteMatchSz = end - last.byteOffset 580 } 581 continue 582 } 583 584 res = append(res, c) 585 } 586 } else { 587 // Remove overlapping candidates. This guarantees that the matches 588 // are non-overlapping, but also preserves expected match counts. 589 sort.Sort((sortByOffsetSlice)(cands)) 590 res = cands[:0] 591 for i, c := range cands { 592 if i == 0 { 593 res = append(res, c) 594 continue 595 } 596 last := res[len(res)-1] 597 lastEnd := last.byteOffset + last.byteMatchSz 598 if lastEnd > c.byteOffset { 599 continue 600 } 601 602 res = append(res, c) 603 } 604 } 605 606 return res 607} 608 609func (d *indexData) branchIndex(docID uint32) int { 610 mask := d.fileBranchMasks[docID] 611 idx := 0 612 for mask != 0 { 613 if mask&0x1 != 0 { 614 return idx 615 } 616 idx++ 617 mask >>= 1 618 } 619 return -1 620} 621 622// gatherBranches returns a list of branch names taking into account any branch 623// filters in the query. If the query contains a branch filter, it returns all 624// branches containing the docID and matching the branch filter. Otherwise, it 625// returns all branches containing docID. 626func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 627 var mask uint64 628 visitMatches(mt, known, func(mt matchTree) { 629 bq, ok := mt.(*branchQueryMatchTree) 630 if !ok { 631 return 632 } 633 634 mask = mask | bq.branchMask() 635 }) 636 637 if mask == 0 { 638 mask = d.fileBranchMasks[docID] 639 } 640 641 var branches []string 642 id := uint32(1) 643 branchNames := d.branchNames[d.repos[docID]] 644 for mask != 0 { 645 if mask&0x1 != 0 { 646 branches = append(branches, branchNames[uint(id)]) 647 } 648 id <<= 1 649 mask >>= 1 650 } 651 652 return branches 653} 654 655func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) { 656 var include func(rle *RepoListEntry) bool 657 658 q = d.simplify(q) 659 if c, ok := q.(*query.Const); ok { 660 if !c.Value { 661 return &RepoList{}, nil 662 } 663 include = func(rle *RepoListEntry) bool { 664 return true 665 } 666 } else { 667 sr, err := d.Search(ctx, q, &SearchOptions{ 668 ShardRepoMaxMatchCount: 1, 669 }) 670 if err != nil { 671 return nil, err 672 } 673 674 foundRepos := make(map[string]struct{}, len(sr.Files)) 675 for _, file := range sr.Files { 676 foundRepos[file.Repository] = struct{}{} 677 } 678 679 include = func(rle *RepoListEntry) bool { 680 _, ok := foundRepos[rle.Repository.Name] 681 return ok 682 } 683 } 684 685 var l RepoList 686 687 field, err := opts.GetField() 688 if err != nil { 689 return nil, err 690 } 691 switch field { 692 case RepoListFieldRepos: 693 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry)) 694 case RepoListFieldReposMap: 695 l.ReposMap = make(ReposMap, len(d.repoListEntry)) 696 } 697 698 for i := range d.repoListEntry { 699 if d.repoMetaData[i].Tombstone { 700 continue 701 } 702 rle := &d.repoListEntry[i] 703 if !include(rle) { 704 continue 705 } 706 707 l.Stats.Add(&rle.Stats) 708 709 // Backwards compat for when ID is missing 710 if rle.Repository.ID == 0 { 711 l.Repos = append(l.Repos, rle) 712 continue 713 } 714 715 switch field { 716 case RepoListFieldRepos: 717 l.Repos = append(l.Repos, rle) 718 case RepoListFieldReposMap: 719 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{ 720 HasSymbols: rle.Repository.HasSymbols, 721 Branches: rle.Repository.Branches, 722 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 723 } 724 } 725 726 } 727 728 // Only one of these fields is populated and in all cases the size of that 729 // field is the number of Repos in this shard. 730 l.Stats.Repos = len(l.Repos) + len(l.ReposMap) 731 732 return &l, nil 733} 734 735// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 736// mt is equivalent to the input r, isEqual = true and the matchTree can be used 737// in place of the regex r. If singleLine = true, then the matchTree and all 738// its children only match terms on the same line. singleLine is used during 739// recursion to decide whether to return an andLineMatchTree (singleLine = true) 740// or a andMatchTree (singleLine = false). 741func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 742 // TODO - we could perhaps transform Begin/EndText in '\n'? 743 // TODO - we could perhaps transform CharClass in (OrQuery ) 744 // if there are just a few runes, and part of a OpConcat? 745 switch r.Op { 746 case syntax.OpLiteral: 747 s := string(r.Rune) 748 if len(s) >= minTextSize { 749 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) 750 return mt, true, !strings.Contains(s, "\n"), err 751 } 752 case syntax.OpCapture: 753 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 754 755 case syntax.OpPlus: 756 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 757 758 case syntax.OpRepeat: 759 if r.Min == 1 { 760 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 761 } else if r.Min > 1 { 762 // (x){2,} can't be expressed precisely by the matchTree 763 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 764 return mt, false, singleLine, err 765 } 766 case syntax.OpConcat, syntax.OpAlternate: 767 var qs []matchTree 768 isEq := true 769 singleLine = true 770 for _, sr := range r.Sub { 771 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 772 if err != nil { 773 return nil, false, false, err 774 } 775 isEq = isEq && subIsEq 776 singleLine = singleLine && subSingleLine 777 qs = append(qs, sq) 778 } 779 } 780 if r.Op == syntax.OpConcat { 781 if len(qs) > 1 { 782 isEq = false 783 } 784 newQs := make([]matchTree, 0, len(qs)) 785 for _, q := range qs { 786 if _, ok := q.(*bruteForceMatchTree); ok { 787 continue 788 } 789 newQs = append(newQs, q) 790 } 791 if len(newQs) == 1 { 792 return newQs[0], isEq, singleLine, nil 793 } 794 if len(newQs) == 0 { 795 return &bruteForceMatchTree{}, isEq, singleLine, nil 796 } 797 if singleLine { 798 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 799 } 800 return &andMatchTree{newQs}, isEq, singleLine, nil 801 } 802 for _, q := range qs { 803 if _, ok := q.(*bruteForceMatchTree); ok { 804 return q, isEq, false, nil 805 } 806 } 807 if len(qs) == 0 { 808 return &noMatchTree{Why: "const"}, isEq, false, nil 809 } 810 return &orMatchTree{qs}, isEq, false, nil 811 case syntax.OpStar: 812 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 813 return &bruteForceMatchTree{}, false, true, nil 814 } 815 } 816 return &bruteForceMatchTree{}, false, false, nil 817} 818 819type timer struct { 820 last time.Time 821} 822 823func newTimer() *timer { 824 return &timer{ 825 last: time.Now(), 826 } 827} 828 829func (t *timer) Elapsed() time.Duration { 830 now := time.Now() 831 d := now.Sub(t.last) 832 t.last = now 833 return d 834}