fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "sort" 24 "strings" 25 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt/query" 30) 31 32const maxUInt16 = 0xffff 33 34func (m *FileMatch) addScore(what string, s float64, debugScore bool) { 35 if debugScore { 36 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s) 37 } 38 m.Score += s 39} 40 41func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { 42 if debugScore { 43 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) 44 } 45 m.Score += score 46} 47 48// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 49// repository names fulfill the predicate, Const(false) if none of them do, and q 50// otherwise. 51func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 52 count := 0 53 alive := len(d.repoMetaData) 54 for i := range d.repoMetaData { 55 if d.repoMetaData[i].Tombstone { 56 alive-- 57 } else if predicate(&d.repoMetaData[i]) { 58 count++ 59 } 60 } 61 if count == alive { 62 return &query.Const{Value: true} 63 } 64 if count > 0 { 65 return q 66 } 67 return &query.Const{Value: false} 68} 69 70func (d *indexData) simplify(in query.Q) query.Q { 71 eval := query.Map(in, func(q query.Q) query.Q { 72 switch r := q.(type) { 73 case *query.Repo: 74 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 75 return r.Regexp.MatchString(repo.Name) 76 }) 77 case *query.RepoRegexp: 78 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 79 return r.Regexp.MatchString(repo.Name) 80 }) 81 case *query.BranchesRepos: 82 for i := range d.repoMetaData { 83 for _, br := range r.List { 84 if br.Repos.Contains(d.repoMetaData[i].ID) { 85 return q 86 } 87 } 88 } 89 return &query.Const{Value: false} 90 case *query.RepoSet: 91 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 92 return r.Set[repo.Name] 93 }) 94 case *query.RepoIDs: 95 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 96 return r.Repos.Contains(repo.ID) 97 }) 98 case *query.Language: 99 _, has := d.metaData.LanguageMap[r.Language] 100 if !has && d.metaData.IndexFeatureVersion < 12 { 101 // For index files that haven't been re-indexed by go-enry, 102 // fall back to file-based matching and continue even if this 103 // repo doesn't have the specific language present. 104 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 105 if extsForLang != nil { 106 extFrags := make([]string, 0, len(extsForLang)) 107 for _, ext := range extsForLang { 108 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 109 } 110 if len(extFrags) > 0 { 111 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 112 // inlined copy of query.regexpQuery 113 re, err := syntax.Parse(pattern, syntax.Perl) 114 if err != nil { 115 return &query.Const{Value: false} 116 } 117 if re.Op == syntax.OpLiteral { 118 return &query.Substring{ 119 Pattern: string(re.Rune), 120 FileName: true, 121 } 122 } 123 return &query.Regexp{ 124 Regexp: re, 125 FileName: true, 126 } 127 } 128 } 129 } 130 if !has { 131 return &query.Const{Value: false} 132 } 133 } 134 return q 135 }) 136 return query.Simplify(eval) 137} 138 139func (o *SearchOptions) SetDefaults() { 140 if o.ShardMaxMatchCount == 0 { 141 // We cap the total number of matches, so overly broad 142 // searches don't crash the machine. 143 o.ShardMaxMatchCount = 100000 144 } 145 if o.TotalMaxMatchCount == 0 { 146 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 147 } 148} 149 150func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { 151 copyOpts := *opts 152 opts = &copyOpts 153 opts.SetDefaults() 154 155 var res SearchResult 156 if len(d.fileNameIndex) == 0 { 157 return &res, nil 158 } 159 160 select { 161 case <-ctx.Done(): 162 res.Stats.ShardsSkipped++ 163 return &res, nil 164 default: 165 } 166 167 q = d.simplify(q) 168 if c, ok := q.(*query.Const); ok && !c.Value { 169 return &res, nil 170 } 171 172 if opts.EstimateDocCount { 173 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 174 return &res, nil 175 } 176 177 q = query.Map(q, query.ExpandFileContent) 178 179 mt, err := d.newMatchTree(q, matchTreeOpt{}) 180 if err != nil { 181 return nil, err 182 } 183 184 mt, err = pruneMatchTree(mt) 185 if err != nil { 186 return nil, err 187 } 188 if mt == nil { 189 res.Stats.ShardsSkippedFilter++ 190 return &res, nil 191 } 192 193 res.Stats.ShardsScanned++ 194 195 cp := &contentProvider{ 196 id: d, 197 stats: &res.Stats, 198 } 199 200 // Track the number of documents found in a repository for 201 // ShardRepoMaxMatchCount 202 var ( 203 lastRepoID uint16 204 repoMatchCount int 205 ) 206 207 docCount := uint32(len(d.fileBranchMasks)) 208 lastDoc := int(-1) 209 210nextFileMatch: 211 for { 212 canceled := false 213 select { 214 case <-ctx.Done(): 215 canceled = true 216 default: 217 } 218 219 nextDoc := mt.nextDoc() 220 if int(nextDoc) <= lastDoc { 221 nextDoc = uint32(lastDoc + 1) 222 } 223 224 for ; nextDoc < docCount; nextDoc++ { 225 repoID := d.repos[nextDoc] 226 repoMetadata := &d.repoMetaData[repoID] 227 228 // Skip tombstoned repositories 229 if repoMetadata.Tombstone { 230 continue 231 } 232 233 // Skip documents that are tombstoned 234 if len(repoMetadata.FileTombstones) > 0 { 235 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 236 continue 237 } 238 } 239 240 // Skip documents over ShardRepoMaxMatchCount if specified. 241 if opts.ShardRepoMaxMatchCount > 0 { 242 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 243 res.Stats.FilesSkipped++ 244 continue 245 } 246 } 247 248 break 249 } 250 251 if nextDoc >= docCount { 252 break 253 } 254 255 lastDoc = int(nextDoc) 256 257 // We track lastRepoID for ShardRepoMaxMatchCount 258 if lastRepoID != d.repos[nextDoc] { 259 lastRepoID = d.repos[nextDoc] 260 repoMatchCount = 0 261 } 262 263 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 264 res.Stats.FilesSkipped += int(docCount - nextDoc) 265 break 266 } 267 268 res.Stats.FilesConsidered++ 269 mt.prepare(nextDoc) 270 271 cp.setDocument(nextDoc) 272 273 known := make(map[matchTree]bool) 274 md := d.repoMetaData[d.repos[nextDoc]] 275 276 for cost := costMin; cost <= costMax; cost++ { 277 v, ok := mt.matches(cp, cost, known) 278 if ok && !v { 279 continue nextFileMatch 280 } 281 282 if cost == costMax && !ok { 283 log.Panicf("did not decide. Repo %s, doc %d, known %v", 284 md.Name, nextDoc, known) 285 } 286 } 287 288 fileMatch := FileMatch{ 289 Repository: md.Name, 290 RepositoryID: md.ID, 291 RepositoryPriority: md.priority, 292 FileName: string(d.fileName(nextDoc)), 293 Checksum: d.getChecksum(nextDoc), 294 Language: d.languageMap[d.getLanguage(nextDoc)], 295 } 296 297 if s := d.subRepos[nextDoc]; s > 0 { 298 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 299 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 300 } 301 path := d.subRepoPaths[d.repos[nextDoc]][s] 302 fileMatch.SubRepositoryPath = path 303 sr := md.SubRepoMap[path] 304 fileMatch.SubRepositoryName = sr.Name 305 if idx := d.branchIndex(nextDoc); idx >= 0 { 306 fileMatch.Version = sr.Branches[idx].Version 307 } 308 } else { 309 idx := d.branchIndex(nextDoc) 310 if idx >= 0 { 311 fileMatch.Version = md.Branches[idx].Version 312 } 313 } 314 315 shouldMergeMatches := !opts.ChunkMatches 316 finalCands := gatherMatches(mt, known, shouldMergeMatches) 317 318 if len(finalCands) == 0 { 319 nm := d.fileName(nextDoc) 320 finalCands = append(finalCands, 321 &candidateMatch{ 322 caseSensitive: false, 323 fileName: true, 324 substrBytes: nm, 325 substrLowered: nm, 326 file: nextDoc, 327 runeOffset: 0, 328 byteOffset: 0, 329 byteMatchSz: uint32(len(nm)), 330 }) 331 } 332 333 if opts.ChunkMatches { 334 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 335 } else { 336 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 337 } 338 339 if opts.UseKeywordScoring { 340 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 341 } else { 342 // Use the standard, non-experimental scoring method by default 343 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 344 } 345 346 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 347 sortMatchesByScore(fileMatch.LineMatches) 348 sortChunkMatchesByScore(fileMatch.ChunkMatches) 349 if opts.Whole { 350 fileMatch.Content = cp.data(false) 351 } 352 353 matchedChunkRanges := 0 354 for _, cm := range fileMatch.ChunkMatches { 355 matchedChunkRanges += len(cm.Ranges) 356 } 357 358 repoMatchCount += len(fileMatch.LineMatches) 359 repoMatchCount += matchedChunkRanges 360 361 if opts.DebugScore { 362 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug) 363 } 364 365 res.Files = append(res.Files, fileMatch) 366 res.Stats.MatchCount += len(fileMatch.LineMatches) 367 res.Stats.MatchCount += matchedChunkRanges 368 res.Stats.FileCount++ 369 } 370 371 // We do not sort Files here, instead we rely on the shards pkg to do file 372 // ranking. If we sorted now, we would break the assumption that results 373 // from the same repo in a shard appear next to each other. 374 375 for _, md := range d.repoMetaData { 376 r := md 377 addRepo(&res, &r) 378 for _, v := range r.SubRepoMap { 379 addRepo(&res, v) 380 } 381 } 382 383 visitMatchTree(mt, func(mt matchTree) { 384 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 385 atom.updateStats(&res.Stats) 386 } 387 }) 388 389 // If document ranking is enabled, then we can rank and truncate the files to save memory. 390 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) { 391 SortFiles(res.Files) 392 res.Files = res.Files[:limit] 393 } 394 395 return &res, nil 396} 397 398// scoreFile computes a score for the file match using various scoring signals, like 399// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 400func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 401 atomMatchCount := 0 402 visitMatches(mt, known, func(mt matchTree) { 403 atomMatchCount++ 404 }) 405 406 // atom-count boosts files with matches from more than 1 atom. The 407 // maximum boost is scoreFactorAtomMatch. 408 if atomMatchCount > 0 { 409 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 410 } 411 412 maxFileScore := 0.0 413 repetitions := 0 414 for i := range fileMatch.LineMatches { 415 if maxFileScore < fileMatch.LineMatches[i].Score { 416 maxFileScore = fileMatch.LineMatches[i].Score 417 repetitions = 0 418 } else if maxFileScore == fileMatch.LineMatches[i].Score { 419 repetitions += 1 420 } 421 422 // Order by ordering in file. 423 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 424 } 425 426 for i := range fileMatch.ChunkMatches { 427 if maxFileScore < fileMatch.ChunkMatches[i].Score { 428 maxFileScore = fileMatch.ChunkMatches[i].Score 429 } 430 431 // Order by ordering in file. 432 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 433 } 434 435 // Maintain ordering of input files. This 436 // strictly dominates the in-file ordering of 437 // the matches. 438 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 439 440 // Prefer docs with several top-scored matches. 441 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 442 443 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 444 weight := scoreFileRankFactor 445 if opts.DocumentRanksWeight > 0.0 { 446 weight = opts.DocumentRanksWeight 447 } 448 449 ranks := d.ranks[doc] 450 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 451 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 452 // and we plan to clean this up. 453 if len(ranks) > 0 { 454 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 455 // cap it just in case to ensure it falls in the range [0, 1]. 456 normalized := math.Min(1.0, ranks[0]/32.0) 457 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 458 } 459 } 460 461 md := d.repoMetaData[d.repos[doc]] 462 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 463 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 464} 465 466// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 467// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 468// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 469// 470// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 471// since BM25 is not normalized and can be tricky to combine with other scoring signals. 472func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 473 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 474 // sensitivity and treat filenames and symbols the same as content. 475 termFreqs := map[string]int{} 476 for _, cand := range cands { 477 term := string(cand.substrLowered) 478 termFreqs[term]++ 479 } 480 481 // Compute the file length ratio. Usually the calculation would be based on terms, but using 482 // bytes should work fine, as we're just computing a ratio. 483 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 484 numFiles := len(d.boundaries) 485 averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles) 486 L := fileLength / averageFileLength 487 488 // Use standard parameter defaults (used in Lucene and academic papers) 489 k, b := 1.2, 0.75 490 sumTf := 0.0 // Just for debugging 491 score := 0.0 492 for _, freq := range termFreqs { 493 tf := float64(freq) 494 sumTf += tf 495 score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf) 496 } 497 498 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) 499} 500 501func addRepo(res *SearchResult, repo *Repository) { 502 if res.RepoURLs == nil { 503 res.RepoURLs = map[string]string{} 504 } 505 res.RepoURLs[repo.Name] = repo.FileURLTemplate 506 507 if res.LineFragments == nil { 508 res.LineFragments = map[string]string{} 509 } 510 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 511} 512 513type sortByOffsetSlice []*candidateMatch 514 515func (m sortByOffsetSlice) Len() int { return len(m) } 516func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 517func (m sortByOffsetSlice) Less(i, j int) bool { 518 return m[i].byteOffset < m[j].byteOffset 519} 520 521// Gather matches from this document. This never returns a mixture of 522// filename/content matches: if there are content matches, all 523// filename matches are trimmed from the result. The matches are 524// returned in document order and are non-overlapping. 525// 526// If `merge` is set, overlapping and adjacent matches will be merged 527// into a single match. Otherwise, overlapping matches will be removed, 528// but adjacent matches will remain. 529func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 530 var cands []*candidateMatch 531 visitMatches(mt, known, func(mt matchTree) { 532 if smt, ok := mt.(*substrMatchTree); ok { 533 cands = append(cands, smt.current...) 534 } 535 if rmt, ok := mt.(*regexpMatchTree); ok { 536 cands = append(cands, rmt.found...) 537 } 538 if rmt, ok := mt.(*wordMatchTree); ok { 539 cands = append(cands, rmt.found...) 540 } 541 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 542 cands = append(cands, smt.found...) 543 } 544 }) 545 546 foundContentMatch := false 547 for _, c := range cands { 548 if !c.fileName { 549 foundContentMatch = true 550 break 551 } 552 } 553 554 res := cands[:0] 555 for _, c := range cands { 556 if !foundContentMatch || !c.fileName { 557 res = append(res, c) 558 } 559 } 560 cands = res 561 562 if merge { 563 // Merge adjacent candidates. This guarantees that the matches 564 // are non-overlapping. 565 sort.Sort((sortByOffsetSlice)(cands)) 566 res = cands[:0] 567 for i, c := range cands { 568 if i == 0 { 569 res = append(res, c) 570 continue 571 } 572 last := res[len(res)-1] 573 lastEnd := last.byteOffset + last.byteMatchSz 574 end := c.byteOffset + c.byteMatchSz 575 if lastEnd >= c.byteOffset { 576 if end > lastEnd { 577 last.byteMatchSz = end - last.byteOffset 578 } 579 continue 580 } 581 582 res = append(res, c) 583 } 584 } else { 585 // Remove overlapping candidates. This guarantees that the matches 586 // are non-overlapping, but also preserves expected match counts. 587 sort.Sort((sortByOffsetSlice)(cands)) 588 res = cands[:0] 589 for i, c := range cands { 590 if i == 0 { 591 res = append(res, c) 592 continue 593 } 594 last := res[len(res)-1] 595 lastEnd := last.byteOffset + last.byteMatchSz 596 if lastEnd > c.byteOffset { 597 continue 598 } 599 600 res = append(res, c) 601 } 602 } 603 604 return res 605} 606 607func (d *indexData) branchIndex(docID uint32) int { 608 mask := d.fileBranchMasks[docID] 609 idx := 0 610 for mask != 0 { 611 if mask&0x1 != 0 { 612 return idx 613 } 614 idx++ 615 mask >>= 1 616 } 617 return -1 618} 619 620// gatherBranches returns a list of branch names taking into account any branch 621// filters in the query. If the query contains a branch filter, it returns all 622// branches containing the docID and matching the branch filter. Otherwise, it 623// returns all branches containing docID. 624func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 625 var mask uint64 626 visitMatches(mt, known, func(mt matchTree) { 627 bq, ok := mt.(*branchQueryMatchTree) 628 if !ok { 629 return 630 } 631 632 mask = mask | bq.branchMask() 633 }) 634 635 if mask == 0 { 636 mask = d.fileBranchMasks[docID] 637 } 638 639 var branches []string 640 id := uint32(1) 641 branchNames := d.branchNames[d.repos[docID]] 642 for mask != 0 { 643 if mask&0x1 != 0 { 644 branches = append(branches, branchNames[uint(id)]) 645 } 646 id <<= 1 647 mask >>= 1 648 } 649 650 return branches 651} 652 653func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) { 654 var include func(rle *RepoListEntry) bool 655 656 q = d.simplify(q) 657 if c, ok := q.(*query.Const); ok { 658 if !c.Value { 659 return &RepoList{}, nil 660 } 661 include = func(rle *RepoListEntry) bool { 662 return true 663 } 664 } else { 665 sr, err := d.Search(ctx, q, &SearchOptions{ 666 ShardRepoMaxMatchCount: 1, 667 }) 668 if err != nil { 669 return nil, err 670 } 671 672 foundRepos := make(map[string]struct{}, len(sr.Files)) 673 for _, file := range sr.Files { 674 foundRepos[file.Repository] = struct{}{} 675 } 676 677 include = func(rle *RepoListEntry) bool { 678 _, ok := foundRepos[rle.Repository.Name] 679 return ok 680 } 681 } 682 683 var l RepoList 684 685 field, err := opts.GetField() 686 if err != nil { 687 return nil, err 688 } 689 switch field { 690 case RepoListFieldRepos: 691 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry)) 692 case RepoListFieldMinimal: 693 l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry)) 694 case RepoListFieldReposMap: 695 l.ReposMap = make(ReposMap, len(d.repoListEntry)) 696 } 697 698 for i := range d.repoListEntry { 699 if d.repoMetaData[i].Tombstone { 700 continue 701 } 702 rle := &d.repoListEntry[i] 703 if !include(rle) { 704 continue 705 } 706 707 l.Stats.Add(&rle.Stats) 708 709 // Backwards compat for when ID is missing 710 if rle.Repository.ID == 0 { 711 l.Repos = append(l.Repos, rle) 712 continue 713 } 714 715 switch field { 716 case RepoListFieldRepos: 717 l.Repos = append(l.Repos, rle) 718 case RepoListFieldMinimal: 719 l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{ 720 HasSymbols: rle.Repository.HasSymbols, 721 Branches: rle.Repository.Branches, 722 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 723 } 724 case RepoListFieldReposMap: 725 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{ 726 HasSymbols: rle.Repository.HasSymbols, 727 Branches: rle.Repository.Branches, 728 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(), 729 } 730 } 731 732 } 733 734 // Only one of these fields is populated and in all cases the size of that 735 // field is the number of Repos in this shard. 736 l.Stats.Repos = len(l.Repos) + len(l.Minimal) + len(l.ReposMap) 737 738 return &l, nil 739} 740 741// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 742// mt is equivalent to the input r, isEqual = true and the matchTree can be used 743// in place of the regex r. If singleLine = true, then the matchTree and all 744// its children only match terms on the same line. singleLine is used during 745// recursion to decide whether to return an andLineMatchTree (singleLine = true) 746// or a andMatchTree (singleLine = false). 747func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 748 // TODO - we could perhaps transform Begin/EndText in '\n'? 749 // TODO - we could perhaps transform CharClass in (OrQuery ) 750 // if there are just a few runes, and part of a OpConcat? 751 switch r.Op { 752 case syntax.OpLiteral: 753 s := string(r.Rune) 754 if len(s) >= minTextSize { 755 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) 756 return mt, true, !strings.Contains(s, "\n"), err 757 } 758 case syntax.OpCapture: 759 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 760 761 case syntax.OpPlus: 762 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 763 764 case syntax.OpRepeat: 765 if r.Min == 1 { 766 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 767 } else if r.Min > 1 { 768 // (x){2,} can't be expressed precisely by the matchTree 769 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 770 return mt, false, singleLine, err 771 } 772 case syntax.OpConcat, syntax.OpAlternate: 773 var qs []matchTree 774 isEq := true 775 singleLine = true 776 for _, sr := range r.Sub { 777 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 778 if err != nil { 779 return nil, false, false, err 780 } 781 isEq = isEq && subIsEq 782 singleLine = singleLine && subSingleLine 783 qs = append(qs, sq) 784 } 785 } 786 if r.Op == syntax.OpConcat { 787 if len(qs) > 1 { 788 isEq = false 789 } 790 newQs := make([]matchTree, 0, len(qs)) 791 for _, q := range qs { 792 if _, ok := q.(*bruteForceMatchTree); ok { 793 continue 794 } 795 newQs = append(newQs, q) 796 } 797 if len(newQs) == 1 { 798 return newQs[0], isEq, singleLine, nil 799 } 800 if len(newQs) == 0 { 801 return &bruteForceMatchTree{}, isEq, singleLine, nil 802 } 803 if singleLine { 804 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 805 } 806 return &andMatchTree{newQs}, isEq, singleLine, nil 807 } 808 for _, q := range qs { 809 if _, ok := q.(*bruteForceMatchTree); ok { 810 return q, isEq, false, nil 811 } 812 } 813 if len(qs) == 0 { 814 return &noMatchTree{"const"}, isEq, false, nil 815 } 816 return &orMatchTree{qs}, isEq, false, nil 817 case syntax.OpStar: 818 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 819 return &bruteForceMatchTree{}, false, true, nil 820 } 821 } 822 return &bruteForceMatchTree{}, false, false, nil 823}