fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "math" 22 "regexp/syntax" 23 "sort" 24 "strings" 25 26 enry_data "github.com/go-enry/go-enry/v2/data" 27 "github.com/grafana/regexp" 28 29 "github.com/sourcegraph/zoekt/query" 30) 31 32const maxUInt16 = 0xffff 33 34func (m *FileMatch) addScore(what string, s float64, debugScore bool) { 35 if debugScore { 36 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s) 37 } 38 m.Score += s 39} 40 41// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all 42// repository names fulfill the predicate, Const(false) if none of them do, and q 43// otherwise. 44func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q { 45 count := 0 46 alive := len(d.repoMetaData) 47 for i := range d.repoMetaData { 48 if d.repoMetaData[i].Tombstone { 49 alive-- 50 } else if predicate(&d.repoMetaData[i]) { 51 count++ 52 } 53 } 54 if count == alive { 55 return &query.Const{Value: true} 56 } 57 if count > 0 { 58 return q 59 } 60 return &query.Const{Value: false} 61} 62 63func (d *indexData) simplify(in query.Q) query.Q { 64 eval := query.Map(in, func(q query.Q) query.Q { 65 switch r := q.(type) { 66 case *query.Repo: 67 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 68 return r.Regexp.MatchString(repo.Name) 69 }) 70 case *query.RepoRegexp: 71 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 72 return r.Regexp.MatchString(repo.Name) 73 }) 74 case *query.BranchesRepos: 75 for i := range d.repoMetaData { 76 for _, br := range r.List { 77 if br.Repos.Contains(d.repoMetaData[i].ID) { 78 return q 79 } 80 } 81 } 82 return &query.Const{Value: false} 83 case *query.RepoSet: 84 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 85 return r.Set[repo.Name] 86 }) 87 case *query.RepoIDs: 88 return d.simplifyMultiRepo(q, func(repo *Repository) bool { 89 return r.Repos.Contains(repo.ID) 90 }) 91 case *query.Language: 92 _, has := d.metaData.LanguageMap[r.Language] 93 if !has && d.metaData.IndexFeatureVersion < 12 { 94 // For index files that haven't been re-indexed by go-enry, 95 // fall back to file-based matching and continue even if this 96 // repo doesn't have the specific language present. 97 extsForLang := enry_data.ExtensionsByLanguage[r.Language] 98 if extsForLang != nil { 99 extFrags := make([]string, 0, len(extsForLang)) 100 for _, ext := range extsForLang { 101 extFrags = append(extFrags, regexp.QuoteMeta(ext)) 102 } 103 if len(extFrags) > 0 { 104 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|")) 105 // inlined copy of query.regexpQuery 106 re, err := syntax.Parse(pattern, syntax.Perl) 107 if err != nil { 108 return &query.Const{Value: false} 109 } 110 if re.Op == syntax.OpLiteral { 111 return &query.Substring{ 112 Pattern: string(re.Rune), 113 FileName: true, 114 } 115 } 116 return &query.Regexp{ 117 Regexp: re, 118 FileName: true, 119 } 120 } 121 } 122 } 123 if !has { 124 return &query.Const{Value: false} 125 } 126 } 127 return q 128 }) 129 return query.Simplify(eval) 130} 131 132func (o *SearchOptions) SetDefaults() { 133 if o.ShardMaxMatchCount == 0 { 134 // We cap the total number of matches, so overly broad 135 // searches don't crash the machine. 136 o.ShardMaxMatchCount = 100000 137 } 138 if o.TotalMaxMatchCount == 0 { 139 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 140 } 141} 142 143func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) { 144 copyOpts := *opts 145 opts = &copyOpts 146 opts.SetDefaults() 147 148 var res SearchResult 149 if len(d.fileNameIndex) == 0 { 150 return &res, nil 151 } 152 153 select { 154 case <-ctx.Done(): 155 res.Stats.ShardsSkipped++ 156 return &res, nil 157 default: 158 } 159 160 q = d.simplify(q) 161 if c, ok := q.(*query.Const); ok && !c.Value { 162 return &res, nil 163 } 164 165 if opts.EstimateDocCount { 166 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks) 167 return &res, nil 168 } 169 170 q = query.Map(q, query.ExpandFileContent) 171 172 mt, err := d.newMatchTree(q, matchTreeOpt{}) 173 if err != nil { 174 return nil, err 175 } 176 177 mt, err = pruneMatchTree(mt) 178 if err != nil { 179 return nil, err 180 } 181 if mt == nil { 182 res.Stats.ShardsSkippedFilter++ 183 return &res, nil 184 } 185 186 res.Stats.ShardsScanned++ 187 188 cp := &contentProvider{ 189 id: d, 190 stats: &res.Stats, 191 } 192 193 // Track the number of documents found in a repository for 194 // ShardRepoMaxMatchCount 195 var ( 196 lastRepoID uint16 197 repoMatchCount int 198 ) 199 200 docCount := uint32(len(d.fileBranchMasks)) 201 lastDoc := int(-1) 202 203nextFileMatch: 204 for { 205 canceled := false 206 select { 207 case <-ctx.Done(): 208 canceled = true 209 default: 210 } 211 212 nextDoc := mt.nextDoc() 213 if int(nextDoc) <= lastDoc { 214 nextDoc = uint32(lastDoc + 1) 215 } 216 217 for ; nextDoc < docCount; nextDoc++ { 218 repoID := d.repos[nextDoc] 219 repoMetadata := &d.repoMetaData[repoID] 220 221 // Skip tombstoned repositories 222 if repoMetadata.Tombstone { 223 continue 224 } 225 226 // Skip documents that are tombstoned 227 if len(repoMetadata.FileTombstones) > 0 { 228 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned { 229 continue 230 } 231 } 232 233 // Skip documents over ShardRepoMaxMatchCount if specified. 234 if opts.ShardRepoMaxMatchCount > 0 { 235 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID { 236 res.Stats.FilesSkipped++ 237 continue 238 } 239 } 240 241 break 242 } 243 244 if nextDoc >= docCount { 245 break 246 } 247 248 lastDoc = int(nextDoc) 249 250 // We track lastRepoID for ShardRepoMaxMatchCount 251 if lastRepoID != d.repos[nextDoc] { 252 lastRepoID = d.repos[nextDoc] 253 repoMatchCount = 0 254 } 255 256 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) { 257 res.Stats.FilesSkipped += int(docCount - nextDoc) 258 break 259 } 260 261 res.Stats.FilesConsidered++ 262 mt.prepare(nextDoc) 263 264 cp.setDocument(nextDoc) 265 266 known := make(map[matchTree]bool) 267 md := d.repoMetaData[d.repos[nextDoc]] 268 269 for cost := costMin; cost <= costMax; cost++ { 270 v, ok := mt.matches(cp, cost, known) 271 if ok && !v { 272 continue nextFileMatch 273 } 274 275 if cost == costMax && !ok { 276 log.Panicf("did not decide. Repo %s, doc %d, known %v", 277 md.Name, nextDoc, known) 278 } 279 } 280 281 fileMatch := FileMatch{ 282 Repository: md.Name, 283 RepositoryID: md.ID, 284 RepositoryPriority: md.priority, 285 FileName: string(d.fileName(nextDoc)), 286 Checksum: d.getChecksum(nextDoc), 287 Language: d.languageMap[d.getLanguage(nextDoc)], 288 } 289 290 if s := d.subRepos[nextDoc]; s > 0 { 291 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) { 292 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths) 293 } 294 path := d.subRepoPaths[d.repos[nextDoc]][s] 295 fileMatch.SubRepositoryPath = path 296 sr := md.SubRepoMap[path] 297 fileMatch.SubRepositoryName = sr.Name 298 if idx := d.branchIndex(nextDoc); idx >= 0 { 299 fileMatch.Version = sr.Branches[idx].Version 300 } 301 } else { 302 idx := d.branchIndex(nextDoc) 303 if idx >= 0 { 304 fileMatch.Version = md.Branches[idx].Version 305 } 306 } 307 308 shouldMergeMatches := !opts.ChunkMatches 309 finalCands := gatherMatches(mt, known, shouldMergeMatches) 310 311 if len(finalCands) == 0 { 312 nm := d.fileName(nextDoc) 313 finalCands = append(finalCands, 314 &candidateMatch{ 315 caseSensitive: false, 316 fileName: true, 317 substrBytes: nm, 318 substrLowered: nm, 319 file: nextDoc, 320 runeOffset: 0, 321 byteOffset: 0, 322 byteMatchSz: uint32(len(nm)), 323 }) 324 } 325 326 if opts.ChunkMatches { 327 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 328 } else { 329 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 330 } 331 332 if opts.UseKeywordScoring { 333 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) 334 } else { 335 // Use the standard, non-experimental scoring method by default 336 d.scoreFile(&fileMatch, nextDoc, mt, known, opts) 337 } 338 339 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 340 sortMatchesByScore(fileMatch.LineMatches) 341 sortChunkMatchesByScore(fileMatch.ChunkMatches) 342 if opts.Whole { 343 fileMatch.Content = cp.data(false) 344 } 345 346 matchedChunkRanges := 0 347 for _, cm := range fileMatch.ChunkMatches { 348 matchedChunkRanges += len(cm.Ranges) 349 } 350 351 repoMatchCount += len(fileMatch.LineMatches) 352 repoMatchCount += matchedChunkRanges 353 354 if opts.DebugScore { 355 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug) 356 } 357 358 res.Files = append(res.Files, fileMatch) 359 res.Stats.MatchCount += len(fileMatch.LineMatches) 360 res.Stats.MatchCount += matchedChunkRanges 361 res.Stats.FileCount++ 362 } 363 364 // We do not sort Files here, instead we rely on the shards pkg to do file 365 // ranking. If we sorted now, we would break the assumption that results 366 // from the same repo in a shard appear next to each other. 367 368 for _, md := range d.repoMetaData { 369 r := md 370 addRepo(&res, &r) 371 for _, v := range r.SubRepoMap { 372 addRepo(&res, v) 373 } 374 } 375 376 visitMatchTree(mt, func(mt matchTree) { 377 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok { 378 atom.updateStats(&res.Stats) 379 } 380 }) 381 382 // If document ranking is enabled, then we can rank and truncate the files to save memory. 383 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) { 384 SortFiles(res.Files) 385 res.Files = res.Files[:limit] 386 } 387 388 return &res, nil 389} 390 391// scoreFile computes a score for the file match using various scoring signals, like 392// whether there's an exact match on a symbol, the number of query clauses that matched, etc. 393func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 394 atomMatchCount := 0 395 visitMatches(mt, known, func(mt matchTree) { 396 atomMatchCount++ 397 }) 398 399 // atom-count boosts files with matches from more than 1 atom. The 400 // maximum boost is scoreFactorAtomMatch. 401 if atomMatchCount > 0 { 402 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 403 } 404 405 maxFileScore := 0.0 406 repetitions := 0 407 for i := range fileMatch.LineMatches { 408 if maxFileScore < fileMatch.LineMatches[i].Score { 409 maxFileScore = fileMatch.LineMatches[i].Score 410 repetitions = 0 411 } else if maxFileScore == fileMatch.LineMatches[i].Score { 412 repetitions += 1 413 } 414 415 // Order by ordering in file. 416 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 417 } 418 419 for i := range fileMatch.ChunkMatches { 420 if maxFileScore < fileMatch.ChunkMatches[i].Score { 421 maxFileScore = fileMatch.ChunkMatches[i].Score 422 } 423 424 // Order by ordering in file. 425 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 426 } 427 428 // Maintain ordering of input files. This 429 // strictly dominates the in-file ordering of 430 // the matches. 431 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 432 433 // Prefer docs with several top-scored matches. 434 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 435 436 if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 437 weight := scoreFileRankFactor 438 if opts.DocumentRanksWeight > 0.0 { 439 weight = opts.DocumentRanksWeight 440 } 441 442 ranks := d.ranks[doc] 443 // The ranks slice always contains one entry representing the file rank (unless it's empty since the 444 // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 445 // and we plan to clean this up. 446 if len(ranks) > 0 { 447 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 448 // cap it just in case to ensure it falls in the range [0, 1]. 449 normalized := math.Min(1.0, ranks[0]/32.0) 450 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 451 } 452 } 453 454 md := d.repoMetaData[d.repos[doc]] 455 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 456 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 457} 458 459// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring 460// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula 461// except inverse document frequency (idf), since we don't have access to global term frequency statistics. 462// 463// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, 464// since BM25 is not normalized and can be tricky to combine with other scoring signals. 465func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { 466 // Treat each candidate match as a term and compute the frequencies. For now, ignore case 467 // sensitivity and treat filenames and symbols the same as content. 468 termFreqs := map[string]int{} 469 for _, cand := range cands { 470 term := string(cand.substrLowered) 471 termFreqs[term]++ 472 } 473 474 // Compute the file length ratio. Usually the calculation would be based on terms, but using 475 // bytes should work fine, as we're just computing a ratio. 476 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) 477 numFiles := len(d.boundaries) 478 averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles) 479 L := fileLength / averageFileLength 480 481 // Use standard parameter defaults (used in Lucene and academic papers) 482 k, b := 1.2, 0.75 483 score := 0.0 484 for _, freq := range termFreqs { 485 tf := float64(freq) 486 score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf) 487 } 488 489 fileMatch.addScore("keyword-score", score, opts.DebugScore) 490} 491 492func addRepo(res *SearchResult, repo *Repository) { 493 if res.RepoURLs == nil { 494 res.RepoURLs = map[string]string{} 495 } 496 res.RepoURLs[repo.Name] = repo.FileURLTemplate 497 498 if res.LineFragments == nil { 499 res.LineFragments = map[string]string{} 500 } 501 res.LineFragments[repo.Name] = repo.LineFragmentTemplate 502} 503 504type sortByOffsetSlice []*candidateMatch 505 506func (m sortByOffsetSlice) Len() int { return len(m) } 507func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 508func (m sortByOffsetSlice) Less(i, j int) bool { 509 return m[i].byteOffset < m[j].byteOffset 510} 511 512// Gather matches from this document. This never returns a mixture of 513// filename/content matches: if there are content matches, all 514// filename matches are trimmed from the result. The matches are 515// returned in document order and are non-overlapping. 516// 517// If `merge` is set, overlapping and adjacent matches will be merged 518// into a single match. Otherwise, overlapping matches will be removed, 519// but adjacent matches will remain. 520func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 521 var cands []*candidateMatch 522 visitMatches(mt, known, func(mt matchTree) { 523 if smt, ok := mt.(*substrMatchTree); ok { 524 cands = append(cands, smt.current...) 525 } 526 if rmt, ok := mt.(*regexpMatchTree); ok { 527 cands = append(cands, rmt.found...) 528 } 529 if rmt, ok := mt.(*wordMatchTree); ok { 530 cands = append(cands, rmt.found...) 531 } 532 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 533 cands = append(cands, smt.found...) 534 } 535 }) 536 537 foundContentMatch := false 538 for _, c := range cands { 539 if !c.fileName { 540 foundContentMatch = true 541 break 542 } 543 } 544 545 res := cands[:0] 546 for _, c := range cands { 547 if !foundContentMatch || !c.fileName { 548 res = append(res, c) 549 } 550 } 551 cands = res 552 553 if merge { 554 // Merge adjacent candidates. This guarantees that the matches 555 // are non-overlapping. 556 sort.Sort((sortByOffsetSlice)(cands)) 557 res = cands[:0] 558 for i, c := range cands { 559 if i == 0 { 560 res = append(res, c) 561 continue 562 } 563 last := res[len(res)-1] 564 lastEnd := last.byteOffset + last.byteMatchSz 565 end := c.byteOffset + c.byteMatchSz 566 if lastEnd >= c.byteOffset { 567 if end > lastEnd { 568 last.byteMatchSz = end - last.byteOffset 569 } 570 continue 571 } 572 573 res = append(res, c) 574 } 575 } else { 576 // Remove overlapping candidates. This guarantees that the matches 577 // are non-overlapping, but also preserves expected match counts. 578 sort.Sort((sortByOffsetSlice)(cands)) 579 res = cands[:0] 580 for i, c := range cands { 581 if i == 0 { 582 res = append(res, c) 583 continue 584 } 585 last := res[len(res)-1] 586 lastEnd := last.byteOffset + last.byteMatchSz 587 if lastEnd > c.byteOffset { 588 continue 589 } 590 591 res = append(res, c) 592 } 593 } 594 595 return res 596} 597 598func (d *indexData) branchIndex(docID uint32) int { 599 mask := d.fileBranchMasks[docID] 600 idx := 0 601 for mask != 0 { 602 if mask&0x1 != 0 { 603 return idx 604 } 605 idx++ 606 mask >>= 1 607 } 608 return -1 609} 610 611// gatherBranches returns a list of branch names taking into account any branch 612// filters in the query. If the query contains a branch filter, it returns all 613// branches containing the docID and matching the branch filter. Otherwise, it 614// returns all branches containing docID. 615func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 616 var mask uint64 617 visitMatches(mt, known, func(mt matchTree) { 618 bq, ok := mt.(*branchQueryMatchTree) 619 if !ok { 620 return 621 } 622 623 mask = mask | bq.branchMask() 624 }) 625 626 if mask == 0 { 627 mask = d.fileBranchMasks[docID] 628 } 629 630 var branches []string 631 id := uint32(1) 632 branchNames := d.branchNames[d.repos[docID]] 633 for mask != 0 { 634 if mask&0x1 != 0 { 635 branches = append(branches, branchNames[uint(id)]) 636 } 637 id <<= 1 638 mask >>= 1 639 } 640 641 return branches 642} 643 644func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) { 645 var include func(rle *RepoListEntry) bool 646 647 q = d.simplify(q) 648 if c, ok := q.(*query.Const); ok { 649 if !c.Value { 650 return &RepoList{}, nil 651 } 652 include = func(rle *RepoListEntry) bool { 653 return true 654 } 655 } else { 656 sr, err := d.Search(ctx, q, &SearchOptions{ 657 ShardRepoMaxMatchCount: 1, 658 }) 659 if err != nil { 660 return nil, err 661 } 662 663 foundRepos := make(map[string]struct{}, len(sr.Files)) 664 for _, file := range sr.Files { 665 foundRepos[file.Repository] = struct{}{} 666 } 667 668 include = func(rle *RepoListEntry) bool { 669 _, ok := foundRepos[rle.Repository.Name] 670 return ok 671 } 672 } 673 674 var l RepoList 675 676 field, err := opts.GetField() 677 if err != nil { 678 return nil, err 679 } 680 switch field { 681 case RepoListFieldRepos: 682 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry)) 683 case RepoListFieldMinimal: 684 l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry)) 685 case RepoListFieldReposMap: 686 l.ReposMap = make(ReposMap, len(d.repoListEntry)) 687 } 688 689 for i := range d.repoListEntry { 690 if d.repoMetaData[i].Tombstone { 691 continue 692 } 693 rle := &d.repoListEntry[i] 694 if !include(rle) { 695 continue 696 } 697 698 l.Stats.Add(&rle.Stats) 699 700 // Backwards compat for when ID is missing 701 if rle.Repository.ID == 0 { 702 l.Repos = append(l.Repos, rle) 703 continue 704 } 705 706 switch field { 707 case RepoListFieldRepos: 708 l.Repos = append(l.Repos, rle) 709 case RepoListFieldMinimal: 710 l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{ 711 HasSymbols: rle.Repository.HasSymbols, 712 Branches: rle.Repository.Branches, 713 } 714 case RepoListFieldReposMap: 715 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{ 716 HasSymbols: rle.Repository.HasSymbols, 717 Branches: rle.Repository.Branches, 718 } 719 } 720 721 } 722 723 return &l, nil 724} 725 726// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If 727// mt is equivalent to the input r, isEqual = true and the matchTree can be used 728// in place of the regex r. If singleLine = true, then the matchTree and all 729// its children only match terms on the same line. singleLine is used during 730// recursion to decide whether to return an andLineMatchTree (singleLine = true) 731// or a andMatchTree (singleLine = false). 732func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) { 733 // TODO - we could perhaps transform Begin/EndText in '\n'? 734 // TODO - we could perhaps transform CharClass in (OrQuery ) 735 // if there are just a few runes, and part of a OpConcat? 736 switch r.Op { 737 case syntax.OpLiteral: 738 s := string(r.Rune) 739 if len(s) >= minTextSize { 740 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive}) 741 return mt, true, !strings.Contains(s, "\n"), err 742 } 743 case syntax.OpCapture: 744 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 745 746 case syntax.OpPlus: 747 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 748 749 case syntax.OpRepeat: 750 if r.Min == 1 { 751 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 752 } else if r.Min > 1 { 753 // (x){2,} can't be expressed precisely by the matchTree 754 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive) 755 return mt, false, singleLine, err 756 } 757 case syntax.OpConcat, syntax.OpAlternate: 758 var qs []matchTree 759 isEq := true 760 singleLine = true 761 for _, sr := range r.Sub { 762 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil { 763 if err != nil { 764 return nil, false, false, err 765 } 766 isEq = isEq && subIsEq 767 singleLine = singleLine && subSingleLine 768 qs = append(qs, sq) 769 } 770 } 771 if r.Op == syntax.OpConcat { 772 if len(qs) > 1 { 773 isEq = false 774 } 775 newQs := make([]matchTree, 0, len(qs)) 776 for _, q := range qs { 777 if _, ok := q.(*bruteForceMatchTree); ok { 778 continue 779 } 780 newQs = append(newQs, q) 781 } 782 if len(newQs) == 1 { 783 return newQs[0], isEq, singleLine, nil 784 } 785 if len(newQs) == 0 { 786 return &bruteForceMatchTree{}, isEq, singleLine, nil 787 } 788 if singleLine { 789 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil 790 } 791 return &andMatchTree{newQs}, isEq, singleLine, nil 792 } 793 for _, q := range qs { 794 if _, ok := q.(*bruteForceMatchTree); ok { 795 return q, isEq, false, nil 796 } 797 } 798 if len(qs) == 0 { 799 return &noMatchTree{"const"}, isEq, false, nil 800 } 801 return &orMatchTree{qs}, isEq, false, nil 802 case syntax.OpStar: 803 if r.Sub[0].Op == syntax.OpAnyCharNotNL { 804 return &bruteForceMatchTree{}, false, true, nil 805 } 806 } 807 return &bruteForceMatchTree{}, false, false, nil 808}