fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "sort"
24 "strings"
25 "time"
26
27 enry_data "github.com/go-enry/go-enry/v2/data"
28 "github.com/grafana/regexp"
29
30 "github.com/sourcegraph/zoekt/query"
31)
32
33const maxUInt16 = 0xffff
34
35func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
36 if debugScore {
37 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
38 }
39 m.Score += s
40}
41
42func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
43 if debugScore {
44 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
45 }
46 m.Score += score
47}
48
49// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
50// repository names fulfill the predicate, Const(false) if none of them do, and q
51// otherwise.
52func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
53 count := 0
54 alive := len(d.repoMetaData)
55 for i := range d.repoMetaData {
56 if d.repoMetaData[i].Tombstone {
57 alive--
58 } else if predicate(&d.repoMetaData[i]) {
59 count++
60 }
61 }
62 if count == alive {
63 return &query.Const{Value: true}
64 }
65 if count > 0 {
66 return q
67 }
68 return &query.Const{Value: false}
69}
70
71func (d *indexData) simplify(in query.Q) query.Q {
72 eval := query.Map(in, func(q query.Q) query.Q {
73 switch r := q.(type) {
74 case *query.Repo:
75 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
76 return r.Regexp.MatchString(repo.Name)
77 })
78 case *query.RepoRegexp:
79 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
80 return r.Regexp.MatchString(repo.Name)
81 })
82 case *query.BranchesRepos:
83 for i := range d.repoMetaData {
84 for _, br := range r.List {
85 if br.Repos.Contains(d.repoMetaData[i].ID) {
86 return q
87 }
88 }
89 }
90 return &query.Const{Value: false}
91 case *query.RepoSet:
92 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
93 return r.Set[repo.Name]
94 })
95 case *query.RepoIDs:
96 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
97 return r.Repos.Contains(repo.ID)
98 })
99 case *query.Language:
100 _, has := d.metaData.LanguageMap[r.Language]
101 if !has && d.metaData.IndexFeatureVersion < 12 {
102 // For index files that haven't been re-indexed by go-enry,
103 // fall back to file-based matching and continue even if this
104 // repo doesn't have the specific language present.
105 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
106 if extsForLang != nil {
107 extFrags := make([]string, 0, len(extsForLang))
108 for _, ext := range extsForLang {
109 extFrags = append(extFrags, regexp.QuoteMeta(ext))
110 }
111 if len(extFrags) > 0 {
112 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
113 // inlined copy of query.regexpQuery
114 re, err := syntax.Parse(pattern, syntax.Perl)
115 if err != nil {
116 return &query.Const{Value: false}
117 }
118 if re.Op == syntax.OpLiteral {
119 return &query.Substring{
120 Pattern: string(re.Rune),
121 FileName: true,
122 }
123 }
124 return &query.Regexp{
125 Regexp: re,
126 FileName: true,
127 }
128 }
129 }
130 }
131 if !has {
132 return &query.Const{Value: false}
133 }
134 }
135 return q
136 })
137 return query.Simplify(eval)
138}
139
140func (o *SearchOptions) SetDefaults() {
141 if o.ShardMaxMatchCount == 0 {
142 // We cap the total number of matches, so overly broad
143 // searches don't crash the machine.
144 o.ShardMaxMatchCount = 100000
145 }
146 if o.TotalMaxMatchCount == 0 {
147 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
148 }
149}
150
151func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
152 timer := newTimer()
153
154 copyOpts := *opts
155 opts = ©Opts
156 opts.SetDefaults()
157
158 var res SearchResult
159 if len(d.fileNameIndex) == 0 {
160 return &res, nil
161 }
162
163 select {
164 case <-ctx.Done():
165 res.Stats.ShardsSkipped++
166 return &res, nil
167 default:
168 }
169
170 q = d.simplify(q)
171 if c, ok := q.(*query.Const); ok && !c.Value {
172 return &res, nil
173 }
174
175 if opts.EstimateDocCount {
176 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
177 return &res, nil
178 }
179
180 q = query.Map(q, query.ExpandFileContent)
181
182 mt, err := d.newMatchTree(q, matchTreeOpt{})
183 if err != nil {
184 return nil, err
185 }
186
187 // Capture the costs of construction before pruning
188 updateMatchTreeStats(mt, &res.Stats)
189
190 mt, err = pruneMatchTree(mt)
191 if err != nil {
192 return nil, err
193 }
194 res.Stats.MatchTreeConstruction = timer.Elapsed()
195 if mt == nil {
196 res.Stats.ShardsSkippedFilter++
197 return &res, nil
198 }
199
200 res.Stats.ShardsScanned++
201
202 cp := &contentProvider{
203 id: d,
204 stats: &res.Stats,
205 }
206
207 // Track the number of documents found in a repository for
208 // ShardRepoMaxMatchCount
209 var (
210 lastRepoID uint16
211 repoMatchCount int
212 )
213
214 docCount := uint32(len(d.fileBranchMasks))
215 lastDoc := int(-1)
216
217nextFileMatch:
218 for {
219 canceled := false
220 select {
221 case <-ctx.Done():
222 canceled = true
223 default:
224 }
225
226 nextDoc := mt.nextDoc()
227 if int(nextDoc) <= lastDoc {
228 nextDoc = uint32(lastDoc + 1)
229 }
230
231 for ; nextDoc < docCount; nextDoc++ {
232 repoID := d.repos[nextDoc]
233 repoMetadata := &d.repoMetaData[repoID]
234
235 // Skip tombstoned repositories
236 if repoMetadata.Tombstone {
237 continue
238 }
239
240 // Skip documents that are tombstoned
241 if len(repoMetadata.FileTombstones) > 0 {
242 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
243 continue
244 }
245 }
246
247 // Skip documents over ShardRepoMaxMatchCount if specified.
248 if opts.ShardRepoMaxMatchCount > 0 {
249 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
250 res.Stats.FilesSkipped++
251 continue
252 }
253 }
254
255 break
256 }
257
258 if nextDoc >= docCount {
259 break
260 }
261
262 lastDoc = int(nextDoc)
263
264 // We track lastRepoID for ShardRepoMaxMatchCount
265 if lastRepoID != d.repos[nextDoc] {
266 lastRepoID = d.repos[nextDoc]
267 repoMatchCount = 0
268 }
269
270 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
271 res.Stats.FilesSkipped += int(docCount - nextDoc)
272 break
273 }
274
275 res.Stats.FilesConsidered++
276 mt.prepare(nextDoc)
277
278 cp.setDocument(nextDoc)
279
280 known := make(map[matchTree]bool)
281 md := d.repoMetaData[d.repos[nextDoc]]
282
283 for cost := costMin; cost <= costMax; cost++ {
284 v, ok := mt.matches(cp, cost, known)
285 if ok && !v {
286 continue nextFileMatch
287 }
288
289 if cost == costMax && !ok {
290 log.Panicf("did not decide. Repo %s, doc %d, known %v",
291 md.Name, nextDoc, known)
292 }
293 }
294
295 fileMatch := FileMatch{
296 Repository: md.Name,
297 RepositoryID: md.ID,
298 RepositoryPriority: md.priority,
299 FileName: string(d.fileName(nextDoc)),
300 Checksum: d.getChecksum(nextDoc),
301 Language: d.languageMap[d.getLanguage(nextDoc)],
302 }
303
304 if s := d.subRepos[nextDoc]; s > 0 {
305 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
306 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
307 }
308 path := d.subRepoPaths[d.repos[nextDoc]][s]
309 fileMatch.SubRepositoryPath = path
310 sr := md.SubRepoMap[path]
311 fileMatch.SubRepositoryName = sr.Name
312 if idx := d.branchIndex(nextDoc); idx >= 0 {
313 fileMatch.Version = sr.Branches[idx].Version
314 }
315 } else {
316 idx := d.branchIndex(nextDoc)
317 if idx >= 0 {
318 fileMatch.Version = md.Branches[idx].Version
319 }
320 }
321
322 shouldMergeMatches := !opts.ChunkMatches
323 finalCands := gatherMatches(mt, known, shouldMergeMatches)
324
325 if len(finalCands) == 0 {
326 nm := d.fileName(nextDoc)
327 finalCands = append(finalCands,
328 &candidateMatch{
329 caseSensitive: false,
330 fileName: true,
331 substrBytes: nm,
332 substrLowered: nm,
333 file: nextDoc,
334 runeOffset: 0,
335 byteOffset: 0,
336 byteMatchSz: uint32(len(nm)),
337 })
338 }
339
340 if opts.ChunkMatches {
341 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
342 } else {
343 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
344 }
345
346 if opts.UseKeywordScoring {
347 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
348 } else {
349 // Use the standard, non-experimental scoring method by default
350 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
351 }
352
353 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
354 sortMatchesByScore(fileMatch.LineMatches)
355 sortChunkMatchesByScore(fileMatch.ChunkMatches)
356 if opts.Whole {
357 fileMatch.Content = cp.data(false)
358 }
359
360 matchedChunkRanges := 0
361 for _, cm := range fileMatch.ChunkMatches {
362 matchedChunkRanges += len(cm.Ranges)
363 }
364
365 repoMatchCount += len(fileMatch.LineMatches)
366 repoMatchCount += matchedChunkRanges
367
368 if opts.DebugScore {
369 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
370 }
371
372 res.Files = append(res.Files, fileMatch)
373 res.Stats.MatchCount += len(fileMatch.LineMatches)
374 res.Stats.MatchCount += matchedChunkRanges
375 res.Stats.FileCount++
376 }
377
378 // We do not sort Files here, instead we rely on the shards pkg to do file
379 // ranking. If we sorted now, we would break the assumption that results
380 // from the same repo in a shard appear next to each other.
381
382 for _, md := range d.repoMetaData {
383 r := md
384 addRepo(&res, &r)
385 for _, v := range r.SubRepoMap {
386 addRepo(&res, v)
387 }
388 }
389
390 // Update stats based on work done during document search.
391 updateMatchTreeStats(mt, &res.Stats)
392
393 // If document ranking is enabled, then we can rank and truncate the files to save memory.
394 if opts.UseDocumentRanks {
395 res.Files = SortAndTruncateFiles(res.Files, opts)
396 }
397
398 res.Stats.MatchTreeSearch = timer.Elapsed()
399
400 return &res, nil
401}
402
403// scoreFile computes a score for the file match using various scoring signals, like
404// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
405func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
406 atomMatchCount := 0
407 visitMatches(mt, known, func(mt matchTree) {
408 atomMatchCount++
409 })
410
411 // atom-count boosts files with matches from more than 1 atom. The
412 // maximum boost is scoreFactorAtomMatch.
413 if atomMatchCount > 0 {
414 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
415 }
416
417 maxFileScore := 0.0
418 repetitions := 0
419 for i := range fileMatch.LineMatches {
420 if maxFileScore < fileMatch.LineMatches[i].Score {
421 maxFileScore = fileMatch.LineMatches[i].Score
422 repetitions = 0
423 } else if maxFileScore == fileMatch.LineMatches[i].Score {
424 repetitions += 1
425 }
426
427 // Order by ordering in file.
428 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
429 }
430
431 for i := range fileMatch.ChunkMatches {
432 if maxFileScore < fileMatch.ChunkMatches[i].Score {
433 maxFileScore = fileMatch.ChunkMatches[i].Score
434 }
435
436 // Order by ordering in file.
437 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
438 }
439
440 // Maintain ordering of input files. This
441 // strictly dominates the in-file ordering of
442 // the matches.
443 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
444
445 // Prefer docs with several top-scored matches.
446 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
447
448 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
449 weight := scoreFileRankFactor
450 if opts.DocumentRanksWeight > 0.0 {
451 weight = opts.DocumentRanksWeight
452 }
453
454 ranks := d.ranks[doc]
455 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
456 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
457 // and we plan to clean this up.
458 if len(ranks) > 0 {
459 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
460 // cap it just in case to ensure it falls in the range [0, 1].
461 normalized := math.Min(1.0, ranks[0]/32.0)
462 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
463 }
464 }
465
466 md := d.repoMetaData[d.repos[doc]]
467 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
468 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
469}
470
471// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
472// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
473// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
474//
475// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
476// since BM25 is not normalized and can be tricky to combine with other scoring signals.
477func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
478 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
479 // sensitivity and treat filenames and symbols the same as content.
480 termFreqs := map[string]int{}
481 for _, cand := range cands {
482 term := string(cand.substrLowered)
483 termFreqs[term]++
484 }
485
486 // Compute the file length ratio. Usually the calculation would be based on terms, but using
487 // bytes should work fine, as we're just computing a ratio.
488 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
489 numFiles := len(d.boundaries)
490 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
491 L := fileLength / averageFileLength
492
493 // Use standard parameter defaults (used in Lucene and academic papers)
494 k, b := 1.2, 0.75
495 sumTf := 0.0 // Just for debugging
496 score := 0.0
497 for _, freq := range termFreqs {
498 tf := float64(freq)
499 sumTf += tf
500 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
501 }
502
503 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
504}
505
506func addRepo(res *SearchResult, repo *Repository) {
507 if res.RepoURLs == nil {
508 res.RepoURLs = map[string]string{}
509 }
510 res.RepoURLs[repo.Name] = repo.FileURLTemplate
511
512 if res.LineFragments == nil {
513 res.LineFragments = map[string]string{}
514 }
515 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
516}
517
518type sortByOffsetSlice []*candidateMatch
519
520func (m sortByOffsetSlice) Len() int { return len(m) }
521func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
522func (m sortByOffsetSlice) Less(i, j int) bool {
523 return m[i].byteOffset < m[j].byteOffset
524}
525
526// Gather matches from this document. This never returns a mixture of
527// filename/content matches: if there are content matches, all
528// filename matches are trimmed from the result. The matches are
529// returned in document order and are non-overlapping.
530//
531// If `merge` is set, overlapping and adjacent matches will be merged
532// into a single match. Otherwise, overlapping matches will be removed,
533// but adjacent matches will remain.
534func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
535 var cands []*candidateMatch
536 visitMatches(mt, known, func(mt matchTree) {
537 if smt, ok := mt.(*substrMatchTree); ok {
538 cands = append(cands, smt.current...)
539 }
540 if rmt, ok := mt.(*regexpMatchTree); ok {
541 cands = append(cands, rmt.found...)
542 }
543 if rmt, ok := mt.(*wordMatchTree); ok {
544 cands = append(cands, rmt.found...)
545 }
546 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
547 cands = append(cands, smt.found...)
548 }
549 })
550
551 foundContentMatch := false
552 for _, c := range cands {
553 if !c.fileName {
554 foundContentMatch = true
555 break
556 }
557 }
558
559 res := cands[:0]
560 for _, c := range cands {
561 if !foundContentMatch || !c.fileName {
562 res = append(res, c)
563 }
564 }
565 cands = res
566
567 if merge {
568 // Merge adjacent candidates. This guarantees that the matches
569 // are non-overlapping.
570 sort.Sort((sortByOffsetSlice)(cands))
571 res = cands[:0]
572 for i, c := range cands {
573 if i == 0 {
574 res = append(res, c)
575 continue
576 }
577 last := res[len(res)-1]
578 lastEnd := last.byteOffset + last.byteMatchSz
579 end := c.byteOffset + c.byteMatchSz
580 if lastEnd >= c.byteOffset {
581 if end > lastEnd {
582 last.byteMatchSz = end - last.byteOffset
583 }
584 continue
585 }
586
587 res = append(res, c)
588 }
589 } else {
590 // Remove overlapping candidates. This guarantees that the matches
591 // are non-overlapping, but also preserves expected match counts.
592 sort.Sort((sortByOffsetSlice)(cands))
593 res = cands[:0]
594 for i, c := range cands {
595 if i == 0 {
596 res = append(res, c)
597 continue
598 }
599 last := res[len(res)-1]
600 lastEnd := last.byteOffset + last.byteMatchSz
601 if lastEnd > c.byteOffset {
602 continue
603 }
604
605 res = append(res, c)
606 }
607 }
608
609 return res
610}
611
612func (d *indexData) branchIndex(docID uint32) int {
613 mask := d.fileBranchMasks[docID]
614 idx := 0
615 for mask != 0 {
616 if mask&0x1 != 0 {
617 return idx
618 }
619 idx++
620 mask >>= 1
621 }
622 return -1
623}
624
625// gatherBranches returns a list of branch names taking into account any branch
626// filters in the query. If the query contains a branch filter, it returns all
627// branches containing the docID and matching the branch filter. Otherwise, it
628// returns all branches containing docID.
629func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
630 var mask uint64
631 visitMatches(mt, known, func(mt matchTree) {
632 bq, ok := mt.(*branchQueryMatchTree)
633 if !ok {
634 return
635 }
636
637 mask = mask | bq.branchMask()
638 })
639
640 if mask == 0 {
641 mask = d.fileBranchMasks[docID]
642 }
643
644 var branches []string
645 id := uint32(1)
646 branchNames := d.branchNames[d.repos[docID]]
647 for mask != 0 {
648 if mask&0x1 != 0 {
649 branches = append(branches, branchNames[uint(id)])
650 }
651 id <<= 1
652 mask >>= 1
653 }
654
655 return branches
656}
657
658func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
659 var include func(rle *RepoListEntry) bool
660
661 q = d.simplify(q)
662 if c, ok := q.(*query.Const); ok {
663 if !c.Value {
664 return &RepoList{}, nil
665 }
666 include = func(rle *RepoListEntry) bool {
667 return true
668 }
669 } else {
670 sr, err := d.Search(ctx, q, &SearchOptions{
671 ShardRepoMaxMatchCount: 1,
672 })
673 if err != nil {
674 return nil, err
675 }
676
677 foundRepos := make(map[string]struct{}, len(sr.Files))
678 for _, file := range sr.Files {
679 foundRepos[file.Repository] = struct{}{}
680 }
681
682 include = func(rle *RepoListEntry) bool {
683 _, ok := foundRepos[rle.Repository.Name]
684 return ok
685 }
686 }
687
688 var l RepoList
689
690 field, err := opts.GetField()
691 if err != nil {
692 return nil, err
693 }
694 switch field {
695 case RepoListFieldRepos:
696 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
697 case RepoListFieldReposMap:
698 l.ReposMap = make(ReposMap, len(d.repoListEntry))
699 }
700
701 for i := range d.repoListEntry {
702 if d.repoMetaData[i].Tombstone {
703 continue
704 }
705 rle := &d.repoListEntry[i]
706 if !include(rle) {
707 continue
708 }
709
710 l.Stats.Add(&rle.Stats)
711
712 // Backwards compat for when ID is missing
713 if rle.Repository.ID == 0 {
714 l.Repos = append(l.Repos, rle)
715 continue
716 }
717
718 switch field {
719 case RepoListFieldRepos:
720 l.Repos = append(l.Repos, rle)
721 case RepoListFieldReposMap:
722 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
723 HasSymbols: rle.Repository.HasSymbols,
724 Branches: rle.Repository.Branches,
725 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
726 }
727 }
728
729 }
730
731 // Only one of these fields is populated and in all cases the size of that
732 // field is the number of Repos in this shard.
733 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
734
735 return &l, nil
736}
737
738// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
739// mt is equivalent to the input r, isEqual = true and the matchTree can be used
740// in place of the regex r. If singleLine = true, then the matchTree and all
741// its children only match terms on the same line. singleLine is used during
742// recursion to decide whether to return an andLineMatchTree (singleLine = true)
743// or a andMatchTree (singleLine = false).
744func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
745 // TODO - we could perhaps transform Begin/EndText in '\n'?
746 // TODO - we could perhaps transform CharClass in (OrQuery )
747 // if there are just a few runes, and part of a OpConcat?
748 switch r.Op {
749 case syntax.OpLiteral:
750 s := string(r.Rune)
751 if len(s) >= minTextSize {
752 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
753 return mt, true, !strings.Contains(s, "\n"), err
754 }
755 case syntax.OpCapture:
756 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
757
758 case syntax.OpPlus:
759 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
760
761 case syntax.OpRepeat:
762 if r.Min == 1 {
763 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
764 } else if r.Min > 1 {
765 // (x){2,} can't be expressed precisely by the matchTree
766 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
767 return mt, false, singleLine, err
768 }
769 case syntax.OpConcat, syntax.OpAlternate:
770 var qs []matchTree
771 isEq := true
772 singleLine = true
773 for _, sr := range r.Sub {
774 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
775 if err != nil {
776 return nil, false, false, err
777 }
778 isEq = isEq && subIsEq
779 singleLine = singleLine && subSingleLine
780 qs = append(qs, sq)
781 }
782 }
783 if r.Op == syntax.OpConcat {
784 if len(qs) > 1 {
785 isEq = false
786 }
787 newQs := make([]matchTree, 0, len(qs))
788 for _, q := range qs {
789 if _, ok := q.(*bruteForceMatchTree); ok {
790 continue
791 }
792 newQs = append(newQs, q)
793 }
794 if len(newQs) == 1 {
795 return newQs[0], isEq, singleLine, nil
796 }
797 if len(newQs) == 0 {
798 return &bruteForceMatchTree{}, isEq, singleLine, nil
799 }
800 if singleLine {
801 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
802 }
803 return &andMatchTree{newQs}, isEq, singleLine, nil
804 }
805 for _, q := range qs {
806 if _, ok := q.(*bruteForceMatchTree); ok {
807 return q, isEq, false, nil
808 }
809 }
810 if len(qs) == 0 {
811 return &noMatchTree{Why: "const"}, isEq, false, nil
812 }
813 return &orMatchTree{qs}, isEq, false, nil
814 case syntax.OpStar:
815 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
816 return &bruteForceMatchTree{}, false, true, nil
817 }
818 }
819 return &bruteForceMatchTree{}, false, false, nil
820}
821
822type timer struct {
823 last time.Time
824}
825
826func newTimer() *timer {
827 return &timer{
828 last: time.Now(),
829 }
830}
831
832func (t *timer) Elapsed() time.Duration {
833 now := time.Now()
834 d := now.Sub(t.last)
835 t.last = now
836 return d
837}