fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "sort"
24 "strconv"
25 "strings"
26 "time"
27
28 enry_data "github.com/go-enry/go-enry/v2/data"
29 "github.com/grafana/regexp"
30
31 "github.com/sourcegraph/zoekt/query"
32)
33
34const maxUInt16 = 0xffff
35
36// addScore increments the score of the FileMatch by the computed score. If
37// debugScore is true, it also adds a debug string to the FileMatch. If raw is
38// -1, it is ignored. Otherwise, it is added to the debug string.
39func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
40 if computed != 0 && debugScore {
41 var b strings.Builder
42 fmt.Fprintf(&b, "%s", what)
43 if raw != -1 {
44 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
45 }
46 fmt.Fprintf(&b, ":%.2f, ", computed)
47 m.Debug += b.String()
48 }
49 m.Score += computed
50}
51
52func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
53 if debugScore {
54 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
55 }
56 m.Score += score
57}
58
59// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
60// repository names fulfill the predicate, Const(false) if none of them do, and q
61// otherwise.
62func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
63 count := 0
64 alive := len(d.repoMetaData)
65 for i := range d.repoMetaData {
66 if d.repoMetaData[i].Tombstone {
67 alive--
68 } else if predicate(&d.repoMetaData[i]) {
69 count++
70 }
71 }
72 if count == alive {
73 return &query.Const{Value: true}
74 }
75 if count > 0 {
76 return q
77 }
78 return &query.Const{Value: false}
79}
80
81func (d *indexData) simplify(in query.Q) query.Q {
82 eval := query.Map(in, func(q query.Q) query.Q {
83 switch r := q.(type) {
84 case *query.Repo:
85 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
86 return r.Regexp.MatchString(repo.Name)
87 })
88 case *query.RepoRegexp:
89 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
90 return r.Regexp.MatchString(repo.Name)
91 })
92 case *query.BranchesRepos:
93 for i := range d.repoMetaData {
94 for _, br := range r.List {
95 if br.Repos.Contains(d.repoMetaData[i].ID) {
96 return q
97 }
98 }
99 }
100 return &query.Const{Value: false}
101 case *query.RepoSet:
102 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
103 return r.Set[repo.Name]
104 })
105 case *query.RepoIDs:
106 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
107 return r.Repos.Contains(repo.ID)
108 })
109 case *query.Language:
110 _, has := d.metaData.LanguageMap[r.Language]
111 if !has && d.metaData.IndexFeatureVersion < 12 {
112 // For index files that haven't been re-indexed by go-enry,
113 // fall back to file-based matching and continue even if this
114 // repo doesn't have the specific language present.
115 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
116 if extsForLang != nil {
117 extFrags := make([]string, 0, len(extsForLang))
118 for _, ext := range extsForLang {
119 extFrags = append(extFrags, regexp.QuoteMeta(ext))
120 }
121 if len(extFrags) > 0 {
122 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
123 // inlined copy of query.regexpQuery
124 re, err := syntax.Parse(pattern, syntax.Perl)
125 if err != nil {
126 return &query.Const{Value: false}
127 }
128 if re.Op == syntax.OpLiteral {
129 return &query.Substring{
130 Pattern: string(re.Rune),
131 FileName: true,
132 }
133 }
134 return &query.Regexp{
135 Regexp: re,
136 FileName: true,
137 }
138 }
139 }
140 }
141 if !has {
142 return &query.Const{Value: false}
143 }
144 }
145 return q
146 })
147 return query.Simplify(eval)
148}
149
150func (o *SearchOptions) SetDefaults() {
151 if o.ShardMaxMatchCount == 0 {
152 // We cap the total number of matches, so overly broad
153 // searches don't crash the machine.
154 o.ShardMaxMatchCount = 100000
155 }
156 if o.TotalMaxMatchCount == 0 {
157 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
158 }
159}
160
161func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
162 timer := newTimer()
163
164 copyOpts := *opts
165 opts = ©Opts
166 opts.SetDefaults()
167
168 var res SearchResult
169 if len(d.fileNameIndex) == 0 {
170 return &res, nil
171 }
172
173 select {
174 case <-ctx.Done():
175 res.Stats.ShardsSkipped++
176 return &res, nil
177 default:
178 }
179
180 q = d.simplify(q)
181 if c, ok := q.(*query.Const); ok && !c.Value {
182 return &res, nil
183 }
184
185 if opts.EstimateDocCount {
186 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
187 return &res, nil
188 }
189
190 q = query.Map(q, query.ExpandFileContent)
191
192 mt, err := d.newMatchTree(q, matchTreeOpt{})
193 if err != nil {
194 return nil, err
195 }
196
197 // Capture the costs of construction before pruning
198 updateMatchTreeStats(mt, &res.Stats)
199
200 mt, err = pruneMatchTree(mt)
201 if err != nil {
202 return nil, err
203 }
204 res.Stats.MatchTreeConstruction = timer.Elapsed()
205 if mt == nil {
206 res.Stats.ShardsSkippedFilter++
207 return &res, nil
208 }
209
210 res.Stats.ShardsScanned++
211
212 cp := &contentProvider{
213 id: d,
214 stats: &res.Stats,
215 }
216
217 // Track the number of documents found in a repository for
218 // ShardRepoMaxMatchCount
219 var (
220 lastRepoID uint16
221 repoMatchCount int
222 )
223
224 docCount := uint32(len(d.fileBranchMasks))
225 lastDoc := int(-1)
226
227nextFileMatch:
228 for {
229 canceled := false
230 select {
231 case <-ctx.Done():
232 canceled = true
233 default:
234 }
235
236 nextDoc := mt.nextDoc()
237 if int(nextDoc) <= lastDoc {
238 nextDoc = uint32(lastDoc + 1)
239 }
240
241 for ; nextDoc < docCount; nextDoc++ {
242 repoID := d.repos[nextDoc]
243 repoMetadata := &d.repoMetaData[repoID]
244
245 // Skip tombstoned repositories
246 if repoMetadata.Tombstone {
247 continue
248 }
249
250 // Skip documents that are tombstoned
251 if len(repoMetadata.FileTombstones) > 0 {
252 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
253 continue
254 }
255 }
256
257 // Skip documents over ShardRepoMaxMatchCount if specified.
258 if opts.ShardRepoMaxMatchCount > 0 {
259 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
260 res.Stats.FilesSkipped++
261 continue
262 }
263 }
264
265 break
266 }
267
268 if nextDoc >= docCount {
269 break
270 }
271
272 lastDoc = int(nextDoc)
273
274 // We track lastRepoID for ShardRepoMaxMatchCount
275 if lastRepoID != d.repos[nextDoc] {
276 lastRepoID = d.repos[nextDoc]
277 repoMatchCount = 0
278 }
279
280 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
281 res.Stats.FilesSkipped += int(docCount - nextDoc)
282 break
283 }
284
285 res.Stats.FilesConsidered++
286 mt.prepare(nextDoc)
287
288 cp.setDocument(nextDoc)
289
290 known := make(map[matchTree]bool)
291 md := d.repoMetaData[d.repos[nextDoc]]
292
293 for cost := costMin; cost <= costMax; cost++ {
294 switch evalMatchTree(cp, cost, known, mt) {
295 case matchesRequiresHigherCost:
296 if cost == costMax {
297 log.Panicf("did not decide. Repo %s, doc %d, known %v",
298 md.Name, nextDoc, known)
299 }
300 case matchesFound:
301 // could short-circuit now, but we want to run higher costs to
302 // potentially find higher ranked matches.
303 case matchesNone:
304 continue nextFileMatch
305 }
306 }
307
308 fileMatch := FileMatch{
309 Repository: md.Name,
310 RepositoryID: md.ID,
311 RepositoryPriority: md.priority,
312 FileName: string(d.fileName(nextDoc)),
313 Checksum: d.getChecksum(nextDoc),
314 Language: d.languageMap[d.getLanguage(nextDoc)],
315 }
316
317 if s := d.subRepos[nextDoc]; s > 0 {
318 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
319 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
320 }
321 path := d.subRepoPaths[d.repos[nextDoc]][s]
322 fileMatch.SubRepositoryPath = path
323 sr := md.SubRepoMap[path]
324 fileMatch.SubRepositoryName = sr.Name
325 if idx := d.branchIndex(nextDoc); idx >= 0 {
326 fileMatch.Version = sr.Branches[idx].Version
327 }
328 } else {
329 idx := d.branchIndex(nextDoc)
330 if idx >= 0 {
331 fileMatch.Version = md.Branches[idx].Version
332 }
333 }
334
335 // Important invariant for performance: finalCands is sorted by offset and
336 // non-overlapping. gatherMatches respects this invariant and all later
337 // transformations respect this.
338 shouldMergeMatches := !opts.ChunkMatches
339 finalCands := gatherMatches(mt, known, shouldMergeMatches)
340
341 if len(finalCands) == 0 {
342 nm := d.fileName(nextDoc)
343 finalCands = append(finalCands,
344 &candidateMatch{
345 caseSensitive: false,
346 fileName: true,
347 substrBytes: nm,
348 substrLowered: nm,
349 file: nextDoc,
350 runeOffset: 0,
351 byteOffset: 0,
352 byteMatchSz: uint32(len(nm)),
353 })
354 }
355
356 if opts.ChunkMatches {
357 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
358 } else {
359 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
360 }
361
362 if opts.UseKeywordScoring {
363 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
364 } else {
365 // Use the standard, non-experimental scoring method by default
366 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
367 }
368
369 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
370 sortMatchesByScore(fileMatch.LineMatches)
371 sortChunkMatchesByScore(fileMatch.ChunkMatches)
372 if opts.Whole {
373 fileMatch.Content = cp.data(false)
374 }
375
376 matchedChunkRanges := 0
377 for _, cm := range fileMatch.ChunkMatches {
378 matchedChunkRanges += len(cm.Ranges)
379 }
380
381 repoMatchCount += len(fileMatch.LineMatches)
382 repoMatchCount += matchedChunkRanges
383
384 if opts.DebugScore {
385 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
386 }
387
388 res.Files = append(res.Files, fileMatch)
389 res.Stats.MatchCount += len(fileMatch.LineMatches)
390 res.Stats.MatchCount += matchedChunkRanges
391 res.Stats.FileCount++
392 }
393
394 // We do not sort Files here, instead we rely on the shards pkg to do file
395 // ranking. If we sorted now, we would break the assumption that results
396 // from the same repo in a shard appear next to each other.
397
398 for _, md := range d.repoMetaData {
399 r := md
400 addRepo(&res, &r)
401 for _, v := range r.SubRepoMap {
402 addRepo(&res, v)
403 }
404 }
405
406 // Update stats based on work done during document search.
407 updateMatchTreeStats(mt, &res.Stats)
408
409 // If document ranking is enabled, then we can rank and truncate the files to save memory.
410 if opts.UseDocumentRanks {
411 res.Files = SortAndTruncateFiles(res.Files, opts)
412 }
413
414 res.Stats.MatchTreeSearch = timer.Elapsed()
415
416 return &res, nil
417}
418
419// scoreFile computes a score for the file match using various scoring signals, like
420// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
421func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
422 atomMatchCount := 0
423 visitMatchAtoms(mt, known, func(mt matchTree) {
424 atomMatchCount++
425 })
426
427 addScore := func(what string, computed float64) {
428 fileMatch.addScore(what, computed, -1, opts.DebugScore)
429 }
430
431 // atom-count boosts files with matches from more than 1 atom. The
432 // maximum boost is scoreFactorAtomMatch.
433 if atomMatchCount > 0 {
434 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, float64(atomMatchCount), opts.DebugScore)
435 }
436
437 maxFileScore := 0.0
438 for i := range fileMatch.LineMatches {
439 if maxFileScore < fileMatch.LineMatches[i].Score {
440 maxFileScore = fileMatch.LineMatches[i].Score
441 }
442
443 // Order by ordering in file.
444 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
445 }
446
447 for i := range fileMatch.ChunkMatches {
448 if maxFileScore < fileMatch.ChunkMatches[i].Score {
449 maxFileScore = fileMatch.ChunkMatches[i].Score
450 }
451
452 // Order by ordering in file.
453 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
454 }
455
456 // Maintain ordering of input files. This
457 // strictly dominates the in-file ordering of
458 // the matches.
459 addScore("fragment", maxFileScore)
460
461 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
462 weight := scoreFileRankFactor
463 if opts.DocumentRanksWeight > 0.0 {
464 weight = opts.DocumentRanksWeight
465 }
466
467 ranks := d.ranks[doc]
468 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
469 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
470 // and we plan to clean this up.
471 if len(ranks) > 0 {
472 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
473 // cap it just in case to ensure it falls in the range [0, 1].
474 normalized := math.Min(1.0, ranks[0]/32.0)
475 addScore("file-rank", weight*normalized)
476 }
477 }
478
479 md := d.repoMetaData[d.repos[doc]]
480 addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))))
481 addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16)
482
483 if opts.DebugScore {
484 fileMatch.Debug = strings.TrimSuffix(fileMatch.Debug, ", ")
485 }
486}
487
488// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
489// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
490// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
491//
492// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
493// since BM25 is not normalized and can be tricky to combine with other scoring signals.
494func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
495 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
496 // sensitivity and treat filenames and symbols the same as content.
497 termFreqs := map[string]int{}
498 for _, cand := range cands {
499 term := string(cand.substrLowered)
500 termFreqs[term]++
501 }
502
503 // Compute the file length ratio. Usually the calculation would be based on terms, but using
504 // bytes should work fine, as we're just computing a ratio.
505 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
506 numFiles := len(d.boundaries)
507 averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles)
508 L := fileLength / averageFileLength
509
510 // Use standard parameter defaults (used in Lucene and academic papers)
511 k, b := 1.2, 0.75
512 sumTf := 0.0 // Just for debugging
513 score := 0.0
514 for _, freq := range termFreqs {
515 tf := float64(freq)
516 sumTf += tf
517 score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
518 }
519
520 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
521}
522
523func addRepo(res *SearchResult, repo *Repository) {
524 if res.RepoURLs == nil {
525 res.RepoURLs = map[string]string{}
526 }
527 res.RepoURLs[repo.Name] = repo.FileURLTemplate
528
529 if res.LineFragments == nil {
530 res.LineFragments = map[string]string{}
531 }
532 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
533}
534
535type sortByOffsetSlice []*candidateMatch
536
537func (m sortByOffsetSlice) Len() int { return len(m) }
538func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
539func (m sortByOffsetSlice) Less(i, j int) bool {
540 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
541 // Prefer longer candidates if starting at same position
542 return m[i].byteMatchSz > m[j].byteMatchSz
543 }
544 return m[i].byteOffset < m[j].byteOffset
545}
546
547// setScoreWeight is a helper used by gatherMatches to set the weight based on
548// the score weight of the matchTree.
549func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
550 for _, m := range cm {
551 m.scoreWeight = scoreWeight
552 }
553 return cm
554}
555
556// Gather matches from this document. This never returns a mixture of
557// filename/content matches: if there are content matches, all
558// filename matches are trimmed from the result. The matches are
559// returned in document order and are non-overlapping.
560//
561// If `merge` is set, overlapping and adjacent matches will be merged
562// into a single match. Otherwise, overlapping matches will be removed,
563// but adjacent matches will remain.
564func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
565 var cands []*candidateMatch
566 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
567 if smt, ok := mt.(*substrMatchTree); ok {
568 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
569 }
570 if rmt, ok := mt.(*regexpMatchTree); ok {
571 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
572 }
573 if rmt, ok := mt.(*wordMatchTree); ok {
574 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
575 }
576 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
577 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
578 }
579 })
580
581 foundContentMatch := false
582 for _, c := range cands {
583 if !c.fileName {
584 foundContentMatch = true
585 break
586 }
587 }
588
589 res := cands[:0]
590 for _, c := range cands {
591 if !foundContentMatch || !c.fileName {
592 res = append(res, c)
593 }
594 }
595 cands = res
596
597 if merge {
598 // Merge adjacent candidates. This guarantees that the matches
599 // are non-overlapping.
600 sort.Sort((sortByOffsetSlice)(cands))
601 res = cands[:0]
602 mergeRun := 1
603 for i, c := range cands {
604 if i == 0 {
605 res = append(res, c)
606 continue
607 }
608 last := res[len(res)-1]
609 lastEnd := last.byteOffset + last.byteMatchSz
610 end := c.byteOffset + c.byteMatchSz
611 if lastEnd >= c.byteOffset {
612 mergeRun++
613
614 // Average out the score across the merged candidates. Only do it if
615 // we are boosting to avoid floating point funkiness in the normal
616 // case.
617 if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
618 last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
619 }
620
621 // latest candidate goes further, update our end
622 if end > lastEnd {
623 last.byteMatchSz = end - last.byteOffset
624 }
625
626 continue
627 } else {
628 mergeRun = 1
629 }
630
631 res = append(res, c)
632 }
633 } else {
634 // Remove overlapping candidates. This guarantees that the matches
635 // are non-overlapping, but also preserves expected match counts.
636 sort.Sort((sortByOffsetSlice)(cands))
637 res = cands[:0]
638 for i, c := range cands {
639 if i == 0 {
640 res = append(res, c)
641 continue
642 }
643 last := res[len(res)-1]
644 lastEnd := last.byteOffset + last.byteMatchSz
645 if lastEnd > c.byteOffset {
646 continue
647 }
648
649 res = append(res, c)
650 }
651 }
652
653 return res
654}
655
656func (d *indexData) branchIndex(docID uint32) int {
657 mask := d.fileBranchMasks[docID]
658 idx := 0
659 for mask != 0 {
660 if mask&0x1 != 0 {
661 return idx
662 }
663 idx++
664 mask >>= 1
665 }
666 return -1
667}
668
669// gatherBranches returns a list of branch names taking into account any branch
670// filters in the query. If the query contains a branch filter, it returns all
671// branches containing the docID and matching the branch filter. Otherwise, it
672// returns all branches containing docID.
673func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
674 var mask uint64
675 visitMatchAtoms(mt, known, func(mt matchTree) {
676 bq, ok := mt.(*branchQueryMatchTree)
677 if !ok {
678 return
679 }
680
681 mask = mask | bq.branchMask()
682 })
683
684 if mask == 0 {
685 mask = d.fileBranchMasks[docID]
686 }
687
688 var branches []string
689 id := uint32(1)
690 branchNames := d.branchNames[d.repos[docID]]
691 for mask != 0 {
692 if mask&0x1 != 0 {
693 branches = append(branches, branchNames[uint(id)])
694 }
695 id <<= 1
696 mask >>= 1
697 }
698
699 return branches
700}
701
702func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
703 var include func(rle *RepoListEntry) bool
704
705 q = d.simplify(q)
706 if c, ok := q.(*query.Const); ok {
707 if !c.Value {
708 return &RepoList{}, nil
709 }
710 include = func(rle *RepoListEntry) bool {
711 return true
712 }
713 } else {
714 sr, err := d.Search(ctx, q, &SearchOptions{
715 ShardRepoMaxMatchCount: 1,
716 })
717 if err != nil {
718 return nil, err
719 }
720
721 foundRepos := make(map[string]struct{}, len(sr.Files))
722 for _, file := range sr.Files {
723 foundRepos[file.Repository] = struct{}{}
724 }
725
726 include = func(rle *RepoListEntry) bool {
727 _, ok := foundRepos[rle.Repository.Name]
728 return ok
729 }
730 }
731
732 var l RepoList
733
734 field, err := opts.GetField()
735 if err != nil {
736 return nil, err
737 }
738 switch field {
739 case RepoListFieldRepos:
740 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
741 case RepoListFieldReposMap:
742 l.ReposMap = make(ReposMap, len(d.repoListEntry))
743 }
744
745 for i := range d.repoListEntry {
746 if d.repoMetaData[i].Tombstone {
747 continue
748 }
749 rle := &d.repoListEntry[i]
750 if !include(rle) {
751 continue
752 }
753
754 l.Stats.Add(&rle.Stats)
755
756 // Backwards compat for when ID is missing
757 if rle.Repository.ID == 0 {
758 l.Repos = append(l.Repos, rle)
759 continue
760 }
761
762 switch field {
763 case RepoListFieldRepos:
764 l.Repos = append(l.Repos, rle)
765 case RepoListFieldReposMap:
766 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
767 HasSymbols: rle.Repository.HasSymbols,
768 Branches: rle.Repository.Branches,
769 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
770 }
771 }
772
773 }
774
775 // Only one of these fields is populated and in all cases the size of that
776 // field is the number of Repos in this shard.
777 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
778
779 return &l, nil
780}
781
782// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
783// mt is equivalent to the input r, isEqual = true and the matchTree can be used
784// in place of the regex r. If singleLine = true, then the matchTree and all
785// its children only match terms on the same line. singleLine is used during
786// recursion to decide whether to return an andLineMatchTree (singleLine = true)
787// or a andMatchTree (singleLine = false).
788func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
789 // TODO - we could perhaps transform Begin/EndText in '\n'?
790 // TODO - we could perhaps transform CharClass in (OrQuery )
791 // if there are just a few runes, and part of a OpConcat?
792 switch r.Op {
793 case syntax.OpLiteral:
794 s := string(r.Rune)
795 if len(s) >= minTextSize {
796 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
797 return mt, true, !strings.Contains(s, "\n"), err
798 }
799 case syntax.OpCapture:
800 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
801
802 case syntax.OpPlus:
803 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
804
805 case syntax.OpRepeat:
806 if r.Min == 1 {
807 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
808 } else if r.Min > 1 {
809 // (x){2,} can't be expressed precisely by the matchTree
810 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
811 return mt, false, singleLine, err
812 }
813 case syntax.OpConcat, syntax.OpAlternate:
814 var qs []matchTree
815 isEq := true
816 singleLine = true
817 for _, sr := range r.Sub {
818 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
819 if err != nil {
820 return nil, false, false, err
821 }
822 isEq = isEq && subIsEq
823 singleLine = singleLine && subSingleLine
824 qs = append(qs, sq)
825 }
826 }
827 if r.Op == syntax.OpConcat {
828 if len(qs) > 1 {
829 isEq = false
830 }
831 newQs := make([]matchTree, 0, len(qs))
832 for _, q := range qs {
833 if _, ok := q.(*bruteForceMatchTree); ok {
834 continue
835 }
836 newQs = append(newQs, q)
837 }
838 if len(newQs) == 1 {
839 return newQs[0], isEq, singleLine, nil
840 }
841 if len(newQs) == 0 {
842 return &bruteForceMatchTree{}, isEq, singleLine, nil
843 }
844 if singleLine {
845 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
846 }
847 return &andMatchTree{newQs}, isEq, singleLine, nil
848 }
849 for _, q := range qs {
850 if _, ok := q.(*bruteForceMatchTree); ok {
851 return q, isEq, false, nil
852 }
853 }
854 if len(qs) == 0 {
855 return &noMatchTree{Why: "const"}, isEq, false, nil
856 }
857 return &orMatchTree{qs}, isEq, false, nil
858 case syntax.OpStar:
859 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
860 return &bruteForceMatchTree{}, false, true, nil
861 }
862 }
863 return &bruteForceMatchTree{}, false, false, nil
864}
865
866type timer struct {
867 last time.Time
868}
869
870func newTimer() *timer {
871 return &timer{
872 last: time.Now(),
873 }
874}
875
876func (t *timer) Elapsed() time.Duration {
877 now := time.Now()
878 d := now.Sub(t.last)
879 t.last = now
880 return d
881}