fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "math"
22 "regexp/syntax"
23 "sort"
24 "strings"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt/query"
30)
31
32const maxUInt16 = 0xffff
33
34func (m *FileMatch) addScore(what string, s float64, debugScore bool) {
35 if debugScore {
36 m.Debug += fmt.Sprintf("%s:%.2f, ", what, s)
37 }
38 m.Score += s
39}
40
41func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
42 if debugScore {
43 m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
44 }
45 m.Score += score
46}
47
48// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
49// repository names fulfill the predicate, Const(false) if none of them do, and q
50// otherwise.
51func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
52 count := 0
53 alive := len(d.repoMetaData)
54 for i := range d.repoMetaData {
55 if d.repoMetaData[i].Tombstone {
56 alive--
57 } else if predicate(&d.repoMetaData[i]) {
58 count++
59 }
60 }
61 if count == alive {
62 return &query.Const{Value: true}
63 }
64 if count > 0 {
65 return q
66 }
67 return &query.Const{Value: false}
68}
69
70func (d *indexData) simplify(in query.Q) query.Q {
71 eval := query.Map(in, func(q query.Q) query.Q {
72 switch r := q.(type) {
73 case *query.Repo:
74 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
75 return r.Regexp.MatchString(repo.Name)
76 })
77 case *query.RepoRegexp:
78 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
79 return r.Regexp.MatchString(repo.Name)
80 })
81 case *query.BranchesRepos:
82 for i := range d.repoMetaData {
83 for _, br := range r.List {
84 if br.Repos.Contains(d.repoMetaData[i].ID) {
85 return q
86 }
87 }
88 }
89 return &query.Const{Value: false}
90 case *query.RepoSet:
91 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
92 return r.Set[repo.Name]
93 })
94 case *query.RepoIDs:
95 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
96 return r.Repos.Contains(repo.ID)
97 })
98 case *query.Language:
99 _, has := d.metaData.LanguageMap[r.Language]
100 if !has && d.metaData.IndexFeatureVersion < 12 {
101 // For index files that haven't been re-indexed by go-enry,
102 // fall back to file-based matching and continue even if this
103 // repo doesn't have the specific language present.
104 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
105 if extsForLang != nil {
106 extFrags := make([]string, 0, len(extsForLang))
107 for _, ext := range extsForLang {
108 extFrags = append(extFrags, regexp.QuoteMeta(ext))
109 }
110 if len(extFrags) > 0 {
111 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
112 // inlined copy of query.regexpQuery
113 re, err := syntax.Parse(pattern, syntax.Perl)
114 if err != nil {
115 return &query.Const{Value: false}
116 }
117 if re.Op == syntax.OpLiteral {
118 return &query.Substring{
119 Pattern: string(re.Rune),
120 FileName: true,
121 }
122 }
123 return &query.Regexp{
124 Regexp: re,
125 FileName: true,
126 }
127 }
128 }
129 }
130 if !has {
131 return &query.Const{Value: false}
132 }
133 }
134 return q
135 })
136 return query.Simplify(eval)
137}
138
139func (o *SearchOptions) SetDefaults() {
140 if o.ShardMaxMatchCount == 0 {
141 // We cap the total number of matches, so overly broad
142 // searches don't crash the machine.
143 o.ShardMaxMatchCount = 100000
144 }
145 if o.TotalMaxMatchCount == 0 {
146 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
147 }
148}
149
150func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
151 copyOpts := *opts
152 opts = ©Opts
153 opts.SetDefaults()
154
155 var res SearchResult
156 if len(d.fileNameIndex) == 0 {
157 return &res, nil
158 }
159
160 select {
161 case <-ctx.Done():
162 res.Stats.ShardsSkipped++
163 return &res, nil
164 default:
165 }
166
167 q = d.simplify(q)
168 if c, ok := q.(*query.Const); ok && !c.Value {
169 return &res, nil
170 }
171
172 if opts.EstimateDocCount {
173 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
174 return &res, nil
175 }
176
177 q = query.Map(q, query.ExpandFileContent)
178
179 mt, err := d.newMatchTree(q, matchTreeOpt{})
180 if err != nil {
181 return nil, err
182 }
183
184 mt, err = pruneMatchTree(mt)
185 if err != nil {
186 return nil, err
187 }
188 if mt == nil {
189 res.Stats.ShardsSkippedFilter++
190 return &res, nil
191 }
192
193 res.Stats.ShardsScanned++
194
195 cp := &contentProvider{
196 id: d,
197 stats: &res.Stats,
198 }
199
200 // Track the number of documents found in a repository for
201 // ShardRepoMaxMatchCount
202 var (
203 lastRepoID uint16
204 repoMatchCount int
205 )
206
207 docCount := uint32(len(d.fileBranchMasks))
208 lastDoc := int(-1)
209
210nextFileMatch:
211 for {
212 canceled := false
213 select {
214 case <-ctx.Done():
215 canceled = true
216 default:
217 }
218
219 nextDoc := mt.nextDoc()
220 if int(nextDoc) <= lastDoc {
221 nextDoc = uint32(lastDoc + 1)
222 }
223
224 for ; nextDoc < docCount; nextDoc++ {
225 repoID := d.repos[nextDoc]
226 repoMetadata := &d.repoMetaData[repoID]
227
228 // Skip tombstoned repositories
229 if repoMetadata.Tombstone {
230 continue
231 }
232
233 // Skip documents that are tombstoned
234 if len(repoMetadata.FileTombstones) > 0 {
235 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
236 continue
237 }
238 }
239
240 // Skip documents over ShardRepoMaxMatchCount if specified.
241 if opts.ShardRepoMaxMatchCount > 0 {
242 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
243 res.Stats.FilesSkipped++
244 continue
245 }
246 }
247
248 break
249 }
250
251 if nextDoc >= docCount {
252 break
253 }
254
255 lastDoc = int(nextDoc)
256
257 // We track lastRepoID for ShardRepoMaxMatchCount
258 if lastRepoID != d.repos[nextDoc] {
259 lastRepoID = d.repos[nextDoc]
260 repoMatchCount = 0
261 }
262
263 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
264 res.Stats.FilesSkipped += int(docCount - nextDoc)
265 break
266 }
267
268 res.Stats.FilesConsidered++
269 mt.prepare(nextDoc)
270
271 cp.setDocument(nextDoc)
272
273 known := make(map[matchTree]bool)
274 md := d.repoMetaData[d.repos[nextDoc]]
275
276 for cost := costMin; cost <= costMax; cost++ {
277 v, ok := mt.matches(cp, cost, known)
278 if ok && !v {
279 continue nextFileMatch
280 }
281
282 if cost == costMax && !ok {
283 log.Panicf("did not decide. Repo %s, doc %d, known %v",
284 md.Name, nextDoc, known)
285 }
286 }
287
288 fileMatch := FileMatch{
289 Repository: md.Name,
290 RepositoryID: md.ID,
291 RepositoryPriority: md.priority,
292 FileName: string(d.fileName(nextDoc)),
293 Checksum: d.getChecksum(nextDoc),
294 Language: d.languageMap[d.getLanguage(nextDoc)],
295 }
296
297 if s := d.subRepos[nextDoc]; s > 0 {
298 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
299 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
300 }
301 path := d.subRepoPaths[d.repos[nextDoc]][s]
302 fileMatch.SubRepositoryPath = path
303 sr := md.SubRepoMap[path]
304 fileMatch.SubRepositoryName = sr.Name
305 if idx := d.branchIndex(nextDoc); idx >= 0 {
306 fileMatch.Version = sr.Branches[idx].Version
307 }
308 } else {
309 idx := d.branchIndex(nextDoc)
310 if idx >= 0 {
311 fileMatch.Version = md.Branches[idx].Version
312 }
313 }
314
315 shouldMergeMatches := !opts.ChunkMatches
316 finalCands := gatherMatches(mt, known, shouldMergeMatches)
317
318 if len(finalCands) == 0 {
319 nm := d.fileName(nextDoc)
320 finalCands = append(finalCands,
321 &candidateMatch{
322 caseSensitive: false,
323 fileName: true,
324 substrBytes: nm,
325 substrLowered: nm,
326 file: nextDoc,
327 runeOffset: 0,
328 byteOffset: 0,
329 byteMatchSz: uint32(len(nm)),
330 })
331 }
332
333 if opts.ChunkMatches {
334 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
335 } else {
336 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
337 }
338
339 if opts.UseKeywordScoring {
340 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
341 } else {
342 // Use the standard, non-experimental scoring method by default
343 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
344 }
345
346 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
347 sortMatchesByScore(fileMatch.LineMatches)
348 sortChunkMatchesByScore(fileMatch.ChunkMatches)
349 if opts.Whole {
350 fileMatch.Content = cp.data(false)
351 }
352
353 matchedChunkRanges := 0
354 for _, cm := range fileMatch.ChunkMatches {
355 matchedChunkRanges += len(cm.Ranges)
356 }
357
358 repoMatchCount += len(fileMatch.LineMatches)
359 repoMatchCount += matchedChunkRanges
360
361 if opts.DebugScore {
362 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
363 }
364
365 res.Files = append(res.Files, fileMatch)
366 res.Stats.MatchCount += len(fileMatch.LineMatches)
367 res.Stats.MatchCount += matchedChunkRanges
368 res.Stats.FileCount++
369 }
370
371 // We do not sort Files here, instead we rely on the shards pkg to do file
372 // ranking. If we sorted now, we would break the assumption that results
373 // from the same repo in a shard appear next to each other.
374
375 for _, md := range d.repoMetaData {
376 r := md
377 addRepo(&res, &r)
378 for _, v := range r.SubRepoMap {
379 addRepo(&res, v)
380 }
381 }
382
383 visitMatchTree(mt, func(mt matchTree) {
384 if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
385 atom.updateStats(&res.Stats)
386 }
387 })
388
389 // If document ranking is enabled, then we can rank and truncate the files to save memory.
390 if limit := opts.MaxDocDisplayCount; opts.UseDocumentRanks && limit > 0 && limit < len(res.Files) {
391 SortFiles(res.Files)
392 res.Files = res.Files[:limit]
393 }
394
395 return &res, nil
396}
397
398// scoreFile computes a score for the file match using various scoring signals, like
399// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
400func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
401 atomMatchCount := 0
402 visitMatches(mt, known, func(mt matchTree) {
403 atomMatchCount++
404 })
405
406 // atom-count boosts files with matches from more than 1 atom. The
407 // maximum boost is scoreFactorAtomMatch.
408 if atomMatchCount > 0 {
409 fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore)
410 }
411
412 maxFileScore := 0.0
413 repetitions := 0
414 for i := range fileMatch.LineMatches {
415 if maxFileScore < fileMatch.LineMatches[i].Score {
416 maxFileScore = fileMatch.LineMatches[i].Score
417 repetitions = 0
418 } else if maxFileScore == fileMatch.LineMatches[i].Score {
419 repetitions += 1
420 }
421
422 // Order by ordering in file.
423 fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches))))
424 }
425
426 for i := range fileMatch.ChunkMatches {
427 if maxFileScore < fileMatch.ChunkMatches[i].Score {
428 maxFileScore = fileMatch.ChunkMatches[i].Score
429 }
430
431 // Order by ordering in file.
432 fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches))))
433 }
434
435 // Maintain ordering of input files. This
436 // strictly dominates the in-file ordering of
437 // the matches.
438 fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)
439
440 // Prefer docs with several top-scored matches.
441 fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
442
443 if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
444 weight := scoreFileRankFactor
445 if opts.DocumentRanksWeight > 0.0 {
446 weight = opts.DocumentRanksWeight
447 }
448
449 ranks := d.ranks[doc]
450 // The ranks slice always contains one entry representing the file rank (unless it's empty since the
451 // file doesn't have a rank). This is left over from when documents could have multiple rank signals,
452 // and we plan to clean this up.
453 if len(ranks) > 0 {
454 // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we
455 // cap it just in case to ensure it falls in the range [0, 1].
456 normalized := math.Min(1.0, ranks[0]/32.0)
457 fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore)
458 }
459 }
460
461 md := d.repoMetaData[d.repos[doc]]
462 fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore)
463 fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore)
464}
465
466// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
467// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
468// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
469//
470// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
471// since BM25 is not normalized and can be tricky to combine with other scoring signals.
472func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
473 // Treat each candidate match as a term and compute the frequencies. For now, ignore case
474 // sensitivity and treat filenames and symbols the same as content.
475 termFreqs := map[string]int{}
476 for _, cand := range cands {
477 term := string(cand.substrLowered)
478 termFreqs[term]++
479 }
480
481 // Compute the file length ratio. Usually the calculation would be based on terms, but using
482 // bytes should work fine, as we're just computing a ratio.
483 fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
484 numFiles := len(d.boundaries)
485 averageFileLength := float64(d.boundaries[numFiles - 1]) / float64(numFiles)
486 L := fileLength / averageFileLength
487
488 // Use standard parameter defaults (used in Lucene and academic papers)
489 k, b := 1.2, 0.75
490 sumTf := 0.0 // Just for debugging
491 score := 0.0
492 for _, freq := range termFreqs {
493 tf := float64(freq)
494 sumTf += tf
495 score += ((k + 1.0) * tf) / (k * (1.0 - b + b * L) + tf)
496 }
497
498 fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
499}
500
501func addRepo(res *SearchResult, repo *Repository) {
502 if res.RepoURLs == nil {
503 res.RepoURLs = map[string]string{}
504 }
505 res.RepoURLs[repo.Name] = repo.FileURLTemplate
506
507 if res.LineFragments == nil {
508 res.LineFragments = map[string]string{}
509 }
510 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
511}
512
513type sortByOffsetSlice []*candidateMatch
514
515func (m sortByOffsetSlice) Len() int { return len(m) }
516func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
517func (m sortByOffsetSlice) Less(i, j int) bool {
518 return m[i].byteOffset < m[j].byteOffset
519}
520
521// Gather matches from this document. This never returns a mixture of
522// filename/content matches: if there are content matches, all
523// filename matches are trimmed from the result. The matches are
524// returned in document order and are non-overlapping.
525//
526// If `merge` is set, overlapping and adjacent matches will be merged
527// into a single match. Otherwise, overlapping matches will be removed,
528// but adjacent matches will remain.
529func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
530 var cands []*candidateMatch
531 visitMatches(mt, known, func(mt matchTree) {
532 if smt, ok := mt.(*substrMatchTree); ok {
533 cands = append(cands, smt.current...)
534 }
535 if rmt, ok := mt.(*regexpMatchTree); ok {
536 cands = append(cands, rmt.found...)
537 }
538 if rmt, ok := mt.(*wordMatchTree); ok {
539 cands = append(cands, rmt.found...)
540 }
541 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
542 cands = append(cands, smt.found...)
543 }
544 })
545
546 foundContentMatch := false
547 for _, c := range cands {
548 if !c.fileName {
549 foundContentMatch = true
550 break
551 }
552 }
553
554 res := cands[:0]
555 for _, c := range cands {
556 if !foundContentMatch || !c.fileName {
557 res = append(res, c)
558 }
559 }
560 cands = res
561
562 if merge {
563 // Merge adjacent candidates. This guarantees that the matches
564 // are non-overlapping.
565 sort.Sort((sortByOffsetSlice)(cands))
566 res = cands[:0]
567 for i, c := range cands {
568 if i == 0 {
569 res = append(res, c)
570 continue
571 }
572 last := res[len(res)-1]
573 lastEnd := last.byteOffset + last.byteMatchSz
574 end := c.byteOffset + c.byteMatchSz
575 if lastEnd >= c.byteOffset {
576 if end > lastEnd {
577 last.byteMatchSz = end - last.byteOffset
578 }
579 continue
580 }
581
582 res = append(res, c)
583 }
584 } else {
585 // Remove overlapping candidates. This guarantees that the matches
586 // are non-overlapping, but also preserves expected match counts.
587 sort.Sort((sortByOffsetSlice)(cands))
588 res = cands[:0]
589 for i, c := range cands {
590 if i == 0 {
591 res = append(res, c)
592 continue
593 }
594 last := res[len(res)-1]
595 lastEnd := last.byteOffset + last.byteMatchSz
596 if lastEnd > c.byteOffset {
597 continue
598 }
599
600 res = append(res, c)
601 }
602 }
603
604 return res
605}
606
607func (d *indexData) branchIndex(docID uint32) int {
608 mask := d.fileBranchMasks[docID]
609 idx := 0
610 for mask != 0 {
611 if mask&0x1 != 0 {
612 return idx
613 }
614 idx++
615 mask >>= 1
616 }
617 return -1
618}
619
620// gatherBranches returns a list of branch names taking into account any branch
621// filters in the query. If the query contains a branch filter, it returns all
622// branches containing the docID and matching the branch filter. Otherwise, it
623// returns all branches containing docID.
624func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
625 var mask uint64
626 visitMatches(mt, known, func(mt matchTree) {
627 bq, ok := mt.(*branchQueryMatchTree)
628 if !ok {
629 return
630 }
631
632 mask = mask | bq.branchMask()
633 })
634
635 if mask == 0 {
636 mask = d.fileBranchMasks[docID]
637 }
638
639 var branches []string
640 id := uint32(1)
641 branchNames := d.branchNames[d.repos[docID]]
642 for mask != 0 {
643 if mask&0x1 != 0 {
644 branches = append(branches, branchNames[uint(id)])
645 }
646 id <<= 1
647 mask >>= 1
648 }
649
650 return branches
651}
652
653func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
654 var include func(rle *RepoListEntry) bool
655
656 q = d.simplify(q)
657 if c, ok := q.(*query.Const); ok {
658 if !c.Value {
659 return &RepoList{}, nil
660 }
661 include = func(rle *RepoListEntry) bool {
662 return true
663 }
664 } else {
665 sr, err := d.Search(ctx, q, &SearchOptions{
666 ShardRepoMaxMatchCount: 1,
667 })
668 if err != nil {
669 return nil, err
670 }
671
672 foundRepos := make(map[string]struct{}, len(sr.Files))
673 for _, file := range sr.Files {
674 foundRepos[file.Repository] = struct{}{}
675 }
676
677 include = func(rle *RepoListEntry) bool {
678 _, ok := foundRepos[rle.Repository.Name]
679 return ok
680 }
681 }
682
683 var l RepoList
684
685 field, err := opts.GetField()
686 if err != nil {
687 return nil, err
688 }
689 switch field {
690 case RepoListFieldRepos:
691 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
692 case RepoListFieldMinimal:
693 l.Minimal = make(map[uint32]*MinimalRepoListEntry, len(d.repoListEntry))
694 case RepoListFieldReposMap:
695 l.ReposMap = make(ReposMap, len(d.repoListEntry))
696 }
697
698 for i := range d.repoListEntry {
699 if d.repoMetaData[i].Tombstone {
700 continue
701 }
702 rle := &d.repoListEntry[i]
703 if !include(rle) {
704 continue
705 }
706
707 l.Stats.Add(&rle.Stats)
708
709 // Backwards compat for when ID is missing
710 if rle.Repository.ID == 0 {
711 l.Repos = append(l.Repos, rle)
712 continue
713 }
714
715 switch field {
716 case RepoListFieldRepos:
717 l.Repos = append(l.Repos, rle)
718 case RepoListFieldMinimal:
719 l.Minimal[rle.Repository.ID] = &MinimalRepoListEntry{
720 HasSymbols: rle.Repository.HasSymbols,
721 Branches: rle.Repository.Branches,
722 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
723 }
724 case RepoListFieldReposMap:
725 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
726 HasSymbols: rle.Repository.HasSymbols,
727 Branches: rle.Repository.Branches,
728 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
729 }
730 }
731
732 }
733
734 // Only one of these fields is populated and in all cases the size of that
735 // field is the number of Repos in this shard.
736 l.Stats.Repos = len(l.Repos) + len(l.Minimal) + len(l.ReposMap)
737
738 return &l, nil
739}
740
741// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
742// mt is equivalent to the input r, isEqual = true and the matchTree can be used
743// in place of the regex r. If singleLine = true, then the matchTree and all
744// its children only match terms on the same line. singleLine is used during
745// recursion to decide whether to return an andLineMatchTree (singleLine = true)
746// or a andMatchTree (singleLine = false).
747func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
748 // TODO - we could perhaps transform Begin/EndText in '\n'?
749 // TODO - we could perhaps transform CharClass in (OrQuery )
750 // if there are just a few runes, and part of a OpConcat?
751 switch r.Op {
752 case syntax.OpLiteral:
753 s := string(r.Rune)
754 if len(s) >= minTextSize {
755 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
756 return mt, true, !strings.Contains(s, "\n"), err
757 }
758 case syntax.OpCapture:
759 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
760
761 case syntax.OpPlus:
762 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
763
764 case syntax.OpRepeat:
765 if r.Min == 1 {
766 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
767 } else if r.Min > 1 {
768 // (x){2,} can't be expressed precisely by the matchTree
769 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
770 return mt, false, singleLine, err
771 }
772 case syntax.OpConcat, syntax.OpAlternate:
773 var qs []matchTree
774 isEq := true
775 singleLine = true
776 for _, sr := range r.Sub {
777 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
778 if err != nil {
779 return nil, false, false, err
780 }
781 isEq = isEq && subIsEq
782 singleLine = singleLine && subSingleLine
783 qs = append(qs, sq)
784 }
785 }
786 if r.Op == syntax.OpConcat {
787 if len(qs) > 1 {
788 isEq = false
789 }
790 newQs := make([]matchTree, 0, len(qs))
791 for _, q := range qs {
792 if _, ok := q.(*bruteForceMatchTree); ok {
793 continue
794 }
795 newQs = append(newQs, q)
796 }
797 if len(newQs) == 1 {
798 return newQs[0], isEq, singleLine, nil
799 }
800 if len(newQs) == 0 {
801 return &bruteForceMatchTree{}, isEq, singleLine, nil
802 }
803 if singleLine {
804 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
805 }
806 return &andMatchTree{newQs}, isEq, singleLine, nil
807 }
808 for _, q := range qs {
809 if _, ok := q.(*bruteForceMatchTree); ok {
810 return q, isEq, false, nil
811 }
812 }
813 if len(qs) == 0 {
814 return &noMatchTree{"const"}, isEq, false, nil
815 }
816 return &orMatchTree{qs}, isEq, false, nil
817 case syntax.OpStar:
818 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
819 return &bruteForceMatchTree{}, false, true, nil
820 }
821 }
822 return &bruteForceMatchTree{}, false, false, nil
823}