fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "sort"
23 "strings"
24 "time"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt/query"
30)
31
32// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
33// repository names fulfill the predicate, Const(false) if none of them do, and q
34// otherwise.
35func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*Repository) bool) query.Q {
36 count := 0
37 alive := len(d.repoMetaData)
38 for i := range d.repoMetaData {
39 if d.repoMetaData[i].Tombstone {
40 alive--
41 } else if predicate(&d.repoMetaData[i]) {
42 count++
43 }
44 }
45 if count == alive {
46 return &query.Const{Value: true}
47 }
48 if count > 0 {
49 return q
50 }
51 return &query.Const{Value: false}
52}
53
54func (d *indexData) simplify(in query.Q) query.Q {
55 eval := query.Map(in, func(q query.Q) query.Q {
56 switch r := q.(type) {
57 case *query.Repo:
58 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
59 return r.Regexp.MatchString(repo.Name)
60 })
61 case *query.RepoRegexp:
62 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
63 return r.Regexp.MatchString(repo.Name)
64 })
65 case *query.BranchesRepos:
66 for i := range d.repoMetaData {
67 for _, br := range r.List {
68 if br.Repos.Contains(d.repoMetaData[i].ID) {
69 return q
70 }
71 }
72 }
73 return &query.Const{Value: false}
74 case *query.RepoSet:
75 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
76 return r.Set[repo.Name]
77 })
78 case *query.RepoIDs:
79 return d.simplifyMultiRepo(q, func(repo *Repository) bool {
80 return r.Repos.Contains(repo.ID)
81 })
82 case *query.Language:
83 _, has := d.metaData.LanguageMap[r.Language]
84 if !has && d.metaData.IndexFeatureVersion < 12 {
85 // For index files that haven't been re-indexed by go-enry,
86 // fall back to file-based matching and continue even if this
87 // repo doesn't have the specific language present.
88 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
89 if extsForLang != nil {
90 extFrags := make([]string, 0, len(extsForLang))
91 for _, ext := range extsForLang {
92 extFrags = append(extFrags, regexp.QuoteMeta(ext))
93 }
94 if len(extFrags) > 0 {
95 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
96 // inlined copy of query.regexpQuery
97 re, err := syntax.Parse(pattern, syntax.Perl)
98 if err != nil {
99 return &query.Const{Value: false}
100 }
101 if re.Op == syntax.OpLiteral {
102 return &query.Substring{
103 Pattern: string(re.Rune),
104 FileName: true,
105 }
106 }
107 return &query.Regexp{
108 Regexp: re,
109 FileName: true,
110 }
111 }
112 }
113 }
114 if !has {
115 return &query.Const{Value: false}
116 }
117 }
118 return q
119 })
120 return query.Simplify(eval)
121}
122
123func (o *SearchOptions) SetDefaults() {
124 if o.ShardMaxMatchCount == 0 {
125 // We cap the total number of matches, so overly broad
126 // searches don't crash the machine.
127 o.ShardMaxMatchCount = 100000
128 }
129 if o.TotalMaxMatchCount == 0 {
130 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
131 }
132}
133
134func (d *indexData) Search(ctx context.Context, q query.Q, opts *SearchOptions) (sr *SearchResult, err error) {
135 timer := newTimer()
136
137 copyOpts := *opts
138 opts = ©Opts
139 opts.SetDefaults()
140
141 var res SearchResult
142 if len(d.fileNameIndex) == 0 {
143 return &res, nil
144 }
145
146 select {
147 case <-ctx.Done():
148 res.Stats.ShardsSkipped++
149 return &res, nil
150 default:
151 }
152
153 q = d.simplify(q)
154 if c, ok := q.(*query.Const); ok && !c.Value {
155 return &res, nil
156 }
157
158 if opts.EstimateDocCount {
159 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
160 return &res, nil
161 }
162
163 q = query.Map(q, query.ExpandFileContent)
164
165 mt, err := d.newMatchTree(q, matchTreeOpt{})
166 if err != nil {
167 return nil, err
168 }
169
170 // Capture the costs of construction before pruning
171 updateMatchTreeStats(mt, &res.Stats)
172
173 mt, err = pruneMatchTree(mt)
174 if err != nil {
175 return nil, err
176 }
177 res.Stats.MatchTreeConstruction = timer.Elapsed()
178 if mt == nil {
179 res.Stats.ShardsSkippedFilter++
180 return &res, nil
181 }
182
183 res.Stats.ShardsScanned++
184
185 cp := &contentProvider{
186 id: d,
187 stats: &res.Stats,
188 }
189
190 // Track the number of documents found in a repository for
191 // ShardRepoMaxMatchCount
192 var (
193 lastRepoID uint16
194 repoMatchCount int
195 )
196
197 docCount := uint32(len(d.fileBranchMasks))
198 lastDoc := int(-1)
199
200nextFileMatch:
201 for {
202 canceled := false
203 select {
204 case <-ctx.Done():
205 canceled = true
206 default:
207 }
208
209 nextDoc := mt.nextDoc()
210 if int(nextDoc) <= lastDoc {
211 nextDoc = uint32(lastDoc + 1)
212 }
213
214 for ; nextDoc < docCount; nextDoc++ {
215 repoID := d.repos[nextDoc]
216 repoMetadata := &d.repoMetaData[repoID]
217
218 // Skip tombstoned repositories
219 if repoMetadata.Tombstone {
220 continue
221 }
222
223 // Skip documents that are tombstoned
224 if len(repoMetadata.FileTombstones) > 0 {
225 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
226 continue
227 }
228 }
229
230 // Skip documents over ShardRepoMaxMatchCount if specified.
231 if opts.ShardRepoMaxMatchCount > 0 {
232 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
233 res.Stats.FilesSkipped++
234 continue
235 }
236 }
237
238 break
239 }
240
241 if nextDoc >= docCount {
242 break
243 }
244
245 lastDoc = int(nextDoc)
246
247 // We track lastRepoID for ShardRepoMaxMatchCount
248 if lastRepoID != d.repos[nextDoc] {
249 lastRepoID = d.repos[nextDoc]
250 repoMatchCount = 0
251 }
252
253 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
254 res.Stats.FilesSkipped += int(docCount - nextDoc)
255 break
256 }
257
258 res.Stats.FilesConsidered++
259 mt.prepare(nextDoc)
260
261 cp.setDocument(nextDoc)
262
263 known := make(map[matchTree]bool)
264 md := d.repoMetaData[d.repos[nextDoc]]
265
266 for cost := costMin; cost <= costMax; cost++ {
267 switch evalMatchTree(cp, cost, known, mt) {
268 case matchesRequiresHigherCost:
269 if cost == costMax {
270 log.Panicf("did not decide. Repo %s, doc %d, known %v",
271 md.Name, nextDoc, known)
272 }
273 case matchesFound:
274 // could short-circuit now, but we want to run higher costs to
275 // potentially find higher ranked matches.
276 case matchesNone:
277 continue nextFileMatch
278 }
279 }
280
281 fileMatch := FileMatch{
282 Repository: md.Name,
283 RepositoryID: md.ID,
284 RepositoryPriority: md.priority,
285 FileName: string(d.fileName(nextDoc)),
286 Checksum: d.getChecksum(nextDoc),
287 Language: d.languageMap[d.getLanguage(nextDoc)],
288 }
289
290 if s := d.subRepos[nextDoc]; s > 0 {
291 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
292 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
293 }
294 path := d.subRepoPaths[d.repos[nextDoc]][s]
295 fileMatch.SubRepositoryPath = path
296 sr := md.SubRepoMap[path]
297 fileMatch.SubRepositoryName = sr.Name
298 if idx := d.branchIndex(nextDoc); idx >= 0 {
299 fileMatch.Version = sr.Branches[idx].Version
300 }
301 } else {
302 idx := d.branchIndex(nextDoc)
303 if idx >= 0 {
304 fileMatch.Version = md.Branches[idx].Version
305 }
306 }
307
308 // Important invariant for performance: finalCands is sorted by offset and
309 // non-overlapping. gatherMatches respects this invariant and all later
310 // transformations respect this.
311 shouldMergeMatches := !opts.ChunkMatches
312 finalCands := d.gatherMatches(nextDoc, mt, known, shouldMergeMatches)
313
314 if opts.ChunkMatches {
315 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
316 } else {
317 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
318 }
319
320 if opts.UseKeywordScoring {
321 d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
322 } else {
323 // Use the standard, non-experimental scoring method by default
324 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
325 }
326
327 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
328 sortMatchesByScore(fileMatch.LineMatches)
329 sortChunkMatchesByScore(fileMatch.ChunkMatches)
330 if opts.Whole {
331 fileMatch.Content = cp.data(false)
332 }
333
334 matchedChunkRanges := 0
335 for _, cm := range fileMatch.ChunkMatches {
336 matchedChunkRanges += len(cm.Ranges)
337 }
338
339 repoMatchCount += len(fileMatch.LineMatches)
340 repoMatchCount += matchedChunkRanges
341
342 if opts.DebugScore {
343 fileMatch.Debug = fmt.Sprintf("score:%.2f <- %s", fileMatch.Score, fileMatch.Debug)
344 }
345
346 res.Files = append(res.Files, fileMatch)
347 res.Stats.MatchCount += len(fileMatch.LineMatches)
348 res.Stats.MatchCount += matchedChunkRanges
349 res.Stats.FileCount++
350 }
351
352 for _, md := range d.repoMetaData {
353 r := md
354 addRepo(&res, &r)
355 for _, v := range r.SubRepoMap {
356 addRepo(&res, v)
357 }
358 }
359
360 // Update stats based on work done during document search.
361 updateMatchTreeStats(mt, &res.Stats)
362
363 // If document ranking is enabled, then we can rank and truncate the files to save memory.
364 if opts.UseDocumentRanks {
365 res.Files = SortAndTruncateFiles(res.Files, opts)
366 }
367
368 res.Stats.MatchTreeSearch = timer.Elapsed()
369
370 return &res, nil
371}
372
373func addRepo(res *SearchResult, repo *Repository) {
374 if res.RepoURLs == nil {
375 res.RepoURLs = map[string]string{}
376 }
377 res.RepoURLs[repo.Name] = repo.FileURLTemplate
378
379 if res.LineFragments == nil {
380 res.LineFragments = map[string]string{}
381 }
382 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
383}
384
385// Gather matches from this document. The matches are returned in document
386// order and are non-overlapping. All filename and content matches are
387// returned, with filename matches first.
388//
389// If `merge` is set, overlapping and adjacent matches will be merged
390// into a single match. Otherwise, overlapping matches will be removed,
391// but adjacent matches will remain.
392func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
393 var cands []*candidateMatch
394 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
395 if smt, ok := mt.(*substrMatchTree); ok {
396 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
397 }
398 if rmt, ok := mt.(*regexpMatchTree); ok {
399 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
400 }
401 if rmt, ok := mt.(*wordMatchTree); ok {
402 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
403 }
404 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
405 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
406 }
407 })
408
409 // If we found no candidate matches at all, assume there must have been a match on filename.
410 if len(cands) == 0 {
411 nm := d.fileName(nextDoc)
412 return []*candidateMatch{{
413 caseSensitive: false,
414 fileName: true,
415 substrBytes: nm,
416 substrLowered: nm,
417 file: nextDoc,
418 runeOffset: 0,
419 byteOffset: 0,
420 byteMatchSz: uint32(len(nm)),
421 }}
422 }
423
424 sort.Sort((sortByOffsetSlice)(cands))
425 res := cands[:0]
426 mergeRun := 1
427 for i, c := range cands {
428 if i == 0 {
429 res = append(res, c)
430 continue
431 }
432
433 last := res[len(res)-1]
434
435 // Never compare filename and content matches
436 if last.fileName != c.fileName {
437 res = append(res, c)
438 continue
439 }
440
441 if merge {
442 // Merge adjacent candidates. This guarantees that the matches
443 // are non-overlapping.
444 lastEnd := last.byteOffset + last.byteMatchSz
445 end := c.byteOffset + c.byteMatchSz
446 if lastEnd >= c.byteOffset {
447 mergeRun++
448 // Average out the score across the merged candidates. Only do it if
449 // we are boosting to avoid floating point funkiness in the normal
450 // case.
451 if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
452 last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
453 }
454
455 // latest candidate goes further, update our end
456 if end > lastEnd {
457 last.byteMatchSz = end - last.byteOffset
458 }
459
460 continue
461 } else {
462 mergeRun = 1
463 }
464 } else {
465 // Remove overlapping candidates. This guarantees that the matches
466 // are non-overlapping, but also preserves expected match counts.
467 lastEnd := last.byteOffset + last.byteMatchSz
468 if lastEnd > c.byteOffset {
469 continue
470 }
471 }
472
473 res = append(res, c)
474 }
475 return res
476}
477
478type sortByOffsetSlice []*candidateMatch
479
480func (m sortByOffsetSlice) Len() int { return len(m) }
481func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
482func (m sortByOffsetSlice) Less(i, j int) bool {
483 // Sort all filename matches to the start
484 if m[i].fileName != m[j].fileName {
485 return m[i].fileName
486 }
487
488 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
489 // Prefer longer candidates if starting at same position
490 return m[i].byteMatchSz > m[j].byteMatchSz
491 }
492 return m[i].byteOffset < m[j].byteOffset
493}
494
495// setScoreWeight is a helper used by gatherMatches to set the weight based on
496// the score weight of the matchTree.
497func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
498 for _, m := range cm {
499 m.scoreWeight = scoreWeight
500 }
501 return cm
502}
503
504func (d *indexData) branchIndex(docID uint32) int {
505 mask := d.fileBranchMasks[docID]
506 idx := 0
507 for mask != 0 {
508 if mask&0x1 != 0 {
509 return idx
510 }
511 idx++
512 mask >>= 1
513 }
514 return -1
515}
516
517// gatherBranches returns a list of branch names taking into account any branch
518// filters in the query. If the query contains a branch filter, it returns all
519// branches containing the docID and matching the branch filter. Otherwise, it
520// returns all branches containing docID.
521func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
522 var mask uint64
523 visitMatchAtoms(mt, known, func(mt matchTree) {
524 bq, ok := mt.(*branchQueryMatchTree)
525 if !ok {
526 return
527 }
528
529 mask = mask | bq.branchMask()
530 })
531
532 if mask == 0 {
533 mask = d.fileBranchMasks[docID]
534 }
535
536 var branches []string
537 id := uint32(1)
538 branchNames := d.branchNames[d.repos[docID]]
539 for mask != 0 {
540 if mask&0x1 != 0 {
541 branches = append(branches, branchNames[uint(id)])
542 }
543 id <<= 1
544 mask >>= 1
545 }
546
547 return branches
548}
549
550func (d *indexData) List(ctx context.Context, q query.Q, opts *ListOptions) (rl *RepoList, err error) {
551 var include func(rle *RepoListEntry) bool
552
553 q = d.simplify(q)
554 if c, ok := q.(*query.Const); ok {
555 if !c.Value {
556 return &RepoList{}, nil
557 }
558 include = func(rle *RepoListEntry) bool {
559 return true
560 }
561 } else {
562 sr, err := d.Search(ctx, q, &SearchOptions{
563 ShardRepoMaxMatchCount: 1,
564 })
565 if err != nil {
566 return nil, err
567 }
568
569 foundRepos := make(map[string]struct{}, len(sr.Files))
570 for _, file := range sr.Files {
571 foundRepos[file.Repository] = struct{}{}
572 }
573
574 include = func(rle *RepoListEntry) bool {
575 _, ok := foundRepos[rle.Repository.Name]
576 return ok
577 }
578 }
579
580 var l RepoList
581
582 field, err := opts.GetField()
583 if err != nil {
584 return nil, err
585 }
586 switch field {
587 case RepoListFieldRepos:
588 l.Repos = make([]*RepoListEntry, 0, len(d.repoListEntry))
589 case RepoListFieldReposMap:
590 l.ReposMap = make(ReposMap, len(d.repoListEntry))
591 }
592
593 for i := range d.repoListEntry {
594 if d.repoMetaData[i].Tombstone {
595 continue
596 }
597 rle := &d.repoListEntry[i]
598 if !include(rle) {
599 continue
600 }
601
602 l.Stats.Add(&rle.Stats)
603
604 // Backwards compat for when ID is missing
605 if rle.Repository.ID == 0 {
606 l.Repos = append(l.Repos, rle)
607 continue
608 }
609
610 switch field {
611 case RepoListFieldRepos:
612 l.Repos = append(l.Repos, rle)
613 case RepoListFieldReposMap:
614 l.ReposMap[rle.Repository.ID] = MinimalRepoListEntry{
615 HasSymbols: rle.Repository.HasSymbols,
616 Branches: rle.Repository.Branches,
617 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
618 }
619 }
620
621 }
622
623 // Only one of these fields is populated and in all cases the size of that
624 // field is the number of Repos in this shard.
625 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
626
627 return &l, nil
628}
629
630// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
631// mt is equivalent to the input r, isEqual = true and the matchTree can be used
632// in place of the regex r. If singleLine = true, then the matchTree and all
633// its children only match terms on the same line. singleLine is used during
634// recursion to decide whether to return an andLineMatchTree (singleLine = true)
635// or a andMatchTree (singleLine = false).
636func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
637 // TODO - we could perhaps transform Begin/EndText in '\n'?
638 // TODO - we could perhaps transform CharClass in (OrQuery )
639 // if there are just a few runes, and part of a OpConcat?
640 switch r.Op {
641 case syntax.OpLiteral:
642 s := string(r.Rune)
643 if len(s) >= minTextSize {
644 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase)
645 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive})
646 return mt, true, !strings.Contains(s, "\n"), err
647 }
648 case syntax.OpCapture:
649 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
650
651 case syntax.OpPlus:
652 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
653
654 case syntax.OpRepeat:
655 if r.Min == 1 {
656 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
657 } else if r.Min > 1 {
658 // (x){2,} can't be expressed precisely by the matchTree
659 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
660 return mt, false, singleLine, err
661 }
662 case syntax.OpConcat, syntax.OpAlternate:
663 var qs []matchTree
664 isEq := true
665 singleLine = true
666 for _, sr := range r.Sub {
667 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
668 if err != nil {
669 return nil, false, false, err
670 }
671 isEq = isEq && subIsEq
672 singleLine = singleLine && subSingleLine
673 qs = append(qs, sq)
674 }
675 }
676 if r.Op == syntax.OpConcat {
677 if len(qs) > 1 {
678 isEq = false
679 }
680 newQs := make([]matchTree, 0, len(qs))
681 for _, q := range qs {
682 if _, ok := q.(*bruteForceMatchTree); ok {
683 continue
684 }
685 newQs = append(newQs, q)
686 }
687 if len(newQs) == 1 {
688 return newQs[0], isEq, singleLine, nil
689 }
690 if len(newQs) == 0 {
691 return &bruteForceMatchTree{}, isEq, singleLine, nil
692 }
693 if singleLine {
694 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
695 }
696 return &andMatchTree{newQs}, isEq, singleLine, nil
697 }
698 for _, q := range qs {
699 if _, ok := q.(*bruteForceMatchTree); ok {
700 return q, isEq, false, nil
701 }
702 }
703 if len(qs) == 0 {
704 return &noMatchTree{Why: "const"}, isEq, false, nil
705 }
706 return &orMatchTree{qs}, isEq, false, nil
707 case syntax.OpStar:
708 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
709 return &bruteForceMatchTree{}, false, true, nil
710 }
711 }
712 return &bruteForceMatchTree{}, false, false, nil
713}
714
// timer measures the durations between consecutive calls to Elapsed.
type timer struct {
	// last is the instant of construction or of the latest Elapsed call.
	last time.Time
}

// newTimer returns a timer that starts measuring from the current time.
func newTimer() *timer {
	t := new(timer)
	t.last = time.Now()
	return t
}

// Elapsed returns the time passed since the timer was created or since the
// previous call to Elapsed, whichever is later, and restarts the measurement
// from now.
func (t *timer) Elapsed() time.Duration {
	prev := t.last
	t.last = time.Now()
	return t.last.Sub(prev)
}