fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "sort"
23 "strings"
24 "time"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt"
30 "github.com/sourcegraph/zoekt/internal/tenant"
31 "github.com/sourcegraph/zoekt/query"
32)
33
34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
35// repository names fulfill the predicate, Const(false) if none of them do, and q
36// otherwise.
37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q {
38 count := 0
39 alive := len(d.repoMetaData)
40 for i := range d.repoMetaData {
41 if d.repoMetaData[i].Tombstone {
42 alive--
43 } else if predicate(&d.repoMetaData[i]) {
44 count++
45 }
46 }
47 if count == alive {
48 return &query.Const{Value: true}
49 }
50 if count > 0 {
51 return q
52 }
53 return &query.Const{Value: false}
54}
55
56func (d *indexData) simplify(in query.Q) query.Q {
57 eval := query.Map(in, func(q query.Q) query.Q {
58 switch r := q.(type) {
59 case *query.Repo:
60 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
61 return r.Regexp.MatchString(repo.Name)
62 })
63 case *query.RepoRegexp:
64 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
65 return r.Regexp.MatchString(repo.Name)
66 })
67 case *query.BranchesRepos:
68 for i := range d.repoMetaData {
69 for _, br := range r.List {
70 if br.Repos.Contains(d.repoMetaData[i].ID) {
71 return q
72 }
73 }
74 }
75 return &query.Const{Value: false}
76 case *query.RepoSet:
77 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
78 return r.Set[repo.Name]
79 })
80 case query.RawConfig:
81 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) })
82 case *query.RepoIDs:
83 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
84 return r.Repos.Contains(repo.ID)
85 })
86 case *query.Language:
87 _, has := d.metaData.LanguageMap[r.Language]
88 if !has && d.metaData.IndexFeatureVersion < 12 {
89 // For index files that haven't been re-indexed by go-enry,
90 // fall back to file-based matching and continue even if this
91 // repo doesn't have the specific language present.
92 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
93 if extsForLang != nil {
94 extFrags := make([]string, 0, len(extsForLang))
95 for _, ext := range extsForLang {
96 extFrags = append(extFrags, regexp.QuoteMeta(ext))
97 }
98 if len(extFrags) > 0 {
99 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
100 // inlined copy of query.regexpQuery
101 re, err := syntax.Parse(pattern, syntax.Perl)
102 if err != nil {
103 return &query.Const{Value: false}
104 }
105 if re.Op == syntax.OpLiteral {
106 return &query.Substring{
107 Pattern: string(re.Rune),
108 FileName: true,
109 }
110 }
111 return &query.Regexp{
112 Regexp: re,
113 FileName: true,
114 }
115 }
116 }
117 }
118 if !has {
119 return &query.Const{Value: false}
120 }
121 }
122 return q
123 })
124 return query.Simplify(eval)
125}
126
127func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) {
128 timer := newTimer()
129
130 copyOpts := *opts
131 opts = ©Opts
132 opts.SetDefaults()
133
134 var res zoekt.SearchResult
135 if len(d.fileNameIndex) == 0 {
136 return &res, nil
137 }
138
139 select {
140 case <-ctx.Done():
141 res.Stats.ShardsSkipped++
142 return &res, nil
143 default:
144 }
145
146 q = d.simplify(q)
147 if c, ok := q.(*query.Const); ok && !c.Value {
148 return &res, nil
149 }
150
151 if opts.EstimateDocCount {
152 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
153 return &res, nil
154 }
155
156 q = query.Map(q, query.ExpandFileContent)
157
158 mt, err := d.newMatchTree(q, matchTreeOpt{})
159 if err != nil {
160 return nil, err
161 }
162
163 // Capture the costs of construction before pruning
164 updateMatchTreeStats(mt, &res.Stats)
165
166 mt, err = pruneMatchTree(mt)
167 if err != nil {
168 return nil, err
169 }
170 res.Stats.MatchTreeConstruction = timer.Elapsed()
171 if mt == nil {
172 res.Stats.ShardsSkippedFilter++
173 return &res, nil
174 }
175
176 res.Stats.ShardsScanned++
177
178 cp := &contentProvider{
179 id: d,
180 stats: &res.Stats,
181 }
182
183 // Track the number of documents found in a repository for
184 // ShardRepoMaxMatchCount
185 var (
186 lastRepoID uint16
187 repoMatchCount int
188 )
189
190 docCount := uint32(len(d.fileBranchMasks))
191 lastDoc := int(-1)
192
193nextFileMatch:
194 for {
195 canceled := false
196 select {
197 case <-ctx.Done():
198 canceled = true
199 default:
200 }
201
202 nextDoc := mt.nextDoc()
203 if int(nextDoc) <= lastDoc {
204 nextDoc = uint32(lastDoc + 1)
205 }
206
207 for ; nextDoc < docCount; nextDoc++ {
208 repoID := d.repos[nextDoc]
209 repoMetadata := &d.repoMetaData[repoID]
210
211 // Skip tombstoned repositories
212 if repoMetadata.Tombstone {
213 continue
214 }
215
216 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
217 // necessary to prevent leaking data across tenants.
218 if !tenant.HasAccess(ctx, repoMetadata.TenantID) {
219 continue
220 }
221
222 // Skip documents that are tombstoned
223 if len(repoMetadata.FileTombstones) > 0 {
224 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
225 continue
226 }
227 }
228
229 // Skip documents over ShardRepoMaxMatchCount if specified.
230 if opts.ShardRepoMaxMatchCount > 0 {
231 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
232 res.Stats.FilesSkipped++
233 continue
234 }
235 }
236
237 break
238 }
239
240 if nextDoc >= docCount {
241 break
242 }
243
244 lastDoc = int(nextDoc)
245
246 // We track lastRepoID for ShardRepoMaxMatchCount
247 if lastRepoID != d.repos[nextDoc] {
248 lastRepoID = d.repos[nextDoc]
249 repoMatchCount = 0
250 }
251
252 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
253 res.Stats.FilesSkipped += int(docCount - nextDoc)
254 break
255 }
256
257 res.Stats.FilesConsidered++
258 mt.prepare(nextDoc)
259
260 cp.setDocument(nextDoc)
261
262 known := make(map[matchTree]bool)
263 md := d.repoMetaData[d.repos[nextDoc]]
264
265 for cost := costMin; cost <= costMax; cost++ {
266 switch evalMatchTree(cp, cost, known, mt) {
267 case matchesRequiresHigherCost:
268 if cost == costMax {
269 log.Panicf("did not decide. Repo %s, doc %d, known %v",
270 md.Name, nextDoc, known)
271 }
272 case matchesFound:
273 // could short-circuit now, but we want to run higher costs to
274 // potentially find higher ranked matches.
275 case matchesNone:
276 continue nextFileMatch
277 }
278 }
279
280 fileMatch := zoekt.FileMatch{
281 Repository: md.Name,
282 RepositoryID: md.ID,
283 RepositoryPriority: md.GetPriority(),
284 FileName: string(d.fileName(nextDoc)),
285 Checksum: d.getChecksum(nextDoc),
286 Language: d.languageMap[d.getLanguage(nextDoc)],
287 }
288
289 if s := d.subRepos[nextDoc]; s > 0 {
290 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
291 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
292 }
293 path := d.subRepoPaths[d.repos[nextDoc]][s]
294 fileMatch.SubRepositoryPath = path
295 sr := md.SubRepoMap[path]
296 fileMatch.SubRepositoryName = sr.Name
297 if idx := d.branchIndex(nextDoc); idx >= 0 {
298 fileMatch.Version = sr.Branches[idx].Version
299 }
300 } else {
301 idx := d.branchIndex(nextDoc)
302 if idx >= 0 {
303 fileMatch.Version = md.Branches[idx].Version
304 }
305 }
306
307 // Important invariant for performance: finalCands is sorted by offset and
308 // non-overlapping. gatherMatches respects this invariant and all later
309 // transformations respect this.
310 finalCands := d.gatherMatches(nextDoc, mt, known)
311
312 if opts.ChunkMatches {
313 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
314 } else {
315 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
316 }
317
318 if opts.UseBM25Scoring {
319 tf := cp.calculateTermFrequency(finalCands)
320 d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
321 } else {
322 // Use the standard, non-experimental scoring method by default
323 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
324 }
325
326 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
327 sortMatchesByScore(fileMatch.LineMatches)
328 sortChunkMatchesByScore(fileMatch.ChunkMatches)
329 if opts.Whole {
330 fileMatch.Content = cp.data(false)
331 }
332
333 matchedChunkRanges := 0
334 for _, cm := range fileMatch.ChunkMatches {
335 matchedChunkRanges += len(cm.Ranges)
336 }
337
338 repoMatchCount += len(fileMatch.LineMatches)
339 repoMatchCount += matchedChunkRanges
340
341 res.Files = append(res.Files, fileMatch)
342
343 res.Stats.MatchCount += len(fileMatch.LineMatches)
344 res.Stats.MatchCount += matchedChunkRanges
345 res.Stats.FileCount++
346 }
347
348 for _, md := range d.repoMetaData {
349 r := md
350 addRepo(&res, &r)
351 for _, v := range r.SubRepoMap {
352 addRepo(&res, v)
353 }
354 }
355
356 // Update stats based on work done during document search.
357 updateMatchTreeStats(mt, &res.Stats)
358
359 res.Stats.MatchTreeSearch = timer.Elapsed()
360
361 return &res, nil
362}
363
364func addRepo(res *zoekt.SearchResult, repo *zoekt.Repository) {
365 if res.RepoURLs == nil {
366 res.RepoURLs = map[string]string{}
367 }
368 res.RepoURLs[repo.Name] = repo.FileURLTemplate
369
370 if res.LineFragments == nil {
371 res.LineFragments = map[string]string{}
372 }
373 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
374}
375
376// Gather matches from this document. The matches are returned in document
377// order and are non-overlapping. All filename and content matches are
378// returned, with filename matches first.
379//
380// If `merge` is set, overlapping and adjacent matches will be merged
381// into a single index. Otherwise, overlapping matches will be removed,
382// but adjacent matches will remain.
383func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch {
384 var cands []*candidateMatch
385 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
386 if smt, ok := mt.(*substrMatchTree); ok {
387 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
388 }
389 if rmt, ok := mt.(*regexpMatchTree); ok {
390 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
391 }
392 if rmt, ok := mt.(*wordMatchTree); ok {
393 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
394 }
395 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
396 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
397 }
398 })
399
400 // If we found no candidate matches at all, assume there must have been a match on filename.
401 if len(cands) == 0 {
402 nm := d.fileName(nextDoc)
403 return []*candidateMatch{{
404 caseSensitive: false,
405 fileName: true,
406 substrBytes: nm,
407 substrLowered: nm,
408 file: nextDoc,
409 runeOffset: 0,
410 byteOffset: 0,
411 byteMatchSz: uint32(len(nm)),
412 }}
413 }
414
415 // Remove overlapping candidates. This guarantees that the matches
416 // are non-overlapping, but also preserves expected match counts.
417 sort.Sort((sortByOffsetSlice)(cands))
418 res := cands[:0]
419 for i, c := range cands {
420 if i == 0 {
421 res = append(res, c)
422 continue
423 }
424
425 last := res[len(res)-1]
426
427 // Never compare filename and content matches
428 if last.fileName != c.fileName {
429 res = append(res, c)
430 continue
431 }
432
433 // Only add the match if its range doesn't overlap
434 lastEnd := last.byteOffset + last.byteMatchSz
435 if lastEnd <= c.byteOffset {
436 res = append(res, c)
437 continue
438 }
439 }
440 return res
441}
442
443type sortByOffsetSlice []*candidateMatch
444
445func (m sortByOffsetSlice) Len() int { return len(m) }
446func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
447func (m sortByOffsetSlice) Less(i, j int) bool {
448 // Sort all filename matches to the start
449 if m[i].fileName != m[j].fileName {
450 return m[i].fileName
451 }
452
453 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
454 // Prefer longer candidates if starting at same position
455 return m[i].byteMatchSz > m[j].byteMatchSz
456 }
457 return m[i].byteOffset < m[j].byteOffset
458}
459
460// setScoreWeight is a helper used by gatherMatches to set the weight based on
461// the score weight of the matchTree.
462func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
463 for _, m := range cm {
464 m.scoreWeight = scoreWeight
465 }
466 return cm
467}
468
469func (d *indexData) branchIndex(docID uint32) int {
470 mask := d.fileBranchMasks[docID]
471 idx := 0
472 for mask != 0 {
473 if mask&0x1 != 0 {
474 return idx
475 }
476 idx++
477 mask >>= 1
478 }
479 return -1
480}
481
482// gatherBranches returns a list of branch names taking into account any branch
483// filters in the query. If the query contains a branch filter, it returns all
484// branches containing the docID and matching the branch filter. Otherwise, it
485// returns all branches containing docID.
486func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
487 var mask uint64
488 visitMatchAtoms(mt, known, func(mt matchTree) {
489 bq, ok := mt.(*branchQueryMatchTree)
490 if !ok {
491 return
492 }
493
494 mask = mask | bq.branchMask()
495 })
496
497 if mask == 0 {
498 mask = d.fileBranchMasks[docID]
499 }
500
501 var branches []string
502 id := uint32(1)
503 branchNames := d.branchNames[d.repos[docID]]
504 for mask != 0 {
505 if mask&0x1 != 0 {
506 branches = append(branches, branchNames[uint(id)])
507 }
508 id <<= 1
509 mask >>= 1
510 }
511
512 return branches
513}
514
515func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
516 var include func(rle *zoekt.RepoListEntry) bool
517
518 q = d.simplify(q)
519 if c, ok := q.(*query.Const); ok {
520 if !c.Value {
521 return &zoekt.RepoList{}, nil
522 }
523 include = func(rle *zoekt.RepoListEntry) bool {
524 return true
525 }
526 } else {
527 sr, err := d.Search(ctx, q, &zoekt.SearchOptions{
528 ShardRepoMaxMatchCount: 1,
529 })
530 if err != nil {
531 return nil, err
532 }
533
534 foundRepos := make(map[string]struct{}, len(sr.Files))
535 for _, file := range sr.Files {
536 foundRepos[file.Repository] = struct{}{}
537 }
538
539 include = func(rle *zoekt.RepoListEntry) bool {
540 _, ok := foundRepos[rle.Repository.Name]
541 return ok
542 }
543 }
544
545 var l zoekt.RepoList
546
547 field, err := opts.GetField()
548 if err != nil {
549 return nil, err
550 }
551 switch field {
552 case zoekt.RepoListFieldRepos:
553 l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry))
554 case zoekt.RepoListFieldReposMap:
555 l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry))
556 }
557
558 for i := range d.repoListEntry {
559 if d.repoMetaData[i].Tombstone {
560 continue
561 }
562 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
563 // necessary to prevent leaking data across tenants.
564 if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) {
565 continue
566 }
567 rle := &d.repoListEntry[i]
568 if !include(rle) {
569 continue
570 }
571
572 l.Stats.Add(&rle.Stats)
573
574 // Backwards compat for when ID is missing
575 if rle.Repository.ID == 0 {
576 l.Repos = append(l.Repos, rle)
577 continue
578 }
579
580 switch field {
581 case zoekt.RepoListFieldRepos:
582 l.Repos = append(l.Repos, rle)
583 case zoekt.RepoListFieldReposMap:
584 l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{
585 HasSymbols: rle.Repository.HasSymbols,
586 Branches: rle.Repository.Branches,
587 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
588 }
589 }
590
591 }
592
593 // Only one of these fields is populated and in all cases the size of that
594 // field is the number of Repos in this shard.
595 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
596
597 return &l, nil
598}
599
600// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
601// mt is equivalent to the input r, isEqual = true and the matchTree can be used
602// in place of the regex r. If singleLine = true, then the matchTree and all
603// its children only match terms on the same line. singleLine is used during
604// recursion to decide whether to return an andLineMatchTree (singleLine = true)
605// or a andMatchTree (singleLine = false).
606func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
607 // TODO - we could perhaps transform Begin/EndText in '\n'?
608 // TODO - we could perhaps transform CharClass in (OrQuery )
609 // if there are just a few runes, and part of a OpConcat?
610 switch r.Op {
611 case syntax.OpLiteral:
612 s := string(r.Rune)
613 if len(s) >= minTextSize {
614 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase)
615 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive})
616 return mt, true, !strings.Contains(s, "\n"), err
617 }
618 case syntax.OpCapture:
619 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
620
621 case syntax.OpPlus:
622 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
623
624 case syntax.OpRepeat:
625 if r.Min == 1 {
626 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
627 } else if r.Min > 1 {
628 // (x){2,} can't be expressed precisely by the matchTree
629 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
630 return mt, false, singleLine, err
631 }
632 case syntax.OpConcat, syntax.OpAlternate:
633 var qs []matchTree
634 isEq := true
635 singleLine = true
636 for _, sr := range r.Sub {
637 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
638 if err != nil {
639 return nil, false, false, err
640 }
641 isEq = isEq && subIsEq
642 singleLine = singleLine && subSingleLine
643 qs = append(qs, sq)
644 }
645 }
646 if r.Op == syntax.OpConcat {
647 if len(qs) > 1 {
648 isEq = false
649 }
650 newQs := make([]matchTree, 0, len(qs))
651 for _, q := range qs {
652 if _, ok := q.(*bruteForceMatchTree); ok {
653 continue
654 }
655 newQs = append(newQs, q)
656 }
657 if len(newQs) == 1 {
658 return newQs[0], isEq, singleLine, nil
659 }
660 if len(newQs) == 0 {
661 return &bruteForceMatchTree{}, isEq, singleLine, nil
662 }
663 if singleLine {
664 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
665 }
666 return &andMatchTree{newQs}, isEq, singleLine, nil
667 }
668 for _, q := range qs {
669 if _, ok := q.(*bruteForceMatchTree); ok {
670 return q, isEq, false, nil
671 }
672 }
673 if len(qs) == 0 {
674 return &noMatchTree{Why: "const"}, isEq, false, nil
675 }
676 return &orMatchTree{qs}, isEq, false, nil
677 case syntax.OpStar:
678 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
679 return &bruteForceMatchTree{}, false, true, nil
680 }
681 }
682 return &bruteForceMatchTree{}, false, false, nil
683}
684
685type timer struct {
686 last time.Time
687}
688
689func newTimer() *timer {
690 return &timer{
691 last: time.Now(),
692 }
693}
694
695func (t *timer) Elapsed() time.Duration {
696 now := time.Now()
697 d := now.Sub(t.last)
698 t.last = now
699 return d
700}