fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "sort"
23 "strings"
24 "time"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt"
30 "github.com/sourcegraph/zoekt/internal/tenant"
31 "github.com/sourcegraph/zoekt/query"
32)
33
34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
35// repository names fulfill the predicate, Const(false) if none of them do, and q
36// otherwise.
37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q {
38 count := 0
39 alive := len(d.repoMetaData)
40 for i := range d.repoMetaData {
41 if d.repoMetaData[i].Tombstone {
42 alive--
43 } else if predicate(&d.repoMetaData[i]) {
44 count++
45 }
46 }
47 if count == alive {
48 return &query.Const{Value: true}
49 }
50 if count > 0 {
51 return q
52 }
53 return &query.Const{Value: false}
54}
55
56func (d *indexData) simplify(in query.Q) query.Q {
57 eval := query.Map(in, func(q query.Q) query.Q {
58 switch r := q.(type) {
59 case *query.Repo:
60 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
61 return r.Regexp.MatchString(repo.Name)
62 })
63 case *query.RepoRegexp:
64 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
65 return r.Regexp.MatchString(repo.Name)
66 })
67 case *query.BranchesRepos:
68 for i := range d.repoMetaData {
69 for _, br := range r.List {
70 if br.Repos.Contains(d.repoMetaData[i].ID) {
71 return q
72 }
73 }
74 }
75 return &query.Const{Value: false}
76 case *query.RepoSet:
77 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
78 return r.Set[repo.Name]
79 })
80 case query.RawConfig:
81 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) })
82 case *query.RepoIDs:
83 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
84 return r.Repos.Contains(repo.ID)
85 })
86 case *query.Language:
87 _, has := d.metaData.LanguageMap[r.Language]
88 if !has && d.metaData.IndexFeatureVersion < 12 {
89 // For index files that haven't been re-indexed by go-enry,
90 // fall back to file-based matching and continue even if this
91 // repo doesn't have the specific language present.
92 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
93 if extsForLang != nil {
94 extFrags := make([]string, 0, len(extsForLang))
95 for _, ext := range extsForLang {
96 extFrags = append(extFrags, regexp.QuoteMeta(ext))
97 }
98 if len(extFrags) > 0 {
99 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
100 // inlined copy of query.regexpQuery
101 re, err := syntax.Parse(pattern, syntax.Perl)
102 if err != nil {
103 return &query.Const{Value: false}
104 }
105 if re.Op == syntax.OpLiteral {
106 return &query.Substring{
107 Pattern: string(re.Rune),
108 FileName: true,
109 }
110 }
111 return &query.Regexp{
112 Regexp: re,
113 FileName: true,
114 }
115 }
116 }
117 }
118 if !has {
119 return &query.Const{Value: false}
120 }
121 }
122 return q
123 })
124 return query.Simplify(eval)
125}
126
127func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) {
128 timer := newTimer()
129
130 copyOpts := *opts
131 opts = ©Opts
132 opts.SetDefaults()
133
134 var res zoekt.SearchResult
135 if len(d.fileNameIndex) == 0 {
136 return &res, nil
137 }
138
139 select {
140 case <-ctx.Done():
141 res.Stats.ShardsSkipped++
142 return &res, nil
143 default:
144 }
145
146 q = d.simplify(q)
147 if c, ok := q.(*query.Const); ok && !c.Value {
148 return &res, nil
149 }
150
151 if opts.EstimateDocCount {
152 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
153 return &res, nil
154 }
155
156 q = query.Map(q, query.ExpandFileContent)
157
158 mt, err := d.newMatchTree(q, matchTreeOpt{})
159 if err != nil {
160 return nil, err
161 }
162
163 // Capture the costs of construction before pruning
164 updateMatchTreeStats(mt, &res.Stats)
165
166 mt, err = pruneMatchTree(mt)
167 if err != nil {
168 return nil, err
169 }
170 res.Stats.MatchTreeConstruction = timer.Elapsed()
171 if mt == nil {
172 res.Stats.ShardsSkippedFilter++
173 return &res, nil
174 }
175
176 res.Stats.ShardsScanned++
177
178 cp := &contentProvider{
179 id: d,
180 stats: &res.Stats,
181 }
182
183 // Track the number of documents found in a repository for
184 // ShardRepoMaxMatchCount
185 var (
186 lastRepoID uint16
187 repoMatchCount int
188 )
189
190 docCount := uint32(len(d.fileBranchMasks))
191 lastDoc := int(-1)
192
193nextFileMatch:
194 for {
195 canceled := false
196 select {
197 case <-ctx.Done():
198 canceled = true
199 default:
200 }
201
202 nextDoc := mt.nextDoc()
203 if int(nextDoc) <= lastDoc {
204 nextDoc = uint32(lastDoc + 1)
205 }
206
207 for ; nextDoc < docCount; nextDoc++ {
208 repoID := d.repos[nextDoc]
209 repoMetadata := &d.repoMetaData[repoID]
210
211 // Skip tombstoned repositories
212 if repoMetadata.Tombstone {
213 continue
214 }
215
216 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
217 // necessary to prevent leaking data across tenants.
218 if !tenant.HasAccess(ctx, repoMetadata.TenantID) {
219 continue
220 }
221
222 // Skip documents that are tombstoned
223 if len(repoMetadata.FileTombstones) > 0 {
224 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
225 continue
226 }
227 }
228
229 // Skip documents over ShardRepoMaxMatchCount if specified.
230 if opts.ShardRepoMaxMatchCount > 0 {
231 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
232 res.Stats.FilesSkipped++
233 continue
234 }
235 }
236
237 break
238 }
239
240 if nextDoc >= docCount {
241 break
242 }
243
244 lastDoc = int(nextDoc)
245
246 // We track lastRepoID for ShardRepoMaxMatchCount
247 if lastRepoID != d.repos[nextDoc] {
248 lastRepoID = d.repos[nextDoc]
249 repoMatchCount = 0
250 }
251
252 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
253 res.Stats.FilesSkipped += int(docCount - nextDoc)
254 break
255 }
256
257 res.Stats.FilesConsidered++
258 mt.prepare(nextDoc)
259
260 cp.setDocument(nextDoc)
261
262 known := make(map[matchTree]bool)
263 md := d.repoMetaData[d.repos[nextDoc]]
264
265 for cost := costMin; cost <= costMax; cost++ {
266 switch evalMatchTree(cp, cost, known, mt) {
267 case matchesRequiresHigherCost:
268 if cost == costMax {
269 log.Panicf("did not decide. Repo %s, doc %d, known %v",
270 md.Name, nextDoc, known)
271 }
272 case matchesFound:
273 // could short-circuit now, but we want to run higher costs to
274 // potentially find higher ranked matches.
275 case matchesNone:
276 continue nextFileMatch
277 }
278 }
279
280 fileMatch := zoekt.FileMatch{
281 Repository: md.Name,
282 RepositoryID: md.ID,
283 RepositoryPriority: md.GetPriority(),
284 FileName: string(d.fileName(nextDoc)),
285 Checksum: d.getChecksum(nextDoc),
286 Language: d.languageMap[d.getLanguage(nextDoc)],
287 }
288
289 if s := d.subRepos[nextDoc]; s > 0 {
290 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
291 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
292 }
293 path := d.subRepoPaths[d.repos[nextDoc]][s]
294 fileMatch.SubRepositoryPath = path
295 sr := md.SubRepoMap[path]
296 fileMatch.SubRepositoryName = sr.Name
297 if idx := d.branchIndex(nextDoc); idx >= 0 {
298 fileMatch.Version = sr.Branches[idx].Version
299 }
300 } else {
301 idx := d.branchIndex(nextDoc)
302 if idx >= 0 {
303 fileMatch.Version = md.Branches[idx].Version
304 }
305 }
306
307 // Important invariant for performance: finalCands is sorted by offset and
308 // non-overlapping. gatherMatches respects this invariant and all later
309 // transformations respect this.
310 finalCands := d.gatherMatches(nextDoc, mt, known)
311
312 if opts.ChunkMatches {
313 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
314 } else {
315 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
316 }
317
318 if opts.UseBM25Scoring {
319 d.scoreFileBM25(&fileMatch, nextDoc, finalCands, cp, opts)
320 } else {
321 // Use the standard, non-experimental scoring method by default
322 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
323 }
324
325 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
326 sortMatchesByScore(fileMatch.LineMatches)
327 sortChunkMatchesByScore(fileMatch.ChunkMatches)
328 if opts.Whole {
329 fileMatch.Content = cp.data(false)
330 }
331
332 matchedChunkRanges := 0
333 for _, cm := range fileMatch.ChunkMatches {
334 matchedChunkRanges += len(cm.Ranges)
335 }
336
337 repoMatchCount += len(fileMatch.LineMatches)
338 repoMatchCount += matchedChunkRanges
339
340 res.Files = append(res.Files, fileMatch)
341
342 res.Stats.MatchCount += len(fileMatch.LineMatches)
343 res.Stats.MatchCount += matchedChunkRanges
344 res.Stats.FileCount++
345 }
346
347 for _, md := range d.repoMetaData {
348 r := md
349 addRepo(&res, &r)
350 for _, v := range r.SubRepoMap {
351 addRepo(&res, v)
352 }
353 }
354
355 // Update stats based on work done during document search.
356 updateMatchTreeStats(mt, &res.Stats)
357
358 res.Stats.MatchTreeSearch = timer.Elapsed()
359
360 return &res, nil
361}
362
363func addRepo(res *zoekt.SearchResult, repo *zoekt.Repository) {
364 if res.RepoURLs == nil {
365 res.RepoURLs = map[string]string{}
366 }
367 res.RepoURLs[repo.Name] = repo.FileURLTemplate
368
369 if res.LineFragments == nil {
370 res.LineFragments = map[string]string{}
371 }
372 res.LineFragments[repo.Name] = repo.LineFragmentTemplate
373}
374
375// Gather matches from this document. The matches are returned in document
376// order and are non-overlapping. All filename and content matches are
377// returned, with filename matches first.
378//
379// If `merge` is set, overlapping and adjacent matches will be merged
380// into a single index. Otherwise, overlapping matches will be removed,
381// but adjacent matches will remain.
382func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch {
383 var cands []*candidateMatch
384 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
385 if smt, ok := mt.(*substrMatchTree); ok {
386 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
387 }
388 if rmt, ok := mt.(*regexpMatchTree); ok {
389 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
390 }
391 if rmt, ok := mt.(*wordMatchTree); ok {
392 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
393 }
394 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
395 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
396 }
397 })
398
399 // If we found no candidate matches at all, assume there must have been a match on filename.
400 if len(cands) == 0 {
401 nm := d.fileName(nextDoc)
402 return []*candidateMatch{{
403 caseSensitive: false,
404 fileName: true,
405 substrBytes: nm,
406 substrLowered: nm,
407 file: nextDoc,
408 runeOffset: 0,
409 byteOffset: 0,
410 byteMatchSz: uint32(len(nm)),
411 }}
412 }
413
414 // Remove overlapping candidates. This guarantees that the matches
415 // are non-overlapping, but also preserves expected match counts.
416 sort.Sort((sortByOffsetSlice)(cands))
417 res := cands[:0]
418 for i, c := range cands {
419 if i == 0 {
420 res = append(res, c)
421 continue
422 }
423
424 last := res[len(res)-1]
425
426 // Never compare filename and content matches
427 if last.fileName != c.fileName {
428 res = append(res, c)
429 continue
430 }
431
432 // Only add the match if its range doesn't overlap
433 lastEnd := last.byteOffset + last.byteMatchSz
434 if lastEnd <= c.byteOffset {
435 res = append(res, c)
436 continue
437 }
438 }
439 return res
440}
441
442type sortByOffsetSlice []*candidateMatch
443
444func (m sortByOffsetSlice) Len() int { return len(m) }
445func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
446func (m sortByOffsetSlice) Less(i, j int) bool {
447 // Sort all filename matches to the start
448 if m[i].fileName != m[j].fileName {
449 return m[i].fileName
450 }
451
452 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
453 // Prefer longer candidates if starting at same position
454 return m[i].byteMatchSz > m[j].byteMatchSz
455 }
456 return m[i].byteOffset < m[j].byteOffset
457}
458
459// setScoreWeight is a helper used by gatherMatches to set the weight based on
460// the score weight of the matchTree.
461func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
462 for _, m := range cm {
463 m.scoreWeight = scoreWeight
464 }
465 return cm
466}
467
468func (d *indexData) branchIndex(docID uint32) int {
469 mask := d.fileBranchMasks[docID]
470 idx := 0
471 for mask != 0 {
472 if mask&0x1 != 0 {
473 return idx
474 }
475 idx++
476 mask >>= 1
477 }
478 return -1
479}
480
481// gatherBranches returns a list of branch names taking into account any branch
482// filters in the query. If the query contains a branch filter, it returns all
483// branches containing the docID and matching the branch filter. Otherwise, it
484// returns all branches containing docID.
485func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
486 var mask uint64
487 visitMatchAtoms(mt, known, func(mt matchTree) {
488 bq, ok := mt.(*branchQueryMatchTree)
489 if !ok {
490 return
491 }
492
493 mask = mask | bq.branchMask()
494 })
495
496 if mask == 0 {
497 mask = d.fileBranchMasks[docID]
498 }
499
500 var branches []string
501 id := uint32(1)
502 branchNames := d.branchNames[d.repos[docID]]
503 for mask != 0 {
504 if mask&0x1 != 0 {
505 branches = append(branches, branchNames[uint(id)])
506 }
507 id <<= 1
508 mask >>= 1
509 }
510
511 return branches
512}
513
514func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
515 var include func(rle *zoekt.RepoListEntry) bool
516
517 q = d.simplify(q)
518 if c, ok := q.(*query.Const); ok {
519 if !c.Value {
520 return &zoekt.RepoList{}, nil
521 }
522 include = func(rle *zoekt.RepoListEntry) bool {
523 return true
524 }
525 } else {
526 sr, err := d.Search(ctx, q, &zoekt.SearchOptions{
527 ShardRepoMaxMatchCount: 1,
528 })
529 if err != nil {
530 return nil, err
531 }
532
533 foundRepos := make(map[string]struct{}, len(sr.Files))
534 for _, file := range sr.Files {
535 foundRepos[file.Repository] = struct{}{}
536 }
537
538 include = func(rle *zoekt.RepoListEntry) bool {
539 _, ok := foundRepos[rle.Repository.Name]
540 return ok
541 }
542 }
543
544 var l zoekt.RepoList
545
546 field, err := opts.GetField()
547 if err != nil {
548 return nil, err
549 }
550 switch field {
551 case zoekt.RepoListFieldRepos:
552 l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry))
553 case zoekt.RepoListFieldReposMap:
554 l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry))
555 }
556
557 for i := range d.repoListEntry {
558 if d.repoMetaData[i].Tombstone {
559 continue
560 }
561 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
562 // necessary to prevent leaking data across tenants.
563 if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) {
564 continue
565 }
566 rle := &d.repoListEntry[i]
567 if !include(rle) {
568 continue
569 }
570
571 l.Stats.Add(&rle.Stats)
572
573 // Backwards compat for when ID is missing
574 if rle.Repository.ID == 0 {
575 l.Repos = append(l.Repos, rle)
576 continue
577 }
578
579 switch field {
580 case zoekt.RepoListFieldRepos:
581 l.Repos = append(l.Repos, rle)
582 case zoekt.RepoListFieldReposMap:
583 l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{
584 HasSymbols: rle.Repository.HasSymbols,
585 Branches: rle.Repository.Branches,
586 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
587 }
588 }
589
590 }
591
592 // Only one of these fields is populated and in all cases the size of that
593 // field is the number of Repos in this shard.
594 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
595
596 return &l, nil
597}
598
599// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
600// mt is equivalent to the input r, isEqual = true and the matchTree can be used
601// in place of the regex r. If singleLine = true, then the matchTree and all
602// its children only match terms on the same line. singleLine is used during
603// recursion to decide whether to return an andLineMatchTree (singleLine = true)
604// or a andMatchTree (singleLine = false).
605func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
606 // TODO - we could perhaps transform Begin/EndText in '\n'?
607 // TODO - we could perhaps transform CharClass in (OrQuery )
608 // if there are just a few runes, and part of a OpConcat?
609 switch r.Op {
610 case syntax.OpLiteral:
611 s := string(r.Rune)
612 if len(s) >= minTextSize {
613 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase)
614 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive})
615 return mt, true, !strings.Contains(s, "\n"), err
616 }
617 case syntax.OpCapture:
618 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
619
620 case syntax.OpPlus:
621 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
622
623 case syntax.OpRepeat:
624 if r.Min == 1 {
625 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
626 } else if r.Min > 1 {
627 // (x){2,} can't be expressed precisely by the matchTree
628 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
629 return mt, false, singleLine, err
630 }
631 case syntax.OpConcat, syntax.OpAlternate:
632 var qs []matchTree
633 isEq := true
634 singleLine = true
635 for _, sr := range r.Sub {
636 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
637 if err != nil {
638 return nil, false, false, err
639 }
640 isEq = isEq && subIsEq
641 singleLine = singleLine && subSingleLine
642 qs = append(qs, sq)
643 }
644 }
645 if r.Op == syntax.OpConcat {
646 if len(qs) > 1 {
647 isEq = false
648 }
649 newQs := make([]matchTree, 0, len(qs))
650 for _, q := range qs {
651 if _, ok := q.(*bruteForceMatchTree); ok {
652 continue
653 }
654 newQs = append(newQs, q)
655 }
656 if len(newQs) == 1 {
657 return newQs[0], isEq, singleLine, nil
658 }
659 if len(newQs) == 0 {
660 return &bruteForceMatchTree{}, isEq, singleLine, nil
661 }
662 if singleLine {
663 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
664 }
665 return &andMatchTree{newQs}, isEq, singleLine, nil
666 }
667 for _, q := range qs {
668 if _, ok := q.(*bruteForceMatchTree); ok {
669 return q, isEq, false, nil
670 }
671 }
672 if len(qs) == 0 {
673 return &noMatchTree{Why: "const"}, isEq, false, nil
674 }
675 return &orMatchTree{qs}, isEq, false, nil
676 case syntax.OpStar:
677 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
678 return &bruteForceMatchTree{}, false, true, nil
679 }
680 }
681 return &bruteForceMatchTree{}, false, false, nil
682}
683
684type timer struct {
685 last time.Time
686}
687
688func newTimer() *timer {
689 return &timer{
690 last: time.Now(),
691 }
692}
693
694func (t *timer) Elapsed() time.Duration {
695 now := time.Now()
696 d := now.Sub(t.last)
697 t.last = now
698 return d
699}