fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package index
16
17import (
18 "context"
19 "fmt"
20 "log"
21 "regexp/syntax"
22 "sort"
23 "strings"
24 "time"
25
26 enry_data "github.com/go-enry/go-enry/v2/data"
27 "github.com/grafana/regexp"
28
29 "github.com/sourcegraph/zoekt"
30 "github.com/sourcegraph/zoekt/internal/tenant"
31 "github.com/sourcegraph/zoekt/query"
32)
33
34// simplifyMultiRepo takes a query and a predicate. It returns Const(true) if all
35// repository names fulfill the predicate, Const(false) if none of them do, and q
36// otherwise.
37func (d *indexData) simplifyMultiRepo(q query.Q, predicate func(*zoekt.Repository) bool) query.Q {
38 count := 0
39 alive := len(d.repoMetaData)
40 for i := range d.repoMetaData {
41 if d.repoMetaData[i].Tombstone {
42 alive--
43 } else if predicate(&d.repoMetaData[i]) {
44 count++
45 }
46 }
47 if count == alive {
48 return &query.Const{Value: true}
49 }
50 if count > 0 {
51 return q
52 }
53 return &query.Const{Value: false}
54}
55
56func (d *indexData) simplify(in query.Q) query.Q {
57 eval := query.Map(in, func(q query.Q) query.Q {
58 switch r := q.(type) {
59 case *query.Repo:
60 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
61 return r.Regexp.MatchString(repo.Name)
62 })
63 case *query.RepoRegexp:
64 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
65 return r.Regexp.MatchString(repo.Name)
66 })
67 case *query.BranchesRepos:
68 for i := range d.repoMetaData {
69 for _, br := range r.List {
70 if br.Repos.Contains(d.repoMetaData[i].ID) {
71 return q
72 }
73 }
74 }
75 return &query.Const{Value: false}
76 case *query.RepoSet:
77 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
78 return r.Set[repo.Name]
79 })
80 case query.RawConfig:
81 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool { return uint8(r)&encodeRawConfig(repo.RawConfig) == uint8(r) })
82 case *query.RepoIDs:
83 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
84 return r.Repos.Contains(repo.ID)
85 })
86 case *query.Language:
87 _, has := d.metaData.LanguageMap[r.Language]
88 if !has && d.metaData.IndexFeatureVersion < 12 {
89 // For index files that haven't been re-indexed by go-enry,
90 // fall back to file-based matching and continue even if this
91 // repo doesn't have the specific language present.
92 extsForLang := enry_data.ExtensionsByLanguage[r.Language]
93 if extsForLang != nil {
94 extFrags := make([]string, 0, len(extsForLang))
95 for _, ext := range extsForLang {
96 extFrags = append(extFrags, regexp.QuoteMeta(ext))
97 }
98 if len(extFrags) > 0 {
99 pattern := fmt.Sprintf("(?i)(%s)$", strings.Join(extFrags, "|"))
100 // inlined copy of query.regexpQuery
101 re, err := syntax.Parse(pattern, syntax.Perl)
102 if err != nil {
103 return &query.Const{Value: false}
104 }
105 if re.Op == syntax.OpLiteral {
106 return &query.Substring{
107 Pattern: string(re.Rune),
108 FileName: true,
109 }
110 }
111 return &query.Regexp{
112 Regexp: re,
113 FileName: true,
114 }
115 }
116 }
117 }
118 if !has {
119 return &query.Const{Value: false}
120 }
121 case *query.Meta:
122 return d.simplifyMultiRepo(q, func(repo *zoekt.Repository) bool {
123 if repo.Metadata == nil {
124 return false
125 }
126 v, ok := repo.Metadata[r.Field]
127 if !ok {
128 return false
129 }
130 return r.Value.MatchString(v)
131 })
132 }
133 return q
134 })
135 return query.Simplify(eval)
136}
137
138func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) {
139 timer := newTimer()
140
141 copyOpts := *opts
142 opts = ©Opts
143 opts.SetDefaults()
144
145 var res zoekt.SearchResult
146 if len(d.fileNameIndex) == 0 {
147 return &res, nil
148 }
149
150 select {
151 case <-ctx.Done():
152 res.Stats.ShardsSkipped++
153 return &res, nil
154 default:
155 }
156
157 q = d.simplify(q)
158 if c, ok := q.(*query.Const); ok && !c.Value {
159 return &res, nil
160 }
161
162 if opts.EstimateDocCount {
163 res.Stats.ShardFilesConsidered = len(d.fileBranchMasks)
164 return &res, nil
165 }
166
167 q = query.Map(q, query.ExpandFileContent)
168
169 mt, err := d.newMatchTree(q, matchTreeOpt{})
170 if err != nil {
171 return nil, err
172 }
173
174 // Capture the costs of construction before pruning
175 updateMatchTreeStats(mt, &res.Stats)
176
177 mt, err = pruneMatchTree(mt)
178 if err != nil {
179 return nil, err
180 }
181 res.Stats.MatchTreeConstruction = timer.Elapsed()
182 if mt == nil {
183 res.Stats.ShardsSkippedFilter++
184 return &res, nil
185 }
186
187 res.Stats.ShardsScanned++
188
189 cp := &contentProvider{
190 id: d,
191 stats: &res.Stats,
192 }
193
194 // Track the number of documents found in a repository for
195 // ShardRepoMaxMatchCount
196 var (
197 lastRepoID uint16
198 repoMatchCount int
199 )
200
201 docCount := uint32(len(d.fileBranchMasks))
202 lastDoc := int(-1)
203
204nextFileMatch:
205 for {
206 canceled := false
207 select {
208 case <-ctx.Done():
209 canceled = true
210 default:
211 }
212
213 nextDoc := mt.nextDoc()
214 if int(nextDoc) <= lastDoc {
215 nextDoc = uint32(lastDoc + 1)
216 }
217
218 for ; nextDoc < docCount; nextDoc++ {
219 repoID := d.repos[nextDoc]
220 repoMetadata := &d.repoMetaData[repoID]
221
222 // Skip tombstoned repositories
223 if repoMetadata.Tombstone {
224 continue
225 }
226
227 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
228 // necessary to prevent leaking data across tenants.
229 if !tenant.HasAccess(ctx, repoMetadata.TenantID) {
230 continue
231 }
232
233 // Skip documents that are tombstoned
234 if len(repoMetadata.FileTombstones) > 0 {
235 if _, tombstoned := repoMetadata.FileTombstones[string(d.fileName(nextDoc))]; tombstoned {
236 continue
237 }
238 }
239
240 // Skip documents over ShardRepoMaxMatchCount if specified.
241 if opts.ShardRepoMaxMatchCount > 0 {
242 if repoMatchCount >= opts.ShardRepoMaxMatchCount && repoID == lastRepoID {
243 res.Stats.FilesSkipped++
244 continue
245 }
246 }
247
248 break
249 }
250
251 if nextDoc >= docCount {
252 break
253 }
254
255 lastDoc = int(nextDoc)
256
257 // We track lastRepoID for ShardRepoMaxMatchCount
258 if lastRepoID != d.repos[nextDoc] {
259 lastRepoID = d.repos[nextDoc]
260 repoMatchCount = 0
261 }
262
263 if canceled || (res.Stats.MatchCount >= opts.ShardMaxMatchCount && opts.ShardMaxMatchCount > 0) {
264 res.Stats.FilesSkipped += int(docCount - nextDoc)
265 break
266 }
267
268 res.Stats.FilesConsidered++
269 mt.prepare(nextDoc)
270
271 cp.setDocument(nextDoc)
272
273 known := make(map[matchTree]bool)
274 md := d.repoMetaData[d.repos[nextDoc]]
275
276 for cost := costMin; cost <= costMax; cost++ {
277 switch evalMatchTree(cp, cost, known, mt) {
278 case matchesRequiresHigherCost:
279 if cost == costMax {
280 log.Panicf("did not decide. Repo %s, doc %d, known %v",
281 md.Name, nextDoc, known)
282 }
283 case matchesFound:
284 // could short-circuit now, but we want to run higher costs to
285 // potentially find higher ranked matches.
286 case matchesNone:
287 continue nextFileMatch
288 }
289 }
290
291 fileMatch := zoekt.FileMatch{
292 Repository: md.URL,
293 RepositoryID: md.ID,
294 RepositoryPriority: md.GetPriority(),
295 FileName: string(d.fileName(nextDoc)),
296 Checksum: d.getChecksum(nextDoc),
297 Language: d.languageMap[d.getLanguage(nextDoc)],
298 }
299
300 if s := d.subRepos[nextDoc]; s > 0 {
301 if s >= uint32(len(d.subRepoPaths[d.repos[nextDoc]])) {
302 log.Panicf("corrupt index: subrepo %d beyond %v", s, d.subRepoPaths)
303 }
304 path := d.subRepoPaths[d.repos[nextDoc]][s]
305 fileMatch.SubRepositoryPath = path
306 sr := md.SubRepoMap[path]
307 fileMatch.SubRepositoryName = sr.Name
308 if idx := d.branchIndex(nextDoc); idx >= 0 {
309 fileMatch.Version = sr.Branches[idx].Version
310 }
311 } else {
312 idx := d.branchIndex(nextDoc)
313 if idx >= 0 {
314 fileMatch.Version = md.Branches[idx].Version
315 }
316 }
317
318 // Important invariant for performance: finalCands is sorted by offset and
319 // non-overlapping. gatherMatches respects this invariant and all later
320 // transformations respect this.
321 finalCands := d.gatherMatches(nextDoc, mt, known)
322
323 if opts.ChunkMatches {
324 fileMatch.ChunkMatches = cp.fillChunkMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
325 } else {
326 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
327 }
328
329 if opts.UseBM25Scoring {
330 d.scoreFileBM25(&fileMatch, nextDoc, finalCands, cp, opts)
331 } else {
332 // Use the standard, non-experimental scoring method by default
333 d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
334 }
335
336 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known)
337 sortMatchesByScore(fileMatch.LineMatches)
338 sortChunkMatchesByScore(fileMatch.ChunkMatches)
339 if opts.Whole {
340 fileMatch.Content = cp.data(false)
341 }
342
343 matchedChunkRanges := 0
344 for _, cm := range fileMatch.ChunkMatches {
345 matchedChunkRanges += len(cm.Ranges)
346 }
347
348 repoMatchCount += len(fileMatch.LineMatches)
349 repoMatchCount += matchedChunkRanges
350
351 res.Files = append(res.Files, fileMatch)
352
353 res.Stats.MatchCount += len(fileMatch.LineMatches)
354 res.Stats.MatchCount += matchedChunkRanges
355 res.Stats.FileCount++
356 }
357
358 for _, md := range d.repoMetaData {
359 r := md
360 // Key by URL: it is the unique repo identity and matches
361 // FileMatch.Repository (also set to the URL). Subrepos are keyed by
362 // their Name, matching FileMatch.SubRepositoryName.
363 addRepo(&res, r.URL, &r)
364 for _, v := range r.SubRepoMap {
365 addRepo(&res, v.Name, v)
366 }
367 }
368
369 // Update stats based on work done during document search.
370 updateMatchTreeStats(mt, &res.Stats)
371
372 res.Stats.MatchTreeSearch = timer.Elapsed()
373
374 return &res, nil
375}
376
377func addRepo(res *zoekt.SearchResult, key string, repo *zoekt.Repository) {
378 if res.RepoURLs == nil {
379 res.RepoURLs = map[string]string{}
380 }
381 res.RepoURLs[key] = repo.FileURLTemplate
382
383 if res.LineFragments == nil {
384 res.LineFragments = map[string]string{}
385 }
386 res.LineFragments[key] = repo.LineFragmentTemplate
387}
388
389// Gather matches from this document. The matches are returned in document
390// order and are non-overlapping. All filename and content matches are
391// returned, with filename matches first.
392//
393// If `merge` is set, overlapping and adjacent matches will be merged
394// into a single index. Otherwise, overlapping matches will be removed,
395// but adjacent matches will remain.
396func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchTree]bool) []*candidateMatch {
397 var cands []*candidateMatch
398 visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
399 if smt, ok := mt.(*substrMatchTree); ok {
400 cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
401 }
402 if rmt, ok := mt.(*regexpMatchTree); ok {
403 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
404 }
405 if rmt, ok := mt.(*wordMatchTree); ok {
406 cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
407 }
408 if smt, ok := mt.(*symbolRegexpMatchTree); ok {
409 cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
410 }
411 })
412
413 // If we found no candidate matches at all, assume there must have been a match on filename.
414 if len(cands) == 0 {
415 nm := d.fileName(nextDoc)
416 return []*candidateMatch{{
417 caseSensitive: false,
418 fileName: true,
419 substrBytes: nm,
420 substrLowered: nm,
421 file: nextDoc,
422 runeOffset: 0,
423 byteOffset: 0,
424 byteMatchSz: uint32(len(nm)),
425 }}
426 }
427
428 // Remove overlapping candidates. This guarantees that the matches
429 // are non-overlapping, but also preserves expected match counts.
430 sort.Sort((sortByOffsetSlice)(cands))
431 res := cands[:0]
432 for i, c := range cands {
433 if i == 0 {
434 res = append(res, c)
435 continue
436 }
437
438 last := res[len(res)-1]
439
440 // Never compare filename and content matches
441 if last.fileName != c.fileName {
442 res = append(res, c)
443 continue
444 }
445
446 // Only add the match if its range doesn't overlap
447 lastEnd := last.byteOffset + last.byteMatchSz
448 if lastEnd <= c.byteOffset {
449 res = append(res, c)
450 continue
451 }
452 }
453 return res
454}
455
456type sortByOffsetSlice []*candidateMatch
457
458func (m sortByOffsetSlice) Len() int { return len(m) }
459func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
460func (m sortByOffsetSlice) Less(i, j int) bool {
461 // Sort all filename matches to the start
462 if m[i].fileName != m[j].fileName {
463 return m[i].fileName
464 }
465
466 if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
467 // Prefer longer candidates if starting at same position
468 return m[i].byteMatchSz > m[j].byteMatchSz
469 }
470 return m[i].byteOffset < m[j].byteOffset
471}
472
473// setScoreWeight is a helper used by gatherMatches to set the weight based on
474// the score weight of the matchTree.
475func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
476 for _, m := range cm {
477 m.scoreWeight = scoreWeight
478 }
479 return cm
480}
481
482func (d *indexData) branchIndex(docID uint32) int {
483 mask := d.fileBranchMasks[docID]
484 idx := 0
485 for mask != 0 {
486 if mask&0x1 != 0 {
487 return idx
488 }
489 idx++
490 mask >>= 1
491 }
492 return -1
493}
494
495// gatherBranches returns a list of branch names taking into account any branch
496// filters in the query. If the query contains a branch filter, it returns all
497// branches containing the docID and matching the branch filter. Otherwise, it
498// returns all branches containing docID.
499func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
500 var mask uint64
501 visitMatchAtoms(mt, known, func(mt matchTree) {
502 bq, ok := mt.(*branchQueryMatchTree)
503 if !ok {
504 return
505 }
506
507 mask = mask | bq.branchMask()
508 })
509
510 if mask == 0 {
511 mask = d.fileBranchMasks[docID]
512 }
513
514 var branches []string
515 id := uint64(1)
516 branchNames := d.branchNames[d.repos[docID]]
517 for mask != 0 {
518 if mask&0x1 != 0 {
519 branches = append(branches, branchNames[uint(id)])
520 }
521 id <<= 1
522 mask >>= 1
523 }
524
525 return branches
526}
527
528func (d *indexData) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
529 var include func(rle *zoekt.RepoListEntry) bool
530
531 q = d.simplify(q)
532 if c, ok := q.(*query.Const); ok {
533 if !c.Value {
534 return &zoekt.RepoList{}, nil
535 }
536 include = func(rle *zoekt.RepoListEntry) bool {
537 return true
538 }
539 } else {
540 sr, err := d.Search(ctx, q, &zoekt.SearchOptions{
541 ShardRepoMaxMatchCount: 1,
542 })
543 if err != nil {
544 return nil, err
545 }
546
547 foundRepos := make(map[string]struct{}, len(sr.Files))
548 for _, file := range sr.Files {
549 foundRepos[file.Repository] = struct{}{}
550 }
551
552 include = func(rle *zoekt.RepoListEntry) bool {
553 _, ok := foundRepos[rle.Repository.URL]
554 return ok
555 }
556 }
557
558 var l zoekt.RepoList
559
560 field, err := opts.GetField()
561 if err != nil {
562 return nil, err
563 }
564 switch field {
565 case zoekt.RepoListFieldRepos:
566 l.Repos = make([]*zoekt.RepoListEntry, 0, len(d.repoListEntry))
567 case zoekt.RepoListFieldReposMap:
568 l.ReposMap = make(zoekt.ReposMap, len(d.repoListEntry))
569 }
570
571 for i := range d.repoListEntry {
572 if d.repoMetaData[i].Tombstone {
573 continue
574 }
575 // 🚨 SECURITY: Skip documents that don't belong to the tenant. This check is
576 // necessary to prevent leaking data across tenants.
577 if !tenant.HasAccess(ctx, d.repoMetaData[i].TenantID) {
578 continue
579 }
580 rle := &d.repoListEntry[i]
581 if !include(rle) {
582 continue
583 }
584
585 l.Stats.Add(&rle.Stats)
586
587 // Backwards compat for when ID is missing
588 if rle.Repository.ID == 0 {
589 l.Repos = append(l.Repos, rle)
590 continue
591 }
592
593 switch field {
594 case zoekt.RepoListFieldRepos:
595 l.Repos = append(l.Repos, rle)
596 case zoekt.RepoListFieldReposMap:
597 l.ReposMap[rle.Repository.ID] = zoekt.MinimalRepoListEntry{
598 HasSymbols: rle.Repository.HasSymbols,
599 Branches: rle.Repository.Branches,
600 IndexTimeUnix: rle.IndexMetadata.IndexTime.Unix(),
601 }
602 }
603
604 }
605
606 // Only one of these fields is populated and in all cases the size of that
607 // field is the number of Repos in this shard.
608 l.Stats.Repos = len(l.Repos) + len(l.ReposMap)
609
610 return &l, nil
611}
612
613// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
614// mt is equivalent to the input r, isEqual = true and the matchTree can be used
615// in place of the regex r. If singleLine = true, then the matchTree and all
616// its children only match terms on the same line. singleLine is used during
617// recursion to decide whether to return an andLineMatchTree (singleLine = true)
618// or a andMatchTree (singleLine = false).
619func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
620 // TODO - we could perhaps transform Begin/EndText in '\n'?
621 // TODO - we could perhaps transform CharClass in (OrQuery )
622 // if there are just a few runes, and part of a OpConcat?
623 switch r.Op {
624 case syntax.OpLiteral:
625 s := string(r.Rune)
626 if len(s) >= minTextSize {
627 ignoreCase := syntax.FoldCase == (r.Flags & syntax.FoldCase)
628 mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: !ignoreCase && caseSensitive})
629 return mt, true, !strings.Contains(s, "\n"), err
630 }
631 case syntax.OpCapture:
632 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
633
634 case syntax.OpPlus:
635 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
636
637 case syntax.OpRepeat:
638 if r.Min == 1 {
639 return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
640 } else if r.Min > 1 {
641 // (x){2,} can't be expressed precisely by the matchTree
642 mt, _, singleLine, err := d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
643 return mt, false, singleLine, err
644 }
645 case syntax.OpConcat, syntax.OpAlternate:
646 var qs []matchTree
647 isEq := true
648 singleLine = true
649 for _, sr := range r.Sub {
650 if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
651 if err != nil {
652 return nil, false, false, err
653 }
654 isEq = isEq && subIsEq
655 singleLine = singleLine && subSingleLine
656 qs = append(qs, sq)
657 }
658 }
659 if r.Op == syntax.OpConcat {
660 if len(qs) > 1 {
661 isEq = false
662 }
663 newQs := make([]matchTree, 0, len(qs))
664 for _, q := range qs {
665 if _, ok := q.(*bruteForceMatchTree); ok {
666 continue
667 }
668 newQs = append(newQs, q)
669 }
670 if len(newQs) == 1 {
671 return newQs[0], isEq, singleLine, nil
672 }
673 if len(newQs) == 0 {
674 return &bruteForceMatchTree{}, isEq, singleLine, nil
675 }
676 if singleLine {
677 return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
678 }
679 return &andMatchTree{newQs}, isEq, singleLine, nil
680 }
681 for _, q := range qs {
682 if _, ok := q.(*bruteForceMatchTree); ok {
683 return q, isEq, false, nil
684 }
685 }
686 if len(qs) == 0 {
687 return &noMatchTree{Why: "const"}, isEq, false, nil
688 }
689 return &orMatchTree{qs}, isEq, false, nil
690 case syntax.OpStar:
691 if r.Sub[0].Op == syntax.OpAnyCharNotNL {
692 return &bruteForceMatchTree{}, false, true, nil
693 }
694 }
695 return &bruteForceMatchTree{}, false, false, nil
696}
697
698type timer struct {
699 last time.Time
700}
701
702func newTimer() *timer {
703 return &timer{
704 last: time.Now(),
705 }
706}
707
708func (t *timer) Elapsed() time.Duration {
709 now := time.Now()
710 d := now.Sub(t.last)
711 t.last = now
712 return d
713}