fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
17import (
18 "context"
19 "encoding/json"
20 "errors"
21 "fmt"
22 "math"
23 "reflect"
24 "strconv"
25 "strings"
26 "time"
27
28 "github.com/sourcegraph/zoekt/query"
29)
30
31const (
32 mapHeaderBytes uint64 = 48
33 sliceHeaderBytes uint64 = 24
34 stringHeaderBytes uint64 = 16
35 pointerSize uint64 = 8
36 interfaceBytes uint64 = 16
37)
38
39// FileMatch contains all the matches within a file.
40type FileMatch struct {
41 FileName string
42
43 // Repository is the globally unique name of the repo of the
44 // match
45 Repository string
46
47 // SubRepositoryName is the globally unique name of the repo,
48 // if it came from a subrepository
49 SubRepositoryName string `json:",omitempty"`
50
51 // SubRepositoryPath holds the prefix where the subrepository
52 // was mounted.
53 SubRepositoryPath string `json:",omitempty"`
54
55 // Commit SHA1 (hex) of the (sub)repo holding the file.
56 Version string `json:",omitempty"`
57
58 // Detected language of the result.
59 Language string
60
61 // For debugging. Needs DebugScore set, but public so tests in
62 // other packages can print some diagnostics.
63 Debug string `json:",omitempty"`
64
65 Branches []string `json:",omitempty"`
66
67 // One of LineMatches or ChunkMatches will be returned depending on whether
68 // the SearchOptions.ChunkMatches is set.
69 LineMatches []LineMatch `json:",omitempty"`
70 ChunkMatches []ChunkMatch `json:",omitempty"`
71
72 // Only set if requested
73 Content []byte `json:",omitempty"`
74
75 // Checksum of the content.
76 Checksum []byte
77
78 // Ranking; the higher, the better.
79 Score float64 `json:",omitempty"`
80
81 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
82 // order results from different repositories relative to each other.
83 RepositoryPriority float64 `json:",omitempty"`
84
85 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
86 // Sourcegraph.
87 RepositoryID uint32 `json:",omitempty"`
88}
89
90func (m *FileMatch) sizeBytes() (sz uint64) {
91 // Score
92 sz += 8
93
94 for _, s := range []string{
95 m.Debug,
96 m.FileName,
97 m.Repository,
98 m.Language,
99 m.SubRepositoryName,
100 m.SubRepositoryPath,
101 m.Version,
102 } {
103 sz += stringHeaderBytes + uint64(len(s))
104 }
105
106 // Branches
107 sz += sliceHeaderBytes
108 for _, s := range m.Branches {
109 sz += stringHeaderBytes + uint64(len(s))
110 }
111
112 // LineMatches
113 sz += sliceHeaderBytes
114 for _, lm := range m.LineMatches {
115 sz += lm.sizeBytes()
116 }
117
118 // ChunkMatches
119 sz += sliceHeaderBytes
120 for _, cm := range m.ChunkMatches {
121 sz += cm.sizeBytes()
122 }
123
124 // RepositoryID
125 sz += 4
126
127 // RepositoryPriority
128 sz += 8
129
130 // Content
131 sz += sliceHeaderBytes + uint64(len(m.Content))
132
133 // Checksum
134 sz += sliceHeaderBytes + uint64(len(m.Checksum))
135
136 return
137}
138
139// addScore increments the score of the FileMatch by the computed score. If
140// debugScore is true, it also adds a debug string to the FileMatch. If raw is
141// -1, it is ignored. Otherwise, it is added to the debug string.
142func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
143 if computed != 0 && debugScore {
144 var b strings.Builder
145 fmt.Fprintf(&b, "%s", what)
146 if raw != -1 {
147 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
148 }
149 fmt.Fprintf(&b, ":%.2f, ", computed)
150 m.Debug += b.String()
151 }
152 m.Score += computed
153}
154
155// ChunkMatch is a set of non-overlapping matches within a contiguous range of
156// lines in the file.
157type ChunkMatch struct {
158 DebugScore string
159
160 // Content is a contiguous range of complete lines that fully contains Ranges.
161 // Lines will always include their terminating newline (if it exists).
162 Content []byte
163
164 // Ranges is a set of matching ranges within this chunk. Each range is relative
165 // to the beginning of the file (not the beginning of Content).
166 Ranges []Range
167
168 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
169 // its length will equal that of Ranges. Any of its elements may be nil.
170 SymbolInfo []*Symbol
171
172 // FileName indicates whether this match is a match on the file name, in
173 // which case Content will contain the file name.
174 FileName bool
175
176 // ContentStart is the location (inclusive) of the beginning of content
177 // relative to the beginning of the file. It will always be at the
178 // beginning of a line (Column will always be 1).
179 ContentStart Location
180
181 // Score is the overall relevance score of this chunk.
182 Score float64
183
184 // BestLineMatch is the line number of the highest-scoring line match in this chunk.
185 // The line number represents the index in the full file, and is 1-based. If FileName: true,
186 // this number will be 0.
187 BestLineMatch uint32
188}
189
190func (cm *ChunkMatch) sizeBytes() (sz uint64) {
191 // Content
192 sz += sliceHeaderBytes + uint64(len(cm.Content))
193
194 // ContentStart
195 sz += cm.ContentStart.sizeBytes()
196
197 // FileName
198 sz += 1
199
200 // Ranges
201 sz += sliceHeaderBytes
202 if len(cm.Ranges) > 0 {
203 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
204 }
205
206 // SymbolInfo
207 sz += sliceHeaderBytes
208 for _, si := range cm.SymbolInfo {
209 sz += pointerSize
210 if si != nil {
211 sz += si.sizeBytes()
212 }
213 }
214
215 // Score
216 sz += 8
217
218 // DebugScore
219 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
220
221 return
222}
223
224type Range struct {
225 // The inclusive beginning of the range.
226 Start Location
227 // The exclusive end of the range.
228 End Location
229}
230
231func (r *Range) sizeBytes() uint64 {
232 return r.Start.sizeBytes() + r.End.sizeBytes()
233}
234
235type Location struct {
236 // 0-based byte offset from the beginning of the file
237 ByteOffset uint32
238 // 1-based line number from the beginning of the file
239 LineNumber uint32
240 // 1-based column number (in runes) from the beginning of line
241 Column uint32
242}
243
244func (l *Location) sizeBytes() uint64 {
245 return 3 * 4
246}
247
248// LineMatch holds the matches within a single line in a file.
249type LineMatch struct {
250 // The line in which a match was found.
251 Line []byte
252 // The byte offset of the first byte of the line.
253 LineStart int
254 // The byte offset of the first byte past the end of the line.
255 // This is usually the byte after the terminating newline, but can also be
256 // the end of the file if there is no terminating newline
257 LineEnd int
258 LineNumber int
259
260 // Before and After are only set when SearchOptions.NumContextLines is > 0
261 Before []byte
262 After []byte
263
264 // If set, this was a match on the filename.
265 FileName bool
266
267 // The higher the better. Only ranks the quality of the match
268 // within the file, does not take rank of file into account
269 Score float64
270 DebugScore string
271
272 LineFragments []LineFragmentMatch
273}
274
275func (lm *LineMatch) sizeBytes() (sz uint64) {
276 // Line
277 sz += sliceHeaderBytes + uint64(len(lm.Line))
278
279 // LineStart, LineEnd, LineNumber
280 sz += 3 * 8
281
282 // Before
283 sz += sliceHeaderBytes + uint64(len(lm.Before))
284
285 // After
286 sz += sliceHeaderBytes + uint64(len(lm.After))
287
288 // FileName
289 sz += 1
290
291 // Score
292 sz += 8
293
294 // DebugScore
295 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
296
297 // LineFragments
298 sz += sliceHeaderBytes
299 for _, lf := range lm.LineFragments {
300 sz += lf.sizeBytes()
301 }
302
303 return
304}
305
306type Symbol struct {
307 Sym string
308 Kind string
309 Parent string
310 ParentKind string
311}
312
313func (s *Symbol) sizeBytes() uint64 {
314 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
315}
316
317// LineFragmentMatch a segment of matching text within a line.
318type LineFragmentMatch struct {
319 // Offset within the line, in bytes.
320 LineOffset int
321
322 // Offset from file start, in bytes.
323 Offset uint32
324
325 // Number bytes that match.
326 MatchLength int
327
328 SymbolInfo *Symbol
329}
330
331func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
332 // LineOffset
333 sz += 8
334
335 // Offset
336 sz += 4
337
338 // MatchLength
339 sz += 8
340
341 // SymbolInfo
342 sz += pointerSize
343 if lfm.SymbolInfo != nil {
344 sz += lfm.SymbolInfo.sizeBytes()
345 }
346
347 return
348}
349
350type FlushReason uint8
351
352const (
353 FlushReasonTimerExpired FlushReason = 1 << iota
354 FlushReasonFinalFlush
355 FlushReasonMaxSize
356)
357
358var FlushReasonStrings = map[FlushReason]string{
359 FlushReasonTimerExpired: "timer_expired",
360 FlushReasonFinalFlush: "final_flush",
361 FlushReasonMaxSize: "max_size_reached",
362}
363
364func (fr FlushReason) String() string {
365 if v, ok := FlushReasonStrings[fr]; ok {
366 return v
367 }
368
369 return "none"
370}
371
372// Stats contains interesting numbers on the search
373type Stats struct {
374 // Amount of I/O for reading contents.
375 ContentBytesLoaded int64
376
377 // Amount of I/O for reading from index.
378 IndexBytesLoaded int64
379
380 // Number of search shards that had a crash.
381 Crashes int
382
383 // Wall clock time for this search
384 Duration time.Duration
385
386 // Number of files containing a match.
387 FileCount int
388
389 // Number of files in shards that we considered.
390 ShardFilesConsidered int
391
392 // Files that we evaluated. Equivalent to files for which all
393 // atom matches (including negations) evaluated to true.
394 FilesConsidered int
395
396 // Files for which we loaded file content to verify substring matches
397 FilesLoaded int
398
399 // Candidate files whose contents weren't examined because we
400 // gathered enough matches.
401 FilesSkipped int
402
403 // Shards that we scanned to find matches.
404 ShardsScanned int
405
406 // Shards that we did not process because a query was canceled.
407 ShardsSkipped int
408
409 // Shards that we did not process because the query was rejected by the
410 // ngram filter indicating it had no matches.
411 ShardsSkippedFilter int
412
413 // Number of non-overlapping matches
414 MatchCount int
415
416 // Number of candidate matches as a result of searching ngrams.
417 NgramMatches int
418
419 // NgramLookups is the number of times we accessed an ngram in the index.
420 NgramLookups int
421
422 // Wall clock time for queued search.
423 Wait time.Duration
424
425 // Aggregate wall clock time spent constructing and pruning the match tree.
426 // This accounts for time such as lookups in the trigram index.
427 MatchTreeConstruction time.Duration
428
429 // Aggregate wall clock time spent searching the match tree. This accounts
430 // for the bulk of search work done looking for matches.
431 MatchTreeSearch time.Duration
432
433 // Number of times regexp was called on files that we evaluated.
434 RegexpsConsidered int
435
436 // FlushReason explains why results were flushed.
437 FlushReason FlushReason
438}
439
440func (s *Stats) sizeBytes() (sz uint64) {
441 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
442 sz += 1 // FlushReason
443
444 return
445}
446
447func (s *Stats) Add(o Stats) {
448 s.ContentBytesLoaded += o.ContentBytesLoaded
449 s.IndexBytesLoaded += o.IndexBytesLoaded
450 s.Crashes += o.Crashes
451 s.FileCount += o.FileCount
452 s.FilesConsidered += o.FilesConsidered
453 s.FilesLoaded += o.FilesLoaded
454 s.FilesSkipped += o.FilesSkipped
455 s.MatchCount += o.MatchCount
456 s.NgramMatches += o.NgramMatches
457 s.NgramLookups += o.NgramLookups
458 s.ShardFilesConsidered += o.ShardFilesConsidered
459 s.ShardsScanned += o.ShardsScanned
460 s.ShardsSkipped += o.ShardsSkipped
461 s.ShardsSkippedFilter += o.ShardsSkippedFilter
462 s.Wait += o.Wait
463 s.MatchTreeConstruction += o.MatchTreeConstruction
464 s.MatchTreeSearch += o.MatchTreeSearch
465 s.RegexpsConsidered += o.RegexpsConsidered
466
467 // We want the first non-zero FlushReason to be sticky. This is a useful
468 // property when aggregating stats from several Zoekts.
469 if s.FlushReason == 0 {
470 s.FlushReason = o.FlushReason
471 }
472}
473
474// Zero returns true if stats is empty.
475func (s *Stats) Zero() bool {
476 if s == nil {
477 return true
478 }
479
480 return !(s.ContentBytesLoaded > 0 ||
481 s.IndexBytesLoaded > 0 ||
482 s.Crashes > 0 ||
483 s.FileCount > 0 ||
484 s.FilesConsidered > 0 ||
485 s.FilesLoaded > 0 ||
486 s.FilesSkipped > 0 ||
487 s.MatchCount > 0 ||
488 s.NgramMatches > 0 ||
489 s.NgramLookups > 0 ||
490 s.ShardFilesConsidered > 0 ||
491 s.ShardsScanned > 0 ||
492 s.ShardsSkipped > 0 ||
493 s.ShardsSkippedFilter > 0 ||
494 s.Wait > 0 ||
495 s.MatchTreeConstruction > 0 ||
496 s.MatchTreeSearch > 0 ||
497 s.RegexpsConsidered > 0)
498}
499
500// Progress contains information about the global progress of the running search query.
501// This is used by the frontend to reorder results and emit them when stable.
502// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
503type Progress struct {
504 // Priority of the shard that was searched.
505 Priority float64
506
507 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
508 // This is used to reorder results when the result set is known to be stable-- that is, when a result's
509 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
510 //
511 // MaxPendingPriority decreases monotonically in each SearchResult.
512 MaxPendingPriority float64
513}
514
515func (p *Progress) sizeBytes() uint64 {
516 return 2 * 8
517}
518
519// SearchResult contains search matches and extra data
520type SearchResult struct {
521 Stats
522
523 // Do not encode this as we cannot encode -Inf in JSON
524 Progress `json:"-"`
525
526 Files []FileMatch
527
528 // RepoURLs holds a repo => template string map.
529 RepoURLs map[string]string
530
531 // FragmentNames holds a repo => template string map, for
532 // the line number fragment.
533 LineFragments map[string]string
534}
535
536// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
537// The estimate does not take alignment into account. The result is a lower
538// bound on the actual size in memory.
539func (sr *SearchResult) SizeBytes() (sz uint64) {
540 sz += sr.Stats.sizeBytes()
541 sz += sr.Progress.sizeBytes()
542
543 // Files
544 sz += sliceHeaderBytes
545 for _, f := range sr.Files {
546 sz += f.sizeBytes()
547 }
548
549 // RepoURLs
550 sz += mapHeaderBytes
551 for k, v := range sr.RepoURLs {
552 sz += stringHeaderBytes + uint64(len(k))
553 sz += stringHeaderBytes + uint64(len(v))
554 }
555
556 // LineFragments
557 sz += mapHeaderBytes
558 for k, v := range sr.LineFragments {
559 sz += stringHeaderBytes + uint64(len(k))
560 sz += stringHeaderBytes + uint64(len(v))
561 }
562
563 return
564}
565
566// RepositoryBranch describes an indexed branch, which is a name
567// combined with a version.
568type RepositoryBranch struct {
569 Name string
570 Version string
571}
572
573func (r RepositoryBranch) String() string {
574 return fmt.Sprintf("%s@%s", r.Name, r.Version)
575}
576
577// Repository holds repository metadata.
578type Repository struct {
579 // Sourcegraph's tenant ID
580 TenantID int
581
582 // Sourcegraph's repository ID
583 ID uint32
584
585 // The repository name
586 Name string
587
588 // The repository URL.
589 URL string
590
591 // The physical source where this repo came from, eg. full
592 // path to the zip filename or git repository directory. This
593 // will not be exposed in the UI, but can be used to detect
594 // orphaned index shards.
595 Source string
596
597 // The branches indexed in this repo.
598 Branches []RepositoryBranch
599
600 // Nil if this is not the super project.
601 SubRepoMap map[string]*Repository
602
603 // URL template to link to the commit of a branch
604 CommitURLTemplate string
605
606 // The repository URL for getting to a file. Has access to
607 // {{.Version}}, {{.Path}}
608 FileURLTemplate string
609
610 // The URL fragment to add to a file URL for line numbers. has
611 // access to {{.LineNumber}}. The fragment should include the
612 // separator, generally '#' or ';'.
613 LineFragmentTemplate string
614
615 // Perf optimization: priority is set when we load the shard. It corresponds to
616 // the value of "priority" stored in RawConfig.
617 priority float64
618
619 // All zoekt.* configuration settings.
620 RawConfig map[string]string
621
622 // Importance of the repository, bigger is more important
623 Rank uint16
624
625 // IndexOptions is a hash of the options used to create the index for the
626 // repo.
627 IndexOptions string
628
629 // HasSymbols is true if this repository has indexed ctags
630 // output. Sourcegraph specific: This field is more appropriate for
631 // IndexMetadata. However, we store it here since the Sourcegraph frontend
632 // can read this structure but not IndexMetadata.
633 HasSymbols bool
634
635 // Tombstone is true if we are not allowed to search this repo.
636 Tombstone bool
637
638 // LatestCommitDate is the date of the latest commit among all indexed Branches.
639 // The date might be time.Time's 0-value if the repository was last indexed
640 // before this field was added.
641 LatestCommitDate time.Time
642
643 // FileTombstones is a set of file paths that should be ignored across all branches
644 // in this shard.
645 FileTombstones map[string]struct{} `json:",omitempty"`
646}
647
648func (r *Repository) UnmarshalJSON(data []byte) error {
649 // We define a new type so that we can use json.Unmarshal
650 // without recursing into this same method.
651 type repository *Repository
652 repo := repository(r)
653
654 err := json.Unmarshal(data, repo)
655 if err != nil {
656 return err
657 }
658
659 if v, ok := repo.RawConfig["repoid"]; ok {
660 id, _ := strconv.ParseUint(v, 10, 32)
661 r.ID = uint32(id)
662 }
663
664 if v, ok := repo.RawConfig["tenantID"]; ok {
665 id, _ := strconv.ParseInt(v, 10, 64)
666 r.TenantID = int(id)
667 }
668
669 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
670 // on read instead of during indexing allows us to avoid a complete reindex.
671 //
672 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
673 // backwards compatibility.
674 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
675 // We use the number of months since 1970 as a simple measure of repo freshness.
676 // It is monotonically increasing and stable across re-indexes and restarts.
677 r.Rank = monthsSince1970(repo.LatestCommitDate)
678 } else if v, ok := repo.RawConfig["priority"]; ok {
679 r.priority, err = strconv.ParseFloat(v, 64)
680 if err != nil {
681 r.priority = 0
682 }
683
684 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
685 // based on priority. Setting it on read instead of during indexing
686 // allows us to avoid a complete reindex.
687 if r.Rank == 0 && r.priority > 0 {
688 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
689 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
690 // returns from more stars.
691 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16)
692 }
693 }
694
695 return nil
696}
697
698// monthsSince1970 returns the number of months since 1970. It returns values in
699// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
700// lower bound for all dates before 1970.
701func monthsSince1970(t time.Time) uint16 {
702 base := time.Unix(0, 0)
703 if t.Before(base) {
704 return 0
705 }
706 months := int(t.Year()-1970)*12 + int(t.Month()-1)
707 return uint16(min(months, math.MaxUint16))
708}
709
710// MergeMutable will merge x into r. mutated will be true if it made any
711// changes. err is non-nil if we needed to mutate an immutable field.
712//
713// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
714// computed while indexing so can't be synthesized from x.
715//
716// Note: We ignore RawConfig fields which are duplicated into Repository:
717// name and id.
718func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
719 if r.ID != x.ID {
720 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
721 return mutated, errors.New("ID is immutable")
722 }
723 if r.Name != x.Name {
724 // Name is encoded into the shard name on disk. We need to re-index if it
725 // changes.
726 return mutated, errors.New("Name is immutable")
727 }
728 if !reflect.DeepEqual(r.Branches, x.Branches) {
729 // Need a reindex if content changing.
730 return mutated, errors.New("Branches is immutable")
731 }
732
733 for k, v := range x.RawConfig {
734 // We ignore name and id since they are encoded into the repository.
735 if k == "name" || k == "id" {
736 continue
737 }
738 if r.RawConfig == nil {
739 mutated = true
740 r.RawConfig = make(map[string]string)
741 }
742 if r.RawConfig[k] != v {
743 mutated = true
744 r.RawConfig[k] = v
745 }
746 }
747
748 if r.URL != x.URL {
749 mutated = true
750 r.URL = x.URL
751 }
752 if r.CommitURLTemplate != x.CommitURLTemplate {
753 mutated = true
754 r.CommitURLTemplate = x.CommitURLTemplate
755 }
756 if r.FileURLTemplate != x.FileURLTemplate {
757 mutated = true
758 r.FileURLTemplate = x.FileURLTemplate
759 }
760 if r.LineFragmentTemplate != x.LineFragmentTemplate {
761 mutated = true
762 r.LineFragmentTemplate = x.LineFragmentTemplate
763 }
764
765 return mutated, nil
766}
767
768// IndexMetadata holds metadata stored in the index file. It contains
769// data generated by the core indexing library.
770type IndexMetadata struct {
771 IndexFormatVersion int
772 IndexFeatureVersion int
773 IndexMinReaderVersion int
774 IndexTime time.Time
775 PlainASCII bool
776 LanguageMap map[string]uint16
777 ZoektVersion string
778 ID string
779}
780
781// Statistics of a (collection of) repositories.
782type RepoStats struct {
783 // Repos is used for aggregrating the number of repositories.
784 //
785 // Note: This field is not populated on RepoListEntry.Stats (individual) but
786 // only for RepoList.Stats (aggregate).
787 Repos int
788
789 // Shards is the total number of search shards.
790 Shards int
791
792 // Documents holds the number of documents or files.
793 Documents int
794
795 // IndexBytes is the amount of RAM used for index overhead.
796 IndexBytes int64
797
798 // ContentBytes is the amount of RAM used for raw content.
799 ContentBytes int64
800
801 // Sourcegraph specific stats below. These are not as efficient to calculate
802 // as the above statistics. We experimentally measured about a 10% slower
803 // shard load time. However, we find these values very useful to track and
804 // computing them outside of load time introduces a lot of complexity.
805
806 // NewLinesCount is the number of newlines "\n" that appear in the zoekt
807 // indexed documents. This is not exactly the same as line count, since it
808 // will not include lines not terminated by "\n" (eg a file with no "\n", or
809 // a final line without "\n"). Note: Zoekt deduplicates documents across
810 // branches, so if a path has the same contents on multiple branches, there
811 // is only one document for it. As such that document's newlines is only
812 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
813 // for counts which do not deduplicate.
814 NewLinesCount uint64
815
816 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default
817 // branch.
818 DefaultBranchNewLinesCount uint64
819
820 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
821 // except the default branch.
822 OtherBranchesNewLinesCount uint64
823}
824
825func (s *RepoStats) Add(o *RepoStats) {
826 // can't update Repos, since one repo may have multiple
827 // shards.
828 s.Shards += o.Shards
829 s.IndexBytes += o.IndexBytes
830 s.Documents += o.Documents
831 s.ContentBytes += o.ContentBytes
832
833 // Sourcegraph specific
834 s.NewLinesCount += o.NewLinesCount
835 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
836 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
837}
838
839type RepoListEntry struct {
840 Repository Repository
841 IndexMetadata IndexMetadata
842 Stats RepoStats
843}
844
845// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
846// performance profiling of sourcegraph.com revealed that querying this
847// information from Zoekt was causing lots of CPU and memory usage. Note: we
848// can revisit this, how we store and query this information has changed a lot
849// since this was introduced.
850type MinimalRepoListEntry struct {
851 // HasSymbols is exported since Sourcegraph uses this information at search
852 // planning time to decide between Zoekt and an unindexed symbol search.
853 //
854 // Note: it pretty much is always true in practice.
855 HasSymbols bool
856
857 // Branches is used by Sourcegraphs query planner to decided if it can use
858 // zoekt or go via an unindexed code path.
859 Branches []RepositoryBranch
860
861 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds
862 // since the epoch). This is to make it clear we are not transporting the
863 // full fidelty timestamp (ie with milliseconds and location). Additionally
864 // it saves 16 bytes in this struct.
865 //
866 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
867 // how many repositories need updating after a ranking change/etc.
868 //
869 // TODO(keegancsmith) audit updates to IndexTime and document how and when
870 // it changes. Concerned about things like metadata updates or compound
871 // shards leading to untrustworthy data here.
872 IndexTimeUnix int64
873}
874
875type ReposMap map[uint32]MinimalRepoListEntry
876
877// MarshalBinary implements a specialized encoder for ReposMap.
878func (q *ReposMap) MarshalBinary() ([]byte, error) {
879 return reposMapEncode(*q)
880}
881
882// UnmarshalBinary implements a specialized decoder for ReposMap.
883func (q *ReposMap) UnmarshalBinary(b []byte) error {
884 var err error
885 (*q), err = reposMapDecode(b)
886 return err
887}
888
889// RepoList holds a set of Repository metadata.
890type RepoList struct {
891 // Returned when ListOptions.Field is RepoListFieldRepos.
892 Repos []*RepoListEntry
893
894 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
895 ReposMap ReposMap
896
897 Crashes int
898
899 // Stats response to a List request.
900 // This is the aggregate RepoStats of all repos matching the input query.
901 Stats RepoStats
902}
903
904type Searcher interface {
905 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
906
907 // List lists repositories. The query `q` can only contain
908 // query.Repo atoms.
909 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
910 Close()
911
912 // Describe the searcher for debug messages.
913 String() string
914}
915
916type RepoListField int
917
918const (
919 RepoListFieldRepos RepoListField = 0
920 RepoListFieldReposMap = 2
921)
922
923type ListOptions struct {
924 // Field decides which field to populate in RepoList response.
925 Field RepoListField
926}
927
928func (o *ListOptions) GetField() (RepoListField, error) {
929 if o == nil {
930 return RepoListFieldRepos, nil
931 }
932 switch o.Field {
933 case RepoListFieldRepos, RepoListFieldReposMap:
934 return o.Field, nil
935 case 1:
936 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
937 default:
938 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
939 }
940}
941
942func (o *ListOptions) String() string {
943 return fmt.Sprintf("%#v", o)
944}
945
946type SearchOptions struct {
947 // Return an upper-bound estimate of eligible documents in
948 // stats.ShardFilesConsidered.
949 EstimateDocCount bool
950
951 // Return the whole file.
952 Whole bool
953
954 // Maximum number of matches: skip all processing an index
955 // shard after we found this many non-overlapping matches.
956 ShardMaxMatchCount int
957
958 // Maximum number of matches: stop looking for more matches
959 // once we have this many matches across shards.
960 TotalMaxMatchCount int
961
962 // Maximum number of matches: skip processing documents for a repository in
963 // a shard once we have found ShardRepoMaxMatchCount.
964 //
965 // A compound shard may contain multiple repositories. This will most often
966 // be set to 1 to find all repositories containing a result.
967 ShardRepoMaxMatchCount int
968
969 // Abort the search after this much time has passed.
970 MaxWallTime time.Duration
971
972 // FlushWallTime if non-zero will stop streaming behaviour at first and
973 // instead will collate and sort results. At FlushWallTime the results will
974 // be sent and then the behaviour will revert to the normal streaming.
975 FlushWallTime time.Duration
976
977 // Truncates the number of documents (i.e. files) after collating and
978 // sorting the results.
979 MaxDocDisplayCount int
980
981 // Truncates the number of matchs after collating and sorting the results.
982 MaxMatchDisplayCount int
983
984 // If set to a number greater than zero then up to this many number
985 // of context lines will be added before and after each matched line.
986 // Note that the included context lines might contain matches and
987 // it's up to the consumer of the result to remove those lines.
988 NumContextLines int
989
990 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
991 // EXPERIMENTAL: the behavior of this flag may be changed in future versions.
992 ChunkMatches bool
993
994 // EXPERIMENTAL. If true, use text-search style scoring instead of the default
995 // scoring formula. The scoring algorithm treats each match in a file as a term
996 // and computes an approximation to BM25. When enabled, BM25 scoring is used for
997 // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores.
998 //
999 // The calculation of IDF assumes that Zoekt visits all documents containing any
1000 // of the query terms during evaluation. This is true, for example, if all query
1001 // terms are ORed together.
1002 //
1003 // When enabled, all other scoring signals are ignored, including document ranks.
1004 UseBM25Scoring bool
1005
1006 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as
1007 // a command-line flag
1008 Trace bool
1009
1010 // If set, the search results will contain debug information for scoring.
1011 DebugScore bool
1012
1013 // SpanContext is the opentracing span context, if it exists, from the zoekt client
1014 SpanContext map[string]string
1015}
1016
1017// String returns a succinct representation of the options. This is meant for
1018// human consumption in logs and traces.
1019//
1020// Note: some tracing systems have limits on length of values, so we take care
1021// to try and make this small, and include the important information near the
1022// front incase of truncation.
1023func (s *SearchOptions) String() string {
1024 var b strings.Builder
1025
1026 add := func(name, value string) {
1027 b.WriteString(name)
1028 b.WriteByte('=')
1029 b.WriteString(value)
1030 b.WriteByte(' ')
1031 }
1032 addInt := func(name string, value int) {
1033 if value != 0 {
1034 add(name, strconv.Itoa(value))
1035 }
1036 }
1037 addDuration := func(name string, value time.Duration) {
1038 if value != 0 {
1039 add(name, value.String())
1040 }
1041 }
1042 addBool := func(name string, value bool) {
1043 if !value {
1044 return
1045 }
1046 b.WriteString(name)
1047 b.WriteByte(' ')
1048 }
1049
1050 b.WriteString("zoekt.SearchOptions{ ")
1051
1052 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1053 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1054 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1055 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1056 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1057 addInt("NumContextLines", s.NumContextLines)
1058
1059 addDuration("MaxWallTime", s.MaxWallTime)
1060 addDuration("FlushWallTime", s.FlushWallTime)
1061
1062 addBool("EstimateDocCount", s.EstimateDocCount)
1063 addBool("Whole", s.Whole)
1064 addBool("ChunkMatches", s.ChunkMatches)
1065 addBool("UseBM25Scoring", s.UseBM25Scoring)
1066 addBool("Trace", s.Trace)
1067 addBool("DebugScore", s.DebugScore)
1068
1069 for k, v := range s.SpanContext {
1070 add("SpanContext."+k, strconv.Quote(v))
1071 }
1072
1073 b.WriteByte('}')
1074 return b.String()
1075}
1076
1077// Sender is the interface that wraps the basic Send method.
1078type Sender interface {
1079 Send(*SearchResult)
1080}
1081
1082// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
1083// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
1084// that calls f.
1085type SenderFunc func(result *SearchResult)
1086
1087func (f SenderFunc) Send(result *SearchResult) {
1088 f(result)
1089}
1090
1091// Streamer adds the method StreamSearch to the Searcher interface.
1092type Streamer interface {
1093 Searcher
1094 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1095}