fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
30
// Fixed per-value overheads, in bytes, used by the sizeBytes estimators in
// this file. The 8-byte pointer size corresponds to a 64-bit architecture.
const (
	// mapHeaderBytes is added once for every map field.
	mapHeaderBytes uint64 = 48
	// sliceHeaderBytes is added once for every slice field.
	sliceHeaderBytes uint64 = 24
	// stringHeaderBytes is added once for every string value.
	stringHeaderBytes uint64 = 16
	// pointerSize is added once for every pointer value.
	pointerSize uint64 = 8
)
37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file that matched.
	// NOTE(review): presumably a path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics. AddScore appends to it.
	Debug string `json:",omitempty"`

	// Branches lists branch names associated with this match.
	// NOTE(review): presumably the branches that contain the file — confirm.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum is the checksum of the content.
	Checksum []byte

	// Score is the ranking of this file; the higher, the better. AddScore
	// increments it.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// AddScore increments the score of the FileMatch by the computed score. If
139// debugScore is true, it also adds a debug string to the FileMatch. If raw is
140// -1, it is ignored. Otherwise, it is added to the debug string.
141func (m *FileMatch) AddScore(what string, computed float64, raw float64, debugScore bool) {
142 if computed != 0 && debugScore {
143 var b strings.Builder
144 fmt.Fprintf(&b, "%s", what)
145 if raw != -1 {
146 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
147 }
148 fmt.Fprintf(&b, ":%.2f, ", computed)
149 m.Debug += b.String()
150 }
151 m.Score += computed
152}
153
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore holds scoring diagnostics for this chunk.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the overall relevance score of this chunk.
	Score float64

	// BestLineMatch is the line number of the highest-scoring line match in this chunk.
	// The line number represents the index in the full file, and is 1-based. If FileName: true,
	// this number will be 0.
	BestLineMatch uint32
}
188
189func (cm *ChunkMatch) sizeBytes() (sz uint64) {
190 // Content
191 sz += sliceHeaderBytes + uint64(len(cm.Content))
192
193 // ContentStart
194 sz += cm.ContentStart.sizeBytes()
195
196 // FileName
197 sz += 1
198
199 // Ranges
200 sz += sliceHeaderBytes
201 if len(cm.Ranges) > 0 {
202 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
203 }
204
205 // SymbolInfo
206 sz += sliceHeaderBytes
207 for _, si := range cm.SymbolInfo {
208 sz += pointerSize
209 if si != nil {
210 sz += si.sizeBytes()
211 }
212 }
213
214 // Score
215 sz += 8
216
217 // DebugScore
218 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
219
220 return
221}
222
// Range is a span between two locations.
type Range struct {
	// Start is the inclusive beginning of the range.
	Start Location
	// End is the exclusive end of the range.
	End Location
}

// sizeBytes returns the combined estimated size of both endpoints.
func (r *Range) sizeBytes() uint64 {
	total := r.Start.sizeBytes()
	total += r.End.sizeBytes()
	return total
}

// Location is a position within a file.
type Location struct {
	// ByteOffset is the 0-based byte offset from the beginning of the file.
	ByteOffset uint32
	// LineNumber is the 1-based line number from the beginning of the file.
	LineNumber uint32
	// Column is the 1-based column number (in runes) from the beginning of
	// the line.
	Column uint32
}

// sizeBytes returns the size of the three uint32 fields.
func (l *Location) sizeBytes() uint64 {
	const uint32Bytes = 4
	return 3 * uint32Bytes
}
246
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// Line is the line in which a match was found.
	Line []byte
	// LineStart is the byte offset of the first byte of the line.
	LineStart int
	// LineEnd is the byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd int
	// LineNumber is the number of the line within the file.
	// NOTE(review): presumably 1-based like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// FileName, if set, indicates this was a match on the filename.
	FileName bool

	// Score: the higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score float64
	// DebugScore holds scoring diagnostics for this line.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// LineFragments are the matching segments within Line.
	LineFragments []LineFragmentMatch
}
273
274func (lm *LineMatch) sizeBytes() (sz uint64) {
275 // Line
276 sz += sliceHeaderBytes + uint64(len(lm.Line))
277
278 // LineStart, LineEnd, LineNumber
279 sz += 3 * 8
280
281 // Before
282 sz += sliceHeaderBytes + uint64(len(lm.Before))
283
284 // After
285 sz += sliceHeaderBytes + uint64(len(lm.After))
286
287 // FileName
288 sz += 1
289
290 // Score
291 sz += 8
292
293 // DebugScore
294 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
295
296 // LineFragments
297 sz += sliceHeaderBytes
298 for _, lf := range lm.LineFragments {
299 sz += lf.sizeBytes()
300 }
301
302 return
303}
304
305type Symbol struct {
306 Sym string
307 Kind string
308 Parent string
309 ParentKind string
310}
311
312func (s *Symbol) sizeBytes() uint64 {
313 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
314}
315
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// LineOffset is the offset within the line, in bytes.
	LineOffset int

	// Offset is the offset from file start, in bytes.
	Offset uint32

	// MatchLength is the number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment; it may be nil.
	SymbolInfo *Symbol
}
329
330func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
331 // LineOffset
332 sz += 8
333
334 // Offset
335 sz += 4
336
337 // MatchLength
338 sz += 8
339
340 // SymbolInfo
341 sz += pointerSize
342 if lfm.SymbolInfo != nil {
343 sz += lfm.SymbolInfo.sizeBytes()
344 }
345
346 return
347}
348
// FlushReason identifies why a batch of results was flushed. The defined
// values are distinct bits; the zero value means no reason was recorded.
type FlushReason uint8

const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each defined FlushReason to its string form.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the name registered in FlushReasonStrings, or "none" for any
// value that has no entry there (including zero and combinations of bits).
func (fr FlushReason) String() string {
	name, known := FlushReasonStrings[fr]
	if !known {
		return "none"
	}
	return name
}
370
// Stats contains interesting numbers on the search. Values from several
// searches can be combined with Add.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Note that Add does not accumulate this
	// field and Zero does not inspect it.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed. When stats are combined
	// with Add, the first non-zero value is kept.
	FlushReason FlushReason
}
438
439func (s *Stats) sizeBytes() (sz uint64) {
440 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
441 sz += 1 // FlushReason
442
443 return
444}
445
446func (s *Stats) Add(o Stats) {
447 s.ContentBytesLoaded += o.ContentBytesLoaded
448 s.IndexBytesLoaded += o.IndexBytesLoaded
449 s.Crashes += o.Crashes
450 s.FileCount += o.FileCount
451 s.FilesConsidered += o.FilesConsidered
452 s.FilesLoaded += o.FilesLoaded
453 s.FilesSkipped += o.FilesSkipped
454 s.MatchCount += o.MatchCount
455 s.NgramMatches += o.NgramMatches
456 s.NgramLookups += o.NgramLookups
457 s.ShardFilesConsidered += o.ShardFilesConsidered
458 s.ShardsScanned += o.ShardsScanned
459 s.ShardsSkipped += o.ShardsSkipped
460 s.ShardsSkippedFilter += o.ShardsSkippedFilter
461 s.Wait += o.Wait
462 s.MatchTreeConstruction += o.MatchTreeConstruction
463 s.MatchTreeSearch += o.MatchTreeSearch
464 s.RegexpsConsidered += o.RegexpsConsidered
465
466 // We want the first non-zero FlushReason to be sticky. This is a useful
467 // property when aggregating stats from several Zoekts.
468 if s.FlushReason == 0 {
469 s.FlushReason = o.FlushReason
470 }
471}
472
473// Zero returns true if stats is empty.
474func (s *Stats) Zero() bool {
475 if s == nil {
476 return true
477 }
478
479 return !(s.ContentBytesLoaded > 0 ||
480 s.IndexBytesLoaded > 0 ||
481 s.Crashes > 0 ||
482 s.FileCount > 0 ||
483 s.FilesConsidered > 0 ||
484 s.FilesLoaded > 0 ||
485 s.FilesSkipped > 0 ||
486 s.MatchCount > 0 ||
487 s.NgramMatches > 0 ||
488 s.NgramLookups > 0 ||
489 s.ShardFilesConsidered > 0 ||
490 s.ShardsScanned > 0 ||
491 s.ShardsSkipped > 0 ||
492 s.ShardsSkippedFilter > 0 ||
493 s.Wait > 0 ||
494 s.MatchTreeConstruction > 0 ||
495 s.MatchTreeSearch > 0 ||
496 s.RegexpsConsidered > 0)
497}
498
// Progress describes the global progress of the running search query. The
// frontend uses it to reorder results and to emit them once they are stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver
// instances.
type Progress struct {
	// Priority is the priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending results that are
	// being searched in parallel. It is used to reorder results when the
	// result set is known to be stable: once a result's Priority is greater
	// than the max(MaxPendingPriority) from the latest results of each
	// backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the size of the two float64 fields.
func (p *Progress) sizeBytes() uint64 {
	const float64Bytes = 8
	return 2 * float64Bytes
}
517
// SearchResult contains search matches and extra data. Stats and Progress
// are embedded, so their fields are promoted onto SearchResult.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds one entry per file that matched.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
534
535// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
536// The estimate does not take alignment into account. The result is a lower
537// bound on the actual size in memory.
538func (sr *SearchResult) SizeBytes() (sz uint64) {
539 sz += sr.Stats.sizeBytes()
540 sz += sr.Progress.sizeBytes()
541
542 // Files
543 sz += sliceHeaderBytes
544 for _, f := range sr.Files {
545 sz += f.sizeBytes()
546 }
547
548 // RepoURLs
549 sz += mapHeaderBytes
550 for k, v := range sr.RepoURLs {
551 sz += stringHeaderBytes + uint64(len(k))
552 sz += stringHeaderBytes + uint64(len(v))
553 }
554
555 // LineFragments
556 sz += mapHeaderBytes
557 for k, v := range sr.LineFragments {
558 sz += stringHeaderBytes + uint64(len(k))
559 sz += stringHeaderBytes + uint64(len(v))
560 }
561
562 return
563}
564
// RepositoryBranch is an indexed branch: a branch name paired with the
// version it was indexed at.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String renders the branch as "Name@Version".
func (r RepositoryBranch) String() string {
	// All operands are strings, so Sprint concatenates them without spaces.
	return fmt.Sprint(r.Name, "@", r.Version)
}
575
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's tenant ID. UnmarshalJSON overwrites it from
	// RawConfig["tenantID"] when that key is present.
	TenantID int

	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// Additional metadata about the repository.
	Metadata map[string]string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig. It is exposed through
	// GetPriority.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. UnmarshalJSON
	// derives it from LatestCommitDate or, failing that, from the priority.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
649
650func (r *Repository) UnmarshalJSON(data []byte) error {
651 // We define a new type so that we can use json.Unmarshal
652 // without recursing into this same method.
653 type repository Repository
654 repo := (*repository)(r)
655
656 err := json.Unmarshal(data, repo)
657 if err != nil {
658 return err
659 }
660
661 if v, ok := repo.RawConfig["repoid"]; ok {
662 id, _ := strconv.ParseUint(v, 10, 32)
663 r.ID = uint32(id)
664 }
665
666 if v, ok := repo.RawConfig["tenantID"]; ok {
667 id, _ := strconv.ParseInt(v, 10, 64)
668 r.TenantID = int(id)
669 }
670
671 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
672 // on read instead of during indexing allows us to avoid a complete reindex.
673 //
674 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
675 // backwards compatibility.
676 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
677 // We use the number of months since 1970 as a simple measure of repo freshness.
678 // It is monotonically increasing and stable across re-indexes and restarts.
679 r.Rank = monthsSince1970(repo.LatestCommitDate)
680 } else if v, ok := repo.RawConfig["priority"]; ok {
681 r.priority, err = strconv.ParseFloat(v, 64)
682 if err != nil {
683 r.priority = 0
684 }
685
686 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
687 // based on priority. Setting it on read instead of during indexing
688 // allows us to avoid a complete reindex.
689 if r.Rank == 0 && r.priority > 0 {
690 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
691 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
692 // returns from more stars.
693 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16)
694 }
695 }
696
697 return nil
698}
699
// GetPriority returns the unexported priority of the repository.
// UnmarshalJSON fills it from RawConfig["priority"] when RawConfig has no
// "latestCommitDate" entry; a value that fails to parse is stored as 0.
func (r *Repository) GetPriority() float64 {
	return r.priority
}
703
// monthsSince1970 returns the number of months since 1970. It returns values in
// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
// lower bound for all dates before 1970.
//
// The year and month are read in t's own location. The result is clamped at
// both ends so it can never wrap around when converted to uint16.
func monthsSince1970(t time.Time) uint16 {
	if t.Before(time.Unix(0, 0)) {
		return 0
	}

	// An instant shortly after the epoch, expressed in a zone west of UTC,
	// still reports December 1969 here and yields a negative month count.
	// Without the lower clamp that would wrap to 65535 in the conversion.
	months := (t.Year()-1970)*12 + int(t.Month()-1)
	return uint16(max(0, min(months, math.MaxUint16)))
}
715
716// MergeMutable will merge x into r. mutated will be true if it made any
717// changes. err is non-nil if we needed to mutate an immutable field.
718//
719// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
720// computed while indexing so can't be synthesized from x.
721//
722// Note: We ignore RawConfig fields which are duplicated into Repository:
723// name and id.
724func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
725 if r.ID != x.ID {
726 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
727 return mutated, errors.New("ID is immutable")
728 }
729 if r.Name != x.Name {
730 // Name is encoded into the shard name on disk. We need to re-index if it
731 // changes.
732 return mutated, errors.New("Name is immutable")
733 }
734 if !reflect.DeepEqual(r.Branches, x.Branches) {
735 // Need a reindex if content changing.
736 return mutated, errors.New("Branches is immutable")
737 }
738
739 for k, v := range x.RawConfig {
740 // We ignore name and id since they are encoded into the repository.
741 if k == "name" || k == "id" {
742 continue
743 }
744 if r.RawConfig == nil {
745 mutated = true
746 r.RawConfig = make(map[string]string)
747 }
748 if r.RawConfig[k] != v {
749 mutated = true
750 r.RawConfig[k] = v
751 }
752 }
753
754 if r.URL != x.URL {
755 mutated = true
756 r.URL = x.URL
757 }
758 if r.CommitURLTemplate != x.CommitURLTemplate {
759 mutated = true
760 r.CommitURLTemplate = x.CommitURLTemplate
761 }
762 if r.FileURLTemplate != x.FileURLTemplate {
763 mutated = true
764 r.FileURLTemplate = x.FileURLTemplate
765 }
766 if r.LineFragmentTemplate != x.LineFragmentTemplate {
767 mutated = true
768 r.LineFragmentTemplate = x.LineFragmentTemplate
769 }
770
771 return mutated, nil
772}
773
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
type IndexMetadata struct {
	// Version numbers describing the on-disk index.
	// NOTE(review): exact compatibility rules between these three values are
	// defined outside this file — confirm before relying on them.
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// IndexTime is the time associated with the index.
	// NOTE(review): presumably when the index was built — confirm.
	IndexTime time.Time
	// PlainASCII is a flag about the indexed content.
	// NOTE(review): presumably true when all content is ASCII — confirm.
	PlainASCII bool
	// LanguageMap maps a name to a 16-bit code.
	// NOTE(review): presumably language name to language ID — confirm.
	LanguageMap map[string]uint16
	// ZoektVersion is a version string for Zoekt.
	// NOTE(review): presumably the version that produced the index — confirm.
	ZoektVersion string
	// ID is an identifier stored with the index.
	ID string
}
786
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories. Add leaves it
	// untouched.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and
	// OtherBranchesNewLinesCount for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
830
831func (s *RepoStats) Add(o *RepoStats) {
832 // can't update Repos, since one repo may have multiple
833 // shards.
834 s.Shards += o.Shards
835 s.IndexBytes += o.IndexBytes
836 s.Documents += o.Documents
837 s.ContentBytes += o.ContentBytes
838
839 // Sourcegraph specific
840 s.NewLinesCount += o.NewLinesCount
841 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
842 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
843}
844
// RepoListEntry bundles the metadata and statistics reported for one entry
// of RepoList.Repos.
type RepoListEntry struct {
	// Repository is the repository metadata.
	Repository Repository
	// IndexMetadata is the metadata stored in the index file.
	IndexMetadata IndexMetadata
	// Stats holds the statistics for this entry.
	Stats RepoStats
}
850
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
880
881type ReposMap map[uint32]MinimalRepoListEntry
882
883// MarshalBinary implements a specialized encoder for ReposMap.
884func (q *ReposMap) MarshalBinary() ([]byte, error) {
885 return reposMapEncode(*q)
886}
887
888// UnmarshalBinary implements a specialized decoder for ReposMap.
889func (q *ReposMap) UnmarshalBinary(b []byte) error {
890 var err error
891 (*q), err = reposMapDecode(b)
892 return err
893}
894
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// populated depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a count of crashes.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
909
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases resources held by the implementation —
	// confirm against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
921
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos populates RepoList.Repos. It is the zero value and
	// therefore the default.
	RepoListFieldRepos RepoListField = 0

	// RepoListFieldReposMap populates RepoList.ReposMap.
	//
	// The type is spelled out explicitly: a constant with its own "= value"
	// does not inherit the type of the preceding line, and would otherwise be
	// an untyped integer constant rather than a RepoListField.
	//
	// The value 1 is skipped on purpose; it used to be RepoListFieldMinimal,
	// which is no longer supported.
	RepoListFieldReposMap RepoListField = 2
)
928
// ListOptions holds the options for a List request. A nil *ListOptions is
// accepted by GetField and treated as RepoListFieldRepos.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
933
934func (o *ListOptions) GetField() (RepoListField, error) {
935 if o == nil {
936 return RepoListFieldRepos, nil
937 }
938 switch o.Field {
939 case RepoListFieldRepos, RepoListFieldReposMap:
940 return o.Field, nil
941 case 1:
942 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
943 default:
944 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
945 }
946}
947
// String returns the Go-syntax representation of o, as produced by the %#v
// verb. It is applied to the pointer itself, so a nil receiver is formatted
// rather than dereferenced.
func (o *ListOptions) String() string {
	return fmt.Sprintf("%#v", o)
}
951
// SearchOptions holds the options for a search request. SetDefaults fills in
// the match count limits that were left at zero.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	// SetDefaults replaces a zero value with 100000.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	// SetDefaults replaces a zero value with 10 * ShardMaxMatchCount.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
	// scoring formula. The scoring algorithm treats each match in a file as a term
	// and computes an approximation to BM25. When enabled, BM25 scoring is used for
	// the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores.
	//
	// The calculation of IDF assumes that Zoekt visits all documents containing any
	// of the query terms during evaluation. This is true, for example, if all query
	// terms are ORed together.
	//
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}
1022
1023func (o *SearchOptions) SetDefaults() {
1024 if o.ShardMaxMatchCount == 0 {
1025 // We cap the total number of matches, so overly broad
1026 // searches don't crash the machine.
1027 o.ShardMaxMatchCount = 100000
1028 }
1029 if o.TotalMaxMatchCount == 0 {
1030 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
1031 }
1032}
1033
1034// String returns a succinct representation of the options. This is meant for
1035// human consumption in logs and traces.
1036//
1037// Note: some tracing systems have limits on length of values, so we take care
1038// to try and make this small, and include the important information near the
1039// front incase of truncation.
1040func (s *SearchOptions) String() string {
1041 var b strings.Builder
1042
1043 add := func(name, value string) {
1044 b.WriteString(name)
1045 b.WriteByte('=')
1046 b.WriteString(value)
1047 b.WriteByte(' ')
1048 }
1049 addInt := func(name string, value int) {
1050 if value != 0 {
1051 add(name, strconv.Itoa(value))
1052 }
1053 }
1054 addDuration := func(name string, value time.Duration) {
1055 if value != 0 {
1056 add(name, value.String())
1057 }
1058 }
1059 addBool := func(name string, value bool) {
1060 if !value {
1061 return
1062 }
1063 b.WriteString(name)
1064 b.WriteByte(' ')
1065 }
1066
1067 b.WriteString("zoekt.SearchOptions{ ")
1068
1069 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1070 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1071 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1072 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1073 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1074 addInt("NumContextLines", s.NumContextLines)
1075
1076 addDuration("MaxWallTime", s.MaxWallTime)
1077 addDuration("FlushWallTime", s.FlushWallTime)
1078
1079 addBool("EstimateDocCount", s.EstimateDocCount)
1080 addBool("Whole", s.Whole)
1081 addBool("ChunkMatches", s.ChunkMatches)
1082 addBool("UseBM25Scoring", s.UseBM25Scoring)
1083 addBool("Trace", s.Trace)
1084 addBool("DebugScore", s.DebugScore)
1085
1086 for k, v := range s.SpanContext {
1087 add("SpanContext."+k, strconv.Quote(v))
1088 }
1089
1090 b.WriteByte('}')
1091 return b.String()
1092}
1093
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send delivers one SearchResult. It returns nothing, so a Sender has no
	// way to report failure to the caller.
	Send(*SearchResult)
}
1098
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1107
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch runs the query q with the given options and passes results
	// to sender instead of returning them.
	// NOTE(review): presumably sender.Send may be called several times per
	// search — confirm against the implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}