fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Byte sizes of the runtime headers and words that the sizeBytes estimators
// in this file add up. The values correspond to a 64-bit architecture.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
35
// FileMatch contains all the matches within a file, together with metadata
// identifying the file and the repository it belongs to.
type FileMatch struct {
	// FileName is the name of the matched file.
	// NOTE(review): presumably a path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string

	// Branches holds branch names associated with this match.
	// NOTE(review): presumably the branches on which this file was found — confirm.
	Branches []string

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch
	ChunkMatches []ChunkMatch

	// Content is only set if requested.
	Content []byte

	// Checksum is the checksum of the content.
	Checksum []byte

	// Score is the ranking; the higher, the better.
	Score float64 // TODO - hide this field?

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32
}
86
87func (m *FileMatch) sizeBytes() (sz uint64) {
88 // Score
89 sz += 8
90
91 for _, s := range []string{
92 m.Debug,
93 m.FileName,
94 m.Repository,
95 m.Language,
96 m.SubRepositoryName,
97 m.SubRepositoryPath,
98 m.Version,
99 } {
100 sz += stringHeaderBytes + uint64(len(s))
101 }
102
103 // Branches
104 sz += sliceHeaderBytes
105 for _, s := range m.Branches {
106 sz += stringHeaderBytes + uint64(len(s))
107 }
108
109 // LineMatches
110 sz += sliceHeaderBytes
111 for _, lm := range m.LineMatches {
112 sz += lm.sizeBytes()
113 }
114
115 // ChunkMatches
116 sz += sliceHeaderBytes
117 for _, cm := range m.ChunkMatches {
118 sz += cm.sizeBytes()
119 }
120
121 // RepositoryID
122 sz += 4
123
124 // RepositoryPriority
125 sz += 8
126
127 // Content
128 sz += sliceHeaderBytes + uint64(len(m.Content))
129
130 // Checksum
131 sz += sliceHeaderBytes + uint64(len(m.Checksum))
132
133 return
134}
135
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore carries scoring diagnostics as text.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score assigned to this chunk.
	// NOTE(review): presumably higher is better, as for LineMatch.Score — confirm.
	Score float64
}
163
164func (cm *ChunkMatch) sizeBytes() (sz uint64) {
165 // Content
166 sz += sliceHeaderBytes + uint64(len(cm.Content))
167
168 // ContentStart
169 sz += cm.ContentStart.sizeBytes()
170
171 // FileName
172 sz += 1
173
174 // Ranges
175 sz += sliceHeaderBytes
176 if len(cm.Ranges) > 0 {
177 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
178 }
179
180 // SymbolInfo
181 sz += sliceHeaderBytes
182 for _, si := range cm.SymbolInfo {
183 sz += pointerSize
184 if si != nil {
185 sz += si.sizeBytes()
186 }
187 }
188
189 // Score
190 sz += 8
191
192 // DebugScore
193 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
194
195 return
196}
197
// Range is a half-open span between two Locations: Start is included and End
// is excluded.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}
204
205func (r *Range) sizeBytes() uint64 {
206 return r.Start.sizeBytes() + r.End.sizeBytes()
207}
208
// Location identifies a position in a file by byte offset, line number and
// column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}
217
218func (l *Location) sizeBytes() uint64 {
219 return 3 * 4
220}
221
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// LineStart and LineEnd delimit the line.
	// NOTE(review): presumably byte offsets into the file — confirm against the producer.
	LineStart int
	LineEnd   int
	// LineNumber is the number of the line within the file.
	// NOTE(review): presumably 1-based, like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score float64
	// DebugScore carries scoring diagnostics as text.
	DebugScore string

	// LineFragments are the matching segments within Line.
	LineFragments []LineFragmentMatch
}
244
245func (lm *LineMatch) sizeBytes() (sz uint64) {
246 // Line
247 sz += sliceHeaderBytes + uint64(len(lm.Line))
248
249 // LineStart, LineEnd, LineNumber
250 sz += 3 * 8
251
252 // Before
253 sz += sliceHeaderBytes + uint64(len(lm.Before))
254
255 // After
256 sz += sliceHeaderBytes + uint64(len(lm.After))
257
258 // FileName
259 sz += 1
260
261 // Score
262 sz += 8
263
264 // DebugScore
265 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
266
267 // LineFragments
268 sz += sliceHeaderBytes
269 for _, lf := range lm.LineFragments {
270 sz += lf.sizeBytes()
271 }
272
273 return
274}
275
// Symbol holds symbol information attached to a match (see
// ChunkMatch.SymbolInfo and LineFragmentMatch.SymbolInfo).
// NOTE(review): field meanings below are inferred from their names — confirm
// against the code that populates them.
type Symbol struct {
	// Sym is presumably the symbol's name.
	Sym string
	// Kind is presumably the symbol's kind.
	Kind string
	// Parent is presumably the name of the enclosing symbol.
	Parent string
	// ParentKind is presumably the kind of the enclosing symbol.
	ParentKind string
}
282
283func (s *Symbol) sizeBytes() uint64 {
284 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
285}
286
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment. It may be nil.
	SymbolInfo *Symbol
}
300
301func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
302 // LineOffset
303 sz += 8
304
305 // Offset
306 sz += 4
307
308 // MatchLength
309 sz += 8
310
311 // SymbolInfo
312 sz += pointerSize
313 if lfm.SymbolInfo != nil {
314 sz += lfm.SymbolInfo.sizeBytes()
315 }
316
317 return
318}
319
// FlushReason explains why results were flushed (see Stats.FlushReason). The
// zero value means no reason has been recorded.
type FlushReason uint8

// The defined flush reasons. Each constant is a distinct power of two
// (1, 2 and 4) because they are declared with 1 << iota.
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)
327
// FlushReasonStrings maps each defined FlushReason to the name returned by
// FlushReason.String.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}
333
334func (fr FlushReason) String() string {
335 if v, ok := FlushReasonStrings[fr]; ok {
336 return v
337 }
338
339 return "none"
340}
341
// Stats contains interesting numbers on the search. Apart from FlushReason,
// every field is an int, int64 or time.Duration.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Note that Stats.Add does not
	// accumulate this field.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
409
410func (s *Stats) sizeBytes() (sz uint64) {
411 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
412 sz += 1 // FlushReason
413
414 return
415}
416
417func (s *Stats) Add(o Stats) {
418 s.ContentBytesLoaded += o.ContentBytesLoaded
419 s.IndexBytesLoaded += o.IndexBytesLoaded
420 s.Crashes += o.Crashes
421 s.FileCount += o.FileCount
422 s.FilesConsidered += o.FilesConsidered
423 s.FilesLoaded += o.FilesLoaded
424 s.FilesSkipped += o.FilesSkipped
425 s.MatchCount += o.MatchCount
426 s.NgramMatches += o.NgramMatches
427 s.NgramLookups += o.NgramLookups
428 s.ShardFilesConsidered += o.ShardFilesConsidered
429 s.ShardsScanned += o.ShardsScanned
430 s.ShardsSkipped += o.ShardsSkipped
431 s.ShardsSkippedFilter += o.ShardsSkippedFilter
432 s.Wait += o.Wait
433 s.MatchTreeConstruction += o.MatchTreeConstruction
434 s.MatchTreeSearch += o.MatchTreeSearch
435 s.RegexpsConsidered += o.RegexpsConsidered
436
437 // We want the first non-zero FlushReason to be sticky. This is a useful
438 // property when aggregating stats from several Zoekts.
439 if s.FlushReason == 0 {
440 s.FlushReason = o.FlushReason
441 }
442}
443
444// Zero returns true if stats is empty.
445func (s *Stats) Zero() bool {
446 if s == nil {
447 return true
448 }
449
450 return !(s.ContentBytesLoaded > 0 ||
451 s.IndexBytesLoaded > 0 ||
452 s.Crashes > 0 ||
453 s.FileCount > 0 ||
454 s.FilesConsidered > 0 ||
455 s.FilesLoaded > 0 ||
456 s.FilesSkipped > 0 ||
457 s.MatchCount > 0 ||
458 s.NgramMatches > 0 ||
459 s.NgramLookups > 0 ||
460 s.ShardFilesConsidered > 0 ||
461 s.ShardsScanned > 0 ||
462 s.ShardsSkipped > 0 ||
463 s.ShardsSkippedFilter > 0 ||
464 s.Wait > 0 ||
465 s.MatchTreeConstruction > 0 ||
466 s.MatchTreeSearch > 0 ||
467 s.RegexpsConsidered > 0)
468}
469
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending results that are being searched in parallel.
	// This is used to reorder results when the result set is known to be stable, that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}
484
485func (p *Progress) sizeBytes() uint64 {
486 return 2 * 8
487}
488
// SearchResult contains search matches and extra data. Stats and Progress are
// embedded, so their fields are promoted onto SearchResult.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds the matches, one entry per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
505
506// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
507// The estimate does not take alignment into account. The result is a lower
508// bound on the actual size in memory.
509func (sr *SearchResult) SizeBytes() (sz uint64) {
510 sz += sr.Stats.sizeBytes()
511 sz += sr.Progress.sizeBytes()
512
513 // Files
514 sz += sliceHeaderBytes
515 for _, f := range sr.Files {
516 sz += f.sizeBytes()
517 }
518
519 // RepoURLs
520 sz += mapHeaderBytes
521 for k, v := range sr.RepoURLs {
522 sz += stringHeaderBytes + uint64(len(k))
523 sz += stringHeaderBytes + uint64(len(v))
524 }
525
526 // LineFragments
527 sz += mapHeaderBytes
528 for k, v := range sr.LineFragments {
529 sz += stringHeaderBytes + uint64(len(k))
530 sz += stringHeaderBytes + uint64(len(v))
531 }
532
533 return
534}
535
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	// Name is the branch name.
	Name string
	// Version is the version the branch was indexed at.
	// NOTE(review): presumably a commit SHA — confirm.
	Version string
}
542
543func (r RepositoryBranch) String() string {
544 return fmt.Sprintf("%s@%s", r.Name, r.Version)
545}
546
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. Has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig. Being unexported, it is not
	// part of the JSON encoding.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
614
615func (r *Repository) UnmarshalJSON(data []byte) error {
616 // We define a new type so that we can use json.Unmarshal
617 // without recursing into this same method.
618 type repository *Repository
619 repo := repository(r)
620
621 err := json.Unmarshal(data, repo)
622 if err != nil {
623 return err
624 }
625
626 if v, ok := repo.RawConfig["repoid"]; ok {
627 id, _ := strconv.ParseUint(v, 10, 32)
628 r.ID = uint32(id)
629 }
630
631 if v, ok := repo.RawConfig["priority"]; ok {
632 r.priority, err = strconv.ParseFloat(v, 64)
633 if err != nil {
634 r.priority = 0
635 }
636
637 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
638 // based on priority. Setting it on read instead of during indexing
639 // allows us to avoid a complete reindex.
640 if r.Rank == 0 && r.priority > 0 {
641 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
642 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
643 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
644 }
645 }
646 return nil
647}
648
649// MergeMutable will merge x into r. mutated will be true if it made any
650// changes. err is non-nil if we needed to mutate an immutable field.
651//
652// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
653// computed while indexing so can't be synthesized from x.
654//
655// Note: We ignore RawConfig fields which are duplicated into Repository:
656// name and id.
657//
658// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
659func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
660 if r.ID != x.ID {
661 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
662 return mutated, errors.New("ID is immutable")
663 }
664 if r.Name != x.Name {
665 // Name is encoded into the shard name on disk. We need to re-index if it
666 // changes.
667 return mutated, errors.New("Name is immutable")
668 }
669 if !reflect.DeepEqual(r.Branches, x.Branches) {
670 // Need a reindex if content changing.
671 return mutated, errors.New("Branches is immutable")
672 }
673
674 for k, v := range x.RawConfig {
675 // We ignore name and id since they are encoded into the repository.
676 if k == "name" || k == "id" {
677 continue
678 }
679 if r.RawConfig == nil {
680 mutated = true
681 r.RawConfig = make(map[string]string)
682 }
683 if r.RawConfig[k] != v {
684 mutated = true
685 r.RawConfig[k] = v
686 }
687 }
688
689 return mutated, nil
690}
691
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
// NOTE(review): the per-field notes below are inferred from field names and
// types — confirm against the indexing code.
type IndexMetadata struct {
	// Version numbers describing the index format, its feature level and the
	// minimum reader version presumably required to load it.
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// IndexTime is presumably when the index was built.
	IndexTime time.Time
	// PlainASCII presumably reports whether the indexed content is ASCII only.
	PlainASCII bool
	// LanguageMap maps a string key (presumably a language name) to a uint16 code.
	LanguageMap map[string]uint16
	// ZoektVersion is presumably the version of zoekt that wrote the index.
	ZoektVersion string
	// ID is an identifier for the index.
	ID string
}
704
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines are only
	// counted once. See DefaultBranchNewLinesCount and
	// OtherBranchesNewLinesCount for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
748
749func (s *RepoStats) Add(o *RepoStats) {
750 // can't update Repos, since one repo may have multiple
751 // shards.
752 s.Shards += o.Shards
753 s.IndexBytes += o.IndexBytes
754 s.Documents += o.Documents
755 s.ContentBytes += o.ContentBytes
756
757 // Sourcegraph specific
758 s.NewLinesCount += o.NewLinesCount
759 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
760 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
761}
762
// RepoListEntry bundles a repository's metadata, the metadata of its index
// and its statistics. It is the element type of RepoList.Repos.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
768
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
798
// ReposMap maps a uint32 key to a MinimalRepoListEntry. It has specialized
// binary encoding via MarshalBinary and UnmarshalBinary.
// NOTE(review): the key is presumably the Sourcegraph repository ID — confirm.
type ReposMap map[uint32]MinimalRepoListEntry
800
801// MarshalBinary implements a specialized encoder for ReposMap.
802func (q *ReposMap) MarshalBinary() ([]byte, error) {
803 return reposMapEncode(*q)
804}
805
806// UnmarshalBinary implements a specialized decoder for ReposMap.
807func (q *ReposMap) UnmarshalBinary(b []byte) error {
808 var err error
809 (*q), err = reposMapDecode(b)
810 return err
811}
812
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// populated depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash count for the request.
	// NOTE(review): presumably the number of shards that crashed, as in Stats.Crashes — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
827
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the searcher's resources — confirm with implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
839
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos populates RepoList.Repos.
	RepoListFieldRepos RepoListField = 0

	// The value 1 is intentionally unused: it was RepoListFieldMinimal, which
	// is no longer supported.

	// RepoListFieldReposMap populates RepoList.ReposMap. The type is spelled
	// out because a const spec with its own value and no type would otherwise
	// be an untyped constant.
	RepoListFieldReposMap RepoListField = 2
)
846
// ListOptions holds the options of a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
851
852func (o *ListOptions) GetField() (RepoListField, error) {
853 if o == nil {
854 return RepoListFieldRepos, nil
855 }
856 switch o.Field {
857 case RepoListFieldRepos, RepoListFieldReposMap:
858 return o.Field, nil
859 case 1:
860 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
861 default:
862 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
863 }
864}
865
866func (o *ListOptions) String() string {
867 return fmt.Sprintf("%#v", o)
868}
869
// SearchOptions holds the options of a search request: match limits, time
// limits, result shaping and diagnostics.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Deprecated: this field is not read anymore.
	ShardMaxImportantMatch int

	// Deprecated: this field is not read anymore.
	TotalMaxImportantMatch int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseKeywordScoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}
948
949// String returns a succinct representation of the options. This is meant for
950// human consumption in logs and traces.
951//
952// Note: some tracing systems have limits on length of values, so we take care
953// to try and make this small, and include the important information near the
954// front incase of truncation.
955func (s *SearchOptions) String() string {
956 var b strings.Builder
957
958 add := func(name, value string) {
959 b.WriteString(name)
960 b.WriteByte('=')
961 b.WriteString(value)
962 b.WriteByte(' ')
963 }
964 addInt := func(name string, value int) {
965 if value != 0 {
966 add(name, strconv.Itoa(value))
967 }
968 }
969 addDuration := func(name string, value time.Duration) {
970 if value != 0 {
971 add(name, value.String())
972 }
973 }
974 addBool := func(name string, value bool) {
975 if !value {
976 return
977 }
978 b.WriteString(name)
979 b.WriteByte(' ')
980 }
981
982 b.WriteString("zoekt.SearchOptions{ ")
983
984 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
985 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
986 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
987 addInt("ShardMaxImportantMatch", s.ShardMaxImportantMatch)
988 addInt("TotalMaxImportantMatch", s.TotalMaxImportantMatch)
989 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
990 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
991 addInt("NumContextLines", s.NumContextLines)
992
993 addDuration("MaxWallTime", s.MaxWallTime)
994 addDuration("FlushWallTime", s.FlushWallTime)
995
996 if s.DocumentRanksWeight > 0 {
997 add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
998 }
999
1000 addBool("EstimateDocCount", s.EstimateDocCount)
1001 addBool("Whole", s.Whole)
1002 addBool("ChunkMatches", s.ChunkMatches)
1003 addBool("UseDocumentRanks", s.UseDocumentRanks)
1004 addBool("UseKeywordScoring", s.UseKeywordScoring)
1005 addBool("Trace", s.Trace)
1006 addBool("DebugScore", s.DebugScore)
1007
1008 for k, v := range s.SpanContext {
1009 add("SpanContext."+k, strconv.Quote(v))
1010 }
1011
1012 b.WriteByte('}')
1013 return b.String()
1014}
1015
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands a SearchResult to the receiver. It returns nothing, so it
	// has no way to report a failure to the caller.
	Send(*SearchResult)
}
1020
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch runs the query q with the given options and delivers
	// results through sender rather than as a return value.
	// NOTE(review): presumably sender.Send may be called several times per search — confirm.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}