// Fork of https://github.com/sourcegraph/zoekt
//
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zoekt // import "github.com/sourcegraph/zoekt"

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)

// Approximate sizes, in bytes, of Go headers and machine words. They are the
// building blocks of the sizeBytes estimators in this file, which assume a
// 64-bit architecture and ignore alignment padding.
const (
	// mapHeaderBytes is the fixed overhead charged per map, excluding entries.
	mapHeaderBytes uint64 = 48
	// sliceHeaderBytes is the overhead charged per slice, excluding elements.
	sliceHeaderBytes uint64 = 24
	// stringHeaderBytes is the overhead charged per string, excluding its bytes.
	stringHeaderBytes uint64 = 16
	// pointerSize is the size charged per pointer.
	pointerSize uint64 = 8
	// interfaceBytes is the size charged per interface value. It is not
	// referenced by the estimators visible in this file.
	interfaceBytes uint64 = 16
)

38// FileMatch contains all the matches within a file.
39type FileMatch struct {
40 FileName string
41
42 // Repository is the globally unique name of the repo of the
43 // match
44 Repository string
45
46 // SubRepositoryName is the globally unique name of the repo,
47 // if it came from a subrepository
48 SubRepositoryName string `json:",omitempty"`
49
50 // SubRepositoryPath holds the prefix where the subrepository
51 // was mounted.
52 SubRepositoryPath string `json:",omitempty"`
53
54 // Commit SHA1 (hex) of the (sub)repo holding the file.
55 Version string `json:",omitempty"`
56
57 // Detected language of the result.
58 Language string
59
60 // For debugging. Needs DebugScore set, but public so tests in
61 // other packages can print some diagnostics.
62 Debug string `json:",omitempty"`
63
64 Branches []string `json:",omitempty"`
65
66 // One of LineMatches or ChunkMatches will be returned depending on whether
67 // the SearchOptions.ChunkMatches is set.
68 LineMatches []LineMatch `json:",omitempty"`
69 ChunkMatches []ChunkMatch `json:",omitempty"`
70
71 // Only set if requested
72 Content []byte `json:",omitempty"`
73
74 // Checksum of the content.
75 Checksum []byte
76
77 // Ranking; the higher, the better.
78 Score float64 `json:",omitempty"`
79
80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
81 // order results from different repositories relative to each other.
82 RepositoryPriority float64 `json:",omitempty"`
83
84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
85 // Sourcegraph.
86 RepositoryID uint32 `json:",omitempty"`
87}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
139// lines in the file.
140type ChunkMatch struct {
141 DebugScore string
142
143 // Content is a contiguous range of complete lines that fully contains Ranges.
144 Content []byte
145
146 // Ranges is a set of matching ranges within this chunk. Each range is relative
147 // to the beginning of the file (not the beginning of Content).
148 Ranges []Range
149
150 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
151 // its length will equal that of Ranges. Any of its elements may be nil.
152 SymbolInfo []*Symbol
153
154 // FileName indicates whether this match is a match on the file name, in
155 // which case Content will contain the file name.
156 FileName bool
157
158 // ContentStart is the location (inclusive) of the beginning of content
159 // relative to the beginning of the file. It will always be at the
160 // beginning of a line (Column will always be 1).
161 ContentStart Location
162
163 Score float64
164}
165
166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
167 // Content
168 sz += sliceHeaderBytes + uint64(len(cm.Content))
169
170 // ContentStart
171 sz += cm.ContentStart.sizeBytes()
172
173 // FileName
174 sz += 1
175
176 // Ranges
177 sz += sliceHeaderBytes
178 if len(cm.Ranges) > 0 {
179 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
180 }
181
182 // SymbolInfo
183 sz += sliceHeaderBytes
184 for _, si := range cm.SymbolInfo {
185 sz += pointerSize
186 if si != nil {
187 sz += si.sizeBytes()
188 }
189 }
190
191 // Score
192 sz += 8
193
194 // DebugScore
195 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
196
197 return
198}
199
200type Range struct {
201 // The inclusive beginning of the range.
202 Start Location
203 // The exclusive end of the range.
204 End Location
205}
206
207func (r *Range) sizeBytes() uint64 {
208 return r.Start.sizeBytes() + r.End.sizeBytes()
209}
210
// Location is a position in a file, expressed three equivalent ways.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the size of l in bytes: three uint32 fields.
func (l *Location) sizeBytes() uint64 {
	return 3 * 4
}

224// LineMatch holds the matches within a single line in a file.
225type LineMatch struct {
226 // The line in which a match was found.
227 Line []byte
228 LineStart int
229 LineEnd int
230 LineNumber int
231
232 // Before and After are only set when SearchOptions.NumContextLines is > 0
233 Before []byte
234 After []byte
235
236 // If set, this was a match on the filename.
237 FileName bool
238
239 // The higher the better. Only ranks the quality of the match
240 // within the file, does not take rank of file into account
241 Score float64
242 DebugScore string
243
244 LineFragments []LineFragmentMatch
245}
246
247func (lm *LineMatch) sizeBytes() (sz uint64) {
248 // Line
249 sz += sliceHeaderBytes + uint64(len(lm.Line))
250
251 // LineStart, LineEnd, LineNumber
252 sz += 3 * 8
253
254 // Before
255 sz += sliceHeaderBytes + uint64(len(lm.Before))
256
257 // After
258 sz += sliceHeaderBytes + uint64(len(lm.After))
259
260 // FileName
261 sz += 1
262
263 // Score
264 sz += 8
265
266 // DebugScore
267 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
268
269 // LineFragments
270 sz += sliceHeaderBytes
271 for _, lf := range lm.LineFragments {
272 sz += lf.sizeBytes()
273 }
274
275 return
276}
277
278type Symbol struct {
279 Sym string
280 Kind string
281 Parent string
282 ParentKind string
283}
284
285func (s *Symbol) sizeBytes() uint64 {
286 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
287}
288
289// LineFragmentMatch a segment of matching text within a line.
290type LineFragmentMatch struct {
291 // Offset within the line, in bytes.
292 LineOffset int
293
294 // Offset from file start, in bytes.
295 Offset uint32
296
297 // Number bytes that match.
298 MatchLength int
299
300 SymbolInfo *Symbol
301}
302
303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
304 // LineOffset
305 sz += 8
306
307 // Offset
308 sz += 4
309
310 // MatchLength
311 sz += 8
312
313 // SymbolInfo
314 sz += pointerSize
315 if lfm.SymbolInfo != nil {
316 sz += lfm.SymbolInfo.sizeBytes()
317 }
318
319 return
320}
321
// FlushReason records why a batch of results was flushed. The zero value
// means no reason was recorded.
type FlushReason uint8

// The known flush reasons. Each one is a distinct bit (1, 2, 4).
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each known FlushReason to its string form.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the name registered in FlushReasonStrings for fr, or "none"
// for any value without an entry (including zero and combinations of bits).
func (fr FlushReason) String() string {
	name, known := FlushReasonStrings[fr]
	if !known {
		return "none"
	}
	return name
}

344// Stats contains interesting numbers on the search
345type Stats struct {
346 // Amount of I/O for reading contents.
347 ContentBytesLoaded int64
348
349 // Amount of I/O for reading from index.
350 IndexBytesLoaded int64
351
352 // Number of search shards that had a crash.
353 Crashes int
354
355 // Wall clock time for this search
356 Duration time.Duration
357
358 // Number of files containing a match.
359 FileCount int
360
361 // Number of files in shards that we considered.
362 ShardFilesConsidered int
363
364 // Files that we evaluated. Equivalent to files for which all
365 // atom matches (including negations) evaluated to true.
366 FilesConsidered int
367
368 // Files for which we loaded file content to verify substring matches
369 FilesLoaded int
370
371 // Candidate files whose contents weren't examined because we
372 // gathered enough matches.
373 FilesSkipped int
374
375 // Shards that we scanned to find matches.
376 ShardsScanned int
377
378 // Shards that we did not process because a query was canceled.
379 ShardsSkipped int
380
381 // Shards that we did not process because the query was rejected by the
382 // ngram filter indicating it had no matches.
383 ShardsSkippedFilter int
384
385 // Number of non-overlapping matches
386 MatchCount int
387
388 // Number of candidate matches as a result of searching ngrams.
389 NgramMatches int
390
391 // NgramLookups is the number of times we accessed an ngram in the index.
392 NgramLookups int
393
394 // Wall clock time for queued search.
395 Wait time.Duration
396
397 // Aggregate wall clock time spent constructing and pruning the match tree.
398 // This accounts for time such as lookups in the trigram index.
399 MatchTreeConstruction time.Duration
400
401 // Aggregate wall clock time spent searching the match tree. This accounts
402 // for the bulk of search work done looking for matches.
403 MatchTreeSearch time.Duration
404
405 // Number of times regexp was called on files that we evaluated.
406 RegexpsConsidered int
407
408 // FlushReason explains why results were flushed.
409 FlushReason FlushReason
410}
411
412func (s *Stats) sizeBytes() (sz uint64) {
413 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
414 sz += 1 // FlushReason
415
416 return
417}
418
419func (s *Stats) Add(o Stats) {
420 s.ContentBytesLoaded += o.ContentBytesLoaded
421 s.IndexBytesLoaded += o.IndexBytesLoaded
422 s.Crashes += o.Crashes
423 s.FileCount += o.FileCount
424 s.FilesConsidered += o.FilesConsidered
425 s.FilesLoaded += o.FilesLoaded
426 s.FilesSkipped += o.FilesSkipped
427 s.MatchCount += o.MatchCount
428 s.NgramMatches += o.NgramMatches
429 s.NgramLookups += o.NgramLookups
430 s.ShardFilesConsidered += o.ShardFilesConsidered
431 s.ShardsScanned += o.ShardsScanned
432 s.ShardsSkipped += o.ShardsSkipped
433 s.ShardsSkippedFilter += o.ShardsSkippedFilter
434 s.Wait += o.Wait
435 s.MatchTreeConstruction += o.MatchTreeConstruction
436 s.MatchTreeSearch += o.MatchTreeSearch
437 s.RegexpsConsidered += o.RegexpsConsidered
438
439 // We want the first non-zero FlushReason to be sticky. This is a useful
440 // property when aggregating stats from several Zoekts.
441 if s.FlushReason == 0 {
442 s.FlushReason = o.FlushReason
443 }
444}
445
446// Zero returns true if stats is empty.
447func (s *Stats) Zero() bool {
448 if s == nil {
449 return true
450 }
451
452 return !(s.ContentBytesLoaded > 0 ||
453 s.IndexBytesLoaded > 0 ||
454 s.Crashes > 0 ||
455 s.FileCount > 0 ||
456 s.FilesConsidered > 0 ||
457 s.FilesLoaded > 0 ||
458 s.FilesSkipped > 0 ||
459 s.MatchCount > 0 ||
460 s.NgramMatches > 0 ||
461 s.NgramLookups > 0 ||
462 s.ShardFilesConsidered > 0 ||
463 s.ShardsScanned > 0 ||
464 s.ShardsSkipped > 0 ||
465 s.ShardsSkippedFilter > 0 ||
466 s.Wait > 0 ||
467 s.MatchTreeConstruction > 0 ||
468 s.MatchTreeSearch > 0 ||
469 s.RegexpsConsidered > 0)
470}
471
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the size of p in bytes: two float64 fields.
func (p *Progress) sizeBytes() uint64 {
	return 2 * 8
}

491// SearchResult contains search matches and extra data
492type SearchResult struct {
493 Stats
494
495 // Do not encode this as we cannot encode -Inf in JSON
496 Progress `json:"-"`
497
498 Files []FileMatch
499
500 // RepoURLs holds a repo => template string map.
501 RepoURLs map[string]string
502
503 // FragmentNames holds a repo => template string map, for
504 // the line number fragment.
505 LineFragments map[string]string
506}
507
508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
509// The estimate does not take alignment into account. The result is a lower
510// bound on the actual size in memory.
511func (sr *SearchResult) SizeBytes() (sz uint64) {
512 sz += sr.Stats.sizeBytes()
513 sz += sr.Progress.sizeBytes()
514
515 // Files
516 sz += sliceHeaderBytes
517 for _, f := range sr.Files {
518 sz += f.sizeBytes()
519 }
520
521 // RepoURLs
522 sz += mapHeaderBytes
523 for k, v := range sr.RepoURLs {
524 sz += stringHeaderBytes + uint64(len(k))
525 sz += stringHeaderBytes + uint64(len(v))
526 }
527
528 // LineFragments
529 sz += mapHeaderBytes
530 for k, v := range sr.LineFragments {
531 sz += stringHeaderBytes + uint64(len(k))
532 sz += stringHeaderBytes + uint64(len(v))
533 }
534
535 return
536}
537
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String renders the branch as "Name@Version".
func (r RepositoryBranch) String() string {
	return r.Name + "@" + r.Version
}

549// Repository holds repository metadata.
550type Repository struct {
551 // Sourcegraph's repository ID
552 ID uint32
553
554 // The repository name
555 Name string
556
557 // The repository URL.
558 URL string
559
560 // The physical source where this repo came from, eg. full
561 // path to the zip filename or git repository directory. This
562 // will not be exposed in the UI, but can be used to detect
563 // orphaned index shards.
564 Source string
565
566 // The branches indexed in this repo.
567 Branches []RepositoryBranch
568
569 // Nil if this is not the super project.
570 SubRepoMap map[string]*Repository
571
572 // URL template to link to the commit of a branch
573 CommitURLTemplate string
574
575 // The repository URL for getting to a file. Has access to
576 // {{.Version}}, {{.Path}}
577 FileURLTemplate string
578
579 // The URL fragment to add to a file URL for line numbers. has
580 // access to {{.LineNumber}}. The fragment should include the
581 // separator, generally '#' or ';'.
582 LineFragmentTemplate string
583
584 // Perf optimization: priority is set when we load the shard. It corresponds to
585 // the value of "priority" stored in RawConfig.
586 priority float64
587
588 // All zoekt.* configuration settings.
589 RawConfig map[string]string
590
591 // Importance of the repository, bigger is more important
592 Rank uint16
593
594 // IndexOptions is a hash of the options used to create the index for the
595 // repo.
596 IndexOptions string
597
598 // HasSymbols is true if this repository has indexed ctags
599 // output. Sourcegraph specific: This field is more appropriate for
600 // IndexMetadata. However, we store it here since the Sourcegraph frontend
601 // can read this structure but not IndexMetadata.
602 HasSymbols bool
603
604 // Tombstone is true if we are not allowed to search this repo.
605 Tombstone bool
606
607 // LatestCommitDate is the date of the latest commit among all indexed Branches.
608 // The date might be time.Time's 0-value if the repository was last indexed
609 // before this field was added.
610 LatestCommitDate time.Time
611
612 // FileTombstones is a set of file paths that should be ignored across all branches
613 // in this shard.
614 FileTombstones map[string]struct{} `json:",omitempty"`
615}
616
617func (r *Repository) UnmarshalJSON(data []byte) error {
618 // We define a new type so that we can use json.Unmarshal
619 // without recursing into this same method.
620 type repository *Repository
621 repo := repository(r)
622
623 err := json.Unmarshal(data, repo)
624 if err != nil {
625 return err
626 }
627
628 if v, ok := repo.RawConfig["repoid"]; ok {
629 id, _ := strconv.ParseUint(v, 10, 32)
630 r.ID = uint32(id)
631 }
632
633 if v, ok := repo.RawConfig["priority"]; ok {
634 r.priority, err = strconv.ParseFloat(v, 64)
635 if err != nil {
636 r.priority = 0
637 }
638
639 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
640 // based on priority. Setting it on read instead of during indexing
641 // allows us to avoid a complete reindex.
642 if r.Rank == 0 && r.priority > 0 {
643 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
644 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
645 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
646 }
647 }
648 return nil
649}
650
651// MergeMutable will merge x into r. mutated will be true if it made any
652// changes. err is non-nil if we needed to mutate an immutable field.
653//
654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
655// computed while indexing so can't be synthesized from x.
656//
657// Note: We ignore RawConfig fields which are duplicated into Repository:
658// name and id.
659//
660// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
661func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
662 if r.ID != x.ID {
663 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
664 return mutated, errors.New("ID is immutable")
665 }
666 if r.Name != x.Name {
667 // Name is encoded into the shard name on disk. We need to re-index if it
668 // changes.
669 return mutated, errors.New("Name is immutable")
670 }
671 if !reflect.DeepEqual(r.Branches, x.Branches) {
672 // Need a reindex if content changing.
673 return mutated, errors.New("Branches is immutable")
674 }
675
676 for k, v := range x.RawConfig {
677 // We ignore name and id since they are encoded into the repository.
678 if k == "name" || k == "id" {
679 continue
680 }
681 if r.RawConfig == nil {
682 mutated = true
683 r.RawConfig = make(map[string]string)
684 }
685 if r.RawConfig[k] != v {
686 mutated = true
687 r.RawConfig[k] = v
688 }
689 }
690
691 return mutated, nil
692}
693
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are undocumented in this file. Going by
// their names, the three Index*Version fields describe the on-disk format and
// the minimum reader able to load it, IndexTime is when the index was built,
// and LanguageMap maps language names to numeric IDs - confirm against the
// index writer before relying on any of this.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}

// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}

// Add accumulates o into s. o must be non-nil. Repos is deliberately left
// alone: one repo may have multiple shards, so it cannot be derived by
// summing per-shard stats.
func (s *RepoStats) Add(o *RepoStats) {
	s.Shards += o.Shards
	s.Documents += o.Documents
	s.IndexBytes += o.IndexBytes
	s.ContentBytes += o.ContentBytes

	// Sourcegraph specific
	s.NewLinesCount += o.NewLinesCount
	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
}

765type RepoListEntry struct {
766 Repository Repository
767 IndexMetadata IndexMetadata
768 Stats RepoStats
769}
770
771// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
772// performance profiling of sourcegraph.com revealed that querying this
773// information from Zoekt was causing lots of CPU and memory usage. Note: we
774// can revisit this, how we store and query this information has changed a lot
775// since this was introduced.
776type MinimalRepoListEntry struct {
777 // HasSymbols is exported since Sourcegraph uses this information at search
778 // planning time to decide between Zoekt and an unindexed symbol search.
779 //
780 // Note: it pretty much is always true in practice.
781 HasSymbols bool
782
783 // Branches is used by Sourcegraphs query planner to decided if it can use
784 // zoekt or go via an unindexed code path.
785 Branches []RepositoryBranch
786
787 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds
788 // since the epoch). This is to make it clear we are not transporting the
789 // full fidelty timestamp (ie with milliseconds and location). Additionally
790 // it saves 16 bytes in this struct.
791 //
792 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
793 // how many repositories need updating after a ranking change/etc.
794 //
795 // TODO(keegancsmith) audit updates to IndexTime and document how and when
796 // it changes. Concerned about things like metadata updates or compound
797 // shards leading to untrustworthy data here.
798 IndexTimeUnix int64
799}
800
801type ReposMap map[uint32]MinimalRepoListEntry
802
803// MarshalBinary implements a specialized encoder for ReposMap.
804func (q *ReposMap) MarshalBinary() ([]byte, error) {
805 return reposMapEncode(*q)
806}
807
808// UnmarshalBinary implements a specialized decoder for ReposMap.
809func (q *ReposMap) UnmarshalBinary(b []byte) error {
810 var err error
811 (*q), err = reposMapDecode(b)
812 return err
813}
814
815// RepoList holds a set of Repository metadata.
816type RepoList struct {
817 // Returned when ListOptions.Field is RepoListFieldRepos.
818 Repos []*RepoListEntry
819
820 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
821 ReposMap ReposMap
822
823 Crashes int
824
825 // Stats response to a List request.
826 // This is the aggregate RepoStats of all repos matching the input query.
827 Stats RepoStats
828}
829
830type Searcher interface {
831 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
832
833 // List lists repositories. The query `q` can only contain
834 // query.Repo atoms.
835 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
836 Close()
837
838 // Describe the searcher for debug messages.
839 String() string
840}
841
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

// The supported RepoListField values. Both constants are typed, so they
// cannot be mixed up with plain integers. The value 1 is intentionally
// unassigned: ListOptions.GetField rejects it as the retired
// RepoListFieldMinimal.
const (
	RepoListFieldRepos    RepoListField = 0
	RepoListFieldReposMap RepoListField = 2
)

849type ListOptions struct {
850 // Field decides which field to populate in RepoList response.
851 Field RepoListField
852}
853
854func (o *ListOptions) GetField() (RepoListField, error) {
855 if o == nil {
856 return RepoListFieldRepos, nil
857 }
858 switch o.Field {
859 case RepoListFieldRepos, RepoListFieldReposMap:
860 return o.Field, nil
861 case 1:
862 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
863 default:
864 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
865 }
866}
867
868func (o *ListOptions) String() string {
869 return fmt.Sprintf("%#v", o)
870}
871
// SearchOptions holds the options of a search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Deprecated: this field is not read anymore.
	ShardMaxImportantMatch int

	// Deprecated: this field is not read anymore.
	TotalMaxImportantMatch int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseKeywordScoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}

// String returns a succinct representation of the options. This is meant for
// human consumption in logs and traces.
//
// Only non-zero options are rendered: numbers and durations as "Name=value",
// booleans as a bare "Name". SpanContext entries come last, sorted by key so
// that equal options always render identically. A nil receiver renders as
// "<nil>".
//
// Note: some tracing systems have limits on length of values, so we take care
// to try and make this small, and include the important information near the
// front in case of truncation.
func (s *SearchOptions) String() string {
	if s == nil {
		return "<nil>"
	}

	var b strings.Builder

	add := func(name, value string) {
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(value)
		b.WriteByte(' ')
	}
	addInt := func(name string, value int) {
		if value != 0 {
			add(name, strconv.Itoa(value))
		}
	}
	addDuration := func(name string, value time.Duration) {
		if value != 0 {
			add(name, value.String())
		}
	}
	addBool := func(name string, value bool) {
		if value {
			b.WriteString(name)
			b.WriteByte(' ')
		}
	}

	b.WriteString("zoekt.SearchOptions{ ")

	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
	addInt("ShardMaxImportantMatch", s.ShardMaxImportantMatch)
	addInt("TotalMaxImportantMatch", s.TotalMaxImportantMatch)
	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
	addInt("NumContextLines", s.NumContextLines)

	addDuration("MaxWallTime", s.MaxWallTime)
	addDuration("FlushWallTime", s.FlushWallTime)

	if s.DocumentRanksWeight > 0 {
		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
	}

	addBool("EstimateDocCount", s.EstimateDocCount)
	addBool("Whole", s.Whole)
	addBool("ChunkMatches", s.ChunkMatches)
	addBool("UseDocumentRanks", s.UseDocumentRanks)
	addBool("UseKeywordScoring", s.UseKeywordScoring)
	addBool("Trace", s.Trace)
	addBool("DebugScore", s.DebugScore)

	// Map iteration order is random; sort the keys for a stable rendering.
	keys := make([]string, 0, len(s.SpanContext))
	for k := range s.SpanContext {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		add("SpanContext."+k, strconv.Quote(s.SpanContext[k]))
	}

	b.WriteByte('}')
	return b.String()
}

1018// Sender is the interface that wraps the basic Send method.
1019type Sender interface {
1020 Send(*SearchResult)
1021}
1022
1023// Streamer adds the method StreamSearch to the Searcher interface.
1024type Streamer interface {
1025 Searcher
1026 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1027}