// Fork of https://github.com/sourcegraph/zoekt

// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Approximate sizes, in bytes, of Go runtime headers and machine words. They
// are the building blocks of the sizeBytes estimators in this file and assume
// a 64-bit architecture.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
37
38// FileMatch contains all the matches within a file.
39type FileMatch struct {
40 FileName string
41
42 // Repository is the globally unique name of the repo of the
43 // match
44 Repository string
45
46 // SubRepositoryName is the globally unique name of the repo,
47 // if it came from a subrepository
48 SubRepositoryName string `json:",omitempty"`
49
50 // SubRepositoryPath holds the prefix where the subrepository
51 // was mounted.
52 SubRepositoryPath string `json:",omitempty"`
53
54 // Commit SHA1 (hex) of the (sub)repo holding the file.
55 Version string `json:",omitempty"`
56
57 // Detected language of the result.
58 Language string
59
60 // For debugging. Needs DebugScore set, but public so tests in
61 // other packages can print some diagnostics.
62 Debug string `json:",omitempty"`
63
64 Branches []string `json:",omitempty"`
65
66 // One of LineMatches or ChunkMatches will be returned depending on whether
67 // the SearchOptions.ChunkMatches is set.
68 LineMatches []LineMatch `json:",omitempty"`
69 ChunkMatches []ChunkMatch `json:",omitempty"`
70
71 // Only set if requested
72 Content []byte `json:",omitempty"`
73
74 // Checksum of the content.
75 Checksum []byte
76
77 // Ranking; the higher, the better.
78 Score float64 `json:",omitempty"`
79
80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
81 // order results from different repositories relative to each other.
82 RepositoryPriority float64 `json:",omitempty"`
83
84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
85 // Sourcegraph.
86 RepositoryID uint32 `json:",omitempty"`
87}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
139// lines in the file.
140type ChunkMatch struct {
141 DebugScore string
142
143 // Content is a contiguous range of complete lines that fully contains Ranges.
144 Content []byte
145
146 // Ranges is a set of matching ranges within this chunk. Each range is relative
147 // to the beginning of the file (not the beginning of Content).
148 Ranges []Range
149
150 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
151 // its length will equal that of Ranges. Any of its elements may be nil.
152 SymbolInfo []*Symbol
153
154 // FileName indicates whether this match is a match on the file name, in
155 // which case Content will contain the file name.
156 FileName bool
157
158 // ContentStart is the location (inclusive) of the beginning of content
159 // relative to the beginning of the file. It will always be at the
160 // beginning of a line (Column will always be 1).
161 ContentStart Location
162
163 Score float64
164}
165
166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
167 // Content
168 sz += sliceHeaderBytes + uint64(len(cm.Content))
169
170 // ContentStart
171 sz += cm.ContentStart.sizeBytes()
172
173 // FileName
174 sz += 1
175
176 // Ranges
177 sz += sliceHeaderBytes
178 if len(cm.Ranges) > 0 {
179 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
180 }
181
182 // SymbolInfo
183 sz += sliceHeaderBytes
184 for _, si := range cm.SymbolInfo {
185 sz += pointerSize
186 if si != nil {
187 sz += si.sizeBytes()
188 }
189 }
190
191 // Score
192 sz += 8
193
194 // DebugScore
195 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
196
197 return
198}
199
200type Range struct {
201 // The inclusive beginning of the range.
202 Start Location
203 // The exclusive end of the range.
204 End Location
205}
206
207func (r *Range) sizeBytes() uint64 {
208 return r.Start.sizeBytes() + r.End.sizeBytes()
209}
210
// Location identifies a position in a file by byte offset, line and column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the size of l in bytes: three uint32 fields. The result
// does not depend on the values stored in l.
func (l *Location) sizeBytes() uint64 {
	return 3 * 4
}
223
224// LineMatch holds the matches within a single line in a file.
225type LineMatch struct {
226 // The line in which a match was found.
227 Line []byte
228 LineStart int
229 LineEnd int
230 LineNumber int
231
232 // Before and After are only set when SearchOptions.NumContextLines is > 0
233 Before []byte
234 After []byte
235
236 // If set, this was a match on the filename.
237 FileName bool
238
239 // The higher the better. Only ranks the quality of the match
240 // within the file, does not take rank of file into account
241 Score float64
242 DebugScore string
243
244 LineFragments []LineFragmentMatch
245}
246
247func (lm *LineMatch) sizeBytes() (sz uint64) {
248 // Line
249 sz += sliceHeaderBytes + uint64(len(lm.Line))
250
251 // LineStart, LineEnd, LineNumber
252 sz += 3 * 8
253
254 // Before
255 sz += sliceHeaderBytes + uint64(len(lm.Before))
256
257 // After
258 sz += sliceHeaderBytes + uint64(len(lm.After))
259
260 // FileName
261 sz += 1
262
263 // Score
264 sz += 8
265
266 // DebugScore
267 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
268
269 // LineFragments
270 sz += sliceHeaderBytes
271 for _, lf := range lm.LineFragments {
272 sz += lf.sizeBytes()
273 }
274
275 return
276}
277
278type Symbol struct {
279 Sym string
280 Kind string
281 Parent string
282 ParentKind string
283}
284
285func (s *Symbol) sizeBytes() uint64 {
286 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
287}
288
289// LineFragmentMatch a segment of matching text within a line.
290type LineFragmentMatch struct {
291 // Offset within the line, in bytes.
292 LineOffset int
293
294 // Offset from file start, in bytes.
295 Offset uint32
296
297 // Number bytes that match.
298 MatchLength int
299
300 SymbolInfo *Symbol
301}
302
303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
304 // LineOffset
305 sz += 8
306
307 // Offset
308 sz += 4
309
310 // MatchLength
311 sz += 8
312
313 // SymbolInfo
314 sz += pointerSize
315 if lfm.SymbolInfo != nil {
316 sz += lfm.SymbolInfo.sizeBytes()
317 }
318
319 return
320}
321
// FlushReason explains why results were flushed. The zero value means no
// reason was recorded.
type FlushReason uint8

// Flush reasons. Each value is a distinct bit (1, 2, 4).
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each known FlushReason to its string form.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the entry of FlushReasonStrings for fr, or "none" when fr
// has no entry (including the zero value and combinations of bits).
func (fr FlushReason) String() string {
	if v, ok := FlushReasonStrings[fr]; ok {
		return v
	}

	return "none"
}
343
344// Stats contains interesting numbers on the search
345type Stats struct {
346 // Amount of I/O for reading contents.
347 ContentBytesLoaded int64
348
349 // Amount of I/O for reading from index.
350 IndexBytesLoaded int64
351
352 // Number of search shards that had a crash.
353 Crashes int
354
355 // Wall clock time for this search
356 Duration time.Duration
357
358 // Number of files containing a match.
359 FileCount int
360
361 // Number of files in shards that we considered.
362 ShardFilesConsidered int
363
364 // Files that we evaluated. Equivalent to files for which all
365 // atom matches (including negations) evaluated to true.
366 FilesConsidered int
367
368 // Files for which we loaded file content to verify substring matches
369 FilesLoaded int
370
371 // Candidate files whose contents weren't examined because we
372 // gathered enough matches.
373 FilesSkipped int
374
375 // Shards that we scanned to find matches.
376 ShardsScanned int
377
378 // Shards that we did not process because a query was canceled.
379 ShardsSkipped int
380
381 // Shards that we did not process because the query was rejected by the
382 // ngram filter indicating it had no matches.
383 ShardsSkippedFilter int
384
385 // Number of non-overlapping matches
386 MatchCount int
387
388 // Number of candidate matches as a result of searching ngrams.
389 NgramMatches int
390
391 // NgramLookups is the number of times we accessed an ngram in the index.
392 NgramLookups int
393
394 // Wall clock time for queued search.
395 Wait time.Duration
396
397 // Aggregate wall clock time spent constructing and pruning the match tree.
398 // This accounts for time such as lookups in the trigram index.
399 MatchTreeConstruction time.Duration
400
401 // Aggregate wall clock time spent searching the match tree. This accounts
402 // for the bulk of search work done looking for matches.
403 MatchTreeSearch time.Duration
404
405 // Number of times regexp was called on files that we evaluated.
406 RegexpsConsidered int
407
408 // FlushReason explains why results were flushed.
409 FlushReason FlushReason
410}
411
412func (s *Stats) sizeBytes() (sz uint64) {
413 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
414 sz += 1 // FlushReason
415
416 return
417}
418
419func (s *Stats) Add(o Stats) {
420 s.ContentBytesLoaded += o.ContentBytesLoaded
421 s.IndexBytesLoaded += o.IndexBytesLoaded
422 s.Crashes += o.Crashes
423 s.FileCount += o.FileCount
424 s.FilesConsidered += o.FilesConsidered
425 s.FilesLoaded += o.FilesLoaded
426 s.FilesSkipped += o.FilesSkipped
427 s.MatchCount += o.MatchCount
428 s.NgramMatches += o.NgramMatches
429 s.NgramLookups += o.NgramLookups
430 s.ShardFilesConsidered += o.ShardFilesConsidered
431 s.ShardsScanned += o.ShardsScanned
432 s.ShardsSkipped += o.ShardsSkipped
433 s.ShardsSkippedFilter += o.ShardsSkippedFilter
434 s.Wait += o.Wait
435 s.MatchTreeConstruction += o.MatchTreeConstruction
436 s.MatchTreeSearch += o.MatchTreeSearch
437 s.RegexpsConsidered += o.RegexpsConsidered
438
439 // We want the first non-zero FlushReason to be sticky. This is a useful
440 // property when aggregating stats from several Zoekts.
441 if s.FlushReason == 0 {
442 s.FlushReason = o.FlushReason
443 }
444}
445
446// Zero returns true if stats is empty.
447func (s *Stats) Zero() bool {
448 if s == nil {
449 return true
450 }
451
452 return !(s.ContentBytesLoaded > 0 ||
453 s.IndexBytesLoaded > 0 ||
454 s.Crashes > 0 ||
455 s.FileCount > 0 ||
456 s.FilesConsidered > 0 ||
457 s.FilesLoaded > 0 ||
458 s.FilesSkipped > 0 ||
459 s.MatchCount > 0 ||
460 s.NgramMatches > 0 ||
461 s.NgramLookups > 0 ||
462 s.ShardFilesConsidered > 0 ||
463 s.ShardsScanned > 0 ||
464 s.ShardsSkipped > 0 ||
465 s.ShardsSkippedFilter > 0 ||
466 s.Wait > 0 ||
467 s.MatchTreeConstruction > 0 ||
468 s.MatchTreeSearch > 0 ||
469 s.RegexpsConsidered > 0)
470}
471
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the size of p in bytes: two float64 fields. The result
// does not depend on the values stored in p.
func (p *Progress) sizeBytes() uint64 {
	return 2 * 8
}
490
491// SearchResult contains search matches and extra data
492type SearchResult struct {
493 Stats
494
495 // Do not encode this as we cannot encode -Inf in JSON
496 Progress `json:"-"`
497
498 Files []FileMatch
499
500 // RepoURLs holds a repo => template string map.
501 RepoURLs map[string]string
502
503 // FragmentNames holds a repo => template string map, for
504 // the line number fragment.
505 LineFragments map[string]string
506}
507
508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
509// The estimate does not take alignment into account. The result is a lower
510// bound on the actual size in memory.
511func (sr *SearchResult) SizeBytes() (sz uint64) {
512 sz += sr.Stats.sizeBytes()
513 sz += sr.Progress.sizeBytes()
514
515 // Files
516 sz += sliceHeaderBytes
517 for _, f := range sr.Files {
518 sz += f.sizeBytes()
519 }
520
521 // RepoURLs
522 sz += mapHeaderBytes
523 for k, v := range sr.RepoURLs {
524 sz += stringHeaderBytes + uint64(len(k))
525 sz += stringHeaderBytes + uint64(len(v))
526 }
527
528 // LineFragments
529 sz += mapHeaderBytes
530 for k, v := range sr.LineFragments {
531 sz += stringHeaderBytes + uint64(len(k))
532 sz += stringHeaderBytes + uint64(len(v))
533 }
534
535 return
536}
537
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String returns the branch formatted as "Name@Version".
func (r RepositoryBranch) String() string {
	return r.Name + "@" + r.Version
}
548
549// Repository holds repository metadata.
550type Repository struct {
551 // Sourcegraph's repository ID
552 ID uint32
553
554 // The repository name
555 Name string
556
557 // The repository URL.
558 URL string
559
560 // The physical source where this repo came from, eg. full
561 // path to the zip filename or git repository directory. This
562 // will not be exposed in the UI, but can be used to detect
563 // orphaned index shards.
564 Source string
565
566 // The branches indexed in this repo.
567 Branches []RepositoryBranch
568
569 // Nil if this is not the super project.
570 SubRepoMap map[string]*Repository
571
572 // URL template to link to the commit of a branch
573 CommitURLTemplate string
574
575 // The repository URL for getting to a file. Has access to
576 // {{.Version}}, {{.Path}}
577 FileURLTemplate string
578
579 // The URL fragment to add to a file URL for line numbers. has
580 // access to {{.LineNumber}}. The fragment should include the
581 // separator, generally '#' or ';'.
582 LineFragmentTemplate string
583
584 // Perf optimization: priority is set when we load the shard. It corresponds to
585 // the value of "priority" stored in RawConfig.
586 priority float64
587
588 // All zoekt.* configuration settings.
589 RawConfig map[string]string
590
591 // Importance of the repository, bigger is more important
592 Rank uint16
593
594 // IndexOptions is a hash of the options used to create the index for the
595 // repo.
596 IndexOptions string
597
598 // HasSymbols is true if this repository has indexed ctags
599 // output. Sourcegraph specific: This field is more appropriate for
600 // IndexMetadata. However, we store it here since the Sourcegraph frontend
601 // can read this structure but not IndexMetadata.
602 HasSymbols bool
603
604 // Tombstone is true if we are not allowed to search this repo.
605 Tombstone bool
606
607 // LatestCommitDate is the date of the latest commit among all indexed Branches.
608 // The date might be time.Time's 0-value if the repository was last indexed
609 // before this field was added.
610 LatestCommitDate time.Time
611
612 // FileTombstones is a set of file paths that should be ignored across all branches
613 // in this shard.
614 FileTombstones map[string]struct{} `json:",omitempty"`
615}
616
617func (r *Repository) UnmarshalJSON(data []byte) error {
618 // We define a new type so that we can use json.Unmarshal
619 // without recursing into this same method.
620 type repository *Repository
621 repo := repository(r)
622
623 err := json.Unmarshal(data, repo)
624 if err != nil {
625 return err
626 }
627
628 if v, ok := repo.RawConfig["repoid"]; ok {
629 id, _ := strconv.ParseUint(v, 10, 32)
630 r.ID = uint32(id)
631 }
632
633 if v, ok := repo.RawConfig["priority"]; ok {
634 r.priority, err = strconv.ParseFloat(v, 64)
635 if err != nil {
636 r.priority = 0
637 }
638
639 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
640 // based on priority. Setting it on read instead of during indexing
641 // allows us to avoid a complete reindex.
642 if r.Rank == 0 && r.priority > 0 {
643 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
644 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
645 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
646 }
647 }
648 return nil
649}
650
651// MergeMutable will merge x into r. mutated will be true if it made any
652// changes. err is non-nil if we needed to mutate an immutable field.
653//
654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
655// computed while indexing so can't be synthesized from x.
656//
657// Note: We ignore RawConfig fields which are duplicated into Repository:
658// name and id.
659//
660// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
661func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
662 if r.ID != x.ID {
663 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
664 return mutated, errors.New("ID is immutable")
665 }
666 if r.Name != x.Name {
667 // Name is encoded into the shard name on disk. We need to re-index if it
668 // changes.
669 return mutated, errors.New("Name is immutable")
670 }
671 if !reflect.DeepEqual(r.Branches, x.Branches) {
672 // Need a reindex if content changing.
673 return mutated, errors.New("Branches is immutable")
674 }
675
676 for k, v := range x.RawConfig {
677 // We ignore name and id since they are encoded into the repository.
678 if k == "name" || k == "id" {
679 continue
680 }
681 if r.RawConfig == nil {
682 mutated = true
683 r.RawConfig = make(map[string]string)
684 }
685 if r.RawConfig[k] != v {
686 mutated = true
687 r.RawConfig[k] = v
688 }
689 }
690
691 return mutated, nil
692}
693
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the version fields presumably describe the on-disk index
// format and the minimum reader able to load it, and LanguageMap presumably
// maps a language name to the numeric code used in the index — confirm
// against the index writer.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
706
// RepoStats holds statistics of a repository or a collection of
// repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}

// Add accumulates o into s. Repos is deliberately left unchanged. o must not
// be nil.
func (s *RepoStats) Add(o *RepoStats) {
	// can't update Repos, since one repo may have multiple
	// shards.
	s.Shards += o.Shards
	s.IndexBytes += o.IndexBytes
	s.Documents += o.Documents
	s.ContentBytes += o.ContentBytes

	// Sourcegraph specific
	s.NewLinesCount += o.NewLinesCount
	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
}
764
765type RepoListEntry struct {
766 Repository Repository
767 IndexMetadata IndexMetadata
768 Stats RepoStats
769}
770
771// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
772// performance profiling of sourcegraph.com revealed that querying this
773// information from Zoekt was causing lots of CPU and memory usage. Note: we
774// can revisit this, how we store and query this information has changed a lot
775// since this was introduced.
776type MinimalRepoListEntry struct {
777 // HasSymbols is exported since Sourcegraph uses this information at search
778 // planning time to decide between Zoekt and an unindexed symbol search.
779 //
780 // Note: it pretty much is always true in practice.
781 HasSymbols bool
782
783 // Branches is used by Sourcegraphs query planner to decided if it can use
784 // zoekt or go via an unindexed code path.
785 Branches []RepositoryBranch
786
787 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds
788 // since the epoch). This is to make it clear we are not transporting the
789 // full fidelty timestamp (ie with milliseconds and location). Additionally
790 // it saves 16 bytes in this struct.
791 //
792 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
793 // how many repositories need updating after a ranking change/etc.
794 //
795 // TODO(keegancsmith) audit updates to IndexTime and document how and when
796 // it changes. Concerned about things like metadata updates or compound
797 // shards leading to untrustworthy data here.
798 IndexTimeUnix int64
799}
800
801type ReposMap map[uint32]MinimalRepoListEntry
802
803// MarshalBinary implements a specialized encoder for ReposMap.
804func (q *ReposMap) MarshalBinary() ([]byte, error) {
805 return reposMapEncode(*q)
806}
807
808// UnmarshalBinary implements a specialized decoder for ReposMap.
809func (q *ReposMap) UnmarshalBinary(b []byte) error {
810 var err error
811 (*q), err = reposMapDecode(b)
812 return err
813}
814
815// RepoList holds a set of Repository metadata.
816type RepoList struct {
817 // Returned when ListOptions.Field is RepoListFieldRepos.
818 Repos []*RepoListEntry
819
820 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
821 ReposMap ReposMap
822
823 Crashes int
824
825 // Stats response to a List request.
826 // This is the aggregate RepoStats of all repos matching the input query.
827 Stats RepoStats
828}
829
830type Searcher interface {
831 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
832
833 // List lists repositories. The query `q` can only contain
834 // query.Repo atoms.
835 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
836 Close()
837
838 // Describe the searcher for debug messages.
839 String() string
840}
841
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos selects RepoList.Repos. It is the zero value and the
	// field used when no ListOptions are given.
	RepoListFieldRepos RepoListField = 0

	// RepoListFieldReposMap selects RepoList.ReposMap. The value 1 is
	// intentionally skipped: it belonged to RepoListFieldMinimal, which is no
	// longer supported.
	RepoListFieldReposMap RepoListField = 2
)

// ListOptions are the options of a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}

// GetField returns the validated Field of o. A nil o yields
// RepoListFieldRepos. An error is returned for the retired value 1
// (RepoListFieldMinimal) and for any unknown value.
func (o *ListOptions) GetField() (RepoListField, error) {
	if o == nil {
		return RepoListFieldRepos, nil
	}
	switch o.Field {
	case RepoListFieldRepos, RepoListFieldReposMap:
		return o.Field, nil
	case 1:
		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
	default:
		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
	}
}

// String returns the Go-syntax representation (%#v) of o.
func (o *ListOptions) String() string {
	return fmt.Sprintf("%#v", o)
}
871
// SearchOptions are the options of a Search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseKeywordScoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}

// String returns a succinct representation of the options. This is meant for
// human consumption in logs and traces.
//
// Only non-zero numeric options and true boolean options are included.
// SpanContext entries are written in sorted key order so the output is
// deterministic. A nil receiver yields "zoekt.SearchOptions(nil)".
//
// Note: some tracing systems have limits on length of values, so we take care
// to try and make this small, and include the important information near the
// front in case of truncation.
func (s *SearchOptions) String() string {
	if s == nil {
		return "zoekt.SearchOptions(nil)"
	}

	var b strings.Builder

	// add writes "name=value ".
	add := func(name, value string) {
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(value)
		b.WriteByte(' ')
	}
	addInt := func(name string, value int) {
		if value != 0 {
			add(name, strconv.Itoa(value))
		}
	}
	addDuration := func(name string, value time.Duration) {
		if value != 0 {
			add(name, value.String())
		}
	}
	// addBool writes just "name " for options that are set.
	addBool := func(name string, value bool) {
		if !value {
			return
		}
		b.WriteString(name)
		b.WriteByte(' ')
	}

	b.WriteString("zoekt.SearchOptions{ ")

	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
	addInt("NumContextLines", s.NumContextLines)

	addDuration("MaxWallTime", s.MaxWallTime)
	addDuration("FlushWallTime", s.FlushWallTime)

	if s.DocumentRanksWeight > 0 {
		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
	}

	addBool("EstimateDocCount", s.EstimateDocCount)
	addBool("Whole", s.Whole)
	addBool("ChunkMatches", s.ChunkMatches)
	addBool("UseDocumentRanks", s.UseDocumentRanks)
	addBool("UseKeywordScoring", s.UseKeywordScoring)
	addBool("Trace", s.Trace)
	addBool("DebugScore", s.DebugScore)

	// Map iteration order is random; sort the keys so that the same options
	// always produce the same string.
	keys := make([]string, 0, len(s.SpanContext))
	for k := range s.SpanContext {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		add("SpanContext."+k, strconv.Quote(s.SpanContext[k]))
	}

	b.WriteByte('}')
	return b.String()
}
1009
1010// Sender is the interface that wraps the basic Send method.
1011type Sender interface {
1012 Send(*SearchResult)
1013}
1014
1015// Streamer adds the method StreamSearch to the Searcher interface.
1016type Streamer interface {
1017 Searcher
1018 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1019}