// Fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Byte sizes charged by the sizeBytes estimators in this file.
//
// NOTE(review): the values look like the header sizes of a 64-bit platform
// (pointerSize is 8) — confirm if other architectures matter.
const (
	mapHeaderBytes    uint64 = 48 // charged once per map
	sliceHeaderBytes  uint64 = 24 // charged once per slice
	stringHeaderBytes uint64 = 16 // charged once per string
	pointerSize       uint64 = 8  // charged once per pointer
	interfaceBytes    uint64 = 16
)
37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// NOTE(review): presumably the names of the branches the file matched
	// on — confirm against the code that populates it.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// NOTE(review): presumably only populated when SearchOptions.DebugScore
	// is set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// NOTE(review): presumably higher is better, as for LineMatch.Score —
	// confirm.
	Score float64
}
165
166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
167 // Content
168 sz += sliceHeaderBytes + uint64(len(cm.Content))
169
170 // ContentStart
171 sz += cm.ContentStart.sizeBytes()
172
173 // FileName
174 sz += 1
175
176 // Ranges
177 sz += sliceHeaderBytes
178 if len(cm.Ranges) > 0 {
179 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
180 }
181
182 // SymbolInfo
183 sz += sliceHeaderBytes
184 for _, si := range cm.SymbolInfo {
185 sz += pointerSize
186 if si != nil {
187 sz += si.sizeBytes()
188 }
189 }
190
191 // Score
192 sz += 8
193
194 // DebugScore
195 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
196
197 return
198}
199
// Range is a half-open span between two Locations: Start is included and End
// is excluded.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}
206
207func (r *Range) sizeBytes() uint64 {
208 return r.Start.sizeBytes() + r.End.sizeBytes()
209}
210
// Location identifies a position in a file by byte offset, line number and
// column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the combined size of the three uint32 fields of Location.
// The result does not depend on the field values.
func (l *Location) sizeBytes() uint64 {
	const (
		fieldCount  = 3
		uint32Bytes = 4
	)
	return fieldCount * uint32Bytes
}
223
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// NOTE(review): LineStart and LineEnd are presumably byte offsets of the
	// line within the file — confirm against the code that fills them in.
	LineStart  int
	LineEnd    int
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score      float64
	DebugScore string

	// LineFragments holds the matching segments within Line.
	LineFragments []LineFragmentMatch
}
246
247func (lm *LineMatch) sizeBytes() (sz uint64) {
248 // Line
249 sz += sliceHeaderBytes + uint64(len(lm.Line))
250
251 // LineStart, LineEnd, LineNumber
252 sz += 3 * 8
253
254 // Before
255 sz += sliceHeaderBytes + uint64(len(lm.Before))
256
257 // After
258 sz += sliceHeaderBytes + uint64(len(lm.After))
259
260 // FileName
261 sz += 1
262
263 // Score
264 sz += 8
265
266 // DebugScore
267 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
268
269 // LineFragments
270 sz += sliceHeaderBytes
271 for _, lf := range lm.LineFragments {
272 sz += lf.sizeBytes()
273 }
274
275 return
276}
277
// Symbol holds symbol information attached to a match (see
// ChunkMatch.SymbolInfo and LineFragmentMatch.SymbolInfo).
//
// NOTE(review): the fields presumably carry the symbol name, its kind, and
// the name and kind of its enclosing symbol — confirm against the indexer.
type Symbol struct {
	Sym        string
	Kind       string
	Parent     string
	ParentKind string
}
284
285func (s *Symbol) sizeBytes() uint64 {
286 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
287}
288
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment; it may be nil.
	SymbolInfo *Symbol
}
302
303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
304 // LineOffset
305 sz += 8
306
307 // Offset
308 sz += 4
309
310 // MatchLength
311 sz += 8
312
313 // SymbolInfo
314 sz += pointerSize
315 if lfm.SymbolInfo != nil {
316 sz += lfm.SymbolInfo.sizeBytes()
317 }
318
319 return
320}
321
// FlushReason records why results were flushed. The zero value has no entry
// in FlushReasonStrings and is rendered as "none".
type FlushReason uint8

// Each reason takes its own bit: 1, 2 and 4.
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each FlushReason constant to the name returned by
// FlushReason.String.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the name registered for fr in FlushReasonStrings, or "none"
// when fr has no entry there.
func (fr FlushReason) String() string {
	name, known := FlushReasonStrings[fr]
	if !known {
		return "none"
	}
	return name
}
343
344// Stats contains interesting numbers on the search
345type Stats struct {
346 // Amount of I/O for reading contents.
347 ContentBytesLoaded int64
348
349 // Amount of I/O for reading from index.
350 IndexBytesLoaded int64
351
352 // Number of search shards that had a crash.
353 Crashes int
354
355 // Wall clock time for this search
356 Duration time.Duration
357
358 // Number of files containing a match.
359 FileCount int
360
361 // Number of files in shards that we considered.
362 ShardFilesConsidered int
363
364 // Files that we evaluated. Equivalent to files for which all
365 // atom matches (including negations) evaluated to true.
366 FilesConsidered int
367
368 // Files for which we loaded file content to verify substring matches
369 FilesLoaded int
370
371 // Candidate files whose contents weren't examined because we
372 // gathered enough matches.
373 FilesSkipped int
374
375 // Shards that we scanned to find matches.
376 ShardsScanned int
377
378 // Shards that we did not process because a query was canceled.
379 ShardsSkipped int
380
381 // Shards that we did not process because the query was rejected by the
382 // ngram filter indicating it had no matches.
383 ShardsSkippedFilter int
384
385 // Number of non-overlapping matches
386 MatchCount int
387
388 // Number of candidate matches as a result of searching ngrams.
389 NgramMatches int
390
391 // NgramLookups is the number of times we accessed an ngram in the index.
392 NgramLookups int
393
394 // Wall clock time for queued search.
395 Wait time.Duration
396
397 // Aggregate wall clock time spent constructing and pruning the match tree.
398 // This accounts for time such as lookups in the trigram index.
399 MatchTreeConstruction time.Duration
400
401 // Aggregate wall clock time spent searching the match tree. This accounts
402 // for the bulk of search work done looking for matches.
403 MatchTreeSearch time.Duration
404
405 // Number of times regexp was called on files that we evaluated.
406 RegexpsConsidered int
407
408 // FlushReason explains why results were flushed.
409 FlushReason FlushReason
410}
411
412func (s *Stats) sizeBytes() (sz uint64) {
413 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
414 sz += 1 // FlushReason
415
416 return
417}
418
419func (s *Stats) Add(o Stats) {
420 s.ContentBytesLoaded += o.ContentBytesLoaded
421 s.IndexBytesLoaded += o.IndexBytesLoaded
422 s.Crashes += o.Crashes
423 s.FileCount += o.FileCount
424 s.FilesConsidered += o.FilesConsidered
425 s.FilesLoaded += o.FilesLoaded
426 s.FilesSkipped += o.FilesSkipped
427 s.MatchCount += o.MatchCount
428 s.NgramMatches += o.NgramMatches
429 s.NgramLookups += o.NgramLookups
430 s.ShardFilesConsidered += o.ShardFilesConsidered
431 s.ShardsScanned += o.ShardsScanned
432 s.ShardsSkipped += o.ShardsSkipped
433 s.ShardsSkippedFilter += o.ShardsSkippedFilter
434 s.Wait += o.Wait
435 s.MatchTreeConstruction += o.MatchTreeConstruction
436 s.MatchTreeSearch += o.MatchTreeSearch
437 s.RegexpsConsidered += o.RegexpsConsidered
438
439 // We want the first non-zero FlushReason to be sticky. This is a useful
440 // property when aggregating stats from several Zoekts.
441 if s.FlushReason == 0 {
442 s.FlushReason = o.FlushReason
443 }
444}
445
446// Zero returns true if stats is empty.
447func (s *Stats) Zero() bool {
448 if s == nil {
449 return true
450 }
451
452 return !(s.ContentBytesLoaded > 0 ||
453 s.IndexBytesLoaded > 0 ||
454 s.Crashes > 0 ||
455 s.FileCount > 0 ||
456 s.FilesConsidered > 0 ||
457 s.FilesLoaded > 0 ||
458 s.FilesSkipped > 0 ||
459 s.MatchCount > 0 ||
460 s.NgramMatches > 0 ||
461 s.NgramLookups > 0 ||
462 s.ShardFilesConsidered > 0 ||
463 s.ShardsScanned > 0 ||
464 s.ShardsSkipped > 0 ||
465 s.ShardsSkippedFilter > 0 ||
466 s.Wait > 0 ||
467 s.MatchTreeConstruction > 0 ||
468 s.MatchTreeSearch > 0 ||
469 s.RegexpsConsidered > 0)
470}
471
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
	// This is used to reorder results when the result set is known to be stable -- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the combined size of the two float64 fields of Progress.
func (p *Progress) sizeBytes() uint64 {
	const (
		fieldCount   = 2
		float64Bytes = 8
	)
	return fieldCount * float64Bytes
}
490
// SearchResult contains search matches and extra data.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds one FileMatch per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
507
508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
509// The estimate does not take alignment into account. The result is a lower
510// bound on the actual size in memory.
511func (sr *SearchResult) SizeBytes() (sz uint64) {
512 sz += sr.Stats.sizeBytes()
513 sz += sr.Progress.sizeBytes()
514
515 // Files
516 sz += sliceHeaderBytes
517 for _, f := range sr.Files {
518 sz += f.sizeBytes()
519 }
520
521 // RepoURLs
522 sz += mapHeaderBytes
523 for k, v := range sr.RepoURLs {
524 sz += stringHeaderBytes + uint64(len(k))
525 sz += stringHeaderBytes + uint64(len(v))
526 }
527
528 // LineFragments
529 sz += mapHeaderBytes
530 for k, v := range sr.LineFragments {
531 sz += stringHeaderBytes + uint64(len(k))
532 sz += stringHeaderBytes + uint64(len(v))
533 }
534
535 return
536}
537
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String renders the branch as "<Name>@<Version>".
func (r RepositoryBranch) String() string {
	name, version := r.Name, r.Version
	return fmt.Sprintf("%s@%s", name, version)
}
548
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. Has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. When it is zero,
	// UnmarshalJSON derives it from a positive priority.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
616
617func (r *Repository) UnmarshalJSON(data []byte) error {
618 // We define a new type so that we can use json.Unmarshal
619 // without recursing into this same method.
620 type repository *Repository
621 repo := repository(r)
622
623 err := json.Unmarshal(data, repo)
624 if err != nil {
625 return err
626 }
627
628 if v, ok := repo.RawConfig["repoid"]; ok {
629 id, _ := strconv.ParseUint(v, 10, 32)
630 r.ID = uint32(id)
631 }
632
633 if v, ok := repo.RawConfig["priority"]; ok {
634 r.priority, err = strconv.ParseFloat(v, 64)
635 if err != nil {
636 r.priority = 0
637 }
638
639 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
640 // based on priority. Setting it on read instead of during indexing
641 // allows us to avoid a complete reindex.
642 if r.Rank == 0 && r.priority > 0 {
643 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
644 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
645 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
646 }
647 }
648 return nil
649}
650
651// MergeMutable will merge x into r. mutated will be true if it made any
652// changes. err is non-nil if we needed to mutate an immutable field.
653//
654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
655// computed while indexing so can't be synthesized from x.
656//
657// Note: We ignore RawConfig fields which are duplicated into Repository:
658// name and id.
659func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
660 if r.ID != x.ID {
661 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
662 return mutated, errors.New("ID is immutable")
663 }
664 if r.Name != x.Name {
665 // Name is encoded into the shard name on disk. We need to re-index if it
666 // changes.
667 return mutated, errors.New("Name is immutable")
668 }
669 if !reflect.DeepEqual(r.Branches, x.Branches) {
670 // Need a reindex if content changing.
671 return mutated, errors.New("Branches is immutable")
672 }
673
674 for k, v := range x.RawConfig {
675 // We ignore name and id since they are encoded into the repository.
676 if k == "name" || k == "id" {
677 continue
678 }
679 if r.RawConfig == nil {
680 mutated = true
681 r.RawConfig = make(map[string]string)
682 }
683 if r.RawConfig[k] != v {
684 mutated = true
685 r.RawConfig[k] = v
686 }
687 }
688
689 if r.URL != x.URL {
690 mutated = true
691 r.URL = x.URL
692 }
693 if r.CommitURLTemplate != x.CommitURLTemplate {
694 mutated = true
695 r.CommitURLTemplate = x.CommitURLTemplate
696 }
697 if r.FileURLTemplate != x.FileURLTemplate {
698 mutated = true
699 r.FileURLTemplate = x.FileURLTemplate
700 }
701 if r.LineFragmentTemplate != x.LineFragmentTemplate {
702 mutated = true
703 r.LineFragmentTemplate = x.LineFragmentTemplate
704 }
705
706 return mutated, nil
707}
708
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// NOTE(review): presumably the time the index was built — confirm.
	IndexTime  time.Time
	PlainASCII bool
	// NOTE(review): presumably maps a language name to the numeric code
	// stored in the index — confirm.
	LanguageMap  map[string]uint16
	ZoektVersion string
	ID           string
}
721
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
765
766func (s *RepoStats) Add(o *RepoStats) {
767 // can't update Repos, since one repo may have multiple
768 // shards.
769 s.Shards += o.Shards
770 s.IndexBytes += o.IndexBytes
771 s.Documents += o.Documents
772 s.ContentBytes += o.ContentBytes
773
774 // Sourcegraph specific
775 s.NewLinesCount += o.NewLinesCount
776 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
777 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
778}
779
// RepoListEntry bundles the metadata, index metadata and statistics reported
// for one entry of RepoList.Repos.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
785
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
815
// ReposMap maps a uint32 key to a MinimalRepoListEntry.
//
// NOTE(review): the key is presumably the Sourcegraph repository ID
// (Repository.ID) — confirm.
type ReposMap map[uint32]MinimalRepoListEntry
817
818// MarshalBinary implements a specialized encoder for ReposMap.
819func (q *ReposMap) MarshalBinary() ([]byte, error) {
820 return reposMapEncode(*q)
821}
822
823// UnmarshalBinary implements a specialized decoder for ReposMap.
824func (q *ReposMap) UnmarshalBinary(b []byte) error {
825 var err error
826 (*q), err = reposMapDecode(b)
827 return err
828}
829
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// NOTE(review): presumably the number of shards that crashed while
	// serving the List request, mirroring Stats.Crashes — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
844
// Searcher is the interface for running search and list queries.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// NOTE(review): presumably releases the resources held by the searcher —
	// confirm against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
856
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos populates RepoList.Repos. It is the zero value of
	// RepoListField.
	RepoListFieldRepos RepoListField = 0

	// RepoListFieldReposMap populates RepoList.ReposMap.
	//
	// It is declared with an explicit type so that it is a RepoListField
	// rather than an untyped integer constant. The value 1 is skipped: it
	// belonged to RepoListFieldMinimal, which is no longer supported.
	RepoListFieldReposMap RepoListField = 2
)
863
// ListOptions configures a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
868
869func (o *ListOptions) GetField() (RepoListField, error) {
870 if o == nil {
871 return RepoListFieldRepos, nil
872 }
873 switch o.Field {
874 case RepoListFieldRepos, RepoListFieldReposMap:
875 return o.Field, nil
876 case 1:
877 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
878 default:
879 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
880 }
881}
882
883func (o *ListOptions) String() string {
884 return fmt.Sprintf("%#v", o)
885}
886
// SearchOptions configures a single search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseKeywordScoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}

// String returns a succinct representation of the options. This is meant for
// human consumption in logs and traces.
//
// Only fields that differ from their zero value are printed (and
// DocumentRanksWeight only when positive). SpanContext entries are emitted in
// sorted key order so the output is deterministic. A nil receiver yields
// "<nil>".
//
// Note: some tracing systems have limits on length of values, so we take care
// to try and make this small, and include the important information near the
// front in case of truncation.
func (s *SearchOptions) String() string {
	if s == nil {
		return "<nil>"
	}

	var b strings.Builder

	// add writes "name=value ".
	add := func(name, value string) {
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(value)
		b.WriteByte(' ')
	}
	addInt := func(name string, value int) {
		if value != 0 {
			add(name, strconv.Itoa(value))
		}
	}
	addDuration := func(name string, value time.Duration) {
		if value != 0 {
			add(name, value.String())
		}
	}
	// addBool writes just "name " for flags that are set.
	addBool := func(name string, value bool) {
		if !value {
			return
		}
		b.WriteString(name)
		b.WriteByte(' ')
	}

	b.WriteString("zoekt.SearchOptions{ ")

	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
	addInt("NumContextLines", s.NumContextLines)

	addDuration("MaxWallTime", s.MaxWallTime)
	addDuration("FlushWallTime", s.FlushWallTime)

	if s.DocumentRanksWeight > 0 {
		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
	}

	addBool("EstimateDocCount", s.EstimateDocCount)
	addBool("Whole", s.Whole)
	addBool("ChunkMatches", s.ChunkMatches)
	addBool("UseDocumentRanks", s.UseDocumentRanks)
	addBool("UseKeywordScoring", s.UseKeywordScoring)
	addBool("Trace", s.Trace)
	addBool("DebugScore", s.DebugScore)

	// Map iteration order is random; sort the keys so that repeated calls
	// produce identical output.
	keys := make([]string, 0, len(s.SpanContext))
	for k := range s.SpanContext {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		add("SpanContext."+k, strconv.Quote(s.SpanContext[k]))
	}

	b.WriteByte('}')
	return b.String()
}
1024
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send delivers one SearchResult to the receiver.
	Send(*SearchResult)
}
1029
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch runs the query q with the given options and passes
	// results to sender instead of returning them.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}