fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Approximate sizes, in bytes, used by the sizeBytes estimators in this file
// when accounting for headers and pointers.
//
// NOTE(review): the values look like 64-bit layouts (for example an 8-byte
// pointer) — confirm before relying on them on other architectures.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
37
38// FileMatch contains all the matches within a file.
39type FileMatch struct {
40 FileName string
41
42 // Repository is the globally unique name of the repo of the
43 // match
44 Repository string
45
46 // SubRepositoryName is the globally unique name of the repo,
47 // if it came from a subrepository
48 SubRepositoryName string `json:",omitempty"`
49
50 // SubRepositoryPath holds the prefix where the subrepository
51 // was mounted.
52 SubRepositoryPath string `json:",omitempty"`
53
54 // Commit SHA1 (hex) of the (sub)repo holding the file.
55 Version string `json:",omitempty"`
56
57 // Detected language of the result.
58 Language string
59
60 // For debugging. Needs DebugScore set, but public so tests in
61 // other packages can print some diagnostics.
62 Debug string `json:",omitempty"`
63
64 Branches []string `json:",omitempty"`
65
66 // One of LineMatches or ChunkMatches will be returned depending on whether
67 // the SearchOptions.ChunkMatches is set.
68 LineMatches []LineMatch `json:",omitempty"`
69 ChunkMatches []ChunkMatch `json:",omitempty"`
70
71 // Only set if requested
72 Content []byte `json:",omitempty"`
73
74 // Checksum of the content.
75 Checksum []byte
76
77 // Ranking; the higher, the better.
78 Score float64 `json:",omitempty"`
79
80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
81 // order results from different repositories relative to each other.
82 RepositoryPriority float64 `json:",omitempty"`
83
84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
85 // Sourcegraph.
86 RepositoryID uint32 `json:",omitempty"`
87}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
139// lines in the file.
140type ChunkMatch struct {
141 DebugScore string
142
143 // Content is a contiguous range of complete lines that fully contains Ranges.
144 // Lines will always include their terminating newline (if it exists).
145 Content []byte
146
147 // Ranges is a set of matching ranges within this chunk. Each range is relative
148 // to the beginning of the file (not the beginning of Content).
149 Ranges []Range
150
151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
152 // its length will equal that of Ranges. Any of its elements may be nil.
153 SymbolInfo []*Symbol
154
155 // FileName indicates whether this match is a match on the file name, in
156 // which case Content will contain the file name.
157 FileName bool
158
159 // ContentStart is the location (inclusive) of the beginning of content
160 // relative to the beginning of the file. It will always be at the
161 // beginning of a line (Column will always be 1).
162 ContentStart Location
163
164 Score float64
165}
166
167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
168 // Content
169 sz += sliceHeaderBytes + uint64(len(cm.Content))
170
171 // ContentStart
172 sz += cm.ContentStart.sizeBytes()
173
174 // FileName
175 sz += 1
176
177 // Ranges
178 sz += sliceHeaderBytes
179 if len(cm.Ranges) > 0 {
180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
181 }
182
183 // SymbolInfo
184 sz += sliceHeaderBytes
185 for _, si := range cm.SymbolInfo {
186 sz += pointerSize
187 if si != nil {
188 sz += si.sizeBytes()
189 }
190 }
191
192 // Score
193 sz += 8
194
195 // DebugScore
196 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
197
198 return
199}
200
// Range is a span between two locations in a file.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}

// sizeBytes returns the combined estimated size of both endpoints, in bytes.
func (r *Range) sizeBytes() uint64 {
	total := r.Start.sizeBytes()
	total += r.End.sizeBytes()
	return total
}

// Location is a position in a file expressed as a byte offset, a line number
// and a column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the size of a Location in bytes. The result is a constant
// and does not depend on the receiver's contents.
func (l *Location) sizeBytes() uint64 {
	const (
		fieldCount = 3 // ByteOffset, LineNumber, Column
		fieldWidth = 4 // each field is a uint32
	)
	return fieldCount * fieldWidth
}
224
225// LineMatch holds the matches within a single line in a file.
226type LineMatch struct {
227 // The line in which a match was found.
228 Line []byte
229 // The byte offset of the first byte of the line.
230 LineStart int
231 // The byte offset of the first byte past the end of the line.
232 // This is usually the byte after the terminating newline, but can also be
233 // the end of the file if there is no terminating newline
234 LineEnd int
235 LineNumber int
236
237 // Before and After are only set when SearchOptions.NumContextLines is > 0
238 Before []byte
239 After []byte
240
241 // If set, this was a match on the filename.
242 FileName bool
243
244 // The higher the better. Only ranks the quality of the match
245 // within the file, does not take rank of file into account
246 Score float64
247 DebugScore string
248
249 LineFragments []LineFragmentMatch
250}
251
252func (lm *LineMatch) sizeBytes() (sz uint64) {
253 // Line
254 sz += sliceHeaderBytes + uint64(len(lm.Line))
255
256 // LineStart, LineEnd, LineNumber
257 sz += 3 * 8
258
259 // Before
260 sz += sliceHeaderBytes + uint64(len(lm.Before))
261
262 // After
263 sz += sliceHeaderBytes + uint64(len(lm.After))
264
265 // FileName
266 sz += 1
267
268 // Score
269 sz += 8
270
271 // DebugScore
272 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
273
274 // LineFragments
275 sz += sliceHeaderBytes
276 for _, lf := range lm.LineFragments {
277 sz += lf.sizeBytes()
278 }
279
280 return
281}
282
283type Symbol struct {
284 Sym string
285 Kind string
286 Parent string
287 ParentKind string
288}
289
290func (s *Symbol) sizeBytes() uint64 {
291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
292}
293
294// LineFragmentMatch a segment of matching text within a line.
295type LineFragmentMatch struct {
296 // Offset within the line, in bytes.
297 LineOffset int
298
299 // Offset from file start, in bytes.
300 Offset uint32
301
302 // Number bytes that match.
303 MatchLength int
304
305 SymbolInfo *Symbol
306}
307
308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
309 // LineOffset
310 sz += 8
311
312 // Offset
313 sz += 4
314
315 // MatchLength
316 sz += 8
317
318 // SymbolInfo
319 sz += pointerSize
320 if lfm.SymbolInfo != nil {
321 sz += lfm.SymbolInfo.sizeBytes()
322 }
323
324 return
325}
326
// FlushReason records why results were flushed. Each named value is a
// distinct bit; the zero value has no name.
type FlushReason uint8

const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each named FlushReason to the label returned by
// FlushReason.String.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the label registered in FlushReasonStrings for fr, or "none"
// when fr has no entry (which includes the zero value).
func (fr FlushReason) String() string {
	if v, ok := FlushReasonStrings[fr]; ok {
		return v
	}

	return "none"
}

// Stats contains interesting numbers on the search
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}

// sizeBytes estimates the size of s in bytes, ignoring alignment.
//
// Stats has 19 fields of type int, int64 or time.Duration, counted at 8 bytes
// each (this assumes we are running on a 64-bit architecture), plus the
// one-byte FlushReason. Keep the count in sync when fields are added.
func (s *Stats) sizeBytes() (sz uint64) {
	sz = 19 * 8 // int, int64 and time.Duration fields
	sz += 1     // FlushReason

	return sz
}

// Add accumulates the counters and timings of o into s. Duration is not
// summed. FlushReason is copied from o only while s has none.
func (s *Stats) Add(o Stats) {
	s.ContentBytesLoaded += o.ContentBytesLoaded
	s.IndexBytesLoaded += o.IndexBytesLoaded
	s.Crashes += o.Crashes
	s.FileCount += o.FileCount
	s.FilesConsidered += o.FilesConsidered
	s.FilesLoaded += o.FilesLoaded
	s.FilesSkipped += o.FilesSkipped
	s.MatchCount += o.MatchCount
	s.NgramMatches += o.NgramMatches
	s.NgramLookups += o.NgramLookups
	s.ShardFilesConsidered += o.ShardFilesConsidered
	s.ShardsScanned += o.ShardsScanned
	s.ShardsSkipped += o.ShardsSkipped
	s.ShardsSkippedFilter += o.ShardsSkippedFilter
	s.Wait += o.Wait
	s.MatchTreeConstruction += o.MatchTreeConstruction
	s.MatchTreeSearch += o.MatchTreeSearch
	s.RegexpsConsidered += o.RegexpsConsidered

	// We want the first non-zero FlushReason to be sticky. This is a useful
	// property when aggregating stats from several Zoekts.
	if s.FlushReason == 0 {
		s.FlushReason = o.FlushReason
	}
}

// Zero returns true if stats is empty, that is when s is nil or none of the
// fields that Add accumulates is positive. Duration and FlushReason are not
// examined.
func (s *Stats) Zero() bool {
	if s == nil {
		return true
	}

	return !(s.ContentBytesLoaded > 0 ||
		s.IndexBytesLoaded > 0 ||
		s.Crashes > 0 ||
		s.FileCount > 0 ||
		s.FilesConsidered > 0 ||
		s.FilesLoaded > 0 ||
		s.FilesSkipped > 0 ||
		s.MatchCount > 0 ||
		s.NgramMatches > 0 ||
		s.NgramLookups > 0 ||
		s.ShardFilesConsidered > 0 ||
		s.ShardsScanned > 0 ||
		s.ShardsSkipped > 0 ||
		s.ShardsSkippedFilter > 0 ||
		s.Wait > 0 ||
		s.MatchTreeConstruction > 0 ||
		s.MatchTreeSearch > 0 ||
		s.RegexpsConsidered > 0)
}
476
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the size of a Progress in bytes. The result is a constant
// and does not depend on the receiver's contents.
func (p *Progress) sizeBytes() uint64 {
	const (
		fieldCount = 2 // Priority, MaxPendingPriority
		fieldWidth = 8 // each field is a float64
	)
	return fieldCount * fieldWidth
}
495
496// SearchResult contains search matches and extra data
497type SearchResult struct {
498 Stats
499
500 // Do not encode this as we cannot encode -Inf in JSON
501 Progress `json:"-"`
502
503 Files []FileMatch
504
505 // RepoURLs holds a repo => template string map.
506 RepoURLs map[string]string
507
508 // FragmentNames holds a repo => template string map, for
509 // the line number fragment.
510 LineFragments map[string]string
511}
512
513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
514// The estimate does not take alignment into account. The result is a lower
515// bound on the actual size in memory.
516func (sr *SearchResult) SizeBytes() (sz uint64) {
517 sz += sr.Stats.sizeBytes()
518 sz += sr.Progress.sizeBytes()
519
520 // Files
521 sz += sliceHeaderBytes
522 for _, f := range sr.Files {
523 sz += f.sizeBytes()
524 }
525
526 // RepoURLs
527 sz += mapHeaderBytes
528 for k, v := range sr.RepoURLs {
529 sz += stringHeaderBytes + uint64(len(k))
530 sz += stringHeaderBytes + uint64(len(v))
531 }
532
533 // LineFragments
534 sz += mapHeaderBytes
535 for k, v := range sr.LineFragments {
536 sz += stringHeaderBytes + uint64(len(k))
537 sz += stringHeaderBytes + uint64(len(v))
538 }
539
540 return
541}
542
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String renders the branch as its name and version joined by "@".
func (r RepositoryBranch) String() string {
	// All operands are strings, so Sprint concatenates them without spaces.
	return fmt.Sprint(r.Name, "@", r.Version)
}
553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
621
622func (r *Repository) UnmarshalJSON(data []byte) error {
623 // We define a new type so that we can use json.Unmarshal
624 // without recursing into this same method.
625 type repository *Repository
626 repo := repository(r)
627
628 err := json.Unmarshal(data, repo)
629 if err != nil {
630 return err
631 }
632
633 if v, ok := repo.RawConfig["repoid"]; ok {
634 id, _ := strconv.ParseUint(v, 10, 32)
635 r.ID = uint32(id)
636 }
637
638 if v, ok := repo.RawConfig["priority"]; ok {
639 r.priority, err = strconv.ParseFloat(v, 64)
640 if err != nil {
641 r.priority = 0
642 }
643
644 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
645 // based on priority. Setting it on read instead of during indexing
646 // allows us to avoid a complete reindex.
647 if r.Rank == 0 && r.priority > 0 {
648 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
649 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
650 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
651 }
652 }
653 return nil
654}
655
656// MergeMutable will merge x into r. mutated will be true if it made any
657// changes. err is non-nil if we needed to mutate an immutable field.
658//
659// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
660// computed while indexing so can't be synthesized from x.
661//
662// Note: We ignore RawConfig fields which are duplicated into Repository:
663// name and id.
664func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
665 if r.ID != x.ID {
666 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
667 return mutated, errors.New("ID is immutable")
668 }
669 if r.Name != x.Name {
670 // Name is encoded into the shard name on disk. We need to re-index if it
671 // changes.
672 return mutated, errors.New("Name is immutable")
673 }
674 if !reflect.DeepEqual(r.Branches, x.Branches) {
675 // Need a reindex if content changing.
676 return mutated, errors.New("Branches is immutable")
677 }
678
679 for k, v := range x.RawConfig {
680 // We ignore name and id since they are encoded into the repository.
681 if k == "name" || k == "id" {
682 continue
683 }
684 if r.RawConfig == nil {
685 mutated = true
686 r.RawConfig = make(map[string]string)
687 }
688 if r.RawConfig[k] != v {
689 mutated = true
690 r.RawConfig[k] = v
691 }
692 }
693
694 if r.URL != x.URL {
695 mutated = true
696 r.URL = x.URL
697 }
698 if r.CommitURLTemplate != x.CommitURLTemplate {
699 mutated = true
700 r.CommitURLTemplate = x.CommitURLTemplate
701 }
702 if r.FileURLTemplate != x.FileURLTemplate {
703 mutated = true
704 r.FileURLTemplate = x.FileURLTemplate
705 }
706 if r.LineFragmentTemplate != x.LineFragmentTemplate {
707 mutated = true
708 r.LineFragmentTemplate = x.LineFragmentTemplate
709 }
710
711 return mutated, nil
712}
713
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
type IndexMetadata struct {
	// Version numbers describing the index.
	// NOTE(review): the exact compatibility rules between these three values
	// are not visible here — confirm against the index reader.
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
726
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}

// Add accumulates every counter of o into s except Repos.
func (s *RepoStats) Add(o *RepoStats) {
	// Repos is left alone: one repo may have multiple shards, so it can't be
	// updated by summing.
	s.Shards += o.Shards
	s.Documents += o.Documents
	s.IndexBytes += o.IndexBytes
	s.ContentBytes += o.ContentBytes

	// Sourcegraph specific
	s.NewLinesCount += o.NewLinesCount
	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
}
784
// RepoListEntry bundles the metadata, index metadata and statistics reported
// for a repository in a RepoList.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
790
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
820
821type ReposMap map[uint32]MinimalRepoListEntry
822
823// MarshalBinary implements a specialized encoder for ReposMap.
824func (q *ReposMap) MarshalBinary() ([]byte, error) {
825 return reposMapEncode(*q)
826}
827
828// UnmarshalBinary implements a specialized decoder for ReposMap.
829func (q *ReposMap) UnmarshalBinary(b []byte) error {
830 var err error
831 (*q), err = reposMapDecode(b)
832 return err
833}
834
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// filled in depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes counts crashes encountered while listing.
	// NOTE(review): presumably per-shard crashes, mirroring Stats.Crashes —
	// confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
849
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close shuts the searcher down.
	// NOTE(review): presumably releases the searcher's resources — confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
861
// RepoListField selects which field of RepoList a List call populates.
type RepoListField int

const (
	// RepoListFieldRepos requests RepoList.Repos.
	RepoListFieldRepos RepoListField = 0

	// RepoListFieldReposMap requests RepoList.ReposMap.
	//
	// The type is spelled out on purpose: a constant with its own initializer
	// does not inherit the type of the previous line, so without it this would
	// be an untyped integer constant rather than a RepoListField.
	//
	// The value 1 is skipped; GetField rejects it as the no longer supported
	// RepoListFieldMinimal.
	RepoListFieldReposMap RepoListField = 2
)

// ListOptions are the options of a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}

// GetField returns the validated Field of o. A nil o yields
// RepoListFieldRepos. An error is returned for the retired value 1 and for
// any other unknown value.
func (o *ListOptions) GetField() (RepoListField, error) {
	if o == nil {
		return RepoListFieldRepos, nil
	}
	switch o.Field {
	case RepoListFieldRepos, RepoListFieldReposMap:
		return o.Field, nil
	case 1:
		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
	default:
		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
	}
}

// String returns the Go-syntax representation of o, for debug output.
func (o *ListOptions) String() string {
	return fmt.Sprintf("%#v", o)
}
891
// SearchOptions are the options of a search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
	// The scoring algorithm treats each match in a file as a term and computes an approximation to
	// BM25. When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}

// String returns a succinct representation of the options. This is meant for
// human consumption in logs and traces.
//
// Only fields that are set are printed: non-zero ints and durations as
// name=value, true booleans as the bare name, and SpanContext entries as
// SpanContext.key="value" in sorted key order, so that the same options
// always produce the same string.
//
// Note: some tracing systems have limits on length of values, so we take care
// to try and make this small, and include the important information near the
// front in case of truncation.
func (s *SearchOptions) String() string {
	var b strings.Builder

	add := func(name, value string) {
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(value)
		b.WriteByte(' ')
	}
	addInt := func(name string, value int) {
		if value != 0 {
			add(name, strconv.Itoa(value))
		}
	}
	addDuration := func(name string, value time.Duration) {
		if value != 0 {
			add(name, value.String())
		}
	}
	addBool := func(name string, value bool) {
		if !value {
			return
		}
		b.WriteString(name)
		b.WriteByte(' ')
	}

	b.WriteString("zoekt.SearchOptions{ ")

	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
	addInt("NumContextLines", s.NumContextLines)

	addDuration("MaxWallTime", s.MaxWallTime)
	addDuration("FlushWallTime", s.FlushWallTime)

	if s.DocumentRanksWeight > 0 {
		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
	}

	addBool("EstimateDocCount", s.EstimateDocCount)
	addBool("Whole", s.Whole)
	addBool("ChunkMatches", s.ChunkMatches)
	addBool("UseDocumentRanks", s.UseDocumentRanks)
	addBool("UseBM25Scoring", s.UseBM25Scoring)
	addBool("Trace", s.Trace)
	addBool("DebugScore", s.DebugScore)

	// Map iteration order is randomized, so emit the span context in sorted
	// key order to keep the output stable across calls.
	if len(s.SpanContext) > 0 {
		keys := make([]string, 0, len(s.SpanContext))
		for k := range s.SpanContext {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			add("SpanContext."+k, strconv.Quote(s.SpanContext[k]))
		}
	}

	b.WriteByte('}')
	return b.String()
}
1029
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands a search result to the receiver.
	Send(*SearchResult)
}

// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1043
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch takes the same query and options as Search, plus a sender.
	// NOTE(review): presumably results are delivered through sender as they
	// become available — confirm against the implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}