fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
17import (
18 "context"
19 "encoding/json"
20 "errors"
21 "fmt"
22 "reflect"
23 "strconv"
24 "strings"
25 "time"
26
27 "github.com/sourcegraph/zoekt/query"
28)
29
30const (
31 mapHeaderBytes uint64 = 48
32 sliceHeaderBytes uint64 = 24
33 stringHeaderBytes uint64 = 16
34 pointerSize uint64 = 8
35 interfaceBytes uint64 = 16
36)
37
38// FileMatch contains all the matches within a file.
39type FileMatch struct {
40 FileName string
41
42 // Repository is the globally unique name of the repo of the
43 // match
44 Repository string
45
46 // SubRepositoryName is the globally unique name of the repo,
47 // if it came from a subrepository
48 SubRepositoryName string `json:",omitempty"`
49
50 // SubRepositoryPath holds the prefix where the subrepository
51 // was mounted.
52 SubRepositoryPath string `json:",omitempty"`
53
54 // Commit SHA1 (hex) of the (sub)repo holding the file.
55 Version string `json:",omitempty"`
56
57 // Detected language of the result.
58 Language string
59
60 // For debugging. Needs DebugScore set, but public so tests in
61 // other packages can print some diagnostics.
62 Debug string `json:",omitempty"`
63
64 Branches []string `json:",omitempty"`
65
66 // One of LineMatches or ChunkMatches will be returned depending on whether
67 // the SearchOptions.ChunkMatches is set.
68 LineMatches []LineMatch `json:",omitempty"`
69 ChunkMatches []ChunkMatch `json:",omitempty"`
70
71 // Only set if requested
72 Content []byte `json:",omitempty"`
73
74 // Checksum of the content.
75 Checksum []byte
76
77 // Ranking; the higher, the better.
78 Score float64 `json:",omitempty"`
79
80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
81 // order results from different repositories relative to each other.
82 RepositoryPriority float64 `json:",omitempty"`
83
84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
85 // Sourcegraph.
86 RepositoryID uint32 `json:",omitempty"`
87}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
139// lines in the file.
140type ChunkMatch struct {
141 DebugScore string
142
143 // Content is a contiguous range of complete lines that fully contains Ranges.
144 // Lines will always include their terminating newline (if it exists).
145 Content []byte
146
147 // Ranges is a set of matching ranges within this chunk. Each range is relative
148 // to the beginning of the file (not the beginning of Content).
149 Ranges []Range
150
151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
152 // its length will equal that of Ranges. Any of its elements may be nil.
153 SymbolInfo []*Symbol
154
155 // FileName indicates whether this match is a match on the file name, in
156 // which case Content will contain the file name.
157 FileName bool
158
159 // ContentStart is the location (inclusive) of the beginning of content
160 // relative to the beginning of the file. It will always be at the
161 // beginning of a line (Column will always be 1).
162 ContentStart Location
163
164 Score float64
165}
166
167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
168 // Content
169 sz += sliceHeaderBytes + uint64(len(cm.Content))
170
171 // ContentStart
172 sz += cm.ContentStart.sizeBytes()
173
174 // FileName
175 sz += 1
176
177 // Ranges
178 sz += sliceHeaderBytes
179 if len(cm.Ranges) > 0 {
180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
181 }
182
183 // SymbolInfo
184 sz += sliceHeaderBytes
185 for _, si := range cm.SymbolInfo {
186 sz += pointerSize
187 if si != nil {
188 sz += si.sizeBytes()
189 }
190 }
191
192 // Score
193 sz += 8
194
195 // DebugScore
196 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
197
198 return
199}
200
201type Range struct {
202 // The inclusive beginning of the range.
203 Start Location
204 // The exclusive end of the range.
205 End Location
206}
207
208func (r *Range) sizeBytes() uint64 {
209 return r.Start.sizeBytes() + r.End.sizeBytes()
210}
211
212type Location struct {
213 // 0-based byte offset from the beginning of the file
214 ByteOffset uint32
215 // 1-based line number from the beginning of the file
216 LineNumber uint32
217 // 1-based column number (in runes) from the beginning of line
218 Column uint32
219}
220
221func (l *Location) sizeBytes() uint64 {
222 return 3 * 4
223}
224
225// LineMatch holds the matches within a single line in a file.
226type LineMatch struct {
227 // The line in which a match was found.
228 Line []byte
229 // The byte offset of the first byte of the line.
230 LineStart int
231 // The byte offset of the first byte past the end of the line.
232 // This is usually the byte after the terminating newline, but can also be
233 // the end of the file if there is no terminating newline
234 LineEnd int
235 LineNumber int
236
237 // Before and After are only set when SearchOptions.NumContextLines is > 0
238 Before []byte
239 After []byte
240
241 // If set, this was a match on the filename.
242 FileName bool
243
244 // The higher the better. Only ranks the quality of the match
245 // within the file, does not take rank of file into account
246 Score float64
247 DebugScore string
248
249 LineFragments []LineFragmentMatch
250}
251
252func (lm *LineMatch) sizeBytes() (sz uint64) {
253 // Line
254 sz += sliceHeaderBytes + uint64(len(lm.Line))
255
256 // LineStart, LineEnd, LineNumber
257 sz += 3 * 8
258
259 // Before
260 sz += sliceHeaderBytes + uint64(len(lm.Before))
261
262 // After
263 sz += sliceHeaderBytes + uint64(len(lm.After))
264
265 // FileName
266 sz += 1
267
268 // Score
269 sz += 8
270
271 // DebugScore
272 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
273
274 // LineFragments
275 sz += sliceHeaderBytes
276 for _, lf := range lm.LineFragments {
277 sz += lf.sizeBytes()
278 }
279
280 return
281}
282
283type Symbol struct {
284 Sym string
285 Kind string
286 Parent string
287 ParentKind string
288}
289
290func (s *Symbol) sizeBytes() uint64 {
291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
292}
293
294// LineFragmentMatch a segment of matching text within a line.
295type LineFragmentMatch struct {
296 // Offset within the line, in bytes.
297 LineOffset int
298
299 // Offset from file start, in bytes.
300 Offset uint32
301
302 // Number bytes that match.
303 MatchLength int
304
305 SymbolInfo *Symbol
306}
307
308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
309 // LineOffset
310 sz += 8
311
312 // Offset
313 sz += 4
314
315 // MatchLength
316 sz += 8
317
318 // SymbolInfo
319 sz += pointerSize
320 if lfm.SymbolInfo != nil {
321 sz += lfm.SymbolInfo.sizeBytes()
322 }
323
324 return
325}
326
327type FlushReason uint8
328
329const (
330 FlushReasonTimerExpired FlushReason = 1 << iota
331 FlushReasonFinalFlush
332 FlushReasonMaxSize
333)
334
335var FlushReasonStrings = map[FlushReason]string{
336 FlushReasonTimerExpired: "timer_expired",
337 FlushReasonFinalFlush: "final_flush",
338 FlushReasonMaxSize: "max_size_reached",
339}
340
341func (fr FlushReason) String() string {
342 if v, ok := FlushReasonStrings[fr]; ok {
343 return v
344 }
345
346 return "none"
347}
348
349// Stats contains interesting numbers on the search
350type Stats struct {
351 // Amount of I/O for reading contents.
352 ContentBytesLoaded int64
353
354 // Amount of I/O for reading from index.
355 IndexBytesLoaded int64
356
357 // Number of search shards that had a crash.
358 Crashes int
359
360 // Wall clock time for this search
361 Duration time.Duration
362
363 // Number of files containing a match.
364 FileCount int
365
366 // Number of files in shards that we considered.
367 ShardFilesConsidered int
368
369 // Files that we evaluated. Equivalent to files for which all
370 // atom matches (including negations) evaluated to true.
371 FilesConsidered int
372
373 // Files for which we loaded file content to verify substring matches
374 FilesLoaded int
375
376 // Candidate files whose contents weren't examined because we
377 // gathered enough matches.
378 FilesSkipped int
379
380 // Shards that we scanned to find matches.
381 ShardsScanned int
382
383 // Shards that we did not process because a query was canceled.
384 ShardsSkipped int
385
386 // Shards that we did not process because the query was rejected by the
387 // ngram filter indicating it had no matches.
388 ShardsSkippedFilter int
389
390 // Number of non-overlapping matches
391 MatchCount int
392
393 // Number of candidate matches as a result of searching ngrams.
394 NgramMatches int
395
396 // NgramLookups is the number of times we accessed an ngram in the index.
397 NgramLookups int
398
399 // Wall clock time for queued search.
400 Wait time.Duration
401
402 // Aggregate wall clock time spent constructing and pruning the match tree.
403 // This accounts for time such as lookups in the trigram index.
404 MatchTreeConstruction time.Duration
405
406 // Aggregate wall clock time spent searching the match tree. This accounts
407 // for the bulk of search work done looking for matches.
408 MatchTreeSearch time.Duration
409
410 // Number of times regexp was called on files that we evaluated.
411 RegexpsConsidered int
412
413 // FlushReason explains why results were flushed.
414 FlushReason FlushReason
415}
416
417func (s *Stats) sizeBytes() (sz uint64) {
418 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
419 sz += 1 // FlushReason
420
421 return
422}
423
424func (s *Stats) Add(o Stats) {
425 s.ContentBytesLoaded += o.ContentBytesLoaded
426 s.IndexBytesLoaded += o.IndexBytesLoaded
427 s.Crashes += o.Crashes
428 s.FileCount += o.FileCount
429 s.FilesConsidered += o.FilesConsidered
430 s.FilesLoaded += o.FilesLoaded
431 s.FilesSkipped += o.FilesSkipped
432 s.MatchCount += o.MatchCount
433 s.NgramMatches += o.NgramMatches
434 s.NgramLookups += o.NgramLookups
435 s.ShardFilesConsidered += o.ShardFilesConsidered
436 s.ShardsScanned += o.ShardsScanned
437 s.ShardsSkipped += o.ShardsSkipped
438 s.ShardsSkippedFilter += o.ShardsSkippedFilter
439 s.Wait += o.Wait
440 s.MatchTreeConstruction += o.MatchTreeConstruction
441 s.MatchTreeSearch += o.MatchTreeSearch
442 s.RegexpsConsidered += o.RegexpsConsidered
443
444 // We want the first non-zero FlushReason to be sticky. This is a useful
445 // property when aggregating stats from several Zoekts.
446 if s.FlushReason == 0 {
447 s.FlushReason = o.FlushReason
448 }
449}
450
451// Zero returns true if stats is empty.
452func (s *Stats) Zero() bool {
453 if s == nil {
454 return true
455 }
456
457 return !(s.ContentBytesLoaded > 0 ||
458 s.IndexBytesLoaded > 0 ||
459 s.Crashes > 0 ||
460 s.FileCount > 0 ||
461 s.FilesConsidered > 0 ||
462 s.FilesLoaded > 0 ||
463 s.FilesSkipped > 0 ||
464 s.MatchCount > 0 ||
465 s.NgramMatches > 0 ||
466 s.NgramLookups > 0 ||
467 s.ShardFilesConsidered > 0 ||
468 s.ShardsScanned > 0 ||
469 s.ShardsSkipped > 0 ||
470 s.ShardsSkippedFilter > 0 ||
471 s.Wait > 0 ||
472 s.MatchTreeConstruction > 0 ||
473 s.MatchTreeSearch > 0 ||
474 s.RegexpsConsidered > 0)
475}
476
477// Progress contains information about the global progress of the running search query.
478// This is used by the frontend to reorder results and emit them when stable.
479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
480type Progress struct {
481 // Priority of the shard that was searched.
482 Priority float64
483
484 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
485 // This is used to reorder results when the result set is known to be stable-- that is, when a result's
486 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
487 //
488 // MaxPendingPriority decreases monotonically in each SearchResult.
489 MaxPendingPriority float64
490}
491
492func (p *Progress) sizeBytes() uint64 {
493 return 2 * 8
494}
495
496// SearchResult contains search matches and extra data
497type SearchResult struct {
498 Stats
499
500 // Do not encode this as we cannot encode -Inf in JSON
501 Progress `json:"-"`
502
503 Files []FileMatch
504
505 // RepoURLs holds a repo => template string map.
506 RepoURLs map[string]string
507
508 // FragmentNames holds a repo => template string map, for
509 // the line number fragment.
510 LineFragments map[string]string
511}
512
513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
514// The estimate does not take alignment into account. The result is a lower
515// bound on the actual size in memory.
516func (sr *SearchResult) SizeBytes() (sz uint64) {
517 sz += sr.Stats.sizeBytes()
518 sz += sr.Progress.sizeBytes()
519
520 // Files
521 sz += sliceHeaderBytes
522 for _, f := range sr.Files {
523 sz += f.sizeBytes()
524 }
525
526 // RepoURLs
527 sz += mapHeaderBytes
528 for k, v := range sr.RepoURLs {
529 sz += stringHeaderBytes + uint64(len(k))
530 sz += stringHeaderBytes + uint64(len(v))
531 }
532
533 // LineFragments
534 sz += mapHeaderBytes
535 for k, v := range sr.LineFragments {
536 sz += stringHeaderBytes + uint64(len(k))
537 sz += stringHeaderBytes + uint64(len(v))
538 }
539
540 return
541}
542
543// RepositoryBranch describes an indexed branch, which is a name
544// combined with a version.
545type RepositoryBranch struct {
546 Name string
547 Version string
548}
549
550func (r RepositoryBranch) String() string {
551 return fmt.Sprintf("%s@%s", r.Name, r.Version)
552}
553
554// Repository holds repository metadata.
555type Repository struct {
556 // Sourcegraph's repository ID
557 ID uint32
558
559 // The repository name
560 Name string
561
562 // The repository URL.
563 URL string
564
565 // The physical source where this repo came from, eg. full
566 // path to the zip filename or git repository directory. This
567 // will not be exposed in the UI, but can be used to detect
568 // orphaned index shards.
569 Source string
570
571 // The branches indexed in this repo.
572 Branches []RepositoryBranch
573
574 // Nil if this is not the super project.
575 SubRepoMap map[string]*Repository
576
577 // URL template to link to the commit of a branch
578 CommitURLTemplate string
579
580 // The repository URL for getting to a file. Has access to
581 // {{.Version}}, {{.Path}}
582 FileURLTemplate string
583
584 // The URL fragment to add to a file URL for line numbers. has
585 // access to {{.LineNumber}}. The fragment should include the
586 // separator, generally '#' or ';'.
587 LineFragmentTemplate string
588
589 // Perf optimization: priority is set when we load the shard. It corresponds to
590 // the value of "priority" stored in RawConfig.
591 priority float64
592
593 // All zoekt.* configuration settings.
594 RawConfig map[string]string
595
596 // Importance of the repository, bigger is more important
597 Rank uint16
598
599 // IndexOptions is a hash of the options used to create the index for the
600 // repo.
601 IndexOptions string
602
603 // HasSymbols is true if this repository has indexed ctags
604 // output. Sourcegraph specific: This field is more appropriate for
605 // IndexMetadata. However, we store it here since the Sourcegraph frontend
606 // can read this structure but not IndexMetadata.
607 HasSymbols bool
608
609 // Tombstone is true if we are not allowed to search this repo.
610 Tombstone bool
611
612 // LatestCommitDate is the date of the latest commit among all indexed Branches.
613 // The date might be time.Time's 0-value if the repository was last indexed
614 // before this field was added.
615 LatestCommitDate time.Time
616
617 // FileTombstones is a set of file paths that should be ignored across all branches
618 // in this shard.
619 FileTombstones map[string]struct{} `json:",omitempty"`
620}
621
622func (r *Repository) UnmarshalJSON(data []byte) error {
623 // We define a new type so that we can use json.Unmarshal
624 // without recursing into this same method.
625 type repository *Repository
626 repo := repository(r)
627
628 err := json.Unmarshal(data, repo)
629 if err != nil {
630 return err
631 }
632
633 if v, ok := repo.RawConfig["repoid"]; ok {
634 id, _ := strconv.ParseUint(v, 10, 32)
635 r.ID = uint32(id)
636 }
637
638 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
639 // on read instead of during indexing allows us to avoid a complete reindex.
640 //
641 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
642 // backwards compatibility.
643 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
644 // We use the number of months since 1970 as a simple measure of repo freshness.
645 // It is monotonically increasing and stable across re-indexes and restarts.
646 r.Rank = monthsSince1970(repo.LatestCommitDate)
647 } else if v, ok := repo.RawConfig["priority"]; ok {
648 r.priority, err = strconv.ParseFloat(v, 64)
649 if err != nil {
650 r.priority = 0
651 }
652
653 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
654 // based on priority. Setting it on read instead of during indexing
655 // allows us to avoid a complete reindex.
656 if r.Rank == 0 && r.priority > 0 {
657 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
658 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
659 // returns from more stars.
660 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
661 }
662 }
663
664 return nil
665}
666
667// monthsSince1970 returns the number of months since 1970. It returns values in
668// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
669// lower bound for all dates before 1970.
670func monthsSince1970(t time.Time) uint16 {
671 base := time.Unix(0, 0)
672 if t.Before(base) {
673 return 0
674 }
675 months := int(t.Year()-1970)*12 + int(t.Month()-1)
676 return uint16(min(months, maxUInt16))
677}
678
679// MergeMutable will merge x into r. mutated will be true if it made any
680// changes. err is non-nil if we needed to mutate an immutable field.
681//
682// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
683// computed while indexing so can't be synthesized from x.
684//
685// Note: We ignore RawConfig fields which are duplicated into Repository:
686// name and id.
687func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
688 if r.ID != x.ID {
689 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
690 return mutated, errors.New("ID is immutable")
691 }
692 if r.Name != x.Name {
693 // Name is encoded into the shard name on disk. We need to re-index if it
694 // changes.
695 return mutated, errors.New("Name is immutable")
696 }
697 if !reflect.DeepEqual(r.Branches, x.Branches) {
698 // Need a reindex if content changing.
699 return mutated, errors.New("Branches is immutable")
700 }
701
702 for k, v := range x.RawConfig {
703 // We ignore name and id since they are encoded into the repository.
704 if k == "name" || k == "id" {
705 continue
706 }
707 if r.RawConfig == nil {
708 mutated = true
709 r.RawConfig = make(map[string]string)
710 }
711 if r.RawConfig[k] != v {
712 mutated = true
713 r.RawConfig[k] = v
714 }
715 }
716
717 if r.URL != x.URL {
718 mutated = true
719 r.URL = x.URL
720 }
721 if r.CommitURLTemplate != x.CommitURLTemplate {
722 mutated = true
723 r.CommitURLTemplate = x.CommitURLTemplate
724 }
725 if r.FileURLTemplate != x.FileURLTemplate {
726 mutated = true
727 r.FileURLTemplate = x.FileURLTemplate
728 }
729 if r.LineFragmentTemplate != x.LineFragmentTemplate {
730 mutated = true
731 r.LineFragmentTemplate = x.LineFragmentTemplate
732 }
733
734 return mutated, nil
735}
736
737// IndexMetadata holds metadata stored in the index file. It contains
738// data generated by the core indexing library.
739type IndexMetadata struct {
740 IndexFormatVersion int
741 IndexFeatureVersion int
742 IndexMinReaderVersion int
743 IndexTime time.Time
744 PlainASCII bool
745 LanguageMap map[string]uint16
746 ZoektVersion string
747 ID string
748}
749
750// Statistics of a (collection of) repositories.
751type RepoStats struct {
752 // Repos is used for aggregrating the number of repositories.
753 //
754 // Note: This field is not populated on RepoListEntry.Stats (individual) but
755 // only for RepoList.Stats (aggregate).
756 Repos int
757
758 // Shards is the total number of search shards.
759 Shards int
760
761 // Documents holds the number of documents or files.
762 Documents int
763
764 // IndexBytes is the amount of RAM used for index overhead.
765 IndexBytes int64
766
767 // ContentBytes is the amount of RAM used for raw content.
768 ContentBytes int64
769
770 // Sourcegraph specific stats below. These are not as efficient to calculate
771 // as the above statistics. We experimentally measured about a 10% slower
772 // shard load time. However, we find these values very useful to track and
773 // computing them outside of load time introduces a lot of complexity.
774
775 // NewLinesCount is the number of newlines "\n" that appear in the zoekt
776 // indexed documents. This is not exactly the same as line count, since it
777 // will not include lines not terminated by "\n" (eg a file with no "\n", or
778 // a final line without "\n"). Note: Zoekt deduplicates documents across
779 // branches, so if a path has the same contents on multiple branches, there
780 // is only one document for it. As such that document's newlines is only
781 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
782 // for counts which do not deduplicate.
783 NewLinesCount uint64
784
785 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default
786 // branch.
787 DefaultBranchNewLinesCount uint64
788
789 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
790 // except the default branch.
791 OtherBranchesNewLinesCount uint64
792}
793
794func (s *RepoStats) Add(o *RepoStats) {
795 // can't update Repos, since one repo may have multiple
796 // shards.
797 s.Shards += o.Shards
798 s.IndexBytes += o.IndexBytes
799 s.Documents += o.Documents
800 s.ContentBytes += o.ContentBytes
801
802 // Sourcegraph specific
803 s.NewLinesCount += o.NewLinesCount
804 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
805 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
806}
807
808type RepoListEntry struct {
809 Repository Repository
810 IndexMetadata IndexMetadata
811 Stats RepoStats
812}
813
814// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
815// performance profiling of sourcegraph.com revealed that querying this
816// information from Zoekt was causing lots of CPU and memory usage. Note: we
817// can revisit this, how we store and query this information has changed a lot
818// since this was introduced.
819type MinimalRepoListEntry struct {
820 // HasSymbols is exported since Sourcegraph uses this information at search
821 // planning time to decide between Zoekt and an unindexed symbol search.
822 //
823 // Note: it pretty much is always true in practice.
824 HasSymbols bool
825
826 // Branches is used by Sourcegraphs query planner to decided if it can use
827 // zoekt or go via an unindexed code path.
828 Branches []RepositoryBranch
829
830 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds
831 // since the epoch). This is to make it clear we are not transporting the
832 // full fidelty timestamp (ie with milliseconds and location). Additionally
833 // it saves 16 bytes in this struct.
834 //
835 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
836 // how many repositories need updating after a ranking change/etc.
837 //
838 // TODO(keegancsmith) audit updates to IndexTime and document how and when
839 // it changes. Concerned about things like metadata updates or compound
840 // shards leading to untrustworthy data here.
841 IndexTimeUnix int64
842}
843
844type ReposMap map[uint32]MinimalRepoListEntry
845
846// MarshalBinary implements a specialized encoder for ReposMap.
847func (q *ReposMap) MarshalBinary() ([]byte, error) {
848 return reposMapEncode(*q)
849}
850
851// UnmarshalBinary implements a specialized decoder for ReposMap.
852func (q *ReposMap) UnmarshalBinary(b []byte) error {
853 var err error
854 (*q), err = reposMapDecode(b)
855 return err
856}
857
858// RepoList holds a set of Repository metadata.
859type RepoList struct {
860 // Returned when ListOptions.Field is RepoListFieldRepos.
861 Repos []*RepoListEntry
862
863 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
864 ReposMap ReposMap
865
866 Crashes int
867
868 // Stats response to a List request.
869 // This is the aggregate RepoStats of all repos matching the input query.
870 Stats RepoStats
871}
872
873type Searcher interface {
874 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
875
876 // List lists repositories. The query `q` can only contain
877 // query.Repo atoms.
878 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
879 Close()
880
881 // Describe the searcher for debug messages.
882 String() string
883}
884
885type RepoListField int
886
887const (
888 RepoListFieldRepos RepoListField = 0
889 RepoListFieldReposMap = 2
890)
891
892type ListOptions struct {
893 // Field decides which field to populate in RepoList response.
894 Field RepoListField
895}
896
897func (o *ListOptions) GetField() (RepoListField, error) {
898 if o == nil {
899 return RepoListFieldRepos, nil
900 }
901 switch o.Field {
902 case RepoListFieldRepos, RepoListFieldReposMap:
903 return o.Field, nil
904 case 1:
905 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
906 default:
907 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
908 }
909}
910
911func (o *ListOptions) String() string {
912 return fmt.Sprintf("%#v", o)
913}
914
915type SearchOptions struct {
916 // Return an upper-bound estimate of eligible documents in
917 // stats.ShardFilesConsidered.
918 EstimateDocCount bool
919
920 // Return the whole file.
921 Whole bool
922
923 // Maximum number of matches: skip all processing an index
924 // shard after we found this many non-overlapping matches.
925 ShardMaxMatchCount int
926
927 // Maximum number of matches: stop looking for more matches
928 // once we have this many matches across shards.
929 TotalMaxMatchCount int
930
931 // Maximum number of matches: skip processing documents for a repository in
932 // a shard once we have found ShardRepoMaxMatchCount.
933 //
934 // A compound shard may contain multiple repositories. This will most often
935 // be set to 1 to find all repositories containing a result.
936 ShardRepoMaxMatchCount int
937
938 // Abort the search after this much time has passed.
939 MaxWallTime time.Duration
940
941 // FlushWallTime if non-zero will stop streaming behaviour at first and
942 // instead will collate and sort results. At FlushWallTime the results will
943 // be sent and then the behaviour will revert to the normal streaming.
944 FlushWallTime time.Duration
945
946 // Truncates the number of documents (i.e. files) after collating and
947 // sorting the results.
948 MaxDocDisplayCount int
949
950 // Truncates the number of matchs after collating and sorting the results.
951 MaxMatchDisplayCount int
952
953 // If set to a number greater than zero then up to this many number
954 // of context lines will be added before and after each matched line.
955 // Note that the included context lines might contain matches and
956 // it's up to the consumer of the result to remove those lines.
957 NumContextLines int
958
959 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
960 // EXPERIMENTAL: the behavior of this flag may be changed in future versions.
961 ChunkMatches bool
962
963 // EXPERIMENTAL. If true, document ranks are used as additional input for
964 // sorting matches.
965 UseDocumentRanks bool
966
967 // EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
968 // their weight in the file match score. If the value is <= 0.0, the default weight value
969 // will be used. This option is temporary and is only exposed for testing/ tuning purposes.
970 DocumentRanksWeight float64
971
972 // EXPERIMENTAL. If true, use text-search style scoring instead of the default
973 // scoring formula. The scoring algorithm treats each match in a file as a term
974 // and computes an approximation to BM25.
975 //
976 // The calculation of IDF assumes that Zoekt visits all documents containing any
977 // of the query terms during evaluation. This is true, for example, if all query
978 // terms are ORed together.
979 //
980 // When enabled, all other scoring signals are ignored, including document ranks.
981 UseBM25Scoring bool
982
983 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as
984 // a command-line flag
985 Trace bool
986
987 // If set, the search results will contain debug information for scoring.
988 DebugScore bool
989
990 // SpanContext is the opentracing span context, if it exists, from the zoekt client
991 SpanContext map[string]string
992}
993
994// String returns a succinct representation of the options. This is meant for
995// human consumption in logs and traces.
996//
997// Note: some tracing systems have limits on length of values, so we take care
998// to try and make this small, and include the important information near the
999// front incase of truncation.
1000func (s *SearchOptions) String() string {
1001 var b strings.Builder
1002
1003 add := func(name, value string) {
1004 b.WriteString(name)
1005 b.WriteByte('=')
1006 b.WriteString(value)
1007 b.WriteByte(' ')
1008 }
1009 addInt := func(name string, value int) {
1010 if value != 0 {
1011 add(name, strconv.Itoa(value))
1012 }
1013 }
1014 addDuration := func(name string, value time.Duration) {
1015 if value != 0 {
1016 add(name, value.String())
1017 }
1018 }
1019 addBool := func(name string, value bool) {
1020 if !value {
1021 return
1022 }
1023 b.WriteString(name)
1024 b.WriteByte(' ')
1025 }
1026
1027 b.WriteString("zoekt.SearchOptions{ ")
1028
1029 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1030 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1031 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1032 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1033 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1034 addInt("NumContextLines", s.NumContextLines)
1035
1036 addDuration("MaxWallTime", s.MaxWallTime)
1037 addDuration("FlushWallTime", s.FlushWallTime)
1038
1039 if s.DocumentRanksWeight > 0 {
1040 add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
1041 }
1042
1043 addBool("EstimateDocCount", s.EstimateDocCount)
1044 addBool("Whole", s.Whole)
1045 addBool("ChunkMatches", s.ChunkMatches)
1046 addBool("UseDocumentRanks", s.UseDocumentRanks)
1047 addBool("UseBM25Scoring", s.UseBM25Scoring)
1048 addBool("Trace", s.Trace)
1049 addBool("DebugScore", s.DebugScore)
1050
1051 for k, v := range s.SpanContext {
1052 add("SpanContext."+k, strconv.Quote(v))
1053 }
1054
1055 b.WriteByte('}')
1056 return b.String()
1057}
1058
1059// Sender is the interface that wraps the basic Send method.
1060type Sender interface {
1061 Send(*SearchResult)
1062}
1063
1064// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
1065// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
1066// that calls f.
1067type SenderFunc func(result *SearchResult)
1068
1069func (f SenderFunc) Send(result *SearchResult) {
1070 f(result)
1071}
1072
1073// Streamer adds the method StreamSearch to the Searcher interface.
1074type Streamer interface {
1075 Searcher
1076 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1077}