fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Approximate sizes, in bytes, of Go's built-in headers and pointers. They
// are the building blocks of the sizeBytes estimators in this file.
// NOTE(review): the values look like 64-bit sizes — confirm before relying on
// them for other architectures.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the matched file.
	// NOTE(review): presumably a path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches lists branch names associated with this match.
	// NOTE(review): presumably the branches that contain the file — confirm.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore holds scoring diagnostics for this chunk.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the overall relevance score of this chunk.
	Score float64

	// BestLineMatch is the line number of the highest-scoring line match in this chunk.
	// The line number represents the index in the full file, and is 1-based. If FileName: true,
	// this number will be 0.
	BestLineMatch uint32
}
172
173func (cm *ChunkMatch) sizeBytes() (sz uint64) {
174 // Content
175 sz += sliceHeaderBytes + uint64(len(cm.Content))
176
177 // ContentStart
178 sz += cm.ContentStart.sizeBytes()
179
180 // FileName
181 sz += 1
182
183 // Ranges
184 sz += sliceHeaderBytes
185 if len(cm.Ranges) > 0 {
186 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
187 }
188
189 // SymbolInfo
190 sz += sliceHeaderBytes
191 for _, si := range cm.SymbolInfo {
192 sz += pointerSize
193 if si != nil {
194 sz += si.sizeBytes()
195 }
196 }
197
198 // Score
199 sz += 8
200
201 // DebugScore
202 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
203
204 return
205}
206
// Range is a half-open span between two Locations: Start is included, End is
// not.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}
213
214func (r *Range) sizeBytes() uint64 {
215 return r.Start.sizeBytes() + r.End.sizeBytes()
216}
217
// Location identifies a position in a file by byte offset, line and column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the size of a Location in bytes. The value is constant
// because the struct consists solely of fixed-width fields.
func (l *Location) sizeBytes() uint64 {
	const (
		fieldCount = 3 // ByteOffset, LineNumber, Column
		fieldWidth = 4 // each field is a uint32
	)
	return fieldCount * fieldWidth
}
230
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline
	LineEnd int
	// The number of the line within the file.
	// NOTE(review): presumably 1-based like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score float64
	// Scoring diagnostics for this line.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// The individual matching segments found within Line.
	LineFragments []LineFragmentMatch
}
257
258func (lm *LineMatch) sizeBytes() (sz uint64) {
259 // Line
260 sz += sliceHeaderBytes + uint64(len(lm.Line))
261
262 // LineStart, LineEnd, LineNumber
263 sz += 3 * 8
264
265 // Before
266 sz += sliceHeaderBytes + uint64(len(lm.Before))
267
268 // After
269 sz += sliceHeaderBytes + uint64(len(lm.After))
270
271 // FileName
272 sz += 1
273
274 // Score
275 sz += 8
276
277 // DebugScore
278 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
279
280 // LineFragments
281 sz += sliceHeaderBytes
282 for _, lf := range lm.LineFragments {
283 sz += lf.sizeBytes()
284 }
285
286 return
287}
288
// Symbol carries symbol information attached to a match (see
// ChunkMatch.SymbolInfo and LineFragmentMatch.SymbolInfo).
// NOTE(review): field meanings below are inferred from their names — confirm.
type Symbol struct {
	// Sym is presumably the symbol's name.
	Sym string
	// Kind is presumably the kind of the symbol (for example function or type).
	Kind string
	// Parent is presumably the name of the enclosing symbol, if any.
	Parent string
	// ParentKind is presumably the kind of the enclosing symbol.
	ParentKind string
}
295
296func (s *Symbol) sizeBytes() uint64 {
297 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
298}
299
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// Symbol information for this fragment. May be nil.
	SymbolInfo *Symbol
}
313
314func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
315 // LineOffset
316 sz += 8
317
318 // Offset
319 sz += 4
320
321 // MatchLength
322 sz += 8
323
324 // SymbolInfo
325 sz += pointerSize
326 if lfm.SymbolInfo != nil {
327 sz += lfm.SymbolInfo.sizeBytes()
328 }
329
330 return
331}
332
// FlushReason records why results were flushed. The zero value means no
// reason has been recorded.
type FlushReason uint8

// Each reason occupies its own bit (1, 2, 4).
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)

// FlushReasonStrings maps each known FlushReason to its string label.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}

// String returns the label registered in FlushReasonStrings for fr, or "none"
// when fr has no entry there (including the zero value).
func (fr FlushReason) String() string {
	label, known := FlushReasonStrings[fr]
	if !known {
		return "none"
	}
	return label
}
354
// Stats contains interesting numbers on the search.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Note that Stats.Add does not
	// accumulate this field.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed. When stats are merged
	// with Stats.Add the first non-zero value is kept.
	FlushReason FlushReason
}
422
423func (s *Stats) sizeBytes() (sz uint64) {
424 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
425 sz += 1 // FlushReason
426
427 return
428}
429
430func (s *Stats) Add(o Stats) {
431 s.ContentBytesLoaded += o.ContentBytesLoaded
432 s.IndexBytesLoaded += o.IndexBytesLoaded
433 s.Crashes += o.Crashes
434 s.FileCount += o.FileCount
435 s.FilesConsidered += o.FilesConsidered
436 s.FilesLoaded += o.FilesLoaded
437 s.FilesSkipped += o.FilesSkipped
438 s.MatchCount += o.MatchCount
439 s.NgramMatches += o.NgramMatches
440 s.NgramLookups += o.NgramLookups
441 s.ShardFilesConsidered += o.ShardFilesConsidered
442 s.ShardsScanned += o.ShardsScanned
443 s.ShardsSkipped += o.ShardsSkipped
444 s.ShardsSkippedFilter += o.ShardsSkippedFilter
445 s.Wait += o.Wait
446 s.MatchTreeConstruction += o.MatchTreeConstruction
447 s.MatchTreeSearch += o.MatchTreeSearch
448 s.RegexpsConsidered += o.RegexpsConsidered
449
450 // We want the first non-zero FlushReason to be sticky. This is a useful
451 // property when aggregating stats from several Zoekts.
452 if s.FlushReason == 0 {
453 s.FlushReason = o.FlushReason
454 }
455}
456
457// Zero returns true if stats is empty.
458func (s *Stats) Zero() bool {
459 if s == nil {
460 return true
461 }
462
463 return !(s.ContentBytesLoaded > 0 ||
464 s.IndexBytesLoaded > 0 ||
465 s.Crashes > 0 ||
466 s.FileCount > 0 ||
467 s.FilesConsidered > 0 ||
468 s.FilesLoaded > 0 ||
469 s.FilesSkipped > 0 ||
470 s.MatchCount > 0 ||
471 s.NgramMatches > 0 ||
472 s.NgramLookups > 0 ||
473 s.ShardFilesConsidered > 0 ||
474 s.ShardsScanned > 0 ||
475 s.ShardsSkipped > 0 ||
476 s.ShardsSkippedFilter > 0 ||
477 s.Wait > 0 ||
478 s.MatchTreeConstruction > 0 ||
479 s.MatchTreeSearch > 0 ||
480 s.RegexpsConsidered > 0)
481}
482
// Progress describes the global progress of a running search query. The
// frontend uses it to reorder results and to emit them once they are stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver
// instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is
	// being searched in parallel. It is used to reorder results when the
	// result set is known to be stable -- that is, when a result's Priority is
	// greater than the max(MaxPendingPriority) from the latest results of each
	// backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the size of a Progress in bytes. The value is constant
// because the struct consists of two float64 fields.
func (p *Progress) sizeBytes() uint64 {
	const (
		fieldCount = 2 // Priority, MaxPendingPriority
		fieldWidth = 8 // each field is a float64
	)
	return fieldCount * fieldWidth
}
501
// SearchResult contains search matches and extra data.
type SearchResult struct {
	// Stats holds the statistics gathered while producing this result.
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds the matches, one entry per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
518
519// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
520// The estimate does not take alignment into account. The result is a lower
521// bound on the actual size in memory.
522func (sr *SearchResult) SizeBytes() (sz uint64) {
523 sz += sr.Stats.sizeBytes()
524 sz += sr.Progress.sizeBytes()
525
526 // Files
527 sz += sliceHeaderBytes
528 for _, f := range sr.Files {
529 sz += f.sizeBytes()
530 }
531
532 // RepoURLs
533 sz += mapHeaderBytes
534 for k, v := range sr.RepoURLs {
535 sz += stringHeaderBytes + uint64(len(k))
536 sz += stringHeaderBytes + uint64(len(v))
537 }
538
539 // LineFragments
540 sz += mapHeaderBytes
541 for k, v := range sr.LineFragments {
542 sz += stringHeaderBytes + uint64(len(k))
543 sz += stringHeaderBytes + uint64(len(v))
544 }
545
546 return
547}
548
// RepositoryBranch describes an indexed branch: a branch name paired with
// the version it was indexed at.
type RepositoryBranch struct {
	Name    string
	Version string
}

// String formats the branch as "<Name>@<Version>".
func (r RepositoryBranch) String() string {
	name, version := r.Name, r.Version
	return fmt.Sprintf("%s@%s", name, version)
}
559
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's tenant ID. When decoding from JSON it is taken from
	// RawConfig["tenantID"] if that key is present.
	TenantID int

	// Sourcegraph's repository ID. When decoding from JSON it is taken from
	// RawConfig["repoid"] if that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. When decoding
	// from JSON it may be derived from LatestCommitDate or from the
	// "priority" entry of RawConfig (see UnmarshalJSON).
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
630
631func (r *Repository) UnmarshalJSON(data []byte) error {
632 // We define a new type so that we can use json.Unmarshal
633 // without recursing into this same method.
634 type repository *Repository
635 repo := repository(r)
636
637 err := json.Unmarshal(data, repo)
638 if err != nil {
639 return err
640 }
641
642 if v, ok := repo.RawConfig["repoid"]; ok {
643 id, _ := strconv.ParseUint(v, 10, 32)
644 r.ID = uint32(id)
645 }
646
647 if v, ok := repo.RawConfig["tenantID"]; ok {
648 id, _ := strconv.ParseInt(v, 10, 64)
649 r.TenantID = int(id)
650 }
651
652 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
653 // on read instead of during indexing allows us to avoid a complete reindex.
654 //
655 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
656 // backwards compatibility.
657 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
658 // We use the number of months since 1970 as a simple measure of repo freshness.
659 // It is monotonically increasing and stable across re-indexes and restarts.
660 r.Rank = monthsSince1970(repo.LatestCommitDate)
661 } else if v, ok := repo.RawConfig["priority"]; ok {
662 r.priority, err = strconv.ParseFloat(v, 64)
663 if err != nil {
664 r.priority = 0
665 }
666
667 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
668 // based on priority. Setting it on read instead of during indexing
669 // allows us to avoid a complete reindex.
670 if r.Rank == 0 && r.priority > 0 {
671 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
672 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
673 // returns from more stars.
674 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
675 }
676 }
677
678 return nil
679}
680
681// monthsSince1970 returns the number of months since 1970. It returns values in
682// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
683// lower bound for all dates before 1970.
684func monthsSince1970(t time.Time) uint16 {
685 base := time.Unix(0, 0)
686 if t.Before(base) {
687 return 0
688 }
689 months := int(t.Year()-1970)*12 + int(t.Month()-1)
690 return uint16(min(months, maxUInt16))
691}
692
693// MergeMutable will merge x into r. mutated will be true if it made any
694// changes. err is non-nil if we needed to mutate an immutable field.
695//
696// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
697// computed while indexing so can't be synthesized from x.
698//
699// Note: We ignore RawConfig fields which are duplicated into Repository:
700// name and id.
701func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
702 if r.ID != x.ID {
703 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
704 return mutated, errors.New("ID is immutable")
705 }
706 if r.Name != x.Name {
707 // Name is encoded into the shard name on disk. We need to re-index if it
708 // changes.
709 return mutated, errors.New("Name is immutable")
710 }
711 if !reflect.DeepEqual(r.Branches, x.Branches) {
712 // Need a reindex if content changing.
713 return mutated, errors.New("Branches is immutable")
714 }
715
716 for k, v := range x.RawConfig {
717 // We ignore name and id since they are encoded into the repository.
718 if k == "name" || k == "id" {
719 continue
720 }
721 if r.RawConfig == nil {
722 mutated = true
723 r.RawConfig = make(map[string]string)
724 }
725 if r.RawConfig[k] != v {
726 mutated = true
727 r.RawConfig[k] = v
728 }
729 }
730
731 if r.URL != x.URL {
732 mutated = true
733 r.URL = x.URL
734 }
735 if r.CommitURLTemplate != x.CommitURLTemplate {
736 mutated = true
737 r.CommitURLTemplate = x.CommitURLTemplate
738 }
739 if r.FileURLTemplate != x.FileURLTemplate {
740 mutated = true
741 r.FileURLTemplate = x.FileURLTemplate
742 }
743 if r.LineFragmentTemplate != x.LineFragmentTemplate {
744 mutated = true
745 r.LineFragmentTemplate = x.LineFragmentTemplate
746 }
747
748 return mutated, nil
749}
750
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
// NOTE(review): the per-field notes below are inferred from the field names —
// confirm against the indexing code.
type IndexMetadata struct {
	// Presumably the version of the on-disk index format.
	IndexFormatVersion int
	// Presumably the version of the feature set used to build the index.
	IndexFeatureVersion int
	// Presumably the minimum reader version able to load the index.
	IndexMinReaderVersion int
	// Presumably the time at which the index was created.
	IndexTime time.Time
	// Presumably true when the indexed content is plain ASCII.
	PlainASCII bool
	// Presumably maps a language name to the numeric code used in the index.
	LanguageMap map[string]uint16
	// Presumably the version of zoekt that produced the index.
	ZoektVersion string
	// Presumably an identifier of the index.
	ID string
}
763
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines is only
	// counted once. See DefaultBranchNewLinesCount and
	// OtherBranchesNewLinesCount for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
807
808func (s *RepoStats) Add(o *RepoStats) {
809 // can't update Repos, since one repo may have multiple
810 // shards.
811 s.Shards += o.Shards
812 s.IndexBytes += o.IndexBytes
813 s.Documents += o.Documents
814 s.ContentBytes += o.ContentBytes
815
816 // Sourcegraph specific
817 s.NewLinesCount += o.NewLinesCount
818 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
819 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
820}
821
// RepoListEntry bundles the metadata and statistics reported for one
// repository in a RepoList.
type RepoListEntry struct {
	// Repository is the repository's metadata.
	Repository Repository
	// IndexMetadata is the metadata stored in the repository's index file.
	IndexMetadata IndexMetadata
	// Stats holds the repository's statistics. Stats.Repos is not populated
	// for individual entries.
	Stats RepoStats
}
827
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
857
// ReposMap maps a uint32 key to the MinimalRepoListEntry of a repository.
// NOTE(review): the key is presumably the Sourcegraph repository ID — confirm.
type ReposMap map[uint32]MinimalRepoListEntry
859
860// MarshalBinary implements a specialized encoder for ReposMap.
861func (q *ReposMap) MarshalBinary() ([]byte, error) {
862 return reposMapEncode(*q)
863}
864
865// UnmarshalBinary implements a specialized decoder for ReposMap.
866func (q *ReposMap) UnmarshalBinary(b []byte) error {
867 var err error
868 (*q), err = reposMapDecode(b)
869 return err
870}
871
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash counter for the List request.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
886
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the resources it holds — confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
898
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos populates RepoList.Repos.
	RepoListFieldRepos RepoListField = 0

	// The value 1 is intentionally unused: it belonged to
	// RepoListFieldMinimal, which is no longer supported.

	// RepoListFieldReposMap populates RepoList.ReposMap. It carries an
	// explicit type; without one the constant would be an untyped integer
	// rather than a RepoListField.
	RepoListFieldReposMap RepoListField = 2
)
905
// ListOptions holds the options of a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
910
911func (o *ListOptions) GetField() (RepoListField, error) {
912 if o == nil {
913 return RepoListFieldRepos, nil
914 }
915 switch o.Field {
916 case RepoListFieldRepos, RepoListFieldReposMap:
917 return o.Field, nil
918 case 1:
919 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
920 default:
921 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
922 }
923}
924
925func (o *ListOptions) String() string {
926 return fmt.Sprintf("%#v", o)
927}
928
// SearchOptions holds the options of a search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
	// scoring formula. The scoring algorithm treats each match in a file as a term
	// and computes an approximation to BM25.
	//
	// The calculation of IDF assumes that Zoekt visits all documents containing any
	// of the query terms during evaluation. This is true, for example, if all query
	// terms are ORed together.
	//
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}
998
999// String returns a succinct representation of the options. This is meant for
1000// human consumption in logs and traces.
1001//
1002// Note: some tracing systems have limits on length of values, so we take care
1003// to try and make this small, and include the important information near the
1004// front incase of truncation.
1005func (s *SearchOptions) String() string {
1006 var b strings.Builder
1007
1008 add := func(name, value string) {
1009 b.WriteString(name)
1010 b.WriteByte('=')
1011 b.WriteString(value)
1012 b.WriteByte(' ')
1013 }
1014 addInt := func(name string, value int) {
1015 if value != 0 {
1016 add(name, strconv.Itoa(value))
1017 }
1018 }
1019 addDuration := func(name string, value time.Duration) {
1020 if value != 0 {
1021 add(name, value.String())
1022 }
1023 }
1024 addBool := func(name string, value bool) {
1025 if !value {
1026 return
1027 }
1028 b.WriteString(name)
1029 b.WriteByte(' ')
1030 }
1031
1032 b.WriteString("zoekt.SearchOptions{ ")
1033
1034 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1035 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1036 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1037 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1038 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1039 addInt("NumContextLines", s.NumContextLines)
1040
1041 addDuration("MaxWallTime", s.MaxWallTime)
1042 addDuration("FlushWallTime", s.FlushWallTime)
1043
1044 addBool("EstimateDocCount", s.EstimateDocCount)
1045 addBool("Whole", s.Whole)
1046 addBool("ChunkMatches", s.ChunkMatches)
1047 addBool("UseBM25Scoring", s.UseBM25Scoring)
1048 addBool("Trace", s.Trace)
1049 addBool("DebugScore", s.DebugScore)
1050
1051 for k, v := range s.SpanContext {
1052 add("SpanContext."+k, strconv.Quote(v))
1053 }
1054
1055 b.WriteByte('}')
1056 return b.String()
1057}
1058
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands a SearchResult to the receiver.
	Send(*SearchResult)
}
1063
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1072
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch runs the query q with the given options and takes a Sender
	// for the results.
	// NOTE(review): presumably results are delivered incrementally through
	// sender as they become available — confirm against the implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}