fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"slices"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
29
// Approximate sizes, in bytes, of Go's built-in header types. The sizeBytes
// estimators in this file add these to the payload lengths they measure. A
// pointerSize of 8 means the figures describe a 64-bit platform.
const (
	// mapHeaderBytes is the fixed cost charged once per map.
	mapHeaderBytes uint64 = 48
	// sliceHeaderBytes is the fixed cost charged once per slice.
	sliceHeaderBytes uint64 = 24
	// stringHeaderBytes is the fixed cost charged once per string.
	stringHeaderBytes uint64 = 16
	// pointerSize is the cost charged per pointer.
	pointerSize uint64 = 8
	// interfaceBytes is the cost of an interface value.
	interfaceBytes uint64 = 16
)
37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches belong to.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches lists branch names for this match.
	// NOTE(review): presumably the branches on which the file matched — confirm.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum is the checksum of the content.
	Checksum []byte

	// Score is the ranking of the match; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore holds scoring diagnostics.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score of this chunk.
	// NOTE(review): presumably the same semantics as LineMatch.Score (higher
	// is better, ranks within the file only) — confirm.
	Score float64
}
166
167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
168 // Content
169 sz += sliceHeaderBytes + uint64(len(cm.Content))
170
171 // ContentStart
172 sz += cm.ContentStart.sizeBytes()
173
174 // FileName
175 sz += 1
176
177 // Ranges
178 sz += sliceHeaderBytes
179 if len(cm.Ranges) > 0 {
180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
181 }
182
183 // SymbolInfo
184 sz += sliceHeaderBytes
185 for _, si := range cm.SymbolInfo {
186 sz += pointerSize
187 if si != nil {
188 sz += si.sizeBytes()
189 }
190 }
191
192 // Score
193 sz += 8
194
195 // DebugScore
196 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
197
198 return
199}
200
// Range is a half-open span between two locations: Start is included, End is
// not.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}
207
208func (r *Range) sizeBytes() uint64 {
209 return r.Start.sizeBytes() + r.End.sizeBytes()
210}
211
// Location identifies a position in a file in three equivalent ways: as a
// byte offset, and as a line/column pair.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}
220
221func (l *Location) sizeBytes() uint64 {
222 return 3 * 4
223}
224
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd int
	// The number of the line.
	// NOTE(review): presumably 1-based like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score float64
	// DebugScore holds scoring diagnostics.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// LineFragments are the matching segments within Line.
	LineFragments []LineFragmentMatch
}
251
252func (lm *LineMatch) sizeBytes() (sz uint64) {
253 // Line
254 sz += sliceHeaderBytes + uint64(len(lm.Line))
255
256 // LineStart, LineEnd, LineNumber
257 sz += 3 * 8
258
259 // Before
260 sz += sliceHeaderBytes + uint64(len(lm.Before))
261
262 // After
263 sz += sliceHeaderBytes + uint64(len(lm.After))
264
265 // FileName
266 sz += 1
267
268 // Score
269 sz += 8
270
271 // DebugScore
272 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
273
274 // LineFragments
275 sz += sliceHeaderBytes
276 for _, lf := range lm.LineFragments {
277 sz += lf.sizeBytes()
278 }
279
280 return
281}
282
// Symbol holds symbol information attached to a match (see
// ChunkMatch.SymbolInfo and LineFragmentMatch.SymbolInfo).
//
// NOTE(review): the field meanings below are inferred from their names —
// confirm against the code that populates them.
type Symbol struct {
	// Sym is presumably the symbol's name.
	Sym string
	// Kind is presumably the symbol's kind.
	Kind string
	// Parent is presumably the name of the enclosing symbol.
	Parent string
	// ParentKind is presumably the kind of the enclosing symbol.
	ParentKind string
}
289
290func (s *Symbol) sizeBytes() uint64 {
291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
292}
293
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment. It may be nil.
	SymbolInfo *Symbol
}
307
308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
309 // LineOffset
310 sz += 8
311
312 // Offset
313 sz += 4
314
315 // MatchLength
316 sz += 8
317
318 // SymbolInfo
319 sz += pointerSize
320 if lfm.SymbolInfo != nil {
321 sz += lfm.SymbolInfo.sizeBytes()
322 }
323
324 return
325}
326
// FlushReason explains why results were flushed. The zero value means no
// reason has been recorded.
type FlushReason uint8

// The known flush reasons. Each one is a distinct bit (1, 2, 4).
const (
	FlushReasonTimerExpired FlushReason = 1 << iota
	FlushReasonFinalFlush
	FlushReasonMaxSize
)
334
// FlushReasonStrings maps each known FlushReason to the name returned by
// FlushReason.String.
var FlushReasonStrings = map[FlushReason]string{
	FlushReasonTimerExpired: "timer_expired",
	FlushReasonFinalFlush:   "final_flush",
	FlushReasonMaxSize:      "max_size_reached",
}
340
341func (fr FlushReason) String() string {
342 if v, ok := FlushReasonStrings[fr]; ok {
343 return v
344 }
345
346 return "none"
347}
348
// Stats contains interesting numbers on the search.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Note that Add does not accumulate this
	// field and Zero does not inspect it.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed. When stats are combined
	// with Add, the first non-zero value is kept.
	FlushReason FlushReason
}
416
417func (s *Stats) sizeBytes() (sz uint64) {
418 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
419 sz += 1 // FlushReason
420
421 return
422}
423
424func (s *Stats) Add(o Stats) {
425 s.ContentBytesLoaded += o.ContentBytesLoaded
426 s.IndexBytesLoaded += o.IndexBytesLoaded
427 s.Crashes += o.Crashes
428 s.FileCount += o.FileCount
429 s.FilesConsidered += o.FilesConsidered
430 s.FilesLoaded += o.FilesLoaded
431 s.FilesSkipped += o.FilesSkipped
432 s.MatchCount += o.MatchCount
433 s.NgramMatches += o.NgramMatches
434 s.NgramLookups += o.NgramLookups
435 s.ShardFilesConsidered += o.ShardFilesConsidered
436 s.ShardsScanned += o.ShardsScanned
437 s.ShardsSkipped += o.ShardsSkipped
438 s.ShardsSkippedFilter += o.ShardsSkippedFilter
439 s.Wait += o.Wait
440 s.MatchTreeConstruction += o.MatchTreeConstruction
441 s.MatchTreeSearch += o.MatchTreeSearch
442 s.RegexpsConsidered += o.RegexpsConsidered
443
444 // We want the first non-zero FlushReason to be sticky. This is a useful
445 // property when aggregating stats from several Zoekts.
446 if s.FlushReason == 0 {
447 s.FlushReason = o.FlushReason
448 }
449}
450
451// Zero returns true if stats is empty.
452func (s *Stats) Zero() bool {
453 if s == nil {
454 return true
455 }
456
457 return !(s.ContentBytesLoaded > 0 ||
458 s.IndexBytesLoaded > 0 ||
459 s.Crashes > 0 ||
460 s.FileCount > 0 ||
461 s.FilesConsidered > 0 ||
462 s.FilesLoaded > 0 ||
463 s.FilesSkipped > 0 ||
464 s.MatchCount > 0 ||
465 s.NgramMatches > 0 ||
466 s.NgramLookups > 0 ||
467 s.ShardFilesConsidered > 0 ||
468 s.ShardsScanned > 0 ||
469 s.ShardsSkipped > 0 ||
470 s.ShardsSkippedFilter > 0 ||
471 s.Wait > 0 ||
472 s.MatchTreeConstruction > 0 ||
473 s.MatchTreeSearch > 0 ||
474 s.RegexpsConsidered > 0)
475}
476
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending results that are being searched in parallel.
	// This is used to reorder results when the result set is known to be stable -- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}
491
492func (p *Progress) sizeBytes() uint64 {
493 return 2 * 8
494}
495
// SearchResult contains search matches and extra data.
type SearchResult struct {
	// Stats holds the statistics of the search.
	Stats

	// Do not encode this as we cannot encode -Inf in JSON.
	Progress `json:"-"`

	// Files holds the per-file matches.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
512
513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
514// The estimate does not take alignment into account. The result is a lower
515// bound on the actual size in memory.
516func (sr *SearchResult) SizeBytes() (sz uint64) {
517 sz += sr.Stats.sizeBytes()
518 sz += sr.Progress.sizeBytes()
519
520 // Files
521 sz += sliceHeaderBytes
522 for _, f := range sr.Files {
523 sz += f.sizeBytes()
524 }
525
526 // RepoURLs
527 sz += mapHeaderBytes
528 for k, v := range sr.RepoURLs {
529 sz += stringHeaderBytes + uint64(len(k))
530 sz += stringHeaderBytes + uint64(len(v))
531 }
532
533 // LineFragments
534 sz += mapHeaderBytes
535 for k, v := range sr.LineFragments {
536 sz += stringHeaderBytes + uint64(len(k))
537 sz += stringHeaderBytes + uint64(len(v))
538 }
539
540 return
541}
542
// RepositoryBranch describes an indexed branch, which is a name
// combined with a version.
type RepositoryBranch struct {
	// Name is the branch name.
	Name string
	// Version is the version the branch was indexed at.
	Version string
}
549
550func (r RepositoryBranch) String() string {
551 return fmt.Sprintf("%s@%s", r.Name, r.Version)
552}
553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's tenant ID. When decoding JSON it is taken from
	// RawConfig["tenantID"] if that key is present.
	TenantID int

	// Sourcegraph's repository ID. When decoding JSON it is taken from
	// RawConfig["repoid"] if that key is present.
	ID uint32

	// The repository name.
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch.
	CommitURLTemplate string

	// The repository URL for getting to a file. Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. Has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
624
625func (r *Repository) UnmarshalJSON(data []byte) error {
626 // We define a new type so that we can use json.Unmarshal
627 // without recursing into this same method.
628 type repository *Repository
629 repo := repository(r)
630
631 err := json.Unmarshal(data, repo)
632 if err != nil {
633 return err
634 }
635
636 if v, ok := repo.RawConfig["repoid"]; ok {
637 id, _ := strconv.ParseUint(v, 10, 32)
638 r.ID = uint32(id)
639 }
640
641 if v, ok := repo.RawConfig["tenantID"]; ok {
642 id, _ := strconv.ParseInt(v, 10, 64)
643 r.TenantID = int(id)
644 }
645
646 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
647 // on read instead of during indexing allows us to avoid a complete reindex.
648 //
649 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
650 // backwards compatibility.
651 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
652 // We use the number of months since 1970 as a simple measure of repo freshness.
653 // It is monotonically increasing and stable across re-indexes and restarts.
654 r.Rank = monthsSince1970(repo.LatestCommitDate)
655 } else if v, ok := repo.RawConfig["priority"]; ok {
656 r.priority, err = strconv.ParseFloat(v, 64)
657 if err != nil {
658 r.priority = 0
659 }
660
661 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
662 // based on priority. Setting it on read instead of during indexing
663 // allows us to avoid a complete reindex.
664 if r.Rank == 0 && r.priority > 0 {
665 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
666 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
667 // returns from more stars.
668 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
669 }
670 }
671
672 return nil
673}
674
675// monthsSince1970 returns the number of months since 1970. It returns values in
676// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
677// lower bound for all dates before 1970.
678func monthsSince1970(t time.Time) uint16 {
679 base := time.Unix(0, 0)
680 if t.Before(base) {
681 return 0
682 }
683 months := int(t.Year()-1970)*12 + int(t.Month()-1)
684 return uint16(min(months, maxUInt16))
685}
686
687// MergeMutable will merge x into r. mutated will be true if it made any
688// changes. err is non-nil if we needed to mutate an immutable field.
689//
690// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
691// computed while indexing so can't be synthesized from x.
692//
693// Note: We ignore RawConfig fields which are duplicated into Repository:
694// name and id.
695func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
696 if r.ID != x.ID {
697 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
698 return mutated, errors.New("ID is immutable")
699 }
700 if r.Name != x.Name {
701 // Name is encoded into the shard name on disk. We need to re-index if it
702 // changes.
703 return mutated, errors.New("Name is immutable")
704 }
705 if !reflect.DeepEqual(r.Branches, x.Branches) {
706 // Need a reindex if content changing.
707 return mutated, errors.New("Branches is immutable")
708 }
709
710 for k, v := range x.RawConfig {
711 // We ignore name and id since they are encoded into the repository.
712 if k == "name" || k == "id" {
713 continue
714 }
715 if r.RawConfig == nil {
716 mutated = true
717 r.RawConfig = make(map[string]string)
718 }
719 if r.RawConfig[k] != v {
720 mutated = true
721 r.RawConfig[k] = v
722 }
723 }
724
725 if r.URL != x.URL {
726 mutated = true
727 r.URL = x.URL
728 }
729 if r.CommitURLTemplate != x.CommitURLTemplate {
730 mutated = true
731 r.CommitURLTemplate = x.CommitURLTemplate
732 }
733 if r.FileURLTemplate != x.FileURLTemplate {
734 mutated = true
735 r.FileURLTemplate = x.FileURLTemplate
736 }
737 if r.LineFragmentTemplate != x.LineFragmentTemplate {
738 mutated = true
739 r.LineFragmentTemplate = x.LineFragmentTemplate
740 }
741
742 return mutated, nil
743}
744
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the field notes below are inferred from the field names and
// types — confirm against the indexing code.
type IndexMetadata struct {
	// Version numbers describing the index format, its feature level, and
	// presumably the minimum reader version able to load it.
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// IndexTime is presumably when the index was built.
	IndexTime time.Time
	// PlainASCII is presumably true when the indexed content is pure ASCII.
	PlainASCII bool
	// LanguageMap maps a string (presumably a language name) to a uint16 code.
	LanguageMap map[string]uint16
	// ZoektVersion is presumably the version of zoekt that wrote the index.
	ZoektVersion string
	// ID is an identifier for the index.
	ID string
}
757
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate).
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines are only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
801
802func (s *RepoStats) Add(o *RepoStats) {
803 // can't update Repos, since one repo may have multiple
804 // shards.
805 s.Shards += o.Shards
806 s.IndexBytes += o.IndexBytes
807 s.Documents += o.Documents
808 s.ContentBytes += o.ContentBytes
809
810 // Sourcegraph specific
811 s.NewLinesCount += o.NewLinesCount
812 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
813 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
814}
815
// RepoListEntry bundles a repository's metadata, its index metadata and its
// statistics. RepoList.Repos holds one pointer per entry.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	// Stats are the statistics of this individual entry; Stats.Repos is not
	// populated here.
	Stats RepoStats
}
821
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
851
// ReposMap maps a uint32 key to a MinimalRepoListEntry. It has a specialized
// binary encoding, see MarshalBinary and UnmarshalBinary.
//
// NOTE(review): the key is presumably the Sourcegraph repository ID
// (Repository.ID) — confirm.
type ReposMap map[uint32]MinimalRepoListEntry
853
854// MarshalBinary implements a specialized encoder for ReposMap.
855func (q *ReposMap) MarshalBinary() ([]byte, error) {
856 return reposMapEncode(*q)
857}
858
859// UnmarshalBinary implements a specialized decoder for ReposMap.
860func (q *ReposMap) UnmarshalBinary(b []byte) error {
861 var err error
862 (*q), err = reposMapDecode(b)
863 return err
864}
865
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// populated depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a count of crashes.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, like Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
880
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the searcher's resources; whether it
	// may be used afterwards is not visible here — confirm.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
892
// RepoListField selects which field of RepoList a List request populates.
type RepoListField int

const (
	// RepoListFieldRepos populates RepoList.Repos.
	RepoListFieldRepos RepoListField = 0
	// RepoListFieldReposMap populates RepoList.ReposMap.
	//
	// The value 1 is skipped: ListOptions.GetField rejects it as the no longer
	// supported RepoListFieldMinimal.
	//
	// NOTE(review): this constant has no explicit type, so it is an untyped
	// integer constant rather than a RepoListField — confirm that is intended.
	RepoListFieldReposMap = 2
)
899
// ListOptions are the options for a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
904
905func (o *ListOptions) GetField() (RepoListField, error) {
906 if o == nil {
907 return RepoListFieldRepos, nil
908 }
909 switch o.Field {
910 case RepoListFieldRepos, RepoListFieldReposMap:
911 return o.Field, nil
912 case 1:
913 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
914 default:
915 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
916 }
917}
918
919func (o *ListOptions) String() string {
920 return fmt.Sprintf("%#v", o)
921}
922
// SearchOptions are the options for a Search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches.
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
	// scoring formula. The scoring algorithm treats each match in a file as a term
	// and computes an approximation to BM25.
	//
	// The calculation of IDF assumes that Zoekt visits all documents containing any
	// of the query terms during evaluation. This is true, for example, if all query
	// terms are ORed together.
	//
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag.
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client.
	SpanContext map[string]string
}
992
993// String returns a succinct representation of the options. This is meant for
994// human consumption in logs and traces.
995//
996// Note: some tracing systems have limits on length of values, so we take care
997// to try and make this small, and include the important information near the
998// front incase of truncation.
999func (s *SearchOptions) String() string {
1000 var b strings.Builder
1001
1002 add := func(name, value string) {
1003 b.WriteString(name)
1004 b.WriteByte('=')
1005 b.WriteString(value)
1006 b.WriteByte(' ')
1007 }
1008 addInt := func(name string, value int) {
1009 if value != 0 {
1010 add(name, strconv.Itoa(value))
1011 }
1012 }
1013 addDuration := func(name string, value time.Duration) {
1014 if value != 0 {
1015 add(name, value.String())
1016 }
1017 }
1018 addBool := func(name string, value bool) {
1019 if !value {
1020 return
1021 }
1022 b.WriteString(name)
1023 b.WriteByte(' ')
1024 }
1025
1026 b.WriteString("zoekt.SearchOptions{ ")
1027
1028 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1029 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1030 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1031 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1032 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1033 addInt("NumContextLines", s.NumContextLines)
1034
1035 addDuration("MaxWallTime", s.MaxWallTime)
1036 addDuration("FlushWallTime", s.FlushWallTime)
1037
1038 addBool("EstimateDocCount", s.EstimateDocCount)
1039 addBool("Whole", s.Whole)
1040 addBool("ChunkMatches", s.ChunkMatches)
1041 addBool("UseBM25Scoring", s.UseBM25Scoring)
1042 addBool("Trace", s.Trace)
1043 addBool("DebugScore", s.DebugScore)
1044
1045 for k, v := range s.SpanContext {
1046 add("SpanContext."+k, strconv.Quote(v))
1047 }
1048
1049 b.WriteByte('}')
1050 return b.String()
1051}
1052
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands a search result to the receiver.
	Send(*SearchResult)
}
1057
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// whose Send method calls f.
type SenderFunc func(result *SearchResult)
1062
1063func (f SenderFunc) Send(result *SearchResult) {
1064 f(result)
1065}
1066
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch runs the query q with the given options and delivers
	// results through sender rather than returning them.
	// NOTE(review): presumably sender.Send may be called several times per
	// search — confirm.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}