fork of https://github.com/sourcegraph/zoekt
1// Copyright 2016 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt // import "github.com/sourcegraph/zoekt"
16
17import (
18 "context"
19 "encoding/json"
20 "errors"
21 "fmt"
22 "math"
23 "reflect"
24 "strconv"
25 "strings"
26 "time"
27
28 "github.com/sourcegraph/zoekt/query"
29)
30
31const (
32 mapHeaderBytes uint64 = 48
33 sliceHeaderBytes uint64 = 24
34 stringHeaderBytes uint64 = 16
35 pointerSize uint64 = 8
36)
37
38// FileMatch contains all the matches within a file.
39type FileMatch struct {
40 FileName string
41
42 // Repository is the globally unique name of the repo of the
43 // match
44 Repository string
45
46 // SubRepositoryName is the globally unique name of the repo,
47 // if it came from a subrepository
48 SubRepositoryName string `json:",omitempty"`
49
50 // SubRepositoryPath holds the prefix where the subrepository
51 // was mounted.
52 SubRepositoryPath string `json:",omitempty"`
53
54 // Commit SHA1 (hex) of the (sub)repo holding the file.
55 Version string `json:",omitempty"`
56
57 // Detected language of the result.
58 Language string
59
60 // For debugging. Needs DebugScore set, but public so tests in
61 // other packages can print some diagnostics.
62 Debug string `json:",omitempty"`
63
64 Branches []string `json:",omitempty"`
65
66 // One of LineMatches or ChunkMatches will be returned depending on whether
67 // the SearchOptions.ChunkMatches is set.
68 LineMatches []LineMatch `json:",omitempty"`
69 ChunkMatches []ChunkMatch `json:",omitempty"`
70
71 // Only set if requested
72 Content []byte `json:",omitempty"`
73
74 // Checksum of the content.
75 Checksum []byte
76
77 // Ranking; the higher, the better.
78 Score float64 `json:",omitempty"`
79
80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
81 // order results from different repositories relative to each other.
82 RepositoryPriority float64 `json:",omitempty"`
83
84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in
85 // Sourcegraph.
86 RepositoryID uint32 `json:",omitempty"`
87}
88
89func (m *FileMatch) sizeBytes() (sz uint64) {
90 // Score
91 sz += 8
92
93 for _, s := range []string{
94 m.Debug,
95 m.FileName,
96 m.Repository,
97 m.Language,
98 m.SubRepositoryName,
99 m.SubRepositoryPath,
100 m.Version,
101 } {
102 sz += stringHeaderBytes + uint64(len(s))
103 }
104
105 // Branches
106 sz += sliceHeaderBytes
107 for _, s := range m.Branches {
108 sz += stringHeaderBytes + uint64(len(s))
109 }
110
111 // LineMatches
112 sz += sliceHeaderBytes
113 for _, lm := range m.LineMatches {
114 sz += lm.sizeBytes()
115 }
116
117 // ChunkMatches
118 sz += sliceHeaderBytes
119 for _, cm := range m.ChunkMatches {
120 sz += cm.sizeBytes()
121 }
122
123 // RepositoryID
124 sz += 4
125
126 // RepositoryPriority
127 sz += 8
128
129 // Content
130 sz += sliceHeaderBytes + uint64(len(m.Content))
131
132 // Checksum
133 sz += sliceHeaderBytes + uint64(len(m.Checksum))
134
135 return
136}
137
138// AddScore increments the score of the FileMatch by the computed score. If
139// debugScore is true, it also adds a debug string to the FileMatch. If raw is
140// -1, it is ignored. Otherwise, it is added to the debug string.
141func (m *FileMatch) AddScore(what string, computed float64, raw float64, debugScore bool) {
142 if computed != 0 && debugScore {
143 var b strings.Builder
144 fmt.Fprintf(&b, "%s", what)
145 if raw != -1 {
146 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
147 }
148 fmt.Fprintf(&b, ":%.2f, ", computed)
149 m.Debug += b.String()
150 }
151 m.Score += computed
152}
153
154// ChunkMatch is a set of non-overlapping matches within a contiguous range of
155// lines in the file.
156type ChunkMatch struct {
157 DebugScore string
158
159 // Content is a contiguous range of complete lines that fully contains Ranges.
160 // Lines will always include their terminating newline (if it exists).
161 Content []byte
162
163 // Ranges is a set of matching ranges within this chunk. Each range is relative
164 // to the beginning of the file (not the beginning of Content).
165 Ranges []Range
166
167 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
168 // its length will equal that of Ranges. Any of its elements may be nil.
169 SymbolInfo []*Symbol
170
171 // FileName indicates whether this match is a match on the file name, in
172 // which case Content will contain the file name.
173 FileName bool
174
175 // ContentStart is the location (inclusive) of the beginning of content
176 // relative to the beginning of the file. It will always be at the
177 // beginning of a line (Column will always be 1).
178 ContentStart Location
179
180 // Score is the overall relevance score of this chunk.
181 Score float64
182
183 // BestLineMatch is the line number of the highest-scoring line match in this chunk.
184 // The line number represents the index in the full file, and is 1-based. If FileName: true,
185 // this number will be 0.
186 BestLineMatch uint32
187}
188
189func (cm *ChunkMatch) sizeBytes() (sz uint64) {
190 // Content
191 sz += sliceHeaderBytes + uint64(len(cm.Content))
192
193 // ContentStart
194 sz += cm.ContentStart.sizeBytes()
195
196 // FileName
197 sz += 1
198
199 // Ranges
200 sz += sliceHeaderBytes
201 if len(cm.Ranges) > 0 {
202 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
203 }
204
205 // SymbolInfo
206 sz += sliceHeaderBytes
207 for _, si := range cm.SymbolInfo {
208 sz += pointerSize
209 if si != nil {
210 sz += si.sizeBytes()
211 }
212 }
213
214 // Score
215 sz += 8
216
217 // DebugScore
218 sz += stringHeaderBytes + uint64(len(cm.DebugScore))
219
220 return
221}
222
223type Range struct {
224 // The inclusive beginning of the range.
225 Start Location
226 // The exclusive end of the range.
227 End Location
228}
229
230func (r *Range) sizeBytes() uint64 {
231 return r.Start.sizeBytes() + r.End.sizeBytes()
232}
233
234type Location struct {
235 // 0-based byte offset from the beginning of the file
236 ByteOffset uint32
237 // 1-based line number from the beginning of the file
238 LineNumber uint32
239 // 1-based column number (in runes) from the beginning of line
240 Column uint32
241}
242
243func (l *Location) sizeBytes() uint64 {
244 return 3 * 4
245}
246
247// LineMatch holds the matches within a single line in a file.
248type LineMatch struct {
249 // The line in which a match was found.
250 Line []byte
251 // The byte offset of the first byte of the line.
252 LineStart int
253 // The byte offset of the first byte past the end of the line.
254 // This is usually the byte after the terminating newline, but can also be
255 // the end of the file if there is no terminating newline
256 LineEnd int
257 LineNumber int
258
259 // Before and After are only set when SearchOptions.NumContextLines is > 0
260 Before []byte
261 After []byte
262
263 // If set, this was a match on the filename.
264 FileName bool
265
266 // The higher the better. Only ranks the quality of the match
267 // within the file, does not take rank of file into account
268 Score float64
269 DebugScore string
270
271 LineFragments []LineFragmentMatch
272}
273
274func (lm *LineMatch) sizeBytes() (sz uint64) {
275 // Line
276 sz += sliceHeaderBytes + uint64(len(lm.Line))
277
278 // LineStart, LineEnd, LineNumber
279 sz += 3 * 8
280
281 // Before
282 sz += sliceHeaderBytes + uint64(len(lm.Before))
283
284 // After
285 sz += sliceHeaderBytes + uint64(len(lm.After))
286
287 // FileName
288 sz += 1
289
290 // Score
291 sz += 8
292
293 // DebugScore
294 sz += stringHeaderBytes + uint64(len(lm.DebugScore))
295
296 // LineFragments
297 sz += sliceHeaderBytes
298 for _, lf := range lm.LineFragments {
299 sz += lf.sizeBytes()
300 }
301
302 return
303}
304
305type Symbol struct {
306 Sym string
307 Kind string
308 Parent string
309 ParentKind string
310}
311
312func (s *Symbol) sizeBytes() uint64 {
313 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
314}
315
316// LineFragmentMatch a segment of matching text within a line.
317type LineFragmentMatch struct {
318 // Offset within the line, in bytes.
319 LineOffset int
320
321 // Offset from file start, in bytes.
322 Offset uint32
323
324 // Number bytes that match.
325 MatchLength int
326
327 SymbolInfo *Symbol
328}
329
330func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
331 // LineOffset
332 sz += 8
333
334 // Offset
335 sz += 4
336
337 // MatchLength
338 sz += 8
339
340 // SymbolInfo
341 sz += pointerSize
342 if lfm.SymbolInfo != nil {
343 sz += lfm.SymbolInfo.sizeBytes()
344 }
345
346 return
347}
348
349type FlushReason uint8
350
351const (
352 FlushReasonTimerExpired FlushReason = 1 << iota
353 FlushReasonFinalFlush
354 FlushReasonMaxSize
355)
356
357var FlushReasonStrings = map[FlushReason]string{
358 FlushReasonTimerExpired: "timer_expired",
359 FlushReasonFinalFlush: "final_flush",
360 FlushReasonMaxSize: "max_size_reached",
361}
362
363func (fr FlushReason) String() string {
364 if v, ok := FlushReasonStrings[fr]; ok {
365 return v
366 }
367
368 return "none"
369}
370
371// Stats contains interesting numbers on the search
372type Stats struct {
373 // Amount of I/O for reading contents.
374 ContentBytesLoaded int64
375
376 // Amount of I/O for reading from index.
377 IndexBytesLoaded int64
378
379 // Number of search shards that had a crash.
380 Crashes int
381
382 // Wall clock time for this search
383 Duration time.Duration
384
385 // Number of files containing a match.
386 FileCount int
387
388 // Number of files in shards that we considered.
389 ShardFilesConsidered int
390
391 // Files that we evaluated. Equivalent to files for which all
392 // atom matches (including negations) evaluated to true.
393 FilesConsidered int
394
395 // Files for which we loaded file content to verify substring matches
396 FilesLoaded int
397
398 // Candidate files whose contents weren't examined because we
399 // gathered enough matches.
400 FilesSkipped int
401
402 // Shards that we scanned to find matches.
403 ShardsScanned int
404
405 // Shards that we did not process because a query was canceled.
406 ShardsSkipped int
407
408 // Shards that we did not process because the query was rejected by the
409 // ngram filter indicating it had no matches.
410 ShardsSkippedFilter int
411
412 // Number of non-overlapping matches
413 MatchCount int
414
415 // Number of candidate matches as a result of searching ngrams.
416 NgramMatches int
417
418 // NgramLookups is the number of times we accessed an ngram in the index.
419 NgramLookups int
420
421 // Wall clock time for queued search.
422 Wait time.Duration
423
424 // Aggregate wall clock time spent constructing and pruning the match tree.
425 // This accounts for time such as lookups in the trigram index.
426 MatchTreeConstruction time.Duration
427
428 // Aggregate wall clock time spent searching the match tree. This accounts
429 // for the bulk of search work done looking for matches.
430 MatchTreeSearch time.Duration
431
432 // Number of times regexp was called on files that we evaluated.
433 RegexpsConsidered int
434
435 // FlushReason explains why results were flushed.
436 FlushReason FlushReason
437}
438
439func (s *Stats) sizeBytes() (sz uint64) {
440 sz = 16 * 8 // This assumes we are running on a 64-bit architecture
441 sz += 1 // FlushReason
442
443 return
444}
445
446func (s *Stats) Add(o Stats) {
447 s.ContentBytesLoaded += o.ContentBytesLoaded
448 s.IndexBytesLoaded += o.IndexBytesLoaded
449 s.Crashes += o.Crashes
450 s.FileCount += o.FileCount
451 s.FilesConsidered += o.FilesConsidered
452 s.FilesLoaded += o.FilesLoaded
453 s.FilesSkipped += o.FilesSkipped
454 s.MatchCount += o.MatchCount
455 s.NgramMatches += o.NgramMatches
456 s.NgramLookups += o.NgramLookups
457 s.ShardFilesConsidered += o.ShardFilesConsidered
458 s.ShardsScanned += o.ShardsScanned
459 s.ShardsSkipped += o.ShardsSkipped
460 s.ShardsSkippedFilter += o.ShardsSkippedFilter
461 s.Wait += o.Wait
462 s.MatchTreeConstruction += o.MatchTreeConstruction
463 s.MatchTreeSearch += o.MatchTreeSearch
464 s.RegexpsConsidered += o.RegexpsConsidered
465
466 // We want the first non-zero FlushReason to be sticky. This is a useful
467 // property when aggregating stats from several Zoekts.
468 if s.FlushReason == 0 {
469 s.FlushReason = o.FlushReason
470 }
471}
472
473// Zero returns true if stats is empty.
474func (s *Stats) Zero() bool {
475 if s == nil {
476 return true
477 }
478
479 return !(s.ContentBytesLoaded > 0 ||
480 s.IndexBytesLoaded > 0 ||
481 s.Crashes > 0 ||
482 s.FileCount > 0 ||
483 s.FilesConsidered > 0 ||
484 s.FilesLoaded > 0 ||
485 s.FilesSkipped > 0 ||
486 s.MatchCount > 0 ||
487 s.NgramMatches > 0 ||
488 s.NgramLookups > 0 ||
489 s.ShardFilesConsidered > 0 ||
490 s.ShardsScanned > 0 ||
491 s.ShardsSkipped > 0 ||
492 s.ShardsSkippedFilter > 0 ||
493 s.Wait > 0 ||
494 s.MatchTreeConstruction > 0 ||
495 s.MatchTreeSearch > 0 ||
496 s.RegexpsConsidered > 0)
497}
498
499// Progress contains information about the global progress of the running search query.
500// This is used by the frontend to reorder results and emit them when stable.
501// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
502type Progress struct {
503 // Priority of the shard that was searched.
504 Priority float64
505
506 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
507 // This is used to reorder results when the result set is known to be stable-- that is, when a result's
508 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
509 //
510 // MaxPendingPriority decreases monotonically in each SearchResult.
511 MaxPendingPriority float64
512}
513
514func (p *Progress) sizeBytes() uint64 {
515 return 2 * 8
516}
517
518// SearchResult contains search matches and extra data
519type SearchResult struct {
520 Stats
521
522 // Do not encode this as we cannot encode -Inf in JSON
523 Progress `json:"-"`
524
525 Files []FileMatch
526
527 // RepoURLs holds a repo => template string map.
528 RepoURLs map[string]string
529
530 // FragmentNames holds a repo => template string map, for
531 // the line number fragment.
532 LineFragments map[string]string
533}
534
535// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
536// The estimate does not take alignment into account. The result is a lower
537// bound on the actual size in memory.
538func (sr *SearchResult) SizeBytes() (sz uint64) {
539 sz += sr.Stats.sizeBytes()
540 sz += sr.Progress.sizeBytes()
541
542 // Files
543 sz += sliceHeaderBytes
544 for _, f := range sr.Files {
545 sz += f.sizeBytes()
546 }
547
548 // RepoURLs
549 sz += mapHeaderBytes
550 for k, v := range sr.RepoURLs {
551 sz += stringHeaderBytes + uint64(len(k))
552 sz += stringHeaderBytes + uint64(len(v))
553 }
554
555 // LineFragments
556 sz += mapHeaderBytes
557 for k, v := range sr.LineFragments {
558 sz += stringHeaderBytes + uint64(len(k))
559 sz += stringHeaderBytes + uint64(len(v))
560 }
561
562 return
563}
564
565// RepositoryBranch describes an indexed branch, which is a name
566// combined with a version.
567type RepositoryBranch struct {
568 Name string
569 Version string
570}
571
572func (r RepositoryBranch) String() string {
573 return fmt.Sprintf("%s@%s", r.Name, r.Version)
574}
575
576// Repository holds repository metadata.
577type Repository struct {
578 // Sourcegraph's tenant ID
579 TenantID int
580
581 // Sourcegraph's repository ID
582 ID uint32
583
584 // The repository name
585 Name string
586
587 // The repository URL.
588 URL string
589
590 // The physical source where this repo came from, eg. full
591 // path to the zip filename or git repository directory. This
592 // will not be exposed in the UI, but can be used to detect
593 // orphaned index shards.
594 Source string
595
596 // The branches indexed in this repo.
597 Branches []RepositoryBranch
598
599 // Nil if this is not the super project.
600 SubRepoMap map[string]*Repository
601
602 // URL template to link to the commit of a branch
603 CommitURLTemplate string
604
605 // The repository URL for getting to a file. Has access to
606 // {{.Version}}, {{.Path}}
607 FileURLTemplate string
608
609 // The URL fragment to add to a file URL for line numbers. has
610 // access to {{.LineNumber}}. The fragment should include the
611 // separator, generally '#' or ';'.
612 LineFragmentTemplate string
613
614 // Perf optimization: priority is set when we load the shard. It corresponds to
615 // the value of "priority" stored in RawConfig.
616 priority float64
617
618 // All zoekt.* configuration settings.
619 RawConfig map[string]string
620
621 // Importance of the repository, bigger is more important
622 Rank uint16
623
624 // IndexOptions is a hash of the options used to create the index for the
625 // repo.
626 IndexOptions string
627
628 // HasSymbols is true if this repository has indexed ctags
629 // output. Sourcegraph specific: This field is more appropriate for
630 // IndexMetadata. However, we store it here since the Sourcegraph frontend
631 // can read this structure but not IndexMetadata.
632 HasSymbols bool
633
634 // Tombstone is true if we are not allowed to search this repo.
635 Tombstone bool
636
637 // LatestCommitDate is the date of the latest commit among all indexed Branches.
638 // The date might be time.Time's 0-value if the repository was last indexed
639 // before this field was added.
640 LatestCommitDate time.Time
641
642 // FileTombstones is a set of file paths that should be ignored across all branches
643 // in this shard.
644 FileTombstones map[string]struct{} `json:",omitempty"`
645}
646
647func (r *Repository) UnmarshalJSON(data []byte) error {
648 // We define a new type so that we can use json.Unmarshal
649 // without recursing into this same method.
650 type repository *Repository
651 repo := repository(r)
652
653 err := json.Unmarshal(data, repo)
654 if err != nil {
655 return err
656 }
657
658 if v, ok := repo.RawConfig["repoid"]; ok {
659 id, _ := strconv.ParseUint(v, 10, 32)
660 r.ID = uint32(id)
661 }
662
663 if v, ok := repo.RawConfig["tenantID"]; ok {
664 id, _ := strconv.ParseInt(v, 10, 64)
665 r.TenantID = int(id)
666 }
667
668 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
669 // on read instead of during indexing allows us to avoid a complete reindex.
670 //
671 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
672 // backwards compatibility.
673 if _, ok := repo.RawConfig["latestCommitDate"]; ok {
674 // We use the number of months since 1970 as a simple measure of repo freshness.
675 // It is monotonically increasing and stable across re-indexes and restarts.
676 r.Rank = monthsSince1970(repo.LatestCommitDate)
677 } else if v, ok := repo.RawConfig["priority"]; ok {
678 r.priority, err = strconv.ParseFloat(v, 64)
679 if err != nil {
680 r.priority = 0
681 }
682
683 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here
684 // based on priority. Setting it on read instead of during indexing
685 // allows us to avoid a complete reindex.
686 if r.Rank == 0 && r.priority > 0 {
687 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
688 // This means popular repos (roughly ones with over 5,000 stars) see diminishing
689 // returns from more stars.
690 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16)
691 }
692 }
693
694 return nil
695}
696
697func (r *Repository) GetPriority() float64 {
698 return r.priority
699}
700
701// monthsSince1970 returns the number of months since 1970. It returns values in
702// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
703// lower bound for all dates before 1970.
704func monthsSince1970(t time.Time) uint16 {
705 base := time.Unix(0, 0)
706 if t.Before(base) {
707 return 0
708 }
709 months := int(t.Year()-1970)*12 + int(t.Month()-1)
710 return uint16(min(months, math.MaxUint16))
711}
712
713// MergeMutable will merge x into r. mutated will be true if it made any
714// changes. err is non-nil if we needed to mutate an immutable field.
715//
716// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
717// computed while indexing so can't be synthesized from x.
718//
719// Note: We ignore RawConfig fields which are duplicated into Repository:
720// name and id.
721func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
722 if r.ID != x.ID {
723 // Sourcegraph: strange behaviour may occur if ID changes but names don't.
724 return mutated, errors.New("ID is immutable")
725 }
726 if r.Name != x.Name {
727 // Name is encoded into the shard name on disk. We need to re-index if it
728 // changes.
729 return mutated, errors.New("Name is immutable")
730 }
731 if !reflect.DeepEqual(r.Branches, x.Branches) {
732 // Need a reindex if content changing.
733 return mutated, errors.New("Branches is immutable")
734 }
735
736 for k, v := range x.RawConfig {
737 // We ignore name and id since they are encoded into the repository.
738 if k == "name" || k == "id" {
739 continue
740 }
741 if r.RawConfig == nil {
742 mutated = true
743 r.RawConfig = make(map[string]string)
744 }
745 if r.RawConfig[k] != v {
746 mutated = true
747 r.RawConfig[k] = v
748 }
749 }
750
751 if r.URL != x.URL {
752 mutated = true
753 r.URL = x.URL
754 }
755 if r.CommitURLTemplate != x.CommitURLTemplate {
756 mutated = true
757 r.CommitURLTemplate = x.CommitURLTemplate
758 }
759 if r.FileURLTemplate != x.FileURLTemplate {
760 mutated = true
761 r.FileURLTemplate = x.FileURLTemplate
762 }
763 if r.LineFragmentTemplate != x.LineFragmentTemplate {
764 mutated = true
765 r.LineFragmentTemplate = x.LineFragmentTemplate
766 }
767
768 return mutated, nil
769}
770
771// IndexMetadata holds metadata stored in the index file. It contains
772// data generated by the core indexing library.
773type IndexMetadata struct {
774 IndexFormatVersion int
775 IndexFeatureVersion int
776 IndexMinReaderVersion int
777 IndexTime time.Time
778 PlainASCII bool
779 LanguageMap map[string]uint16
780 ZoektVersion string
781 ID string
782}
783
784// Statistics of a (collection of) repositories.
785type RepoStats struct {
786 // Repos is used for aggregrating the number of repositories.
787 //
788 // Note: This field is not populated on RepoListEntry.Stats (individual) but
789 // only for RepoList.Stats (aggregate).
790 Repos int
791
792 // Shards is the total number of search shards.
793 Shards int
794
795 // Documents holds the number of documents or files.
796 Documents int
797
798 // IndexBytes is the amount of RAM used for index overhead.
799 IndexBytes int64
800
801 // ContentBytes is the amount of RAM used for raw content.
802 ContentBytes int64
803
804 // Sourcegraph specific stats below. These are not as efficient to calculate
805 // as the above statistics. We experimentally measured about a 10% slower
806 // shard load time. However, we find these values very useful to track and
807 // computing them outside of load time introduces a lot of complexity.
808
809 // NewLinesCount is the number of newlines "\n" that appear in the zoekt
810 // indexed documents. This is not exactly the same as line count, since it
811 // will not include lines not terminated by "\n" (eg a file with no "\n", or
812 // a final line without "\n"). Note: Zoekt deduplicates documents across
813 // branches, so if a path has the same contents on multiple branches, there
814 // is only one document for it. As such that document's newlines is only
815 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
816 // for counts which do not deduplicate.
817 NewLinesCount uint64
818
819 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default
820 // branch.
821 DefaultBranchNewLinesCount uint64
822
823 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
824 // except the default branch.
825 OtherBranchesNewLinesCount uint64
826}
827
828func (s *RepoStats) Add(o *RepoStats) {
829 // can't update Repos, since one repo may have multiple
830 // shards.
831 s.Shards += o.Shards
832 s.IndexBytes += o.IndexBytes
833 s.Documents += o.Documents
834 s.ContentBytes += o.ContentBytes
835
836 // Sourcegraph specific
837 s.NewLinesCount += o.NewLinesCount
838 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
839 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
840}
841
842type RepoListEntry struct {
843 Repository Repository
844 IndexMetadata IndexMetadata
845 Stats RepoStats
846}
847
848// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
849// performance profiling of sourcegraph.com revealed that querying this
850// information from Zoekt was causing lots of CPU and memory usage. Note: we
851// can revisit this, how we store and query this information has changed a lot
852// since this was introduced.
853type MinimalRepoListEntry struct {
854 // HasSymbols is exported since Sourcegraph uses this information at search
855 // planning time to decide between Zoekt and an unindexed symbol search.
856 //
857 // Note: it pretty much is always true in practice.
858 HasSymbols bool
859
860 // Branches is used by Sourcegraphs query planner to decided if it can use
861 // zoekt or go via an unindexed code path.
862 Branches []RepositoryBranch
863
864 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds
865 // since the epoch). This is to make it clear we are not transporting the
866 // full fidelty timestamp (ie with milliseconds and location). Additionally
867 // it saves 16 bytes in this struct.
868 //
869 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
870 // how many repositories need updating after a ranking change/etc.
871 //
872 // TODO(keegancsmith) audit updates to IndexTime and document how and when
873 // it changes. Concerned about things like metadata updates or compound
874 // shards leading to untrustworthy data here.
875 IndexTimeUnix int64
876}
877
878type ReposMap map[uint32]MinimalRepoListEntry
879
880// MarshalBinary implements a specialized encoder for ReposMap.
881func (q *ReposMap) MarshalBinary() ([]byte, error) {
882 return reposMapEncode(*q)
883}
884
885// UnmarshalBinary implements a specialized decoder for ReposMap.
886func (q *ReposMap) UnmarshalBinary(b []byte) error {
887 var err error
888 (*q), err = reposMapDecode(b)
889 return err
890}
891
892// RepoList holds a set of Repository metadata.
893type RepoList struct {
894 // Returned when ListOptions.Field is RepoListFieldRepos.
895 Repos []*RepoListEntry
896
897 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
898 ReposMap ReposMap
899
900 Crashes int
901
902 // Stats response to a List request.
903 // This is the aggregate RepoStats of all repos matching the input query.
904 Stats RepoStats
905}
906
907type Searcher interface {
908 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
909
910 // List lists repositories. The query `q` can only contain
911 // query.Repo atoms.
912 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
913 Close()
914
915 // Describe the searcher for debug messages.
916 String() string
917}
918
919type RepoListField int
920
921const (
922 RepoListFieldRepos RepoListField = 0
923 RepoListFieldReposMap = 2
924)
925
926type ListOptions struct {
927 // Field decides which field to populate in RepoList response.
928 Field RepoListField
929}
930
931func (o *ListOptions) GetField() (RepoListField, error) {
932 if o == nil {
933 return RepoListFieldRepos, nil
934 }
935 switch o.Field {
936 case RepoListFieldRepos, RepoListFieldReposMap:
937 return o.Field, nil
938 case 1:
939 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
940 default:
941 return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
942 }
943}
944
945func (o *ListOptions) String() string {
946 return fmt.Sprintf("%#v", o)
947}
948
949type SearchOptions struct {
950 // Return an upper-bound estimate of eligible documents in
951 // stats.ShardFilesConsidered.
952 EstimateDocCount bool
953
954 // Return the whole file.
955 Whole bool
956
957 // Maximum number of matches: skip all processing an index
958 // shard after we found this many non-overlapping matches.
959 ShardMaxMatchCount int
960
961 // Maximum number of matches: stop looking for more matches
962 // once we have this many matches across shards.
963 TotalMaxMatchCount int
964
965 // Maximum number of matches: skip processing documents for a repository in
966 // a shard once we have found ShardRepoMaxMatchCount.
967 //
968 // A compound shard may contain multiple repositories. This will most often
969 // be set to 1 to find all repositories containing a result.
970 ShardRepoMaxMatchCount int
971
972 // Abort the search after this much time has passed.
973 MaxWallTime time.Duration
974
975 // FlushWallTime if non-zero will stop streaming behaviour at first and
976 // instead will collate and sort results. At FlushWallTime the results will
977 // be sent and then the behaviour will revert to the normal streaming.
978 FlushWallTime time.Duration
979
980 // Truncates the number of documents (i.e. files) after collating and
981 // sorting the results.
982 MaxDocDisplayCount int
983
984 // Truncates the number of matchs after collating and sorting the results.
985 MaxMatchDisplayCount int
986
987 // If set to a number greater than zero then up to this many number
988 // of context lines will be added before and after each matched line.
989 // Note that the included context lines might contain matches and
990 // it's up to the consumer of the result to remove those lines.
991 NumContextLines int
992
993 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
994 // EXPERIMENTAL: the behavior of this flag may be changed in future versions.
995 ChunkMatches bool
996
997 // EXPERIMENTAL. If true, use text-search style scoring instead of the default
998 // scoring formula. The scoring algorithm treats each match in a file as a term
999 // and computes an approximation to BM25. When enabled, BM25 scoring is used for
1000 // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores.
1001 //
1002 // The calculation of IDF assumes that Zoekt visits all documents containing any
1003 // of the query terms during evaluation. This is true, for example, if all query
1004 // terms are ORed together.
1005 //
1006 // When enabled, all other scoring signals are ignored, including document ranks.
1007 UseBM25Scoring bool
1008
1009 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as
1010 // a command-line flag
1011 Trace bool
1012
1013 // If set, the search results will contain debug information for scoring.
1014 DebugScore bool
1015
1016 // SpanContext is the opentracing span context, if it exists, from the zoekt client
1017 SpanContext map[string]string
1018}
1019
1020func (o *SearchOptions) SetDefaults() {
1021 if o.ShardMaxMatchCount == 0 {
1022 // We cap the total number of matches, so overly broad
1023 // searches don't crash the machine.
1024 o.ShardMaxMatchCount = 100000
1025 }
1026 if o.TotalMaxMatchCount == 0 {
1027 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
1028 }
1029}
1030
1031// String returns a succinct representation of the options. This is meant for
1032// human consumption in logs and traces.
1033//
1034// Note: some tracing systems have limits on length of values, so we take care
1035// to try and make this small, and include the important information near the
1036// front incase of truncation.
1037func (s *SearchOptions) String() string {
1038 var b strings.Builder
1039
1040 add := func(name, value string) {
1041 b.WriteString(name)
1042 b.WriteByte('=')
1043 b.WriteString(value)
1044 b.WriteByte(' ')
1045 }
1046 addInt := func(name string, value int) {
1047 if value != 0 {
1048 add(name, strconv.Itoa(value))
1049 }
1050 }
1051 addDuration := func(name string, value time.Duration) {
1052 if value != 0 {
1053 add(name, value.String())
1054 }
1055 }
1056 addBool := func(name string, value bool) {
1057 if !value {
1058 return
1059 }
1060 b.WriteString(name)
1061 b.WriteByte(' ')
1062 }
1063
1064 b.WriteString("zoekt.SearchOptions{ ")
1065
1066 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1067 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1068 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1069 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1070 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1071 addInt("NumContextLines", s.NumContextLines)
1072
1073 addDuration("MaxWallTime", s.MaxWallTime)
1074 addDuration("FlushWallTime", s.FlushWallTime)
1075
1076 addBool("EstimateDocCount", s.EstimateDocCount)
1077 addBool("Whole", s.Whole)
1078 addBool("ChunkMatches", s.ChunkMatches)
1079 addBool("UseBM25Scoring", s.UseBM25Scoring)
1080 addBool("Trace", s.Trace)
1081 addBool("DebugScore", s.DebugScore)
1082
1083 for k, v := range s.SpanContext {
1084 add("SpanContext."+k, strconv.Quote(v))
1085 }
1086
1087 b.WriteByte('}')
1088 return b.String()
1089}
1090
1091// Sender is the interface that wraps the basic Send method.
1092type Sender interface {
1093 Send(*SearchResult)
1094}
1095
1096// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
1097// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
1098// that calls f.
1099type SenderFunc func(result *SearchResult)
1100
1101func (f SenderFunc) Send(result *SearchResult) {
1102 f(result)
1103}
1104
1105// Streamer adds the method StreamSearch to the Searcher interface.
1106type Streamer interface {
1107 Searcher
1108 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1109}