api.go at 1adab43b4c6b28f927229d6ef0908daad2404875 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 1adab43b4c6b28f927229d6ef0908daad2404875 29 kB View raw
Keegan Carruthers-Smith api: implement succinct output for SearchOptions.String (#719) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
  30const mapHeaderBytes uint64 = 48
  31const sliceHeaderBytes uint64 = 24
  32const stringHeaderBytes uint64 = 16
  33const pointerSize uint64 = 8
  34const interfaceBytes uint64 = 16
  35
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the matched file.
	// NOTE(review): presumably a path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository
	SubRepositoryName string

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string

	// Commit SHA1 (hex) of the (sub)repo holding the file.
	Version string

	// Detected language of the result.
	Language string

	// For debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string

	// Branches lists branch names associated with this match.
	// NOTE(review): presumably the indexed branches that contain the file — confirm.
	Branches []string

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch
	ChunkMatches []ChunkMatch

	// Only set if requested
	Content []byte

	// Checksum of the content.
	Checksum []byte

	// Ranking; the higher, the better.
	Score float64 // TODO - hide this field?

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32
}
  86
  87func (m *FileMatch) sizeBytes() (sz uint64) {
  88	// Score
  89	sz += 8
  90
  91	for _, s := range []string{
  92		m.Debug,
  93		m.FileName,
  94		m.Repository,
  95		m.Language,
  96		m.SubRepositoryName,
  97		m.SubRepositoryPath,
  98		m.Version,
  99	} {
 100		sz += stringHeaderBytes + uint64(len(s))
 101	}
 102
 103	// Branches
 104	sz += sliceHeaderBytes
 105	for _, s := range m.Branches {
 106		sz += stringHeaderBytes + uint64(len(s))
 107	}
 108
 109	// LineMatches
 110	sz += sliceHeaderBytes
 111	for _, lm := range m.LineMatches {
 112		sz += lm.sizeBytes()
 113	}
 114
 115	// ChunkMatches
 116	sz += sliceHeaderBytes
 117	for _, cm := range m.ChunkMatches {
 118		sz += cm.sizeBytes()
 119	}
 120
 121	// RepositoryID
 122	sz += 4
 123
 124	// RepositoryPriority
 125	sz += 8
 126
 127	// Content
 128	sz += sliceHeaderBytes + uint64(len(m.Content))
 129
 130	// Checksum
 131	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 132
 133	return
 134}
 135
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore carries scoring diagnostics as text.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score ranks this chunk.
	// NOTE(review): presumably higher is better, as for LineMatch.Score — confirm.
	Score float64
}
 163
 164func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 165	// Content
 166	sz += sliceHeaderBytes + uint64(len(cm.Content))
 167
 168	// ContentStart
 169	sz += cm.ContentStart.sizeBytes()
 170
 171	// FileName
 172	sz += 1
 173
 174	// Ranges
 175	sz += sliceHeaderBytes
 176	if len(cm.Ranges) > 0 {
 177		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 178	}
 179
 180	// SymbolInfo
 181	sz += sliceHeaderBytes
 182	for _, si := range cm.SymbolInfo {
 183		sz += pointerSize
 184		if si != nil {
 185			sz += si.sizeBytes()
 186		}
 187	}
 188
 189	// Score
 190	sz += 8
 191
 192	// DebugScore
 193	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 194
 195	return
 196}
 197
 198type Range struct {
 199	// The inclusive beginning of the range.
 200	Start Location
 201	// The exclusive end of the range.
 202	End Location
 203}
 204
 205func (r *Range) sizeBytes() uint64 {
 206	return r.Start.sizeBytes() + r.End.sizeBytes()
 207}
 208
 209type Location struct {
 210	// 0-based byte offset from the beginning of the file
 211	ByteOffset uint32
 212	// 1-based line number from the beginning of the file
 213	LineNumber uint32
 214	// 1-based column number (in runes) from the beginning of line
 215	Column uint32
 216}
 217
 218func (l *Location) sizeBytes() uint64 {
 219	return 3 * 4
 220}
 221
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// LineStart, LineEnd and LineNumber locate Line within the file.
	// NOTE(review): presumably LineStart/LineEnd are byte offsets and
	// LineNumber is 1-based — confirm against the code that fills them in.
	LineStart  int
	LineEnd    int
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score float64
	// DebugScore carries scoring diagnostics as text.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is set — confirm.
	DebugScore string

	// LineFragments holds the individual matching segments within Line.
	LineFragments []LineFragmentMatch
}
 244
 245func (lm *LineMatch) sizeBytes() (sz uint64) {
 246	// Line
 247	sz += sliceHeaderBytes + uint64(len(lm.Line))
 248
 249	// LineStart, LineEnd, LineNumber
 250	sz += 3 * 8
 251
 252	// Before
 253	sz += sliceHeaderBytes + uint64(len(lm.Before))
 254
 255	// After
 256	sz += sliceHeaderBytes + uint64(len(lm.After))
 257
 258	// FileName
 259	sz += 1
 260
 261	// Score
 262	sz += 8
 263
 264	// DebugScore
 265	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 266
 267	// LineFragments
 268	sz += sliceHeaderBytes
 269	for _, lf := range lm.LineFragments {
 270		sz += lf.sizeBytes()
 271	}
 272
 273	return
 274}
 275
 276type Symbol struct {
 277	Sym        string
 278	Kind       string
 279	Parent     string
 280	ParentKind string
 281}
 282
 283func (s *Symbol) sizeBytes() uint64 {
 284	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 285}
 286
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is optional symbol information for this fragment; it may be nil.
	SymbolInfo *Symbol
}
 300
 301func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 302	// LineOffset
 303	sz += 8
 304
 305	// Offset
 306	sz += 4
 307
 308	// MatchLength
 309	sz += 8
 310
 311	// SymbolInfo
 312	sz += pointerSize
 313	if lfm.SymbolInfo != nil {
 314		sz += lfm.SymbolInfo.sizeBytes()
 315	}
 316
 317	return
 318}
 319
 320type FlushReason uint8
 321
 322const (
 323	FlushReasonTimerExpired FlushReason = 1 << iota
 324	FlushReasonFinalFlush
 325	FlushReasonMaxSize
 326)
 327
 328var FlushReasonStrings = map[FlushReason]string{
 329	FlushReasonTimerExpired: "timer_expired",
 330	FlushReasonFinalFlush:   "final_flush",
 331	FlushReasonMaxSize:      "max_size_reached",
 332}
 333
 334func (fr FlushReason) String() string {
 335	if v, ok := FlushReasonStrings[fr]; ok {
 336		return v
 337	}
 338
 339	return "none"
 340}
 341
// Stats contains interesting numbers on the search
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search
	//
	// Note: Add does not aggregate this field and Zero does not inspect it.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	//
	// Note: Add keeps the first non-zero value and Zero does not inspect it.
	FlushReason FlushReason
}
 409
 410func (s *Stats) sizeBytes() (sz uint64) {
 411	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 412	sz += 1     // FlushReason
 413
 414	return
 415}
 416
 417func (s *Stats) Add(o Stats) {
 418	s.ContentBytesLoaded += o.ContentBytesLoaded
 419	s.IndexBytesLoaded += o.IndexBytesLoaded
 420	s.Crashes += o.Crashes
 421	s.FileCount += o.FileCount
 422	s.FilesConsidered += o.FilesConsidered
 423	s.FilesLoaded += o.FilesLoaded
 424	s.FilesSkipped += o.FilesSkipped
 425	s.MatchCount += o.MatchCount
 426	s.NgramMatches += o.NgramMatches
 427	s.NgramLookups += o.NgramLookups
 428	s.ShardFilesConsidered += o.ShardFilesConsidered
 429	s.ShardsScanned += o.ShardsScanned
 430	s.ShardsSkipped += o.ShardsSkipped
 431	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 432	s.Wait += o.Wait
 433	s.MatchTreeConstruction += o.MatchTreeConstruction
 434	s.MatchTreeSearch += o.MatchTreeSearch
 435	s.RegexpsConsidered += o.RegexpsConsidered
 436
 437	// We want the first non-zero FlushReason to be sticky. This is a useful
 438	// property when aggregating stats from several Zoekts.
 439	if s.FlushReason == 0 {
 440		s.FlushReason = o.FlushReason
 441	}
 442}
 443
 444// Zero returns true if stats is empty.
 445func (s *Stats) Zero() bool {
 446	if s == nil {
 447		return true
 448	}
 449
 450	return !(s.ContentBytesLoaded > 0 ||
 451		s.IndexBytesLoaded > 0 ||
 452		s.Crashes > 0 ||
 453		s.FileCount > 0 ||
 454		s.FilesConsidered > 0 ||
 455		s.FilesLoaded > 0 ||
 456		s.FilesSkipped > 0 ||
 457		s.MatchCount > 0 ||
 458		s.NgramMatches > 0 ||
 459		s.NgramLookups > 0 ||
 460		s.ShardFilesConsidered > 0 ||
 461		s.ShardsScanned > 0 ||
 462		s.ShardsSkipped > 0 ||
 463		s.ShardsSkippedFilter > 0 ||
 464		s.Wait > 0 ||
 465		s.MatchTreeConstruction > 0 ||
 466		s.MatchTreeSearch > 0 ||
 467		s.RegexpsConsidered > 0)
 468}
 469
 470// Progress contains information about the global progress of the running search query.
 471// This is used by the frontend to reorder results and emit them when stable.
 472// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 473type Progress struct {
 474	// Priority of the shard that was searched.
 475	Priority float64
 476
 477	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 478	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 479	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 480	//
 481	// MaxPendingPriority decreases monotonically in each SearchResult.
 482	MaxPendingPriority float64
 483}
 484
 485func (p *Progress) sizeBytes() uint64 {
 486	return 2 * 8
 487}
 488
// SearchResult contains search matches and extra data
type SearchResult struct {
	// Stats is embedded, so its fields and methods are promoted onto
	// SearchResult.
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds the file matches of this result.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 505
 506// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 507// The estimate does not take alignment into account. The result is a lower
 508// bound on the actual size in memory.
 509func (sr *SearchResult) SizeBytes() (sz uint64) {
 510	sz += sr.Stats.sizeBytes()
 511	sz += sr.Progress.sizeBytes()
 512
 513	// Files
 514	sz += sliceHeaderBytes
 515	for _, f := range sr.Files {
 516		sz += f.sizeBytes()
 517	}
 518
 519	// RepoURLs
 520	sz += mapHeaderBytes
 521	for k, v := range sr.RepoURLs {
 522		sz += stringHeaderBytes + uint64(len(k))
 523		sz += stringHeaderBytes + uint64(len(v))
 524	}
 525
 526	// LineFragments
 527	sz += mapHeaderBytes
 528	for k, v := range sr.LineFragments {
 529		sz += stringHeaderBytes + uint64(len(k))
 530		sz += stringHeaderBytes + uint64(len(v))
 531	}
 532
 533	return
 534}
 535
 536// RepositoryBranch describes an indexed branch, which is a name
 537// combined with a version.
 538type RepositoryBranch struct {
 539	Name    string
 540	Version string
 541}
 542
 543func (r RepositoryBranch) String() string {
 544	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 545}
 546
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID
	//
	// Note: UnmarshalJSON overwrites this from RawConfig["repoid"] when that
	// key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	//
	// Note: UnmarshalJSON derives this from priority when it is zero and the
	// priority is positive.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 614
 615func (r *Repository) UnmarshalJSON(data []byte) error {
 616	// We define a new type so that we can use json.Unmarshal
 617	// without recursing into this same method.
 618	type repository *Repository
 619	repo := repository(r)
 620
 621	err := json.Unmarshal(data, repo)
 622	if err != nil {
 623		return err
 624	}
 625
 626	if v, ok := repo.RawConfig["repoid"]; ok {
 627		id, _ := strconv.ParseUint(v, 10, 32)
 628		r.ID = uint32(id)
 629	}
 630
 631	if v, ok := repo.RawConfig["priority"]; ok {
 632		r.priority, err = strconv.ParseFloat(v, 64)
 633		if err != nil {
 634			r.priority = 0
 635		}
 636
 637		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 638		// based on priority. Setting it on read instead of during indexing
 639		// allows us to avoid a complete reindex.
 640		if r.Rank == 0 && r.priority > 0 {
 641			// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
 642			// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
 643			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 644		}
 645	}
 646	return nil
 647}
 648
 649// MergeMutable will merge x into r. mutated will be true if it made any
 650// changes. err is non-nil if we needed to mutate an immutable field.
 651//
 652// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 653// computed while indexing so can't be synthesized from x.
 654//
 655// Note: We ignore RawConfig fields which are duplicated into Repository:
 656// name and id.
 657//
 658// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
 659func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 660	if r.ID != x.ID {
 661		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 662		return mutated, errors.New("ID is immutable")
 663	}
 664	if r.Name != x.Name {
 665		// Name is encoded into the shard name on disk. We need to re-index if it
 666		// changes.
 667		return mutated, errors.New("Name is immutable")
 668	}
 669	if !reflect.DeepEqual(r.Branches, x.Branches) {
 670		// Need a reindex if content changing.
 671		return mutated, errors.New("Branches is immutable")
 672	}
 673
 674	for k, v := range x.RawConfig {
 675		// We ignore name and id since they are encoded into the repository.
 676		if k == "name" || k == "id" {
 677			continue
 678		}
 679		if r.RawConfig == nil {
 680			mutated = true
 681			r.RawConfig = make(map[string]string)
 682		}
 683		if r.RawConfig[k] != v {
 684			mutated = true
 685			r.RawConfig[k] = v
 686		}
 687	}
 688
 689	return mutated, nil
 690}
 691
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are not documented in this file; their
// exact semantics should be confirmed against the indexing code.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
 704
 705// Statistics of a (collection of) repositories.
 706type RepoStats struct {
 707	// Repos is used for aggregrating the number of repositories.
 708	//
 709	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 710	// only for RepoList.Stats (aggregate).
 711	Repos int
 712
 713	// Shards is the total number of search shards.
 714	Shards int
 715
 716	// Documents holds the number of documents or files.
 717	Documents int
 718
 719	// IndexBytes is the amount of RAM used for index overhead.
 720	IndexBytes int64
 721
 722	// ContentBytes is the amount of RAM used for raw content.
 723	ContentBytes int64
 724
 725	// Sourcegraph specific stats below. These are not as efficient to calculate
 726	// as the above statistics. We experimentally measured about a 10% slower
 727	// shard load time. However, we find these values very useful to track and
 728	// computing them outside of load time introduces a lot of complexity.
 729
 730	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 731	// indexed documents. This is not exactly the same as line count, since it
 732	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 733	// a final line without "\n"). Note: Zoekt deduplicates documents across
 734	// branches, so if a path has the same contents on multiple branches, there
 735	// is only one document for it. As such that document's newlines is only
 736	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 737	// for counts which do not deduplicate.
 738	NewLinesCount uint64
 739
 740	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 741	// branch.
 742	DefaultBranchNewLinesCount uint64
 743
 744	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 745	// except the default branch.
 746	OtherBranchesNewLinesCount uint64
 747}
 748
 749func (s *RepoStats) Add(o *RepoStats) {
 750	// can't update Repos, since one repo may have multiple
 751	// shards.
 752	s.Shards += o.Shards
 753	s.IndexBytes += o.IndexBytes
 754	s.Documents += o.Documents
 755	s.ContentBytes += o.ContentBytes
 756
 757	// Sourcegraph specific
 758	s.NewLinesCount += o.NewLinesCount
 759	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 760	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 761}
 762
// RepoListEntry bundles a Repository with its index metadata and statistics.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	// Stats holds the per-repository statistics; RepoStats.Repos is not
	// populated here.
	Stats RepoStats
}
 768
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 798
 799type ReposMap map[uint32]MinimalRepoListEntry
 800
 801// MarshalBinary implements a specialized encoder for ReposMap.
 802func (q *ReposMap) MarshalBinary() ([]byte, error) {
 803	return reposMapEncode(*q)
 804}
 805
 806// UnmarshalBinary implements a specialized decoder for ReposMap.
 807func (q *ReposMap) UnmarshalBinary(b []byte) error {
 808	var err error
 809	(*q), err = reposMapDecode(b)
 810	return err
 811}
 812
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash counter for the List request.
	// NOTE(review): presumably the number of shards that crashed, as for
	// Stats.Crashes — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 827
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the resources it holds — confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 839
 840type RepoListField int
 841
 842const (
 843	RepoListFieldRepos    RepoListField = 0
 844	RepoListFieldReposMap               = 2
 845)
 846
// ListOptions holds the options of a List request.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
 851
 852func (o *ListOptions) GetField() (RepoListField, error) {
 853	if o == nil {
 854		return RepoListFieldRepos, nil
 855	}
 856	switch o.Field {
 857	case RepoListFieldRepos, RepoListFieldReposMap:
 858		return o.Field, nil
 859	case 1:
 860		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 861	default:
 862		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 863	}
 864}
 865
 866func (o *ListOptions) String() string {
 867	return fmt.Sprintf("%#v", o)
 868}
 869
 870type SearchOptions struct {
 871	// Return an upper-bound estimate of eligible documents in
 872	// stats.ShardFilesConsidered.
 873	EstimateDocCount bool
 874
 875	// Return the whole file.
 876	Whole bool
 877
 878	// Maximum number of matches: skip all processing an index
 879	// shard after we found this many non-overlapping matches.
 880	ShardMaxMatchCount int
 881
 882	// Maximum number of matches: stop looking for more matches
 883	// once we have this many matches across shards.
 884	TotalMaxMatchCount int
 885
 886	// Maximum number of matches: skip processing documents for a repository in
 887	// a shard once we have found ShardRepoMaxMatchCount.
 888	//
 889	// A compound shard may contain multiple repositories. This will most often
 890	// be set to 1 to find all repositories containing a result.
 891	ShardRepoMaxMatchCount int
 892
 893	// Deprecated: this field is not read anymore.
 894	ShardMaxImportantMatch int
 895
 896	// Deprecated: this field is not read anymore.
 897	TotalMaxImportantMatch int
 898
 899	// Abort the search after this much time has passed.
 900	MaxWallTime time.Duration
 901
 902	// FlushWallTime if non-zero will stop streaming behaviour at first and
 903	// instead will collate and sort results. At FlushWallTime the results will
 904	// be sent and then the behaviour will revert to the normal streaming.
 905	FlushWallTime time.Duration
 906
 907	// Truncates the number of documents (i.e. files) after collating and
 908	// sorting the results.
 909	MaxDocDisplayCount int
 910
 911	// Truncates the number of matchs after collating and sorting the results.
 912	MaxMatchDisplayCount int
 913
 914	// If set to a number greater than zero then up to this many number
 915	// of context lines will be added before and after each matched line.
 916	// Note that the included context lines might contain matches and
 917	// it's up to the consumer of the result to remove those lines.
 918	NumContextLines int
 919
 920	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 921	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 922	ChunkMatches bool
 923
 924	// EXPERIMENTAL. If true, document ranks are used as additional input for
 925	// sorting matches.
 926	UseDocumentRanks bool
 927
 928	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
 929	// their weight in the file match score. If the value is <= 0.0, the default weight value
 930	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 931	DocumentRanksWeight float64
 932
 933	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
 934	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
 935	// When enabled, all other scoring signals are ignored, including document ranks.
 936	UseKeywordScoring bool
 937
 938	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 939	// a command-line flag
 940	Trace bool
 941
 942	// If set, the search results will contain debug information for scoring.
 943	DebugScore bool
 944
 945	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 946	SpanContext map[string]string
 947}
 948
 949// String returns a succinct representation of the options. This is meant for
 950// human consumption in logs and traces.
 951//
 952// Note: some tracing systems have limits on length of values, so we take care
 953// to try and make this small, and include the important information near the
 954// front incase of truncation.
 955func (s *SearchOptions) String() string {
 956	var b strings.Builder
 957
 958	add := func(name, value string) {
 959		b.WriteString(name)
 960		b.WriteByte('=')
 961		b.WriteString(value)
 962		b.WriteByte(' ')
 963	}
 964	addInt := func(name string, value int) {
 965		if value != 0 {
 966			add(name, strconv.Itoa(value))
 967		}
 968	}
 969	addDuration := func(name string, value time.Duration) {
 970		if value != 0 {
 971			add(name, value.String())
 972		}
 973	}
 974	addBool := func(name string, value bool) {
 975		if !value {
 976			return
 977		}
 978		b.WriteString(name)
 979		b.WriteByte(' ')
 980	}
 981
 982	b.WriteString("zoekt.SearchOptions{ ")
 983
 984	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
 985	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
 986	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
 987	addInt("ShardMaxImportantMatch", s.ShardMaxImportantMatch)
 988	addInt("TotalMaxImportantMatch", s.TotalMaxImportantMatch)
 989	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
 990	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
 991	addInt("NumContextLines", s.NumContextLines)
 992
 993	addDuration("MaxWallTime", s.MaxWallTime)
 994	addDuration("FlushWallTime", s.FlushWallTime)
 995
 996	if s.DocumentRanksWeight > 0 {
 997		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
 998	}
 999
1000	addBool("EstimateDocCount", s.EstimateDocCount)
1001	addBool("Whole", s.Whole)
1002	addBool("ChunkMatches", s.ChunkMatches)
1003	addBool("UseDocumentRanks", s.UseDocumentRanks)
1004	addBool("UseKeywordScoring", s.UseKeywordScoring)
1005	addBool("Trace", s.Trace)
1006	addBool("DebugScore", s.DebugScore)
1007
1008	for k, v := range s.SpanContext {
1009		add("SpanContext."+k, strconv.Quote(v))
1010	}
1011
1012	b.WriteByte('}')
1013	return b.String()
1014}
1015
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send delivers a SearchResult to the receiver.
	// NOTE(review): whether Send may be called concurrently, or retain the
	// result after returning, is not visible here — confirm with implementations.
	Send(*SearchResult)
}
1020
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch runs the query q with the given options and passes results
	// to sender instead of returning them.
	// NOTE(review): presumably sender may be called several times per search — confirm.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed