api.go at b227501acf82ca21a07a6cf1d7b36616a2b21327 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at b227501acf82ca21a07a6cf1d7b36616a2b21327 29 kB View raw
Keegan Carruthers-Smith all: gofumpt -l -w . 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
  17import (
  18	"context"
  19	"encoding/json"
  20	"errors"
  21	"fmt"
  22	"reflect"
  23	"strconv"
  24	"strings"
  25	"time"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
// Byte sizes used by the sizeBytes estimators in this file to account for
// headers and pointers. The estimators add these on top of the payload bytes
// of each string, slice or map.
// NOTE(review): the values match a 64-bit platform — confirm if 32-bit
// targets matter.
const (
	// Added once per map field.
	mapHeaderBytes uint64 = 48
	// Added once per slice field.
	sliceHeaderBytes uint64 = 24
	// Added once per string value.
	stringHeaderBytes uint64 = 16
	// Added once per pointer value.
	pointerSize uint64 = 8
	// Not referenced by any estimator in the visible part of this file.
	interfaceBytes uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
//
// Fields tagged `json:",omitempty"` are left out of the JSON encoding when
// they are empty; the untagged fields are always encoded.
type FileMatch struct {
	// FileName names the matched file.
	// NOTE(review): presumably a path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Detected language of the result.
	Language string

	// For debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches lists branch names associated with this match.
	// NOTE(review): presumably the indexed branches containing the file —
	// confirm against the code that fills it in.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Only set if requested
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore is a free-form debugging string.
	// NOTE(review): presumably only filled in when SearchOptions.DebugScore is
	// set — confirm against the scoring code.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score ranks this chunk.
	// NOTE(review): presumably higher is better, as documented for
	// LineMatch.Score — confirm.
	Score float64
}
 165
 166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 167	// Content
 168	sz += sliceHeaderBytes + uint64(len(cm.Content))
 169
 170	// ContentStart
 171	sz += cm.ContentStart.sizeBytes()
 172
 173	// FileName
 174	sz += 1
 175
 176	// Ranges
 177	sz += sliceHeaderBytes
 178	if len(cm.Ranges) > 0 {
 179		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 180	}
 181
 182	// SymbolInfo
 183	sz += sliceHeaderBytes
 184	for _, si := range cm.SymbolInfo {
 185		sz += pointerSize
 186		if si != nil {
 187			sz += si.sizeBytes()
 188		}
 189	}
 190
 191	// Score
 192	sz += 8
 193
 194	// DebugScore
 195	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 196
 197	return
 198}
 199
 200type Range struct {
 201	// The inclusive beginning of the range.
 202	Start Location
 203	// The exclusive end of the range.
 204	End Location
 205}
 206
 207func (r *Range) sizeBytes() uint64 {
 208	return r.Start.sizeBytes() + r.End.sizeBytes()
 209}
 210
 211type Location struct {
 212	// 0-based byte offset from the beginning of the file
 213	ByteOffset uint32
 214	// 1-based line number from the beginning of the file
 215	LineNumber uint32
 216	// 1-based column number (in runes) from the beginning of line
 217	Column uint32
 218}
 219
 220func (l *Location) sizeBytes() uint64 {
 221	return 3 * 4
 222}
 223
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	// NOTE(review): LineStart and LineEnd look like byte offsets of the line
	// within the file and LineNumber like its line number — confirm the exact
	// base and inclusivity against the producer.
	Line       []byte
	LineStart  int
	LineEnd    int
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score      float64
	DebugScore string

	// LineFragments holds the individual matching segments of this line.
	LineFragments []LineFragmentMatch
}
 246
 247func (lm *LineMatch) sizeBytes() (sz uint64) {
 248	// Line
 249	sz += sliceHeaderBytes + uint64(len(lm.Line))
 250
 251	// LineStart, LineEnd, LineNumber
 252	sz += 3 * 8
 253
 254	// Before
 255	sz += sliceHeaderBytes + uint64(len(lm.Before))
 256
 257	// After
 258	sz += sliceHeaderBytes + uint64(len(lm.After))
 259
 260	// FileName
 261	sz += 1
 262
 263	// Score
 264	sz += 8
 265
 266	// DebugScore
 267	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 268
 269	// LineFragments
 270	sz += sliceHeaderBytes
 271	for _, lf := range lm.LineFragments {
 272		sz += lf.sizeBytes()
 273	}
 274
 275	return
 276}
 277
 278type Symbol struct {
 279	Sym        string
 280	Kind       string
 281	Parent     string
 282	ParentKind string
 283}
 284
 285func (s *Symbol) sizeBytes() uint64 {
 286	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 287}
 288
 289// LineFragmentMatch a segment of matching text within a line.
 290type LineFragmentMatch struct {
 291	// Offset within the line, in bytes.
 292	LineOffset int
 293
 294	// Offset from file start, in bytes.
 295	Offset uint32
 296
 297	// Number bytes that match.
 298	MatchLength int
 299
 300	SymbolInfo *Symbol
 301}
 302
 303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 304	// LineOffset
 305	sz += 8
 306
 307	// Offset
 308	sz += 4
 309
 310	// MatchLength
 311	sz += 8
 312
 313	// SymbolInfo
 314	sz += pointerSize
 315	if lfm.SymbolInfo != nil {
 316		sz += lfm.SymbolInfo.sizeBytes()
 317	}
 318
 319	return
 320}
 321
 322type FlushReason uint8
 323
 324const (
 325	FlushReasonTimerExpired FlushReason = 1 << iota
 326	FlushReasonFinalFlush
 327	FlushReasonMaxSize
 328)
 329
 330var FlushReasonStrings = map[FlushReason]string{
 331	FlushReasonTimerExpired: "timer_expired",
 332	FlushReasonFinalFlush:   "final_flush",
 333	FlushReasonMaxSize:      "max_size_reached",
 334}
 335
 336func (fr FlushReason) String() string {
 337	if v, ok := FlushReasonStrings[fr]; ok {
 338		return v
 339	}
 340
 341	return "none"
 342}
 343
// Stats contains interesting numbers on the search
//
// Stats.Add sums every counter and timing below except Duration and
// FlushReason; see the notes on those two fields.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search
	//
	// Not accumulated by Add and not inspected by Zero.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	//
	// Add keeps the first non-zero value it sees; Zero does not inspect it.
	FlushReason FlushReason
}
 411
 412func (s *Stats) sizeBytes() (sz uint64) {
 413	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 414	sz += 1     // FlushReason
 415
 416	return
 417}
 418
 419func (s *Stats) Add(o Stats) {
 420	s.ContentBytesLoaded += o.ContentBytesLoaded
 421	s.IndexBytesLoaded += o.IndexBytesLoaded
 422	s.Crashes += o.Crashes
 423	s.FileCount += o.FileCount
 424	s.FilesConsidered += o.FilesConsidered
 425	s.FilesLoaded += o.FilesLoaded
 426	s.FilesSkipped += o.FilesSkipped
 427	s.MatchCount += o.MatchCount
 428	s.NgramMatches += o.NgramMatches
 429	s.NgramLookups += o.NgramLookups
 430	s.ShardFilesConsidered += o.ShardFilesConsidered
 431	s.ShardsScanned += o.ShardsScanned
 432	s.ShardsSkipped += o.ShardsSkipped
 433	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 434	s.Wait += o.Wait
 435	s.MatchTreeConstruction += o.MatchTreeConstruction
 436	s.MatchTreeSearch += o.MatchTreeSearch
 437	s.RegexpsConsidered += o.RegexpsConsidered
 438
 439	// We want the first non-zero FlushReason to be sticky. This is a useful
 440	// property when aggregating stats from several Zoekts.
 441	if s.FlushReason == 0 {
 442		s.FlushReason = o.FlushReason
 443	}
 444}
 445
 446// Zero returns true if stats is empty.
 447func (s *Stats) Zero() bool {
 448	if s == nil {
 449		return true
 450	}
 451
 452	return !(s.ContentBytesLoaded > 0 ||
 453		s.IndexBytesLoaded > 0 ||
 454		s.Crashes > 0 ||
 455		s.FileCount > 0 ||
 456		s.FilesConsidered > 0 ||
 457		s.FilesLoaded > 0 ||
 458		s.FilesSkipped > 0 ||
 459		s.MatchCount > 0 ||
 460		s.NgramMatches > 0 ||
 461		s.NgramLookups > 0 ||
 462		s.ShardFilesConsidered > 0 ||
 463		s.ShardsScanned > 0 ||
 464		s.ShardsSkipped > 0 ||
 465		s.ShardsSkippedFilter > 0 ||
 466		s.Wait > 0 ||
 467		s.MatchTreeConstruction > 0 ||
 468		s.MatchTreeSearch > 0 ||
 469		s.RegexpsConsidered > 0)
 470}
 471
 472// Progress contains information about the global progress of the running search query.
 473// This is used by the frontend to reorder results and emit them when stable.
 474// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 475type Progress struct {
 476	// Priority of the shard that was searched.
 477	Priority float64
 478
 479	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 480	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 481	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 482	//
 483	// MaxPendingPriority decreases monotonically in each SearchResult.
 484	MaxPendingPriority float64
 485}
 486
 487func (p *Progress) sizeBytes() uint64 {
 488	return 2 * 8
 489}
 490
// SearchResult contains search matches and extra data
//
// Stats and Progress are embedded, so their fields are accessible directly on
// a SearchResult.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds one FileMatch per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 507
 508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 509// The estimate does not take alignment into account. The result is a lower
 510// bound on the actual size in memory.
 511func (sr *SearchResult) SizeBytes() (sz uint64) {
 512	sz += sr.Stats.sizeBytes()
 513	sz += sr.Progress.sizeBytes()
 514
 515	// Files
 516	sz += sliceHeaderBytes
 517	for _, f := range sr.Files {
 518		sz += f.sizeBytes()
 519	}
 520
 521	// RepoURLs
 522	sz += mapHeaderBytes
 523	for k, v := range sr.RepoURLs {
 524		sz += stringHeaderBytes + uint64(len(k))
 525		sz += stringHeaderBytes + uint64(len(v))
 526	}
 527
 528	// LineFragments
 529	sz += mapHeaderBytes
 530	for k, v := range sr.LineFragments {
 531		sz += stringHeaderBytes + uint64(len(k))
 532		sz += stringHeaderBytes + uint64(len(v))
 533	}
 534
 535	return
 536}
 537
 538// RepositoryBranch describes an indexed branch, which is a name
 539// combined with a version.
 540type RepositoryBranch struct {
 541	Name    string
 542	Version string
 543}
 544
 545func (r RepositoryBranch) String() string {
 546	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 547}
 548
// Repository holds repository metadata.
//
// Decoding from JSON goes through Repository.UnmarshalJSON, which also derives
// ID, priority and (when unset) Rank from RawConfig.
type Repository struct {
	// Sourcegraph's repository ID
	//
	// When decoding JSON, a "repoid" entry in RawConfig overrides this value.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	//
	// The field is unexported, so encoding/json neither writes nor reads it.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 616
 617func (r *Repository) UnmarshalJSON(data []byte) error {
 618	// We define a new type so that we can use json.Unmarshal
 619	// without recursing into this same method.
 620	type repository *Repository
 621	repo := repository(r)
 622
 623	err := json.Unmarshal(data, repo)
 624	if err != nil {
 625		return err
 626	}
 627
 628	if v, ok := repo.RawConfig["repoid"]; ok {
 629		id, _ := strconv.ParseUint(v, 10, 32)
 630		r.ID = uint32(id)
 631	}
 632
 633	if v, ok := repo.RawConfig["priority"]; ok {
 634		r.priority, err = strconv.ParseFloat(v, 64)
 635		if err != nil {
 636			r.priority = 0
 637		}
 638
 639		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 640		// based on priority. Setting it on read instead of during indexing
 641		// allows us to avoid a complete reindex.
 642		if r.Rank == 0 && r.priority > 0 {
 643			// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
 644			// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
 645			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 646		}
 647	}
 648	return nil
 649}
 650
 651// MergeMutable will merge x into r. mutated will be true if it made any
 652// changes. err is non-nil if we needed to mutate an immutable field.
 653//
 654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 655// computed while indexing so can't be synthesized from x.
 656//
 657// Note: We ignore RawConfig fields which are duplicated into Repository:
 658// name and id.
 659//
 660// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
 661func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 662	if r.ID != x.ID {
 663		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 664		return mutated, errors.New("ID is immutable")
 665	}
 666	if r.Name != x.Name {
 667		// Name is encoded into the shard name on disk. We need to re-index if it
 668		// changes.
 669		return mutated, errors.New("Name is immutable")
 670	}
 671	if !reflect.DeepEqual(r.Branches, x.Branches) {
 672		// Need a reindex if content changing.
 673		return mutated, errors.New("Branches is immutable")
 674	}
 675
 676	for k, v := range x.RawConfig {
 677		// We ignore name and id since they are encoded into the repository.
 678		if k == "name" || k == "id" {
 679			continue
 680		}
 681		if r.RawConfig == nil {
 682			mutated = true
 683			r.RawConfig = make(map[string]string)
 684		}
 685		if r.RawConfig[k] != v {
 686			mutated = true
 687			r.RawConfig[k] = v
 688		}
 689	}
 690
 691	return mutated, nil
 692}
 693
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are not documented in this file. The
// names suggest format/feature version numbers, the time of indexing, a
// language-name-to-code map and an identifier — confirm against the index
// writer before relying on their exact meaning.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
 706
 707// Statistics of a (collection of) repositories.
 708type RepoStats struct {
 709	// Repos is used for aggregrating the number of repositories.
 710	//
 711	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 712	// only for RepoList.Stats (aggregate).
 713	Repos int
 714
 715	// Shards is the total number of search shards.
 716	Shards int
 717
 718	// Documents holds the number of documents or files.
 719	Documents int
 720
 721	// IndexBytes is the amount of RAM used for index overhead.
 722	IndexBytes int64
 723
 724	// ContentBytes is the amount of RAM used for raw content.
 725	ContentBytes int64
 726
 727	// Sourcegraph specific stats below. These are not as efficient to calculate
 728	// as the above statistics. We experimentally measured about a 10% slower
 729	// shard load time. However, we find these values very useful to track and
 730	// computing them outside of load time introduces a lot of complexity.
 731
 732	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 733	// indexed documents. This is not exactly the same as line count, since it
 734	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 735	// a final line without "\n"). Note: Zoekt deduplicates documents across
 736	// branches, so if a path has the same contents on multiple branches, there
 737	// is only one document for it. As such that document's newlines is only
 738	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 739	// for counts which do not deduplicate.
 740	NewLinesCount uint64
 741
 742	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 743	// branch.
 744	DefaultBranchNewLinesCount uint64
 745
 746	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 747	// except the default branch.
 748	OtherBranchesNewLinesCount uint64
 749}
 750
 751func (s *RepoStats) Add(o *RepoStats) {
 752	// can't update Repos, since one repo may have multiple
 753	// shards.
 754	s.Shards += o.Shards
 755	s.IndexBytes += o.IndexBytes
 756	s.Documents += o.Documents
 757	s.ContentBytes += o.ContentBytes
 758
 759	// Sourcegraph specific
 760	s.NewLinesCount += o.NewLinesCount
 761	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 762	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 763}
 764
// RepoListEntry bundles a Repository with its IndexMetadata and RepoStats.
// RepoList.Repos holds pointers to values of this type.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
 770
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 800
 801type ReposMap map[uint32]MinimalRepoListEntry
 802
 803// MarshalBinary implements a specialized encoder for ReposMap.
 804func (q *ReposMap) MarshalBinary() ([]byte, error) {
 805	return reposMapEncode(*q)
 806}
 807
 808// UnmarshalBinary implements a specialized decoder for ReposMap.
 809func (q *ReposMap) UnmarshalBinary(b []byte) error {
 810	var err error
 811	(*q), err = reposMapDecode(b)
 812	return err
 813}
 814
// RepoList holds a set of Repository metadata.
//
// Which of Repos or ReposMap is filled in depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash count.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 829
// Searcher is the interface for running search queries and listing
// repositories.
type Searcher interface {
	// Search runs the query q under opts and returns a single SearchResult
	// or an error.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close shuts the searcher down.
	// NOTE(review): what is released, and whether the searcher may be used
	// afterwards, depends on the implementation — confirm per implementation.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 841
 842type RepoListField int
 843
 844const (
 845	RepoListFieldRepos    RepoListField = 0
 846	RepoListFieldReposMap               = 2
 847)
 848
 849type ListOptions struct {
 850	// Field decides which field to populate in RepoList response.
 851	Field RepoListField
 852}
 853
 854func (o *ListOptions) GetField() (RepoListField, error) {
 855	if o == nil {
 856		return RepoListFieldRepos, nil
 857	}
 858	switch o.Field {
 859	case RepoListFieldRepos, RepoListFieldReposMap:
 860		return o.Field, nil
 861	case 1:
 862		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 863	default:
 864		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 865	}
 866}
 867
 868func (o *ListOptions) String() string {
 869	return fmt.Sprintf("%#v", o)
 870}
 871
 872type SearchOptions struct {
 873	// Return an upper-bound estimate of eligible documents in
 874	// stats.ShardFilesConsidered.
 875	EstimateDocCount bool
 876
 877	// Return the whole file.
 878	Whole bool
 879
 880	// Maximum number of matches: skip all processing an index
 881	// shard after we found this many non-overlapping matches.
 882	ShardMaxMatchCount int
 883
 884	// Maximum number of matches: stop looking for more matches
 885	// once we have this many matches across shards.
 886	TotalMaxMatchCount int
 887
 888	// Maximum number of matches: skip processing documents for a repository in
 889	// a shard once we have found ShardRepoMaxMatchCount.
 890	//
 891	// A compound shard may contain multiple repositories. This will most often
 892	// be set to 1 to find all repositories containing a result.
 893	ShardRepoMaxMatchCount int
 894
 895	// Deprecated: this field is not read anymore.
 896	ShardMaxImportantMatch int
 897
 898	// Deprecated: this field is not read anymore.
 899	TotalMaxImportantMatch int
 900
 901	// Abort the search after this much time has passed.
 902	MaxWallTime time.Duration
 903
 904	// FlushWallTime if non-zero will stop streaming behaviour at first and
 905	// instead will collate and sort results. At FlushWallTime the results will
 906	// be sent and then the behaviour will revert to the normal streaming.
 907	FlushWallTime time.Duration
 908
 909	// Truncates the number of documents (i.e. files) after collating and
 910	// sorting the results.
 911	MaxDocDisplayCount int
 912
 913	// Truncates the number of matchs after collating and sorting the results.
 914	MaxMatchDisplayCount int
 915
 916	// If set to a number greater than zero then up to this many number
 917	// of context lines will be added before and after each matched line.
 918	// Note that the included context lines might contain matches and
 919	// it's up to the consumer of the result to remove those lines.
 920	NumContextLines int
 921
 922	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 923	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 924	ChunkMatches bool
 925
 926	// EXPERIMENTAL. If true, document ranks are used as additional input for
 927	// sorting matches.
 928	UseDocumentRanks bool
 929
 930	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
 931	// their weight in the file match score. If the value is <= 0.0, the default weight value
 932	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 933	DocumentRanksWeight float64
 934
 935	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
 936	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
 937	// When enabled, all other scoring signals are ignored, including document ranks.
 938	UseKeywordScoring bool
 939
 940	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 941	// a command-line flag
 942	Trace bool
 943
 944	// If set, the search results will contain debug information for scoring.
 945	DebugScore bool
 946
 947	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 948	SpanContext map[string]string
 949}
 950
 951// String returns a succinct representation of the options. This is meant for
 952// human consumption in logs and traces.
 953//
 954// Note: some tracing systems have limits on length of values, so we take care
 955// to try and make this small, and include the important information near the
 956// front incase of truncation.
 957func (s *SearchOptions) String() string {
 958	var b strings.Builder
 959
 960	add := func(name, value string) {
 961		b.WriteString(name)
 962		b.WriteByte('=')
 963		b.WriteString(value)
 964		b.WriteByte(' ')
 965	}
 966	addInt := func(name string, value int) {
 967		if value != 0 {
 968			add(name, strconv.Itoa(value))
 969		}
 970	}
 971	addDuration := func(name string, value time.Duration) {
 972		if value != 0 {
 973			add(name, value.String())
 974		}
 975	}
 976	addBool := func(name string, value bool) {
 977		if !value {
 978			return
 979		}
 980		b.WriteString(name)
 981		b.WriteByte(' ')
 982	}
 983
 984	b.WriteString("zoekt.SearchOptions{ ")
 985
 986	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
 987	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
 988	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
 989	addInt("ShardMaxImportantMatch", s.ShardMaxImportantMatch)
 990	addInt("TotalMaxImportantMatch", s.TotalMaxImportantMatch)
 991	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
 992	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
 993	addInt("NumContextLines", s.NumContextLines)
 994
 995	addDuration("MaxWallTime", s.MaxWallTime)
 996	addDuration("FlushWallTime", s.FlushWallTime)
 997
 998	if s.DocumentRanksWeight > 0 {
 999		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
1000	}
1001
1002	addBool("EstimateDocCount", s.EstimateDocCount)
1003	addBool("Whole", s.Whole)
1004	addBool("ChunkMatches", s.ChunkMatches)
1005	addBool("UseDocumentRanks", s.UseDocumentRanks)
1006	addBool("UseKeywordScoring", s.UseKeywordScoring)
1007	addBool("Trace", s.Trace)
1008	addBool("DebugScore", s.DebugScore)
1009
1010	for k, v := range s.SpanContext {
1011		add("SpanContext."+k, strconv.Quote(v))
1012	}
1013
1014	b.WriteByte('}')
1015	return b.String()
1016}
1017
// Sender is the interface that wraps the basic Send method.
//
// Streamer.StreamSearch takes a Sender as the destination for its results.
type Sender interface {
	// Send delivers one SearchResult. It returns nothing, so it has no way to
	// report a failure to the caller.
	Send(*SearchResult)
}
1022
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch runs the query q under opts and passes results to sender
	// instead of returning them.
	// NOTE(review): presumably sender.Send may be called several times per
	// search — confirm against the implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed