api.go at acce0526a2e90e4c45c8bdf7caa02cfb4237f19f · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at acce0526a2e90e4c45c8bdf7caa02cfb4237f19f 28 kB View raw
Keegan Carruthers-Smith remove deprecated ShardMaxImportantMatch TotalMaxImportantMatch (#744) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
  30const (
  31	mapHeaderBytes    uint64 = 48
  32	sliceHeaderBytes  uint64 = 24
  33	stringHeaderBytes uint64 = 16
  34	pointerSize       uint64 = 8
  35	interfaceBytes    uint64 = 16
  36)
  37
  38// FileMatch contains all the matches within a file.
  39type FileMatch struct {
  40	FileName string
  41
  42	// Repository is the globally unique name of the repo of the
  43	// match
  44	Repository string
  45
  46	// SubRepositoryName is the globally unique name of the repo,
  47	// if it came from a subrepository
  48	SubRepositoryName string `json:",omitempty"`
  49
  50	// SubRepositoryPath holds the prefix where the subrepository
  51	// was mounted.
  52	SubRepositoryPath string `json:",omitempty"`
  53
  54	// Commit SHA1 (hex) of the (sub)repo holding the file.
  55	Version string `json:",omitempty"`
  56
  57	// Detected language of the result.
  58	Language string
  59
  60	// For debugging. Needs DebugScore set, but public so tests in
  61	// other packages can print some diagnostics.
  62	Debug string `json:",omitempty"`
  63
  64	Branches []string `json:",omitempty"`
  65
  66	// One of LineMatches or ChunkMatches will be returned depending on whether
  67	// the SearchOptions.ChunkMatches is set.
  68	LineMatches  []LineMatch  `json:",omitempty"`
  69	ChunkMatches []ChunkMatch `json:",omitempty"`
  70
  71	// Only set if requested
  72	Content []byte `json:",omitempty"`
  73
  74	// Checksum of the content.
  75	Checksum []byte
  76
  77	// Ranking; the higher, the better.
  78	Score float64 `json:",omitempty"`
  79
  80	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
  81	// order results from different repositories relative to each other.
  82	RepositoryPriority float64 `json:",omitempty"`
  83
  84	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
  85	// Sourcegraph.
  86	RepositoryID uint32 `json:",omitempty"`
  87}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
 139// lines in the file.
 140type ChunkMatch struct {
 141	DebugScore string
 142
 143	// Content is a contiguous range of complete lines that fully contains Ranges.
 144	Content []byte
 145
 146	// Ranges is a set of matching ranges within this chunk. Each range is relative
 147	// to the beginning of the file (not the beginning of Content).
 148	Ranges []Range
 149
 150	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
 151	// its length will equal that of Ranges. Any of its elements may be nil.
 152	SymbolInfo []*Symbol
 153
 154	// FileName indicates whether this match is a match on the file name, in
 155	// which case Content will contain the file name.
 156	FileName bool
 157
 158	// ContentStart is the location (inclusive) of the beginning of content
 159	// relative to the beginning of the file. It will always be at the
 160	// beginning of a line (Column will always be 1).
 161	ContentStart Location
 162
 163	Score float64
 164}
 165
 166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 167	// Content
 168	sz += sliceHeaderBytes + uint64(len(cm.Content))
 169
 170	// ContentStart
 171	sz += cm.ContentStart.sizeBytes()
 172
 173	// FileName
 174	sz += 1
 175
 176	// Ranges
 177	sz += sliceHeaderBytes
 178	if len(cm.Ranges) > 0 {
 179		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 180	}
 181
 182	// SymbolInfo
 183	sz += sliceHeaderBytes
 184	for _, si := range cm.SymbolInfo {
 185		sz += pointerSize
 186		if si != nil {
 187			sz += si.sizeBytes()
 188		}
 189	}
 190
 191	// Score
 192	sz += 8
 193
 194	// DebugScore
 195	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 196
 197	return
 198}
 199
 200type Range struct {
 201	// The inclusive beginning of the range.
 202	Start Location
 203	// The exclusive end of the range.
 204	End Location
 205}
 206
 207func (r *Range) sizeBytes() uint64 {
 208	return r.Start.sizeBytes() + r.End.sizeBytes()
 209}
 210
 211type Location struct {
 212	// 0-based byte offset from the beginning of the file
 213	ByteOffset uint32
 214	// 1-based line number from the beginning of the file
 215	LineNumber uint32
 216	// 1-based column number (in runes) from the beginning of line
 217	Column uint32
 218}
 219
 220func (l *Location) sizeBytes() uint64 {
 221	return 3 * 4
 222}
 223
 224// LineMatch holds the matches within a single line in a file.
 225type LineMatch struct {
 226	// The line in which a match was found.
 227	Line       []byte
 228	LineStart  int
 229	LineEnd    int
 230	LineNumber int
 231
 232	// Before and After are only set when SearchOptions.NumContextLines is > 0
 233	Before []byte
 234	After  []byte
 235
 236	// If set, this was a match on the filename.
 237	FileName bool
 238
 239	// The higher the better. Only ranks the quality of the match
 240	// within the file, does not take rank of file into account
 241	Score      float64
 242	DebugScore string
 243
 244	LineFragments []LineFragmentMatch
 245}
 246
 247func (lm *LineMatch) sizeBytes() (sz uint64) {
 248	// Line
 249	sz += sliceHeaderBytes + uint64(len(lm.Line))
 250
 251	// LineStart, LineEnd, LineNumber
 252	sz += 3 * 8
 253
 254	// Before
 255	sz += sliceHeaderBytes + uint64(len(lm.Before))
 256
 257	// After
 258	sz += sliceHeaderBytes + uint64(len(lm.After))
 259
 260	// FileName
 261	sz += 1
 262
 263	// Score
 264	sz += 8
 265
 266	// DebugScore
 267	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 268
 269	// LineFragments
 270	sz += sliceHeaderBytes
 271	for _, lf := range lm.LineFragments {
 272		sz += lf.sizeBytes()
 273	}
 274
 275	return
 276}
 277
 278type Symbol struct {
 279	Sym        string
 280	Kind       string
 281	Parent     string
 282	ParentKind string
 283}
 284
 285func (s *Symbol) sizeBytes() uint64 {
 286	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 287}
 288
 289// LineFragmentMatch a segment of matching text within a line.
 290type LineFragmentMatch struct {
 291	// Offset within the line, in bytes.
 292	LineOffset int
 293
 294	// Offset from file start, in bytes.
 295	Offset uint32
 296
 297	// Number bytes that match.
 298	MatchLength int
 299
 300	SymbolInfo *Symbol
 301}
 302
 303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 304	// LineOffset
 305	sz += 8
 306
 307	// Offset
 308	sz += 4
 309
 310	// MatchLength
 311	sz += 8
 312
 313	// SymbolInfo
 314	sz += pointerSize
 315	if lfm.SymbolInfo != nil {
 316		sz += lfm.SymbolInfo.sizeBytes()
 317	}
 318
 319	return
 320}
 321
 322type FlushReason uint8
 323
 324const (
 325	FlushReasonTimerExpired FlushReason = 1 << iota
 326	FlushReasonFinalFlush
 327	FlushReasonMaxSize
 328)
 329
 330var FlushReasonStrings = map[FlushReason]string{
 331	FlushReasonTimerExpired: "timer_expired",
 332	FlushReasonFinalFlush:   "final_flush",
 333	FlushReasonMaxSize:      "max_size_reached",
 334}
 335
 336func (fr FlushReason) String() string {
 337	if v, ok := FlushReasonStrings[fr]; ok {
 338		return v
 339	}
 340
 341	return "none"
 342}
 343
 344// Stats contains interesting numbers on the search
 345type Stats struct {
 346	// Amount of I/O for reading contents.
 347	ContentBytesLoaded int64
 348
 349	// Amount of I/O for reading from index.
 350	IndexBytesLoaded int64
 351
 352	// Number of search shards that had a crash.
 353	Crashes int
 354
 355	// Wall clock time for this search
 356	Duration time.Duration
 357
 358	// Number of files containing a match.
 359	FileCount int
 360
 361	// Number of files in shards that we considered.
 362	ShardFilesConsidered int
 363
 364	// Files that we evaluated. Equivalent to files for which all
 365	// atom matches (including negations) evaluated to true.
 366	FilesConsidered int
 367
 368	// Files for which we loaded file content to verify substring matches
 369	FilesLoaded int
 370
 371	// Candidate files whose contents weren't examined because we
 372	// gathered enough matches.
 373	FilesSkipped int
 374
 375	// Shards that we scanned to find matches.
 376	ShardsScanned int
 377
 378	// Shards that we did not process because a query was canceled.
 379	ShardsSkipped int
 380
 381	// Shards that we did not process because the query was rejected by the
 382	// ngram filter indicating it had no matches.
 383	ShardsSkippedFilter int
 384
 385	// Number of non-overlapping matches
 386	MatchCount int
 387
 388	// Number of candidate matches as a result of searching ngrams.
 389	NgramMatches int
 390
 391	// NgramLookups is the number of times we accessed an ngram in the index.
 392	NgramLookups int
 393
 394	// Wall clock time for queued search.
 395	Wait time.Duration
 396
 397	// Aggregate wall clock time spent constructing and pruning the match tree.
 398	// This accounts for time such as lookups in the trigram index.
 399	MatchTreeConstruction time.Duration
 400
 401	// Aggregate wall clock time spent searching the match tree. This accounts
 402	// for the bulk of search work done looking for matches.
 403	MatchTreeSearch time.Duration
 404
 405	// Number of times regexp was called on files that we evaluated.
 406	RegexpsConsidered int
 407
 408	// FlushReason explains why results were flushed.
 409	FlushReason FlushReason
 410}
 411
 412func (s *Stats) sizeBytes() (sz uint64) {
 413	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 414	sz += 1     // FlushReason
 415
 416	return
 417}
 418
 419func (s *Stats) Add(o Stats) {
 420	s.ContentBytesLoaded += o.ContentBytesLoaded
 421	s.IndexBytesLoaded += o.IndexBytesLoaded
 422	s.Crashes += o.Crashes
 423	s.FileCount += o.FileCount
 424	s.FilesConsidered += o.FilesConsidered
 425	s.FilesLoaded += o.FilesLoaded
 426	s.FilesSkipped += o.FilesSkipped
 427	s.MatchCount += o.MatchCount
 428	s.NgramMatches += o.NgramMatches
 429	s.NgramLookups += o.NgramLookups
 430	s.ShardFilesConsidered += o.ShardFilesConsidered
 431	s.ShardsScanned += o.ShardsScanned
 432	s.ShardsSkipped += o.ShardsSkipped
 433	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 434	s.Wait += o.Wait
 435	s.MatchTreeConstruction += o.MatchTreeConstruction
 436	s.MatchTreeSearch += o.MatchTreeSearch
 437	s.RegexpsConsidered += o.RegexpsConsidered
 438
 439	// We want the first non-zero FlushReason to be sticky. This is a useful
 440	// property when aggregating stats from several Zoekts.
 441	if s.FlushReason == 0 {
 442		s.FlushReason = o.FlushReason
 443	}
 444}
 445
 446// Zero returns true if stats is empty.
 447func (s *Stats) Zero() bool {
 448	if s == nil {
 449		return true
 450	}
 451
 452	return !(s.ContentBytesLoaded > 0 ||
 453		s.IndexBytesLoaded > 0 ||
 454		s.Crashes > 0 ||
 455		s.FileCount > 0 ||
 456		s.FilesConsidered > 0 ||
 457		s.FilesLoaded > 0 ||
 458		s.FilesSkipped > 0 ||
 459		s.MatchCount > 0 ||
 460		s.NgramMatches > 0 ||
 461		s.NgramLookups > 0 ||
 462		s.ShardFilesConsidered > 0 ||
 463		s.ShardsScanned > 0 ||
 464		s.ShardsSkipped > 0 ||
 465		s.ShardsSkippedFilter > 0 ||
 466		s.Wait > 0 ||
 467		s.MatchTreeConstruction > 0 ||
 468		s.MatchTreeSearch > 0 ||
 469		s.RegexpsConsidered > 0)
 470}
 471
 472// Progress contains information about the global progress of the running search query.
 473// This is used by the frontend to reorder results and emit them when stable.
 474// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 475type Progress struct {
 476	// Priority of the shard that was searched.
 477	Priority float64
 478
 479	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 480	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 481	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 482	//
 483	// MaxPendingPriority decreases monotonically in each SearchResult.
 484	MaxPendingPriority float64
 485}
 486
 487func (p *Progress) sizeBytes() uint64 {
 488	return 2 * 8
 489}
 490
 491// SearchResult contains search matches and extra data
 492type SearchResult struct {
 493	Stats
 494
 495	// Do not encode this as we cannot encode -Inf in JSON
 496	Progress `json:"-"`
 497
 498	Files []FileMatch
 499
 500	// RepoURLs holds a repo => template string map.
 501	RepoURLs map[string]string
 502
 503	// FragmentNames holds a repo => template string map, for
 504	// the line number fragment.
 505	LineFragments map[string]string
 506}
 507
 508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 509// The estimate does not take alignment into account. The result is a lower
 510// bound on the actual size in memory.
 511func (sr *SearchResult) SizeBytes() (sz uint64) {
 512	sz += sr.Stats.sizeBytes()
 513	sz += sr.Progress.sizeBytes()
 514
 515	// Files
 516	sz += sliceHeaderBytes
 517	for _, f := range sr.Files {
 518		sz += f.sizeBytes()
 519	}
 520
 521	// RepoURLs
 522	sz += mapHeaderBytes
 523	for k, v := range sr.RepoURLs {
 524		sz += stringHeaderBytes + uint64(len(k))
 525		sz += stringHeaderBytes + uint64(len(v))
 526	}
 527
 528	// LineFragments
 529	sz += mapHeaderBytes
 530	for k, v := range sr.LineFragments {
 531		sz += stringHeaderBytes + uint64(len(k))
 532		sz += stringHeaderBytes + uint64(len(v))
 533	}
 534
 535	return
 536}
 537
 538// RepositoryBranch describes an indexed branch, which is a name
 539// combined with a version.
 540type RepositoryBranch struct {
 541	Name    string
 542	Version string
 543}
 544
 545func (r RepositoryBranch) String() string {
 546	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 547}
 548
 549// Repository holds repository metadata.
 550type Repository struct {
 551	// Sourcegraph's repository ID
 552	ID uint32
 553
 554	// The repository name
 555	Name string
 556
 557	// The repository URL.
 558	URL string
 559
 560	// The physical source where this repo came from, eg. full
 561	// path to the zip filename or git repository directory. This
 562	// will not be exposed in the UI, but can be used to detect
 563	// orphaned index shards.
 564	Source string
 565
 566	// The branches indexed in this repo.
 567	Branches []RepositoryBranch
 568
 569	// Nil if this is not the super project.
 570	SubRepoMap map[string]*Repository
 571
 572	// URL template to link to the commit of a branch
 573	CommitURLTemplate string
 574
 575	// The repository URL for getting to a file.  Has access to
 576	// {{.Version}}, {{.Path}}
 577	FileURLTemplate string
 578
 579	// The URL fragment to add to a file URL for line numbers. has
 580	// access to {{.LineNumber}}. The fragment should include the
 581	// separator, generally '#' or ';'.
 582	LineFragmentTemplate string
 583
 584	// Perf optimization: priority is set when we load the shard. It corresponds to
 585	// the value of "priority" stored in RawConfig.
 586	priority float64
 587
 588	// All zoekt.* configuration settings.
 589	RawConfig map[string]string
 590
 591	// Importance of the repository, bigger is more important
 592	Rank uint16
 593
 594	// IndexOptions is a hash of the options used to create the index for the
 595	// repo.
 596	IndexOptions string
 597
 598	// HasSymbols is true if this repository has indexed ctags
 599	// output. Sourcegraph specific: This field is more appropriate for
 600	// IndexMetadata. However, we store it here since the Sourcegraph frontend
 601	// can read this structure but not IndexMetadata.
 602	HasSymbols bool
 603
 604	// Tombstone is true if we are not allowed to search this repo.
 605	Tombstone bool
 606
 607	// LatestCommitDate is the date of the latest commit among all indexed Branches.
 608	// The date might be time.Time's 0-value if the repository was last indexed
 609	// before this field was added.
 610	LatestCommitDate time.Time
 611
 612	// FileTombstones is a set of file paths that should be ignored across all branches
 613	// in this shard.
 614	FileTombstones map[string]struct{} `json:",omitempty"`
 615}
 616
 617func (r *Repository) UnmarshalJSON(data []byte) error {
 618	// We define a new type so that we can use json.Unmarshal
 619	// without recursing into this same method.
 620	type repository *Repository
 621	repo := repository(r)
 622
 623	err := json.Unmarshal(data, repo)
 624	if err != nil {
 625		return err
 626	}
 627
 628	if v, ok := repo.RawConfig["repoid"]; ok {
 629		id, _ := strconv.ParseUint(v, 10, 32)
 630		r.ID = uint32(id)
 631	}
 632
 633	if v, ok := repo.RawConfig["priority"]; ok {
 634		r.priority, err = strconv.ParseFloat(v, 64)
 635		if err != nil {
 636			r.priority = 0
 637		}
 638
 639		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 640		// based on priority. Setting it on read instead of during indexing
 641		// allows us to avoid a complete reindex.
 642		if r.Rank == 0 && r.priority > 0 {
 643			// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
 644			// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
 645			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 646		}
 647	}
 648	return nil
 649}
 650
 651// MergeMutable will merge x into r. mutated will be true if it made any
 652// changes. err is non-nil if we needed to mutate an immutable field.
 653//
 654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 655// computed while indexing so can't be synthesized from x.
 656//
 657// Note: We ignore RawConfig fields which are duplicated into Repository:
 658// name and id.
 659//
 660// Note: URL, *Template fields are ignored. They are not used by Sourcegraph.
 661func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 662	if r.ID != x.ID {
 663		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 664		return mutated, errors.New("ID is immutable")
 665	}
 666	if r.Name != x.Name {
 667		// Name is encoded into the shard name on disk. We need to re-index if it
 668		// changes.
 669		return mutated, errors.New("Name is immutable")
 670	}
 671	if !reflect.DeepEqual(r.Branches, x.Branches) {
 672		// Need a reindex if content changing.
 673		return mutated, errors.New("Branches is immutable")
 674	}
 675
 676	for k, v := range x.RawConfig {
 677		// We ignore name and id since they are encoded into the repository.
 678		if k == "name" || k == "id" {
 679			continue
 680		}
 681		if r.RawConfig == nil {
 682			mutated = true
 683			r.RawConfig = make(map[string]string)
 684		}
 685		if r.RawConfig[k] != v {
 686			mutated = true
 687			r.RawConfig[k] = v
 688		}
 689	}
 690
 691	return mutated, nil
 692}
 693
 694// IndexMetadata holds metadata stored in the index file. It contains
 695// data generated by the core indexing library.
 696type IndexMetadata struct {
 697	IndexFormatVersion    int
 698	IndexFeatureVersion   int
 699	IndexMinReaderVersion int
 700	IndexTime             time.Time
 701	PlainASCII            bool
 702	LanguageMap           map[string]uint16
 703	ZoektVersion          string
 704	ID                    string
 705}
 706
 707// Statistics of a (collection of) repositories.
 708type RepoStats struct {
 709	// Repos is used for aggregrating the number of repositories.
 710	//
 711	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 712	// only for RepoList.Stats (aggregate).
 713	Repos int
 714
 715	// Shards is the total number of search shards.
 716	Shards int
 717
 718	// Documents holds the number of documents or files.
 719	Documents int
 720
 721	// IndexBytes is the amount of RAM used for index overhead.
 722	IndexBytes int64
 723
 724	// ContentBytes is the amount of RAM used for raw content.
 725	ContentBytes int64
 726
 727	// Sourcegraph specific stats below. These are not as efficient to calculate
 728	// as the above statistics. We experimentally measured about a 10% slower
 729	// shard load time. However, we find these values very useful to track and
 730	// computing them outside of load time introduces a lot of complexity.
 731
 732	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 733	// indexed documents. This is not exactly the same as line count, since it
 734	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 735	// a final line without "\n"). Note: Zoekt deduplicates documents across
 736	// branches, so if a path has the same contents on multiple branches, there
 737	// is only one document for it. As such that document's newlines is only
 738	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 739	// for counts which do not deduplicate.
 740	NewLinesCount uint64
 741
 742	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 743	// branch.
 744	DefaultBranchNewLinesCount uint64
 745
 746	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 747	// except the default branch.
 748	OtherBranchesNewLinesCount uint64
 749}
 750
 751func (s *RepoStats) Add(o *RepoStats) {
 752	// can't update Repos, since one repo may have multiple
 753	// shards.
 754	s.Shards += o.Shards
 755	s.IndexBytes += o.IndexBytes
 756	s.Documents += o.Documents
 757	s.ContentBytes += o.ContentBytes
 758
 759	// Sourcegraph specific
 760	s.NewLinesCount += o.NewLinesCount
 761	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 762	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 763}
 764
 765type RepoListEntry struct {
 766	Repository    Repository
 767	IndexMetadata IndexMetadata
 768	Stats         RepoStats
 769}
 770
 771// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
 772// performance profiling of sourcegraph.com revealed that querying this
 773// information from Zoekt was causing lots of CPU and memory usage. Note: we
 774// can revisit this, how we store and query this information has changed a lot
 775// since this was introduced.
 776type MinimalRepoListEntry struct {
 777	// HasSymbols is exported since Sourcegraph uses this information at search
 778	// planning time to decide between Zoekt and an unindexed symbol search.
 779	//
 780	// Note: it pretty much is always true in practice.
 781	HasSymbols bool
 782
 783	// Branches is used by Sourcegraphs query planner to decided if it can use
 784	// zoekt or go via an unindexed code path.
 785	Branches []RepositoryBranch
 786
 787	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
 788	// since the epoch). This is to make it clear we are not transporting the
 789	// full fidelty timestamp (ie with milliseconds and location). Additionally
 790	// it saves 16 bytes in this struct.
 791	//
 792	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
 793	// how many repositories need updating after a ranking change/etc.
 794	//
 795	// TODO(keegancsmith) audit updates to IndexTime and document how and when
 796	// it changes. Concerned about things like metadata updates or compound
 797	// shards leading to untrustworthy data here.
 798	IndexTimeUnix int64
 799}
 800
 801type ReposMap map[uint32]MinimalRepoListEntry
 802
 803// MarshalBinary implements a specialized encoder for ReposMap.
 804func (q *ReposMap) MarshalBinary() ([]byte, error) {
 805	return reposMapEncode(*q)
 806}
 807
 808// UnmarshalBinary implements a specialized decoder for ReposMap.
 809func (q *ReposMap) UnmarshalBinary(b []byte) error {
 810	var err error
 811	(*q), err = reposMapDecode(b)
 812	return err
 813}
 814
 815// RepoList holds a set of Repository metadata.
 816type RepoList struct {
 817	// Returned when ListOptions.Field is RepoListFieldRepos.
 818	Repos []*RepoListEntry
 819
 820	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
 821	ReposMap ReposMap
 822
 823	Crashes int
 824
 825	// Stats response to a List request.
 826	// This is the aggregate RepoStats of all repos matching the input query.
 827	Stats RepoStats
 828}
 829
 830type Searcher interface {
 831	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
 832
 833	// List lists repositories. The query `q` can only contain
 834	// query.Repo atoms.
 835	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
 836	Close()
 837
 838	// Describe the searcher for debug messages.
 839	String() string
 840}
 841
 842type RepoListField int
 843
 844const (
 845	RepoListFieldRepos    RepoListField = 0
 846	RepoListFieldReposMap               = 2
 847)
 848
 849type ListOptions struct {
 850	// Field decides which field to populate in RepoList response.
 851	Field RepoListField
 852}
 853
 854func (o *ListOptions) GetField() (RepoListField, error) {
 855	if o == nil {
 856		return RepoListFieldRepos, nil
 857	}
 858	switch o.Field {
 859	case RepoListFieldRepos, RepoListFieldReposMap:
 860		return o.Field, nil
 861	case 1:
 862		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 863	default:
 864		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 865	}
 866}
 867
 868func (o *ListOptions) String() string {
 869	return fmt.Sprintf("%#v", o)
 870}
 871
 872type SearchOptions struct {
 873	// Return an upper-bound estimate of eligible documents in
 874	// stats.ShardFilesConsidered.
 875	EstimateDocCount bool
 876
 877	// Return the whole file.
 878	Whole bool
 879
 880	// Maximum number of matches: skip all processing an index
 881	// shard after we found this many non-overlapping matches.
 882	ShardMaxMatchCount int
 883
 884	// Maximum number of matches: stop looking for more matches
 885	// once we have this many matches across shards.
 886	TotalMaxMatchCount int
 887
 888	// Maximum number of matches: skip processing documents for a repository in
 889	// a shard once we have found ShardRepoMaxMatchCount.
 890	//
 891	// A compound shard may contain multiple repositories. This will most often
 892	// be set to 1 to find all repositories containing a result.
 893	ShardRepoMaxMatchCount int
 894
 895	// Abort the search after this much time has passed.
 896	MaxWallTime time.Duration
 897
 898	// FlushWallTime if non-zero will stop streaming behaviour at first and
 899	// instead will collate and sort results. At FlushWallTime the results will
 900	// be sent and then the behaviour will revert to the normal streaming.
 901	FlushWallTime time.Duration
 902
 903	// Truncates the number of documents (i.e. files) after collating and
 904	// sorting the results.
 905	MaxDocDisplayCount int
 906
 907	// Truncates the number of matchs after collating and sorting the results.
 908	MaxMatchDisplayCount int
 909
 910	// If set to a number greater than zero then up to this many number
 911	// of context lines will be added before and after each matched line.
 912	// Note that the included context lines might contain matches and
 913	// it's up to the consumer of the result to remove those lines.
 914	NumContextLines int
 915
 916	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 917	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 918	ChunkMatches bool
 919
 920	// EXPERIMENTAL. If true, document ranks are used as additional input for
 921	// sorting matches.
 922	UseDocumentRanks bool
 923
 924	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
 925	// their weight in the file match score. If the value is <= 0.0, the default weight value
 926	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 927	DocumentRanksWeight float64
 928
 929	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
 930	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
 931	// When enabled, all other scoring signals are ignored, including document ranks.
 932	UseKeywordScoring bool
 933
 934	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 935	// a command-line flag
 936	Trace bool
 937
 938	// If set, the search results will contain debug information for scoring.
 939	DebugScore bool
 940
 941	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 942	SpanContext map[string]string
 943}
 944
 945// String returns a succinct representation of the options. This is meant for
 946// human consumption in logs and traces.
 947//
 948// Note: some tracing systems have limits on length of values, so we take care
 949// to try and make this small, and include the important information near the
 950// front incase of truncation.
 951func (s *SearchOptions) String() string {
 952	var b strings.Builder
 953
 954	add := func(name, value string) {
 955		b.WriteString(name)
 956		b.WriteByte('=')
 957		b.WriteString(value)
 958		b.WriteByte(' ')
 959	}
 960	addInt := func(name string, value int) {
 961		if value != 0 {
 962			add(name, strconv.Itoa(value))
 963		}
 964	}
 965	addDuration := func(name string, value time.Duration) {
 966		if value != 0 {
 967			add(name, value.String())
 968		}
 969	}
 970	addBool := func(name string, value bool) {
 971		if !value {
 972			return
 973		}
 974		b.WriteString(name)
 975		b.WriteByte(' ')
 976	}
 977
 978	b.WriteString("zoekt.SearchOptions{ ")
 979
 980	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
 981	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
 982	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
 983	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
 984	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
 985	addInt("NumContextLines", s.NumContextLines)
 986
 987	addDuration("MaxWallTime", s.MaxWallTime)
 988	addDuration("FlushWallTime", s.FlushWallTime)
 989
 990	if s.DocumentRanksWeight > 0 {
 991		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
 992	}
 993
 994	addBool("EstimateDocCount", s.EstimateDocCount)
 995	addBool("Whole", s.Whole)
 996	addBool("ChunkMatches", s.ChunkMatches)
 997	addBool("UseDocumentRanks", s.UseDocumentRanks)
 998	addBool("UseKeywordScoring", s.UseKeywordScoring)
 999	addBool("Trace", s.Trace)
1000	addBool("DebugScore", s.DebugScore)
1001
1002	for k, v := range s.SpanContext {
1003		add("SpanContext."+k, strconv.Quote(v))
1004	}
1005
1006	b.WriteByte('}')
1007	return b.String()
1008}
1009
1010// Sender is the interface that wraps the basic Send method.
1011type Sender interface {
1012	Send(*SearchResult)
1013}
1014
1015// Streamer adds the method StreamSearch to the Searcher interface.
1016type Streamer interface {
1017	Searcher
1018	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1019}
Configure Feed

Configure Feed