api.go at 687cafc8f702e6c0efa0b562b35c0eee619a88d8 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 687cafc8f702e6c0efa0b562b35c0eee619a88d8 30 kB View raw
Stefan Hengl sourcegraph: multi-tenant Zoekt (#859) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
  17import (
  18	"context"
  19	"encoding/json"
  20	"errors"
  21	"fmt"
  22	"reflect"
  23	"strconv"
  24	"strings"
  25	"time"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
// Approximate sizes, in bytes, of Go's built-in header types. They are used
// by the sizeBytes estimators in this file. The values assume a 64-bit
// architecture (pointerSize is 8).
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches were found in.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches holds branch names for this match.
	// NOTE(review): presumably the branches that contain the file; confirm
	// against the code that populates it.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore carries scoring diagnostics.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set; confirm against the scoring code.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score of this chunk.
	// NOTE(review): presumably higher is better, as for FileMatch.Score; confirm.
	Score float64
}
 166
 167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 168	// Content
 169	sz += sliceHeaderBytes + uint64(len(cm.Content))
 170
 171	// ContentStart
 172	sz += cm.ContentStart.sizeBytes()
 173
 174	// FileName
 175	sz += 1
 176
 177	// Ranges
 178	sz += sliceHeaderBytes
 179	if len(cm.Ranges) > 0 {
 180		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 181	}
 182
 183	// SymbolInfo
 184	sz += sliceHeaderBytes
 185	for _, si := range cm.SymbolInfo {
 186		sz += pointerSize
 187		if si != nil {
 188			sz += si.sizeBytes()
 189		}
 190	}
 191
 192	// Score
 193	sz += 8
 194
 195	// DebugScore
 196	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 197
 198	return
 199}
 200
// Range is a half-open span between two Locations: Start is included, End is
// not.
type Range struct {
	// The inclusive beginning of the range.
	Start Location
	// The exclusive end of the range.
	End Location
}

// sizeBytes returns the estimated in-memory size of r: the sum of the sizes
// of its two Locations.
func (r *Range) sizeBytes() uint64 {
	return r.Start.sizeBytes() + r.End.sizeBytes()
}
 211
// Location identifies a position in a file by byte offset, line number and
// column.
type Location struct {
	// 0-based byte offset from the beginning of the file
	ByteOffset uint32
	// 1-based line number from the beginning of the file
	LineNumber uint32
	// 1-based column number (in runes) from the beginning of line
	Column uint32
}

// sizeBytes returns the estimated in-memory size of l: three uint32 fields of
// four bytes each.
func (l *Location) sizeBytes() uint64 {
	return 3 * 4
}
 224
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd int
	// LineNumber is the number of the matched line.
	// NOTE(review): presumably 1-based like Location.LineNumber; confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score float64
	// DebugScore carries scoring diagnostics.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set; confirm.
	DebugScore string

	// LineFragments are the matching segments within Line.
	LineFragments []LineFragmentMatch
}
 251
 252func (lm *LineMatch) sizeBytes() (sz uint64) {
 253	// Line
 254	sz += sliceHeaderBytes + uint64(len(lm.Line))
 255
 256	// LineStart, LineEnd, LineNumber
 257	sz += 3 * 8
 258
 259	// Before
 260	sz += sliceHeaderBytes + uint64(len(lm.Before))
 261
 262	// After
 263	sz += sliceHeaderBytes + uint64(len(lm.After))
 264
 265	// FileName
 266	sz += 1
 267
 268	// Score
 269	sz += 8
 270
 271	// DebugScore
 272	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 273
 274	// LineFragments
 275	sz += sliceHeaderBytes
 276	for _, lf := range lm.LineFragments {
 277		sz += lf.sizeBytes()
 278	}
 279
 280	return
 281}
 282
 283type Symbol struct {
 284	Sym        string
 285	Kind       string
 286	Parent     string
 287	ParentKind string
 288}
 289
 290func (s *Symbol) sizeBytes() uint64 {
 291	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 292}
 293
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment. It may be nil;
	// sizeBytes checks for nil before using it.
	SymbolInfo *Symbol
}
 307
 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 309	// LineOffset
 310	sz += 8
 311
 312	// Offset
 313	sz += 4
 314
 315	// MatchLength
 316	sz += 8
 317
 318	// SymbolInfo
 319	sz += pointerSize
 320	if lfm.SymbolInfo != nil {
 321		sz += lfm.SymbolInfo.sizeBytes()
 322	}
 323
 324	return
 325}
 326
 327type FlushReason uint8
 328
 329const (
 330	FlushReasonTimerExpired FlushReason = 1 << iota
 331	FlushReasonFinalFlush
 332	FlushReasonMaxSize
 333)
 334
 335var FlushReasonStrings = map[FlushReason]string{
 336	FlushReasonTimerExpired: "timer_expired",
 337	FlushReasonFinalFlush:   "final_flush",
 338	FlushReasonMaxSize:      "max_size_reached",
 339}
 340
 341func (fr FlushReason) String() string {
 342	if v, ok := FlushReasonStrings[fr]; ok {
 343		return v
 344	}
 345
 346	return "none"
 347}
 348
// Stats contains interesting numbers on the search.
//
// Add accumulates every field except Duration, and keeps the first non-zero
// FlushReason. Zero ignores Duration and FlushReason.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Not accumulated by Add.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
 416
 417func (s *Stats) sizeBytes() (sz uint64) {
 418	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 419	sz += 1     // FlushReason
 420
 421	return
 422}
 423
 424func (s *Stats) Add(o Stats) {
 425	s.ContentBytesLoaded += o.ContentBytesLoaded
 426	s.IndexBytesLoaded += o.IndexBytesLoaded
 427	s.Crashes += o.Crashes
 428	s.FileCount += o.FileCount
 429	s.FilesConsidered += o.FilesConsidered
 430	s.FilesLoaded += o.FilesLoaded
 431	s.FilesSkipped += o.FilesSkipped
 432	s.MatchCount += o.MatchCount
 433	s.NgramMatches += o.NgramMatches
 434	s.NgramLookups += o.NgramLookups
 435	s.ShardFilesConsidered += o.ShardFilesConsidered
 436	s.ShardsScanned += o.ShardsScanned
 437	s.ShardsSkipped += o.ShardsSkipped
 438	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 439	s.Wait += o.Wait
 440	s.MatchTreeConstruction += o.MatchTreeConstruction
 441	s.MatchTreeSearch += o.MatchTreeSearch
 442	s.RegexpsConsidered += o.RegexpsConsidered
 443
 444	// We want the first non-zero FlushReason to be sticky. This is a useful
 445	// property when aggregating stats from several Zoekts.
 446	if s.FlushReason == 0 {
 447		s.FlushReason = o.FlushReason
 448	}
 449}
 450
 451// Zero returns true if stats is empty.
 452func (s *Stats) Zero() bool {
 453	if s == nil {
 454		return true
 455	}
 456
 457	return !(s.ContentBytesLoaded > 0 ||
 458		s.IndexBytesLoaded > 0 ||
 459		s.Crashes > 0 ||
 460		s.FileCount > 0 ||
 461		s.FilesConsidered > 0 ||
 462		s.FilesLoaded > 0 ||
 463		s.FilesSkipped > 0 ||
 464		s.MatchCount > 0 ||
 465		s.NgramMatches > 0 ||
 466		s.NgramLookups > 0 ||
 467		s.ShardFilesConsidered > 0 ||
 468		s.ShardsScanned > 0 ||
 469		s.ShardsSkipped > 0 ||
 470		s.ShardsSkippedFilter > 0 ||
 471		s.Wait > 0 ||
 472		s.MatchTreeConstruction > 0 ||
 473		s.MatchTreeSearch > 0 ||
 474		s.RegexpsConsidered > 0)
 475}
 476
// Progress contains information about the global progress of the running search query.
// This is used by the frontend to reorder results and emit them when stable.
// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
type Progress struct {
	// Priority of the shard that was searched.
	Priority float64

	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
	// This is used to reorder results when the result set is known to be stable -- that is, when a result's
	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
	//
	// MaxPendingPriority decreases monotonically in each SearchResult.
	MaxPendingPriority float64
}

// sizeBytes returns the estimated in-memory size of p: two float64 fields of
// eight bytes each.
func (p *Progress) sizeBytes() uint64 {
	return 2 * 8
}
 495
// SearchResult contains search matches and extra data. Stats and Progress
// are embedded, so their fields are promoted onto SearchResult.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON.
	Progress `json:"-"`

	// Files holds the per-file matches.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 512
 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 514// The estimate does not take alignment into account. The result is a lower
 515// bound on the actual size in memory.
 516func (sr *SearchResult) SizeBytes() (sz uint64) {
 517	sz += sr.Stats.sizeBytes()
 518	sz += sr.Progress.sizeBytes()
 519
 520	// Files
 521	sz += sliceHeaderBytes
 522	for _, f := range sr.Files {
 523		sz += f.sizeBytes()
 524	}
 525
 526	// RepoURLs
 527	sz += mapHeaderBytes
 528	for k, v := range sr.RepoURLs {
 529		sz += stringHeaderBytes + uint64(len(k))
 530		sz += stringHeaderBytes + uint64(len(v))
 531	}
 532
 533	// LineFragments
 534	sz += mapHeaderBytes
 535	for k, v := range sr.LineFragments {
 536		sz += stringHeaderBytes + uint64(len(k))
 537		sz += stringHeaderBytes + uint64(len(v))
 538	}
 539
 540	return
 541}
 542
 543// RepositoryBranch describes an indexed branch, which is a name
 544// combined with a version.
 545type RepositoryBranch struct {
 546	Name    string
 547	Version string
 548}
 549
 550func (r RepositoryBranch) String() string {
 551	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 552}
 553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's tenant ID. UnmarshalJSON sets it from RawConfig["tenantID"]
	// when that key is present.
	TenantID int

	// Sourcegraph's repository ID. UnmarshalJSON sets it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. Has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. UnmarshalJSON
	// derives it from LatestCommitDate when RawConfig has "latestCommitDate",
	// otherwise from RawConfig["priority"] if Rank is still zero.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 624
// UnmarshalJSON decodes data into r and then derives ID, TenantID, priority
// and Rank from RawConfig. It returns an error only when the JSON decoding
// itself fails; malformed RawConfig values are tolerated.
func (r *Repository) UnmarshalJSON(data []byte) error {
	// We define a new type so that we can use json.Unmarshal
	// without recursing into this same method.
	type repository *Repository
	repo := repository(r)

	err := json.Unmarshal(data, repo)
	if err != nil {
		return err
	}

	// The parse error is deliberately discarded: ID is set to whatever
	// ParseUint returned.
	if v, ok := repo.RawConfig["repoid"]; ok {
		id, _ := strconv.ParseUint(v, 10, 32)
		r.ID = uint32(id)
	}

	// As above, the parse error is discarded.
	if v, ok := repo.RawConfig["tenantID"]; ok {
		id, _ := strconv.ParseInt(v, 10, 64)
		r.TenantID = int(id)
	}

	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
	// on read instead of during indexing allows us to avoid a complete reindex.
	//
	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
	// backwards compatibility.
	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
		// We use the number of months since 1970 as a simple measure of repo freshness.
		// It is monotonically increasing and stable across re-indexes and restarts.
		// Only the presence of the key is checked; the date itself comes from the
		// decoded LatestCommitDate field.
		r.Rank = monthsSince1970(repo.LatestCommitDate)
	} else if v, ok := repo.RawConfig["priority"]; ok {
		// An unparsable priority is treated as 0 rather than reported.
		r.priority, err = strconv.ParseFloat(v, 64)
		if err != nil {
			r.priority = 0
		}

		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
		// based on priority. Setting it on read instead of during indexing
		// allows us to avoid a complete reindex.
		if r.Rank == 0 && r.priority > 0 {
			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
			// returns from more stars.
			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
		}
	}

	return nil
}
 674
 675// monthsSince1970 returns the number of months since 1970. It returns values in
 676// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 677// lower bound for all dates before 1970.
 678func monthsSince1970(t time.Time) uint16 {
 679	base := time.Unix(0, 0)
 680	if t.Before(base) {
 681		return 0
 682	}
 683	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 684	return uint16(min(months, maxUInt16))
 685}
 686
 687// MergeMutable will merge x into r. mutated will be true if it made any
 688// changes. err is non-nil if we needed to mutate an immutable field.
 689//
 690// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 691// computed while indexing so can't be synthesized from x.
 692//
 693// Note: We ignore RawConfig fields which are duplicated into Repository:
 694// name and id.
 695func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 696	if r.ID != x.ID {
 697		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 698		return mutated, errors.New("ID is immutable")
 699	}
 700	if r.Name != x.Name {
 701		// Name is encoded into the shard name on disk. We need to re-index if it
 702		// changes.
 703		return mutated, errors.New("Name is immutable")
 704	}
 705	if !reflect.DeepEqual(r.Branches, x.Branches) {
 706		// Need a reindex if content changing.
 707		return mutated, errors.New("Branches is immutable")
 708	}
 709
 710	for k, v := range x.RawConfig {
 711		// We ignore name and id since they are encoded into the repository.
 712		if k == "name" || k == "id" {
 713			continue
 714		}
 715		if r.RawConfig == nil {
 716			mutated = true
 717			r.RawConfig = make(map[string]string)
 718		}
 719		if r.RawConfig[k] != v {
 720			mutated = true
 721			r.RawConfig[k] = v
 722		}
 723	}
 724
 725	if r.URL != x.URL {
 726		mutated = true
 727		r.URL = x.URL
 728	}
 729	if r.CommitURLTemplate != x.CommitURLTemplate {
 730		mutated = true
 731		r.CommitURLTemplate = x.CommitURLTemplate
 732	}
 733	if r.FileURLTemplate != x.FileURLTemplate {
 734		mutated = true
 735		r.FileURLTemplate = x.FileURLTemplate
 736	}
 737	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 738		mutated = true
 739		r.LineFragmentTemplate = x.LineFragmentTemplate
 740	}
 741
 742	return mutated, nil
 743}
 744
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are not documented in this file. The
// notes below are inferred from names and types only; confirm against the
// index builder before relying on them.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// Presumably the time at which the index was built.
	IndexTime  time.Time
	PlainASCII bool
	// Presumably maps a language name to a numeric ID used in the index.
	LanguageMap  map[string]uint16
	ZoektVersion string
	ID           string
}
 757
// RepoStats holds statistics of a (collection of) repositories.
type RepoStats struct {
	// Repos is used for aggregating the number of repositories.
	//
	// Note: This field is not populated on RepoListEntry.Stats (individual) but
	// only for RepoList.Stats (aggregate). RepoStats.Add does not update it.
	Repos int

	// Shards is the total number of search shards.
	Shards int

	// Documents holds the number of documents or files.
	Documents int

	// IndexBytes is the amount of RAM used for index overhead.
	IndexBytes int64

	// ContentBytes is the amount of RAM used for raw content.
	ContentBytes int64

	// Sourcegraph specific stats below. These are not as efficient to calculate
	// as the above statistics. We experimentally measured about a 10% slower
	// shard load time. However, we find these values very useful to track and
	// computing them outside of load time introduces a lot of complexity.

	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
	// indexed documents. This is not exactly the same as line count, since it
	// will not include lines not terminated by "\n" (eg a file with no "\n", or
	// a final line without "\n"). Note: Zoekt deduplicates documents across
	// branches, so if a path has the same contents on multiple branches, there
	// is only one document for it. As such that document's newlines are only
	// counted once. See DefaultBranchNewLinesCount and OtherBranchesNewLinesCount
	// for counts which do not deduplicate.
	NewLinesCount uint64

	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
	// branch.
	DefaultBranchNewLinesCount uint64

	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
	// except the default branch.
	OtherBranchesNewLinesCount uint64
}
 801
 802func (s *RepoStats) Add(o *RepoStats) {
 803	// can't update Repos, since one repo may have multiple
 804	// shards.
 805	s.Shards += o.Shards
 806	s.IndexBytes += o.IndexBytes
 807	s.Documents += o.Documents
 808	s.ContentBytes += o.ContentBytes
 809
 810	// Sourcegraph specific
 811	s.NewLinesCount += o.NewLinesCount
 812	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 813	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 814}
 815
// RepoListEntry bundles a repository's metadata, its index metadata and its
// statistics. Per the note on RepoStats.Repos, Stats.Repos is not populated
// on individual entries.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
 821
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 851
 852type ReposMap map[uint32]MinimalRepoListEntry
 853
 854// MarshalBinary implements a specialized encoder for ReposMap.
 855func (q *ReposMap) MarshalBinary() ([]byte, error) {
 856	return reposMapEncode(*q)
 857}
 858
 859// UnmarshalBinary implements a specialized decoder for ReposMap.
 860func (q *ReposMap) UnmarshalBinary(b []byte) error {
 861	var err error
 862	(*q), err = reposMapDecode(b)
 863	return err
 864}
 865
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// populated depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash count.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for search; confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 880
// Searcher is the interface for running search and list queries.
type Searcher interface {
	// Search evaluates q under opts and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the searcher's resources; confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 892
 893type RepoListField int
 894
 895const (
 896	RepoListFieldRepos    RepoListField = 0
 897	RepoListFieldReposMap               = 2
 898)
 899
// ListOptions holds the options for a List request. A nil *ListOptions is
// accepted by GetField, which then reports RepoListFieldRepos.
type ListOptions struct {
	// Field decides which field to populate in RepoList response.
	Field RepoListField
}
 904
 905func (o *ListOptions) GetField() (RepoListField, error) {
 906	if o == nil {
 907		return RepoListFieldRepos, nil
 908	}
 909	switch o.Field {
 910	case RepoListFieldRepos, RepoListFieldReposMap:
 911		return o.Field, nil
 912	case 1:
 913		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 914	default:
 915		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 916	}
 917}
 918
// String returns the Go-syntax representation of o, as produced by fmt's
// %#v verb.
func (o *ListOptions) String() string {
	return fmt.Sprintf("%#v", o)
}
 922
// SearchOptions holds the options for a search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
	// scoring formula. The scoring algorithm treats each match in a file as a term
	// and computes an approximation to BM25.
	//
	// The calculation of IDF assumes that Zoekt visits all documents containing any
	// of the query terms during evaluation. This is true, for example, if all query
	// terms are ORed together.
	//
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}
 992
 993// String returns a succinct representation of the options. This is meant for
 994// human consumption in logs and traces.
 995//
 996// Note: some tracing systems have limits on length of values, so we take care
 997// to try and make this small, and include the important information near the
 998// front incase of truncation.
 999func (s *SearchOptions) String() string {
1000	var b strings.Builder
1001
1002	add := func(name, value string) {
1003		b.WriteString(name)
1004		b.WriteByte('=')
1005		b.WriteString(value)
1006		b.WriteByte(' ')
1007	}
1008	addInt := func(name string, value int) {
1009		if value != 0 {
1010			add(name, strconv.Itoa(value))
1011		}
1012	}
1013	addDuration := func(name string, value time.Duration) {
1014		if value != 0 {
1015			add(name, value.String())
1016		}
1017	}
1018	addBool := func(name string, value bool) {
1019		if !value {
1020			return
1021		}
1022		b.WriteString(name)
1023		b.WriteByte(' ')
1024	}
1025
1026	b.WriteString("zoekt.SearchOptions{ ")
1027
1028	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1029	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1030	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1031	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1032	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1033	addInt("NumContextLines", s.NumContextLines)
1034
1035	addDuration("MaxWallTime", s.MaxWallTime)
1036	addDuration("FlushWallTime", s.FlushWallTime)
1037
1038	addBool("EstimateDocCount", s.EstimateDocCount)
1039	addBool("Whole", s.Whole)
1040	addBool("ChunkMatches", s.ChunkMatches)
1041	addBool("UseBM25Scoring", s.UseBM25Scoring)
1042	addBool("Trace", s.Trace)
1043	addBool("DebugScore", s.DebugScore)
1044
1045	for k, v := range s.SpanContext {
1046		add("SpanContext."+k, strconv.Quote(v))
1047	}
1048
1049	b.WriteByte('}')
1050	return b.String()
1051}
1052
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send receives one SearchResult. It returns nothing, so an
	// implementation has no way to report a failure to the caller.
	Send(*SearchResult)
}
1057
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1066
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch takes the same query and options as Search, plus a Sender
	// that receives SearchResult values.
	// NOTE(review): presumably results are passed to sender as they become
	// available rather than returned as one SearchResult; confirm against the
	// implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed