api.go at 43b92256ba7117270d7cc8ca434cc14faf2cc241 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 43b92256ba7117270d7cc8ca434cc14faf2cc241 29 kB View raw
Ian Kerins Expand Repository.MergeMutable to cover more fields (#684) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
// Approximate sizes, in bytes, of Go's built-in header types. The sizeBytes
// methods in this file use them to estimate how much memory a search result
// occupies. The values correspond to a 64-bit architecture (pointerSize is 8).
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches were found in.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches presumably lists the branches the file was matched on — TODO
	// confirm against the code that populates it.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking of the match; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore presumably carries scoring diagnostics when
	// SearchOptions.DebugScore is set — TODO confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score of this chunk; NOTE(review): presumably higher is
	// better, as for LineMatch.Score — confirm.
	Score float64
}
 165
 166func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 167	// Content
 168	sz += sliceHeaderBytes + uint64(len(cm.Content))
 169
 170	// ContentStart
 171	sz += cm.ContentStart.sizeBytes()
 172
 173	// FileName
 174	sz += 1
 175
 176	// Ranges
 177	sz += sliceHeaderBytes
 178	if len(cm.Ranges) > 0 {
 179		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 180	}
 181
 182	// SymbolInfo
 183	sz += sliceHeaderBytes
 184	for _, si := range cm.SymbolInfo {
 185		sz += pointerSize
 186		if si != nil {
 187			sz += si.sizeBytes()
 188		}
 189	}
 190
 191	// Score
 192	sz += 8
 193
 194	// DebugScore
 195	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 196
 197	return
 198}
 199
 200type Range struct {
 201	// The inclusive beginning of the range.
 202	Start Location
 203	// The exclusive end of the range.
 204	End Location
 205}
 206
 207func (r *Range) sizeBytes() uint64 {
 208	return r.Start.sizeBytes() + r.End.sizeBytes()
 209}
 210
 211type Location struct {
 212	// 0-based byte offset from the beginning of the file
 213	ByteOffset uint32
 214	// 1-based line number from the beginning of the file
 215	LineNumber uint32
 216	// 1-based column number (in runes) from the beginning of line
 217	Column uint32
 218}
 219
 220func (l *Location) sizeBytes() uint64 {
 221	return 3 * 4
 222}
 223
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// NOTE(review): LineStart and LineEnd are presumably byte offsets of the
	// line within the file, and LineNumber its line number — confirm against
	// the code that populates them.
	LineStart  int
	LineEnd    int
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score      float64
	DebugScore string

	// LineFragments holds the matching segments within the line.
	LineFragments []LineFragmentMatch
}
 246
 247func (lm *LineMatch) sizeBytes() (sz uint64) {
 248	// Line
 249	sz += sliceHeaderBytes + uint64(len(lm.Line))
 250
 251	// LineStart, LineEnd, LineNumber
 252	sz += 3 * 8
 253
 254	// Before
 255	sz += sliceHeaderBytes + uint64(len(lm.Before))
 256
 257	// After
 258	sz += sliceHeaderBytes + uint64(len(lm.After))
 259
 260	// FileName
 261	sz += 1
 262
 263	// Score
 264	sz += 8
 265
 266	// DebugScore
 267	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 268
 269	// LineFragments
 270	sz += sliceHeaderBytes
 271	for _, lf := range lm.LineFragments {
 272		sz += lf.sizeBytes()
 273	}
 274
 275	return
 276}
 277
 278type Symbol struct {
 279	Sym        string
 280	Kind       string
 281	Parent     string
 282	ParentKind string
 283}
 284
 285func (s *Symbol) sizeBytes() uint64 {
 286	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 287}
 288
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is optional; sizeBytes treats a nil value as absent.
	SymbolInfo *Symbol
}
 302
 303func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 304	// LineOffset
 305	sz += 8
 306
 307	// Offset
 308	sz += 4
 309
 310	// MatchLength
 311	sz += 8
 312
 313	// SymbolInfo
 314	sz += pointerSize
 315	if lfm.SymbolInfo != nil {
 316		sz += lfm.SymbolInfo.sizeBytes()
 317	}
 318
 319	return
 320}
 321
 322type FlushReason uint8
 323
 324const (
 325	FlushReasonTimerExpired FlushReason = 1 << iota
 326	FlushReasonFinalFlush
 327	FlushReasonMaxSize
 328)
 329
 330var FlushReasonStrings = map[FlushReason]string{
 331	FlushReasonTimerExpired: "timer_expired",
 332	FlushReasonFinalFlush:   "final_flush",
 333	FlushReasonMaxSize:      "max_size_reached",
 334}
 335
 336func (fr FlushReason) String() string {
 337	if v, ok := FlushReasonStrings[fr]; ok {
 338		return v
 339	}
 340
 341	return "none"
 342}
 343
// Stats contains interesting numbers on the search.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Note that Add does not accumulate this
	// field.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
 411
 412func (s *Stats) sizeBytes() (sz uint64) {
 413	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 414	sz += 1     // FlushReason
 415
 416	return
 417}
 418
 419func (s *Stats) Add(o Stats) {
 420	s.ContentBytesLoaded += o.ContentBytesLoaded
 421	s.IndexBytesLoaded += o.IndexBytesLoaded
 422	s.Crashes += o.Crashes
 423	s.FileCount += o.FileCount
 424	s.FilesConsidered += o.FilesConsidered
 425	s.FilesLoaded += o.FilesLoaded
 426	s.FilesSkipped += o.FilesSkipped
 427	s.MatchCount += o.MatchCount
 428	s.NgramMatches += o.NgramMatches
 429	s.NgramLookups += o.NgramLookups
 430	s.ShardFilesConsidered += o.ShardFilesConsidered
 431	s.ShardsScanned += o.ShardsScanned
 432	s.ShardsSkipped += o.ShardsSkipped
 433	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 434	s.Wait += o.Wait
 435	s.MatchTreeConstruction += o.MatchTreeConstruction
 436	s.MatchTreeSearch += o.MatchTreeSearch
 437	s.RegexpsConsidered += o.RegexpsConsidered
 438
 439	// We want the first non-zero FlushReason to be sticky. This is a useful
 440	// property when aggregating stats from several Zoekts.
 441	if s.FlushReason == 0 {
 442		s.FlushReason = o.FlushReason
 443	}
 444}
 445
 446// Zero returns true if stats is empty.
 447func (s *Stats) Zero() bool {
 448	if s == nil {
 449		return true
 450	}
 451
 452	return !(s.ContentBytesLoaded > 0 ||
 453		s.IndexBytesLoaded > 0 ||
 454		s.Crashes > 0 ||
 455		s.FileCount > 0 ||
 456		s.FilesConsidered > 0 ||
 457		s.FilesLoaded > 0 ||
 458		s.FilesSkipped > 0 ||
 459		s.MatchCount > 0 ||
 460		s.NgramMatches > 0 ||
 461		s.NgramLookups > 0 ||
 462		s.ShardFilesConsidered > 0 ||
 463		s.ShardsScanned > 0 ||
 464		s.ShardsSkipped > 0 ||
 465		s.ShardsSkippedFilter > 0 ||
 466		s.Wait > 0 ||
 467		s.MatchTreeConstruction > 0 ||
 468		s.MatchTreeSearch > 0 ||
 469		s.RegexpsConsidered > 0)
 470}
 471
 472// Progress contains information about the global progress of the running search query.
 473// This is used by the frontend to reorder results and emit them when stable.
 474// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 475type Progress struct {
 476	// Priority of the shard that was searched.
 477	Priority float64
 478
 479	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 480	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 481	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 482	//
 483	// MaxPendingPriority decreases monotonically in each SearchResult.
 484	MaxPendingPriority float64
 485}
 486
 487func (p *Progress) sizeBytes() uint64 {
 488	return 2 * 8
 489}
 490
// SearchResult contains search matches and extra data.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds one FileMatch per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 507
 508// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 509// The estimate does not take alignment into account. The result is a lower
 510// bound on the actual size in memory.
 511func (sr *SearchResult) SizeBytes() (sz uint64) {
 512	sz += sr.Stats.sizeBytes()
 513	sz += sr.Progress.sizeBytes()
 514
 515	// Files
 516	sz += sliceHeaderBytes
 517	for _, f := range sr.Files {
 518		sz += f.sizeBytes()
 519	}
 520
 521	// RepoURLs
 522	sz += mapHeaderBytes
 523	for k, v := range sr.RepoURLs {
 524		sz += stringHeaderBytes + uint64(len(k))
 525		sz += stringHeaderBytes + uint64(len(v))
 526	}
 527
 528	// LineFragments
 529	sz += mapHeaderBytes
 530	for k, v := range sr.LineFragments {
 531		sz += stringHeaderBytes + uint64(len(k))
 532		sz += stringHeaderBytes + uint64(len(v))
 533	}
 534
 535	return
 536}
 537
 538// RepositoryBranch describes an indexed branch, which is a name
 539// combined with a version.
 540type RepositoryBranch struct {
 541	Name    string
 542	Version string
 543}
 544
 545func (r RepositoryBranch) String() string {
 546	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 547}
 548
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 616
 617func (r *Repository) UnmarshalJSON(data []byte) error {
 618	// We define a new type so that we can use json.Unmarshal
 619	// without recursing into this same method.
 620	type repository *Repository
 621	repo := repository(r)
 622
 623	err := json.Unmarshal(data, repo)
 624	if err != nil {
 625		return err
 626	}
 627
 628	if v, ok := repo.RawConfig["repoid"]; ok {
 629		id, _ := strconv.ParseUint(v, 10, 32)
 630		r.ID = uint32(id)
 631	}
 632
 633	if v, ok := repo.RawConfig["priority"]; ok {
 634		r.priority, err = strconv.ParseFloat(v, 64)
 635		if err != nil {
 636			r.priority = 0
 637		}
 638
 639		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 640		// based on priority. Setting it on read instead of during indexing
 641		// allows us to avoid a complete reindex.
 642		if r.Rank == 0 && r.priority > 0 {
 643			// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
 644			// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
 645			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 646		}
 647	}
 648	return nil
 649}
 650
 651// MergeMutable will merge x into r. mutated will be true if it made any
 652// changes. err is non-nil if we needed to mutate an immutable field.
 653//
 654// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 655// computed while indexing so can't be synthesized from x.
 656//
 657// Note: We ignore RawConfig fields which are duplicated into Repository:
 658// name and id.
 659func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 660	if r.ID != x.ID {
 661		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 662		return mutated, errors.New("ID is immutable")
 663	}
 664	if r.Name != x.Name {
 665		// Name is encoded into the shard name on disk. We need to re-index if it
 666		// changes.
 667		return mutated, errors.New("Name is immutable")
 668	}
 669	if !reflect.DeepEqual(r.Branches, x.Branches) {
 670		// Need a reindex if content changing.
 671		return mutated, errors.New("Branches is immutable")
 672	}
 673
 674	for k, v := range x.RawConfig {
 675		// We ignore name and id since they are encoded into the repository.
 676		if k == "name" || k == "id" {
 677			continue
 678		}
 679		if r.RawConfig == nil {
 680			mutated = true
 681			r.RawConfig = make(map[string]string)
 682		}
 683		if r.RawConfig[k] != v {
 684			mutated = true
 685			r.RawConfig[k] = v
 686		}
 687	}
 688
 689	if r.URL != x.URL {
 690		mutated = true
 691		r.URL = x.URL
 692	}
 693	if r.CommitURLTemplate != x.CommitURLTemplate {
 694		mutated = true
 695		r.CommitURLTemplate = x.CommitURLTemplate
 696	}
 697	if r.FileURLTemplate != x.FileURLTemplate {
 698		mutated = true
 699		r.FileURLTemplate = x.FileURLTemplate
 700	}
 701	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 702		mutated = true
 703		r.LineFragmentTemplate = x.LineFragmentTemplate
 704	}
 705
 706	return mutated, nil
 707}
 708
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// IndexTime is the source of MinimalRepoListEntry.IndexTimeUnix.
	IndexTime    time.Time
	PlainASCII   bool
	LanguageMap  map[string]uint16
	ZoektVersion string
	ID           string
}
 721
 722// Statistics of a (collection of) repositories.
 723type RepoStats struct {
 724	// Repos is used for aggregrating the number of repositories.
 725	//
 726	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 727	// only for RepoList.Stats (aggregate).
 728	Repos int
 729
 730	// Shards is the total number of search shards.
 731	Shards int
 732
 733	// Documents holds the number of documents or files.
 734	Documents int
 735
 736	// IndexBytes is the amount of RAM used for index overhead.
 737	IndexBytes int64
 738
 739	// ContentBytes is the amount of RAM used for raw content.
 740	ContentBytes int64
 741
 742	// Sourcegraph specific stats below. These are not as efficient to calculate
 743	// as the above statistics. We experimentally measured about a 10% slower
 744	// shard load time. However, we find these values very useful to track and
 745	// computing them outside of load time introduces a lot of complexity.
 746
 747	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 748	// indexed documents. This is not exactly the same as line count, since it
 749	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 750	// a final line without "\n"). Note: Zoekt deduplicates documents across
 751	// branches, so if a path has the same contents on multiple branches, there
 752	// is only one document for it. As such that document's newlines is only
 753	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 754	// for counts which do not deduplicate.
 755	NewLinesCount uint64
 756
 757	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 758	// branch.
 759	DefaultBranchNewLinesCount uint64
 760
 761	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 762	// except the default branch.
 763	OtherBranchesNewLinesCount uint64
 764}
 765
 766func (s *RepoStats) Add(o *RepoStats) {
 767	// can't update Repos, since one repo may have multiple
 768	// shards.
 769	s.Shards += o.Shards
 770	s.IndexBytes += o.IndexBytes
 771	s.Documents += o.Documents
 772	s.ContentBytes += o.ContentBytes
 773
 774	// Sourcegraph specific
 775	s.NewLinesCount += o.NewLinesCount
 776	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 777	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 778}
 779
// RepoListEntry bundles the Repository metadata, IndexMetadata and RepoStats
// of one entry in RepoList.Repos.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	// Stats for this entry. Stats.Repos is not populated here (see RepoStats).
	Stats RepoStats
}
 785
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 815
 816type ReposMap map[uint32]MinimalRepoListEntry
 817
 818// MarshalBinary implements a specialized encoder for ReposMap.
 819func (q *ReposMap) MarshalBinary() ([]byte, error) {
 820	return reposMapEncode(*q)
 821}
 822
 823// UnmarshalBinary implements a specialized decoder for ReposMap.
 824func (q *ReposMap) UnmarshalBinary(b []byte) error {
 825	var err error
 826	(*q), err = reposMapDecode(b)
 827	return err
 828}
 829
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is presumably the number of shards that crashed while listing,
	// mirroring Stats.Crashes for searches — TODO confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 844
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 856
 857type RepoListField int
 858
 859const (
 860	RepoListFieldRepos    RepoListField = 0
 861	RepoListFieldReposMap               = 2
 862)
 863
 864type ListOptions struct {
 865	// Field decides which field to populate in RepoList response.
 866	Field RepoListField
 867}
 868
 869func (o *ListOptions) GetField() (RepoListField, error) {
 870	if o == nil {
 871		return RepoListFieldRepos, nil
 872	}
 873	switch o.Field {
 874	case RepoListFieldRepos, RepoListFieldReposMap:
 875		return o.Field, nil
 876	case 1:
 877		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 878	default:
 879		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 880	}
 881}
 882
 883func (o *ListOptions) String() string {
 884	return fmt.Sprintf("%#v", o)
 885}
 886
 887type SearchOptions struct {
 888	// Return an upper-bound estimate of eligible documents in
 889	// stats.ShardFilesConsidered.
 890	EstimateDocCount bool
 891
 892	// Return the whole file.
 893	Whole bool
 894
 895	// Maximum number of matches: skip all processing an index
 896	// shard after we found this many non-overlapping matches.
 897	ShardMaxMatchCount int
 898
 899	// Maximum number of matches: stop looking for more matches
 900	// once we have this many matches across shards.
 901	TotalMaxMatchCount int
 902
 903	// Maximum number of matches: skip processing documents for a repository in
 904	// a shard once we have found ShardRepoMaxMatchCount.
 905	//
 906	// A compound shard may contain multiple repositories. This will most often
 907	// be set to 1 to find all repositories containing a result.
 908	ShardRepoMaxMatchCount int
 909
 910	// Abort the search after this much time has passed.
 911	MaxWallTime time.Duration
 912
 913	// FlushWallTime if non-zero will stop streaming behaviour at first and
 914	// instead will collate and sort results. At FlushWallTime the results will
 915	// be sent and then the behaviour will revert to the normal streaming.
 916	FlushWallTime time.Duration
 917
 918	// Truncates the number of documents (i.e. files) after collating and
 919	// sorting the results.
 920	MaxDocDisplayCount int
 921
 922	// Truncates the number of matchs after collating and sorting the results.
 923	MaxMatchDisplayCount int
 924
 925	// If set to a number greater than zero then up to this many number
 926	// of context lines will be added before and after each matched line.
 927	// Note that the included context lines might contain matches and
 928	// it's up to the consumer of the result to remove those lines.
 929	NumContextLines int
 930
 931	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 932	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 933	ChunkMatches bool
 934
 935	// EXPERIMENTAL. If true, document ranks are used as additional input for
 936	// sorting matches.
 937	UseDocumentRanks bool
 938
 939	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
 940	// their weight in the file match score. If the value is <= 0.0, the default weight value
 941	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 942	DocumentRanksWeight float64
 943
 944	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
 945	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
 946	// When enabled, all other scoring signals are ignored, including document ranks.
 947	UseKeywordScoring bool
 948
 949	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 950	// a command-line flag
 951	Trace bool
 952
 953	// If set, the search results will contain debug information for scoring.
 954	DebugScore bool
 955
 956	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 957	SpanContext map[string]string
 958}
 959
 960// String returns a succinct representation of the options. This is meant for
 961// human consumption in logs and traces.
 962//
 963// Note: some tracing systems have limits on length of values, so we take care
 964// to try and make this small, and include the important information near the
 965// front incase of truncation.
 966func (s *SearchOptions) String() string {
 967	var b strings.Builder
 968
 969	add := func(name, value string) {
 970		b.WriteString(name)
 971		b.WriteByte('=')
 972		b.WriteString(value)
 973		b.WriteByte(' ')
 974	}
 975	addInt := func(name string, value int) {
 976		if value != 0 {
 977			add(name, strconv.Itoa(value))
 978		}
 979	}
 980	addDuration := func(name string, value time.Duration) {
 981		if value != 0 {
 982			add(name, value.String())
 983		}
 984	}
 985	addBool := func(name string, value bool) {
 986		if !value {
 987			return
 988		}
 989		b.WriteString(name)
 990		b.WriteByte(' ')
 991	}
 992
 993	b.WriteString("zoekt.SearchOptions{ ")
 994
 995	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
 996	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
 997	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
 998	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
 999	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1000	addInt("NumContextLines", s.NumContextLines)
1001
1002	addDuration("MaxWallTime", s.MaxWallTime)
1003	addDuration("FlushWallTime", s.FlushWallTime)
1004
1005	if s.DocumentRanksWeight > 0 {
1006		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
1007	}
1008
1009	addBool("EstimateDocCount", s.EstimateDocCount)
1010	addBool("Whole", s.Whole)
1011	addBool("ChunkMatches", s.ChunkMatches)
1012	addBool("UseDocumentRanks", s.UseDocumentRanks)
1013	addBool("UseKeywordScoring", s.UseKeywordScoring)
1014	addBool("Trace", s.Trace)
1015	addBool("DebugScore", s.DebugScore)
1016
1017	for k, v := range s.SpanContext {
1018		add("SpanContext."+k, strconv.Quote(v))
1019	}
1020
1021	b.WriteByte('}')
1022	return b.String()
1023}
1024
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send delivers one SearchResult to the receiver. StreamSearch takes a
	// Sender as the destination for its results.
	Send(*SearchResult)
}
1029
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch takes the same query and options as Search plus a Sender
	// and returns only an error.
	// NOTE(review): results are presumably delivered through sender rather
	// than returned — confirm against an implementation.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed