api.go at 6a4b615768a3bdbd9c05d0a92d7d5b2321937583 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 6a4b615768a3bdbd9c05d0a92d7d5b2321937583 31 kB View raw
Stefan Hengl sourcegraph: fix wrong git config (#841) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
// Fixed per-value overheads, in bytes, used by the sizeBytes estimates in this
// file. The values assume a 64-bit architecture (pointerSize is 8).
const (
	// Overhead charged once per map.
	mapHeaderBytes uint64 = 48
	// Overhead charged once per slice, in addition to its elements.
	sliceHeaderBytes uint64 = 24
	// Overhead charged once per string, in addition to its bytes.
	stringHeaderBytes uint64 = 16
	// Size of a single pointer.
	pointerSize uint64 = 8
	// Overhead of an interface value.
	interfaceBytes uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches were found in.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. It needs DebugScore set, but is public so tests
	// in other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// NOTE(review): presumably the branches of the repository that contain
	// this file — confirm against the code that populates it.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking of this file; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm against the scoring code.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score of this chunk.
	// NOTE(review): presumably higher is better, as documented for
	// LineMatch.Score — confirm.
	Score float64
}
 166
 167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 168	// Content
 169	sz += sliceHeaderBytes + uint64(len(cm.Content))
 170
 171	// ContentStart
 172	sz += cm.ContentStart.sizeBytes()
 173
 174	// FileName
 175	sz += 1
 176
 177	// Ranges
 178	sz += sliceHeaderBytes
 179	if len(cm.Ranges) > 0 {
 180		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 181	}
 182
 183	// SymbolInfo
 184	sz += sliceHeaderBytes
 185	for _, si := range cm.SymbolInfo {
 186		sz += pointerSize
 187		if si != nil {
 188			sz += si.sizeBytes()
 189		}
 190	}
 191
 192	// Score
 193	sz += 8
 194
 195	// DebugScore
 196	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 197
 198	return
 199}
 200
 201type Range struct {
 202	// The inclusive beginning of the range.
 203	Start Location
 204	// The exclusive end of the range.
 205	End Location
 206}
 207
 208func (r *Range) sizeBytes() uint64 {
 209	return r.Start.sizeBytes() + r.End.sizeBytes()
 210}
 211
 212type Location struct {
 213	// 0-based byte offset from the beginning of the file
 214	ByteOffset uint32
 215	// 1-based line number from the beginning of the file
 216	LineNumber uint32
 217	// 1-based column number (in runes) from the beginning of line
 218	Column uint32
 219}
 220
 221func (l *Location) sizeBytes() uint64 {
 222	return 3 * 4
 223}
 224
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd int
	// NOTE(review): presumably 1-based like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score      float64
	DebugScore string

	// LineFragments holds the matching segments within this line.
	LineFragments []LineFragmentMatch
}
 251
 252func (lm *LineMatch) sizeBytes() (sz uint64) {
 253	// Line
 254	sz += sliceHeaderBytes + uint64(len(lm.Line))
 255
 256	// LineStart, LineEnd, LineNumber
 257	sz += 3 * 8
 258
 259	// Before
 260	sz += sliceHeaderBytes + uint64(len(lm.Before))
 261
 262	// After
 263	sz += sliceHeaderBytes + uint64(len(lm.After))
 264
 265	// FileName
 266	sz += 1
 267
 268	// Score
 269	sz += 8
 270
 271	// DebugScore
 272	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 273
 274	// LineFragments
 275	sz += sliceHeaderBytes
 276	for _, lf := range lm.LineFragments {
 277		sz += lf.sizeBytes()
 278	}
 279
 280	return
 281}
 282
 283type Symbol struct {
 284	Sym        string
 285	Kind       string
 286	Parent     string
 287	ParentKind string
 288}
 289
 290func (s *Symbol) sizeBytes() uint64 {
 291	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 292}
 293
 294// LineFragmentMatch a segment of matching text within a line.
 295type LineFragmentMatch struct {
 296	// Offset within the line, in bytes.
 297	LineOffset int
 298
 299	// Offset from file start, in bytes.
 300	Offset uint32
 301
 302	// Number bytes that match.
 303	MatchLength int
 304
 305	SymbolInfo *Symbol
 306}
 307
 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 309	// LineOffset
 310	sz += 8
 311
 312	// Offset
 313	sz += 4
 314
 315	// MatchLength
 316	sz += 8
 317
 318	// SymbolInfo
 319	sz += pointerSize
 320	if lfm.SymbolInfo != nil {
 321		sz += lfm.SymbolInfo.sizeBytes()
 322	}
 323
 324	return
 325}
 326
 327type FlushReason uint8
 328
 329const (
 330	FlushReasonTimerExpired FlushReason = 1 << iota
 331	FlushReasonFinalFlush
 332	FlushReasonMaxSize
 333)
 334
 335var FlushReasonStrings = map[FlushReason]string{
 336	FlushReasonTimerExpired: "timer_expired",
 337	FlushReasonFinalFlush:   "final_flush",
 338	FlushReasonMaxSize:      "max_size_reached",
 339}
 340
 341func (fr FlushReason) String() string {
 342	if v, ok := FlushReasonStrings[fr]; ok {
 343		return v
 344	}
 345
 346	return "none"
 347}
 348
// Stats contains interesting numbers on the search.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
 416
 417func (s *Stats) sizeBytes() (sz uint64) {
 418	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 419	sz += 1     // FlushReason
 420
 421	return
 422}
 423
 424func (s *Stats) Add(o Stats) {
 425	s.ContentBytesLoaded += o.ContentBytesLoaded
 426	s.IndexBytesLoaded += o.IndexBytesLoaded
 427	s.Crashes += o.Crashes
 428	s.FileCount += o.FileCount
 429	s.FilesConsidered += o.FilesConsidered
 430	s.FilesLoaded += o.FilesLoaded
 431	s.FilesSkipped += o.FilesSkipped
 432	s.MatchCount += o.MatchCount
 433	s.NgramMatches += o.NgramMatches
 434	s.NgramLookups += o.NgramLookups
 435	s.ShardFilesConsidered += o.ShardFilesConsidered
 436	s.ShardsScanned += o.ShardsScanned
 437	s.ShardsSkipped += o.ShardsSkipped
 438	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 439	s.Wait += o.Wait
 440	s.MatchTreeConstruction += o.MatchTreeConstruction
 441	s.MatchTreeSearch += o.MatchTreeSearch
 442	s.RegexpsConsidered += o.RegexpsConsidered
 443
 444	// We want the first non-zero FlushReason to be sticky. This is a useful
 445	// property when aggregating stats from several Zoekts.
 446	if s.FlushReason == 0 {
 447		s.FlushReason = o.FlushReason
 448	}
 449}
 450
 451// Zero returns true if stats is empty.
 452func (s *Stats) Zero() bool {
 453	if s == nil {
 454		return true
 455	}
 456
 457	return !(s.ContentBytesLoaded > 0 ||
 458		s.IndexBytesLoaded > 0 ||
 459		s.Crashes > 0 ||
 460		s.FileCount > 0 ||
 461		s.FilesConsidered > 0 ||
 462		s.FilesLoaded > 0 ||
 463		s.FilesSkipped > 0 ||
 464		s.MatchCount > 0 ||
 465		s.NgramMatches > 0 ||
 466		s.NgramLookups > 0 ||
 467		s.ShardFilesConsidered > 0 ||
 468		s.ShardsScanned > 0 ||
 469		s.ShardsSkipped > 0 ||
 470		s.ShardsSkippedFilter > 0 ||
 471		s.Wait > 0 ||
 472		s.MatchTreeConstruction > 0 ||
 473		s.MatchTreeSearch > 0 ||
 474		s.RegexpsConsidered > 0)
 475}
 476
 477// Progress contains information about the global progress of the running search query.
 478// This is used by the frontend to reorder results and emit them when stable.
 479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 480type Progress struct {
 481	// Priority of the shard that was searched.
 482	Priority float64
 483
 484	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 485	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 486	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 487	//
 488	// MaxPendingPriority decreases monotonically in each SearchResult.
 489	MaxPendingPriority float64
 490}
 491
 492func (p *Progress) sizeBytes() uint64 {
 493	return 2 * 8
 494}
 495
// SearchResult contains search matches and extra data.
type SearchResult struct {
	// Stats is embedded, so its fields are promoted onto SearchResult.
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds one entry per file that matched.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 512
 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 514// The estimate does not take alignment into account. The result is a lower
 515// bound on the actual size in memory.
 516func (sr *SearchResult) SizeBytes() (sz uint64) {
 517	sz += sr.Stats.sizeBytes()
 518	sz += sr.Progress.sizeBytes()
 519
 520	// Files
 521	sz += sliceHeaderBytes
 522	for _, f := range sr.Files {
 523		sz += f.sizeBytes()
 524	}
 525
 526	// RepoURLs
 527	sz += mapHeaderBytes
 528	for k, v := range sr.RepoURLs {
 529		sz += stringHeaderBytes + uint64(len(k))
 530		sz += stringHeaderBytes + uint64(len(v))
 531	}
 532
 533	// LineFragments
 534	sz += mapHeaderBytes
 535	for k, v := range sr.LineFragments {
 536		sz += stringHeaderBytes + uint64(len(k))
 537		sz += stringHeaderBytes + uint64(len(v))
 538	}
 539
 540	return
 541}
 542
 543// RepositoryBranch describes an indexed branch, which is a name
 544// combined with a version.
 545type RepositoryBranch struct {
 546	Name    string
 547	Version string
 548}
 549
 550func (r RepositoryBranch) String() string {
 551	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 552}
 553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. UnmarshalJSON
	// derives it from RawConfig when "latestCommitDate" or "priority" is set.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 621
 622func (r *Repository) UnmarshalJSON(data []byte) error {
 623	// We define a new type so that we can use json.Unmarshal
 624	// without recursing into this same method.
 625	type repository *Repository
 626	repo := repository(r)
 627
 628	err := json.Unmarshal(data, repo)
 629	if err != nil {
 630		return err
 631	}
 632
 633	if v, ok := repo.RawConfig["repoid"]; ok {
 634		id, _ := strconv.ParseUint(v, 10, 32)
 635		r.ID = uint32(id)
 636	}
 637
 638	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
 639	// on read instead of during indexing allows us to avoid a complete reindex.
 640	//
 641	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
 642	// backwards compatibility.
 643	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
 644		// We use the number of months since 1970 as a simple measure of repo freshness.
 645		// It is monotonically increasing and stable across re-indexes and restarts.
 646		r.Rank = monthsSince1970(repo.LatestCommitDate)
 647	} else if v, ok := repo.RawConfig["priority"]; ok {
 648		r.priority, err = strconv.ParseFloat(v, 64)
 649		if err != nil {
 650			r.priority = 0
 651		}
 652
 653		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 654		// based on priority. Setting it on read instead of during indexing
 655		// allows us to avoid a complete reindex.
 656		if r.Rank == 0 && r.priority > 0 {
 657			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
 658			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
 659			// returns from more stars.
 660			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 661		}
 662	}
 663
 664	return nil
 665}
 666
 667// monthsSince1970 returns the number of months since 1970. It returns values in
 668// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 669// lower bound for all dates before 1970.
 670func monthsSince1970(t time.Time) uint16 {
 671	base := time.Unix(0, 0)
 672	if t.Before(base) {
 673		return 0
 674	}
 675	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 676	return uint16(min(months, maxUInt16))
 677}
 678
 679// MergeMutable will merge x into r. mutated will be true if it made any
 680// changes. err is non-nil if we needed to mutate an immutable field.
 681//
 682// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 683// computed while indexing so can't be synthesized from x.
 684//
 685// Note: We ignore RawConfig fields which are duplicated into Repository:
 686// name and id.
 687func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 688	if r.ID != x.ID {
 689		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 690		return mutated, errors.New("ID is immutable")
 691	}
 692	if r.Name != x.Name {
 693		// Name is encoded into the shard name on disk. We need to re-index if it
 694		// changes.
 695		return mutated, errors.New("Name is immutable")
 696	}
 697	if !reflect.DeepEqual(r.Branches, x.Branches) {
 698		// Need a reindex if content changing.
 699		return mutated, errors.New("Branches is immutable")
 700	}
 701
 702	for k, v := range x.RawConfig {
 703		// We ignore name and id since they are encoded into the repository.
 704		if k == "name" || k == "id" {
 705			continue
 706		}
 707		if r.RawConfig == nil {
 708			mutated = true
 709			r.RawConfig = make(map[string]string)
 710		}
 711		if r.RawConfig[k] != v {
 712			mutated = true
 713			r.RawConfig[k] = v
 714		}
 715	}
 716
 717	if r.URL != x.URL {
 718		mutated = true
 719		r.URL = x.URL
 720	}
 721	if r.CommitURLTemplate != x.CommitURLTemplate {
 722		mutated = true
 723		r.CommitURLTemplate = x.CommitURLTemplate
 724	}
 725	if r.FileURLTemplate != x.FileURLTemplate {
 726		mutated = true
 727		r.FileURLTemplate = x.FileURLTemplate
 728	}
 729	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 730		mutated = true
 731		r.LineFragmentTemplate = x.LineFragmentTemplate
 732	}
 733
 734	return mutated, nil
 735}
 736
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are undocumented here; the notes below
// are inferred from the field names and should be confirmed against the
// indexer.
type IndexMetadata struct {
	// Presumably version numbers describing the shard's on-disk format, its
	// feature set and the oldest reader able to load it — confirm.
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	// Presumably the time at which the index was built — confirm.
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
 749
 750// Statistics of a (collection of) repositories.
 751type RepoStats struct {
 752	// Repos is used for aggregrating the number of repositories.
 753	//
 754	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 755	// only for RepoList.Stats (aggregate).
 756	Repos int
 757
 758	// Shards is the total number of search shards.
 759	Shards int
 760
 761	// Documents holds the number of documents or files.
 762	Documents int
 763
 764	// IndexBytes is the amount of RAM used for index overhead.
 765	IndexBytes int64
 766
 767	// ContentBytes is the amount of RAM used for raw content.
 768	ContentBytes int64
 769
 770	// Sourcegraph specific stats below. These are not as efficient to calculate
 771	// as the above statistics. We experimentally measured about a 10% slower
 772	// shard load time. However, we find these values very useful to track and
 773	// computing them outside of load time introduces a lot of complexity.
 774
 775	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 776	// indexed documents. This is not exactly the same as line count, since it
 777	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 778	// a final line without "\n"). Note: Zoekt deduplicates documents across
 779	// branches, so if a path has the same contents on multiple branches, there
 780	// is only one document for it. As such that document's newlines is only
 781	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 782	// for counts which do not deduplicate.
 783	NewLinesCount uint64
 784
 785	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 786	// branch.
 787	DefaultBranchNewLinesCount uint64
 788
 789	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 790	// except the default branch.
 791	OtherBranchesNewLinesCount uint64
 792}
 793
 794func (s *RepoStats) Add(o *RepoStats) {
 795	// can't update Repos, since one repo may have multiple
 796	// shards.
 797	s.Shards += o.Shards
 798	s.IndexBytes += o.IndexBytes
 799	s.Documents += o.Documents
 800	s.ContentBytes += o.ContentBytes
 801
 802	// Sourcegraph specific
 803	s.NewLinesCount += o.NewLinesCount
 804	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 805	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 806}
 807
// RepoListEntry bundles a repository's metadata with its index metadata and
// statistics. Stats.Repos is not populated on an individual entry (see
// RepoStats.Repos).
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
 813
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 843
 844type ReposMap map[uint32]MinimalRepoListEntry
 845
 846// MarshalBinary implements a specialized encoder for ReposMap.
 847func (q *ReposMap) MarshalBinary() ([]byte, error) {
 848	return reposMapEncode(*q)
 849}
 850
 851// UnmarshalBinary implements a specialized decoder for ReposMap.
 852func (q *ReposMap) UnmarshalBinary(b []byte) error {
 853	var err error
 854	(*q), err = reposMapDecode(b)
 855	return err
 856}
 857
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// populated depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 872
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search evaluates q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// NOTE(review): presumably releases the resources held by the searcher —
	// confirm against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 884
 885type RepoListField int
 886
 887const (
 888	RepoListFieldRepos    RepoListField = 0
 889	RepoListFieldReposMap               = 2
 890)
 891
 892type ListOptions struct {
 893	// Field decides which field to populate in RepoList response.
 894	Field RepoListField
 895}
 896
 897func (o *ListOptions) GetField() (RepoListField, error) {
 898	if o == nil {
 899		return RepoListFieldRepos, nil
 900	}
 901	switch o.Field {
 902	case RepoListFieldRepos, RepoListFieldReposMap:
 903		return o.Field, nil
 904	case 1:
 905		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 906	default:
 907		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 908	}
 909}
 910
 911func (o *ListOptions) String() string {
 912	return fmt.Sprintf("%#v", o)
 913}
 914
// SearchOptions configures a single search request.
type SearchOptions struct {
	// Return an upper-bound estimate of eligible documents in
	// stats.ShardFilesConsidered.
	EstimateDocCount bool

	// Return the whole file.
	Whole bool

	// Maximum number of matches: skip all processing an index
	// shard after we found this many non-overlapping matches.
	ShardMaxMatchCount int

	// Maximum number of matches: stop looking for more matches
	// once we have this many matches across shards.
	TotalMaxMatchCount int

	// Maximum number of matches: skip processing documents for a repository in
	// a shard once we have found ShardRepoMaxMatchCount.
	//
	// A compound shard may contain multiple repositories. This will most often
	// be set to 1 to find all repositories containing a result.
	ShardRepoMaxMatchCount int

	// Abort the search after this much time has passed.
	MaxWallTime time.Duration

	// FlushWallTime if non-zero will stop streaming behaviour at first and
	// instead will collate and sort results. At FlushWallTime the results will
	// be sent and then the behaviour will revert to the normal streaming.
	FlushWallTime time.Duration

	// Truncates the number of documents (i.e. files) after collating and
	// sorting the results.
	MaxDocDisplayCount int

	// Truncates the number of matches after collating and sorting the results.
	MaxMatchDisplayCount int

	// If set to a number greater than zero then up to this many number
	// of context lines will be added before and after each matched line.
	// Note that the included context lines might contain matches and
	// it's up to the consumer of the result to remove those lines.
	NumContextLines int

	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
	ChunkMatches bool

	// EXPERIMENTAL. If true, document ranks are used as additional input for
	// sorting matches.
	UseDocumentRanks bool

	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
	// their weight in the file match score. If the value is <= 0.0, the default weight value
	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
	DocumentRanksWeight float64

	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
	// scoring formula. The scoring algorithm treats each match in a file as a term
	// and computes an approximation to BM25.
	//
	// The calculation of IDF assumes that Zoekt visits all documents containing any
	// of the query terms during evaluation. This is true, for example, if all query
	// terms are ORed together.
	//
	// When enabled, all other scoring signals are ignored, including document ranks.
	UseBM25Scoring bool

	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
	// a command-line flag
	Trace bool

	// If set, the search results will contain debug information for scoring.
	DebugScore bool

	// SpanContext is the opentracing span context, if it exists, from the zoekt client
	SpanContext map[string]string
}
 993
 994// String returns a succinct representation of the options. This is meant for
 995// human consumption in logs and traces.
 996//
 997// Note: some tracing systems have limits on length of values, so we take care
 998// to try and make this small, and include the important information near the
 999// front incase of truncation.
1000func (s *SearchOptions) String() string {
1001	var b strings.Builder
1002
1003	add := func(name, value string) {
1004		b.WriteString(name)
1005		b.WriteByte('=')
1006		b.WriteString(value)
1007		b.WriteByte(' ')
1008	}
1009	addInt := func(name string, value int) {
1010		if value != 0 {
1011			add(name, strconv.Itoa(value))
1012		}
1013	}
1014	addDuration := func(name string, value time.Duration) {
1015		if value != 0 {
1016			add(name, value.String())
1017		}
1018	}
1019	addBool := func(name string, value bool) {
1020		if !value {
1021			return
1022		}
1023		b.WriteString(name)
1024		b.WriteByte(' ')
1025	}
1026
1027	b.WriteString("zoekt.SearchOptions{ ")
1028
1029	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1030	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1031	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1032	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1033	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1034	addInt("NumContextLines", s.NumContextLines)
1035
1036	addDuration("MaxWallTime", s.MaxWallTime)
1037	addDuration("FlushWallTime", s.FlushWallTime)
1038
1039	if s.DocumentRanksWeight > 0 {
1040		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
1041	}
1042
1043	addBool("EstimateDocCount", s.EstimateDocCount)
1044	addBool("Whole", s.Whole)
1045	addBool("ChunkMatches", s.ChunkMatches)
1046	addBool("UseDocumentRanks", s.UseDocumentRanks)
1047	addBool("UseBM25Scoring", s.UseBM25Scoring)
1048	addBool("Trace", s.Trace)
1049	addBool("DebugScore", s.DebugScore)
1050
1051	for k, v := range s.SpanContext {
1052		add("SpanContext."+k, strconv.Quote(v))
1053	}
1054
1055	b.WriteByte('}')
1056	return b.String()
1057}
1058
// Sender is the interface that wraps the basic Send method.
//
// Send hands one SearchResult to the receiver; it returns nothing, so an
// implementation has no way to report failure through this interface.
type Sender interface {
	Send(*SearchResult)
}
1063
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1072
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch takes the same query and options as Search, plus a Sender.
	// NOTE(review): presumably results are delivered through sender as they
	// become available rather than returned — confirm against the
	// implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed