api.go at 3aee1cc797babea157a0510c405d669d83cbdfb9 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 3aee1cc797babea157a0510c405d669d83cbdfb9 32 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
  17import (
  18	"context"
  19	"encoding/json"
  20	"errors"
  21	"fmt"
  22	"math"
  23	"reflect"
  24	"strconv"
  25	"strings"
  26	"time"
  27
  28	"github.com/sourcegraph/zoekt/query"
  29)
  30
  31const (
  32	mapHeaderBytes    uint64 = 48
  33	sliceHeaderBytes  uint64 = 24
  34	stringHeaderBytes uint64 = 16
  35	pointerSize       uint64 = 8
  36)
  37
  38// FileMatch contains all the matches within a file.
  39type FileMatch struct {
  40	FileName string
  41
  42	// Repository is the globally unique name of the repo of the
  43	// match
  44	Repository string
  45
  46	// SubRepositoryName is the globally unique name of the repo,
  47	// if it came from a subrepository
  48	SubRepositoryName string `json:",omitempty"`
  49
  50	// SubRepositoryPath holds the prefix where the subrepository
  51	// was mounted.
  52	SubRepositoryPath string `json:",omitempty"`
  53
  54	// Commit SHA1 (hex) of the (sub)repo holding the file.
  55	Version string `json:",omitempty"`
  56
  57	// Detected language of the result.
  58	Language string
  59
  60	// For debugging. Needs DebugScore set, but public so tests in
  61	// other packages can print some diagnostics.
  62	Debug string `json:",omitempty"`
  63
  64	Branches []string `json:",omitempty"`
  65
  66	// One of LineMatches or ChunkMatches will be returned depending on whether
  67	// the SearchOptions.ChunkMatches is set.
  68	LineMatches  []LineMatch  `json:",omitempty"`
  69	ChunkMatches []ChunkMatch `json:",omitempty"`
  70
  71	// Only set if requested
  72	Content []byte `json:",omitempty"`
  73
  74	// Checksum of the content.
  75	Checksum []byte
  76
  77	// Ranking; the higher, the better.
  78	Score float64 `json:",omitempty"`
  79
  80	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
  81	// order results from different repositories relative to each other.
  82	RepositoryPriority float64 `json:",omitempty"`
  83
  84	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
  85	// Sourcegraph.
  86	RepositoryID uint32 `json:",omitempty"`
  87}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
 138// AddScore increments the score of the FileMatch by the computed score. If
 139// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 140// -1, it is ignored. Otherwise, it is added to the debug string.
 141func (m *FileMatch) AddScore(what string, computed float64, raw float64, debugScore bool) {
 142	if computed != 0 && debugScore {
 143		var b strings.Builder
 144		fmt.Fprintf(&b, "%s", what)
 145		if raw != -1 {
 146			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 147		}
 148		fmt.Fprintf(&b, ":%.2f, ", computed)
 149		m.Debug += b.String()
 150	}
 151	m.Score += computed
 152}
 153
 154// ChunkMatch is a set of non-overlapping matches within a contiguous range of
 155// lines in the file.
 156type ChunkMatch struct {
 157	DebugScore string
 158
 159	// Content is a contiguous range of complete lines that fully contains Ranges.
 160	// Lines will always include their terminating newline (if it exists).
 161	Content []byte
 162
 163	// Ranges is a set of matching ranges within this chunk. Each range is relative
 164	// to the beginning of the file (not the beginning of Content).
 165	Ranges []Range
 166
 167	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
 168	// its length will equal that of Ranges. Any of its elements may be nil.
 169	SymbolInfo []*Symbol
 170
 171	// FileName indicates whether this match is a match on the file name, in
 172	// which case Content will contain the file name.
 173	FileName bool
 174
 175	// ContentStart is the location (inclusive) of the beginning of content
 176	// relative to the beginning of the file. It will always be at the
 177	// beginning of a line (Column will always be 1).
 178	ContentStart Location
 179
 180	// Score is the overall relevance score of this chunk.
 181	Score float64
 182
 183	// BestLineMatch is the line number of the highest-scoring line match in this chunk.
 184	// The line number represents the index in the full file, and is 1-based. If FileName: true,
 185	// this number will be 0.
 186	BestLineMatch uint32
 187}
 188
 189func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 190	// Content
 191	sz += sliceHeaderBytes + uint64(len(cm.Content))
 192
 193	// ContentStart
 194	sz += cm.ContentStart.sizeBytes()
 195
 196	// FileName
 197	sz += 1
 198
 199	// Ranges
 200	sz += sliceHeaderBytes
 201	if len(cm.Ranges) > 0 {
 202		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 203	}
 204
 205	// SymbolInfo
 206	sz += sliceHeaderBytes
 207	for _, si := range cm.SymbolInfo {
 208		sz += pointerSize
 209		if si != nil {
 210			sz += si.sizeBytes()
 211		}
 212	}
 213
 214	// Score
 215	sz += 8
 216
 217	// DebugScore
 218	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 219
 220	return
 221}
 222
 223type Range struct {
 224	// The inclusive beginning of the range.
 225	Start Location
 226	// The exclusive end of the range.
 227	End Location
 228}
 229
 230func (r *Range) sizeBytes() uint64 {
 231	return r.Start.sizeBytes() + r.End.sizeBytes()
 232}
 233
 234type Location struct {
 235	// 0-based byte offset from the beginning of the file
 236	ByteOffset uint32
 237	// 1-based line number from the beginning of the file
 238	LineNumber uint32
 239	// 1-based column number (in runes) from the beginning of line
 240	Column uint32
 241}
 242
 243func (l *Location) sizeBytes() uint64 {
 244	return 3 * 4
 245}
 246
 247// LineMatch holds the matches within a single line in a file.
 248type LineMatch struct {
 249	// The line in which a match was found.
 250	Line []byte
 251	// The byte offset of the first byte of the line.
 252	LineStart int
 253	// The byte offset of the first byte past the end of the line.
 254	// This is usually the byte after the terminating newline, but can also be
 255	// the end of the file if there is no terminating newline
 256	LineEnd    int
 257	LineNumber int
 258
 259	// Before and After are only set when SearchOptions.NumContextLines is > 0
 260	Before []byte
 261	After  []byte
 262
 263	// If set, this was a match on the filename.
 264	FileName bool
 265
 266	// The higher the better. Only ranks the quality of the match
 267	// within the file, does not take rank of file into account
 268	Score      float64
 269	DebugScore string
 270
 271	LineFragments []LineFragmentMatch
 272}
 273
 274func (lm *LineMatch) sizeBytes() (sz uint64) {
 275	// Line
 276	sz += sliceHeaderBytes + uint64(len(lm.Line))
 277
 278	// LineStart, LineEnd, LineNumber
 279	sz += 3 * 8
 280
 281	// Before
 282	sz += sliceHeaderBytes + uint64(len(lm.Before))
 283
 284	// After
 285	sz += sliceHeaderBytes + uint64(len(lm.After))
 286
 287	// FileName
 288	sz += 1
 289
 290	// Score
 291	sz += 8
 292
 293	// DebugScore
 294	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 295
 296	// LineFragments
 297	sz += sliceHeaderBytes
 298	for _, lf := range lm.LineFragments {
 299		sz += lf.sizeBytes()
 300	}
 301
 302	return
 303}
 304
 305type Symbol struct {
 306	Sym        string
 307	Kind       string
 308	Parent     string
 309	ParentKind string
 310}
 311
 312func (s *Symbol) sizeBytes() uint64 {
 313	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 314}
 315
 316// LineFragmentMatch a segment of matching text within a line.
 317type LineFragmentMatch struct {
 318	// Offset within the line, in bytes.
 319	LineOffset int
 320
 321	// Offset from file start, in bytes.
 322	Offset uint32
 323
 324	// Number bytes that match.
 325	MatchLength int
 326
 327	SymbolInfo *Symbol
 328}
 329
 330func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 331	// LineOffset
 332	sz += 8
 333
 334	// Offset
 335	sz += 4
 336
 337	// MatchLength
 338	sz += 8
 339
 340	// SymbolInfo
 341	sz += pointerSize
 342	if lfm.SymbolInfo != nil {
 343		sz += lfm.SymbolInfo.sizeBytes()
 344	}
 345
 346	return
 347}
 348
 349type FlushReason uint8
 350
 351const (
 352	FlushReasonTimerExpired FlushReason = 1 << iota
 353	FlushReasonFinalFlush
 354	FlushReasonMaxSize
 355)
 356
 357var FlushReasonStrings = map[FlushReason]string{
 358	FlushReasonTimerExpired: "timer_expired",
 359	FlushReasonFinalFlush:   "final_flush",
 360	FlushReasonMaxSize:      "max_size_reached",
 361}
 362
 363func (fr FlushReason) String() string {
 364	if v, ok := FlushReasonStrings[fr]; ok {
 365		return v
 366	}
 367
 368	return "none"
 369}
 370
 371// Stats contains interesting numbers on the search
 372type Stats struct {
 373	// Amount of I/O for reading contents.
 374	ContentBytesLoaded int64
 375
 376	// Amount of I/O for reading from index.
 377	IndexBytesLoaded int64
 378
 379	// Number of search shards that had a crash.
 380	Crashes int
 381
 382	// Wall clock time for this search
 383	Duration time.Duration
 384
 385	// Number of files containing a match.
 386	FileCount int
 387
 388	// Number of files in shards that we considered.
 389	ShardFilesConsidered int
 390
 391	// Files that we evaluated. Equivalent to files for which all
 392	// atom matches (including negations) evaluated to true.
 393	FilesConsidered int
 394
 395	// Files for which we loaded file content to verify substring matches
 396	FilesLoaded int
 397
 398	// Candidate files whose contents weren't examined because we
 399	// gathered enough matches.
 400	FilesSkipped int
 401
 402	// Shards that we scanned to find matches.
 403	ShardsScanned int
 404
 405	// Shards that we did not process because a query was canceled.
 406	ShardsSkipped int
 407
 408	// Shards that we did not process because the query was rejected by the
 409	// ngram filter indicating it had no matches.
 410	ShardsSkippedFilter int
 411
 412	// Number of non-overlapping matches
 413	MatchCount int
 414
 415	// Number of candidate matches as a result of searching ngrams.
 416	NgramMatches int
 417
 418	// NgramLookups is the number of times we accessed an ngram in the index.
 419	NgramLookups int
 420
 421	// Wall clock time for queued search.
 422	Wait time.Duration
 423
 424	// Aggregate wall clock time spent constructing and pruning the match tree.
 425	// This accounts for time such as lookups in the trigram index.
 426	MatchTreeConstruction time.Duration
 427
 428	// Aggregate wall clock time spent searching the match tree. This accounts
 429	// for the bulk of search work done looking for matches.
 430	MatchTreeSearch time.Duration
 431
 432	// Number of times regexp was called on files that we evaluated.
 433	RegexpsConsidered int
 434
 435	// FlushReason explains why results were flushed.
 436	FlushReason FlushReason
 437}
 438
 439func (s *Stats) sizeBytes() (sz uint64) {
 440	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 441	sz += 1     // FlushReason
 442
 443	return
 444}
 445
 446func (s *Stats) Add(o Stats) {
 447	s.ContentBytesLoaded += o.ContentBytesLoaded
 448	s.IndexBytesLoaded += o.IndexBytesLoaded
 449	s.Crashes += o.Crashes
 450	s.FileCount += o.FileCount
 451	s.FilesConsidered += o.FilesConsidered
 452	s.FilesLoaded += o.FilesLoaded
 453	s.FilesSkipped += o.FilesSkipped
 454	s.MatchCount += o.MatchCount
 455	s.NgramMatches += o.NgramMatches
 456	s.NgramLookups += o.NgramLookups
 457	s.ShardFilesConsidered += o.ShardFilesConsidered
 458	s.ShardsScanned += o.ShardsScanned
 459	s.ShardsSkipped += o.ShardsSkipped
 460	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 461	s.Wait += o.Wait
 462	s.MatchTreeConstruction += o.MatchTreeConstruction
 463	s.MatchTreeSearch += o.MatchTreeSearch
 464	s.RegexpsConsidered += o.RegexpsConsidered
 465
 466	// We want the first non-zero FlushReason to be sticky. This is a useful
 467	// property when aggregating stats from several Zoekts.
 468	if s.FlushReason == 0 {
 469		s.FlushReason = o.FlushReason
 470	}
 471}
 472
 473// Zero returns true if stats is empty.
 474func (s *Stats) Zero() bool {
 475	if s == nil {
 476		return true
 477	}
 478
 479	return !(s.ContentBytesLoaded > 0 ||
 480		s.IndexBytesLoaded > 0 ||
 481		s.Crashes > 0 ||
 482		s.FileCount > 0 ||
 483		s.FilesConsidered > 0 ||
 484		s.FilesLoaded > 0 ||
 485		s.FilesSkipped > 0 ||
 486		s.MatchCount > 0 ||
 487		s.NgramMatches > 0 ||
 488		s.NgramLookups > 0 ||
 489		s.ShardFilesConsidered > 0 ||
 490		s.ShardsScanned > 0 ||
 491		s.ShardsSkipped > 0 ||
 492		s.ShardsSkippedFilter > 0 ||
 493		s.Wait > 0 ||
 494		s.MatchTreeConstruction > 0 ||
 495		s.MatchTreeSearch > 0 ||
 496		s.RegexpsConsidered > 0)
 497}
 498
 499// Progress contains information about the global progress of the running search query.
 500// This is used by the frontend to reorder results and emit them when stable.
 501// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 502type Progress struct {
 503	// Priority of the shard that was searched.
 504	Priority float64
 505
 506	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 507	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 508	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 509	//
 510	// MaxPendingPriority decreases monotonically in each SearchResult.
 511	MaxPendingPriority float64
 512}
 513
 514func (p *Progress) sizeBytes() uint64 {
 515	return 2 * 8
 516}
 517
 518// SearchResult contains search matches and extra data
 519type SearchResult struct {
 520	Stats
 521
 522	// Do not encode this as we cannot encode -Inf in JSON
 523	Progress `json:"-"`
 524
 525	Files []FileMatch
 526
 527	// RepoURLs holds a repo => template string map.
 528	RepoURLs map[string]string
 529
 530	// FragmentNames holds a repo => template string map, for
 531	// the line number fragment.
 532	LineFragments map[string]string
 533}
 534
 535// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 536// The estimate does not take alignment into account. The result is a lower
 537// bound on the actual size in memory.
 538func (sr *SearchResult) SizeBytes() (sz uint64) {
 539	sz += sr.Stats.sizeBytes()
 540	sz += sr.Progress.sizeBytes()
 541
 542	// Files
 543	sz += sliceHeaderBytes
 544	for _, f := range sr.Files {
 545		sz += f.sizeBytes()
 546	}
 547
 548	// RepoURLs
 549	sz += mapHeaderBytes
 550	for k, v := range sr.RepoURLs {
 551		sz += stringHeaderBytes + uint64(len(k))
 552		sz += stringHeaderBytes + uint64(len(v))
 553	}
 554
 555	// LineFragments
 556	sz += mapHeaderBytes
 557	for k, v := range sr.LineFragments {
 558		sz += stringHeaderBytes + uint64(len(k))
 559		sz += stringHeaderBytes + uint64(len(v))
 560	}
 561
 562	return
 563}
 564
 565// RepositoryBranch describes an indexed branch, which is a name
 566// combined with a version.
 567type RepositoryBranch struct {
 568	Name    string
 569	Version string
 570}
 571
 572func (r RepositoryBranch) String() string {
 573	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 574}
 575
 576// Repository holds repository metadata.
 577type Repository struct {
 578	// Sourcegraph's tenant ID
 579	TenantID int
 580
 581	// Sourcegraph's repository ID
 582	ID uint32
 583
 584	// The repository name
 585	Name string
 586
 587	// The repository URL.
 588	URL string
 589
 590	// The physical source where this repo came from, eg. full
 591	// path to the zip filename or git repository directory. This
 592	// will not be exposed in the UI, but can be used to detect
 593	// orphaned index shards.
 594	Source string
 595
 596	// The branches indexed in this repo.
 597	Branches []RepositoryBranch
 598
 599	// Nil if this is not the super project.
 600	SubRepoMap map[string]*Repository
 601
 602	// URL template to link to the commit of a branch
 603	CommitURLTemplate string
 604
 605	// The repository URL for getting to a file.  Has access to
 606	// {{.Version}}, {{.Path}}
 607	FileURLTemplate string
 608
 609	// The URL fragment to add to a file URL for line numbers. has
 610	// access to {{.LineNumber}}. The fragment should include the
 611	// separator, generally '#' or ';'.
 612	LineFragmentTemplate string
 613
 614	// Perf optimization: priority is set when we load the shard. It corresponds to
 615	// the value of "priority" stored in RawConfig.
 616	priority float64
 617
 618	// All zoekt.* configuration settings.
 619	RawConfig map[string]string
 620
 621	// Importance of the repository, bigger is more important
 622	Rank uint16
 623
 624	// IndexOptions is a hash of the options used to create the index for the
 625	// repo.
 626	IndexOptions string
 627
 628	// HasSymbols is true if this repository has indexed ctags
 629	// output. Sourcegraph specific: This field is more appropriate for
 630	// IndexMetadata. However, we store it here since the Sourcegraph frontend
 631	// can read this structure but not IndexMetadata.
 632	HasSymbols bool
 633
 634	// Tombstone is true if we are not allowed to search this repo.
 635	Tombstone bool
 636
 637	// LatestCommitDate is the date of the latest commit among all indexed Branches.
 638	// The date might be time.Time's 0-value if the repository was last indexed
 639	// before this field was added.
 640	LatestCommitDate time.Time
 641
 642	// FileTombstones is a set of file paths that should be ignored across all branches
 643	// in this shard.
 644	FileTombstones map[string]struct{} `json:",omitempty"`
 645}
 646
 647func (r *Repository) UnmarshalJSON(data []byte) error {
 648	// We define a new type so that we can use json.Unmarshal
 649	// without recursing into this same method.
 650	type repository *Repository
 651	repo := repository(r)
 652
 653	err := json.Unmarshal(data, repo)
 654	if err != nil {
 655		return err
 656	}
 657
 658	if v, ok := repo.RawConfig["repoid"]; ok {
 659		id, _ := strconv.ParseUint(v, 10, 32)
 660		r.ID = uint32(id)
 661	}
 662
 663	if v, ok := repo.RawConfig["tenantID"]; ok {
 664		id, _ := strconv.ParseInt(v, 10, 64)
 665		r.TenantID = int(id)
 666	}
 667
 668	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
 669	// on read instead of during indexing allows us to avoid a complete reindex.
 670	//
 671	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
 672	// backwards compatibility.
 673	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
 674		// We use the number of months since 1970 as a simple measure of repo freshness.
 675		// It is monotonically increasing and stable across re-indexes and restarts.
 676		r.Rank = monthsSince1970(repo.LatestCommitDate)
 677	} else if v, ok := repo.RawConfig["priority"]; ok {
 678		r.priority, err = strconv.ParseFloat(v, 64)
 679		if err != nil {
 680			r.priority = 0
 681		}
 682
 683		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 684		// based on priority. Setting it on read instead of during indexing
 685		// allows us to avoid a complete reindex.
 686		if r.Rank == 0 && r.priority > 0 {
 687			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
 688			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
 689			// returns from more stars.
 690			r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16)
 691		}
 692	}
 693
 694	return nil
 695}
 696
 697func (r *Repository) GetPriority() float64 {
 698	return r.priority
 699}
 700
 701// monthsSince1970 returns the number of months since 1970. It returns values in
 702// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 703// lower bound for all dates before 1970.
 704func monthsSince1970(t time.Time) uint16 {
 705	base := time.Unix(0, 0)
 706	if t.Before(base) {
 707		return 0
 708	}
 709	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 710	return uint16(min(months, math.MaxUint16))
 711}
 712
 713// MergeMutable will merge x into r. mutated will be true if it made any
 714// changes. err is non-nil if we needed to mutate an immutable field.
 715//
 716// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 717// computed while indexing so can't be synthesized from x.
 718//
 719// Note: We ignore RawConfig fields which are duplicated into Repository:
 720// name and id.
 721func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 722	if r.ID != x.ID {
 723		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 724		return mutated, errors.New("ID is immutable")
 725	}
 726	if r.Name != x.Name {
 727		// Name is encoded into the shard name on disk. We need to re-index if it
 728		// changes.
 729		return mutated, errors.New("Name is immutable")
 730	}
 731	if !reflect.DeepEqual(r.Branches, x.Branches) {
 732		// Need a reindex if content changing.
 733		return mutated, errors.New("Branches is immutable")
 734	}
 735
 736	for k, v := range x.RawConfig {
 737		// We ignore name and id since they are encoded into the repository.
 738		if k == "name" || k == "id" {
 739			continue
 740		}
 741		if r.RawConfig == nil {
 742			mutated = true
 743			r.RawConfig = make(map[string]string)
 744		}
 745		if r.RawConfig[k] != v {
 746			mutated = true
 747			r.RawConfig[k] = v
 748		}
 749	}
 750
 751	if r.URL != x.URL {
 752		mutated = true
 753		r.URL = x.URL
 754	}
 755	if r.CommitURLTemplate != x.CommitURLTemplate {
 756		mutated = true
 757		r.CommitURLTemplate = x.CommitURLTemplate
 758	}
 759	if r.FileURLTemplate != x.FileURLTemplate {
 760		mutated = true
 761		r.FileURLTemplate = x.FileURLTemplate
 762	}
 763	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 764		mutated = true
 765		r.LineFragmentTemplate = x.LineFragmentTemplate
 766	}
 767
 768	return mutated, nil
 769}
 770
 771// IndexMetadata holds metadata stored in the index file. It contains
 772// data generated by the core indexing library.
 773type IndexMetadata struct {
 774	IndexFormatVersion    int
 775	IndexFeatureVersion   int
 776	IndexMinReaderVersion int
 777	IndexTime             time.Time
 778	PlainASCII            bool
 779	LanguageMap           map[string]uint16
 780	ZoektVersion          string
 781	ID                    string
 782}
 783
 784// Statistics of a (collection of) repositories.
 785type RepoStats struct {
 786	// Repos is used for aggregrating the number of repositories.
 787	//
 788	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 789	// only for RepoList.Stats (aggregate).
 790	Repos int
 791
 792	// Shards is the total number of search shards.
 793	Shards int
 794
 795	// Documents holds the number of documents or files.
 796	Documents int
 797
 798	// IndexBytes is the amount of RAM used for index overhead.
 799	IndexBytes int64
 800
 801	// ContentBytes is the amount of RAM used for raw content.
 802	ContentBytes int64
 803
 804	// Sourcegraph specific stats below. These are not as efficient to calculate
 805	// as the above statistics. We experimentally measured about a 10% slower
 806	// shard load time. However, we find these values very useful to track and
 807	// computing them outside of load time introduces a lot of complexity.
 808
 809	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 810	// indexed documents. This is not exactly the same as line count, since it
 811	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 812	// a final line without "\n"). Note: Zoekt deduplicates documents across
 813	// branches, so if a path has the same contents on multiple branches, there
 814	// is only one document for it. As such that document's newlines is only
 815	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 816	// for counts which do not deduplicate.
 817	NewLinesCount uint64
 818
 819	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 820	// branch.
 821	DefaultBranchNewLinesCount uint64
 822
 823	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 824	// except the default branch.
 825	OtherBranchesNewLinesCount uint64
 826}
 827
 828func (s *RepoStats) Add(o *RepoStats) {
 829	// can't update Repos, since one repo may have multiple
 830	// shards.
 831	s.Shards += o.Shards
 832	s.IndexBytes += o.IndexBytes
 833	s.Documents += o.Documents
 834	s.ContentBytes += o.ContentBytes
 835
 836	// Sourcegraph specific
 837	s.NewLinesCount += o.NewLinesCount
 838	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 839	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 840}
 841
 842type RepoListEntry struct {
 843	Repository    Repository
 844	IndexMetadata IndexMetadata
 845	Stats         RepoStats
 846}
 847
 848// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
 849// performance profiling of sourcegraph.com revealed that querying this
 850// information from Zoekt was causing lots of CPU and memory usage. Note: we
 851// can revisit this, how we store and query this information has changed a lot
 852// since this was introduced.
 853type MinimalRepoListEntry struct {
 854	// HasSymbols is exported since Sourcegraph uses this information at search
 855	// planning time to decide between Zoekt and an unindexed symbol search.
 856	//
 857	// Note: it pretty much is always true in practice.
 858	HasSymbols bool
 859
 860	// Branches is used by Sourcegraphs query planner to decided if it can use
 861	// zoekt or go via an unindexed code path.
 862	Branches []RepositoryBranch
 863
 864	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
 865	// since the epoch). This is to make it clear we are not transporting the
 866	// full fidelty timestamp (ie with milliseconds and location). Additionally
 867	// it saves 16 bytes in this struct.
 868	//
 869	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
 870	// how many repositories need updating after a ranking change/etc.
 871	//
 872	// TODO(keegancsmith) audit updates to IndexTime and document how and when
 873	// it changes. Concerned about things like metadata updates or compound
 874	// shards leading to untrustworthy data here.
 875	IndexTimeUnix int64
 876}
 877
 878type ReposMap map[uint32]MinimalRepoListEntry
 879
 880// MarshalBinary implements a specialized encoder for ReposMap.
 881func (q *ReposMap) MarshalBinary() ([]byte, error) {
 882	return reposMapEncode(*q)
 883}
 884
 885// UnmarshalBinary implements a specialized decoder for ReposMap.
 886func (q *ReposMap) UnmarshalBinary(b []byte) error {
 887	var err error
 888	(*q), err = reposMapDecode(b)
 889	return err
 890}
 891
 892// RepoList holds a set of Repository metadata.
 893type RepoList struct {
 894	// Returned when ListOptions.Field is RepoListFieldRepos.
 895	Repos []*RepoListEntry
 896
 897	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
 898	ReposMap ReposMap
 899
 900	Crashes int
 901
 902	// Stats response to a List request.
 903	// This is the aggregate RepoStats of all repos matching the input query.
 904	Stats RepoStats
 905}
 906
 907type Searcher interface {
 908	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
 909
 910	// List lists repositories. The query `q` can only contain
 911	// query.Repo atoms.
 912	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
 913	Close()
 914
 915	// Describe the searcher for debug messages.
 916	String() string
 917}
 918
 919type RepoListField int
 920
 921const (
 922	RepoListFieldRepos    RepoListField = 0
 923	RepoListFieldReposMap               = 2
 924)
 925
 926type ListOptions struct {
 927	// Field decides which field to populate in RepoList response.
 928	Field RepoListField
 929}
 930
 931func (o *ListOptions) GetField() (RepoListField, error) {
 932	if o == nil {
 933		return RepoListFieldRepos, nil
 934	}
 935	switch o.Field {
 936	case RepoListFieldRepos, RepoListFieldReposMap:
 937		return o.Field, nil
 938	case 1:
 939		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 940	default:
 941		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 942	}
 943}
 944
 945func (o *ListOptions) String() string {
 946	return fmt.Sprintf("%#v", o)
 947}
 948
 949type SearchOptions struct {
 950	// Return an upper-bound estimate of eligible documents in
 951	// stats.ShardFilesConsidered.
 952	EstimateDocCount bool
 953
 954	// Return the whole file.
 955	Whole bool
 956
 957	// Maximum number of matches: skip all processing an index
 958	// shard after we found this many non-overlapping matches.
 959	ShardMaxMatchCount int
 960
 961	// Maximum number of matches: stop looking for more matches
 962	// once we have this many matches across shards.
 963	TotalMaxMatchCount int
 964
 965	// Maximum number of matches: skip processing documents for a repository in
 966	// a shard once we have found ShardRepoMaxMatchCount.
 967	//
 968	// A compound shard may contain multiple repositories. This will most often
 969	// be set to 1 to find all repositories containing a result.
 970	ShardRepoMaxMatchCount int
 971
 972	// Abort the search after this much time has passed.
 973	MaxWallTime time.Duration
 974
 975	// FlushWallTime if non-zero will stop streaming behaviour at first and
 976	// instead will collate and sort results. At FlushWallTime the results will
 977	// be sent and then the behaviour will revert to the normal streaming.
 978	FlushWallTime time.Duration
 979
 980	// Truncates the number of documents (i.e. files) after collating and
 981	// sorting the results.
 982	MaxDocDisplayCount int
 983
 984	// Truncates the number of matchs after collating and sorting the results.
 985	MaxMatchDisplayCount int
 986
 987	// If set to a number greater than zero then up to this many number
 988	// of context lines will be added before and after each matched line.
 989	// Note that the included context lines might contain matches and
 990	// it's up to the consumer of the result to remove those lines.
 991	NumContextLines int
 992
 993	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 994	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 995	ChunkMatches bool
 996
 997	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
 998	// scoring formula. The scoring algorithm treats each match in a file as a term
 999	// and computes an approximation to BM25. When enabled, BM25 scoring is used for
1000	// the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores.
1001	//
1002	// The calculation of IDF assumes that Zoekt visits all documents containing any
1003	// of the query terms during evaluation. This is true, for example, if all query
1004	// terms are ORed together.
1005	//
1006	// When enabled, all other scoring signals are ignored, including document ranks.
1007	UseBM25Scoring bool
1008
1009	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
1010	// a command-line flag
1011	Trace bool
1012
1013	// If set, the search results will contain debug information for scoring.
1014	DebugScore bool
1015
1016	// SpanContext is the opentracing span context, if it exists, from the zoekt client
1017	SpanContext map[string]string
1018}
1019
1020func (o *SearchOptions) SetDefaults() {
1021	if o.ShardMaxMatchCount == 0 {
1022		// We cap the total number of matches, so overly broad
1023		// searches don't crash the machine.
1024		o.ShardMaxMatchCount = 100000
1025	}
1026	if o.TotalMaxMatchCount == 0 {
1027		o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount
1028	}
1029}
1030
1031// String returns a succinct representation of the options. This is meant for
1032// human consumption in logs and traces.
1033//
1034// Note: some tracing systems have limits on length of values, so we take care
1035// to try and make this small, and include the important information near the
1036// front incase of truncation.
1037func (s *SearchOptions) String() string {
1038	var b strings.Builder
1039
1040	add := func(name, value string) {
1041		b.WriteString(name)
1042		b.WriteByte('=')
1043		b.WriteString(value)
1044		b.WriteByte(' ')
1045	}
1046	addInt := func(name string, value int) {
1047		if value != 0 {
1048			add(name, strconv.Itoa(value))
1049		}
1050	}
1051	addDuration := func(name string, value time.Duration) {
1052		if value != 0 {
1053			add(name, value.String())
1054		}
1055	}
1056	addBool := func(name string, value bool) {
1057		if !value {
1058			return
1059		}
1060		b.WriteString(name)
1061		b.WriteByte(' ')
1062	}
1063
1064	b.WriteString("zoekt.SearchOptions{ ")
1065
1066	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1067	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1068	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1069	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1070	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1071	addInt("NumContextLines", s.NumContextLines)
1072
1073	addDuration("MaxWallTime", s.MaxWallTime)
1074	addDuration("FlushWallTime", s.FlushWallTime)
1075
1076	addBool("EstimateDocCount", s.EstimateDocCount)
1077	addBool("Whole", s.Whole)
1078	addBool("ChunkMatches", s.ChunkMatches)
1079	addBool("UseBM25Scoring", s.UseBM25Scoring)
1080	addBool("Trace", s.Trace)
1081	addBool("DebugScore", s.DebugScore)
1082
1083	for k, v := range s.SpanContext {
1084		add("SpanContext."+k, strconv.Quote(v))
1085	}
1086
1087	b.WriteByte('}')
1088	return b.String()
1089}
1090
1091// Sender is the interface that wraps the basic Send method.
1092type Sender interface {
1093	Send(*SearchResult)
1094}
1095
1096// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
1097// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
1098// that calls f.
1099type SenderFunc func(result *SearchResult)
1100
1101func (f SenderFunc) Send(result *SearchResult) {
1102	f(result)
1103}
1104
1105// Streamer adds the method StreamSearch to the Searcher interface.
1106type Streamer interface {
1107	Searcher
1108	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1109}
Configure Feed

Configure Feed