api.go at b4a5b7cee93f51cd1c9392da69ac5ae2f61cb05d · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at b4a5b7cee93f51cd1c9392da69ac5ae2f61cb05d 30 kB View raw
Julie Tibshirani Add ChunkMatch.BestLineMatch to return the best-scoring line (#884) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
  30const (
  31	mapHeaderBytes    uint64 = 48
  32	sliceHeaderBytes  uint64 = 24
  33	stringHeaderBytes uint64 = 16
  34	pointerSize       uint64 = 8
  35	interfaceBytes    uint64 = 16
  36)
  37
  38// FileMatch contains all the matches within a file.
  39type FileMatch struct {
  40	FileName string
  41
  42	// Repository is the globally unique name of the repo of the
  43	// match
  44	Repository string
  45
  46	// SubRepositoryName is the globally unique name of the repo,
  47	// if it came from a subrepository
  48	SubRepositoryName string `json:",omitempty"`
  49
  50	// SubRepositoryPath holds the prefix where the subrepository
  51	// was mounted.
  52	SubRepositoryPath string `json:",omitempty"`
  53
  54	// Commit SHA1 (hex) of the (sub)repo holding the file.
  55	Version string `json:",omitempty"`
  56
  57	// Detected language of the result.
  58	Language string
  59
  60	// For debugging. Needs DebugScore set, but public so tests in
  61	// other packages can print some diagnostics.
  62	Debug string `json:",omitempty"`
  63
  64	Branches []string `json:",omitempty"`
  65
  66	// One of LineMatches or ChunkMatches will be returned depending on whether
  67	// the SearchOptions.ChunkMatches is set.
  68	LineMatches  []LineMatch  `json:",omitempty"`
  69	ChunkMatches []ChunkMatch `json:",omitempty"`
  70
  71	// Only set if requested
  72	Content []byte `json:",omitempty"`
  73
  74	// Checksum of the content.
  75	Checksum []byte
  76
  77	// Ranking; the higher, the better.
  78	Score float64 `json:",omitempty"`
  79
  80	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
  81	// order results from different repositories relative to each other.
  82	RepositoryPriority float64 `json:",omitempty"`
  83
  84	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
  85	// Sourcegraph.
  86	RepositoryID uint32 `json:",omitempty"`
  87}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of
 139// lines in the file.
 140type ChunkMatch struct {
 141	DebugScore string
 142
 143	// Content is a contiguous range of complete lines that fully contains Ranges.
 144	// Lines will always include their terminating newline (if it exists).
 145	Content []byte
 146
 147	// Ranges is a set of matching ranges within this chunk. Each range is relative
 148	// to the beginning of the file (not the beginning of Content).
 149	Ranges []Range
 150
 151	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
 152	// its length will equal that of Ranges. Any of its elements may be nil.
 153	SymbolInfo []*Symbol
 154
 155	// FileName indicates whether this match is a match on the file name, in
 156	// which case Content will contain the file name.
 157	FileName bool
 158
 159	// ContentStart is the location (inclusive) of the beginning of content
 160	// relative to the beginning of the file. It will always be at the
 161	// beginning of a line (Column will always be 1).
 162	ContentStart Location
 163
 164	// Score is the overall relevance score of this chunk.
 165	Score float64
 166
 167	// BestLineMatch is the line number of the highest-scoring line match in this chunk.
 168	// The line number represents the index in the full file, and is 1-based. If FileName: true,
 169	// this number will be 0.
 170	BestLineMatch uint32
 171}
 172
 173func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 174	// Content
 175	sz += sliceHeaderBytes + uint64(len(cm.Content))
 176
 177	// ContentStart
 178	sz += cm.ContentStart.sizeBytes()
 179
 180	// FileName
 181	sz += 1
 182
 183	// Ranges
 184	sz += sliceHeaderBytes
 185	if len(cm.Ranges) > 0 {
 186		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 187	}
 188
 189	// SymbolInfo
 190	sz += sliceHeaderBytes
 191	for _, si := range cm.SymbolInfo {
 192		sz += pointerSize
 193		if si != nil {
 194			sz += si.sizeBytes()
 195		}
 196	}
 197
 198	// Score
 199	sz += 8
 200
 201	// DebugScore
 202	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 203
 204	return
 205}
 206
 207type Range struct {
 208	// The inclusive beginning of the range.
 209	Start Location
 210	// The exclusive end of the range.
 211	End Location
 212}
 213
 214func (r *Range) sizeBytes() uint64 {
 215	return r.Start.sizeBytes() + r.End.sizeBytes()
 216}
 217
 218type Location struct {
 219	// 0-based byte offset from the beginning of the file
 220	ByteOffset uint32
 221	// 1-based line number from the beginning of the file
 222	LineNumber uint32
 223	// 1-based column number (in runes) from the beginning of line
 224	Column uint32
 225}
 226
 227func (l *Location) sizeBytes() uint64 {
 228	return 3 * 4
 229}
 230
 231// LineMatch holds the matches within a single line in a file.
 232type LineMatch struct {
 233	// The line in which a match was found.
 234	Line []byte
 235	// The byte offset of the first byte of the line.
 236	LineStart int
 237	// The byte offset of the first byte past the end of the line.
 238	// This is usually the byte after the terminating newline, but can also be
 239	// the end of the file if there is no terminating newline
 240	LineEnd    int
 241	LineNumber int
 242
 243	// Before and After are only set when SearchOptions.NumContextLines is > 0
 244	Before []byte
 245	After  []byte
 246
 247	// If set, this was a match on the filename.
 248	FileName bool
 249
 250	// The higher the better. Only ranks the quality of the match
 251	// within the file, does not take rank of file into account
 252	Score      float64
 253	DebugScore string
 254
 255	LineFragments []LineFragmentMatch
 256}
 257
 258func (lm *LineMatch) sizeBytes() (sz uint64) {
 259	// Line
 260	sz += sliceHeaderBytes + uint64(len(lm.Line))
 261
 262	// LineStart, LineEnd, LineNumber
 263	sz += 3 * 8
 264
 265	// Before
 266	sz += sliceHeaderBytes + uint64(len(lm.Before))
 267
 268	// After
 269	sz += sliceHeaderBytes + uint64(len(lm.After))
 270
 271	// FileName
 272	sz += 1
 273
 274	// Score
 275	sz += 8
 276
 277	// DebugScore
 278	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 279
 280	// LineFragments
 281	sz += sliceHeaderBytes
 282	for _, lf := range lm.LineFragments {
 283		sz += lf.sizeBytes()
 284	}
 285
 286	return
 287}
 288
 289type Symbol struct {
 290	Sym        string
 291	Kind       string
 292	Parent     string
 293	ParentKind string
 294}
 295
 296func (s *Symbol) sizeBytes() uint64 {
 297	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 298}
 299
 300// LineFragmentMatch a segment of matching text within a line.
 301type LineFragmentMatch struct {
 302	// Offset within the line, in bytes.
 303	LineOffset int
 304
 305	// Offset from file start, in bytes.
 306	Offset uint32
 307
 308	// Number bytes that match.
 309	MatchLength int
 310
 311	SymbolInfo *Symbol
 312}
 313
 314func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 315	// LineOffset
 316	sz += 8
 317
 318	// Offset
 319	sz += 4
 320
 321	// MatchLength
 322	sz += 8
 323
 324	// SymbolInfo
 325	sz += pointerSize
 326	if lfm.SymbolInfo != nil {
 327		sz += lfm.SymbolInfo.sizeBytes()
 328	}
 329
 330	return
 331}
 332
 333type FlushReason uint8
 334
 335const (
 336	FlushReasonTimerExpired FlushReason = 1 << iota
 337	FlushReasonFinalFlush
 338	FlushReasonMaxSize
 339)
 340
 341var FlushReasonStrings = map[FlushReason]string{
 342	FlushReasonTimerExpired: "timer_expired",
 343	FlushReasonFinalFlush:   "final_flush",
 344	FlushReasonMaxSize:      "max_size_reached",
 345}
 346
 347func (fr FlushReason) String() string {
 348	if v, ok := FlushReasonStrings[fr]; ok {
 349		return v
 350	}
 351
 352	return "none"
 353}
 354
 355// Stats contains interesting numbers on the search
 356type Stats struct {
 357	// Amount of I/O for reading contents.
 358	ContentBytesLoaded int64
 359
 360	// Amount of I/O for reading from index.
 361	IndexBytesLoaded int64
 362
 363	// Number of search shards that had a crash.
 364	Crashes int
 365
 366	// Wall clock time for this search
 367	Duration time.Duration
 368
 369	// Number of files containing a match.
 370	FileCount int
 371
 372	// Number of files in shards that we considered.
 373	ShardFilesConsidered int
 374
 375	// Files that we evaluated. Equivalent to files for which all
 376	// atom matches (including negations) evaluated to true.
 377	FilesConsidered int
 378
 379	// Files for which we loaded file content to verify substring matches
 380	FilesLoaded int
 381
 382	// Candidate files whose contents weren't examined because we
 383	// gathered enough matches.
 384	FilesSkipped int
 385
 386	// Shards that we scanned to find matches.
 387	ShardsScanned int
 388
 389	// Shards that we did not process because a query was canceled.
 390	ShardsSkipped int
 391
 392	// Shards that we did not process because the query was rejected by the
 393	// ngram filter indicating it had no matches.
 394	ShardsSkippedFilter int
 395
 396	// Number of non-overlapping matches
 397	MatchCount int
 398
 399	// Number of candidate matches as a result of searching ngrams.
 400	NgramMatches int
 401
 402	// NgramLookups is the number of times we accessed an ngram in the index.
 403	NgramLookups int
 404
 405	// Wall clock time for queued search.
 406	Wait time.Duration
 407
 408	// Aggregate wall clock time spent constructing and pruning the match tree.
 409	// This accounts for time such as lookups in the trigram index.
 410	MatchTreeConstruction time.Duration
 411
 412	// Aggregate wall clock time spent searching the match tree. This accounts
 413	// for the bulk of search work done looking for matches.
 414	MatchTreeSearch time.Duration
 415
 416	// Number of times regexp was called on files that we evaluated.
 417	RegexpsConsidered int
 418
 419	// FlushReason explains why results were flushed.
 420	FlushReason FlushReason
 421}
 422
 423func (s *Stats) sizeBytes() (sz uint64) {
 424	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 425	sz += 1     // FlushReason
 426
 427	return
 428}
 429
 430func (s *Stats) Add(o Stats) {
 431	s.ContentBytesLoaded += o.ContentBytesLoaded
 432	s.IndexBytesLoaded += o.IndexBytesLoaded
 433	s.Crashes += o.Crashes
 434	s.FileCount += o.FileCount
 435	s.FilesConsidered += o.FilesConsidered
 436	s.FilesLoaded += o.FilesLoaded
 437	s.FilesSkipped += o.FilesSkipped
 438	s.MatchCount += o.MatchCount
 439	s.NgramMatches += o.NgramMatches
 440	s.NgramLookups += o.NgramLookups
 441	s.ShardFilesConsidered += o.ShardFilesConsidered
 442	s.ShardsScanned += o.ShardsScanned
 443	s.ShardsSkipped += o.ShardsSkipped
 444	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 445	s.Wait += o.Wait
 446	s.MatchTreeConstruction += o.MatchTreeConstruction
 447	s.MatchTreeSearch += o.MatchTreeSearch
 448	s.RegexpsConsidered += o.RegexpsConsidered
 449
 450	// We want the first non-zero FlushReason to be sticky. This is a useful
 451	// property when aggregating stats from several Zoekts.
 452	if s.FlushReason == 0 {
 453		s.FlushReason = o.FlushReason
 454	}
 455}
 456
 457// Zero returns true if stats is empty.
 458func (s *Stats) Zero() bool {
 459	if s == nil {
 460		return true
 461	}
 462
 463	return !(s.ContentBytesLoaded > 0 ||
 464		s.IndexBytesLoaded > 0 ||
 465		s.Crashes > 0 ||
 466		s.FileCount > 0 ||
 467		s.FilesConsidered > 0 ||
 468		s.FilesLoaded > 0 ||
 469		s.FilesSkipped > 0 ||
 470		s.MatchCount > 0 ||
 471		s.NgramMatches > 0 ||
 472		s.NgramLookups > 0 ||
 473		s.ShardFilesConsidered > 0 ||
 474		s.ShardsScanned > 0 ||
 475		s.ShardsSkipped > 0 ||
 476		s.ShardsSkippedFilter > 0 ||
 477		s.Wait > 0 ||
 478		s.MatchTreeConstruction > 0 ||
 479		s.MatchTreeSearch > 0 ||
 480		s.RegexpsConsidered > 0)
 481}
 482
 483// Progress contains information about the global progress of the running search query.
 484// This is used by the frontend to reorder results and emit them when stable.
 485// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 486type Progress struct {
 487	// Priority of the shard that was searched.
 488	Priority float64
 489
 490	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 491	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 492	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 493	//
 494	// MaxPendingPriority decreases monotonically in each SearchResult.
 495	MaxPendingPriority float64
 496}
 497
 498func (p *Progress) sizeBytes() uint64 {
 499	return 2 * 8
 500}
 501
 502// SearchResult contains search matches and extra data
 503type SearchResult struct {
 504	Stats
 505
 506	// Do not encode this as we cannot encode -Inf in JSON
 507	Progress `json:"-"`
 508
 509	Files []FileMatch
 510
 511	// RepoURLs holds a repo => template string map.
 512	RepoURLs map[string]string
 513
 514	// FragmentNames holds a repo => template string map, for
 515	// the line number fragment.
 516	LineFragments map[string]string
 517}
 518
 519// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 520// The estimate does not take alignment into account. The result is a lower
 521// bound on the actual size in memory.
 522func (sr *SearchResult) SizeBytes() (sz uint64) {
 523	sz += sr.Stats.sizeBytes()
 524	sz += sr.Progress.sizeBytes()
 525
 526	// Files
 527	sz += sliceHeaderBytes
 528	for _, f := range sr.Files {
 529		sz += f.sizeBytes()
 530	}
 531
 532	// RepoURLs
 533	sz += mapHeaderBytes
 534	for k, v := range sr.RepoURLs {
 535		sz += stringHeaderBytes + uint64(len(k))
 536		sz += stringHeaderBytes + uint64(len(v))
 537	}
 538
 539	// LineFragments
 540	sz += mapHeaderBytes
 541	for k, v := range sr.LineFragments {
 542		sz += stringHeaderBytes + uint64(len(k))
 543		sz += stringHeaderBytes + uint64(len(v))
 544	}
 545
 546	return
 547}
 548
 549// RepositoryBranch describes an indexed branch, which is a name
 550// combined with a version.
 551type RepositoryBranch struct {
 552	Name    string
 553	Version string
 554}
 555
 556func (r RepositoryBranch) String() string {
 557	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 558}
 559
 560// Repository holds repository metadata.
 561type Repository struct {
 562	// Sourcegraph's tenant ID
 563	TenantID int
 564
 565	// Sourcegraph's repository ID
 566	ID uint32
 567
 568	// The repository name
 569	Name string
 570
 571	// The repository URL.
 572	URL string
 573
 574	// The physical source where this repo came from, eg. full
 575	// path to the zip filename or git repository directory. This
 576	// will not be exposed in the UI, but can be used to detect
 577	// orphaned index shards.
 578	Source string
 579
 580	// The branches indexed in this repo.
 581	Branches []RepositoryBranch
 582
 583	// Nil if this is not the super project.
 584	SubRepoMap map[string]*Repository
 585
 586	// URL template to link to the commit of a branch
 587	CommitURLTemplate string
 588
 589	// The repository URL for getting to a file.  Has access to
 590	// {{.Version}}, {{.Path}}
 591	FileURLTemplate string
 592
 593	// The URL fragment to add to a file URL for line numbers. has
 594	// access to {{.LineNumber}}. The fragment should include the
 595	// separator, generally '#' or ';'.
 596	LineFragmentTemplate string
 597
 598	// Perf optimization: priority is set when we load the shard. It corresponds to
 599	// the value of "priority" stored in RawConfig.
 600	priority float64
 601
 602	// All zoekt.* configuration settings.
 603	RawConfig map[string]string
 604
 605	// Importance of the repository, bigger is more important
 606	Rank uint16
 607
 608	// IndexOptions is a hash of the options used to create the index for the
 609	// repo.
 610	IndexOptions string
 611
 612	// HasSymbols is true if this repository has indexed ctags
 613	// output. Sourcegraph specific: This field is more appropriate for
 614	// IndexMetadata. However, we store it here since the Sourcegraph frontend
 615	// can read this structure but not IndexMetadata.
 616	HasSymbols bool
 617
 618	// Tombstone is true if we are not allowed to search this repo.
 619	Tombstone bool
 620
 621	// LatestCommitDate is the date of the latest commit among all indexed Branches.
 622	// The date might be time.Time's 0-value if the repository was last indexed
 623	// before this field was added.
 624	LatestCommitDate time.Time
 625
 626	// FileTombstones is a set of file paths that should be ignored across all branches
 627	// in this shard.
 628	FileTombstones map[string]struct{} `json:",omitempty"`
 629}
 630
 631func (r *Repository) UnmarshalJSON(data []byte) error {
 632	// We define a new type so that we can use json.Unmarshal
 633	// without recursing into this same method.
 634	type repository *Repository
 635	repo := repository(r)
 636
 637	err := json.Unmarshal(data, repo)
 638	if err != nil {
 639		return err
 640	}
 641
 642	if v, ok := repo.RawConfig["repoid"]; ok {
 643		id, _ := strconv.ParseUint(v, 10, 32)
 644		r.ID = uint32(id)
 645	}
 646
 647	if v, ok := repo.RawConfig["tenantID"]; ok {
 648		id, _ := strconv.ParseInt(v, 10, 64)
 649		r.TenantID = int(id)
 650	}
 651
 652	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
 653	// on read instead of during indexing allows us to avoid a complete reindex.
 654	//
 655	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
 656	// backwards compatibility.
 657	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
 658		// We use the number of months since 1970 as a simple measure of repo freshness.
 659		// It is monotonically increasing and stable across re-indexes and restarts.
 660		r.Rank = monthsSince1970(repo.LatestCommitDate)
 661	} else if v, ok := repo.RawConfig["priority"]; ok {
 662		r.priority, err = strconv.ParseFloat(v, 64)
 663		if err != nil {
 664			r.priority = 0
 665		}
 666
 667		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 668		// based on priority. Setting it on read instead of during indexing
 669		// allows us to avoid a complete reindex.
 670		if r.Rank == 0 && r.priority > 0 {
 671			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
 672			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
 673			// returns from more stars.
 674			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 675		}
 676	}
 677
 678	return nil
 679}
 680
 681// monthsSince1970 returns the number of months since 1970. It returns values in
 682// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 683// lower bound for all dates before 1970.
 684func monthsSince1970(t time.Time) uint16 {
 685	base := time.Unix(0, 0)
 686	if t.Before(base) {
 687		return 0
 688	}
 689	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 690	return uint16(min(months, maxUInt16))
 691}
 692
 693// MergeMutable will merge x into r. mutated will be true if it made any
 694// changes. err is non-nil if we needed to mutate an immutable field.
 695//
 696// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 697// computed while indexing so can't be synthesized from x.
 698//
 699// Note: We ignore RawConfig fields which are duplicated into Repository:
 700// name and id.
 701func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 702	if r.ID != x.ID {
 703		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 704		return mutated, errors.New("ID is immutable")
 705	}
 706	if r.Name != x.Name {
 707		// Name is encoded into the shard name on disk. We need to re-index if it
 708		// changes.
 709		return mutated, errors.New("Name is immutable")
 710	}
 711	if !reflect.DeepEqual(r.Branches, x.Branches) {
 712		// Need a reindex if content changing.
 713		return mutated, errors.New("Branches is immutable")
 714	}
 715
 716	for k, v := range x.RawConfig {
 717		// We ignore name and id since they are encoded into the repository.
 718		if k == "name" || k == "id" {
 719			continue
 720		}
 721		if r.RawConfig == nil {
 722			mutated = true
 723			r.RawConfig = make(map[string]string)
 724		}
 725		if r.RawConfig[k] != v {
 726			mutated = true
 727			r.RawConfig[k] = v
 728		}
 729	}
 730
 731	if r.URL != x.URL {
 732		mutated = true
 733		r.URL = x.URL
 734	}
 735	if r.CommitURLTemplate != x.CommitURLTemplate {
 736		mutated = true
 737		r.CommitURLTemplate = x.CommitURLTemplate
 738	}
 739	if r.FileURLTemplate != x.FileURLTemplate {
 740		mutated = true
 741		r.FileURLTemplate = x.FileURLTemplate
 742	}
 743	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 744		mutated = true
 745		r.LineFragmentTemplate = x.LineFragmentTemplate
 746	}
 747
 748	return mutated, nil
 749}
 750
 751// IndexMetadata holds metadata stored in the index file. It contains
 752// data generated by the core indexing library.
 753type IndexMetadata struct {
 754	IndexFormatVersion    int
 755	IndexFeatureVersion   int
 756	IndexMinReaderVersion int
 757	IndexTime             time.Time
 758	PlainASCII            bool
 759	LanguageMap           map[string]uint16
 760	ZoektVersion          string
 761	ID                    string
 762}
 763
 764// Statistics of a (collection of) repositories.
 765type RepoStats struct {
 766	// Repos is used for aggregrating the number of repositories.
 767	//
 768	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 769	// only for RepoList.Stats (aggregate).
 770	Repos int
 771
 772	// Shards is the total number of search shards.
 773	Shards int
 774
 775	// Documents holds the number of documents or files.
 776	Documents int
 777
 778	// IndexBytes is the amount of RAM used for index overhead.
 779	IndexBytes int64
 780
 781	// ContentBytes is the amount of RAM used for raw content.
 782	ContentBytes int64
 783
 784	// Sourcegraph specific stats below. These are not as efficient to calculate
 785	// as the above statistics. We experimentally measured about a 10% slower
 786	// shard load time. However, we find these values very useful to track and
 787	// computing them outside of load time introduces a lot of complexity.
 788
 789	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 790	// indexed documents. This is not exactly the same as line count, since it
 791	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 792	// a final line without "\n"). Note: Zoekt deduplicates documents across
 793	// branches, so if a path has the same contents on multiple branches, there
 794	// is only one document for it. As such that document's newlines is only
 795	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 796	// for counts which do not deduplicate.
 797	NewLinesCount uint64
 798
 799	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 800	// branch.
 801	DefaultBranchNewLinesCount uint64
 802
 803	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 804	// except the default branch.
 805	OtherBranchesNewLinesCount uint64
 806}
 807
 808func (s *RepoStats) Add(o *RepoStats) {
 809	// can't update Repos, since one repo may have multiple
 810	// shards.
 811	s.Shards += o.Shards
 812	s.IndexBytes += o.IndexBytes
 813	s.Documents += o.Documents
 814	s.ContentBytes += o.ContentBytes
 815
 816	// Sourcegraph specific
 817	s.NewLinesCount += o.NewLinesCount
 818	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 819	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 820}
 821
 822type RepoListEntry struct {
 823	Repository    Repository
 824	IndexMetadata IndexMetadata
 825	Stats         RepoStats
 826}
 827
 828// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
 829// performance profiling of sourcegraph.com revealed that querying this
 830// information from Zoekt was causing lots of CPU and memory usage. Note: we
 831// can revisit this, how we store and query this information has changed a lot
 832// since this was introduced.
 833type MinimalRepoListEntry struct {
 834	// HasSymbols is exported since Sourcegraph uses this information at search
 835	// planning time to decide between Zoekt and an unindexed symbol search.
 836	//
 837	// Note: it pretty much is always true in practice.
 838	HasSymbols bool
 839
 840	// Branches is used by Sourcegraphs query planner to decided if it can use
 841	// zoekt or go via an unindexed code path.
 842	Branches []RepositoryBranch
 843
 844	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
 845	// since the epoch). This is to make it clear we are not transporting the
 846	// full fidelty timestamp (ie with milliseconds and location). Additionally
 847	// it saves 16 bytes in this struct.
 848	//
 849	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
 850	// how many repositories need updating after a ranking change/etc.
 851	//
 852	// TODO(keegancsmith) audit updates to IndexTime and document how and when
 853	// it changes. Concerned about things like metadata updates or compound
 854	// shards leading to untrustworthy data here.
 855	IndexTimeUnix int64
 856}
 857
 858type ReposMap map[uint32]MinimalRepoListEntry
 859
 860// MarshalBinary implements a specialized encoder for ReposMap.
 861func (q *ReposMap) MarshalBinary() ([]byte, error) {
 862	return reposMapEncode(*q)
 863}
 864
 865// UnmarshalBinary implements a specialized decoder for ReposMap.
 866func (q *ReposMap) UnmarshalBinary(b []byte) error {
 867	var err error
 868	(*q), err = reposMapDecode(b)
 869	return err
 870}
 871
 872// RepoList holds a set of Repository metadata.
 873type RepoList struct {
 874	// Returned when ListOptions.Field is RepoListFieldRepos.
 875	Repos []*RepoListEntry
 876
 877	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
 878	ReposMap ReposMap
 879
 880	Crashes int
 881
 882	// Stats response to a List request.
 883	// This is the aggregate RepoStats of all repos matching the input query.
 884	Stats RepoStats
 885}
 886
 887type Searcher interface {
 888	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)
 889
 890	// List lists repositories. The query `q` can only contain
 891	// query.Repo atoms.
 892	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)
 893	Close()
 894
 895	// Describe the searcher for debug messages.
 896	String() string
 897}
 898
 899type RepoListField int
 900
 901const (
 902	RepoListFieldRepos    RepoListField = 0
 903	RepoListFieldReposMap               = 2
 904)
 905
 906type ListOptions struct {
 907	// Field decides which field to populate in RepoList response.
 908	Field RepoListField
 909}
 910
 911func (o *ListOptions) GetField() (RepoListField, error) {
 912	if o == nil {
 913		return RepoListFieldRepos, nil
 914	}
 915	switch o.Field {
 916	case RepoListFieldRepos, RepoListFieldReposMap:
 917		return o.Field, nil
 918	case 1:
 919		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 920	default:
 921		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 922	}
 923}
 924
 925func (o *ListOptions) String() string {
 926	return fmt.Sprintf("%#v", o)
 927}
 928
 929type SearchOptions struct {
 930	// Return an upper-bound estimate of eligible documents in
 931	// stats.ShardFilesConsidered.
 932	EstimateDocCount bool
 933
 934	// Return the whole file.
 935	Whole bool
 936
 937	// Maximum number of matches: skip all processing an index
 938	// shard after we found this many non-overlapping matches.
 939	ShardMaxMatchCount int
 940
 941	// Maximum number of matches: stop looking for more matches
 942	// once we have this many matches across shards.
 943	TotalMaxMatchCount int
 944
 945	// Maximum number of matches: skip processing documents for a repository in
 946	// a shard once we have found ShardRepoMaxMatchCount.
 947	//
 948	// A compound shard may contain multiple repositories. This will most often
 949	// be set to 1 to find all repositories containing a result.
 950	ShardRepoMaxMatchCount int
 951
 952	// Abort the search after this much time has passed.
 953	MaxWallTime time.Duration
 954
 955	// FlushWallTime if non-zero will stop streaming behaviour at first and
 956	// instead will collate and sort results. At FlushWallTime the results will
 957	// be sent and then the behaviour will revert to the normal streaming.
 958	FlushWallTime time.Duration
 959
 960	// Truncates the number of documents (i.e. files) after collating and
 961	// sorting the results.
 962	MaxDocDisplayCount int
 963
 964	// Truncates the number of matchs after collating and sorting the results.
 965	MaxMatchDisplayCount int
 966
 967	// If set to a number greater than zero then up to this many number
 968	// of context lines will be added before and after each matched line.
 969	// Note that the included context lines might contain matches and
 970	// it's up to the consumer of the result to remove those lines.
 971	NumContextLines int
 972
 973	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 974	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 975	ChunkMatches bool
 976
 977	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
 978	// scoring formula. The scoring algorithm treats each match in a file as a term
 979	// and computes an approximation to BM25.
 980	//
 981	// The calculation of IDF assumes that Zoekt visits all documents containing any
 982	// of the query terms during evaluation. This is true, for example, if all query
 983	// terms are ORed together.
 984	//
 985	// When enabled, all other scoring signals are ignored, including document ranks.
 986	UseBM25Scoring bool
 987
 988	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 989	// a command-line flag
 990	Trace bool
 991
 992	// If set, the search results will contain debug information for scoring.
 993	DebugScore bool
 994
 995	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 996	SpanContext map[string]string
 997}
 998
 999// String returns a succinct representation of the options. This is meant for
1000// human consumption in logs and traces.
1001//
1002// Note: some tracing systems have limits on length of values, so we take care
1003// to try and make this small, and include the important information near the
1004// front incase of truncation.
1005func (s *SearchOptions) String() string {
1006	var b strings.Builder
1007
1008	add := func(name, value string) {
1009		b.WriteString(name)
1010		b.WriteByte('=')
1011		b.WriteString(value)
1012		b.WriteByte(' ')
1013	}
1014	addInt := func(name string, value int) {
1015		if value != 0 {
1016			add(name, strconv.Itoa(value))
1017		}
1018	}
1019	addDuration := func(name string, value time.Duration) {
1020		if value != 0 {
1021			add(name, value.String())
1022		}
1023	}
1024	addBool := func(name string, value bool) {
1025		if !value {
1026			return
1027		}
1028		b.WriteString(name)
1029		b.WriteByte(' ')
1030	}
1031
1032	b.WriteString("zoekt.SearchOptions{ ")
1033
1034	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1035	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1036	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1037	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1038	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1039	addInt("NumContextLines", s.NumContextLines)
1040
1041	addDuration("MaxWallTime", s.MaxWallTime)
1042	addDuration("FlushWallTime", s.FlushWallTime)
1043
1044	addBool("EstimateDocCount", s.EstimateDocCount)
1045	addBool("Whole", s.Whole)
1046	addBool("ChunkMatches", s.ChunkMatches)
1047	addBool("UseBM25Scoring", s.UseBM25Scoring)
1048	addBool("Trace", s.Trace)
1049	addBool("DebugScore", s.DebugScore)
1050
1051	for k, v := range s.SpanContext {
1052		add("SpanContext."+k, strconv.Quote(v))
1053	}
1054
1055	b.WriteByte('}')
1056	return b.String()
1057}
1058
1059// Sender is the interface that wraps the basic Send method.
1060type Sender interface {
1061	Send(*SearchResult)
1062}
1063
1064// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
1065// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
1066// that calls f.
1067type SenderFunc func(result *SearchResult)
1068
1069func (f SenderFunc) Send(result *SearchResult) {
1070	f(result)
1071}
1072
1073// Streamer adds the method StreamSearch to the Searcher interface.
1074type Streamer interface {
1075	Searcher
1076	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
1077}
Configure Feed

Configure Feed