api.go at df7a7e7162cf7d7af4d4cdde3701c57950830676 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at df7a7e7162cf7d7af4d4cdde3701c57950830676 29 kB View raw
Julie Tibshirani Rename UseKeywordScoring to mention BM25 (#778) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
// Approximate sizes, in bytes, of Go's built-in header types. The sizeBytes
// estimators in this file add these to the payload lengths they measure.
// NOTE(review): the figures look like 64-bit values (Stats.sizeBytes states
// that assumption explicitly) — confirm before relying on them elsewhere.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches were found in.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches lists branch names for this match.
	// NOTE(review): presumably the branches that contain the file — confirm.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore carries scoring diagnostics.
	// NOTE(review): presumably only filled when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score of this chunk.
	// NOTE(review): presumably higher is better, as for LineMatch.Score — confirm.
	Score float64
}
 166
 167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 168	// Content
 169	sz += sliceHeaderBytes + uint64(len(cm.Content))
 170
 171	// ContentStart
 172	sz += cm.ContentStart.sizeBytes()
 173
 174	// FileName
 175	sz += 1
 176
 177	// Ranges
 178	sz += sliceHeaderBytes
 179	if len(cm.Ranges) > 0 {
 180		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 181	}
 182
 183	// SymbolInfo
 184	sz += sliceHeaderBytes
 185	for _, si := range cm.SymbolInfo {
 186		sz += pointerSize
 187		if si != nil {
 188			sz += si.sizeBytes()
 189		}
 190	}
 191
 192	// Score
 193	sz += 8
 194
 195	// DebugScore
 196	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 197
 198	return
 199}
 200
 201type Range struct {
 202	// The inclusive beginning of the range.
 203	Start Location
 204	// The exclusive end of the range.
 205	End Location
 206}
 207
 208func (r *Range) sizeBytes() uint64 {
 209	return r.Start.sizeBytes() + r.End.sizeBytes()
 210}
 211
 212type Location struct {
 213	// 0-based byte offset from the beginning of the file
 214	ByteOffset uint32
 215	// 1-based line number from the beginning of the file
 216	LineNumber uint32
 217	// 1-based column number (in runes) from the beginning of line
 218	Column uint32
 219}
 220
 221func (l *Location) sizeBytes() uint64 {
 222	return 3 * 4
 223}
 224
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// Line is the line in which a match was found.
	Line []byte
	// LineStart is the byte offset of the first byte of the line.
	LineStart int
	// LineEnd is the byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd    int
	// LineNumber is the number of the matched line.
	// NOTE(review): the base (0 or 1) is not visible here — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// FileName, if set, means this was a match on the filename.
	FileName bool

	// Score: the higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score      float64
	DebugScore string

	// LineFragments holds the matching segments within the line.
	LineFragments []LineFragmentMatch
}
 251
 252func (lm *LineMatch) sizeBytes() (sz uint64) {
 253	// Line
 254	sz += sliceHeaderBytes + uint64(len(lm.Line))
 255
 256	// LineStart, LineEnd, LineNumber
 257	sz += 3 * 8
 258
 259	// Before
 260	sz += sliceHeaderBytes + uint64(len(lm.Before))
 261
 262	// After
 263	sz += sliceHeaderBytes + uint64(len(lm.After))
 264
 265	// FileName
 266	sz += 1
 267
 268	// Score
 269	sz += 8
 270
 271	// DebugScore
 272	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 273
 274	// LineFragments
 275	sz += sliceHeaderBytes
 276	for _, lf := range lm.LineFragments {
 277		sz += lf.sizeBytes()
 278	}
 279
 280	return
 281}
 282
 283type Symbol struct {
 284	Sym        string
 285	Kind       string
 286	Parent     string
 287	ParentKind string
 288}
 289
 290func (s *Symbol) sizeBytes() uint64 {
 291	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 292}
 293
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// LineOffset is the offset within the line, in bytes.
	LineOffset int

	// Offset is the offset from file start, in bytes.
	Offset uint32

	// MatchLength is the number of bytes that match.
	MatchLength int

	// SymbolInfo is the symbol information for this fragment. sizeBytes
	// treats it as optional and handles nil.
	SymbolInfo *Symbol
}
 307
 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 309	// LineOffset
 310	sz += 8
 311
 312	// Offset
 313	sz += 4
 314
 315	// MatchLength
 316	sz += 8
 317
 318	// SymbolInfo
 319	sz += pointerSize
 320	if lfm.SymbolInfo != nil {
 321		sz += lfm.SymbolInfo.sizeBytes()
 322	}
 323
 324	return
 325}
 326
 327type FlushReason uint8
 328
 329const (
 330	FlushReasonTimerExpired FlushReason = 1 << iota
 331	FlushReasonFinalFlush
 332	FlushReasonMaxSize
 333)
 334
 335var FlushReasonStrings = map[FlushReason]string{
 336	FlushReasonTimerExpired: "timer_expired",
 337	FlushReasonFinalFlush:   "final_flush",
 338	FlushReasonMaxSize:      "max_size_reached",
 339}
 340
 341func (fr FlushReason) String() string {
 342	if v, ok := FlushReasonStrings[fr]; ok {
 343		return v
 344	}
 345
 346	return "none"
 347}
 348
// Stats contains interesting numbers on the search. Stats values can be
// combined with Add; note that Add does not accumulate Duration.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
 416
 417func (s *Stats) sizeBytes() (sz uint64) {
 418	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 419	sz += 1     // FlushReason
 420
 421	return
 422}
 423
 424func (s *Stats) Add(o Stats) {
 425	s.ContentBytesLoaded += o.ContentBytesLoaded
 426	s.IndexBytesLoaded += o.IndexBytesLoaded
 427	s.Crashes += o.Crashes
 428	s.FileCount += o.FileCount
 429	s.FilesConsidered += o.FilesConsidered
 430	s.FilesLoaded += o.FilesLoaded
 431	s.FilesSkipped += o.FilesSkipped
 432	s.MatchCount += o.MatchCount
 433	s.NgramMatches += o.NgramMatches
 434	s.NgramLookups += o.NgramLookups
 435	s.ShardFilesConsidered += o.ShardFilesConsidered
 436	s.ShardsScanned += o.ShardsScanned
 437	s.ShardsSkipped += o.ShardsSkipped
 438	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 439	s.Wait += o.Wait
 440	s.MatchTreeConstruction += o.MatchTreeConstruction
 441	s.MatchTreeSearch += o.MatchTreeSearch
 442	s.RegexpsConsidered += o.RegexpsConsidered
 443
 444	// We want the first non-zero FlushReason to be sticky. This is a useful
 445	// property when aggregating stats from several Zoekts.
 446	if s.FlushReason == 0 {
 447		s.FlushReason = o.FlushReason
 448	}
 449}
 450
 451// Zero returns true if stats is empty.
 452func (s *Stats) Zero() bool {
 453	if s == nil {
 454		return true
 455	}
 456
 457	return !(s.ContentBytesLoaded > 0 ||
 458		s.IndexBytesLoaded > 0 ||
 459		s.Crashes > 0 ||
 460		s.FileCount > 0 ||
 461		s.FilesConsidered > 0 ||
 462		s.FilesLoaded > 0 ||
 463		s.FilesSkipped > 0 ||
 464		s.MatchCount > 0 ||
 465		s.NgramMatches > 0 ||
 466		s.NgramLookups > 0 ||
 467		s.ShardFilesConsidered > 0 ||
 468		s.ShardsScanned > 0 ||
 469		s.ShardsSkipped > 0 ||
 470		s.ShardsSkippedFilter > 0 ||
 471		s.Wait > 0 ||
 472		s.MatchTreeConstruction > 0 ||
 473		s.MatchTreeSearch > 0 ||
 474		s.RegexpsConsidered > 0)
 475}
 476
 477// Progress contains information about the global progress of the running search query.
 478// This is used by the frontend to reorder results and emit them when stable.
 479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 480type Progress struct {
 481	// Priority of the shard that was searched.
 482	Priority float64
 483
 484	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 485	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 486	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 487	//
 488	// MaxPendingPriority decreases monotonically in each SearchResult.
 489	MaxPendingPriority float64
 490}
 491
 492func (p *Progress) sizeBytes() uint64 {
 493	return 2 * 8
 494}
 495
// SearchResult contains search matches and extra data. It embeds Stats and
// Progress, so their fields are promoted onto SearchResult.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds the file matches of this result.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 512
 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 514// The estimate does not take alignment into account. The result is a lower
 515// bound on the actual size in memory.
 516func (sr *SearchResult) SizeBytes() (sz uint64) {
 517	sz += sr.Stats.sizeBytes()
 518	sz += sr.Progress.sizeBytes()
 519
 520	// Files
 521	sz += sliceHeaderBytes
 522	for _, f := range sr.Files {
 523		sz += f.sizeBytes()
 524	}
 525
 526	// RepoURLs
 527	sz += mapHeaderBytes
 528	for k, v := range sr.RepoURLs {
 529		sz += stringHeaderBytes + uint64(len(k))
 530		sz += stringHeaderBytes + uint64(len(v))
 531	}
 532
 533	// LineFragments
 534	sz += mapHeaderBytes
 535	for k, v := range sr.LineFragments {
 536		sz += stringHeaderBytes + uint64(len(k))
 537		sz += stringHeaderBytes + uint64(len(v))
 538	}
 539
 540	return
 541}
 542
 543// RepositoryBranch describes an indexed branch, which is a name
 544// combined with a version.
 545type RepositoryBranch struct {
 546	Name    string
 547	Version string
 548}
 549
 550func (r RepositoryBranch) String() string {
 551	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 552}
 553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID. UnmarshalJSON overwrites it from
	// RawConfig["repoid"] when that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important. When it is zero,
	// UnmarshalJSON derives it from a positive priority.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 621
 622func (r *Repository) UnmarshalJSON(data []byte) error {
 623	// We define a new type so that we can use json.Unmarshal
 624	// without recursing into this same method.
 625	type repository *Repository
 626	repo := repository(r)
 627
 628	err := json.Unmarshal(data, repo)
 629	if err != nil {
 630		return err
 631	}
 632
 633	if v, ok := repo.RawConfig["repoid"]; ok {
 634		id, _ := strconv.ParseUint(v, 10, 32)
 635		r.ID = uint32(id)
 636	}
 637
 638	if v, ok := repo.RawConfig["priority"]; ok {
 639		r.priority, err = strconv.ParseFloat(v, 64)
 640		if err != nil {
 641			r.priority = 0
 642		}
 643
 644		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 645		// based on priority. Setting it on read instead of during indexing
 646		// allows us to avoid a complete reindex.
 647		if r.Rank == 0 && r.priority > 0 {
 648			// Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular
 649			// repos (roughly ones with over 5,000 stars) see diminishing returns from more stars.
 650			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 651		}
 652	}
 653	return nil
 654}
 655
 656// MergeMutable will merge x into r. mutated will be true if it made any
 657// changes. err is non-nil if we needed to mutate an immutable field.
 658//
 659// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 660// computed while indexing so can't be synthesized from x.
 661//
 662// Note: We ignore RawConfig fields which are duplicated into Repository:
 663// name and id.
 664func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 665	if r.ID != x.ID {
 666		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 667		return mutated, errors.New("ID is immutable")
 668	}
 669	if r.Name != x.Name {
 670		// Name is encoded into the shard name on disk. We need to re-index if it
 671		// changes.
 672		return mutated, errors.New("Name is immutable")
 673	}
 674	if !reflect.DeepEqual(r.Branches, x.Branches) {
 675		// Need a reindex if content changing.
 676		return mutated, errors.New("Branches is immutable")
 677	}
 678
 679	for k, v := range x.RawConfig {
 680		// We ignore name and id since they are encoded into the repository.
 681		if k == "name" || k == "id" {
 682			continue
 683		}
 684		if r.RawConfig == nil {
 685			mutated = true
 686			r.RawConfig = make(map[string]string)
 687		}
 688		if r.RawConfig[k] != v {
 689			mutated = true
 690			r.RawConfig[k] = v
 691		}
 692	}
 693
 694	if r.URL != x.URL {
 695		mutated = true
 696		r.URL = x.URL
 697	}
 698	if r.CommitURLTemplate != x.CommitURLTemplate {
 699		mutated = true
 700		r.CommitURLTemplate = x.CommitURLTemplate
 701	}
 702	if r.FileURLTemplate != x.FileURLTemplate {
 703		mutated = true
 704		r.FileURLTemplate = x.FileURLTemplate
 705	}
 706	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 707		mutated = true
 708		r.LineFragmentTemplate = x.LineFragmentTemplate
 709	}
 710
 711	return mutated, nil
 712}
 713
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the individual fields are undocumented in this file; their
// exact meaning should be confirmed against the indexing code.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
 726
 727// Statistics of a (collection of) repositories.
 728type RepoStats struct {
 729	// Repos is used for aggregrating the number of repositories.
 730	//
 731	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 732	// only for RepoList.Stats (aggregate).
 733	Repos int
 734
 735	// Shards is the total number of search shards.
 736	Shards int
 737
 738	// Documents holds the number of documents or files.
 739	Documents int
 740
 741	// IndexBytes is the amount of RAM used for index overhead.
 742	IndexBytes int64
 743
 744	// ContentBytes is the amount of RAM used for raw content.
 745	ContentBytes int64
 746
 747	// Sourcegraph specific stats below. These are not as efficient to calculate
 748	// as the above statistics. We experimentally measured about a 10% slower
 749	// shard load time. However, we find these values very useful to track and
 750	// computing them outside of load time introduces a lot of complexity.
 751
 752	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 753	// indexed documents. This is not exactly the same as line count, since it
 754	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 755	// a final line without "\n"). Note: Zoekt deduplicates documents across
 756	// branches, so if a path has the same contents on multiple branches, there
 757	// is only one document for it. As such that document's newlines is only
 758	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 759	// for counts which do not deduplicate.
 760	NewLinesCount uint64
 761
 762	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 763	// branch.
 764	DefaultBranchNewLinesCount uint64
 765
 766	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 767	// except the default branch.
 768	OtherBranchesNewLinesCount uint64
 769}
 770
 771func (s *RepoStats) Add(o *RepoStats) {
 772	// can't update Repos, since one repo may have multiple
 773	// shards.
 774	s.Shards += o.Shards
 775	s.IndexBytes += o.IndexBytes
 776	s.Documents += o.Documents
 777	s.ContentBytes += o.ContentBytes
 778
 779	// Sourcegraph specific
 780	s.NewLinesCount += o.NewLinesCount
 781	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 782	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 783}
 784
// RepoListEntry bundles a repository with its index metadata and statistics.
// RepoList.Repos holds pointers to values of this type.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	// Stats are the statistics of this individual entry. Stats.Repos is not
	// populated here (see the note on RepoStats.Repos).
	Stats         RepoStats
}
 790
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 820
 821type ReposMap map[uint32]MinimalRepoListEntry
 822
 823// MarshalBinary implements a specialized encoder for ReposMap.
 824func (q *ReposMap) MarshalBinary() ([]byte, error) {
 825	return reposMapEncode(*q)
 826}
 827
 828// UnmarshalBinary implements a specialized decoder for ReposMap.
 829func (q *ReposMap) UnmarshalBinary(b []byte) error {
 830	var err error
 831	(*q), err = reposMapDecode(b)
 832	return err
 833}
 834
// RepoList holds a set of Repository metadata. Which of Repos or ReposMap is
// filled depends on ListOptions.Field.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash count.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, by analogy with Stats.Crashes — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 849
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search runs the query q with the given options and returns the result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the searcher's resources — confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 861
 862type RepoListField int
 863
 864const (
 865	RepoListFieldRepos    RepoListField = 0
 866	RepoListFieldReposMap               = 2
 867)
 868
 869type ListOptions struct {
 870	// Field decides which field to populate in RepoList response.
 871	Field RepoListField
 872}
 873
 874func (o *ListOptions) GetField() (RepoListField, error) {
 875	if o == nil {
 876		return RepoListFieldRepos, nil
 877	}
 878	switch o.Field {
 879	case RepoListFieldRepos, RepoListFieldReposMap:
 880		return o.Field, nil
 881	case 1:
 882		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 883	default:
 884		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 885	}
 886}
 887
 888func (o *ListOptions) String() string {
 889	return fmt.Sprintf("%#v", o)
 890}
 891
 892type SearchOptions struct {
 893	// Return an upper-bound estimate of eligible documents in
 894	// stats.ShardFilesConsidered.
 895	EstimateDocCount bool
 896
 897	// Return the whole file.
 898	Whole bool
 899
 900	// Maximum number of matches: skip all processing an index
 901	// shard after we found this many non-overlapping matches.
 902	ShardMaxMatchCount int
 903
 904	// Maximum number of matches: stop looking for more matches
 905	// once we have this many matches across shards.
 906	TotalMaxMatchCount int
 907
 908	// Maximum number of matches: skip processing documents for a repository in
 909	// a shard once we have found ShardRepoMaxMatchCount.
 910	//
 911	// A compound shard may contain multiple repositories. This will most often
 912	// be set to 1 to find all repositories containing a result.
 913	ShardRepoMaxMatchCount int
 914
 915	// Abort the search after this much time has passed.
 916	MaxWallTime time.Duration
 917
 918	// FlushWallTime if non-zero will stop streaming behaviour at first and
 919	// instead will collate and sort results. At FlushWallTime the results will
 920	// be sent and then the behaviour will revert to the normal streaming.
 921	FlushWallTime time.Duration
 922
 923	// Truncates the number of documents (i.e. files) after collating and
 924	// sorting the results.
 925	MaxDocDisplayCount int
 926
 927	// Truncates the number of matchs after collating and sorting the results.
 928	MaxMatchDisplayCount int
 929
 930	// If set to a number greater than zero then up to this many number
 931	// of context lines will be added before and after each matched line.
 932	// Note that the included context lines might contain matches and
 933	// it's up to the consumer of the result to remove those lines.
 934	NumContextLines int
 935
 936	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 937	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 938	ChunkMatches bool
 939
 940	// EXPERIMENTAL. If true, document ranks are used as additional input for
 941	// sorting matches.
 942	UseDocumentRanks bool
 943
 944	// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
 945	// their weight in the file match score. If the value is <= 0.0, the default weight value
 946	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 947	DocumentRanksWeight float64
 948
 949	// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
 950	// The scoring algorithm treats each match in a file as a term and computes an approximation to
 951	// BM25. When enabled, all other scoring signals are ignored, including document ranks.
 952	UseBM25Scoring bool
 953
 954	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 955	// a command-line flag
 956	Trace bool
 957
 958	// If set, the search results will contain debug information for scoring.
 959	DebugScore bool
 960
 961	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 962	SpanContext map[string]string
 963}
 964
 965// String returns a succinct representation of the options. This is meant for
 966// human consumption in logs and traces.
 967//
 968// Note: some tracing systems have limits on length of values, so we take care
 969// to try and make this small, and include the important information near the
 970// front incase of truncation.
 971func (s *SearchOptions) String() string {
 972	var b strings.Builder
 973
 974	add := func(name, value string) {
 975		b.WriteString(name)
 976		b.WriteByte('=')
 977		b.WriteString(value)
 978		b.WriteByte(' ')
 979	}
 980	addInt := func(name string, value int) {
 981		if value != 0 {
 982			add(name, strconv.Itoa(value))
 983		}
 984	}
 985	addDuration := func(name string, value time.Duration) {
 986		if value != 0 {
 987			add(name, value.String())
 988		}
 989	}
 990	addBool := func(name string, value bool) {
 991		if !value {
 992			return
 993		}
 994		b.WriteString(name)
 995		b.WriteByte(' ')
 996	}
 997
 998	b.WriteString("zoekt.SearchOptions{ ")
 999
1000	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1001	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1002	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1003	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1004	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1005	addInt("NumContextLines", s.NumContextLines)
1006
1007	addDuration("MaxWallTime", s.MaxWallTime)
1008	addDuration("FlushWallTime", s.FlushWallTime)
1009
1010	if s.DocumentRanksWeight > 0 {
1011		add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
1012	}
1013
1014	addBool("EstimateDocCount", s.EstimateDocCount)
1015	addBool("Whole", s.Whole)
1016	addBool("ChunkMatches", s.ChunkMatches)
1017	addBool("UseDocumentRanks", s.UseDocumentRanks)
1018	addBool("UseBM25Scoring", s.UseBM25Scoring)
1019	addBool("Trace", s.Trace)
1020	addBool("DebugScore", s.DebugScore)
1021
1022	for k, v := range s.SpanContext {
1023		add("SpanContext."+k, strconv.Quote(v))
1024	}
1025
1026	b.WriteByte('}')
1027	return b.String()
1028}
1029
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands one SearchResult to the receiver. It returns nothing, so
	// the interface offers no way to report failure to the caller.
	Send(*SearchResult)
}
1034
// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result), which makes SenderFunc satisfy Sender.
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1043
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch runs the query q with the given options and passes results
	// to sender instead of returning them.
	// NOTE(review): presumably sender.Send may be called several times per
	// search — confirm against the implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed