api.go at c7f1e697599c4662ab7b984c12eb4ddaf9d23c59 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at c7f1e697599c4662ab7b984c12eb4ddaf9d23c59 30 kB View raw
Stefan Hengl ranking: removing document ranks (#853) 2y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  29
// Fixed per-value overheads, in bytes, added by the sizeBytes estimators in
// this file on top of the payload a value points to.
//
// NOTE(review): the numbers look like 64-bit header sizes — confirm before
// relying on them on other architectures.
const (
	mapHeaderBytes    uint64 = 48
	sliceHeaderBytes  uint64 = 24
	stringHeaderBytes uint64 = 16
	pointerSize       uint64 = 8
	interfaceBytes    uint64 = 16
)
  37
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the file the matches were found in.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match.
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository.
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Version is the commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Language is the detected language of the result.
	Language string

	// Debug is for debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches presumably lists the branches the matched file belongs to —
	// TODO confirm against the code that populates it.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Content is only set if requested.
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Score is the ranking of the file; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  88
  89func (m *FileMatch) sizeBytes() (sz uint64) {
  90	// Score
  91	sz += 8
  92
  93	for _, s := range []string{
  94		m.Debug,
  95		m.FileName,
  96		m.Repository,
  97		m.Language,
  98		m.SubRepositoryName,
  99		m.SubRepositoryPath,
 100		m.Version,
 101	} {
 102		sz += stringHeaderBytes + uint64(len(s))
 103	}
 104
 105	// Branches
 106	sz += sliceHeaderBytes
 107	for _, s := range m.Branches {
 108		sz += stringHeaderBytes + uint64(len(s))
 109	}
 110
 111	// LineMatches
 112	sz += sliceHeaderBytes
 113	for _, lm := range m.LineMatches {
 114		sz += lm.sizeBytes()
 115	}
 116
 117	// ChunkMatches
 118	sz += sliceHeaderBytes
 119	for _, cm := range m.ChunkMatches {
 120		sz += cm.sizeBytes()
 121	}
 122
 123	// RepositoryID
 124	sz += 4
 125
 126	// RepositoryPriority
 127	sz += 8
 128
 129	// Content
 130	sz += sliceHeaderBytes + uint64(len(m.Content))
 131
 132	// Checksum
 133	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 134
 135	return
 136}
 137
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore carries scoring diagnostics as text.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the score of this chunk.
	// NOTE(review): presumably higher is better, as for FileMatch.Score — confirm.
	Score float64
}
 166
 167func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 168	// Content
 169	sz += sliceHeaderBytes + uint64(len(cm.Content))
 170
 171	// ContentStart
 172	sz += cm.ContentStart.sizeBytes()
 173
 174	// FileName
 175	sz += 1
 176
 177	// Ranges
 178	sz += sliceHeaderBytes
 179	if len(cm.Ranges) > 0 {
 180		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 181	}
 182
 183	// SymbolInfo
 184	sz += sliceHeaderBytes
 185	for _, si := range cm.SymbolInfo {
 186		sz += pointerSize
 187		if si != nil {
 188			sz += si.sizeBytes()
 189		}
 190	}
 191
 192	// Score
 193	sz += 8
 194
 195	// DebugScore
 196	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 197
 198	return
 199}
 200
 201type Range struct {
 202	// The inclusive beginning of the range.
 203	Start Location
 204	// The exclusive end of the range.
 205	End Location
 206}
 207
 208func (r *Range) sizeBytes() uint64 {
 209	return r.Start.sizeBytes() + r.End.sizeBytes()
 210}
 211
 212type Location struct {
 213	// 0-based byte offset from the beginning of the file
 214	ByteOffset uint32
 215	// 1-based line number from the beginning of the file
 216	LineNumber uint32
 217	// 1-based column number (in runes) from the beginning of line
 218	Column uint32
 219}
 220
 221func (l *Location) sizeBytes() uint64 {
 222	return 3 * 4
 223}
 224
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline.
	LineEnd int
	// LineNumber is the number of the line within the file.
	// NOTE(review): presumably 1-based, like Location.LineNumber — confirm.
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0.
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account.
	Score float64
	// DebugScore carries scoring diagnostics as text.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm.
	DebugScore string

	// LineFragments holds the matching segments within Line.
	LineFragments []LineFragmentMatch
}
 251
 252func (lm *LineMatch) sizeBytes() (sz uint64) {
 253	// Line
 254	sz += sliceHeaderBytes + uint64(len(lm.Line))
 255
 256	// LineStart, LineEnd, LineNumber
 257	sz += 3 * 8
 258
 259	// Before
 260	sz += sliceHeaderBytes + uint64(len(lm.Before))
 261
 262	// After
 263	sz += sliceHeaderBytes + uint64(len(lm.After))
 264
 265	// FileName
 266	sz += 1
 267
 268	// Score
 269	sz += 8
 270
 271	// DebugScore
 272	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 273
 274	// LineFragments
 275	sz += sliceHeaderBytes
 276	for _, lf := range lm.LineFragments {
 277		sz += lf.sizeBytes()
 278	}
 279
 280	return
 281}
 282
 283type Symbol struct {
 284	Sym        string
 285	Kind       string
 286	Parent     string
 287	ParentKind string
 288}
 289
 290func (s *Symbol) sizeBytes() uint64 {
 291	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 292}
 293
// LineFragmentMatch is a segment of matching text within a line.
type LineFragmentMatch struct {
	// Offset within the line, in bytes.
	LineOffset int

	// Offset from file start, in bytes.
	Offset uint32

	// Number of bytes that match.
	MatchLength int

	// SymbolInfo is optional symbol information for this fragment; it may be nil.
	SymbolInfo *Symbol
}
 307
 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 309	// LineOffset
 310	sz += 8
 311
 312	// Offset
 313	sz += 4
 314
 315	// MatchLength
 316	sz += 8
 317
 318	// SymbolInfo
 319	sz += pointerSize
 320	if lfm.SymbolInfo != nil {
 321		sz += lfm.SymbolInfo.sizeBytes()
 322	}
 323
 324	return
 325}
 326
 327type FlushReason uint8
 328
 329const (
 330	FlushReasonTimerExpired FlushReason = 1 << iota
 331	FlushReasonFinalFlush
 332	FlushReasonMaxSize
 333)
 334
 335var FlushReasonStrings = map[FlushReason]string{
 336	FlushReasonTimerExpired: "timer_expired",
 337	FlushReasonFinalFlush:   "final_flush",
 338	FlushReasonMaxSize:      "max_size_reached",
 339}
 340
 341func (fr FlushReason) String() string {
 342	if v, ok := FlushReasonStrings[fr]; ok {
 343		return v
 344	}
 345
 346	return "none"
 347}
 348
// Stats contains interesting numbers on the search.
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches.
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches.
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed.
	FlushReason FlushReason
}
 416
 417func (s *Stats) sizeBytes() (sz uint64) {
 418	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 419	sz += 1     // FlushReason
 420
 421	return
 422}
 423
 424func (s *Stats) Add(o Stats) {
 425	s.ContentBytesLoaded += o.ContentBytesLoaded
 426	s.IndexBytesLoaded += o.IndexBytesLoaded
 427	s.Crashes += o.Crashes
 428	s.FileCount += o.FileCount
 429	s.FilesConsidered += o.FilesConsidered
 430	s.FilesLoaded += o.FilesLoaded
 431	s.FilesSkipped += o.FilesSkipped
 432	s.MatchCount += o.MatchCount
 433	s.NgramMatches += o.NgramMatches
 434	s.NgramLookups += o.NgramLookups
 435	s.ShardFilesConsidered += o.ShardFilesConsidered
 436	s.ShardsScanned += o.ShardsScanned
 437	s.ShardsSkipped += o.ShardsSkipped
 438	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 439	s.Wait += o.Wait
 440	s.MatchTreeConstruction += o.MatchTreeConstruction
 441	s.MatchTreeSearch += o.MatchTreeSearch
 442	s.RegexpsConsidered += o.RegexpsConsidered
 443
 444	// We want the first non-zero FlushReason to be sticky. This is a useful
 445	// property when aggregating stats from several Zoekts.
 446	if s.FlushReason == 0 {
 447		s.FlushReason = o.FlushReason
 448	}
 449}
 450
 451// Zero returns true if stats is empty.
 452func (s *Stats) Zero() bool {
 453	if s == nil {
 454		return true
 455	}
 456
 457	return !(s.ContentBytesLoaded > 0 ||
 458		s.IndexBytesLoaded > 0 ||
 459		s.Crashes > 0 ||
 460		s.FileCount > 0 ||
 461		s.FilesConsidered > 0 ||
 462		s.FilesLoaded > 0 ||
 463		s.FilesSkipped > 0 ||
 464		s.MatchCount > 0 ||
 465		s.NgramMatches > 0 ||
 466		s.NgramLookups > 0 ||
 467		s.ShardFilesConsidered > 0 ||
 468		s.ShardsScanned > 0 ||
 469		s.ShardsSkipped > 0 ||
 470		s.ShardsSkippedFilter > 0 ||
 471		s.Wait > 0 ||
 472		s.MatchTreeConstruction > 0 ||
 473		s.MatchTreeSearch > 0 ||
 474		s.RegexpsConsidered > 0)
 475}
 476
 477// Progress contains information about the global progress of the running search query.
 478// This is used by the frontend to reorder results and emit them when stable.
 479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 480type Progress struct {
 481	// Priority of the shard that was searched.
 482	Priority float64
 483
 484	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 485	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 486	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 487	//
 488	// MaxPendingPriority decreases monotonically in each SearchResult.
 489	MaxPendingPriority float64
 490}
 491
 492func (p *Progress) sizeBytes() uint64 {
 493	return 2 * 8
 494}
 495
// SearchResult contains search matches and extra data.
type SearchResult struct {
	Stats

	// Do not encode this as we cannot encode -Inf in JSON.
	Progress `json:"-"`

	// Files holds one FileMatch per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 512
 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 514// The estimate does not take alignment into account. The result is a lower
 515// bound on the actual size in memory.
 516func (sr *SearchResult) SizeBytes() (sz uint64) {
 517	sz += sr.Stats.sizeBytes()
 518	sz += sr.Progress.sizeBytes()
 519
 520	// Files
 521	sz += sliceHeaderBytes
 522	for _, f := range sr.Files {
 523		sz += f.sizeBytes()
 524	}
 525
 526	// RepoURLs
 527	sz += mapHeaderBytes
 528	for k, v := range sr.RepoURLs {
 529		sz += stringHeaderBytes + uint64(len(k))
 530		sz += stringHeaderBytes + uint64(len(v))
 531	}
 532
 533	// LineFragments
 534	sz += mapHeaderBytes
 535	for k, v := range sr.LineFragments {
 536		sz += stringHeaderBytes + uint64(len(k))
 537		sz += stringHeaderBytes + uint64(len(v))
 538	}
 539
 540	return
 541}
 542
 543// RepositoryBranch describes an indexed branch, which is a name
 544// combined with a version.
 545type RepositoryBranch struct {
 546	Name    string
 547	Version string
 548}
 549
 550func (r RepositoryBranch) String() string {
 551	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 552}
 553
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's repository ID.
	ID uint32

	// The repository name.
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch.
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. Has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important.
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 621
 622func (r *Repository) UnmarshalJSON(data []byte) error {
 623	// We define a new type so that we can use json.Unmarshal
 624	// without recursing into this same method.
 625	type repository *Repository
 626	repo := repository(r)
 627
 628	err := json.Unmarshal(data, repo)
 629	if err != nil {
 630		return err
 631	}
 632
 633	if v, ok := repo.RawConfig["repoid"]; ok {
 634		id, _ := strconv.ParseUint(v, 10, 32)
 635		r.ID = uint32(id)
 636	}
 637
 638	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
 639	// on read instead of during indexing allows us to avoid a complete reindex.
 640	//
 641	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
 642	// backwards compatibility.
 643	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
 644		// We use the number of months since 1970 as a simple measure of repo freshness.
 645		// It is monotonically increasing and stable across re-indexes and restarts.
 646		r.Rank = monthsSince1970(repo.LatestCommitDate)
 647	} else if v, ok := repo.RawConfig["priority"]; ok {
 648		r.priority, err = strconv.ParseFloat(v, 64)
 649		if err != nil {
 650			r.priority = 0
 651		}
 652
 653		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 654		// based on priority. Setting it on read instead of during indexing
 655		// allows us to avoid a complete reindex.
 656		if r.Rank == 0 && r.priority > 0 {
 657			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
 658			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
 659			// returns from more stars.
 660			r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16)
 661		}
 662	}
 663
 664	return nil
 665}
 666
 667// monthsSince1970 returns the number of months since 1970. It returns values in
 668// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 669// lower bound for all dates before 1970.
 670func monthsSince1970(t time.Time) uint16 {
 671	base := time.Unix(0, 0)
 672	if t.Before(base) {
 673		return 0
 674	}
 675	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 676	return uint16(min(months, maxUInt16))
 677}
 678
 679// MergeMutable will merge x into r. mutated will be true if it made any
 680// changes. err is non-nil if we needed to mutate an immutable field.
 681//
 682// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 683// computed while indexing so can't be synthesized from x.
 684//
 685// Note: We ignore RawConfig fields which are duplicated into Repository:
 686// name and id.
 687func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 688	if r.ID != x.ID {
 689		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 690		return mutated, errors.New("ID is immutable")
 691	}
 692	if r.Name != x.Name {
 693		// Name is encoded into the shard name on disk. We need to re-index if it
 694		// changes.
 695		return mutated, errors.New("Name is immutable")
 696	}
 697	if !reflect.DeepEqual(r.Branches, x.Branches) {
 698		// Need a reindex if content changing.
 699		return mutated, errors.New("Branches is immutable")
 700	}
 701
 702	for k, v := range x.RawConfig {
 703		// We ignore name and id since they are encoded into the repository.
 704		if k == "name" || k == "id" {
 705			continue
 706		}
 707		if r.RawConfig == nil {
 708			mutated = true
 709			r.RawConfig = make(map[string]string)
 710		}
 711		if r.RawConfig[k] != v {
 712			mutated = true
 713			r.RawConfig[k] = v
 714		}
 715	}
 716
 717	if r.URL != x.URL {
 718		mutated = true
 719		r.URL = x.URL
 720	}
 721	if r.CommitURLTemplate != x.CommitURLTemplate {
 722		mutated = true
 723		r.CommitURLTemplate = x.CommitURLTemplate
 724	}
 725	if r.FileURLTemplate != x.FileURLTemplate {
 726		mutated = true
 727		r.FileURLTemplate = x.FileURLTemplate
 728	}
 729	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 730		mutated = true
 731		r.LineFragmentTemplate = x.LineFragmentTemplate
 732	}
 733
 734	return mutated, nil
 735}
 736
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the fields are undocumented in this file; the version fields
// presumably describe the index format written and the minimum reader able to
// load it — confirm against the index writer before relying on that.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time
	PlainASCII            bool
	LanguageMap           map[string]uint16
	ZoektVersion          string
	ID                    string
}
 749
 750// Statistics of a (collection of) repositories.
 751type RepoStats struct {
 752	// Repos is used for aggregrating the number of repositories.
 753	//
 754	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 755	// only for RepoList.Stats (aggregate).
 756	Repos int
 757
 758	// Shards is the total number of search shards.
 759	Shards int
 760
 761	// Documents holds the number of documents or files.
 762	Documents int
 763
 764	// IndexBytes is the amount of RAM used for index overhead.
 765	IndexBytes int64
 766
 767	// ContentBytes is the amount of RAM used for raw content.
 768	ContentBytes int64
 769
 770	// Sourcegraph specific stats below. These are not as efficient to calculate
 771	// as the above statistics. We experimentally measured about a 10% slower
 772	// shard load time. However, we find these values very useful to track and
 773	// computing them outside of load time introduces a lot of complexity.
 774
 775	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 776	// indexed documents. This is not exactly the same as line count, since it
 777	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 778	// a final line without "\n"). Note: Zoekt deduplicates documents across
 779	// branches, so if a path has the same contents on multiple branches, there
 780	// is only one document for it. As such that document's newlines is only
 781	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 782	// for counts which do not deduplicate.
 783	NewLinesCount uint64
 784
 785	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 786	// branch.
 787	DefaultBranchNewLinesCount uint64
 788
 789	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 790	// except the default branch.
 791	OtherBranchesNewLinesCount uint64
 792}
 793
 794func (s *RepoStats) Add(o *RepoStats) {
 795	// can't update Repos, since one repo may have multiple
 796	// shards.
 797	s.Shards += o.Shards
 798	s.IndexBytes += o.IndexBytes
 799	s.Documents += o.Documents
 800	s.ContentBytes += o.ContentBytes
 801
 802	// Sourcegraph specific
 803	s.NewLinesCount += o.NewLinesCount
 804	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 805	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 806}
 807
// RepoListEntry bundles the metadata, index metadata and statistics reported
// for one entry of RepoList.Repos.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	// Stats are the statistics for this entry; its Repos field is not
	// populated (see RepoStats.Repos).
	Stats RepoStats
}
 813
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 843
 844type ReposMap map[uint32]MinimalRepoListEntry
 845
 846// MarshalBinary implements a specialized encoder for ReposMap.
 847func (q *ReposMap) MarshalBinary() ([]byte, error) {
 848	return reposMapEncode(*q)
 849}
 850
 851// UnmarshalBinary implements a specialized decoder for ReposMap.
 852func (q *ReposMap) UnmarshalBinary(b []byte) error {
 853	var err error
 854	(*q), err = reposMapDecode(b)
 855	return err
 856}
 857
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash counter for the List request.
	// NOTE(review): presumably the number of shards that crashed, as for
	// Stats.Crashes — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 872
// Searcher is the interface for running searches and listing repositories.
type Searcher interface {
	// Search evaluates the query q with the given options and returns the
	// result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): presumably releases the resources it holds — confirm
	// against the implementations.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 884
 885type RepoListField int
 886
 887const (
 888	RepoListFieldRepos    RepoListField = 0
 889	RepoListFieldReposMap               = 2
 890)
 891
 892type ListOptions struct {
 893	// Field decides which field to populate in RepoList response.
 894	Field RepoListField
 895}
 896
 897func (o *ListOptions) GetField() (RepoListField, error) {
 898	if o == nil {
 899		return RepoListFieldRepos, nil
 900	}
 901	switch o.Field {
 902	case RepoListFieldRepos, RepoListFieldReposMap:
 903		return o.Field, nil
 904	case 1:
 905		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 906	default:
 907		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 908	}
 909}
 910
 911func (o *ListOptions) String() string {
 912	return fmt.Sprintf("%#v", o)
 913}
 914
 915type SearchOptions struct {
 916	// Return an upper-bound estimate of eligible documents in
 917	// stats.ShardFilesConsidered.
 918	EstimateDocCount bool
 919
 920	// Return the whole file.
 921	Whole bool
 922
 923	// Maximum number of matches: skip all processing an index
 924	// shard after we found this many non-overlapping matches.
 925	ShardMaxMatchCount int
 926
 927	// Maximum number of matches: stop looking for more matches
 928	// once we have this many matches across shards.
 929	TotalMaxMatchCount int
 930
 931	// Maximum number of matches: skip processing documents for a repository in
 932	// a shard once we have found ShardRepoMaxMatchCount.
 933	//
 934	// A compound shard may contain multiple repositories. This will most often
 935	// be set to 1 to find all repositories containing a result.
 936	ShardRepoMaxMatchCount int
 937
 938	// Abort the search after this much time has passed.
 939	MaxWallTime time.Duration
 940
 941	// FlushWallTime if non-zero will stop streaming behaviour at first and
 942	// instead will collate and sort results. At FlushWallTime the results will
 943	// be sent and then the behaviour will revert to the normal streaming.
 944	FlushWallTime time.Duration
 945
 946	// Truncates the number of documents (i.e. files) after collating and
 947	// sorting the results.
 948	MaxDocDisplayCount int
 949
 950	// Truncates the number of matchs after collating and sorting the results.
 951	MaxMatchDisplayCount int
 952
 953	// If set to a number greater than zero then up to this many number
 954	// of context lines will be added before and after each matched line.
 955	// Note that the included context lines might contain matches and
 956	// it's up to the consumer of the result to remove those lines.
 957	NumContextLines int
 958
 959	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 960	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 961	ChunkMatches bool
 962
 963	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
 964	// scoring formula. The scoring algorithm treats each match in a file as a term
 965	// and computes an approximation to BM25.
 966	//
 967	// The calculation of IDF assumes that Zoekt visits all documents containing any
 968	// of the query terms during evaluation. This is true, for example, if all query
 969	// terms are ORed together.
 970	//
 971	// When enabled, all other scoring signals are ignored, including document ranks.
 972	UseBM25Scoring bool
 973
 974	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 975	// a command-line flag
 976	Trace bool
 977
 978	// If set, the search results will contain debug information for scoring.
 979	DebugScore bool
 980
 981	// SpanContext is the opentracing span context, if it exists, from the zoekt client
 982	SpanContext map[string]string
 983}
 984
 985// String returns a succinct representation of the options. This is meant for
 986// human consumption in logs and traces.
 987//
 988// Note: some tracing systems have limits on length of values, so we take care
 989// to try and make this small, and include the important information near the
 990// front incase of truncation.
 991func (s *SearchOptions) String() string {
 992	var b strings.Builder
 993
 994	add := func(name, value string) {
 995		b.WriteString(name)
 996		b.WriteByte('=')
 997		b.WriteString(value)
 998		b.WriteByte(' ')
 999	}
1000	addInt := func(name string, value int) {
1001		if value != 0 {
1002			add(name, strconv.Itoa(value))
1003		}
1004	}
1005	addDuration := func(name string, value time.Duration) {
1006		if value != 0 {
1007			add(name, value.String())
1008		}
1009	}
1010	addBool := func(name string, value bool) {
1011		if !value {
1012			return
1013		}
1014		b.WriteString(name)
1015		b.WriteByte(' ')
1016	}
1017
1018	b.WriteString("zoekt.SearchOptions{ ")
1019
1020	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1021	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1022	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1023	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1024	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1025	addInt("NumContextLines", s.NumContextLines)
1026
1027	addDuration("MaxWallTime", s.MaxWallTime)
1028	addDuration("FlushWallTime", s.FlushWallTime)
1029
1030	addBool("EstimateDocCount", s.EstimateDocCount)
1031	addBool("Whole", s.Whole)
1032	addBool("ChunkMatches", s.ChunkMatches)
1033	addBool("UseBM25Scoring", s.UseBM25Scoring)
1034	addBool("Trace", s.Trace)
1035	addBool("DebugScore", s.DebugScore)
1036
1037	for k, v := range s.SpanContext {
1038		add("SpanContext."+k, strconv.Quote(v))
1039	}
1040
1041	b.WriteByte('}')
1042	return b.String()
1043}
1044
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send hands one SearchResult to the receiver.
	Send(*SearchResult)
}

// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1058
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher

	// StreamSearch evaluates q with opts and hands results to sender instead
	// of returning them.
	// NOTE(review): how many times sender.Send is called, and from which
	// goroutine, is not visible in this file — confirm with the
	// implementations.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed