api.go at 328724512814c0a71132e2e7658f71197573d611 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / api.go
at 328724512814c0a71132e2e7658f71197573d611 31 kB View raw
Julie Tibshirani Support BM25 scoring for chunk matches (#889) 1y ago
   1// Copyright 2016 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt // import "github.com/sourcegraph/zoekt"
  16
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/sourcegraph/zoekt/query"
)
  30
// Per-value overheads, in bytes, added by the sizeBytes estimators in this
// file. The header and pointer sizes correspond to a 64-bit architecture.
const (
	mapHeaderBytes    uint64 = 48 // fixed overhead charged once per map
	sliceHeaderBytes  uint64 = 24 // slice header: pointer, length, capacity
	stringHeaderBytes uint64 = 16 // string header: pointer, length
	pointerSize       uint64 = 8  // a single pointer
	interfaceBytes    uint64 = 16 // interface value: two words
)
  38
// FileMatch contains all the matches within a file.
type FileMatch struct {
	// FileName is the name of the matched file.
	// NOTE(review): presumably the path relative to the repository root — confirm.
	FileName string

	// Repository is the globally unique name of the repo of the
	// match
	Repository string

	// SubRepositoryName is the globally unique name of the repo,
	// if it came from a subrepository
	SubRepositoryName string `json:",omitempty"`

	// SubRepositoryPath holds the prefix where the subrepository
	// was mounted.
	SubRepositoryPath string `json:",omitempty"`

	// Commit SHA1 (hex) of the (sub)repo holding the file.
	Version string `json:",omitempty"`

	// Detected language of the result.
	Language string

	// For debugging. Needs DebugScore set, but public so tests in
	// other packages can print some diagnostics.
	Debug string `json:",omitempty"`

	// Branches lists branch names associated with this match.
	// NOTE(review): presumably the branches on which the file matched — confirm.
	Branches []string `json:",omitempty"`

	// One of LineMatches or ChunkMatches will be returned depending on whether
	// the SearchOptions.ChunkMatches is set.
	LineMatches  []LineMatch  `json:",omitempty"`
	ChunkMatches []ChunkMatch `json:",omitempty"`

	// Only set if requested
	Content []byte `json:",omitempty"`

	// Checksum of the content.
	Checksum []byte

	// Ranking; the higher, the better.
	Score float64 `json:",omitempty"`

	// RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to
	// order results from different repositories relative to each other.
	RepositoryPriority float64 `json:",omitempty"`

	// RepositoryID is a Sourcegraph extension. This is the ID of Repository in
	// Sourcegraph.
	RepositoryID uint32 `json:",omitempty"`
}
  89
  90func (m *FileMatch) sizeBytes() (sz uint64) {
  91	// Score
  92	sz += 8
  93
  94	for _, s := range []string{
  95		m.Debug,
  96		m.FileName,
  97		m.Repository,
  98		m.Language,
  99		m.SubRepositoryName,
 100		m.SubRepositoryPath,
 101		m.Version,
 102	} {
 103		sz += stringHeaderBytes + uint64(len(s))
 104	}
 105
 106	// Branches
 107	sz += sliceHeaderBytes
 108	for _, s := range m.Branches {
 109		sz += stringHeaderBytes + uint64(len(s))
 110	}
 111
 112	// LineMatches
 113	sz += sliceHeaderBytes
 114	for _, lm := range m.LineMatches {
 115		sz += lm.sizeBytes()
 116	}
 117
 118	// ChunkMatches
 119	sz += sliceHeaderBytes
 120	for _, cm := range m.ChunkMatches {
 121		sz += cm.sizeBytes()
 122	}
 123
 124	// RepositoryID
 125	sz += 4
 126
 127	// RepositoryPriority
 128	sz += 8
 129
 130	// Content
 131	sz += sliceHeaderBytes + uint64(len(m.Content))
 132
 133	// Checksum
 134	sz += sliceHeaderBytes + uint64(len(m.Checksum))
 135
 136	return
 137}
 138
 139// addScore increments the score of the FileMatch by the computed score. If
 140// debugScore is true, it also adds a debug string to the FileMatch. If raw is
 141// -1, it is ignored. Otherwise, it is added to the debug string.
 142func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) {
 143	if computed != 0 && debugScore {
 144		var b strings.Builder
 145		fmt.Fprintf(&b, "%s", what)
 146		if raw != -1 {
 147			fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64))
 148		}
 149		fmt.Fprintf(&b, ":%.2f, ", computed)
 150		m.Debug += b.String()
 151	}
 152	m.Score += computed
 153}
 154
// ChunkMatch is a set of non-overlapping matches within a contiguous range of
// lines in the file.
type ChunkMatch struct {
	// DebugScore holds scoring diagnostics for this chunk.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm against the scoring code.
	DebugScore string

	// Content is a contiguous range of complete lines that fully contains Ranges.
	// Lines will always include their terminating newline (if it exists).
	Content []byte

	// Ranges is a set of matching ranges within this chunk. Each range is relative
	// to the beginning of the file (not the beginning of Content).
	Ranges []Range

	// SymbolInfo is the symbol information associated with Ranges. If it is non-nil,
	// its length will equal that of Ranges. Any of its elements may be nil.
	SymbolInfo []*Symbol

	// FileName indicates whether this match is a match on the file name, in
	// which case Content will contain the file name.
	FileName bool

	// ContentStart is the location (inclusive) of the beginning of content
	// relative to the beginning of the file. It will always be at the
	// beginning of a line (Column will always be 1).
	ContentStart Location

	// Score is the overall relevance score of this chunk.
	Score float64

	// BestLineMatch is the line number of the highest-scoring line match in this chunk.
	// The line number represents the index in the full file, and is 1-based. If FileName: true,
	// this number will be 0.
	BestLineMatch uint32
}
 189
 190func (cm *ChunkMatch) sizeBytes() (sz uint64) {
 191	// Content
 192	sz += sliceHeaderBytes + uint64(len(cm.Content))
 193
 194	// ContentStart
 195	sz += cm.ContentStart.sizeBytes()
 196
 197	// FileName
 198	sz += 1
 199
 200	// Ranges
 201	sz += sliceHeaderBytes
 202	if len(cm.Ranges) > 0 {
 203		sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes()
 204	}
 205
 206	// SymbolInfo
 207	sz += sliceHeaderBytes
 208	for _, si := range cm.SymbolInfo {
 209		sz += pointerSize
 210		if si != nil {
 211			sz += si.sizeBytes()
 212		}
 213	}
 214
 215	// Score
 216	sz += 8
 217
 218	// DebugScore
 219	sz += stringHeaderBytes + uint64(len(cm.DebugScore))
 220
 221	return
 222}
 223
 224type Range struct {
 225	// The inclusive beginning of the range.
 226	Start Location
 227	// The exclusive end of the range.
 228	End Location
 229}
 230
 231func (r *Range) sizeBytes() uint64 {
 232	return r.Start.sizeBytes() + r.End.sizeBytes()
 233}
 234
 235type Location struct {
 236	// 0-based byte offset from the beginning of the file
 237	ByteOffset uint32
 238	// 1-based line number from the beginning of the file
 239	LineNumber uint32
 240	// 1-based column number (in runes) from the beginning of line
 241	Column uint32
 242}
 243
 244func (l *Location) sizeBytes() uint64 {
 245	return 3 * 4
 246}
 247
// LineMatch holds the matches within a single line in a file.
type LineMatch struct {
	// The line in which a match was found.
	Line []byte
	// The byte offset of the first byte of the line.
	LineStart int
	// The byte offset of the first byte past the end of the line.
	// This is usually the byte after the terminating newline, but can also be
	// the end of the file if there is no terminating newline
	LineEnd    int
	LineNumber int

	// Before and After are only set when SearchOptions.NumContextLines is > 0
	Before []byte
	After  []byte

	// If set, this was a match on the filename.
	FileName bool

	// The higher the better. Only ranks the quality of the match
	// within the file, does not take rank of file into account
	Score float64
	// DebugScore holds scoring diagnostics for this line.
	// NOTE(review): presumably only populated when SearchOptions.DebugScore is
	// set — confirm against the scoring code.
	DebugScore string

	// LineFragments are the individual matching segments within Line.
	LineFragments []LineFragmentMatch
}
 274
 275func (lm *LineMatch) sizeBytes() (sz uint64) {
 276	// Line
 277	sz += sliceHeaderBytes + uint64(len(lm.Line))
 278
 279	// LineStart, LineEnd, LineNumber
 280	sz += 3 * 8
 281
 282	// Before
 283	sz += sliceHeaderBytes + uint64(len(lm.Before))
 284
 285	// After
 286	sz += sliceHeaderBytes + uint64(len(lm.After))
 287
 288	// FileName
 289	sz += 1
 290
 291	// Score
 292	sz += 8
 293
 294	// DebugScore
 295	sz += stringHeaderBytes + uint64(len(lm.DebugScore))
 296
 297	// LineFragments
 298	sz += sliceHeaderBytes
 299	for _, lf := range lm.LineFragments {
 300		sz += lf.sizeBytes()
 301	}
 302
 303	return
 304}
 305
 306type Symbol struct {
 307	Sym        string
 308	Kind       string
 309	Parent     string
 310	ParentKind string
 311}
 312
 313func (s *Symbol) sizeBytes() uint64 {
 314	return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind))
 315}
 316
 317// LineFragmentMatch a segment of matching text within a line.
 318type LineFragmentMatch struct {
 319	// Offset within the line, in bytes.
 320	LineOffset int
 321
 322	// Offset from file start, in bytes.
 323	Offset uint32
 324
 325	// Number bytes that match.
 326	MatchLength int
 327
 328	SymbolInfo *Symbol
 329}
 330
 331func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) {
 332	// LineOffset
 333	sz += 8
 334
 335	// Offset
 336	sz += 4
 337
 338	// MatchLength
 339	sz += 8
 340
 341	// SymbolInfo
 342	sz += pointerSize
 343	if lfm.SymbolInfo != nil {
 344		sz += lfm.SymbolInfo.sizeBytes()
 345	}
 346
 347	return
 348}
 349
 350type FlushReason uint8
 351
 352const (
 353	FlushReasonTimerExpired FlushReason = 1 << iota
 354	FlushReasonFinalFlush
 355	FlushReasonMaxSize
 356)
 357
 358var FlushReasonStrings = map[FlushReason]string{
 359	FlushReasonTimerExpired: "timer_expired",
 360	FlushReasonFinalFlush:   "final_flush",
 361	FlushReasonMaxSize:      "max_size_reached",
 362}
 363
 364func (fr FlushReason) String() string {
 365	if v, ok := FlushReasonStrings[fr]; ok {
 366		return v
 367	}
 368
 369	return "none"
 370}
 371
// Stats contains interesting numbers on the search
type Stats struct {
	// Amount of I/O for reading contents.
	ContentBytesLoaded int64

	// Amount of I/O for reading from index.
	IndexBytesLoaded int64

	// Number of search shards that had a crash.
	Crashes int

	// Wall clock time for this search. Unlike the other counters, Duration is
	// not accumulated by Add and is not inspected by Zero.
	Duration time.Duration

	// Number of files containing a match.
	FileCount int

	// Number of files in shards that we considered.
	ShardFilesConsidered int

	// Files that we evaluated. Equivalent to files for which all
	// atom matches (including negations) evaluated to true.
	FilesConsidered int

	// Files for which we loaded file content to verify substring matches
	FilesLoaded int

	// Candidate files whose contents weren't examined because we
	// gathered enough matches.
	FilesSkipped int

	// Shards that we scanned to find matches.
	ShardsScanned int

	// Shards that we did not process because a query was canceled.
	ShardsSkipped int

	// Shards that we did not process because the query was rejected by the
	// ngram filter indicating it had no matches.
	ShardsSkippedFilter int

	// Number of non-overlapping matches
	MatchCount int

	// Number of candidate matches as a result of searching ngrams.
	NgramMatches int

	// NgramLookups is the number of times we accessed an ngram in the index.
	NgramLookups int

	// Wall clock time for queued search.
	Wait time.Duration

	// Aggregate wall clock time spent constructing and pruning the match tree.
	// This accounts for time such as lookups in the trigram index.
	MatchTreeConstruction time.Duration

	// Aggregate wall clock time spent searching the match tree. This accounts
	// for the bulk of search work done looking for matches.
	MatchTreeSearch time.Duration

	// Number of times regexp was called on files that we evaluated.
	RegexpsConsidered int

	// FlushReason explains why results were flushed. When stats are combined
	// with Add, the first non-zero FlushReason is kept.
	FlushReason FlushReason
}
 439
 440func (s *Stats) sizeBytes() (sz uint64) {
 441	sz = 16 * 8 // This assumes we are running on a 64-bit architecture
 442	sz += 1     // FlushReason
 443
 444	return
 445}
 446
 447func (s *Stats) Add(o Stats) {
 448	s.ContentBytesLoaded += o.ContentBytesLoaded
 449	s.IndexBytesLoaded += o.IndexBytesLoaded
 450	s.Crashes += o.Crashes
 451	s.FileCount += o.FileCount
 452	s.FilesConsidered += o.FilesConsidered
 453	s.FilesLoaded += o.FilesLoaded
 454	s.FilesSkipped += o.FilesSkipped
 455	s.MatchCount += o.MatchCount
 456	s.NgramMatches += o.NgramMatches
 457	s.NgramLookups += o.NgramLookups
 458	s.ShardFilesConsidered += o.ShardFilesConsidered
 459	s.ShardsScanned += o.ShardsScanned
 460	s.ShardsSkipped += o.ShardsSkipped
 461	s.ShardsSkippedFilter += o.ShardsSkippedFilter
 462	s.Wait += o.Wait
 463	s.MatchTreeConstruction += o.MatchTreeConstruction
 464	s.MatchTreeSearch += o.MatchTreeSearch
 465	s.RegexpsConsidered += o.RegexpsConsidered
 466
 467	// We want the first non-zero FlushReason to be sticky. This is a useful
 468	// property when aggregating stats from several Zoekts.
 469	if s.FlushReason == 0 {
 470		s.FlushReason = o.FlushReason
 471	}
 472}
 473
 474// Zero returns true if stats is empty.
 475func (s *Stats) Zero() bool {
 476	if s == nil {
 477		return true
 478	}
 479
 480	return !(s.ContentBytesLoaded > 0 ||
 481		s.IndexBytesLoaded > 0 ||
 482		s.Crashes > 0 ||
 483		s.FileCount > 0 ||
 484		s.FilesConsidered > 0 ||
 485		s.FilesLoaded > 0 ||
 486		s.FilesSkipped > 0 ||
 487		s.MatchCount > 0 ||
 488		s.NgramMatches > 0 ||
 489		s.NgramLookups > 0 ||
 490		s.ShardFilesConsidered > 0 ||
 491		s.ShardsScanned > 0 ||
 492		s.ShardsSkipped > 0 ||
 493		s.ShardsSkippedFilter > 0 ||
 494		s.Wait > 0 ||
 495		s.MatchTreeConstruction > 0 ||
 496		s.MatchTreeSearch > 0 ||
 497		s.RegexpsConsidered > 0)
 498}
 499
 500// Progress contains information about the global progress of the running search query.
 501// This is used by the frontend to reorder results and emit them when stable.
 502// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances.
 503type Progress struct {
 504	// Priority of the shard that was searched.
 505	Priority float64
 506
 507	// MaxPendingPriority is the maximum priority of pending result that is being searched in parallel.
 508	// This is used to reorder results when the result set is known to be stable-- that is, when a result's
 509	// Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user.
 510	//
 511	// MaxPendingPriority decreases monotonically in each SearchResult.
 512	MaxPendingPriority float64
 513}
 514
 515func (p *Progress) sizeBytes() uint64 {
 516	return 2 * 8
 517}
 518
// SearchResult contains search matches and extra data
type SearchResult struct {
	// Stats holds the counters and timings for the search; it is embedded, so
	// its fields are promoted onto SearchResult.
	Stats

	// Do not encode this as we cannot encode -Inf in JSON
	Progress `json:"-"`

	// Files holds the matches, one entry per matching file.
	Files []FileMatch

	// RepoURLs holds a repo => template string map.
	RepoURLs map[string]string

	// LineFragments holds a repo => template string map, for
	// the line number fragment.
	LineFragments map[string]string
}
 535
 536// SizeBytes is a best-effort estimate of the size of SearchResult in memory.
 537// The estimate does not take alignment into account. The result is a lower
 538// bound on the actual size in memory.
 539func (sr *SearchResult) SizeBytes() (sz uint64) {
 540	sz += sr.Stats.sizeBytes()
 541	sz += sr.Progress.sizeBytes()
 542
 543	// Files
 544	sz += sliceHeaderBytes
 545	for _, f := range sr.Files {
 546		sz += f.sizeBytes()
 547	}
 548
 549	// RepoURLs
 550	sz += mapHeaderBytes
 551	for k, v := range sr.RepoURLs {
 552		sz += stringHeaderBytes + uint64(len(k))
 553		sz += stringHeaderBytes + uint64(len(v))
 554	}
 555
 556	// LineFragments
 557	sz += mapHeaderBytes
 558	for k, v := range sr.LineFragments {
 559		sz += stringHeaderBytes + uint64(len(k))
 560		sz += stringHeaderBytes + uint64(len(v))
 561	}
 562
 563	return
 564}
 565
 566// RepositoryBranch describes an indexed branch, which is a name
 567// combined with a version.
 568type RepositoryBranch struct {
 569	Name    string
 570	Version string
 571}
 572
 573func (r RepositoryBranch) String() string {
 574	return fmt.Sprintf("%s@%s", r.Name, r.Version)
 575}
 576
// Repository holds repository metadata.
type Repository struct {
	// Sourcegraph's tenant ID. When decoding from JSON it is overwritten from
	// RawConfig["tenantID"] if that key is present.
	TenantID int

	// Sourcegraph's repository ID. When decoding from JSON it is overwritten
	// from RawConfig["repoid"] if that key is present.
	ID uint32

	// The repository name
	Name string

	// The repository URL.
	URL string

	// The physical source where this repo came from, eg. full
	// path to the zip filename or git repository directory. This
	// will not be exposed in the UI, but can be used to detect
	// orphaned index shards.
	Source string

	// The branches indexed in this repo.
	Branches []RepositoryBranch

	// Nil if this is not the super project.
	SubRepoMap map[string]*Repository

	// URL template to link to the commit of a branch
	CommitURLTemplate string

	// The repository URL for getting to a file.  Has access to
	// {{.Version}}, {{.Path}}
	FileURLTemplate string

	// The URL fragment to add to a file URL for line numbers. has
	// access to {{.LineNumber}}. The fragment should include the
	// separator, generally '#' or ';'.
	LineFragmentTemplate string

	// Perf optimization: priority is set when we load the shard. It corresponds to
	// the value of "priority" stored in RawConfig.
	priority float64

	// All zoekt.* configuration settings.
	RawConfig map[string]string

	// Importance of the repository, bigger is more important
	Rank uint16

	// IndexOptions is a hash of the options used to create the index for the
	// repo.
	IndexOptions string

	// HasSymbols is true if this repository has indexed ctags
	// output. Sourcegraph specific: This field is more appropriate for
	// IndexMetadata. However, we store it here since the Sourcegraph frontend
	// can read this structure but not IndexMetadata.
	HasSymbols bool

	// Tombstone is true if we are not allowed to search this repo.
	Tombstone bool

	// LatestCommitDate is the date of the latest commit among all indexed Branches.
	// The date might be time.Time's 0-value if the repository was last indexed
	// before this field was added.
	LatestCommitDate time.Time

	// FileTombstones is a set of file paths that should be ignored across all branches
	// in this shard.
	FileTombstones map[string]struct{} `json:",omitempty"`
}
 647
 648func (r *Repository) UnmarshalJSON(data []byte) error {
 649	// We define a new type so that we can use json.Unmarshal
 650	// without recursing into this same method.
 651	type repository *Repository
 652	repo := repository(r)
 653
 654	err := json.Unmarshal(data, repo)
 655	if err != nil {
 656		return err
 657	}
 658
 659	if v, ok := repo.RawConfig["repoid"]; ok {
 660		id, _ := strconv.ParseUint(v, 10, 32)
 661		r.ID = uint32(id)
 662	}
 663
 664	if v, ok := repo.RawConfig["tenantID"]; ok {
 665		id, _ := strconv.ParseInt(v, 10, 64)
 666		r.TenantID = int(id)
 667	}
 668
 669	// Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it
 670	// on read instead of during indexing allows us to avoid a complete reindex.
 671	//
 672	// Prefer "latestCommitDate" over "priority" for ranking. We keep priority for
 673	// backwards compatibility.
 674	if _, ok := repo.RawConfig["latestCommitDate"]; ok {
 675		// We use the number of months since 1970 as a simple measure of repo freshness.
 676		// It is monotonically increasing and stable across re-indexes and restarts.
 677		r.Rank = monthsSince1970(repo.LatestCommitDate)
 678	} else if v, ok := repo.RawConfig["priority"]; ok {
 679		r.priority, err = strconv.ParseFloat(v, 64)
 680		if err != nil {
 681			r.priority = 0
 682		}
 683
 684		// Sourcegraph indexserver doesn't set repo.Rank, so we set it here
 685		// based on priority. Setting it on read instead of during indexing
 686		// allows us to avoid a complete reindex.
 687		if r.Rank == 0 && r.priority > 0 {
 688			// Normalize the repo score within [0, maxUint16), with the midpoint at 5,000.
 689			// This means popular repos (roughly ones with over 5,000 stars) see diminishing
 690			// returns from more stars.
 691			r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16)
 692		}
 693	}
 694
 695	return nil
 696}
 697
 698// monthsSince1970 returns the number of months since 1970. It returns values in
 699// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the
 700// lower bound for all dates before 1970.
 701func monthsSince1970(t time.Time) uint16 {
 702	base := time.Unix(0, 0)
 703	if t.Before(base) {
 704		return 0
 705	}
 706	months := int(t.Year()-1970)*12 + int(t.Month()-1)
 707	return uint16(min(months, math.MaxUint16))
 708}
 709
 710// MergeMutable will merge x into r. mutated will be true if it made any
 711// changes. err is non-nil if we needed to mutate an immutable field.
 712//
 713// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are
 714// computed while indexing so can't be synthesized from x.
 715//
 716// Note: We ignore RawConfig fields which are duplicated into Repository:
 717// name and id.
 718func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) {
 719	if r.ID != x.ID {
 720		// Sourcegraph: strange behaviour may occur if ID changes but names don't.
 721		return mutated, errors.New("ID is immutable")
 722	}
 723	if r.Name != x.Name {
 724		// Name is encoded into the shard name on disk. We need to re-index if it
 725		// changes.
 726		return mutated, errors.New("Name is immutable")
 727	}
 728	if !reflect.DeepEqual(r.Branches, x.Branches) {
 729		// Need a reindex if content changing.
 730		return mutated, errors.New("Branches is immutable")
 731	}
 732
 733	for k, v := range x.RawConfig {
 734		// We ignore name and id since they are encoded into the repository.
 735		if k == "name" || k == "id" {
 736			continue
 737		}
 738		if r.RawConfig == nil {
 739			mutated = true
 740			r.RawConfig = make(map[string]string)
 741		}
 742		if r.RawConfig[k] != v {
 743			mutated = true
 744			r.RawConfig[k] = v
 745		}
 746	}
 747
 748	if r.URL != x.URL {
 749		mutated = true
 750		r.URL = x.URL
 751	}
 752	if r.CommitURLTemplate != x.CommitURLTemplate {
 753		mutated = true
 754		r.CommitURLTemplate = x.CommitURLTemplate
 755	}
 756	if r.FileURLTemplate != x.FileURLTemplate {
 757		mutated = true
 758		r.FileURLTemplate = x.FileURLTemplate
 759	}
 760	if r.LineFragmentTemplate != x.LineFragmentTemplate {
 761		mutated = true
 762		r.LineFragmentTemplate = x.LineFragmentTemplate
 763	}
 764
 765	return mutated, nil
 766}
 767
// IndexMetadata holds metadata stored in the index file. It contains
// data generated by the core indexing library.
//
// NOTE(review): the per-field notes below are inferred from the field names
// and types only; confirm against the indexing code.
type IndexMetadata struct {
	IndexFormatVersion    int
	IndexFeatureVersion   int
	IndexMinReaderVersion int
	IndexTime             time.Time         // presumably when the index was built
	PlainASCII            bool              // presumably true when all indexed content is ASCII
	LanguageMap           map[string]uint16 // presumably language name => numeric code
	ZoektVersion          string
	ID                    string
}
 780
 781// Statistics of a (collection of) repositories.
 782type RepoStats struct {
 783	// Repos is used for aggregrating the number of repositories.
 784	//
 785	// Note: This field is not populated on RepoListEntry.Stats (individual) but
 786	// only for RepoList.Stats (aggregate).
 787	Repos int
 788
 789	// Shards is the total number of search shards.
 790	Shards int
 791
 792	// Documents holds the number of documents or files.
 793	Documents int
 794
 795	// IndexBytes is the amount of RAM used for index overhead.
 796	IndexBytes int64
 797
 798	// ContentBytes is the amount of RAM used for raw content.
 799	ContentBytes int64
 800
 801	// Sourcegraph specific stats below. These are not as efficient to calculate
 802	// as the above statistics. We experimentally measured about a 10% slower
 803	// shard load time. However, we find these values very useful to track and
 804	// computing them outside of load time introduces a lot of complexity.
 805
 806	// NewLinesCount is the number of newlines "\n" that appear in the zoekt
 807	// indexed documents. This is not exactly the same as line count, since it
 808	// will not include lines not terminated by "\n" (eg a file with no "\n", or
 809	// a final line without "\n"). Note: Zoekt deduplicates documents across
 810	// branches, so if a path has the same contents on multiple branches, there
 811	// is only one document for it. As such that document's newlines is only
 812	// counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount
 813	// for counts which do not deduplicate.
 814	NewLinesCount uint64
 815
 816	// DefaultBranchNewLinesCount is the number of newlines "\n" in the default
 817	// branch.
 818	DefaultBranchNewLinesCount uint64
 819
 820	// OtherBranchesNewLinesCount is the number of newlines "\n" in all branches
 821	// except the default branch.
 822	OtherBranchesNewLinesCount uint64
 823}
 824
 825func (s *RepoStats) Add(o *RepoStats) {
 826	// can't update Repos, since one repo may have multiple
 827	// shards.
 828	s.Shards += o.Shards
 829	s.IndexBytes += o.IndexBytes
 830	s.Documents += o.Documents
 831	s.ContentBytes += o.ContentBytes
 832
 833	// Sourcegraph specific
 834	s.NewLinesCount += o.NewLinesCount
 835	s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount
 836	s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount
 837}
 838
// RepoListEntry bundles the metadata, index metadata and statistics of a
// single repository.
type RepoListEntry struct {
	Repository    Repository
	IndexMetadata IndexMetadata
	Stats         RepoStats
}
 844
// MinimalRepoListEntry is a subset of RepoListEntry. It was added after
// performance profiling of sourcegraph.com revealed that querying this
// information from Zoekt was causing lots of CPU and memory usage. Note: we
// can revisit this, how we store and query this information has changed a lot
// since this was introduced.
type MinimalRepoListEntry struct {
	// HasSymbols is exported since Sourcegraph uses this information at search
	// planning time to decide between Zoekt and an unindexed symbol search.
	//
	// Note: it pretty much is always true in practice.
	HasSymbols bool

	// Branches is used by Sourcegraph's query planner to decide if it can use
	// zoekt or go via an unindexed code path.
	Branches []RepositoryBranch

	// IndexTimeUnix is the IndexTime converted to unix time (number of seconds
	// since the epoch). This is to make it clear we are not transporting the
	// full fidelity timestamp (ie with milliseconds and location). Additionally
	// it saves 16 bytes in this struct.
	//
	// IndexTime is used as a heuristic in Sourcegraph to decide in aggregate
	// how many repositories need updating after a ranking change/etc.
	//
	// TODO(keegancsmith) audit updates to IndexTime and document how and when
	// it changes. Concerned about things like metadata updates or compound
	// shards leading to untrustworthy data here.
	IndexTimeUnix int64
}
 874
 875type ReposMap map[uint32]MinimalRepoListEntry
 876
 877// MarshalBinary implements a specialized encoder for ReposMap.
 878func (q *ReposMap) MarshalBinary() ([]byte, error) {
 879	return reposMapEncode(*q)
 880}
 881
 882// UnmarshalBinary implements a specialized decoder for ReposMap.
 883func (q *ReposMap) UnmarshalBinary(b []byte) error {
 884	var err error
 885	(*q), err = reposMapDecode(b)
 886	return err
 887}
 888
// RepoList holds a set of Repository metadata.
type RepoList struct {
	// Returned when ListOptions.Field is RepoListFieldRepos.
	Repos []*RepoListEntry

	// ReposMap is set when ListOptions.Field is RepoListFieldReposMap.
	ReposMap ReposMap

	// Crashes is a crash counter for the List request.
	// NOTE(review): presumably the number of shards that crashed while
	// listing, mirroring Stats.Crashes for searches — confirm.
	Crashes int

	// Stats response to a List request.
	// This is the aggregate RepoStats of all repos matching the input query.
	Stats RepoStats
}
 903
// Searcher is the interface for running search queries and listing
// repositories.
type Searcher interface {
	// Search evaluates the query q with the given options and returns the
	// result.
	Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error)

	// List lists repositories. The query `q` can only contain
	// query.Repo atoms.
	List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error)

	// Close closes the searcher.
	// NOTE(review): what is released is implementation specific — confirm
	// per implementation.
	Close()

	// Describe the searcher for debug messages.
	String() string
}
 915
 916type RepoListField int
 917
 918const (
 919	RepoListFieldRepos    RepoListField = 0
 920	RepoListFieldReposMap               = 2
 921)
 922
 923type ListOptions struct {
 924	// Field decides which field to populate in RepoList response.
 925	Field RepoListField
 926}
 927
 928func (o *ListOptions) GetField() (RepoListField, error) {
 929	if o == nil {
 930		return RepoListFieldRepos, nil
 931	}
 932	switch o.Field {
 933	case RepoListFieldRepos, RepoListFieldReposMap:
 934		return o.Field, nil
 935	case 1:
 936		return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field)
 937	default:
 938		return 0, fmt.Errorf("unknown RepoListField %d", o.Field)
 939	}
 940}
 941
 942func (o *ListOptions) String() string {
 943	return fmt.Sprintf("%#v", o)
 944}
 945
 946type SearchOptions struct {
 947	// Return an upper-bound estimate of eligible documents in
 948	// stats.ShardFilesConsidered.
 949	EstimateDocCount bool
 950
 951	// Return the whole file.
 952	Whole bool
 953
 954	// Maximum number of matches: skip all processing an index
 955	// shard after we found this many non-overlapping matches.
 956	ShardMaxMatchCount int
 957
 958	// Maximum number of matches: stop looking for more matches
 959	// once we have this many matches across shards.
 960	TotalMaxMatchCount int
 961
 962	// Maximum number of matches: skip processing documents for a repository in
 963	// a shard once we have found ShardRepoMaxMatchCount.
 964	//
 965	// A compound shard may contain multiple repositories. This will most often
 966	// be set to 1 to find all repositories containing a result.
 967	ShardRepoMaxMatchCount int
 968
 969	// Abort the search after this much time has passed.
 970	MaxWallTime time.Duration
 971
 972	// FlushWallTime if non-zero will stop streaming behaviour at first and
 973	// instead will collate and sort results. At FlushWallTime the results will
 974	// be sent and then the behaviour will revert to the normal streaming.
 975	FlushWallTime time.Duration
 976
 977	// Truncates the number of documents (i.e. files) after collating and
 978	// sorting the results.
 979	MaxDocDisplayCount int
 980
 981	// Truncates the number of matchs after collating and sorting the results.
 982	MaxMatchDisplayCount int
 983
 984	// If set to a number greater than zero then up to this many number
 985	// of context lines will be added before and after each matched line.
 986	// Note that the included context lines might contain matches and
 987	// it's up to the consumer of the result to remove those lines.
 988	NumContextLines int
 989
 990	// If true, ChunkMatches will be returned in each FileMatch rather than LineMatches
 991	// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
 992	ChunkMatches bool
 993
 994	// EXPERIMENTAL. If true, use text-search style scoring instead of the default
 995	// scoring formula. The scoring algorithm treats each match in a file as a term
 996	// and computes an approximation to BM25. When enabled, BM25 scoring is used for
 997	// the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores.
 998	//
 999	// The calculation of IDF assumes that Zoekt visits all documents containing any
1000	// of the query terms during evaluation. This is true, for example, if all query
1001	// terms are ORed together.
1002	//
1003	// When enabled, all other scoring signals are ignored, including document ranks.
1004	UseBM25Scoring bool
1005
1006	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
1007	// a command-line flag
1008	Trace bool
1009
1010	// If set, the search results will contain debug information for scoring.
1011	DebugScore bool
1012
1013	// SpanContext is the opentracing span context, if it exists, from the zoekt client
1014	SpanContext map[string]string
1015}
1016
1017// String returns a succinct representation of the options. This is meant for
1018// human consumption in logs and traces.
1019//
1020// Note: some tracing systems have limits on length of values, so we take care
1021// to try and make this small, and include the important information near the
1022// front incase of truncation.
1023func (s *SearchOptions) String() string {
1024	var b strings.Builder
1025
1026	add := func(name, value string) {
1027		b.WriteString(name)
1028		b.WriteByte('=')
1029		b.WriteString(value)
1030		b.WriteByte(' ')
1031	}
1032	addInt := func(name string, value int) {
1033		if value != 0 {
1034			add(name, strconv.Itoa(value))
1035		}
1036	}
1037	addDuration := func(name string, value time.Duration) {
1038		if value != 0 {
1039			add(name, value.String())
1040		}
1041	}
1042	addBool := func(name string, value bool) {
1043		if !value {
1044			return
1045		}
1046		b.WriteString(name)
1047		b.WriteByte(' ')
1048	}
1049
1050	b.WriteString("zoekt.SearchOptions{ ")
1051
1052	addInt("ShardMaxMatchCount", s.ShardMaxMatchCount)
1053	addInt("TotalMaxMatchCount", s.TotalMaxMatchCount)
1054	addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount)
1055	addInt("MaxDocDisplayCount", s.MaxDocDisplayCount)
1056	addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount)
1057	addInt("NumContextLines", s.NumContextLines)
1058
1059	addDuration("MaxWallTime", s.MaxWallTime)
1060	addDuration("FlushWallTime", s.FlushWallTime)
1061
1062	addBool("EstimateDocCount", s.EstimateDocCount)
1063	addBool("Whole", s.Whole)
1064	addBool("ChunkMatches", s.ChunkMatches)
1065	addBool("UseBM25Scoring", s.UseBM25Scoring)
1066	addBool("Trace", s.Trace)
1067	addBool("DebugScore", s.DebugScore)
1068
1069	for k, v := range s.SpanContext {
1070		add("SpanContext."+k, strconv.Quote(v))
1071	}
1072
1073	b.WriteByte('}')
1074	return b.String()
1075}
1076
// Sender is the interface that wraps the basic Send method.
type Sender interface {
	// Send delivers one SearchResult to the receiver.
	Send(*SearchResult)
}

// SenderFunc is an adapter to allow the use of ordinary functions as Sender.
// If f is a function with the appropriate signature, SenderFunc(f) is a Sender
// that calls f.
type SenderFunc func(result *SearchResult)

// Send calls f(result).
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}
1090
// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	// StreamSearch runs the query q with the given options and passes results
	// to sender instead of returning them.
	// NOTE(review): how many times sender.Send is called, and whether calls
	// may be concurrent, is implementation specific — confirm.
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}
Configure Feed

Configure Feed