fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "math" 23 "reflect" 24 "strconv" 25 "strings" 26 "time" 27 28 "github.com/sourcegraph/zoekt/query" 29) 30 31const ( 32 mapHeaderBytes uint64 = 48 33 sliceHeaderBytes uint64 = 24 34 stringHeaderBytes uint64 = 16 35 pointerSize uint64 = 8 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// AddScore increments the score of the FileMatch by the computed score. If 139// debugScore is true, it also adds a debug string to the FileMatch. If raw is 140// -1, it is ignored. Otherwise, it is added to the debug string. 141func (m *FileMatch) AddScore(what string, computed float64, raw float64, debugScore bool) { 142 if computed != 0 && debugScore { 143 var b strings.Builder 144 fmt.Fprintf(&b, "%s", what) 145 if raw != -1 { 146 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 147 } 148 fmt.Fprintf(&b, ":%.2f, ", computed) 149 m.Debug += b.String() 150 } 151 m.Score += computed 152} 153 154// ChunkMatch is a set of non-overlapping matches within a contiguous range of 155// lines in the file. 156type ChunkMatch struct { 157 DebugScore string 158 159 // Content is a contiguous range of complete lines that fully contains Ranges. 160 // Lines will always include their terminating newline (if it exists). 161 Content []byte 162 163 // Ranges is a set of matching ranges within this chunk. Each range is relative 164 // to the beginning of the file (not the beginning of Content). 165 Ranges []Range 166 167 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 168 // its length will equal that of Ranges. Any of its elements may be nil. 169 SymbolInfo []*Symbol 170 171 // FileName indicates whether this match is a match on the file name, in 172 // which case Content will contain the file name. 173 FileName bool 174 175 // ContentStart is the location (inclusive) of the beginning of content 176 // relative to the beginning of the file. It will always be at the 177 // beginning of a line (Column will always be 1). 178 ContentStart Location 179 180 // Score is the overall relevance score of this chunk. 181 Score float64 182 183 // BestLineMatch is the line number of the highest-scoring line match in this chunk. 184 // The line number represents the index in the full file, and is 1-based. If FileName: true, 185 // this number will be 0. 186 BestLineMatch uint32 187} 188 189func (cm *ChunkMatch) sizeBytes() (sz uint64) { 190 // Content 191 sz += sliceHeaderBytes + uint64(len(cm.Content)) 192 193 // ContentStart 194 sz += cm.ContentStart.sizeBytes() 195 196 // FileName 197 sz += 1 198 199 // Ranges 200 sz += sliceHeaderBytes 201 if len(cm.Ranges) > 0 { 202 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 203 } 204 205 // SymbolInfo 206 sz += sliceHeaderBytes 207 for _, si := range cm.SymbolInfo { 208 sz += pointerSize 209 if si != nil { 210 sz += si.sizeBytes() 211 } 212 } 213 214 // Score 215 sz += 8 216 217 // DebugScore 218 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 219 220 return 221} 222 223type Range struct { 224 // The inclusive beginning of the range. 225 Start Location 226 // The exclusive end of the range. 227 End Location 228} 229 230func (r *Range) sizeBytes() uint64 { 231 return r.Start.sizeBytes() + r.End.sizeBytes() 232} 233 234type Location struct { 235 // 0-based byte offset from the beginning of the file 236 ByteOffset uint32 237 // 1-based line number from the beginning of the file 238 LineNumber uint32 239 // 1-based column number (in runes) from the beginning of line 240 Column uint32 241} 242 243func (l *Location) sizeBytes() uint64 { 244 return 3 * 4 245} 246 247// LineMatch holds the matches within a single line in a file. 248type LineMatch struct { 249 // The line in which a match was found. 250 Line []byte 251 // The byte offset of the first byte of the line. 252 LineStart int 253 // The byte offset of the first byte past the end of the line. 254 // This is usually the byte after the terminating newline, but can also be 255 // the end of the file if there is no terminating newline 256 LineEnd int 257 LineNumber int 258 259 // Before and After are only set when SearchOptions.NumContextLines is > 0 260 Before []byte 261 After []byte 262 263 // If set, this was a match on the filename. 264 FileName bool 265 266 // The higher the better. Only ranks the quality of the match 267 // within the file, does not take rank of file into account 268 Score float64 269 DebugScore string 270 271 LineFragments []LineFragmentMatch 272} 273 274func (lm *LineMatch) sizeBytes() (sz uint64) { 275 // Line 276 sz += sliceHeaderBytes + uint64(len(lm.Line)) 277 278 // LineStart, LineEnd, LineNumber 279 sz += 3 * 8 280 281 // Before 282 sz += sliceHeaderBytes + uint64(len(lm.Before)) 283 284 // After 285 sz += sliceHeaderBytes + uint64(len(lm.After)) 286 287 // FileName 288 sz += 1 289 290 // Score 291 sz += 8 292 293 // DebugScore 294 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 295 296 // LineFragments 297 sz += sliceHeaderBytes 298 for _, lf := range lm.LineFragments { 299 sz += lf.sizeBytes() 300 } 301 302 return 303} 304 305type Symbol struct { 306 Sym string 307 Kind string 308 Parent string 309 ParentKind string 310} 311 312func (s *Symbol) sizeBytes() uint64 { 313 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 314} 315 316// LineFragmentMatch a segment of matching text within a line. 317type LineFragmentMatch struct { 318 // Offset within the line, in bytes. 319 LineOffset int 320 321 // Offset from file start, in bytes. 322 Offset uint32 323 324 // Number bytes that match. 325 MatchLength int 326 327 SymbolInfo *Symbol 328} 329 330func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 331 // LineOffset 332 sz += 8 333 334 // Offset 335 sz += 4 336 337 // MatchLength 338 sz += 8 339 340 // SymbolInfo 341 sz += pointerSize 342 if lfm.SymbolInfo != nil { 343 sz += lfm.SymbolInfo.sizeBytes() 344 } 345 346 return 347} 348 349type FlushReason uint8 350 351const ( 352 FlushReasonTimerExpired FlushReason = 1 << iota 353 FlushReasonFinalFlush 354 FlushReasonMaxSize 355) 356 357var FlushReasonStrings = map[FlushReason]string{ 358 FlushReasonTimerExpired: "timer_expired", 359 FlushReasonFinalFlush: "final_flush", 360 FlushReasonMaxSize: "max_size_reached", 361} 362 363func (fr FlushReason) String() string { 364 if v, ok := FlushReasonStrings[fr]; ok { 365 return v 366 } 367 368 return "none" 369} 370 371// Stats contains interesting numbers on the search 372type Stats struct { 373 // Amount of I/O for reading contents. 374 ContentBytesLoaded int64 375 376 // Amount of I/O for reading from index. 377 IndexBytesLoaded int64 378 379 // Number of search shards that had a crash. 380 Crashes int 381 382 // Wall clock time for this search 383 Duration time.Duration 384 385 // Number of files containing a match. 386 FileCount int 387 388 // Number of files in shards that we considered. 389 ShardFilesConsidered int 390 391 // Files that we evaluated. Equivalent to files for which all 392 // atom matches (including negations) evaluated to true. 393 FilesConsidered int 394 395 // Files for which we loaded file content to verify substring matches 396 FilesLoaded int 397 398 // Candidate files whose contents weren't examined because we 399 // gathered enough matches. 400 FilesSkipped int 401 402 // Shards that we scanned to find matches. 403 ShardsScanned int 404 405 // Shards that we did not process because a query was canceled. 406 ShardsSkipped int 407 408 // Shards that we did not process because the query was rejected by the 409 // ngram filter indicating it had no matches. 410 ShardsSkippedFilter int 411 412 // Number of non-overlapping matches 413 MatchCount int 414 415 // Number of candidate matches as a result of searching ngrams. 416 NgramMatches int 417 418 // NgramLookups is the number of times we accessed an ngram in the index. 419 NgramLookups int 420 421 // Wall clock time for queued search. 422 Wait time.Duration 423 424 // Aggregate wall clock time spent constructing and pruning the match tree. 425 // This accounts for time such as lookups in the trigram index. 426 MatchTreeConstruction time.Duration 427 428 // Aggregate wall clock time spent searching the match tree. This accounts 429 // for the bulk of search work done looking for matches. 430 MatchTreeSearch time.Duration 431 432 // Number of times regexp was called on files that we evaluated. 433 RegexpsConsidered int 434 435 // FlushReason explains why results were flushed. 436 FlushReason FlushReason 437} 438 439func (s *Stats) sizeBytes() (sz uint64) { 440 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 441 sz += 1 // FlushReason 442 443 return 444} 445 446func (s *Stats) Add(o Stats) { 447 s.ContentBytesLoaded += o.ContentBytesLoaded 448 s.IndexBytesLoaded += o.IndexBytesLoaded 449 s.Crashes += o.Crashes 450 s.FileCount += o.FileCount 451 s.FilesConsidered += o.FilesConsidered 452 s.FilesLoaded += o.FilesLoaded 453 s.FilesSkipped += o.FilesSkipped 454 s.MatchCount += o.MatchCount 455 s.NgramMatches += o.NgramMatches 456 s.NgramLookups += o.NgramLookups 457 s.ShardFilesConsidered += o.ShardFilesConsidered 458 s.ShardsScanned += o.ShardsScanned 459 s.ShardsSkipped += o.ShardsSkipped 460 s.ShardsSkippedFilter += o.ShardsSkippedFilter 461 s.Wait += o.Wait 462 s.MatchTreeConstruction += o.MatchTreeConstruction 463 s.MatchTreeSearch += o.MatchTreeSearch 464 s.RegexpsConsidered += o.RegexpsConsidered 465 466 // We want the first non-zero FlushReason to be sticky. This is a useful 467 // property when aggregating stats from several Zoekts. 468 if s.FlushReason == 0 { 469 s.FlushReason = o.FlushReason 470 } 471} 472 473// Zero returns true if stats is empty. 474func (s *Stats) Zero() bool { 475 if s == nil { 476 return true 477 } 478 479 return !(s.ContentBytesLoaded > 0 || 480 s.IndexBytesLoaded > 0 || 481 s.Crashes > 0 || 482 s.FileCount > 0 || 483 s.FilesConsidered > 0 || 484 s.FilesLoaded > 0 || 485 s.FilesSkipped > 0 || 486 s.MatchCount > 0 || 487 s.NgramMatches > 0 || 488 s.NgramLookups > 0 || 489 s.ShardFilesConsidered > 0 || 490 s.ShardsScanned > 0 || 491 s.ShardsSkipped > 0 || 492 s.ShardsSkippedFilter > 0 || 493 s.Wait > 0 || 494 s.MatchTreeConstruction > 0 || 495 s.MatchTreeSearch > 0 || 496 s.RegexpsConsidered > 0) 497} 498 499// Progress contains information about the global progress of the running search query. 500// This is used by the frontend to reorder results and emit them when stable. 501// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 502type Progress struct { 503 // Priority of the shard that was searched. 504 Priority float64 505 506 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 507 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 508 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 509 // 510 // MaxPendingPriority decreases monotonically in each SearchResult. 511 MaxPendingPriority float64 512} 513 514func (p *Progress) sizeBytes() uint64 { 515 return 2 * 8 516} 517 518// SearchResult contains search matches and extra data 519type SearchResult struct { 520 Stats 521 522 // Do not encode this as we cannot encode -Inf in JSON 523 Progress `json:"-"` 524 525 Files []FileMatch 526 527 // RepoURLs holds a repo => template string map. 528 RepoURLs map[string]string 529 530 // FragmentNames holds a repo => template string map, for 531 // the line number fragment. 532 LineFragments map[string]string 533} 534 535// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 536// The estimate does not take alignment into account. The result is a lower 537// bound on the actual size in memory. 538func (sr *SearchResult) SizeBytes() (sz uint64) { 539 sz += sr.Stats.sizeBytes() 540 sz += sr.Progress.sizeBytes() 541 542 // Files 543 sz += sliceHeaderBytes 544 for _, f := range sr.Files { 545 sz += f.sizeBytes() 546 } 547 548 // RepoURLs 549 sz += mapHeaderBytes 550 for k, v := range sr.RepoURLs { 551 sz += stringHeaderBytes + uint64(len(k)) 552 sz += stringHeaderBytes + uint64(len(v)) 553 } 554 555 // LineFragments 556 sz += mapHeaderBytes 557 for k, v := range sr.LineFragments { 558 sz += stringHeaderBytes + uint64(len(k)) 559 sz += stringHeaderBytes + uint64(len(v)) 560 } 561 562 return 563} 564 565// RepositoryBranch describes an indexed branch, which is a name 566// combined with a version. 567type RepositoryBranch struct { 568 Name string 569 Version string 570} 571 572func (r RepositoryBranch) String() string { 573 return fmt.Sprintf("%s@%s", r.Name, r.Version) 574} 575 576// Repository holds repository metadata. 577type Repository struct { 578 // Sourcegraph's tenant ID 579 TenantID int 580 581 // Sourcegraph's repository ID 582 ID uint32 583 584 // The repository name 585 Name string 586 587 // The repository URL. 588 URL string 589 590 // The physical source where this repo came from, eg. full 591 // path to the zip filename or git repository directory. This 592 // will not be exposed in the UI, but can be used to detect 593 // orphaned index shards. 594 Source string 595 596 // The branches indexed in this repo. 597 Branches []RepositoryBranch 598 599 // Nil if this is not the super project. 600 SubRepoMap map[string]*Repository 601 602 // URL template to link to the commit of a branch 603 CommitURLTemplate string 604 605 // The repository URL for getting to a file. Has access to 606 // {{.Version}}, {{.Path}} 607 FileURLTemplate string 608 609 // The URL fragment to add to a file URL for line numbers. has 610 // access to {{.LineNumber}}. The fragment should include the 611 // separator, generally '#' or ';'. 612 LineFragmentTemplate string 613 614 // Perf optimization: priority is set when we load the shard. It corresponds to 615 // the value of "priority" stored in RawConfig. 616 priority float64 617 618 // All zoekt.* configuration settings. 619 RawConfig map[string]string 620 621 // Importance of the repository, bigger is more important 622 Rank uint16 623 624 // IndexOptions is a hash of the options used to create the index for the 625 // repo. 626 IndexOptions string 627 628 // HasSymbols is true if this repository has indexed ctags 629 // output. Sourcegraph specific: This field is more appropriate for 630 // IndexMetadata. However, we store it here since the Sourcegraph frontend 631 // can read this structure but not IndexMetadata. 632 HasSymbols bool 633 634 // Tombstone is true if we are not allowed to search this repo. 635 Tombstone bool 636 637 // LatestCommitDate is the date of the latest commit among all indexed Branches. 638 // The date might be time.Time's 0-value if the repository was last indexed 639 // before this field was added. 640 LatestCommitDate time.Time 641 642 // FileTombstones is a set of file paths that should be ignored across all branches 643 // in this shard. 644 FileTombstones map[string]struct{} `json:",omitempty"` 645} 646 647func (r *Repository) UnmarshalJSON(data []byte) error { 648 // We define a new type so that we can use json.Unmarshal 649 // without recursing into this same method. 650 type repository *Repository 651 repo := repository(r) 652 653 err := json.Unmarshal(data, repo) 654 if err != nil { 655 return err 656 } 657 658 if v, ok := repo.RawConfig["repoid"]; ok { 659 id, _ := strconv.ParseUint(v, 10, 32) 660 r.ID = uint32(id) 661 } 662 663 if v, ok := repo.RawConfig["tenantID"]; ok { 664 id, _ := strconv.ParseInt(v, 10, 64) 665 r.TenantID = int(id) 666 } 667 668 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 669 // on read instead of during indexing allows us to avoid a complete reindex. 670 // 671 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 672 // backwards compatibility. 673 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 674 // We use the number of months since 1970 as a simple measure of repo freshness. 675 // It is monotonically increasing and stable across re-indexes and restarts. 676 r.Rank = monthsSince1970(repo.LatestCommitDate) 677 } else if v, ok := repo.RawConfig["priority"]; ok { 678 r.priority, err = strconv.ParseFloat(v, 64) 679 if err != nil { 680 r.priority = 0 681 } 682 683 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 684 // based on priority. Setting it on read instead of during indexing 685 // allows us to avoid a complete reindex. 686 if r.Rank == 0 && r.priority > 0 { 687 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 688 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 689 // returns from more stars. 690 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16) 691 } 692 } 693 694 return nil 695} 696 697func (r *Repository) GetPriority() float64 { 698 return r.priority 699} 700 701// monthsSince1970 returns the number of months since 1970. It returns values in 702// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 703// lower bound for all dates before 1970. 704func monthsSince1970(t time.Time) uint16 { 705 base := time.Unix(0, 0) 706 if t.Before(base) { 707 return 0 708 } 709 months := int(t.Year()-1970)*12 + int(t.Month()-1) 710 return uint16(min(months, math.MaxUint16)) 711} 712 713// MergeMutable will merge x into r. mutated will be true if it made any 714// changes. err is non-nil if we needed to mutate an immutable field. 715// 716// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 717// computed while indexing so can't be synthesized from x. 718// 719// Note: We ignore RawConfig fields which are duplicated into Repository: 720// name and id. 721func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 722 if r.ID != x.ID { 723 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 724 return mutated, errors.New("ID is immutable") 725 } 726 if r.Name != x.Name { 727 // Name is encoded into the shard name on disk. We need to re-index if it 728 // changes. 729 return mutated, errors.New("Name is immutable") 730 } 731 if !reflect.DeepEqual(r.Branches, x.Branches) { 732 // Need a reindex if content changing. 733 return mutated, errors.New("Branches is immutable") 734 } 735 736 for k, v := range x.RawConfig { 737 // We ignore name and id since they are encoded into the repository. 738 if k == "name" || k == "id" { 739 continue 740 } 741 if r.RawConfig == nil { 742 mutated = true 743 r.RawConfig = make(map[string]string) 744 } 745 if r.RawConfig[k] != v { 746 mutated = true 747 r.RawConfig[k] = v 748 } 749 } 750 751 if r.URL != x.URL { 752 mutated = true 753 r.URL = x.URL 754 } 755 if r.CommitURLTemplate != x.CommitURLTemplate { 756 mutated = true 757 r.CommitURLTemplate = x.CommitURLTemplate 758 } 759 if r.FileURLTemplate != x.FileURLTemplate { 760 mutated = true 761 r.FileURLTemplate = x.FileURLTemplate 762 } 763 if r.LineFragmentTemplate != x.LineFragmentTemplate { 764 mutated = true 765 r.LineFragmentTemplate = x.LineFragmentTemplate 766 } 767 768 return mutated, nil 769} 770 771// IndexMetadata holds metadata stored in the index file. It contains 772// data generated by the core indexing library. 773type IndexMetadata struct { 774 IndexFormatVersion int 775 IndexFeatureVersion int 776 IndexMinReaderVersion int 777 IndexTime time.Time 778 PlainASCII bool 779 LanguageMap map[string]uint16 780 ZoektVersion string 781 ID string 782} 783 784// Statistics of a (collection of) repositories. 785type RepoStats struct { 786 // Repos is used for aggregrating the number of repositories. 787 // 788 // Note: This field is not populated on RepoListEntry.Stats (individual) but 789 // only for RepoList.Stats (aggregate). 790 Repos int 791 792 // Shards is the total number of search shards. 793 Shards int 794 795 // Documents holds the number of documents or files. 796 Documents int 797 798 // IndexBytes is the amount of RAM used for index overhead. 799 IndexBytes int64 800 801 // ContentBytes is the amount of RAM used for raw content. 802 ContentBytes int64 803 804 // Sourcegraph specific stats below. These are not as efficient to calculate 805 // as the above statistics. We experimentally measured about a 10% slower 806 // shard load time. However, we find these values very useful to track and 807 // computing them outside of load time introduces a lot of complexity. 808 809 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 810 // indexed documents. This is not exactly the same as line count, since it 811 // will not include lines not terminated by "\n" (eg a file with no "\n", or 812 // a final line without "\n"). Note: Zoekt deduplicates documents across 813 // branches, so if a path has the same contents on multiple branches, there 814 // is only one document for it. As such that document's newlines is only 815 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 816 // for counts which do not deduplicate. 817 NewLinesCount uint64 818 819 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 820 // branch. 821 DefaultBranchNewLinesCount uint64 822 823 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 824 // except the default branch. 825 OtherBranchesNewLinesCount uint64 826} 827 828func (s *RepoStats) Add(o *RepoStats) { 829 // can't update Repos, since one repo may have multiple 830 // shards. 831 s.Shards += o.Shards 832 s.IndexBytes += o.IndexBytes 833 s.Documents += o.Documents 834 s.ContentBytes += o.ContentBytes 835 836 // Sourcegraph specific 837 s.NewLinesCount += o.NewLinesCount 838 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 839 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 840} 841 842type RepoListEntry struct { 843 Repository Repository 844 IndexMetadata IndexMetadata 845 Stats RepoStats 846} 847 848// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 849// performance profiling of sourcegraph.com revealed that querying this 850// information from Zoekt was causing lots of CPU and memory usage. Note: we 851// can revisit this, how we store and query this information has changed a lot 852// since this was introduced. 853type MinimalRepoListEntry struct { 854 // HasSymbols is exported since Sourcegraph uses this information at search 855 // planning time to decide between Zoekt and an unindexed symbol search. 856 // 857 // Note: it pretty much is always true in practice. 858 HasSymbols bool 859 860 // Branches is used by Sourcegraphs query planner to decided if it can use 861 // zoekt or go via an unindexed code path. 862 Branches []RepositoryBranch 863 864 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 865 // since the epoch). This is to make it clear we are not transporting the 866 // full fidelty timestamp (ie with milliseconds and location). Additionally 867 // it saves 16 bytes in this struct. 868 // 869 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 870 // how many repositories need updating after a ranking change/etc. 871 // 872 // TODO(keegancsmith) audit updates to IndexTime and document how and when 873 // it changes. Concerned about things like metadata updates or compound 874 // shards leading to untrustworthy data here. 875 IndexTimeUnix int64 876} 877 878type ReposMap map[uint32]MinimalRepoListEntry 879 880// MarshalBinary implements a specialized encoder for ReposMap. 881func (q *ReposMap) MarshalBinary() ([]byte, error) { 882 return reposMapEncode(*q) 883} 884 885// UnmarshalBinary implements a specialized decoder for ReposMap. 886func (q *ReposMap) UnmarshalBinary(b []byte) error { 887 var err error 888 (*q), err = reposMapDecode(b) 889 return err 890} 891 892// RepoList holds a set of Repository metadata. 893type RepoList struct { 894 // Returned when ListOptions.Field is RepoListFieldRepos. 895 Repos []*RepoListEntry 896 897 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 898 ReposMap ReposMap 899 900 Crashes int 901 902 // Stats response to a List request. 903 // This is the aggregate RepoStats of all repos matching the input query. 904 Stats RepoStats 905} 906 907type Searcher interface { 908 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 909 910 // List lists repositories. The query `q` can only contain 911 // query.Repo atoms. 912 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 913 Close() 914 915 // Describe the searcher for debug messages. 916 String() string 917} 918 919type RepoListField int 920 921const ( 922 RepoListFieldRepos RepoListField = 0 923 RepoListFieldReposMap = 2 924) 925 926type ListOptions struct { 927 // Field decides which field to populate in RepoList response. 928 Field RepoListField 929} 930 931func (o *ListOptions) GetField() (RepoListField, error) { 932 if o == nil { 933 return RepoListFieldRepos, nil 934 } 935 switch o.Field { 936 case RepoListFieldRepos, RepoListFieldReposMap: 937 return o.Field, nil 938 case 1: 939 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 940 default: 941 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 942 } 943} 944 945func (o *ListOptions) String() string { 946 return fmt.Sprintf("%#v", o) 947} 948 949type SearchOptions struct { 950 // Return an upper-bound estimate of eligible documents in 951 // stats.ShardFilesConsidered. 952 EstimateDocCount bool 953 954 // Return the whole file. 955 Whole bool 956 957 // Maximum number of matches: skip all processing an index 958 // shard after we found this many non-overlapping matches. 959 ShardMaxMatchCount int 960 961 // Maximum number of matches: stop looking for more matches 962 // once we have this many matches across shards. 963 TotalMaxMatchCount int 964 965 // Maximum number of matches: skip processing documents for a repository in 966 // a shard once we have found ShardRepoMaxMatchCount. 967 // 968 // A compound shard may contain multiple repositories. This will most often 969 // be set to 1 to find all repositories containing a result. 970 ShardRepoMaxMatchCount int 971 972 // Abort the search after this much time has passed. 973 MaxWallTime time.Duration 974 975 // FlushWallTime if non-zero will stop streaming behaviour at first and 976 // instead will collate and sort results. At FlushWallTime the results will 977 // be sent and then the behaviour will revert to the normal streaming. 978 FlushWallTime time.Duration 979 980 // Truncates the number of documents (i.e. files) after collating and 981 // sorting the results. 982 MaxDocDisplayCount int 983 984 // Truncates the number of matchs after collating and sorting the results. 985 MaxMatchDisplayCount int 986 987 // If set to a number greater than zero then up to this many number 988 // of context lines will be added before and after each matched line. 989 // Note that the included context lines might contain matches and 990 // it's up to the consumer of the result to remove those lines. 991 NumContextLines int 992 993 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 994 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 995 ChunkMatches bool 996 997 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 998 // scoring formula. The scoring algorithm treats each match in a file as a term 999 // and computes an approximation to BM25. When enabled, BM25 scoring is used for 1000 // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores. 1001 // 1002 // The calculation of IDF assumes that Zoekt visits all documents containing any 1003 // of the query terms during evaluation. This is true, for example, if all query 1004 // terms are ORed together. 1005 // 1006 // When enabled, all other scoring signals are ignored, including document ranks. 1007 UseBM25Scoring bool 1008 1009 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 1010 // a command-line flag 1011 Trace bool 1012 1013 // If set, the search results will contain debug information for scoring. 1014 DebugScore bool 1015 1016 // SpanContext is the opentracing span context, if it exists, from the zoekt client 1017 SpanContext map[string]string 1018} 1019 1020func (o *SearchOptions) SetDefaults() { 1021 if o.ShardMaxMatchCount == 0 { 1022 // We cap the total number of matches, so overly broad 1023 // searches don't crash the machine. 1024 o.ShardMaxMatchCount = 100000 1025 } 1026 if o.TotalMaxMatchCount == 0 { 1027 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 1028 } 1029} 1030 1031// String returns a succinct representation of the options. This is meant for 1032// human consumption in logs and traces. 1033// 1034// Note: some tracing systems have limits on length of values, so we take care 1035// to try and make this small, and include the important information near the 1036// front incase of truncation. 1037func (s *SearchOptions) String() string { 1038 var b strings.Builder 1039 1040 add := func(name, value string) { 1041 b.WriteString(name) 1042 b.WriteByte('=') 1043 b.WriteString(value) 1044 b.WriteByte(' ') 1045 } 1046 addInt := func(name string, value int) { 1047 if value != 0 { 1048 add(name, strconv.Itoa(value)) 1049 } 1050 } 1051 addDuration := func(name string, value time.Duration) { 1052 if value != 0 { 1053 add(name, value.String()) 1054 } 1055 } 1056 addBool := func(name string, value bool) { 1057 if !value { 1058 return 1059 } 1060 b.WriteString(name) 1061 b.WriteByte(' ') 1062 } 1063 1064 b.WriteString("zoekt.SearchOptions{ ") 1065 1066 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1067 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1068 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1069 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1070 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1071 addInt("NumContextLines", s.NumContextLines) 1072 1073 addDuration("MaxWallTime", s.MaxWallTime) 1074 addDuration("FlushWallTime", s.FlushWallTime) 1075 1076 addBool("EstimateDocCount", s.EstimateDocCount) 1077 addBool("Whole", s.Whole) 1078 addBool("ChunkMatches", s.ChunkMatches) 1079 addBool("UseBM25Scoring", s.UseBM25Scoring) 1080 addBool("Trace", s.Trace) 1081 addBool("DebugScore", s.DebugScore) 1082 1083 for k, v := range s.SpanContext { 1084 add("SpanContext."+k, strconv.Quote(v)) 1085 } 1086 1087 b.WriteByte('}') 1088 return b.String() 1089} 1090 1091// Sender is the interface that wraps the basic Send method. 1092type Sender interface { 1093 Send(*SearchResult) 1094} 1095 1096// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1097// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1098// that calls f. 1099type SenderFunc func(result *SearchResult) 1100 1101func (f SenderFunc) Send(result *SearchResult) { 1102 f(result) 1103} 1104 1105// Streamer adds the method StreamSearch to the Searcher interface. 1106type Streamer interface { 1107 Searcher 1108 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1109}