fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt // import "github.com/sourcegraph/zoekt" 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "reflect" 23 "strconv" 24 "strings" 25 "time" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30const ( 31 mapHeaderBytes uint64 = 48 32 sliceHeaderBytes uint64 = 24 33 stringHeaderBytes uint64 = 16 34 pointerSize uint64 = 8 35 interfaceBytes uint64 = 16 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 
68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of 139// lines in the file. 140type ChunkMatch struct { 141 DebugScore string 142 143 // Content is a contiguous range of complete lines that fully contains Ranges. 144 // Lines will always include their terminating newline (if it exists). 145 Content []byte 146 147 // Ranges is a set of matching ranges within this chunk. 
Each range is relative 148 // to the beginning of the file (not the beginning of Content). 149 Ranges []Range 150 151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 152 // its length will equal that of Ranges. Any of its elements may be nil. 153 SymbolInfo []*Symbol 154 155 // FileName indicates whether this match is a match on the file name, in 156 // which case Content will contain the file name. 157 FileName bool 158 159 // ContentStart is the location (inclusive) of the beginning of content 160 // relative to the beginning of the file. It will always be at the 161 // beginning of a line (Column will always be 1). 162 ContentStart Location 163 164 Score float64 165} 166 167func (cm *ChunkMatch) sizeBytes() (sz uint64) { 168 // Content 169 sz += sliceHeaderBytes + uint64(len(cm.Content)) 170 171 // ContentStart 172 sz += cm.ContentStart.sizeBytes() 173 174 // FileName 175 sz += 1 176 177 // Ranges 178 sz += sliceHeaderBytes 179 if len(cm.Ranges) > 0 { 180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 181 } 182 183 // SymbolInfo 184 sz += sliceHeaderBytes 185 for _, si := range cm.SymbolInfo { 186 sz += pointerSize 187 if si != nil { 188 sz += si.sizeBytes() 189 } 190 } 191 192 // Score 193 sz += 8 194 195 // DebugScore 196 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 197 198 return 199} 200 201type Range struct { 202 // The inclusive beginning of the range. 203 Start Location 204 // The exclusive end of the range. 
205 End Location 206} 207 208func (r *Range) sizeBytes() uint64 { 209 return r.Start.sizeBytes() + r.End.sizeBytes() 210} 211 212type Location struct { 213 // 0-based byte offset from the beginning of the file 214 ByteOffset uint32 215 // 1-based line number from the beginning of the file 216 LineNumber uint32 217 // 1-based column number (in runes) from the beginning of line 218 Column uint32 219} 220 221func (l *Location) sizeBytes() uint64 { 222 return 3 * 4 223} 224 225// LineMatch holds the matches within a single line in a file. 226type LineMatch struct { 227 // The line in which a match was found. 228 Line []byte 229 // The byte offset of the first byte of the line. 230 LineStart int 231 // The byte offset of the first byte past the end of the line. 232 // This is usually the byte after the terminating newline, but can also be 233 // the end of the file if there is no terminating newline 234 LineEnd int 235 LineNumber int 236 237 // Before and After are only set when SearchOptions.NumContextLines is > 0 238 Before []byte 239 After []byte 240 241 // If set, this was a match on the filename. 242 FileName bool 243 244 // The higher the better. 
Only ranks the quality of the match 245 // within the file, does not take rank of file into account 246 Score float64 247 DebugScore string 248 249 LineFragments []LineFragmentMatch 250} 251 252func (lm *LineMatch) sizeBytes() (sz uint64) { 253 // Line 254 sz += sliceHeaderBytes + uint64(len(lm.Line)) 255 256 // LineStart, LineEnd, LineNumber 257 sz += 3 * 8 258 259 // Before 260 sz += sliceHeaderBytes + uint64(len(lm.Before)) 261 262 // After 263 sz += sliceHeaderBytes + uint64(len(lm.After)) 264 265 // FileName 266 sz += 1 267 268 // Score 269 sz += 8 270 271 // DebugScore 272 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 273 274 // LineFragments 275 sz += sliceHeaderBytes 276 for _, lf := range lm.LineFragments { 277 sz += lf.sizeBytes() 278 } 279 280 return 281} 282 283type Symbol struct { 284 Sym string 285 Kind string 286 Parent string 287 ParentKind string 288} 289 290func (s *Symbol) sizeBytes() uint64 { 291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 292} 293 294// LineFragmentMatch a segment of matching text within a line. 295type LineFragmentMatch struct { 296 // Offset within the line, in bytes. 297 LineOffset int 298 299 // Offset from file start, in bytes. 300 Offset uint32 301 302 // Number bytes that match. 
303 MatchLength int 304 305 SymbolInfo *Symbol 306} 307 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 309 // LineOffset 310 sz += 8 311 312 // Offset 313 sz += 4 314 315 // MatchLength 316 sz += 8 317 318 // SymbolInfo 319 sz += pointerSize 320 if lfm.SymbolInfo != nil { 321 sz += lfm.SymbolInfo.sizeBytes() 322 } 323 324 return 325} 326 327type FlushReason uint8 328 329const ( 330 FlushReasonTimerExpired FlushReason = 1 << iota 331 FlushReasonFinalFlush 332 FlushReasonMaxSize 333) 334 335var FlushReasonStrings = map[FlushReason]string{ 336 FlushReasonTimerExpired: "timer_expired", 337 FlushReasonFinalFlush: "final_flush", 338 FlushReasonMaxSize: "max_size_reached", 339} 340 341func (fr FlushReason) String() string { 342 if v, ok := FlushReasonStrings[fr]; ok { 343 return v 344 } 345 346 return "none" 347} 348 349// Stats contains interesting numbers on the search 350type Stats struct { 351 // Amount of I/O for reading contents. 352 ContentBytesLoaded int64 353 354 // Amount of I/O for reading from index. 355 IndexBytesLoaded int64 356 357 // Number of search shards that had a crash. 358 Crashes int 359 360 // Wall clock time for this search 361 Duration time.Duration 362 363 // Number of files containing a match. 364 FileCount int 365 366 // Number of files in shards that we considered. 367 ShardFilesConsidered int 368 369 // Files that we evaluated. Equivalent to files for which all 370 // atom matches (including negations) evaluated to true. 371 FilesConsidered int 372 373 // Files for which we loaded file content to verify substring matches 374 FilesLoaded int 375 376 // Candidate files whose contents weren't examined because we 377 // gathered enough matches. 378 FilesSkipped int 379 380 // Shards that we scanned to find matches. 381 ShardsScanned int 382 383 // Shards that we did not process because a query was canceled. 
384 ShardsSkipped int 385 386 // Shards that we did not process because the query was rejected by the 387 // ngram filter indicating it had no matches. 388 ShardsSkippedFilter int 389 390 // Number of non-overlapping matches 391 MatchCount int 392 393 // Number of candidate matches as a result of searching ngrams. 394 NgramMatches int 395 396 // NgramLookups is the number of times we accessed an ngram in the index. 397 NgramLookups int 398 399 // Wall clock time for queued search. 400 Wait time.Duration 401 402 // Aggregate wall clock time spent constructing and pruning the match tree. 403 // This accounts for time such as lookups in the trigram index. 404 MatchTreeConstruction time.Duration 405 406 // Aggregate wall clock time spent searching the match tree. This accounts 407 // for the bulk of search work done looking for matches. 408 MatchTreeSearch time.Duration 409 410 // Number of times regexp was called on files that we evaluated. 411 RegexpsConsidered int 412 413 // FlushReason explains why results were flushed. 
414 FlushReason FlushReason 415} 416 417func (s *Stats) sizeBytes() (sz uint64) { 418 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 419 sz += 1 // FlushReason 420 421 return 422} 423 424func (s *Stats) Add(o Stats) { 425 s.ContentBytesLoaded += o.ContentBytesLoaded 426 s.IndexBytesLoaded += o.IndexBytesLoaded 427 s.Crashes += o.Crashes 428 s.FileCount += o.FileCount 429 s.FilesConsidered += o.FilesConsidered 430 s.FilesLoaded += o.FilesLoaded 431 s.FilesSkipped += o.FilesSkipped 432 s.MatchCount += o.MatchCount 433 s.NgramMatches += o.NgramMatches 434 s.NgramLookups += o.NgramLookups 435 s.ShardFilesConsidered += o.ShardFilesConsidered 436 s.ShardsScanned += o.ShardsScanned 437 s.ShardsSkipped += o.ShardsSkipped 438 s.ShardsSkippedFilter += o.ShardsSkippedFilter 439 s.Wait += o.Wait 440 s.MatchTreeConstruction += o.MatchTreeConstruction 441 s.MatchTreeSearch += o.MatchTreeSearch 442 s.RegexpsConsidered += o.RegexpsConsidered 443 444 // We want the first non-zero FlushReason to be sticky. This is a useful 445 // property when aggregating stats from several Zoekts. 446 if s.FlushReason == 0 { 447 s.FlushReason = o.FlushReason 448 } 449} 450 451// Zero returns true if stats is empty. 452func (s *Stats) Zero() bool { 453 if s == nil { 454 return true 455 } 456 457 return !(s.ContentBytesLoaded > 0 || 458 s.IndexBytesLoaded > 0 || 459 s.Crashes > 0 || 460 s.FileCount > 0 || 461 s.FilesConsidered > 0 || 462 s.FilesLoaded > 0 || 463 s.FilesSkipped > 0 || 464 s.MatchCount > 0 || 465 s.NgramMatches > 0 || 466 s.NgramLookups > 0 || 467 s.ShardFilesConsidered > 0 || 468 s.ShardsScanned > 0 || 469 s.ShardsSkipped > 0 || 470 s.ShardsSkippedFilter > 0 || 471 s.Wait > 0 || 472 s.MatchTreeConstruction > 0 || 473 s.MatchTreeSearch > 0 || 474 s.RegexpsConsidered > 0) 475} 476 477// Progress contains information about the global progress of the running search query. 478// This is used by the frontend to reorder results and emit them when stable. 
479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 480type Progress struct { 481 // Priority of the shard that was searched. 482 Priority float64 483 484 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 485 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 486 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 487 // 488 // MaxPendingPriority decreases monotonically in each SearchResult. 489 MaxPendingPriority float64 490} 491 492func (p *Progress) sizeBytes() uint64 { 493 return 2 * 8 494} 495 496// SearchResult contains search matches and extra data 497type SearchResult struct { 498 Stats 499 500 // Do not encode this as we cannot encode -Inf in JSON 501 Progress `json:"-"` 502 503 Files []FileMatch 504 505 // RepoURLs holds a repo => template string map. 506 RepoURLs map[string]string 507 508 // FragmentNames holds a repo => template string map, for 509 // the line number fragment. 510 LineFragments map[string]string 511} 512 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 514// The estimate does not take alignment into account. The result is a lower 515// bound on the actual size in memory. 
516func (sr *SearchResult) SizeBytes() (sz uint64) { 517 sz += sr.Stats.sizeBytes() 518 sz += sr.Progress.sizeBytes() 519 520 // Files 521 sz += sliceHeaderBytes 522 for _, f := range sr.Files { 523 sz += f.sizeBytes() 524 } 525 526 // RepoURLs 527 sz += mapHeaderBytes 528 for k, v := range sr.RepoURLs { 529 sz += stringHeaderBytes + uint64(len(k)) 530 sz += stringHeaderBytes + uint64(len(v)) 531 } 532 533 // LineFragments 534 sz += mapHeaderBytes 535 for k, v := range sr.LineFragments { 536 sz += stringHeaderBytes + uint64(len(k)) 537 sz += stringHeaderBytes + uint64(len(v)) 538 } 539 540 return 541} 542 543// RepositoryBranch describes an indexed branch, which is a name 544// combined with a version. 545type RepositoryBranch struct { 546 Name string 547 Version string 548} 549 550func (r RepositoryBranch) String() string { 551 return fmt.Sprintf("%s@%s", r.Name, r.Version) 552} 553 554// Repository holds repository metadata. 555type Repository struct { 556 // Sourcegraph's tenant ID 557 TenantID int 558 559 // Sourcegraph's repository ID 560 ID uint32 561 562 // The repository name 563 Name string 564 565 // The repository URL. 566 URL string 567 568 // The physical source where this repo came from, eg. full 569 // path to the zip filename or git repository directory. This 570 // will not be exposed in the UI, but can be used to detect 571 // orphaned index shards. 572 Source string 573 574 // The branches indexed in this repo. 575 Branches []RepositoryBranch 576 577 // Nil if this is not the super project. 578 SubRepoMap map[string]*Repository 579 580 // URL template to link to the commit of a branch 581 CommitURLTemplate string 582 583 // The repository URL for getting to a file. Has access to 584 // {{.Version}}, {{.Path}} 585 FileURLTemplate string 586 587 // The URL fragment to add to a file URL for line numbers. has 588 // access to {{.LineNumber}}. The fragment should include the 589 // separator, generally '#' or ';'. 
590 LineFragmentTemplate string 591 592 // Perf optimization: priority is set when we load the shard. It corresponds to 593 // the value of "priority" stored in RawConfig. 594 priority float64 595 596 // All zoekt.* configuration settings. 597 RawConfig map[string]string 598 599 // Importance of the repository, bigger is more important 600 Rank uint16 601 602 // IndexOptions is a hash of the options used to create the index for the 603 // repo. 604 IndexOptions string 605 606 // HasSymbols is true if this repository has indexed ctags 607 // output. Sourcegraph specific: This field is more appropriate for 608 // IndexMetadata. However, we store it here since the Sourcegraph frontend 609 // can read this structure but not IndexMetadata. 610 HasSymbols bool 611 612 // Tombstone is true if we are not allowed to search this repo. 613 Tombstone bool 614 615 // LatestCommitDate is the date of the latest commit among all indexed Branches. 616 // The date might be time.Time's 0-value if the repository was last indexed 617 // before this field was added. 618 LatestCommitDate time.Time 619 620 // FileTombstones is a set of file paths that should be ignored across all branches 621 // in this shard. 622 FileTombstones map[string]struct{} `json:",omitempty"` 623} 624 625func (r *Repository) UnmarshalJSON(data []byte) error { 626 // We define a new type so that we can use json.Unmarshal 627 // without recursing into this same method. 628 type repository *Repository 629 repo := repository(r) 630 631 err := json.Unmarshal(data, repo) 632 if err != nil { 633 return err 634 } 635 636 if v, ok := repo.RawConfig["repoid"]; ok { 637 id, _ := strconv.ParseUint(v, 10, 32) 638 r.ID = uint32(id) 639 } 640 641 if v, ok := repo.RawConfig["tenantID"]; ok { 642 id, _ := strconv.ParseInt(v, 10, 64) 643 r.TenantID = int(id) 644 } 645 646 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 647 // on read instead of during indexing allows us to avoid a complete reindex. 
648 // 649 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 650 // backwards compatibility. 651 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 652 // We use the number of months since 1970 as a simple measure of repo freshness. 653 // It is monotonically increasing and stable across re-indexes and restarts. 654 r.Rank = monthsSince1970(repo.LatestCommitDate) 655 } else if v, ok := repo.RawConfig["priority"]; ok { 656 r.priority, err = strconv.ParseFloat(v, 64) 657 if err != nil { 658 r.priority = 0 659 } 660 661 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 662 // based on priority. Setting it on read instead of during indexing 663 // allows us to avoid a complete reindex. 664 if r.Rank == 0 && r.priority > 0 { 665 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 666 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 667 // returns from more stars. 668 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) 669 } 670 } 671 672 return nil 673} 674 675// monthsSince1970 returns the number of months since 1970. It returns values in 676// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 677// lower bound for all dates before 1970. 678func monthsSince1970(t time.Time) uint16 { 679 base := time.Unix(0, 0) 680 if t.Before(base) { 681 return 0 682 } 683 months := int(t.Year()-1970)*12 + int(t.Month()-1) 684 return uint16(min(months, maxUInt16)) 685} 686 687// MergeMutable will merge x into r. mutated will be true if it made any 688// changes. err is non-nil if we needed to mutate an immutable field. 689// 690// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 691// computed while indexing so can't be synthesized from x. 692// 693// Note: We ignore RawConfig fields which are duplicated into Repository: 694// name and id. 
695func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 696 if r.ID != x.ID { 697 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 698 return mutated, errors.New("ID is immutable") 699 } 700 if r.Name != x.Name { 701 // Name is encoded into the shard name on disk. We need to re-index if it 702 // changes. 703 return mutated, errors.New("Name is immutable") 704 } 705 if !reflect.DeepEqual(r.Branches, x.Branches) { 706 // Need a reindex if content changing. 707 return mutated, errors.New("Branches is immutable") 708 } 709 710 for k, v := range x.RawConfig { 711 // We ignore name and id since they are encoded into the repository. 712 if k == "name" || k == "id" { 713 continue 714 } 715 if r.RawConfig == nil { 716 mutated = true 717 r.RawConfig = make(map[string]string) 718 } 719 if r.RawConfig[k] != v { 720 mutated = true 721 r.RawConfig[k] = v 722 } 723 } 724 725 if r.URL != x.URL { 726 mutated = true 727 r.URL = x.URL 728 } 729 if r.CommitURLTemplate != x.CommitURLTemplate { 730 mutated = true 731 r.CommitURLTemplate = x.CommitURLTemplate 732 } 733 if r.FileURLTemplate != x.FileURLTemplate { 734 mutated = true 735 r.FileURLTemplate = x.FileURLTemplate 736 } 737 if r.LineFragmentTemplate != x.LineFragmentTemplate { 738 mutated = true 739 r.LineFragmentTemplate = x.LineFragmentTemplate 740 } 741 742 return mutated, nil 743} 744 745// IndexMetadata holds metadata stored in the index file. It contains 746// data generated by the core indexing library. 747type IndexMetadata struct { 748 IndexFormatVersion int 749 IndexFeatureVersion int 750 IndexMinReaderVersion int 751 IndexTime time.Time 752 PlainASCII bool 753 LanguageMap map[string]uint16 754 ZoektVersion string 755 ID string 756} 757 758// Statistics of a (collection of) repositories. 759type RepoStats struct { 760 // Repos is used for aggregrating the number of repositories. 
761 // 762 // Note: This field is not populated on RepoListEntry.Stats (individual) but 763 // only for RepoList.Stats (aggregate). 764 Repos int 765 766 // Shards is the total number of search shards. 767 Shards int 768 769 // Documents holds the number of documents or files. 770 Documents int 771 772 // IndexBytes is the amount of RAM used for index overhead. 773 IndexBytes int64 774 775 // ContentBytes is the amount of RAM used for raw content. 776 ContentBytes int64 777 778 // Sourcegraph specific stats below. These are not as efficient to calculate 779 // as the above statistics. We experimentally measured about a 10% slower 780 // shard load time. However, we find these values very useful to track and 781 // computing them outside of load time introduces a lot of complexity. 782 783 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 784 // indexed documents. This is not exactly the same as line count, since it 785 // will not include lines not terminated by "\n" (eg a file with no "\n", or 786 // a final line without "\n"). Note: Zoekt deduplicates documents across 787 // branches, so if a path has the same contents on multiple branches, there 788 // is only one document for it. As such that document's newlines is only 789 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 790 // for counts which do not deduplicate. 791 NewLinesCount uint64 792 793 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 794 // branch. 795 DefaultBranchNewLinesCount uint64 796 797 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 798 // except the default branch. 799 OtherBranchesNewLinesCount uint64 800} 801 802func (s *RepoStats) Add(o *RepoStats) { 803 // can't update Repos, since one repo may have multiple 804 // shards. 
805 s.Shards += o.Shards 806 s.IndexBytes += o.IndexBytes 807 s.Documents += o.Documents 808 s.ContentBytes += o.ContentBytes 809 810 // Sourcegraph specific 811 s.NewLinesCount += o.NewLinesCount 812 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 813 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 814} 815 816type RepoListEntry struct { 817 Repository Repository 818 IndexMetadata IndexMetadata 819 Stats RepoStats 820} 821 822// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 823// performance profiling of sourcegraph.com revealed that querying this 824// information from Zoekt was causing lots of CPU and memory usage. Note: we 825// can revisit this, how we store and query this information has changed a lot 826// since this was introduced. 827type MinimalRepoListEntry struct { 828 // HasSymbols is exported since Sourcegraph uses this information at search 829 // planning time to decide between Zoekt and an unindexed symbol search. 830 // 831 // Note: it pretty much is always true in practice. 832 HasSymbols bool 833 834 // Branches is used by Sourcegraphs query planner to decided if it can use 835 // zoekt or go via an unindexed code path. 836 Branches []RepositoryBranch 837 838 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 839 // since the epoch). This is to make it clear we are not transporting the 840 // full fidelty timestamp (ie with milliseconds and location). Additionally 841 // it saves 16 bytes in this struct. 842 // 843 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 844 // how many repositories need updating after a ranking change/etc. 845 // 846 // TODO(keegancsmith) audit updates to IndexTime and document how and when 847 // it changes. Concerned about things like metadata updates or compound 848 // shards leading to untrustworthy data here. 
849 IndexTimeUnix int64 850} 851 852type ReposMap map[uint32]MinimalRepoListEntry 853 854// MarshalBinary implements a specialized encoder for ReposMap. 855func (q *ReposMap) MarshalBinary() ([]byte, error) { 856 return reposMapEncode(*q) 857} 858 859// UnmarshalBinary implements a specialized decoder for ReposMap. 860func (q *ReposMap) UnmarshalBinary(b []byte) error { 861 var err error 862 (*q), err = reposMapDecode(b) 863 return err 864} 865 866// RepoList holds a set of Repository metadata. 867type RepoList struct { 868 // Returned when ListOptions.Field is RepoListFieldRepos. 869 Repos []*RepoListEntry 870 871 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 872 ReposMap ReposMap 873 874 Crashes int 875 876 // Stats response to a List request. 877 // This is the aggregate RepoStats of all repos matching the input query. 878 Stats RepoStats 879} 880 881type Searcher interface { 882 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 883 884 // List lists repositories. The query `q` can only contain 885 // query.Repo atoms. 886 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 887 Close() 888 889 // Describe the searcher for debug messages. 890 String() string 891} 892 893type RepoListField int 894 895const ( 896 RepoListFieldRepos RepoListField = 0 897 RepoListFieldReposMap = 2 898) 899 900type ListOptions struct { 901 // Field decides which field to populate in RepoList response. 
902 Field RepoListField 903} 904 905func (o *ListOptions) GetField() (RepoListField, error) { 906 if o == nil { 907 return RepoListFieldRepos, nil 908 } 909 switch o.Field { 910 case RepoListFieldRepos, RepoListFieldReposMap: 911 return o.Field, nil 912 case 1: 913 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 914 default: 915 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 916 } 917} 918 919func (o *ListOptions) String() string { 920 return fmt.Sprintf("%#v", o) 921} 922 923type SearchOptions struct { 924 // Return an upper-bound estimate of eligible documents in 925 // stats.ShardFilesConsidered. 926 EstimateDocCount bool 927 928 // Return the whole file. 929 Whole bool 930 931 // Maximum number of matches: skip all processing an index 932 // shard after we found this many non-overlapping matches. 933 ShardMaxMatchCount int 934 935 // Maximum number of matches: stop looking for more matches 936 // once we have this many matches across shards. 937 TotalMaxMatchCount int 938 939 // Maximum number of matches: skip processing documents for a repository in 940 // a shard once we have found ShardRepoMaxMatchCount. 941 // 942 // A compound shard may contain multiple repositories. This will most often 943 // be set to 1 to find all repositories containing a result. 944 ShardRepoMaxMatchCount int 945 946 // Abort the search after this much time has passed. 947 MaxWallTime time.Duration 948 949 // FlushWallTime if non-zero will stop streaming behaviour at first and 950 // instead will collate and sort results. At FlushWallTime the results will 951 // be sent and then the behaviour will revert to the normal streaming. 952 FlushWallTime time.Duration 953 954 // Truncates the number of documents (i.e. files) after collating and 955 // sorting the results. 956 MaxDocDisplayCount int 957 958 // Truncates the number of matchs after collating and sorting the results. 
959 MaxMatchDisplayCount int 960 961 // If set to a number greater than zero then up to this many number 962 // of context lines will be added before and after each matched line. 963 // Note that the included context lines might contain matches and 964 // it's up to the consumer of the result to remove those lines. 965 NumContextLines int 966 967 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 968 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 969 ChunkMatches bool 970 971 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 972 // scoring formula. The scoring algorithm treats each match in a file as a term 973 // and computes an approximation to BM25. 974 // 975 // The calculation of IDF assumes that Zoekt visits all documents containing any 976 // of the query terms during evaluation. This is true, for example, if all query 977 // terms are ORed together. 978 // 979 // When enabled, all other scoring signals are ignored, including document ranks. 980 UseBM25Scoring bool 981 982 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 983 // a command-line flag 984 Trace bool 985 986 // If set, the search results will contain debug information for scoring. 987 DebugScore bool 988 989 // SpanContext is the opentracing span context, if it exists, from the zoekt client 990 SpanContext map[string]string 991} 992 993// String returns a succinct representation of the options. This is meant for 994// human consumption in logs and traces. 995// 996// Note: some tracing systems have limits on length of values, so we take care 997// to try and make this small, and include the important information near the 998// front incase of truncation. 
999func (s *SearchOptions) String() string { 1000 var b strings.Builder 1001 1002 add := func(name, value string) { 1003 b.WriteString(name) 1004 b.WriteByte('=') 1005 b.WriteString(value) 1006 b.WriteByte(' ') 1007 } 1008 addInt := func(name string, value int) { 1009 if value != 0 { 1010 add(name, strconv.Itoa(value)) 1011 } 1012 } 1013 addDuration := func(name string, value time.Duration) { 1014 if value != 0 { 1015 add(name, value.String()) 1016 } 1017 } 1018 addBool := func(name string, value bool) { 1019 if !value { 1020 return 1021 } 1022 b.WriteString(name) 1023 b.WriteByte(' ') 1024 } 1025 1026 b.WriteString("zoekt.SearchOptions{ ") 1027 1028 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1029 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1030 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1031 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1032 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1033 addInt("NumContextLines", s.NumContextLines) 1034 1035 addDuration("MaxWallTime", s.MaxWallTime) 1036 addDuration("FlushWallTime", s.FlushWallTime) 1037 1038 addBool("EstimateDocCount", s.EstimateDocCount) 1039 addBool("Whole", s.Whole) 1040 addBool("ChunkMatches", s.ChunkMatches) 1041 addBool("UseBM25Scoring", s.UseBM25Scoring) 1042 addBool("Trace", s.Trace) 1043 addBool("DebugScore", s.DebugScore) 1044 1045 for k, v := range s.SpanContext { 1046 add("SpanContext."+k, strconv.Quote(v)) 1047 } 1048 1049 b.WriteByte('}') 1050 return b.String() 1051} 1052 1053// Sender is the interface that wraps the basic Send method. 1054type Sender interface { 1055 Send(*SearchResult) 1056} 1057 1058// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1059// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1060// that calls f. 
type SenderFunc func(result *SearchResult)

// Send calls f(result), which makes SenderFunc satisfy Sender.
func (f SenderFunc) Send(result *SearchResult) {
	f(result)
}

// Streamer adds the method StreamSearch to the Searcher interface.
type Streamer interface {
	Searcher
	StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error)
}