fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt // import "github.com/sourcegraph/zoekt" 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "reflect" 23 "strconv" 24 "strings" 25 "time" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30const ( 31 mapHeaderBytes uint64 = 48 32 sliceHeaderBytes uint64 = 24 33 stringHeaderBytes uint64 = 16 34 pointerSize uint64 = 8 35 interfaceBytes uint64 = 16 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of 139// lines in the file. 140type ChunkMatch struct { 141 DebugScore string 142 143 // Content is a contiguous range of complete lines that fully contains Ranges. 144 // Lines will always include their terminating newline (if it exists). 145 Content []byte 146 147 // Ranges is a set of matching ranges within this chunk. Each range is relative 148 // to the beginning of the file (not the beginning of Content). 149 Ranges []Range 150 151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 152 // its length will equal that of Ranges. Any of its elements may be nil. 153 SymbolInfo []*Symbol 154 155 // FileName indicates whether this match is a match on the file name, in 156 // which case Content will contain the file name. 157 FileName bool 158 159 // ContentStart is the location (inclusive) of the beginning of content 160 // relative to the beginning of the file. It will always be at the 161 // beginning of a line (Column will always be 1). 162 ContentStart Location 163 164 Score float64 165} 166 167func (cm *ChunkMatch) sizeBytes() (sz uint64) { 168 // Content 169 sz += sliceHeaderBytes + uint64(len(cm.Content)) 170 171 // ContentStart 172 sz += cm.ContentStart.sizeBytes() 173 174 // FileName 175 sz += 1 176 177 // Ranges 178 sz += sliceHeaderBytes 179 if len(cm.Ranges) > 0 { 180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 181 } 182 183 // SymbolInfo 184 sz += sliceHeaderBytes 185 for _, si := range cm.SymbolInfo { 186 sz += pointerSize 187 if si != nil { 188 sz += si.sizeBytes() 189 } 190 } 191 192 // Score 193 sz += 8 194 195 // DebugScore 196 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 197 198 return 199} 200 201type Range struct { 202 // The inclusive beginning of the range. 203 Start Location 204 // The exclusive end of the range. 205 End Location 206} 207 208func (r *Range) sizeBytes() uint64 { 209 return r.Start.sizeBytes() + r.End.sizeBytes() 210} 211 212type Location struct { 213 // 0-based byte offset from the beginning of the file 214 ByteOffset uint32 215 // 1-based line number from the beginning of the file 216 LineNumber uint32 217 // 1-based column number (in runes) from the beginning of line 218 Column uint32 219} 220 221func (l *Location) sizeBytes() uint64 { 222 return 3 * 4 223} 224 225// LineMatch holds the matches within a single line in a file. 226type LineMatch struct { 227 // The line in which a match was found. 228 Line []byte 229 // The byte offset of the first byte of the line. 230 LineStart int 231 // The byte offset of the first byte past the end of the line. 232 // This is usually the byte after the terminating newline, but can also be 233 // the end of the file if there is no terminating newline 234 LineEnd int 235 LineNumber int 236 237 // Before and After are only set when SearchOptions.NumContextLines is > 0 238 Before []byte 239 After []byte 240 241 // If set, this was a match on the filename. 242 FileName bool 243 244 // The higher the better. Only ranks the quality of the match 245 // within the file, does not take rank of file into account 246 Score float64 247 DebugScore string 248 249 LineFragments []LineFragmentMatch 250} 251 252func (lm *LineMatch) sizeBytes() (sz uint64) { 253 // Line 254 sz += sliceHeaderBytes + uint64(len(lm.Line)) 255 256 // LineStart, LineEnd, LineNumber 257 sz += 3 * 8 258 259 // Before 260 sz += sliceHeaderBytes + uint64(len(lm.Before)) 261 262 // After 263 sz += sliceHeaderBytes + uint64(len(lm.After)) 264 265 // FileName 266 sz += 1 267 268 // Score 269 sz += 8 270 271 // DebugScore 272 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 273 274 // LineFragments 275 sz += sliceHeaderBytes 276 for _, lf := range lm.LineFragments { 277 sz += lf.sizeBytes() 278 } 279 280 return 281} 282 283type Symbol struct { 284 Sym string 285 Kind string 286 Parent string 287 ParentKind string 288} 289 290func (s *Symbol) sizeBytes() uint64 { 291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 292} 293 294// LineFragmentMatch a segment of matching text within a line. 295type LineFragmentMatch struct { 296 // Offset within the line, in bytes. 297 LineOffset int 298 299 // Offset from file start, in bytes. 300 Offset uint32 301 302 // Number bytes that match. 303 MatchLength int 304 305 SymbolInfo *Symbol 306} 307 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 309 // LineOffset 310 sz += 8 311 312 // Offset 313 sz += 4 314 315 // MatchLength 316 sz += 8 317 318 // SymbolInfo 319 sz += pointerSize 320 if lfm.SymbolInfo != nil { 321 sz += lfm.SymbolInfo.sizeBytes() 322 } 323 324 return 325} 326 327type FlushReason uint8 328 329const ( 330 FlushReasonTimerExpired FlushReason = 1 << iota 331 FlushReasonFinalFlush 332 FlushReasonMaxSize 333) 334 335var FlushReasonStrings = map[FlushReason]string{ 336 FlushReasonTimerExpired: "timer_expired", 337 FlushReasonFinalFlush: "final_flush", 338 FlushReasonMaxSize: "max_size_reached", 339} 340 341func (fr FlushReason) String() string { 342 if v, ok := FlushReasonStrings[fr]; ok { 343 return v 344 } 345 346 return "none" 347} 348 349// Stats contains interesting numbers on the search 350type Stats struct { 351 // Amount of I/O for reading contents. 352 ContentBytesLoaded int64 353 354 // Amount of I/O for reading from index. 355 IndexBytesLoaded int64 356 357 // Number of search shards that had a crash. 358 Crashes int 359 360 // Wall clock time for this search 361 Duration time.Duration 362 363 // Number of files containing a match. 364 FileCount int 365 366 // Number of files in shards that we considered. 367 ShardFilesConsidered int 368 369 // Files that we evaluated. Equivalent to files for which all 370 // atom matches (including negations) evaluated to true. 371 FilesConsidered int 372 373 // Files for which we loaded file content to verify substring matches 374 FilesLoaded int 375 376 // Candidate files whose contents weren't examined because we 377 // gathered enough matches. 378 FilesSkipped int 379 380 // Shards that we scanned to find matches. 381 ShardsScanned int 382 383 // Shards that we did not process because a query was canceled. 384 ShardsSkipped int 385 386 // Shards that we did not process because the query was rejected by the 387 // ngram filter indicating it had no matches. 388 ShardsSkippedFilter int 389 390 // Number of non-overlapping matches 391 MatchCount int 392 393 // Number of candidate matches as a result of searching ngrams. 394 NgramMatches int 395 396 // NgramLookups is the number of times we accessed an ngram in the index. 397 NgramLookups int 398 399 // Wall clock time for queued search. 400 Wait time.Duration 401 402 // Aggregate wall clock time spent constructing and pruning the match tree. 403 // This accounts for time such as lookups in the trigram index. 404 MatchTreeConstruction time.Duration 405 406 // Aggregate wall clock time spent searching the match tree. This accounts 407 // for the bulk of search work done looking for matches. 408 MatchTreeSearch time.Duration 409 410 // Number of times regexp was called on files that we evaluated. 411 RegexpsConsidered int 412 413 // FlushReason explains why results were flushed. 414 FlushReason FlushReason 415} 416 417func (s *Stats) sizeBytes() (sz uint64) { 418 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 419 sz += 1 // FlushReason 420 421 return 422} 423 424func (s *Stats) Add(o Stats) { 425 s.ContentBytesLoaded += o.ContentBytesLoaded 426 s.IndexBytesLoaded += o.IndexBytesLoaded 427 s.Crashes += o.Crashes 428 s.FileCount += o.FileCount 429 s.FilesConsidered += o.FilesConsidered 430 s.FilesLoaded += o.FilesLoaded 431 s.FilesSkipped += o.FilesSkipped 432 s.MatchCount += o.MatchCount 433 s.NgramMatches += o.NgramMatches 434 s.NgramLookups += o.NgramLookups 435 s.ShardFilesConsidered += o.ShardFilesConsidered 436 s.ShardsScanned += o.ShardsScanned 437 s.ShardsSkipped += o.ShardsSkipped 438 s.ShardsSkippedFilter += o.ShardsSkippedFilter 439 s.Wait += o.Wait 440 s.MatchTreeConstruction += o.MatchTreeConstruction 441 s.MatchTreeSearch += o.MatchTreeSearch 442 s.RegexpsConsidered += o.RegexpsConsidered 443 444 // We want the first non-zero FlushReason to be sticky. This is a useful 445 // property when aggregating stats from several Zoekts. 446 if s.FlushReason == 0 { 447 s.FlushReason = o.FlushReason 448 } 449} 450 451// Zero returns true if stats is empty. 452func (s *Stats) Zero() bool { 453 if s == nil { 454 return true 455 } 456 457 return !(s.ContentBytesLoaded > 0 || 458 s.IndexBytesLoaded > 0 || 459 s.Crashes > 0 || 460 s.FileCount > 0 || 461 s.FilesConsidered > 0 || 462 s.FilesLoaded > 0 || 463 s.FilesSkipped > 0 || 464 s.MatchCount > 0 || 465 s.NgramMatches > 0 || 466 s.NgramLookups > 0 || 467 s.ShardFilesConsidered > 0 || 468 s.ShardsScanned > 0 || 469 s.ShardsSkipped > 0 || 470 s.ShardsSkippedFilter > 0 || 471 s.Wait > 0 || 472 s.MatchTreeConstruction > 0 || 473 s.MatchTreeSearch > 0 || 474 s.RegexpsConsidered > 0) 475} 476 477// Progress contains information about the global progress of the running search query. 478// This is used by the frontend to reorder results and emit them when stable. 479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 480type Progress struct { 481 // Priority of the shard that was searched. 482 Priority float64 483 484 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 485 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 486 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 487 // 488 // MaxPendingPriority decreases monotonically in each SearchResult. 489 MaxPendingPriority float64 490} 491 492func (p *Progress) sizeBytes() uint64 { 493 return 2 * 8 494} 495 496// SearchResult contains search matches and extra data 497type SearchResult struct { 498 Stats 499 500 // Do not encode this as we cannot encode -Inf in JSON 501 Progress `json:"-"` 502 503 Files []FileMatch 504 505 // RepoURLs holds a repo => template string map. 506 RepoURLs map[string]string 507 508 // FragmentNames holds a repo => template string map, for 509 // the line number fragment. 510 LineFragments map[string]string 511} 512 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 514// The estimate does not take alignment into account. The result is a lower 515// bound on the actual size in memory. 516func (sr *SearchResult) SizeBytes() (sz uint64) { 517 sz += sr.Stats.sizeBytes() 518 sz += sr.Progress.sizeBytes() 519 520 // Files 521 sz += sliceHeaderBytes 522 for _, f := range sr.Files { 523 sz += f.sizeBytes() 524 } 525 526 // RepoURLs 527 sz += mapHeaderBytes 528 for k, v := range sr.RepoURLs { 529 sz += stringHeaderBytes + uint64(len(k)) 530 sz += stringHeaderBytes + uint64(len(v)) 531 } 532 533 // LineFragments 534 sz += mapHeaderBytes 535 for k, v := range sr.LineFragments { 536 sz += stringHeaderBytes + uint64(len(k)) 537 sz += stringHeaderBytes + uint64(len(v)) 538 } 539 540 return 541} 542 543// RepositoryBranch describes an indexed branch, which is a name 544// combined with a version. 545type RepositoryBranch struct { 546 Name string 547 Version string 548} 549 550func (r RepositoryBranch) String() string { 551 return fmt.Sprintf("%s@%s", r.Name, r.Version) 552} 553 554// Repository holds repository metadata. 555type Repository struct { 556 // Sourcegraph's repository ID 557 ID uint32 558 559 // The repository name 560 Name string 561 562 // The repository URL. 563 URL string 564 565 // The physical source where this repo came from, eg. full 566 // path to the zip filename or git repository directory. This 567 // will not be exposed in the UI, but can be used to detect 568 // orphaned index shards. 569 Source string 570 571 // The branches indexed in this repo. 572 Branches []RepositoryBranch 573 574 // Nil if this is not the super project. 575 SubRepoMap map[string]*Repository 576 577 // URL template to link to the commit of a branch 578 CommitURLTemplate string 579 580 // The repository URL for getting to a file. Has access to 581 // {{.Version}}, {{.Path}} 582 FileURLTemplate string 583 584 // The URL fragment to add to a file URL for line numbers. has 585 // access to {{.LineNumber}}. The fragment should include the 586 // separator, generally '#' or ';'. 587 LineFragmentTemplate string 588 589 // Perf optimization: priority is set when we load the shard. It corresponds to 590 // the value of "priority" stored in RawConfig. 591 priority float64 592 593 // All zoekt.* configuration settings. 594 RawConfig map[string]string 595 596 // Importance of the repository, bigger is more important 597 Rank uint16 598 599 // IndexOptions is a hash of the options used to create the index for the 600 // repo. 601 IndexOptions string 602 603 // HasSymbols is true if this repository has indexed ctags 604 // output. Sourcegraph specific: This field is more appropriate for 605 // IndexMetadata. However, we store it here since the Sourcegraph frontend 606 // can read this structure but not IndexMetadata. 607 HasSymbols bool 608 609 // Tombstone is true if we are not allowed to search this repo. 610 Tombstone bool 611 612 // LatestCommitDate is the date of the latest commit among all indexed Branches. 613 // The date might be time.Time's 0-value if the repository was last indexed 614 // before this field was added. 615 LatestCommitDate time.Time 616 617 // FileTombstones is a set of file paths that should be ignored across all branches 618 // in this shard. 619 FileTombstones map[string]struct{} `json:",omitempty"` 620} 621 622func (r *Repository) UnmarshalJSON(data []byte) error { 623 // We define a new type so that we can use json.Unmarshal 624 // without recursing into this same method. 625 type repository *Repository 626 repo := repository(r) 627 628 err := json.Unmarshal(data, repo) 629 if err != nil { 630 return err 631 } 632 633 if v, ok := repo.RawConfig["repoid"]; ok { 634 id, _ := strconv.ParseUint(v, 10, 32) 635 r.ID = uint32(id) 636 } 637 638 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 639 // on read instead of during indexing allows us to avoid a complete reindex. 640 // 641 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 642 // backwards compatibility. 643 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 644 // We use the number of months since 1970 as a simple measure of repo freshness. 645 // It is monotonically increasing and stable across re-indexes and restarts. 646 r.Rank = monthsSince1970(repo.LatestCommitDate) 647 } else if v, ok := repo.RawConfig["priority"]; ok { 648 r.priority, err = strconv.ParseFloat(v, 64) 649 if err != nil { 650 r.priority = 0 651 } 652 653 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 654 // based on priority. Setting it on read instead of during indexing 655 // allows us to avoid a complete reindex. 656 if r.Rank == 0 && r.priority > 0 { 657 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 658 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 659 // returns from more stars. 660 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) 661 } 662 } 663 664 return nil 665} 666 667// monthsSince1970 returns the number of months since 1970. It returns values in 668// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 669// lower bound for all dates before 1970. 670func monthsSince1970(t time.Time) uint16 { 671 base := time.Unix(0, 0) 672 if t.Before(base) { 673 return 0 674 } 675 months := int(t.Year()-1970)*12 + int(t.Month()-1) 676 return uint16(min(months, maxUInt16)) 677} 678 679// MergeMutable will merge x into r. mutated will be true if it made any 680// changes. err is non-nil if we needed to mutate an immutable field. 681// 682// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 683// computed while indexing so can't be synthesized from x. 684// 685// Note: We ignore RawConfig fields which are duplicated into Repository: 686// name and id. 687func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 688 if r.ID != x.ID { 689 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 690 return mutated, errors.New("ID is immutable") 691 } 692 if r.Name != x.Name { 693 // Name is encoded into the shard name on disk. We need to re-index if it 694 // changes. 695 return mutated, errors.New("Name is immutable") 696 } 697 if !reflect.DeepEqual(r.Branches, x.Branches) { 698 // Need a reindex if content changing. 699 return mutated, errors.New("Branches is immutable") 700 } 701 702 for k, v := range x.RawConfig { 703 // We ignore name and id since they are encoded into the repository. 704 if k == "name" || k == "id" { 705 continue 706 } 707 if r.RawConfig == nil { 708 mutated = true 709 r.RawConfig = make(map[string]string) 710 } 711 if r.RawConfig[k] != v { 712 mutated = true 713 r.RawConfig[k] = v 714 } 715 } 716 717 if r.URL != x.URL { 718 mutated = true 719 r.URL = x.URL 720 } 721 if r.CommitURLTemplate != x.CommitURLTemplate { 722 mutated = true 723 r.CommitURLTemplate = x.CommitURLTemplate 724 } 725 if r.FileURLTemplate != x.FileURLTemplate { 726 mutated = true 727 r.FileURLTemplate = x.FileURLTemplate 728 } 729 if r.LineFragmentTemplate != x.LineFragmentTemplate { 730 mutated = true 731 r.LineFragmentTemplate = x.LineFragmentTemplate 732 } 733 734 return mutated, nil 735} 736 737// IndexMetadata holds metadata stored in the index file. It contains 738// data generated by the core indexing library. 739type IndexMetadata struct { 740 IndexFormatVersion int 741 IndexFeatureVersion int 742 IndexMinReaderVersion int 743 IndexTime time.Time 744 PlainASCII bool 745 LanguageMap map[string]uint16 746 ZoektVersion string 747 ID string 748} 749 750// Statistics of a (collection of) repositories. 751type RepoStats struct { 752 // Repos is used for aggregrating the number of repositories. 753 // 754 // Note: This field is not populated on RepoListEntry.Stats (individual) but 755 // only for RepoList.Stats (aggregate). 756 Repos int 757 758 // Shards is the total number of search shards. 759 Shards int 760 761 // Documents holds the number of documents or files. 762 Documents int 763 764 // IndexBytes is the amount of RAM used for index overhead. 765 IndexBytes int64 766 767 // ContentBytes is the amount of RAM used for raw content. 768 ContentBytes int64 769 770 // Sourcegraph specific stats below. These are not as efficient to calculate 771 // as the above statistics. We experimentally measured about a 10% slower 772 // shard load time. However, we find these values very useful to track and 773 // computing them outside of load time introduces a lot of complexity. 774 775 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 776 // indexed documents. This is not exactly the same as line count, since it 777 // will not include lines not terminated by "\n" (eg a file with no "\n", or 778 // a final line without "\n"). Note: Zoekt deduplicates documents across 779 // branches, so if a path has the same contents on multiple branches, there 780 // is only one document for it. As such that document's newlines is only 781 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 782 // for counts which do not deduplicate. 783 NewLinesCount uint64 784 785 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 786 // branch. 787 DefaultBranchNewLinesCount uint64 788 789 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 790 // except the default branch. 791 OtherBranchesNewLinesCount uint64 792} 793 794func (s *RepoStats) Add(o *RepoStats) { 795 // can't update Repos, since one repo may have multiple 796 // shards. 797 s.Shards += o.Shards 798 s.IndexBytes += o.IndexBytes 799 s.Documents += o.Documents 800 s.ContentBytes += o.ContentBytes 801 802 // Sourcegraph specific 803 s.NewLinesCount += o.NewLinesCount 804 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 805 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 806} 807 808type RepoListEntry struct { 809 Repository Repository 810 IndexMetadata IndexMetadata 811 Stats RepoStats 812} 813 814// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 815// performance profiling of sourcegraph.com revealed that querying this 816// information from Zoekt was causing lots of CPU and memory usage. Note: we 817// can revisit this, how we store and query this information has changed a lot 818// since this was introduced. 819type MinimalRepoListEntry struct { 820 // HasSymbols is exported since Sourcegraph uses this information at search 821 // planning time to decide between Zoekt and an unindexed symbol search. 822 // 823 // Note: it pretty much is always true in practice. 824 HasSymbols bool 825 826 // Branches is used by Sourcegraphs query planner to decided if it can use 827 // zoekt or go via an unindexed code path. 828 Branches []RepositoryBranch 829 830 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 831 // since the epoch). This is to make it clear we are not transporting the 832 // full fidelty timestamp (ie with milliseconds and location). Additionally 833 // it saves 16 bytes in this struct. 834 // 835 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 836 // how many repositories need updating after a ranking change/etc. 837 // 838 // TODO(keegancsmith) audit updates to IndexTime and document how and when 839 // it changes. Concerned about things like metadata updates or compound 840 // shards leading to untrustworthy data here. 841 IndexTimeUnix int64 842} 843 844type ReposMap map[uint32]MinimalRepoListEntry 845 846// MarshalBinary implements a specialized encoder for ReposMap. 847func (q *ReposMap) MarshalBinary() ([]byte, error) { 848 return reposMapEncode(*q) 849} 850 851// UnmarshalBinary implements a specialized decoder for ReposMap. 852func (q *ReposMap) UnmarshalBinary(b []byte) error { 853 var err error 854 (*q), err = reposMapDecode(b) 855 return err 856} 857 858// RepoList holds a set of Repository metadata. 859type RepoList struct { 860 // Returned when ListOptions.Field is RepoListFieldRepos. 861 Repos []*RepoListEntry 862 863 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 864 ReposMap ReposMap 865 866 Crashes int 867 868 // Stats response to a List request. 869 // This is the aggregate RepoStats of all repos matching the input query. 870 Stats RepoStats 871} 872 873type Searcher interface { 874 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 875 876 // List lists repositories. The query `q` can only contain 877 // query.Repo atoms. 878 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 879 Close() 880 881 // Describe the searcher for debug messages. 882 String() string 883} 884 885type RepoListField int 886 887const ( 888 RepoListFieldRepos RepoListField = 0 889 RepoListFieldReposMap = 2 890) 891 892type ListOptions struct { 893 // Field decides which field to populate in RepoList response. 894 Field RepoListField 895} 896 897func (o *ListOptions) GetField() (RepoListField, error) { 898 if o == nil { 899 return RepoListFieldRepos, nil 900 } 901 switch o.Field { 902 case RepoListFieldRepos, RepoListFieldReposMap: 903 return o.Field, nil 904 case 1: 905 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 906 default: 907 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 908 } 909} 910 911func (o *ListOptions) String() string { 912 return fmt.Sprintf("%#v", o) 913} 914 915type SearchOptions struct { 916 // Return an upper-bound estimate of eligible documents in 917 // stats.ShardFilesConsidered. 918 EstimateDocCount bool 919 920 // Return the whole file. 921 Whole bool 922 923 // Maximum number of matches: skip all processing an index 924 // shard after we found this many non-overlapping matches. 925 ShardMaxMatchCount int 926 927 // Maximum number of matches: stop looking for more matches 928 // once we have this many matches across shards. 929 TotalMaxMatchCount int 930 931 // Maximum number of matches: skip processing documents for a repository in 932 // a shard once we have found ShardRepoMaxMatchCount. 933 // 934 // A compound shard may contain multiple repositories. This will most often 935 // be set to 1 to find all repositories containing a result. 936 ShardRepoMaxMatchCount int 937 938 // Abort the search after this much time has passed. 939 MaxWallTime time.Duration 940 941 // FlushWallTime if non-zero will stop streaming behaviour at first and 942 // instead will collate and sort results. At FlushWallTime the results will 943 // be sent and then the behaviour will revert to the normal streaming. 944 FlushWallTime time.Duration 945 946 // Truncates the number of documents (i.e. files) after collating and 947 // sorting the results. 948 MaxDocDisplayCount int 949 950 // Truncates the number of matchs after collating and sorting the results. 951 MaxMatchDisplayCount int 952 953 // If set to a number greater than zero then up to this many number 954 // of context lines will be added before and after each matched line. 955 // Note that the included context lines might contain matches and 956 // it's up to the consumer of the result to remove those lines. 957 NumContextLines int 958 959 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 960 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 961 ChunkMatches bool 962 963 // EXPERIMENTAL. If true, document ranks are used as additional input for 964 // sorting matches. 965 UseDocumentRanks bool 966 967 // EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust 968 // their weight in the file match score. If the value is <= 0.0, the default weight value 969 // will be used. This option is temporary and is only exposed for testing/ tuning purposes. 970 DocumentRanksWeight float64 971 972 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 973 // scoring formula. The scoring algorithm treats each match in a file as a term 974 // and computes an approximation to BM25. 975 // 976 // The calculation of IDF assumes that Zoekt visits all documents containing any 977 // of the query terms during evaluation. This is true, for example, if all query 978 // terms are ORed together. 979 // 980 // When enabled, all other scoring signals are ignored, including document ranks. 981 UseBM25Scoring bool 982 983 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 984 // a command-line flag 985 Trace bool 986 987 // If set, the search results will contain debug information for scoring. 988 DebugScore bool 989 990 // SpanContext is the opentracing span context, if it exists, from the zoekt client 991 SpanContext map[string]string 992} 993 994// String returns a succinct representation of the options. This is meant for 995// human consumption in logs and traces. 996// 997// Note: some tracing systems have limits on length of values, so we take care 998// to try and make this small, and include the important information near the 999// front incase of truncation. 1000func (s *SearchOptions) String() string { 1001 var b strings.Builder 1002 1003 add := func(name, value string) { 1004 b.WriteString(name) 1005 b.WriteByte('=') 1006 b.WriteString(value) 1007 b.WriteByte(' ') 1008 } 1009 addInt := func(name string, value int) { 1010 if value != 0 { 1011 add(name, strconv.Itoa(value)) 1012 } 1013 } 1014 addDuration := func(name string, value time.Duration) { 1015 if value != 0 { 1016 add(name, value.String()) 1017 } 1018 } 1019 addBool := func(name string, value bool) { 1020 if !value { 1021 return 1022 } 1023 b.WriteString(name) 1024 b.WriteByte(' ') 1025 } 1026 1027 b.WriteString("zoekt.SearchOptions{ ") 1028 1029 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1030 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1031 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1032 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1033 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1034 addInt("NumContextLines", s.NumContextLines) 1035 1036 addDuration("MaxWallTime", s.MaxWallTime) 1037 addDuration("FlushWallTime", s.FlushWallTime) 1038 1039 if s.DocumentRanksWeight > 0 { 1040 add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64)) 1041 } 1042 1043 addBool("EstimateDocCount", s.EstimateDocCount) 1044 addBool("Whole", s.Whole) 1045 addBool("ChunkMatches", s.ChunkMatches) 1046 addBool("UseDocumentRanks", s.UseDocumentRanks) 1047 addBool("UseBM25Scoring", s.UseBM25Scoring) 1048 addBool("Trace", s.Trace) 1049 addBool("DebugScore", s.DebugScore) 1050 1051 for k, v := range s.SpanContext { 1052 add("SpanContext."+k, strconv.Quote(v)) 1053 } 1054 1055 b.WriteByte('}') 1056 return b.String() 1057} 1058 1059// Sender is the interface that wraps the basic Send method. 1060type Sender interface { 1061 Send(*SearchResult) 1062} 1063 1064// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1065// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1066// that calls f. 1067type SenderFunc func(result *SearchResult) 1068 1069func (f SenderFunc) Send(result *SearchResult) { 1070 f(result) 1071} 1072 1073// Streamer adds the method StreamSearch to the Searcher interface. 1074type Streamer interface { 1075 Searcher 1076 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1077}