fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt // import "github.com/sourcegraph/zoekt" 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "reflect" 23 "strconv" 24 "strings" 25 "time" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30const ( 31 mapHeaderBytes uint64 = 48 32 sliceHeaderBytes uint64 = 24 33 stringHeaderBytes uint64 = 16 34 pointerSize uint64 = 8 35 interfaceBytes uint64 = 16 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of 139// lines in the file. 140type ChunkMatch struct { 141 DebugScore string 142 143 // Content is a contiguous range of complete lines that fully contains Ranges. 144 // Lines will always include their terminating newline (if it exists). 145 Content []byte 146 147 // Ranges is a set of matching ranges within this chunk. Each range is relative 148 // to the beginning of the file (not the beginning of Content). 149 Ranges []Range 150 151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 152 // its length will equal that of Ranges. Any of its elements may be nil. 153 SymbolInfo []*Symbol 154 155 // FileName indicates whether this match is a match on the file name, in 156 // which case Content will contain the file name. 157 FileName bool 158 159 // ContentStart is the location (inclusive) of the beginning of content 160 // relative to the beginning of the file. It will always be at the 161 // beginning of a line (Column will always be 1). 162 ContentStart Location 163 164 // Score is the overall relevance score of this chunk. 165 Score float64 166 167 // BestLineMatch is the line number of the highest-scoring line match in this chunk. 168 // The line number represents the index in the full file, and is 1-based. If FileName: true, 169 // this number will be 0. 170 BestLineMatch uint32 171} 172 173func (cm *ChunkMatch) sizeBytes() (sz uint64) { 174 // Content 175 sz += sliceHeaderBytes + uint64(len(cm.Content)) 176 177 // ContentStart 178 sz += cm.ContentStart.sizeBytes() 179 180 // FileName 181 sz += 1 182 183 // Ranges 184 sz += sliceHeaderBytes 185 if len(cm.Ranges) > 0 { 186 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 187 } 188 189 // SymbolInfo 190 sz += sliceHeaderBytes 191 for _, si := range cm.SymbolInfo { 192 sz += pointerSize 193 if si != nil { 194 sz += si.sizeBytes() 195 } 196 } 197 198 // Score 199 sz += 8 200 201 // DebugScore 202 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 203 204 return 205} 206 207type Range struct { 208 // The inclusive beginning of the range. 209 Start Location 210 // The exclusive end of the range. 211 End Location 212} 213 214func (r *Range) sizeBytes() uint64 { 215 return r.Start.sizeBytes() + r.End.sizeBytes() 216} 217 218type Location struct { 219 // 0-based byte offset from the beginning of the file 220 ByteOffset uint32 221 // 1-based line number from the beginning of the file 222 LineNumber uint32 223 // 1-based column number (in runes) from the beginning of line 224 Column uint32 225} 226 227func (l *Location) sizeBytes() uint64 { 228 return 3 * 4 229} 230 231// LineMatch holds the matches within a single line in a file. 232type LineMatch struct { 233 // The line in which a match was found. 234 Line []byte 235 // The byte offset of the first byte of the line. 236 LineStart int 237 // The byte offset of the first byte past the end of the line. 238 // This is usually the byte after the terminating newline, but can also be 239 // the end of the file if there is no terminating newline 240 LineEnd int 241 LineNumber int 242 243 // Before and After are only set when SearchOptions.NumContextLines is > 0 244 Before []byte 245 After []byte 246 247 // If set, this was a match on the filename. 248 FileName bool 249 250 // The higher the better. Only ranks the quality of the match 251 // within the file, does not take rank of file into account 252 Score float64 253 DebugScore string 254 255 LineFragments []LineFragmentMatch 256} 257 258func (lm *LineMatch) sizeBytes() (sz uint64) { 259 // Line 260 sz += sliceHeaderBytes + uint64(len(lm.Line)) 261 262 // LineStart, LineEnd, LineNumber 263 sz += 3 * 8 264 265 // Before 266 sz += sliceHeaderBytes + uint64(len(lm.Before)) 267 268 // After 269 sz += sliceHeaderBytes + uint64(len(lm.After)) 270 271 // FileName 272 sz += 1 273 274 // Score 275 sz += 8 276 277 // DebugScore 278 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 279 280 // LineFragments 281 sz += sliceHeaderBytes 282 for _, lf := range lm.LineFragments { 283 sz += lf.sizeBytes() 284 } 285 286 return 287} 288 289type Symbol struct { 290 Sym string 291 Kind string 292 Parent string 293 ParentKind string 294} 295 296func (s *Symbol) sizeBytes() uint64 { 297 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 298} 299 300// LineFragmentMatch a segment of matching text within a line. 301type LineFragmentMatch struct { 302 // Offset within the line, in bytes. 303 LineOffset int 304 305 // Offset from file start, in bytes. 306 Offset uint32 307 308 // Number bytes that match. 309 MatchLength int 310 311 SymbolInfo *Symbol 312} 313 314func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 315 // LineOffset 316 sz += 8 317 318 // Offset 319 sz += 4 320 321 // MatchLength 322 sz += 8 323 324 // SymbolInfo 325 sz += pointerSize 326 if lfm.SymbolInfo != nil { 327 sz += lfm.SymbolInfo.sizeBytes() 328 } 329 330 return 331} 332 333type FlushReason uint8 334 335const ( 336 FlushReasonTimerExpired FlushReason = 1 << iota 337 FlushReasonFinalFlush 338 FlushReasonMaxSize 339) 340 341var FlushReasonStrings = map[FlushReason]string{ 342 FlushReasonTimerExpired: "timer_expired", 343 FlushReasonFinalFlush: "final_flush", 344 FlushReasonMaxSize: "max_size_reached", 345} 346 347func (fr FlushReason) String() string { 348 if v, ok := FlushReasonStrings[fr]; ok { 349 return v 350 } 351 352 return "none" 353} 354 355// Stats contains interesting numbers on the search 356type Stats struct { 357 // Amount of I/O for reading contents. 358 ContentBytesLoaded int64 359 360 // Amount of I/O for reading from index. 361 IndexBytesLoaded int64 362 363 // Number of search shards that had a crash. 364 Crashes int 365 366 // Wall clock time for this search 367 Duration time.Duration 368 369 // Number of files containing a match. 370 FileCount int 371 372 // Number of files in shards that we considered. 373 ShardFilesConsidered int 374 375 // Files that we evaluated. Equivalent to files for which all 376 // atom matches (including negations) evaluated to true. 377 FilesConsidered int 378 379 // Files for which we loaded file content to verify substring matches 380 FilesLoaded int 381 382 // Candidate files whose contents weren't examined because we 383 // gathered enough matches. 384 FilesSkipped int 385 386 // Shards that we scanned to find matches. 387 ShardsScanned int 388 389 // Shards that we did not process because a query was canceled. 390 ShardsSkipped int 391 392 // Shards that we did not process because the query was rejected by the 393 // ngram filter indicating it had no matches. 394 ShardsSkippedFilter int 395 396 // Number of non-overlapping matches 397 MatchCount int 398 399 // Number of candidate matches as a result of searching ngrams. 400 NgramMatches int 401 402 // NgramLookups is the number of times we accessed an ngram in the index. 403 NgramLookups int 404 405 // Wall clock time for queued search. 406 Wait time.Duration 407 408 // Aggregate wall clock time spent constructing and pruning the match tree. 409 // This accounts for time such as lookups in the trigram index. 410 MatchTreeConstruction time.Duration 411 412 // Aggregate wall clock time spent searching the match tree. This accounts 413 // for the bulk of search work done looking for matches. 414 MatchTreeSearch time.Duration 415 416 // Number of times regexp was called on files that we evaluated. 417 RegexpsConsidered int 418 419 // FlushReason explains why results were flushed. 420 FlushReason FlushReason 421} 422 423func (s *Stats) sizeBytes() (sz uint64) { 424 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 425 sz += 1 // FlushReason 426 427 return 428} 429 430func (s *Stats) Add(o Stats) { 431 s.ContentBytesLoaded += o.ContentBytesLoaded 432 s.IndexBytesLoaded += o.IndexBytesLoaded 433 s.Crashes += o.Crashes 434 s.FileCount += o.FileCount 435 s.FilesConsidered += o.FilesConsidered 436 s.FilesLoaded += o.FilesLoaded 437 s.FilesSkipped += o.FilesSkipped 438 s.MatchCount += o.MatchCount 439 s.NgramMatches += o.NgramMatches 440 s.NgramLookups += o.NgramLookups 441 s.ShardFilesConsidered += o.ShardFilesConsidered 442 s.ShardsScanned += o.ShardsScanned 443 s.ShardsSkipped += o.ShardsSkipped 444 s.ShardsSkippedFilter += o.ShardsSkippedFilter 445 s.Wait += o.Wait 446 s.MatchTreeConstruction += o.MatchTreeConstruction 447 s.MatchTreeSearch += o.MatchTreeSearch 448 s.RegexpsConsidered += o.RegexpsConsidered 449 450 // We want the first non-zero FlushReason to be sticky. This is a useful 451 // property when aggregating stats from several Zoekts. 452 if s.FlushReason == 0 { 453 s.FlushReason = o.FlushReason 454 } 455} 456 457// Zero returns true if stats is empty. 458func (s *Stats) Zero() bool { 459 if s == nil { 460 return true 461 } 462 463 return !(s.ContentBytesLoaded > 0 || 464 s.IndexBytesLoaded > 0 || 465 s.Crashes > 0 || 466 s.FileCount > 0 || 467 s.FilesConsidered > 0 || 468 s.FilesLoaded > 0 || 469 s.FilesSkipped > 0 || 470 s.MatchCount > 0 || 471 s.NgramMatches > 0 || 472 s.NgramLookups > 0 || 473 s.ShardFilesConsidered > 0 || 474 s.ShardsScanned > 0 || 475 s.ShardsSkipped > 0 || 476 s.ShardsSkippedFilter > 0 || 477 s.Wait > 0 || 478 s.MatchTreeConstruction > 0 || 479 s.MatchTreeSearch > 0 || 480 s.RegexpsConsidered > 0) 481} 482 483// Progress contains information about the global progress of the running search query. 484// This is used by the frontend to reorder results and emit them when stable. 485// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 486type Progress struct { 487 // Priority of the shard that was searched. 488 Priority float64 489 490 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 491 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 492 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 493 // 494 // MaxPendingPriority decreases monotonically in each SearchResult. 495 MaxPendingPriority float64 496} 497 498func (p *Progress) sizeBytes() uint64 { 499 return 2 * 8 500} 501 502// SearchResult contains search matches and extra data 503type SearchResult struct { 504 Stats 505 506 // Do not encode this as we cannot encode -Inf in JSON 507 Progress `json:"-"` 508 509 Files []FileMatch 510 511 // RepoURLs holds a repo => template string map. 512 RepoURLs map[string]string 513 514 // FragmentNames holds a repo => template string map, for 515 // the line number fragment. 516 LineFragments map[string]string 517} 518 519// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 520// The estimate does not take alignment into account. The result is a lower 521// bound on the actual size in memory. 522func (sr *SearchResult) SizeBytes() (sz uint64) { 523 sz += sr.Stats.sizeBytes() 524 sz += sr.Progress.sizeBytes() 525 526 // Files 527 sz += sliceHeaderBytes 528 for _, f := range sr.Files { 529 sz += f.sizeBytes() 530 } 531 532 // RepoURLs 533 sz += mapHeaderBytes 534 for k, v := range sr.RepoURLs { 535 sz += stringHeaderBytes + uint64(len(k)) 536 sz += stringHeaderBytes + uint64(len(v)) 537 } 538 539 // LineFragments 540 sz += mapHeaderBytes 541 for k, v := range sr.LineFragments { 542 sz += stringHeaderBytes + uint64(len(k)) 543 sz += stringHeaderBytes + uint64(len(v)) 544 } 545 546 return 547} 548 549// RepositoryBranch describes an indexed branch, which is a name 550// combined with a version. 551type RepositoryBranch struct { 552 Name string 553 Version string 554} 555 556func (r RepositoryBranch) String() string { 557 return fmt.Sprintf("%s@%s", r.Name, r.Version) 558} 559 560// Repository holds repository metadata. 561type Repository struct { 562 // Sourcegraph's tenant ID 563 TenantID int 564 565 // Sourcegraph's repository ID 566 ID uint32 567 568 // The repository name 569 Name string 570 571 // The repository URL. 572 URL string 573 574 // The physical source where this repo came from, eg. full 575 // path to the zip filename or git repository directory. This 576 // will not be exposed in the UI, but can be used to detect 577 // orphaned index shards. 578 Source string 579 580 // The branches indexed in this repo. 581 Branches []RepositoryBranch 582 583 // Nil if this is not the super project. 584 SubRepoMap map[string]*Repository 585 586 // URL template to link to the commit of a branch 587 CommitURLTemplate string 588 589 // The repository URL for getting to a file. Has access to 590 // {{.Version}}, {{.Path}} 591 FileURLTemplate string 592 593 // The URL fragment to add to a file URL for line numbers. has 594 // access to {{.LineNumber}}. The fragment should include the 595 // separator, generally '#' or ';'. 596 LineFragmentTemplate string 597 598 // Perf optimization: priority is set when we load the shard. It corresponds to 599 // the value of "priority" stored in RawConfig. 600 priority float64 601 602 // All zoekt.* configuration settings. 603 RawConfig map[string]string 604 605 // Importance of the repository, bigger is more important 606 Rank uint16 607 608 // IndexOptions is a hash of the options used to create the index for the 609 // repo. 610 IndexOptions string 611 612 // HasSymbols is true if this repository has indexed ctags 613 // output. Sourcegraph specific: This field is more appropriate for 614 // IndexMetadata. However, we store it here since the Sourcegraph frontend 615 // can read this structure but not IndexMetadata. 616 HasSymbols bool 617 618 // Tombstone is true if we are not allowed to search this repo. 619 Tombstone bool 620 621 // LatestCommitDate is the date of the latest commit among all indexed Branches. 622 // The date might be time.Time's 0-value if the repository was last indexed 623 // before this field was added. 624 LatestCommitDate time.Time 625 626 // FileTombstones is a set of file paths that should be ignored across all branches 627 // in this shard. 628 FileTombstones map[string]struct{} `json:",omitempty"` 629} 630 631func (r *Repository) UnmarshalJSON(data []byte) error { 632 // We define a new type so that we can use json.Unmarshal 633 // without recursing into this same method. 634 type repository *Repository 635 repo := repository(r) 636 637 err := json.Unmarshal(data, repo) 638 if err != nil { 639 return err 640 } 641 642 if v, ok := repo.RawConfig["repoid"]; ok { 643 id, _ := strconv.ParseUint(v, 10, 32) 644 r.ID = uint32(id) 645 } 646 647 if v, ok := repo.RawConfig["tenantID"]; ok { 648 id, _ := strconv.ParseInt(v, 10, 64) 649 r.TenantID = int(id) 650 } 651 652 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 653 // on read instead of during indexing allows us to avoid a complete reindex. 654 // 655 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 656 // backwards compatibility. 657 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 658 // We use the number of months since 1970 as a simple measure of repo freshness. 659 // It is monotonically increasing and stable across re-indexes and restarts. 660 r.Rank = monthsSince1970(repo.LatestCommitDate) 661 } else if v, ok := repo.RawConfig["priority"]; ok { 662 r.priority, err = strconv.ParseFloat(v, 64) 663 if err != nil { 664 r.priority = 0 665 } 666 667 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 668 // based on priority. Setting it on read instead of during indexing 669 // allows us to avoid a complete reindex. 670 if r.Rank == 0 && r.priority > 0 { 671 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 672 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 673 // returns from more stars. 674 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) 675 } 676 } 677 678 return nil 679} 680 681// monthsSince1970 returns the number of months since 1970. It returns values in 682// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 683// lower bound for all dates before 1970. 684func monthsSince1970(t time.Time) uint16 { 685 base := time.Unix(0, 0) 686 if t.Before(base) { 687 return 0 688 } 689 months := int(t.Year()-1970)*12 + int(t.Month()-1) 690 return uint16(min(months, maxUInt16)) 691} 692 693// MergeMutable will merge x into r. mutated will be true if it made any 694// changes. err is non-nil if we needed to mutate an immutable field. 695// 696// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 697// computed while indexing so can't be synthesized from x. 698// 699// Note: We ignore RawConfig fields which are duplicated into Repository: 700// name and id. 701func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 702 if r.ID != x.ID { 703 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 704 return mutated, errors.New("ID is immutable") 705 } 706 if r.Name != x.Name { 707 // Name is encoded into the shard name on disk. We need to re-index if it 708 // changes. 709 return mutated, errors.New("Name is immutable") 710 } 711 if !reflect.DeepEqual(r.Branches, x.Branches) { 712 // Need a reindex if content changing. 713 return mutated, errors.New("Branches is immutable") 714 } 715 716 for k, v := range x.RawConfig { 717 // We ignore name and id since they are encoded into the repository. 718 if k == "name" || k == "id" { 719 continue 720 } 721 if r.RawConfig == nil { 722 mutated = true 723 r.RawConfig = make(map[string]string) 724 } 725 if r.RawConfig[k] != v { 726 mutated = true 727 r.RawConfig[k] = v 728 } 729 } 730 731 if r.URL != x.URL { 732 mutated = true 733 r.URL = x.URL 734 } 735 if r.CommitURLTemplate != x.CommitURLTemplate { 736 mutated = true 737 r.CommitURLTemplate = x.CommitURLTemplate 738 } 739 if r.FileURLTemplate != x.FileURLTemplate { 740 mutated = true 741 r.FileURLTemplate = x.FileURLTemplate 742 } 743 if r.LineFragmentTemplate != x.LineFragmentTemplate { 744 mutated = true 745 r.LineFragmentTemplate = x.LineFragmentTemplate 746 } 747 748 return mutated, nil 749} 750 751// IndexMetadata holds metadata stored in the index file. It contains 752// data generated by the core indexing library. 753type IndexMetadata struct { 754 IndexFormatVersion int 755 IndexFeatureVersion int 756 IndexMinReaderVersion int 757 IndexTime time.Time 758 PlainASCII bool 759 LanguageMap map[string]uint16 760 ZoektVersion string 761 ID string 762} 763 764// Statistics of a (collection of) repositories. 765type RepoStats struct { 766 // Repos is used for aggregrating the number of repositories. 767 // 768 // Note: This field is not populated on RepoListEntry.Stats (individual) but 769 // only for RepoList.Stats (aggregate). 770 Repos int 771 772 // Shards is the total number of search shards. 773 Shards int 774 775 // Documents holds the number of documents or files. 776 Documents int 777 778 // IndexBytes is the amount of RAM used for index overhead. 779 IndexBytes int64 780 781 // ContentBytes is the amount of RAM used for raw content. 782 ContentBytes int64 783 784 // Sourcegraph specific stats below. These are not as efficient to calculate 785 // as the above statistics. We experimentally measured about a 10% slower 786 // shard load time. However, we find these values very useful to track and 787 // computing them outside of load time introduces a lot of complexity. 788 789 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 790 // indexed documents. This is not exactly the same as line count, since it 791 // will not include lines not terminated by "\n" (eg a file with no "\n", or 792 // a final line without "\n"). Note: Zoekt deduplicates documents across 793 // branches, so if a path has the same contents on multiple branches, there 794 // is only one document for it. As such that document's newlines is only 795 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 796 // for counts which do not deduplicate. 797 NewLinesCount uint64 798 799 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 800 // branch. 801 DefaultBranchNewLinesCount uint64 802 803 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 804 // except the default branch. 805 OtherBranchesNewLinesCount uint64 806} 807 808func (s *RepoStats) Add(o *RepoStats) { 809 // can't update Repos, since one repo may have multiple 810 // shards. 811 s.Shards += o.Shards 812 s.IndexBytes += o.IndexBytes 813 s.Documents += o.Documents 814 s.ContentBytes += o.ContentBytes 815 816 // Sourcegraph specific 817 s.NewLinesCount += o.NewLinesCount 818 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 819 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 820} 821 822type RepoListEntry struct { 823 Repository Repository 824 IndexMetadata IndexMetadata 825 Stats RepoStats 826} 827 828// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 829// performance profiling of sourcegraph.com revealed that querying this 830// information from Zoekt was causing lots of CPU and memory usage. Note: we 831// can revisit this, how we store and query this information has changed a lot 832// since this was introduced. 833type MinimalRepoListEntry struct { 834 // HasSymbols is exported since Sourcegraph uses this information at search 835 // planning time to decide between Zoekt and an unindexed symbol search. 836 // 837 // Note: it pretty much is always true in practice. 838 HasSymbols bool 839 840 // Branches is used by Sourcegraphs query planner to decided if it can use 841 // zoekt or go via an unindexed code path. 842 Branches []RepositoryBranch 843 844 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 845 // since the epoch). This is to make it clear we are not transporting the 846 // full fidelty timestamp (ie with milliseconds and location). Additionally 847 // it saves 16 bytes in this struct. 848 // 849 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 850 // how many repositories need updating after a ranking change/etc. 851 // 852 // TODO(keegancsmith) audit updates to IndexTime and document how and when 853 // it changes. Concerned about things like metadata updates or compound 854 // shards leading to untrustworthy data here. 855 IndexTimeUnix int64 856} 857 858type ReposMap map[uint32]MinimalRepoListEntry 859 860// MarshalBinary implements a specialized encoder for ReposMap. 861func (q *ReposMap) MarshalBinary() ([]byte, error) { 862 return reposMapEncode(*q) 863} 864 865// UnmarshalBinary implements a specialized decoder for ReposMap. 866func (q *ReposMap) UnmarshalBinary(b []byte) error { 867 var err error 868 (*q), err = reposMapDecode(b) 869 return err 870} 871 872// RepoList holds a set of Repository metadata. 873type RepoList struct { 874 // Returned when ListOptions.Field is RepoListFieldRepos. 875 Repos []*RepoListEntry 876 877 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 878 ReposMap ReposMap 879 880 Crashes int 881 882 // Stats response to a List request. 883 // This is the aggregate RepoStats of all repos matching the input query. 884 Stats RepoStats 885} 886 887type Searcher interface { 888 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 889 890 // List lists repositories. The query `q` can only contain 891 // query.Repo atoms. 892 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 893 Close() 894 895 // Describe the searcher for debug messages. 896 String() string 897} 898 899type RepoListField int 900 901const ( 902 RepoListFieldRepos RepoListField = 0 903 RepoListFieldReposMap = 2 904) 905 906type ListOptions struct { 907 // Field decides which field to populate in RepoList response. 908 Field RepoListField 909} 910 911func (o *ListOptions) GetField() (RepoListField, error) { 912 if o == nil { 913 return RepoListFieldRepos, nil 914 } 915 switch o.Field { 916 case RepoListFieldRepos, RepoListFieldReposMap: 917 return o.Field, nil 918 case 1: 919 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 920 default: 921 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 922 } 923} 924 925func (o *ListOptions) String() string { 926 return fmt.Sprintf("%#v", o) 927} 928 929type SearchOptions struct { 930 // Return an upper-bound estimate of eligible documents in 931 // stats.ShardFilesConsidered. 932 EstimateDocCount bool 933 934 // Return the whole file. 935 Whole bool 936 937 // Maximum number of matches: skip all processing an index 938 // shard after we found this many non-overlapping matches. 939 ShardMaxMatchCount int 940 941 // Maximum number of matches: stop looking for more matches 942 // once we have this many matches across shards. 943 TotalMaxMatchCount int 944 945 // Maximum number of matches: skip processing documents for a repository in 946 // a shard once we have found ShardRepoMaxMatchCount. 947 // 948 // A compound shard may contain multiple repositories. This will most often 949 // be set to 1 to find all repositories containing a result. 950 ShardRepoMaxMatchCount int 951 952 // Abort the search after this much time has passed. 953 MaxWallTime time.Duration 954 955 // FlushWallTime if non-zero will stop streaming behaviour at first and 956 // instead will collate and sort results. At FlushWallTime the results will 957 // be sent and then the behaviour will revert to the normal streaming. 958 FlushWallTime time.Duration 959 960 // Truncates the number of documents (i.e. files) after collating and 961 // sorting the results. 962 MaxDocDisplayCount int 963 964 // Truncates the number of matchs after collating and sorting the results. 965 MaxMatchDisplayCount int 966 967 // If set to a number greater than zero then up to this many number 968 // of context lines will be added before and after each matched line. 969 // Note that the included context lines might contain matches and 970 // it's up to the consumer of the result to remove those lines. 971 NumContextLines int 972 973 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 974 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 975 ChunkMatches bool 976 977 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 978 // scoring formula. The scoring algorithm treats each match in a file as a term 979 // and computes an approximation to BM25. 980 // 981 // The calculation of IDF assumes that Zoekt visits all documents containing any 982 // of the query terms during evaluation. This is true, for example, if all query 983 // terms are ORed together. 984 // 985 // When enabled, all other scoring signals are ignored, including document ranks. 986 UseBM25Scoring bool 987 988 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 989 // a command-line flag 990 Trace bool 991 992 // If set, the search results will contain debug information for scoring. 993 DebugScore bool 994 995 // SpanContext is the opentracing span context, if it exists, from the zoekt client 996 SpanContext map[string]string 997} 998 999// String returns a succinct representation of the options. This is meant for 1000// human consumption in logs and traces. 1001// 1002// Note: some tracing systems have limits on length of values, so we take care 1003// to try and make this small, and include the important information near the 1004// front incase of truncation. 1005func (s *SearchOptions) String() string { 1006 var b strings.Builder 1007 1008 add := func(name, value string) { 1009 b.WriteString(name) 1010 b.WriteByte('=') 1011 b.WriteString(value) 1012 b.WriteByte(' ') 1013 } 1014 addInt := func(name string, value int) { 1015 if value != 0 { 1016 add(name, strconv.Itoa(value)) 1017 } 1018 } 1019 addDuration := func(name string, value time.Duration) { 1020 if value != 0 { 1021 add(name, value.String()) 1022 } 1023 } 1024 addBool := func(name string, value bool) { 1025 if !value { 1026 return 1027 } 1028 b.WriteString(name) 1029 b.WriteByte(' ') 1030 } 1031 1032 b.WriteString("zoekt.SearchOptions{ ") 1033 1034 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1035 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1036 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1037 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1038 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1039 addInt("NumContextLines", s.NumContextLines) 1040 1041 addDuration("MaxWallTime", s.MaxWallTime) 1042 addDuration("FlushWallTime", s.FlushWallTime) 1043 1044 addBool("EstimateDocCount", s.EstimateDocCount) 1045 addBool("Whole", s.Whole) 1046 addBool("ChunkMatches", s.ChunkMatches) 1047 addBool("UseBM25Scoring", s.UseBM25Scoring) 1048 addBool("Trace", s.Trace) 1049 addBool("DebugScore", s.DebugScore) 1050 1051 for k, v := range s.SpanContext { 1052 add("SpanContext."+k, strconv.Quote(v)) 1053 } 1054 1055 b.WriteByte('}') 1056 return b.String() 1057} 1058 1059// Sender is the interface that wraps the basic Send method. 1060type Sender interface { 1061 Send(*SearchResult) 1062} 1063 1064// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1065// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1066// that calls f. 1067type SenderFunc func(result *SearchResult) 1068 1069func (f SenderFunc) Send(result *SearchResult) { 1070 f(result) 1071} 1072 1073// Streamer adds the method StreamSearch to the Searcher interface. 1074type Streamer interface { 1075 Searcher 1076 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1077}