fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 32 kB View raw
1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "math" 23 "reflect" 24 "strconv" 25 "strings" 26 "time" 27 28 "github.com/sourcegraph/zoekt/query" 29) 30 31const ( 32 mapHeaderBytes uint64 = 48 33 sliceHeaderBytes uint64 = 24 34 stringHeaderBytes uint64 = 16 35 pointerSize uint64 = 8 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// AddScore increments the score of the FileMatch by the computed score. If 139// debugScore is true, it also adds a debug string to the FileMatch. If raw is 140// -1, it is ignored. Otherwise, it is added to the debug string. 141func (m *FileMatch) AddScore(what string, computed float64, raw float64, debugScore bool) { 142 if computed != 0 && debugScore { 143 var b strings.Builder 144 fmt.Fprintf(&b, "%s", what) 145 if raw != -1 { 146 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 147 } 148 fmt.Fprintf(&b, ":%.2f, ", computed) 149 m.Debug += b.String() 150 } 151 m.Score += computed 152} 153 154// ChunkMatch is a set of non-overlapping matches within a contiguous range of 155// lines in the file. 156type ChunkMatch struct { 157 DebugScore string 158 159 // Content is a contiguous range of complete lines that fully contains Ranges. 160 // Lines will always include their terminating newline (if it exists). 161 Content []byte 162 163 // Ranges is a set of matching ranges within this chunk. Each range is relative 164 // to the beginning of the file (not the beginning of Content). 165 Ranges []Range 166 167 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 168 // its length will equal that of Ranges. Any of its elements may be nil. 169 SymbolInfo []*Symbol 170 171 // FileName indicates whether this match is a match on the file name, in 172 // which case Content will contain the file name. 173 FileName bool 174 175 // ContentStart is the location (inclusive) of the beginning of content 176 // relative to the beginning of the file. It will always be at the 177 // beginning of a line (Column will always be 1). 178 ContentStart Location 179 180 // Score is the overall relevance score of this chunk. 181 Score float64 182 183 // BestLineMatch is the line number of the highest-scoring line match in this chunk. 184 // The line number represents the index in the full file, and is 1-based. If FileName: true, 185 // this number will be 0. 186 BestLineMatch uint32 187} 188 189func (cm *ChunkMatch) sizeBytes() (sz uint64) { 190 // Content 191 sz += sliceHeaderBytes + uint64(len(cm.Content)) 192 193 // ContentStart 194 sz += cm.ContentStart.sizeBytes() 195 196 // FileName 197 sz += 1 198 199 // Ranges 200 sz += sliceHeaderBytes 201 if len(cm.Ranges) > 0 { 202 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 203 } 204 205 // SymbolInfo 206 sz += sliceHeaderBytes 207 for _, si := range cm.SymbolInfo { 208 sz += pointerSize 209 if si != nil { 210 sz += si.sizeBytes() 211 } 212 } 213 214 // Score 215 sz += 8 216 217 // DebugScore 218 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 219 220 return 221} 222 223type Range struct { 224 // The inclusive beginning of the range. 225 Start Location 226 // The exclusive end of the range. 227 End Location 228} 229 230func (r *Range) sizeBytes() uint64 { 231 return r.Start.sizeBytes() + r.End.sizeBytes() 232} 233 234type Location struct { 235 // 0-based byte offset from the beginning of the file 236 ByteOffset uint32 237 // 1-based line number from the beginning of the file 238 LineNumber uint32 239 // 1-based column number (in runes) from the beginning of line 240 Column uint32 241} 242 243func (l *Location) sizeBytes() uint64 { 244 return 3 * 4 245} 246 247// LineMatch holds the matches within a single line in a file. 248type LineMatch struct { 249 // The line in which a match was found. 250 Line []byte 251 // The byte offset of the first byte of the line. 252 LineStart int 253 // The byte offset of the first byte past the end of the line. 254 // This is usually the byte after the terminating newline, but can also be 255 // the end of the file if there is no terminating newline 256 LineEnd int 257 LineNumber int 258 259 // Before and After are only set when SearchOptions.NumContextLines is > 0 260 Before []byte 261 After []byte 262 263 // If set, this was a match on the filename. 264 FileName bool 265 266 // The higher the better. Only ranks the quality of the match 267 // within the file, does not take rank of file into account 268 Score float64 269 DebugScore string 270 271 LineFragments []LineFragmentMatch 272} 273 274func (lm *LineMatch) sizeBytes() (sz uint64) { 275 // Line 276 sz += sliceHeaderBytes + uint64(len(lm.Line)) 277 278 // LineStart, LineEnd, LineNumber 279 sz += 3 * 8 280 281 // Before 282 sz += sliceHeaderBytes + uint64(len(lm.Before)) 283 284 // After 285 sz += sliceHeaderBytes + uint64(len(lm.After)) 286 287 // FileName 288 sz += 1 289 290 // Score 291 sz += 8 292 293 // DebugScore 294 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 295 296 // LineFragments 297 sz += sliceHeaderBytes 298 for _, lf := range lm.LineFragments { 299 sz += lf.sizeBytes() 300 } 301 302 return 303} 304 305type Symbol struct { 306 Sym string 307 Kind string 308 Parent string 309 ParentKind string 310} 311 312func (s *Symbol) sizeBytes() uint64 { 313 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 314} 315 316// LineFragmentMatch a segment of matching text within a line. 317type LineFragmentMatch struct { 318 // Offset within the line, in bytes. 319 LineOffset int 320 321 // Offset from file start, in bytes. 322 Offset uint32 323 324 // Number bytes that match. 325 MatchLength int 326 327 SymbolInfo *Symbol 328} 329 330func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 331 // LineOffset 332 sz += 8 333 334 // Offset 335 sz += 4 336 337 // MatchLength 338 sz += 8 339 340 // SymbolInfo 341 sz += pointerSize 342 if lfm.SymbolInfo != nil { 343 sz += lfm.SymbolInfo.sizeBytes() 344 } 345 346 return 347} 348 349type FlushReason uint8 350 351const ( 352 FlushReasonTimerExpired FlushReason = 1 << iota 353 FlushReasonFinalFlush 354 FlushReasonMaxSize 355) 356 357var FlushReasonStrings = map[FlushReason]string{ 358 FlushReasonTimerExpired: "timer_expired", 359 FlushReasonFinalFlush: "final_flush", 360 FlushReasonMaxSize: "max_size_reached", 361} 362 363func (fr FlushReason) String() string { 364 if v, ok := FlushReasonStrings[fr]; ok { 365 return v 366 } 367 368 return "none" 369} 370 371// Stats contains interesting numbers on the search 372type Stats struct { 373 // Amount of I/O for reading contents. 374 ContentBytesLoaded int64 375 376 // Amount of I/O for reading from index. 377 IndexBytesLoaded int64 378 379 // Number of search shards that had a crash. 380 Crashes int 381 382 // Wall clock time for this search 383 Duration time.Duration 384 385 // Number of files containing a match. 386 FileCount int 387 388 // Number of files in shards that we considered. 389 ShardFilesConsidered int 390 391 // Files that we evaluated. Equivalent to files for which all 392 // atom matches (including negations) evaluated to true. 393 FilesConsidered int 394 395 // Files for which we loaded file content to verify substring matches 396 FilesLoaded int 397 398 // Candidate files whose contents weren't examined because we 399 // gathered enough matches. 400 FilesSkipped int 401 402 // Shards that we scanned to find matches. 403 ShardsScanned int 404 405 // Shards that we did not process because a query was canceled. 406 ShardsSkipped int 407 408 // Shards that we did not process because the query was rejected by the 409 // ngram filter indicating it had no matches. 410 ShardsSkippedFilter int 411 412 // Number of non-overlapping matches 413 MatchCount int 414 415 // Number of candidate matches as a result of searching ngrams. 416 NgramMatches int 417 418 // NgramLookups is the number of times we accessed an ngram in the index. 419 NgramLookups int 420 421 // Wall clock time for queued search. 422 Wait time.Duration 423 424 // Aggregate wall clock time spent constructing and pruning the match tree. 425 // This accounts for time such as lookups in the trigram index. 426 MatchTreeConstruction time.Duration 427 428 // Aggregate wall clock time spent searching the match tree. This accounts 429 // for the bulk of search work done looking for matches. 430 MatchTreeSearch time.Duration 431 432 // Number of times regexp was called on files that we evaluated. 433 RegexpsConsidered int 434 435 // FlushReason explains why results were flushed. 436 FlushReason FlushReason 437} 438 439func (s *Stats) sizeBytes() (sz uint64) { 440 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 441 sz += 1 // FlushReason 442 443 return 444} 445 446func (s *Stats) Add(o Stats) { 447 s.ContentBytesLoaded += o.ContentBytesLoaded 448 s.IndexBytesLoaded += o.IndexBytesLoaded 449 s.Crashes += o.Crashes 450 s.FileCount += o.FileCount 451 s.FilesConsidered += o.FilesConsidered 452 s.FilesLoaded += o.FilesLoaded 453 s.FilesSkipped += o.FilesSkipped 454 s.MatchCount += o.MatchCount 455 s.NgramMatches += o.NgramMatches 456 s.NgramLookups += o.NgramLookups 457 s.ShardFilesConsidered += o.ShardFilesConsidered 458 s.ShardsScanned += o.ShardsScanned 459 s.ShardsSkipped += o.ShardsSkipped 460 s.ShardsSkippedFilter += o.ShardsSkippedFilter 461 s.Wait += o.Wait 462 s.MatchTreeConstruction += o.MatchTreeConstruction 463 s.MatchTreeSearch += o.MatchTreeSearch 464 s.RegexpsConsidered += o.RegexpsConsidered 465 466 // We want the first non-zero FlushReason to be sticky. This is a useful 467 // property when aggregating stats from several Zoekts. 468 if s.FlushReason == 0 { 469 s.FlushReason = o.FlushReason 470 } 471} 472 473// Zero returns true if stats is empty. 474func (s *Stats) Zero() bool { 475 if s == nil { 476 return true 477 } 478 479 return !(s.ContentBytesLoaded > 0 || 480 s.IndexBytesLoaded > 0 || 481 s.Crashes > 0 || 482 s.FileCount > 0 || 483 s.FilesConsidered > 0 || 484 s.FilesLoaded > 0 || 485 s.FilesSkipped > 0 || 486 s.MatchCount > 0 || 487 s.NgramMatches > 0 || 488 s.NgramLookups > 0 || 489 s.ShardFilesConsidered > 0 || 490 s.ShardsScanned > 0 || 491 s.ShardsSkipped > 0 || 492 s.ShardsSkippedFilter > 0 || 493 s.Wait > 0 || 494 s.MatchTreeConstruction > 0 || 495 s.MatchTreeSearch > 0 || 496 s.RegexpsConsidered > 0) 497} 498 499// Progress contains information about the global progress of the running search query. 500// This is used by the frontend to reorder results and emit them when stable. 501// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 502type Progress struct { 503 // Priority of the shard that was searched. 504 Priority float64 505 506 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 507 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 508 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 509 // 510 // MaxPendingPriority decreases monotonically in each SearchResult. 511 MaxPendingPriority float64 512} 513 514func (p *Progress) sizeBytes() uint64 { 515 return 2 * 8 516} 517 518// SearchResult contains search matches and extra data 519type SearchResult struct { 520 Stats 521 522 // Do not encode this as we cannot encode -Inf in JSON 523 Progress `json:"-"` 524 525 Files []FileMatch 526 527 // RepoURLs holds a repo => template string map. 528 RepoURLs map[string]string 529 530 // FragmentNames holds a repo => template string map, for 531 // the line number fragment. 532 LineFragments map[string]string 533} 534 535// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 536// The estimate does not take alignment into account. The result is a lower 537// bound on the actual size in memory. 538func (sr *SearchResult) SizeBytes() (sz uint64) { 539 sz += sr.Stats.sizeBytes() 540 sz += sr.Progress.sizeBytes() 541 542 // Files 543 sz += sliceHeaderBytes 544 for _, f := range sr.Files { 545 sz += f.sizeBytes() 546 } 547 548 // RepoURLs 549 sz += mapHeaderBytes 550 for k, v := range sr.RepoURLs { 551 sz += stringHeaderBytes + uint64(len(k)) 552 sz += stringHeaderBytes + uint64(len(v)) 553 } 554 555 // LineFragments 556 sz += mapHeaderBytes 557 for k, v := range sr.LineFragments { 558 sz += stringHeaderBytes + uint64(len(k)) 559 sz += stringHeaderBytes + uint64(len(v)) 560 } 561 562 return 563} 564 565// RepositoryBranch describes an indexed branch, which is a name 566// combined with a version. 567type RepositoryBranch struct { 568 Name string 569 Version string 570} 571 572func (r RepositoryBranch) String() string { 573 return fmt.Sprintf("%s@%s", r.Name, r.Version) 574} 575 576// Repository holds repository metadata. 577type Repository struct { 578 // Sourcegraph's tenant ID 579 TenantID int 580 581 // Sourcegraph's repository ID 582 ID uint32 583 584 // The repository name 585 Name string 586 587 // The repository URL. 588 URL string 589 590 // Additional metadata about the repository. 591 Metadata map[string]string 592 593 // The physical source where this repo came from, eg. full 594 // path to the zip filename or git repository directory. This 595 // will not be exposed in the UI, but can be used to detect 596 // orphaned index shards. 597 Source string 598 599 // The branches indexed in this repo. 600 Branches []RepositoryBranch 601 602 // Nil if this is not the super project. 603 SubRepoMap map[string]*Repository 604 605 // URL template to link to the commit of a branch 606 CommitURLTemplate string 607 608 // The repository URL for getting to a file. Has access to 609 // {{.Version}}, {{.Path}} 610 FileURLTemplate string 611 612 // The URL fragment to add to a file URL for line numbers. has 613 // access to {{.LineNumber}}. The fragment should include the 614 // separator, generally '#' or ';'. 615 LineFragmentTemplate string 616 617 // Perf optimization: priority is set when we load the shard. It corresponds to 618 // the value of "priority" stored in RawConfig. 619 priority float64 620 621 // All zoekt.* configuration settings. 622 RawConfig map[string]string 623 624 // Importance of the repository, bigger is more important 625 Rank uint16 626 627 // IndexOptions is a hash of the options used to create the index for the 628 // repo. 629 IndexOptions string 630 631 // HasSymbols is true if this repository has indexed ctags 632 // output. Sourcegraph specific: This field is more appropriate for 633 // IndexMetadata. However, we store it here since the Sourcegraph frontend 634 // can read this structure but not IndexMetadata. 635 HasSymbols bool 636 637 // Tombstone is true if we are not allowed to search this repo. 638 Tombstone bool 639 640 // LatestCommitDate is the date of the latest commit among all indexed Branches. 641 // The date might be time.Time's 0-value if the repository was last indexed 642 // before this field was added. 643 LatestCommitDate time.Time 644 645 // FileTombstones is a set of file paths that should be ignored across all branches 646 // in this shard. 647 FileTombstones map[string]struct{} `json:",omitempty"` 648} 649 650func (r *Repository) UnmarshalJSON(data []byte) error { 651 // We define a new type so that we can use json.Unmarshal 652 // without recursing into this same method. 653 type repository Repository 654 repo := (*repository)(r) 655 656 err := json.Unmarshal(data, repo) 657 if err != nil { 658 return err 659 } 660 661 if v, ok := repo.RawConfig["repoid"]; ok { 662 id, _ := strconv.ParseUint(v, 10, 32) 663 r.ID = uint32(id) 664 } 665 666 if v, ok := repo.RawConfig["tenantID"]; ok { 667 id, _ := strconv.ParseInt(v, 10, 64) 668 r.TenantID = int(id) 669 } 670 671 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 672 // on read instead of during indexing allows us to avoid a complete reindex. 673 // 674 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 675 // backwards compatibility. 676 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 677 // We use the number of months since 1970 as a simple measure of repo freshness. 678 // It is monotonically increasing and stable across re-indexes and restarts. 679 r.Rank = monthsSince1970(repo.LatestCommitDate) 680 } else if v, ok := repo.RawConfig["priority"]; ok { 681 r.priority, err = strconv.ParseFloat(v, 64) 682 if err != nil { 683 r.priority = 0 684 } 685 686 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 687 // based on priority. Setting it on read instead of during indexing 688 // allows us to avoid a complete reindex. 689 if r.Rank == 0 && r.priority > 0 { 690 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 691 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 692 // returns from more stars. 693 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16) 694 } 695 } 696 697 return nil 698} 699 700func (r *Repository) GetPriority() float64 { 701 return r.priority 702} 703 704// monthsSince1970 returns the number of months since 1970. It returns values in 705// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 706// lower bound for all dates before 1970. 707func monthsSince1970(t time.Time) uint16 { 708 base := time.Unix(0, 0) 709 if t.Before(base) { 710 return 0 711 } 712 months := int(t.Year()-1970)*12 + int(t.Month()-1) 713 return uint16(min(months, math.MaxUint16)) 714} 715 716// MergeMutable will merge x into r. mutated will be true if it made any 717// changes. err is non-nil if we needed to mutate an immutable field. 718// 719// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 720// computed while indexing so can't be synthesized from x. 721// 722// Note: We ignore RawConfig fields which are duplicated into Repository: 723// name and id. 724func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 725 if r.ID != x.ID { 726 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 727 return mutated, errors.New("ID is immutable") 728 } 729 if r.Name != x.Name { 730 // Name is encoded into the shard name on disk. We need to re-index if it 731 // changes. 732 return mutated, errors.New("Name is immutable") 733 } 734 if !reflect.DeepEqual(r.Branches, x.Branches) { 735 // Need a reindex if content changing. 736 return mutated, errors.New("Branches is immutable") 737 } 738 739 for k, v := range x.RawConfig { 740 // We ignore name and id since they are encoded into the repository. 741 if k == "name" || k == "id" { 742 continue 743 } 744 if r.RawConfig == nil { 745 mutated = true 746 r.RawConfig = make(map[string]string) 747 } 748 if r.RawConfig[k] != v { 749 mutated = true 750 r.RawConfig[k] = v 751 } 752 } 753 754 if r.URL != x.URL { 755 mutated = true 756 r.URL = x.URL 757 } 758 if r.CommitURLTemplate != x.CommitURLTemplate { 759 mutated = true 760 r.CommitURLTemplate = x.CommitURLTemplate 761 } 762 if r.FileURLTemplate != x.FileURLTemplate { 763 mutated = true 764 r.FileURLTemplate = x.FileURLTemplate 765 } 766 if r.LineFragmentTemplate != x.LineFragmentTemplate { 767 mutated = true 768 r.LineFragmentTemplate = x.LineFragmentTemplate 769 } 770 771 return mutated, nil 772} 773 774// IndexMetadata holds metadata stored in the index file. It contains 775// data generated by the core indexing library. 776type IndexMetadata struct { 777 IndexFormatVersion int 778 IndexFeatureVersion int 779 IndexMinReaderVersion int 780 IndexTime time.Time 781 PlainASCII bool 782 LanguageMap map[string]uint16 783 ZoektVersion string 784 ID string 785} 786 787// Statistics of a (collection of) repositories. 788type RepoStats struct { 789 // Repos is used for aggregrating the number of repositories. 790 // 791 // Note: This field is not populated on RepoListEntry.Stats (individual) but 792 // only for RepoList.Stats (aggregate). 793 Repos int 794 795 // Shards is the total number of search shards. 796 Shards int 797 798 // Documents holds the number of documents or files. 799 Documents int 800 801 // IndexBytes is the amount of RAM used for index overhead. 802 IndexBytes int64 803 804 // ContentBytes is the amount of RAM used for raw content. 805 ContentBytes int64 806 807 // Sourcegraph specific stats below. These are not as efficient to calculate 808 // as the above statistics. We experimentally measured about a 10% slower 809 // shard load time. However, we find these values very useful to track and 810 // computing them outside of load time introduces a lot of complexity. 811 812 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 813 // indexed documents. This is not exactly the same as line count, since it 814 // will not include lines not terminated by "\n" (eg a file with no "\n", or 815 // a final line without "\n"). Note: Zoekt deduplicates documents across 816 // branches, so if a path has the same contents on multiple branches, there 817 // is only one document for it. As such that document's newlines is only 818 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 819 // for counts which do not deduplicate. 820 NewLinesCount uint64 821 822 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 823 // branch. 824 DefaultBranchNewLinesCount uint64 825 826 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 827 // except the default branch. 828 OtherBranchesNewLinesCount uint64 829} 830 831func (s *RepoStats) Add(o *RepoStats) { 832 // can't update Repos, since one repo may have multiple 833 // shards. 834 s.Shards += o.Shards 835 s.IndexBytes += o.IndexBytes 836 s.Documents += o.Documents 837 s.ContentBytes += o.ContentBytes 838 839 // Sourcegraph specific 840 s.NewLinesCount += o.NewLinesCount 841 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 842 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 843} 844 845type RepoListEntry struct { 846 Repository Repository 847 IndexMetadata IndexMetadata 848 Stats RepoStats 849} 850 851// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 852// performance profiling of sourcegraph.com revealed that querying this 853// information from Zoekt was causing lots of CPU and memory usage. Note: we 854// can revisit this, how we store and query this information has changed a lot 855// since this was introduced. 856type MinimalRepoListEntry struct { 857 // HasSymbols is exported since Sourcegraph uses this information at search 858 // planning time to decide between Zoekt and an unindexed symbol search. 859 // 860 // Note: it pretty much is always true in practice. 861 HasSymbols bool 862 863 // Branches is used by Sourcegraphs query planner to decided if it can use 864 // zoekt or go via an unindexed code path. 865 Branches []RepositoryBranch 866 867 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 868 // since the epoch). This is to make it clear we are not transporting the 869 // full fidelty timestamp (ie with milliseconds and location). Additionally 870 // it saves 16 bytes in this struct. 871 // 872 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 873 // how many repositories need updating after a ranking change/etc. 874 // 875 // TODO(keegancsmith) audit updates to IndexTime and document how and when 876 // it changes. Concerned about things like metadata updates or compound 877 // shards leading to untrustworthy data here. 878 IndexTimeUnix int64 879} 880 881type ReposMap map[uint32]MinimalRepoListEntry 882 883// MarshalBinary implements a specialized encoder for ReposMap. 884func (q *ReposMap) MarshalBinary() ([]byte, error) { 885 return reposMapEncode(*q) 886} 887 888// UnmarshalBinary implements a specialized decoder for ReposMap. 889func (q *ReposMap) UnmarshalBinary(b []byte) error { 890 var err error 891 (*q), err = reposMapDecode(b) 892 return err 893} 894 895// RepoList holds a set of Repository metadata. 896type RepoList struct { 897 // Returned when ListOptions.Field is RepoListFieldRepos. 898 Repos []*RepoListEntry 899 900 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 901 ReposMap ReposMap 902 903 Crashes int 904 905 // Stats response to a List request. 906 // This is the aggregate RepoStats of all repos matching the input query. 907 Stats RepoStats 908} 909 910type Searcher interface { 911 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 912 913 // List lists repositories. The query `q` can only contain 914 // query.Repo atoms. 915 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 916 Close() 917 918 // Describe the searcher for debug messages. 919 String() string 920} 921 922type RepoListField int 923 924const ( 925 RepoListFieldRepos RepoListField = 0 926 RepoListFieldReposMap = 2 927) 928 929type ListOptions struct { 930 // Field decides which field to populate in RepoList response. 931 Field RepoListField 932} 933 934func (o *ListOptions) GetField() (RepoListField, error) { 935 if o == nil { 936 return RepoListFieldRepos, nil 937 } 938 switch o.Field { 939 case RepoListFieldRepos, RepoListFieldReposMap: 940 return o.Field, nil 941 case 1: 942 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 943 default: 944 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 945 } 946} 947 948func (o *ListOptions) String() string { 949 return fmt.Sprintf("%#v", o) 950} 951 952type SearchOptions struct { 953 // Return an upper-bound estimate of eligible documents in 954 // stats.ShardFilesConsidered. 955 EstimateDocCount bool 956 957 // Return the whole file. 958 Whole bool 959 960 // Maximum number of matches: skip all processing an index 961 // shard after we found this many non-overlapping matches. 962 ShardMaxMatchCount int 963 964 // Maximum number of matches: stop looking for more matches 965 // once we have this many matches across shards. 966 TotalMaxMatchCount int 967 968 // Maximum number of matches: skip processing documents for a repository in 969 // a shard once we have found ShardRepoMaxMatchCount. 970 // 971 // A compound shard may contain multiple repositories. This will most often 972 // be set to 1 to find all repositories containing a result. 973 ShardRepoMaxMatchCount int 974 975 // Abort the search after this much time has passed. 976 MaxWallTime time.Duration 977 978 // FlushWallTime if non-zero will stop streaming behaviour at first and 979 // instead will collate and sort results. At FlushWallTime the results will 980 // be sent and then the behaviour will revert to the normal streaming. 981 FlushWallTime time.Duration 982 983 // Truncates the number of documents (i.e. files) after collating and 984 // sorting the results. 985 MaxDocDisplayCount int 986 987 // Truncates the number of matchs after collating and sorting the results. 988 MaxMatchDisplayCount int 989 990 // If set to a number greater than zero then up to this many number 991 // of context lines will be added before and after each matched line. 992 // Note that the included context lines might contain matches and 993 // it's up to the consumer of the result to remove those lines. 994 NumContextLines int 995 996 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 997 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 998 ChunkMatches bool 999 1000 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 1001 // scoring formula. The scoring algorithm treats each match in a file as a term 1002 // and computes an approximation to BM25. When enabled, BM25 scoring is used for 1003 // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores. 1004 // 1005 // The calculation of IDF assumes that Zoekt visits all documents containing any 1006 // of the query terms during evaluation. This is true, for example, if all query 1007 // terms are ORed together. 1008 // 1009 // When enabled, all other scoring signals are ignored, including document ranks. 1010 UseBM25Scoring bool 1011 1012 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 1013 // a command-line flag 1014 Trace bool 1015 1016 // If set, the search results will contain debug information for scoring. 1017 DebugScore bool 1018 1019 // SpanContext is the opentracing span context, if it exists, from the zoekt client 1020 SpanContext map[string]string 1021} 1022 1023func (o *SearchOptions) SetDefaults() { 1024 if o.ShardMaxMatchCount == 0 { 1025 // We cap the total number of matches, so overly broad 1026 // searches don't crash the machine. 1027 o.ShardMaxMatchCount = 100000 1028 } 1029 if o.TotalMaxMatchCount == 0 { 1030 o.TotalMaxMatchCount = 10 * o.ShardMaxMatchCount 1031 } 1032} 1033 1034// String returns a succinct representation of the options. This is meant for 1035// human consumption in logs and traces. 1036// 1037// Note: some tracing systems have limits on length of values, so we take care 1038// to try and make this small, and include the important information near the 1039// front incase of truncation. 1040func (s *SearchOptions) String() string { 1041 var b strings.Builder 1042 1043 add := func(name, value string) { 1044 b.WriteString(name) 1045 b.WriteByte('=') 1046 b.WriteString(value) 1047 b.WriteByte(' ') 1048 } 1049 addInt := func(name string, value int) { 1050 if value != 0 { 1051 add(name, strconv.Itoa(value)) 1052 } 1053 } 1054 addDuration := func(name string, value time.Duration) { 1055 if value != 0 { 1056 add(name, value.String()) 1057 } 1058 } 1059 addBool := func(name string, value bool) { 1060 if !value { 1061 return 1062 } 1063 b.WriteString(name) 1064 b.WriteByte(' ') 1065 } 1066 1067 b.WriteString("zoekt.SearchOptions{ ") 1068 1069 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1070 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1071 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1072 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1073 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1074 addInt("NumContextLines", s.NumContextLines) 1075 1076 addDuration("MaxWallTime", s.MaxWallTime) 1077 addDuration("FlushWallTime", s.FlushWallTime) 1078 1079 addBool("EstimateDocCount", s.EstimateDocCount) 1080 addBool("Whole", s.Whole) 1081 addBool("ChunkMatches", s.ChunkMatches) 1082 addBool("UseBM25Scoring", s.UseBM25Scoring) 1083 addBool("Trace", s.Trace) 1084 addBool("DebugScore", s.DebugScore) 1085 1086 for k, v := range s.SpanContext { 1087 add("SpanContext."+k, strconv.Quote(v)) 1088 } 1089 1090 b.WriteByte('}') 1091 return b.String() 1092} 1093 1094// Sender is the interface that wraps the basic Send method. 1095type Sender interface { 1096 Send(*SearchResult) 1097} 1098 1099// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1100// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1101// that calls f. 1102type SenderFunc func(result *SearchResult) 1103 1104func (f SenderFunc) Send(result *SearchResult) { 1105 f(result) 1106} 1107 1108// Streamer adds the method StreamSearch to the Searcher interface. 1109type Streamer interface { 1110 Searcher 1111 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1112}