fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt // import "github.com/sourcegraph/zoekt" 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "math" 23 "reflect" 24 "strconv" 25 "strings" 26 "time" 27 28 "github.com/sourcegraph/zoekt/query" 29) 30 31const ( 32 mapHeaderBytes uint64 = 48 33 sliceHeaderBytes uint64 = 24 34 stringHeaderBytes uint64 = 16 35 pointerSize uint64 = 8 36 interfaceBytes uint64 = 16 37) 38 39// FileMatch contains all the matches within a file. 40type FileMatch struct { 41 FileName string 42 43 // Repository is the globally unique name of the repo of the 44 // match 45 Repository string 46 47 // SubRepositoryName is the globally unique name of the repo, 48 // if it came from a subrepository 49 SubRepositoryName string `json:",omitempty"` 50 51 // SubRepositoryPath holds the prefix where the subrepository 52 // was mounted. 53 SubRepositoryPath string `json:",omitempty"` 54 55 // Commit SHA1 (hex) of the (sub)repo holding the file. 56 Version string `json:",omitempty"` 57 58 // Detected language of the result. 59 Language string 60 61 // For debugging. Needs DebugScore set, but public so tests in 62 // other packages can print some diagnostics. 63 Debug string `json:",omitempty"` 64 65 Branches []string `json:",omitempty"` 66 67 // One of LineMatches or ChunkMatches will be returned depending on whether 68 // the SearchOptions.ChunkMatches is set. 69 LineMatches []LineMatch `json:",omitempty"` 70 ChunkMatches []ChunkMatch `json:",omitempty"` 71 72 // Only set if requested 73 Content []byte `json:",omitempty"` 74 75 // Checksum of the content. 76 Checksum []byte 77 78 // Ranking; the higher, the better. 79 Score float64 `json:",omitempty"` 80 81 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 82 // order results from different repositories relative to each other. 83 RepositoryPriority float64 `json:",omitempty"` 84 85 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 86 // Sourcegraph. 87 RepositoryID uint32 `json:",omitempty"` 88} 89 90func (m *FileMatch) sizeBytes() (sz uint64) { 91 // Score 92 sz += 8 93 94 for _, s := range []string{ 95 m.Debug, 96 m.FileName, 97 m.Repository, 98 m.Language, 99 m.SubRepositoryName, 100 m.SubRepositoryPath, 101 m.Version, 102 } { 103 sz += stringHeaderBytes + uint64(len(s)) 104 } 105 106 // Branches 107 sz += sliceHeaderBytes 108 for _, s := range m.Branches { 109 sz += stringHeaderBytes + uint64(len(s)) 110 } 111 112 // LineMatches 113 sz += sliceHeaderBytes 114 for _, lm := range m.LineMatches { 115 sz += lm.sizeBytes() 116 } 117 118 // ChunkMatches 119 sz += sliceHeaderBytes 120 for _, cm := range m.ChunkMatches { 121 sz += cm.sizeBytes() 122 } 123 124 // RepositoryID 125 sz += 4 126 127 // RepositoryPriority 128 sz += 8 129 130 // Content 131 sz += sliceHeaderBytes + uint64(len(m.Content)) 132 133 // Checksum 134 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 135 136 return 137} 138 139// addScore increments the score of the FileMatch by the computed score. If 140// debugScore is true, it also adds a debug string to the FileMatch. If raw is 141// -1, it is ignored. Otherwise, it is added to the debug string. 142func (m *FileMatch) addScore(what string, computed float64, raw float64, debugScore bool) { 143 if computed != 0 && debugScore { 144 var b strings.Builder 145 fmt.Fprintf(&b, "%s", what) 146 if raw != -1 { 147 fmt.Fprintf(&b, "(%s)", strconv.FormatFloat(raw, 'f', -1, 64)) 148 } 149 fmt.Fprintf(&b, ":%.2f, ", computed) 150 m.Debug += b.String() 151 } 152 m.Score += computed 153} 154 155// ChunkMatch is a set of non-overlapping matches within a contiguous range of 156// lines in the file. 157type ChunkMatch struct { 158 DebugScore string 159 160 // Content is a contiguous range of complete lines that fully contains Ranges. 161 // Lines will always include their terminating newline (if it exists). 162 Content []byte 163 164 // Ranges is a set of matching ranges within this chunk. Each range is relative 165 // to the beginning of the file (not the beginning of Content). 166 Ranges []Range 167 168 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 169 // its length will equal that of Ranges. Any of its elements may be nil. 170 SymbolInfo []*Symbol 171 172 // FileName indicates whether this match is a match on the file name, in 173 // which case Content will contain the file name. 174 FileName bool 175 176 // ContentStart is the location (inclusive) of the beginning of content 177 // relative to the beginning of the file. It will always be at the 178 // beginning of a line (Column will always be 1). 179 ContentStart Location 180 181 // Score is the overall relevance score of this chunk. 182 Score float64 183 184 // BestLineMatch is the line number of the highest-scoring line match in this chunk. 185 // The line number represents the index in the full file, and is 1-based. If FileName: true, 186 // this number will be 0. 187 BestLineMatch uint32 188} 189 190func (cm *ChunkMatch) sizeBytes() (sz uint64) { 191 // Content 192 sz += sliceHeaderBytes + uint64(len(cm.Content)) 193 194 // ContentStart 195 sz += cm.ContentStart.sizeBytes() 196 197 // FileName 198 sz += 1 199 200 // Ranges 201 sz += sliceHeaderBytes 202 if len(cm.Ranges) > 0 { 203 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 204 } 205 206 // SymbolInfo 207 sz += sliceHeaderBytes 208 for _, si := range cm.SymbolInfo { 209 sz += pointerSize 210 if si != nil { 211 sz += si.sizeBytes() 212 } 213 } 214 215 // Score 216 sz += 8 217 218 // DebugScore 219 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 220 221 return 222} 223 224type Range struct { 225 // The inclusive beginning of the range. 226 Start Location 227 // The exclusive end of the range. 228 End Location 229} 230 231func (r *Range) sizeBytes() uint64 { 232 return r.Start.sizeBytes() + r.End.sizeBytes() 233} 234 235type Location struct { 236 // 0-based byte offset from the beginning of the file 237 ByteOffset uint32 238 // 1-based line number from the beginning of the file 239 LineNumber uint32 240 // 1-based column number (in runes) from the beginning of line 241 Column uint32 242} 243 244func (l *Location) sizeBytes() uint64 { 245 return 3 * 4 246} 247 248// LineMatch holds the matches within a single line in a file. 249type LineMatch struct { 250 // The line in which a match was found. 251 Line []byte 252 // The byte offset of the first byte of the line. 253 LineStart int 254 // The byte offset of the first byte past the end of the line. 255 // This is usually the byte after the terminating newline, but can also be 256 // the end of the file if there is no terminating newline 257 LineEnd int 258 LineNumber int 259 260 // Before and After are only set when SearchOptions.NumContextLines is > 0 261 Before []byte 262 After []byte 263 264 // If set, this was a match on the filename. 265 FileName bool 266 267 // The higher the better. Only ranks the quality of the match 268 // within the file, does not take rank of file into account 269 Score float64 270 DebugScore string 271 272 LineFragments []LineFragmentMatch 273} 274 275func (lm *LineMatch) sizeBytes() (sz uint64) { 276 // Line 277 sz += sliceHeaderBytes + uint64(len(lm.Line)) 278 279 // LineStart, LineEnd, LineNumber 280 sz += 3 * 8 281 282 // Before 283 sz += sliceHeaderBytes + uint64(len(lm.Before)) 284 285 // After 286 sz += sliceHeaderBytes + uint64(len(lm.After)) 287 288 // FileName 289 sz += 1 290 291 // Score 292 sz += 8 293 294 // DebugScore 295 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 296 297 // LineFragments 298 sz += sliceHeaderBytes 299 for _, lf := range lm.LineFragments { 300 sz += lf.sizeBytes() 301 } 302 303 return 304} 305 306type Symbol struct { 307 Sym string 308 Kind string 309 Parent string 310 ParentKind string 311} 312 313func (s *Symbol) sizeBytes() uint64 { 314 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 315} 316 317// LineFragmentMatch a segment of matching text within a line. 318type LineFragmentMatch struct { 319 // Offset within the line, in bytes. 320 LineOffset int 321 322 // Offset from file start, in bytes. 323 Offset uint32 324 325 // Number bytes that match. 326 MatchLength int 327 328 SymbolInfo *Symbol 329} 330 331func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 332 // LineOffset 333 sz += 8 334 335 // Offset 336 sz += 4 337 338 // MatchLength 339 sz += 8 340 341 // SymbolInfo 342 sz += pointerSize 343 if lfm.SymbolInfo != nil { 344 sz += lfm.SymbolInfo.sizeBytes() 345 } 346 347 return 348} 349 350type FlushReason uint8 351 352const ( 353 FlushReasonTimerExpired FlushReason = 1 << iota 354 FlushReasonFinalFlush 355 FlushReasonMaxSize 356) 357 358var FlushReasonStrings = map[FlushReason]string{ 359 FlushReasonTimerExpired: "timer_expired", 360 FlushReasonFinalFlush: "final_flush", 361 FlushReasonMaxSize: "max_size_reached", 362} 363 364func (fr FlushReason) String() string { 365 if v, ok := FlushReasonStrings[fr]; ok { 366 return v 367 } 368 369 return "none" 370} 371 372// Stats contains interesting numbers on the search 373type Stats struct { 374 // Amount of I/O for reading contents. 375 ContentBytesLoaded int64 376 377 // Amount of I/O for reading from index. 378 IndexBytesLoaded int64 379 380 // Number of search shards that had a crash. 381 Crashes int 382 383 // Wall clock time for this search 384 Duration time.Duration 385 386 // Number of files containing a match. 387 FileCount int 388 389 // Number of files in shards that we considered. 390 ShardFilesConsidered int 391 392 // Files that we evaluated. Equivalent to files for which all 393 // atom matches (including negations) evaluated to true. 394 FilesConsidered int 395 396 // Files for which we loaded file content to verify substring matches 397 FilesLoaded int 398 399 // Candidate files whose contents weren't examined because we 400 // gathered enough matches. 401 FilesSkipped int 402 403 // Shards that we scanned to find matches. 404 ShardsScanned int 405 406 // Shards that we did not process because a query was canceled. 407 ShardsSkipped int 408 409 // Shards that we did not process because the query was rejected by the 410 // ngram filter indicating it had no matches. 411 ShardsSkippedFilter int 412 413 // Number of non-overlapping matches 414 MatchCount int 415 416 // Number of candidate matches as a result of searching ngrams. 417 NgramMatches int 418 419 // NgramLookups is the number of times we accessed an ngram in the index. 420 NgramLookups int 421 422 // Wall clock time for queued search. 423 Wait time.Duration 424 425 // Aggregate wall clock time spent constructing and pruning the match tree. 426 // This accounts for time such as lookups in the trigram index. 427 MatchTreeConstruction time.Duration 428 429 // Aggregate wall clock time spent searching the match tree. This accounts 430 // for the bulk of search work done looking for matches. 431 MatchTreeSearch time.Duration 432 433 // Number of times regexp was called on files that we evaluated. 434 RegexpsConsidered int 435 436 // FlushReason explains why results were flushed. 437 FlushReason FlushReason 438} 439 440func (s *Stats) sizeBytes() (sz uint64) { 441 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 442 sz += 1 // FlushReason 443 444 return 445} 446 447func (s *Stats) Add(o Stats) { 448 s.ContentBytesLoaded += o.ContentBytesLoaded 449 s.IndexBytesLoaded += o.IndexBytesLoaded 450 s.Crashes += o.Crashes 451 s.FileCount += o.FileCount 452 s.FilesConsidered += o.FilesConsidered 453 s.FilesLoaded += o.FilesLoaded 454 s.FilesSkipped += o.FilesSkipped 455 s.MatchCount += o.MatchCount 456 s.NgramMatches += o.NgramMatches 457 s.NgramLookups += o.NgramLookups 458 s.ShardFilesConsidered += o.ShardFilesConsidered 459 s.ShardsScanned += o.ShardsScanned 460 s.ShardsSkipped += o.ShardsSkipped 461 s.ShardsSkippedFilter += o.ShardsSkippedFilter 462 s.Wait += o.Wait 463 s.MatchTreeConstruction += o.MatchTreeConstruction 464 s.MatchTreeSearch += o.MatchTreeSearch 465 s.RegexpsConsidered += o.RegexpsConsidered 466 467 // We want the first non-zero FlushReason to be sticky. This is a useful 468 // property when aggregating stats from several Zoekts. 469 if s.FlushReason == 0 { 470 s.FlushReason = o.FlushReason 471 } 472} 473 474// Zero returns true if stats is empty. 475func (s *Stats) Zero() bool { 476 if s == nil { 477 return true 478 } 479 480 return !(s.ContentBytesLoaded > 0 || 481 s.IndexBytesLoaded > 0 || 482 s.Crashes > 0 || 483 s.FileCount > 0 || 484 s.FilesConsidered > 0 || 485 s.FilesLoaded > 0 || 486 s.FilesSkipped > 0 || 487 s.MatchCount > 0 || 488 s.NgramMatches > 0 || 489 s.NgramLookups > 0 || 490 s.ShardFilesConsidered > 0 || 491 s.ShardsScanned > 0 || 492 s.ShardsSkipped > 0 || 493 s.ShardsSkippedFilter > 0 || 494 s.Wait > 0 || 495 s.MatchTreeConstruction > 0 || 496 s.MatchTreeSearch > 0 || 497 s.RegexpsConsidered > 0) 498} 499 500// Progress contains information about the global progress of the running search query. 501// This is used by the frontend to reorder results and emit them when stable. 502// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 503type Progress struct { 504 // Priority of the shard that was searched. 505 Priority float64 506 507 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 508 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 509 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 510 // 511 // MaxPendingPriority decreases monotonically in each SearchResult. 512 MaxPendingPriority float64 513} 514 515func (p *Progress) sizeBytes() uint64 { 516 return 2 * 8 517} 518 519// SearchResult contains search matches and extra data 520type SearchResult struct { 521 Stats 522 523 // Do not encode this as we cannot encode -Inf in JSON 524 Progress `json:"-"` 525 526 Files []FileMatch 527 528 // RepoURLs holds a repo => template string map. 529 RepoURLs map[string]string 530 531 // FragmentNames holds a repo => template string map, for 532 // the line number fragment. 533 LineFragments map[string]string 534} 535 536// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 537// The estimate does not take alignment into account. The result is a lower 538// bound on the actual size in memory. 539func (sr *SearchResult) SizeBytes() (sz uint64) { 540 sz += sr.Stats.sizeBytes() 541 sz += sr.Progress.sizeBytes() 542 543 // Files 544 sz += sliceHeaderBytes 545 for _, f := range sr.Files { 546 sz += f.sizeBytes() 547 } 548 549 // RepoURLs 550 sz += mapHeaderBytes 551 for k, v := range sr.RepoURLs { 552 sz += stringHeaderBytes + uint64(len(k)) 553 sz += stringHeaderBytes + uint64(len(v)) 554 } 555 556 // LineFragments 557 sz += mapHeaderBytes 558 for k, v := range sr.LineFragments { 559 sz += stringHeaderBytes + uint64(len(k)) 560 sz += stringHeaderBytes + uint64(len(v)) 561 } 562 563 return 564} 565 566// RepositoryBranch describes an indexed branch, which is a name 567// combined with a version. 568type RepositoryBranch struct { 569 Name string 570 Version string 571} 572 573func (r RepositoryBranch) String() string { 574 return fmt.Sprintf("%s@%s", r.Name, r.Version) 575} 576 577// Repository holds repository metadata. 578type Repository struct { 579 // Sourcegraph's tenant ID 580 TenantID int 581 582 // Sourcegraph's repository ID 583 ID uint32 584 585 // The repository name 586 Name string 587 588 // The repository URL. 589 URL string 590 591 // The physical source where this repo came from, eg. full 592 // path to the zip filename or git repository directory. This 593 // will not be exposed in the UI, but can be used to detect 594 // orphaned index shards. 595 Source string 596 597 // The branches indexed in this repo. 598 Branches []RepositoryBranch 599 600 // Nil if this is not the super project. 601 SubRepoMap map[string]*Repository 602 603 // URL template to link to the commit of a branch 604 CommitURLTemplate string 605 606 // The repository URL for getting to a file. Has access to 607 // {{.Version}}, {{.Path}} 608 FileURLTemplate string 609 610 // The URL fragment to add to a file URL for line numbers. has 611 // access to {{.LineNumber}}. The fragment should include the 612 // separator, generally '#' or ';'. 613 LineFragmentTemplate string 614 615 // Perf optimization: priority is set when we load the shard. It corresponds to 616 // the value of "priority" stored in RawConfig. 617 priority float64 618 619 // All zoekt.* configuration settings. 620 RawConfig map[string]string 621 622 // Importance of the repository, bigger is more important 623 Rank uint16 624 625 // IndexOptions is a hash of the options used to create the index for the 626 // repo. 627 IndexOptions string 628 629 // HasSymbols is true if this repository has indexed ctags 630 // output. Sourcegraph specific: This field is more appropriate for 631 // IndexMetadata. However, we store it here since the Sourcegraph frontend 632 // can read this structure but not IndexMetadata. 633 HasSymbols bool 634 635 // Tombstone is true if we are not allowed to search this repo. 636 Tombstone bool 637 638 // LatestCommitDate is the date of the latest commit among all indexed Branches. 639 // The date might be time.Time's 0-value if the repository was last indexed 640 // before this field was added. 641 LatestCommitDate time.Time 642 643 // FileTombstones is a set of file paths that should be ignored across all branches 644 // in this shard. 645 FileTombstones map[string]struct{} `json:",omitempty"` 646} 647 648func (r *Repository) UnmarshalJSON(data []byte) error { 649 // We define a new type so that we can use json.Unmarshal 650 // without recursing into this same method. 651 type repository *Repository 652 repo := repository(r) 653 654 err := json.Unmarshal(data, repo) 655 if err != nil { 656 return err 657 } 658 659 if v, ok := repo.RawConfig["repoid"]; ok { 660 id, _ := strconv.ParseUint(v, 10, 32) 661 r.ID = uint32(id) 662 } 663 664 if v, ok := repo.RawConfig["tenantID"]; ok { 665 id, _ := strconv.ParseInt(v, 10, 64) 666 r.TenantID = int(id) 667 } 668 669 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it 670 // on read instead of during indexing allows us to avoid a complete reindex. 671 // 672 // Prefer "latestCommitDate" over "priority" for ranking. We keep priority for 673 // backwards compatibility. 674 if _, ok := repo.RawConfig["latestCommitDate"]; ok { 675 // We use the number of months since 1970 as a simple measure of repo freshness. 676 // It is monotonically increasing and stable across re-indexes and restarts. 677 r.Rank = monthsSince1970(repo.LatestCommitDate) 678 } else if v, ok := repo.RawConfig["priority"]; ok { 679 r.priority, err = strconv.ParseFloat(v, 64) 680 if err != nil { 681 r.priority = 0 682 } 683 684 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 685 // based on priority. Setting it on read instead of during indexing 686 // allows us to avoid a complete reindex. 687 if r.Rank == 0 && r.priority > 0 { 688 // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. 689 // This means popular repos (roughly ones with over 5,000 stars) see diminishing 690 // returns from more stars. 691 r.Rank = uint16(r.priority / (5000.0 + r.priority) * math.MaxUint16) 692 } 693 } 694 695 return nil 696} 697 698// monthsSince1970 returns the number of months since 1970. It returns values in 699// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the 700// lower bound for all dates before 1970. 701func monthsSince1970(t time.Time) uint16 { 702 base := time.Unix(0, 0) 703 if t.Before(base) { 704 return 0 705 } 706 months := int(t.Year()-1970)*12 + int(t.Month()-1) 707 return uint16(min(months, math.MaxUint16)) 708} 709 710// MergeMutable will merge x into r. mutated will be true if it made any 711// changes. err is non-nil if we needed to mutate an immutable field. 712// 713// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 714// computed while indexing so can't be synthesized from x. 715// 716// Note: We ignore RawConfig fields which are duplicated into Repository: 717// name and id. 718func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 719 if r.ID != x.ID { 720 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 721 return mutated, errors.New("ID is immutable") 722 } 723 if r.Name != x.Name { 724 // Name is encoded into the shard name on disk. We need to re-index if it 725 // changes. 726 return mutated, errors.New("Name is immutable") 727 } 728 if !reflect.DeepEqual(r.Branches, x.Branches) { 729 // Need a reindex if content changing. 730 return mutated, errors.New("Branches is immutable") 731 } 732 733 for k, v := range x.RawConfig { 734 // We ignore name and id since they are encoded into the repository. 735 if k == "name" || k == "id" { 736 continue 737 } 738 if r.RawConfig == nil { 739 mutated = true 740 r.RawConfig = make(map[string]string) 741 } 742 if r.RawConfig[k] != v { 743 mutated = true 744 r.RawConfig[k] = v 745 } 746 } 747 748 if r.URL != x.URL { 749 mutated = true 750 r.URL = x.URL 751 } 752 if r.CommitURLTemplate != x.CommitURLTemplate { 753 mutated = true 754 r.CommitURLTemplate = x.CommitURLTemplate 755 } 756 if r.FileURLTemplate != x.FileURLTemplate { 757 mutated = true 758 r.FileURLTemplate = x.FileURLTemplate 759 } 760 if r.LineFragmentTemplate != x.LineFragmentTemplate { 761 mutated = true 762 r.LineFragmentTemplate = x.LineFragmentTemplate 763 } 764 765 return mutated, nil 766} 767 768// IndexMetadata holds metadata stored in the index file. It contains 769// data generated by the core indexing library. 770type IndexMetadata struct { 771 IndexFormatVersion int 772 IndexFeatureVersion int 773 IndexMinReaderVersion int 774 IndexTime time.Time 775 PlainASCII bool 776 LanguageMap map[string]uint16 777 ZoektVersion string 778 ID string 779} 780 781// Statistics of a (collection of) repositories. 782type RepoStats struct { 783 // Repos is used for aggregrating the number of repositories. 784 // 785 // Note: This field is not populated on RepoListEntry.Stats (individual) but 786 // only for RepoList.Stats (aggregate). 787 Repos int 788 789 // Shards is the total number of search shards. 790 Shards int 791 792 // Documents holds the number of documents or files. 793 Documents int 794 795 // IndexBytes is the amount of RAM used for index overhead. 796 IndexBytes int64 797 798 // ContentBytes is the amount of RAM used for raw content. 799 ContentBytes int64 800 801 // Sourcegraph specific stats below. These are not as efficient to calculate 802 // as the above statistics. We experimentally measured about a 10% slower 803 // shard load time. However, we find these values very useful to track and 804 // computing them outside of load time introduces a lot of complexity. 805 806 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 807 // indexed documents. This is not exactly the same as line count, since it 808 // will not include lines not terminated by "\n" (eg a file with no "\n", or 809 // a final line without "\n"). Note: Zoekt deduplicates documents across 810 // branches, so if a path has the same contents on multiple branches, there 811 // is only one document for it. As such that document's newlines is only 812 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 813 // for counts which do not deduplicate. 814 NewLinesCount uint64 815 816 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 817 // branch. 818 DefaultBranchNewLinesCount uint64 819 820 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 821 // except the default branch. 822 OtherBranchesNewLinesCount uint64 823} 824 825func (s *RepoStats) Add(o *RepoStats) { 826 // can't update Repos, since one repo may have multiple 827 // shards. 828 s.Shards += o.Shards 829 s.IndexBytes += o.IndexBytes 830 s.Documents += o.Documents 831 s.ContentBytes += o.ContentBytes 832 833 // Sourcegraph specific 834 s.NewLinesCount += o.NewLinesCount 835 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 836 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 837} 838 839type RepoListEntry struct { 840 Repository Repository 841 IndexMetadata IndexMetadata 842 Stats RepoStats 843} 844 845// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 846// performance profiling of sourcegraph.com revealed that querying this 847// information from Zoekt was causing lots of CPU and memory usage. Note: we 848// can revisit this, how we store and query this information has changed a lot 849// since this was introduced. 850type MinimalRepoListEntry struct { 851 // HasSymbols is exported since Sourcegraph uses this information at search 852 // planning time to decide between Zoekt and an unindexed symbol search. 853 // 854 // Note: it pretty much is always true in practice. 855 HasSymbols bool 856 857 // Branches is used by Sourcegraphs query planner to decided if it can use 858 // zoekt or go via an unindexed code path. 859 Branches []RepositoryBranch 860 861 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 862 // since the epoch). This is to make it clear we are not transporting the 863 // full fidelty timestamp (ie with milliseconds and location). Additionally 864 // it saves 16 bytes in this struct. 865 // 866 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 867 // how many repositories need updating after a ranking change/etc. 868 // 869 // TODO(keegancsmith) audit updates to IndexTime and document how and when 870 // it changes. Concerned about things like metadata updates or compound 871 // shards leading to untrustworthy data here. 872 IndexTimeUnix int64 873} 874 875type ReposMap map[uint32]MinimalRepoListEntry 876 877// MarshalBinary implements a specialized encoder for ReposMap. 878func (q *ReposMap) MarshalBinary() ([]byte, error) { 879 return reposMapEncode(*q) 880} 881 882// UnmarshalBinary implements a specialized decoder for ReposMap. 883func (q *ReposMap) UnmarshalBinary(b []byte) error { 884 var err error 885 (*q), err = reposMapDecode(b) 886 return err 887} 888 889// RepoList holds a set of Repository metadata. 890type RepoList struct { 891 // Returned when ListOptions.Field is RepoListFieldRepos. 892 Repos []*RepoListEntry 893 894 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 895 ReposMap ReposMap 896 897 Crashes int 898 899 // Stats response to a List request. 900 // This is the aggregate RepoStats of all repos matching the input query. 901 Stats RepoStats 902} 903 904type Searcher interface { 905 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 906 907 // List lists repositories. The query `q` can only contain 908 // query.Repo atoms. 909 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 910 Close() 911 912 // Describe the searcher for debug messages. 913 String() string 914} 915 916type RepoListField int 917 918const ( 919 RepoListFieldRepos RepoListField = 0 920 RepoListFieldReposMap = 2 921) 922 923type ListOptions struct { 924 // Field decides which field to populate in RepoList response. 925 Field RepoListField 926} 927 928func (o *ListOptions) GetField() (RepoListField, error) { 929 if o == nil { 930 return RepoListFieldRepos, nil 931 } 932 switch o.Field { 933 case RepoListFieldRepos, RepoListFieldReposMap: 934 return o.Field, nil 935 case 1: 936 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 937 default: 938 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 939 } 940} 941 942func (o *ListOptions) String() string { 943 return fmt.Sprintf("%#v", o) 944} 945 946type SearchOptions struct { 947 // Return an upper-bound estimate of eligible documents in 948 // stats.ShardFilesConsidered. 949 EstimateDocCount bool 950 951 // Return the whole file. 952 Whole bool 953 954 // Maximum number of matches: skip all processing an index 955 // shard after we found this many non-overlapping matches. 956 ShardMaxMatchCount int 957 958 // Maximum number of matches: stop looking for more matches 959 // once we have this many matches across shards. 960 TotalMaxMatchCount int 961 962 // Maximum number of matches: skip processing documents for a repository in 963 // a shard once we have found ShardRepoMaxMatchCount. 964 // 965 // A compound shard may contain multiple repositories. This will most often 966 // be set to 1 to find all repositories containing a result. 967 ShardRepoMaxMatchCount int 968 969 // Abort the search after this much time has passed. 970 MaxWallTime time.Duration 971 972 // FlushWallTime if non-zero will stop streaming behaviour at first and 973 // instead will collate and sort results. At FlushWallTime the results will 974 // be sent and then the behaviour will revert to the normal streaming. 975 FlushWallTime time.Duration 976 977 // Truncates the number of documents (i.e. files) after collating and 978 // sorting the results. 979 MaxDocDisplayCount int 980 981 // Truncates the number of matchs after collating and sorting the results. 982 MaxMatchDisplayCount int 983 984 // If set to a number greater than zero then up to this many number 985 // of context lines will be added before and after each matched line. 986 // Note that the included context lines might contain matches and 987 // it's up to the consumer of the result to remove those lines. 988 NumContextLines int 989 990 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 991 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 992 ChunkMatches bool 993 994 // EXPERIMENTAL. If true, use text-search style scoring instead of the default 995 // scoring formula. The scoring algorithm treats each match in a file as a term 996 // and computes an approximation to BM25. When enabled, BM25 scoring is used for 997 // the overall FileMatch score, as well as individual LineMatch and ChunkMatch scores. 998 // 999 // The calculation of IDF assumes that Zoekt visits all documents containing any 1000 // of the query terms during evaluation. This is true, for example, if all query 1001 // terms are ORed together. 1002 // 1003 // When enabled, all other scoring signals are ignored, including document ranks. 1004 UseBM25Scoring bool 1005 1006 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 1007 // a command-line flag 1008 Trace bool 1009 1010 // If set, the search results will contain debug information for scoring. 1011 DebugScore bool 1012 1013 // SpanContext is the opentracing span context, if it exists, from the zoekt client 1014 SpanContext map[string]string 1015} 1016 1017// String returns a succinct representation of the options. This is meant for 1018// human consumption in logs and traces. 1019// 1020// Note: some tracing systems have limits on length of values, so we take care 1021// to try and make this small, and include the important information near the 1022// front incase of truncation. 1023func (s *SearchOptions) String() string { 1024 var b strings.Builder 1025 1026 add := func(name, value string) { 1027 b.WriteString(name) 1028 b.WriteByte('=') 1029 b.WriteString(value) 1030 b.WriteByte(' ') 1031 } 1032 addInt := func(name string, value int) { 1033 if value != 0 { 1034 add(name, strconv.Itoa(value)) 1035 } 1036 } 1037 addDuration := func(name string, value time.Duration) { 1038 if value != 0 { 1039 add(name, value.String()) 1040 } 1041 } 1042 addBool := func(name string, value bool) { 1043 if !value { 1044 return 1045 } 1046 b.WriteString(name) 1047 b.WriteByte(' ') 1048 } 1049 1050 b.WriteString("zoekt.SearchOptions{ ") 1051 1052 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1053 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1054 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1055 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1056 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1057 addInt("NumContextLines", s.NumContextLines) 1058 1059 addDuration("MaxWallTime", s.MaxWallTime) 1060 addDuration("FlushWallTime", s.FlushWallTime) 1061 1062 addBool("EstimateDocCount", s.EstimateDocCount) 1063 addBool("Whole", s.Whole) 1064 addBool("ChunkMatches", s.ChunkMatches) 1065 addBool("UseBM25Scoring", s.UseBM25Scoring) 1066 addBool("Trace", s.Trace) 1067 addBool("DebugScore", s.DebugScore) 1068 1069 for k, v := range s.SpanContext { 1070 add("SpanContext."+k, strconv.Quote(v)) 1071 } 1072 1073 b.WriteByte('}') 1074 return b.String() 1075} 1076 1077// Sender is the interface that wraps the basic Send method. 1078type Sender interface { 1079 Send(*SearchResult) 1080} 1081 1082// SenderFunc is an adapter to allow the use of ordinary functions as Sender. 1083// If f is a function with the appropriate signature, SenderFunc(f) is a Sender 1084// that calls f. 1085type SenderFunc func(result *SearchResult) 1086 1087func (f SenderFunc) Send(result *SearchResult) { 1088 f(result) 1089} 1090 1091// Streamer adds the method StreamSearch to the Searcher interface. 1092type Streamer interface { 1093 Searcher 1094 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1095}