fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package zoekt // import "github.com/sourcegraph/zoekt" 16 17import ( 18 "context" 19 "encoding/json" 20 "errors" 21 "fmt" 22 "reflect" 23 "strconv" 24 "strings" 25 "time" 26 27 "github.com/sourcegraph/zoekt/query" 28) 29 30const ( 31 mapHeaderBytes uint64 = 48 32 sliceHeaderBytes uint64 = 24 33 stringHeaderBytes uint64 = 16 34 pointerSize uint64 = 8 35 interfaceBytes uint64 = 16 36) 37 38// FileMatch contains all the matches within a file. 39type FileMatch struct { 40 FileName string 41 42 // Repository is the globally unique name of the repo of the 43 // match 44 Repository string 45 46 // SubRepositoryName is the globally unique name of the repo, 47 // if it came from a subrepository 48 SubRepositoryName string `json:",omitempty"` 49 50 // SubRepositoryPath holds the prefix where the subrepository 51 // was mounted. 52 SubRepositoryPath string `json:",omitempty"` 53 54 // Commit SHA1 (hex) of the (sub)repo holding the file. 55 Version string `json:",omitempty"` 56 57 // Detected language of the result. 58 Language string 59 60 // For debugging. Needs DebugScore set, but public so tests in 61 // other packages can print some diagnostics. 62 Debug string `json:",omitempty"` 63 64 Branches []string `json:",omitempty"` 65 66 // One of LineMatches or ChunkMatches will be returned depending on whether 67 // the SearchOptions.ChunkMatches is set. 68 LineMatches []LineMatch `json:",omitempty"` 69 ChunkMatches []ChunkMatch `json:",omitempty"` 70 71 // Only set if requested 72 Content []byte `json:",omitempty"` 73 74 // Checksum of the content. 75 Checksum []byte 76 77 // Ranking; the higher, the better. 78 Score float64 `json:",omitempty"` 79 80 // RepositoryPriority is a Sourcegraph extension. It is used by Sourcegraph to 81 // order results from different repositories relative to each other. 82 RepositoryPriority float64 `json:",omitempty"` 83 84 // RepositoryID is a Sourcegraph extension. This is the ID of Repository in 85 // Sourcegraph. 86 RepositoryID uint32 `json:",omitempty"` 87} 88 89func (m *FileMatch) sizeBytes() (sz uint64) { 90 // Score 91 sz += 8 92 93 for _, s := range []string{ 94 m.Debug, 95 m.FileName, 96 m.Repository, 97 m.Language, 98 m.SubRepositoryName, 99 m.SubRepositoryPath, 100 m.Version, 101 } { 102 sz += stringHeaderBytes + uint64(len(s)) 103 } 104 105 // Branches 106 sz += sliceHeaderBytes 107 for _, s := range m.Branches { 108 sz += stringHeaderBytes + uint64(len(s)) 109 } 110 111 // LineMatches 112 sz += sliceHeaderBytes 113 for _, lm := range m.LineMatches { 114 sz += lm.sizeBytes() 115 } 116 117 // ChunkMatches 118 sz += sliceHeaderBytes 119 for _, cm := range m.ChunkMatches { 120 sz += cm.sizeBytes() 121 } 122 123 // RepositoryID 124 sz += 4 125 126 // RepositoryPriority 127 sz += 8 128 129 // Content 130 sz += sliceHeaderBytes + uint64(len(m.Content)) 131 132 // Checksum 133 sz += sliceHeaderBytes + uint64(len(m.Checksum)) 134 135 return 136} 137 138// ChunkMatch is a set of non-overlapping matches within a contiguous range of 139// lines in the file. 140type ChunkMatch struct { 141 DebugScore string 142 143 // Content is a contiguous range of complete lines that fully contains Ranges. 144 // Lines will always include their terminating newline (if it exists). 145 Content []byte 146 147 // Ranges is a set of matching ranges within this chunk. Each range is relative 148 // to the beginning of the file (not the beginning of Content). 149 Ranges []Range 150 151 // SymbolInfo is the symbol information associated with Ranges. If it is non-nil, 152 // its length will equal that of Ranges. Any of its elements may be nil. 153 SymbolInfo []*Symbol 154 155 // FileName indicates whether this match is a match on the file name, in 156 // which case Content will contain the file name. 157 FileName bool 158 159 // ContentStart is the location (inclusive) of the beginning of content 160 // relative to the beginning of the file. It will always be at the 161 // beginning of a line (Column will always be 1). 162 ContentStart Location 163 164 Score float64 165} 166 167func (cm *ChunkMatch) sizeBytes() (sz uint64) { 168 // Content 169 sz += sliceHeaderBytes + uint64(len(cm.Content)) 170 171 // ContentStart 172 sz += cm.ContentStart.sizeBytes() 173 174 // FileName 175 sz += 1 176 177 // Ranges 178 sz += sliceHeaderBytes 179 if len(cm.Ranges) > 0 { 180 sz += uint64(len(cm.Ranges)) * cm.Ranges[0].sizeBytes() 181 } 182 183 // SymbolInfo 184 sz += sliceHeaderBytes 185 for _, si := range cm.SymbolInfo { 186 sz += pointerSize 187 if si != nil { 188 sz += si.sizeBytes() 189 } 190 } 191 192 // Score 193 sz += 8 194 195 // DebugScore 196 sz += stringHeaderBytes + uint64(len(cm.DebugScore)) 197 198 return 199} 200 201type Range struct { 202 // The inclusive beginning of the range. 203 Start Location 204 // The exclusive end of the range. 205 End Location 206} 207 208func (r *Range) sizeBytes() uint64 { 209 return r.Start.sizeBytes() + r.End.sizeBytes() 210} 211 212type Location struct { 213 // 0-based byte offset from the beginning of the file 214 ByteOffset uint32 215 // 1-based line number from the beginning of the file 216 LineNumber uint32 217 // 1-based column number (in runes) from the beginning of line 218 Column uint32 219} 220 221func (l *Location) sizeBytes() uint64 { 222 return 3 * 4 223} 224 225// LineMatch holds the matches within a single line in a file. 226type LineMatch struct { 227 // The line in which a match was found. 228 Line []byte 229 // The byte offset of the first byte of the line. 230 LineStart int 231 // The byte offset of the first byte past the end of the line. 232 // This is usually the byte after the terminating newline, but can also be 233 // the end of the file if there is no terminating newline 234 LineEnd int 235 LineNumber int 236 237 // Before and After are only set when SearchOptions.NumContextLines is > 0 238 Before []byte 239 After []byte 240 241 // If set, this was a match on the filename. 242 FileName bool 243 244 // The higher the better. Only ranks the quality of the match 245 // within the file, does not take rank of file into account 246 Score float64 247 DebugScore string 248 249 LineFragments []LineFragmentMatch 250} 251 252func (lm *LineMatch) sizeBytes() (sz uint64) { 253 // Line 254 sz += sliceHeaderBytes + uint64(len(lm.Line)) 255 256 // LineStart, LineEnd, LineNumber 257 sz += 3 * 8 258 259 // Before 260 sz += sliceHeaderBytes + uint64(len(lm.Before)) 261 262 // After 263 sz += sliceHeaderBytes + uint64(len(lm.After)) 264 265 // FileName 266 sz += 1 267 268 // Score 269 sz += 8 270 271 // DebugScore 272 sz += stringHeaderBytes + uint64(len(lm.DebugScore)) 273 274 // LineFragments 275 sz += sliceHeaderBytes 276 for _, lf := range lm.LineFragments { 277 sz += lf.sizeBytes() 278 } 279 280 return 281} 282 283type Symbol struct { 284 Sym string 285 Kind string 286 Parent string 287 ParentKind string 288} 289 290func (s *Symbol) sizeBytes() uint64 { 291 return 4*stringHeaderBytes + uint64(len(s.Sym)+len(s.Kind)+len(s.Parent)+len(s.ParentKind)) 292} 293 294// LineFragmentMatch a segment of matching text within a line. 295type LineFragmentMatch struct { 296 // Offset within the line, in bytes. 297 LineOffset int 298 299 // Offset from file start, in bytes. 300 Offset uint32 301 302 // Number bytes that match. 303 MatchLength int 304 305 SymbolInfo *Symbol 306} 307 308func (lfm *LineFragmentMatch) sizeBytes() (sz uint64) { 309 // LineOffset 310 sz += 8 311 312 // Offset 313 sz += 4 314 315 // MatchLength 316 sz += 8 317 318 // SymbolInfo 319 sz += pointerSize 320 if lfm.SymbolInfo != nil { 321 sz += lfm.SymbolInfo.sizeBytes() 322 } 323 324 return 325} 326 327type FlushReason uint8 328 329const ( 330 FlushReasonTimerExpired FlushReason = 1 << iota 331 FlushReasonFinalFlush 332 FlushReasonMaxSize 333) 334 335var FlushReasonStrings = map[FlushReason]string{ 336 FlushReasonTimerExpired: "timer_expired", 337 FlushReasonFinalFlush: "final_flush", 338 FlushReasonMaxSize: "max_size_reached", 339} 340 341func (fr FlushReason) String() string { 342 if v, ok := FlushReasonStrings[fr]; ok { 343 return v 344 } 345 346 return "none" 347} 348 349// Stats contains interesting numbers on the search 350type Stats struct { 351 // Amount of I/O for reading contents. 352 ContentBytesLoaded int64 353 354 // Amount of I/O for reading from index. 355 IndexBytesLoaded int64 356 357 // Number of search shards that had a crash. 358 Crashes int 359 360 // Wall clock time for this search 361 Duration time.Duration 362 363 // Number of files containing a match. 364 FileCount int 365 366 // Number of files in shards that we considered. 367 ShardFilesConsidered int 368 369 // Files that we evaluated. Equivalent to files for which all 370 // atom matches (including negations) evaluated to true. 371 FilesConsidered int 372 373 // Files for which we loaded file content to verify substring matches 374 FilesLoaded int 375 376 // Candidate files whose contents weren't examined because we 377 // gathered enough matches. 378 FilesSkipped int 379 380 // Shards that we scanned to find matches. 381 ShardsScanned int 382 383 // Shards that we did not process because a query was canceled. 384 ShardsSkipped int 385 386 // Shards that we did not process because the query was rejected by the 387 // ngram filter indicating it had no matches. 388 ShardsSkippedFilter int 389 390 // Number of non-overlapping matches 391 MatchCount int 392 393 // Number of candidate matches as a result of searching ngrams. 394 NgramMatches int 395 396 // NgramLookups is the number of times we accessed an ngram in the index. 397 NgramLookups int 398 399 // Wall clock time for queued search. 400 Wait time.Duration 401 402 // Aggregate wall clock time spent constructing and pruning the match tree. 403 // This accounts for time such as lookups in the trigram index. 404 MatchTreeConstruction time.Duration 405 406 // Aggregate wall clock time spent searching the match tree. This accounts 407 // for the bulk of search work done looking for matches. 408 MatchTreeSearch time.Duration 409 410 // Number of times regexp was called on files that we evaluated. 411 RegexpsConsidered int 412 413 // FlushReason explains why results were flushed. 414 FlushReason FlushReason 415} 416 417func (s *Stats) sizeBytes() (sz uint64) { 418 sz = 16 * 8 // This assumes we are running on a 64-bit architecture 419 sz += 1 // FlushReason 420 421 return 422} 423 424func (s *Stats) Add(o Stats) { 425 s.ContentBytesLoaded += o.ContentBytesLoaded 426 s.IndexBytesLoaded += o.IndexBytesLoaded 427 s.Crashes += o.Crashes 428 s.FileCount += o.FileCount 429 s.FilesConsidered += o.FilesConsidered 430 s.FilesLoaded += o.FilesLoaded 431 s.FilesSkipped += o.FilesSkipped 432 s.MatchCount += o.MatchCount 433 s.NgramMatches += o.NgramMatches 434 s.NgramLookups += o.NgramLookups 435 s.ShardFilesConsidered += o.ShardFilesConsidered 436 s.ShardsScanned += o.ShardsScanned 437 s.ShardsSkipped += o.ShardsSkipped 438 s.ShardsSkippedFilter += o.ShardsSkippedFilter 439 s.Wait += o.Wait 440 s.MatchTreeConstruction += o.MatchTreeConstruction 441 s.MatchTreeSearch += o.MatchTreeSearch 442 s.RegexpsConsidered += o.RegexpsConsidered 443 444 // We want the first non-zero FlushReason to be sticky. This is a useful 445 // property when aggregating stats from several Zoekts. 446 if s.FlushReason == 0 { 447 s.FlushReason = o.FlushReason 448 } 449} 450 451// Zero returns true if stats is empty. 452func (s *Stats) Zero() bool { 453 if s == nil { 454 return true 455 } 456 457 return !(s.ContentBytesLoaded > 0 || 458 s.IndexBytesLoaded > 0 || 459 s.Crashes > 0 || 460 s.FileCount > 0 || 461 s.FilesConsidered > 0 || 462 s.FilesLoaded > 0 || 463 s.FilesSkipped > 0 || 464 s.MatchCount > 0 || 465 s.NgramMatches > 0 || 466 s.NgramLookups > 0 || 467 s.ShardFilesConsidered > 0 || 468 s.ShardsScanned > 0 || 469 s.ShardsSkipped > 0 || 470 s.ShardsSkippedFilter > 0 || 471 s.Wait > 0 || 472 s.MatchTreeConstruction > 0 || 473 s.MatchTreeSearch > 0 || 474 s.RegexpsConsidered > 0) 475} 476 477// Progress contains information about the global progress of the running search query. 478// This is used by the frontend to reorder results and emit them when stable. 479// Sourcegraph specific: this is used when querying multiple zoekt-webserver instances. 480type Progress struct { 481 // Priority of the shard that was searched. 482 Priority float64 483 484 // MaxPendingPriority is the maximum priority of pending result that is being searched in parallel. 485 // This is used to reorder results when the result set is known to be stable-- that is, when a result's 486 // Priority is greater than the max(MaxPendingPriority) from the latest results of each backend, it can be returned to the user. 487 // 488 // MaxPendingPriority decreases monotonically in each SearchResult. 489 MaxPendingPriority float64 490} 491 492func (p *Progress) sizeBytes() uint64 { 493 return 2 * 8 494} 495 496// SearchResult contains search matches and extra data 497type SearchResult struct { 498 Stats 499 500 // Do not encode this as we cannot encode -Inf in JSON 501 Progress `json:"-"` 502 503 Files []FileMatch 504 505 // RepoURLs holds a repo => template string map. 506 RepoURLs map[string]string 507 508 // FragmentNames holds a repo => template string map, for 509 // the line number fragment. 510 LineFragments map[string]string 511} 512 513// SizeBytes is a best-effort estimate of the size of SearchResult in memory. 514// The estimate does not take alignment into account. The result is a lower 515// bound on the actual size in memory. 516func (sr *SearchResult) SizeBytes() (sz uint64) { 517 sz += sr.Stats.sizeBytes() 518 sz += sr.Progress.sizeBytes() 519 520 // Files 521 sz += sliceHeaderBytes 522 for _, f := range sr.Files { 523 sz += f.sizeBytes() 524 } 525 526 // RepoURLs 527 sz += mapHeaderBytes 528 for k, v := range sr.RepoURLs { 529 sz += stringHeaderBytes + uint64(len(k)) 530 sz += stringHeaderBytes + uint64(len(v)) 531 } 532 533 // LineFragments 534 sz += mapHeaderBytes 535 for k, v := range sr.LineFragments { 536 sz += stringHeaderBytes + uint64(len(k)) 537 sz += stringHeaderBytes + uint64(len(v)) 538 } 539 540 return 541} 542 543// RepositoryBranch describes an indexed branch, which is a name 544// combined with a version. 545type RepositoryBranch struct { 546 Name string 547 Version string 548} 549 550func (r RepositoryBranch) String() string { 551 return fmt.Sprintf("%s@%s", r.Name, r.Version) 552} 553 554// Repository holds repository metadata. 555type Repository struct { 556 // Sourcegraph's repository ID 557 ID uint32 558 559 // The repository name 560 Name string 561 562 // The repository URL. 563 URL string 564 565 // The physical source where this repo came from, eg. full 566 // path to the zip filename or git repository directory. This 567 // will not be exposed in the UI, but can be used to detect 568 // orphaned index shards. 569 Source string 570 571 // The branches indexed in this repo. 572 Branches []RepositoryBranch 573 574 // Nil if this is not the super project. 575 SubRepoMap map[string]*Repository 576 577 // URL template to link to the commit of a branch 578 CommitURLTemplate string 579 580 // The repository URL for getting to a file. Has access to 581 // {{.Version}}, {{.Path}} 582 FileURLTemplate string 583 584 // The URL fragment to add to a file URL for line numbers. has 585 // access to {{.LineNumber}}. The fragment should include the 586 // separator, generally '#' or ';'. 587 LineFragmentTemplate string 588 589 // Perf optimization: priority is set when we load the shard. It corresponds to 590 // the value of "priority" stored in RawConfig. 591 priority float64 592 593 // All zoekt.* configuration settings. 594 RawConfig map[string]string 595 596 // Importance of the repository, bigger is more important 597 Rank uint16 598 599 // IndexOptions is a hash of the options used to create the index for the 600 // repo. 601 IndexOptions string 602 603 // HasSymbols is true if this repository has indexed ctags 604 // output. Sourcegraph specific: This field is more appropriate for 605 // IndexMetadata. However, we store it here since the Sourcegraph frontend 606 // can read this structure but not IndexMetadata. 607 HasSymbols bool 608 609 // Tombstone is true if we are not allowed to search this repo. 610 Tombstone bool 611 612 // LatestCommitDate is the date of the latest commit among all indexed Branches. 613 // The date might be time.Time's 0-value if the repository was last indexed 614 // before this field was added. 615 LatestCommitDate time.Time 616 617 // FileTombstones is a set of file paths that should be ignored across all branches 618 // in this shard. 619 FileTombstones map[string]struct{} `json:",omitempty"` 620} 621 622func (r *Repository) UnmarshalJSON(data []byte) error { 623 // We define a new type so that we can use json.Unmarshal 624 // without recursing into this same method. 625 type repository *Repository 626 repo := repository(r) 627 628 err := json.Unmarshal(data, repo) 629 if err != nil { 630 return err 631 } 632 633 if v, ok := repo.RawConfig["repoid"]; ok { 634 id, _ := strconv.ParseUint(v, 10, 32) 635 r.ID = uint32(id) 636 } 637 638 if v, ok := repo.RawConfig["priority"]; ok { 639 r.priority, err = strconv.ParseFloat(v, 64) 640 if err != nil { 641 r.priority = 0 642 } 643 644 // Sourcegraph indexserver doesn't set repo.Rank, so we set it here 645 // based on priority. Setting it on read instead of during indexing 646 // allows us to avoid a complete reindex. 647 if r.Rank == 0 && r.priority > 0 { 648 // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular 649 // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars. 650 r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) 651 } 652 } 653 return nil 654} 655 656// MergeMutable will merge x into r. mutated will be true if it made any 657// changes. err is non-nil if we needed to mutate an immutable field. 658// 659// Note: SubRepoMap, IndexOptions and HasSymbol fields are ignored. They are 660// computed while indexing so can't be synthesized from x. 661// 662// Note: We ignore RawConfig fields which are duplicated into Repository: 663// name and id. 664func (r *Repository) MergeMutable(x *Repository) (mutated bool, err error) { 665 if r.ID != x.ID { 666 // Sourcegraph: strange behaviour may occur if ID changes but names don't. 667 return mutated, errors.New("ID is immutable") 668 } 669 if r.Name != x.Name { 670 // Name is encoded into the shard name on disk. We need to re-index if it 671 // changes. 672 return mutated, errors.New("Name is immutable") 673 } 674 if !reflect.DeepEqual(r.Branches, x.Branches) { 675 // Need a reindex if content changing. 676 return mutated, errors.New("Branches is immutable") 677 } 678 679 for k, v := range x.RawConfig { 680 // We ignore name and id since they are encoded into the repository. 681 if k == "name" || k == "id" { 682 continue 683 } 684 if r.RawConfig == nil { 685 mutated = true 686 r.RawConfig = make(map[string]string) 687 } 688 if r.RawConfig[k] != v { 689 mutated = true 690 r.RawConfig[k] = v 691 } 692 } 693 694 if r.URL != x.URL { 695 mutated = true 696 r.URL = x.URL 697 } 698 if r.CommitURLTemplate != x.CommitURLTemplate { 699 mutated = true 700 r.CommitURLTemplate = x.CommitURLTemplate 701 } 702 if r.FileURLTemplate != x.FileURLTemplate { 703 mutated = true 704 r.FileURLTemplate = x.FileURLTemplate 705 } 706 if r.LineFragmentTemplate != x.LineFragmentTemplate { 707 mutated = true 708 r.LineFragmentTemplate = x.LineFragmentTemplate 709 } 710 711 return mutated, nil 712} 713 714// IndexMetadata holds metadata stored in the index file. It contains 715// data generated by the core indexing library. 716type IndexMetadata struct { 717 IndexFormatVersion int 718 IndexFeatureVersion int 719 IndexMinReaderVersion int 720 IndexTime time.Time 721 PlainASCII bool 722 LanguageMap map[string]uint16 723 ZoektVersion string 724 ID string 725} 726 727// Statistics of a (collection of) repositories. 728type RepoStats struct { 729 // Repos is used for aggregrating the number of repositories. 730 // 731 // Note: This field is not populated on RepoListEntry.Stats (individual) but 732 // only for RepoList.Stats (aggregate). 733 Repos int 734 735 // Shards is the total number of search shards. 736 Shards int 737 738 // Documents holds the number of documents or files. 739 Documents int 740 741 // IndexBytes is the amount of RAM used for index overhead. 742 IndexBytes int64 743 744 // ContentBytes is the amount of RAM used for raw content. 745 ContentBytes int64 746 747 // Sourcegraph specific stats below. These are not as efficient to calculate 748 // as the above statistics. We experimentally measured about a 10% slower 749 // shard load time. However, we find these values very useful to track and 750 // computing them outside of load time introduces a lot of complexity. 751 752 // NewLinesCount is the number of newlines "\n" that appear in the zoekt 753 // indexed documents. This is not exactly the same as line count, since it 754 // will not include lines not terminated by "\n" (eg a file with no "\n", or 755 // a final line without "\n"). Note: Zoekt deduplicates documents across 756 // branches, so if a path has the same contents on multiple branches, there 757 // is only one document for it. As such that document's newlines is only 758 // counted once. See DefaultBranchNewLinesCount and AllBranchesNewLinesCount 759 // for counts which do not deduplicate. 760 NewLinesCount uint64 761 762 // DefaultBranchNewLinesCount is the number of newlines "\n" in the default 763 // branch. 764 DefaultBranchNewLinesCount uint64 765 766 // OtherBranchesNewLinesCount is the number of newlines "\n" in all branches 767 // except the default branch. 768 OtherBranchesNewLinesCount uint64 769} 770 771func (s *RepoStats) Add(o *RepoStats) { 772 // can't update Repos, since one repo may have multiple 773 // shards. 774 s.Shards += o.Shards 775 s.IndexBytes += o.IndexBytes 776 s.Documents += o.Documents 777 s.ContentBytes += o.ContentBytes 778 779 // Sourcegraph specific 780 s.NewLinesCount += o.NewLinesCount 781 s.DefaultBranchNewLinesCount += o.DefaultBranchNewLinesCount 782 s.OtherBranchesNewLinesCount += o.OtherBranchesNewLinesCount 783} 784 785type RepoListEntry struct { 786 Repository Repository 787 IndexMetadata IndexMetadata 788 Stats RepoStats 789} 790 791// MinimalRepoListEntry is a subset of RepoListEntry. It was added after 792// performance profiling of sourcegraph.com revealed that querying this 793// information from Zoekt was causing lots of CPU and memory usage. Note: we 794// can revisit this, how we store and query this information has changed a lot 795// since this was introduced. 796type MinimalRepoListEntry struct { 797 // HasSymbols is exported since Sourcegraph uses this information at search 798 // planning time to decide between Zoekt and an unindexed symbol search. 799 // 800 // Note: it pretty much is always true in practice. 801 HasSymbols bool 802 803 // Branches is used by Sourcegraphs query planner to decided if it can use 804 // zoekt or go via an unindexed code path. 805 Branches []RepositoryBranch 806 807 // IndexTimeUnix is the IndexTime converted to unix time (number of seconds 808 // since the epoch). This is to make it clear we are not transporting the 809 // full fidelty timestamp (ie with milliseconds and location). Additionally 810 // it saves 16 bytes in this struct. 811 // 812 // IndexTime is used as a heuristic in Sourcegraph to decide in aggregate 813 // how many repositories need updating after a ranking change/etc. 814 // 815 // TODO(keegancsmith) audit updates to IndexTime and document how and when 816 // it changes. Concerned about things like metadata updates or compound 817 // shards leading to untrustworthy data here. 818 IndexTimeUnix int64 819} 820 821type ReposMap map[uint32]MinimalRepoListEntry 822 823// MarshalBinary implements a specialized encoder for ReposMap. 824func (q *ReposMap) MarshalBinary() ([]byte, error) { 825 return reposMapEncode(*q) 826} 827 828// UnmarshalBinary implements a specialized decoder for ReposMap. 829func (q *ReposMap) UnmarshalBinary(b []byte) error { 830 var err error 831 (*q), err = reposMapDecode(b) 832 return err 833} 834 835// RepoList holds a set of Repository metadata. 836type RepoList struct { 837 // Returned when ListOptions.Field is RepoListFieldRepos. 838 Repos []*RepoListEntry 839 840 // ReposMap is set when ListOptions.Field is RepoListFieldReposMap. 841 ReposMap ReposMap 842 843 Crashes int 844 845 // Stats response to a List request. 846 // This is the aggregate RepoStats of all repos matching the input query. 847 Stats RepoStats 848} 849 850type Searcher interface { 851 Search(ctx context.Context, q query.Q, opts *SearchOptions) (*SearchResult, error) 852 853 // List lists repositories. The query `q` can only contain 854 // query.Repo atoms. 855 List(ctx context.Context, q query.Q, opts *ListOptions) (*RepoList, error) 856 Close() 857 858 // Describe the searcher for debug messages. 859 String() string 860} 861 862type RepoListField int 863 864const ( 865 RepoListFieldRepos RepoListField = 0 866 RepoListFieldReposMap = 2 867) 868 869type ListOptions struct { 870 // Field decides which field to populate in RepoList response. 871 Field RepoListField 872} 873 874func (o *ListOptions) GetField() (RepoListField, error) { 875 if o == nil { 876 return RepoListFieldRepos, nil 877 } 878 switch o.Field { 879 case RepoListFieldRepos, RepoListFieldReposMap: 880 return o.Field, nil 881 case 1: 882 return 0, fmt.Errorf("RepoListFieldMinimal (%d) is no longer supported", o.Field) 883 default: 884 return 0, fmt.Errorf("unknown RepoListField %d", o.Field) 885 } 886} 887 888func (o *ListOptions) String() string { 889 return fmt.Sprintf("%#v", o) 890} 891 892type SearchOptions struct { 893 // Return an upper-bound estimate of eligible documents in 894 // stats.ShardFilesConsidered. 895 EstimateDocCount bool 896 897 // Return the whole file. 898 Whole bool 899 900 // Maximum number of matches: skip all processing an index 901 // shard after we found this many non-overlapping matches. 902 ShardMaxMatchCount int 903 904 // Maximum number of matches: stop looking for more matches 905 // once we have this many matches across shards. 906 TotalMaxMatchCount int 907 908 // Maximum number of matches: skip processing documents for a repository in 909 // a shard once we have found ShardRepoMaxMatchCount. 910 // 911 // A compound shard may contain multiple repositories. This will most often 912 // be set to 1 to find all repositories containing a result. 913 ShardRepoMaxMatchCount int 914 915 // Abort the search after this much time has passed. 916 MaxWallTime time.Duration 917 918 // FlushWallTime if non-zero will stop streaming behaviour at first and 919 // instead will collate and sort results. At FlushWallTime the results will 920 // be sent and then the behaviour will revert to the normal streaming. 921 FlushWallTime time.Duration 922 923 // Truncates the number of documents (i.e. files) after collating and 924 // sorting the results. 925 MaxDocDisplayCount int 926 927 // Truncates the number of matchs after collating and sorting the results. 928 MaxMatchDisplayCount int 929 930 // If set to a number greater than zero then up to this many number 931 // of context lines will be added before and after each matched line. 932 // Note that the included context lines might contain matches and 933 // it's up to the consumer of the result to remove those lines. 934 NumContextLines int 935 936 // If true, ChunkMatches will be returned in each FileMatch rather than LineMatches 937 // EXPERIMENTAL: the behavior of this flag may be changed in future versions. 938 ChunkMatches bool 939 940 // EXPERIMENTAL. If true, document ranks are used as additional input for 941 // sorting matches. 942 UseDocumentRanks bool 943 944 // EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust 945 // their weight in the file match score. If the value is <= 0.0, the default weight value 946 // will be used. This option is temporary and is only exposed for testing/ tuning purposes. 947 DocumentRanksWeight float64 948 949 // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula. 950 // Currently, this treats each match in a file as a term and computes an approximation to BM25. 951 // When enabled, all other scoring signals are ignored, including document ranks. 952 UseKeywordScoring bool 953 954 // Trace turns on opentracing for this request if true and if the Jaeger address was provided as 955 // a command-line flag 956 Trace bool 957 958 // If set, the search results will contain debug information for scoring. 959 DebugScore bool 960 961 // SpanContext is the opentracing span context, if it exists, from the zoekt client 962 SpanContext map[string]string 963} 964 965// String returns a succinct representation of the options. This is meant for 966// human consumption in logs and traces. 967// 968// Note: some tracing systems have limits on length of values, so we take care 969// to try and make this small, and include the important information near the 970// front incase of truncation. 971func (s *SearchOptions) String() string { 972 var b strings.Builder 973 974 add := func(name, value string) { 975 b.WriteString(name) 976 b.WriteByte('=') 977 b.WriteString(value) 978 b.WriteByte(' ') 979 } 980 addInt := func(name string, value int) { 981 if value != 0 { 982 add(name, strconv.Itoa(value)) 983 } 984 } 985 addDuration := func(name string, value time.Duration) { 986 if value != 0 { 987 add(name, value.String()) 988 } 989 } 990 addBool := func(name string, value bool) { 991 if !value { 992 return 993 } 994 b.WriteString(name) 995 b.WriteByte(' ') 996 } 997 998 b.WriteString("zoekt.SearchOptions{ ") 999 1000 addInt("ShardMaxMatchCount", s.ShardMaxMatchCount) 1001 addInt("TotalMaxMatchCount", s.TotalMaxMatchCount) 1002 addInt("ShardRepoMaxMatchCount", s.ShardRepoMaxMatchCount) 1003 addInt("MaxDocDisplayCount", s.MaxDocDisplayCount) 1004 addInt("MaxMatchDisplayCount", s.MaxMatchDisplayCount) 1005 addInt("NumContextLines", s.NumContextLines) 1006 1007 addDuration("MaxWallTime", s.MaxWallTime) 1008 addDuration("FlushWallTime", s.FlushWallTime) 1009 1010 if s.DocumentRanksWeight > 0 { 1011 add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64)) 1012 } 1013 1014 addBool("EstimateDocCount", s.EstimateDocCount) 1015 addBool("Whole", s.Whole) 1016 addBool("ChunkMatches", s.ChunkMatches) 1017 addBool("UseDocumentRanks", s.UseDocumentRanks) 1018 addBool("UseKeywordScoring", s.UseKeywordScoring) 1019 addBool("Trace", s.Trace) 1020 addBool("DebugScore", s.DebugScore) 1021 1022 for k, v := range s.SpanContext { 1023 add("SpanContext."+k, strconv.Quote(v)) 1024 } 1025 1026 b.WriteByte('}') 1027 return b.String() 1028} 1029 1030// Sender is the interface that wraps the basic Send method. 1031type Sender interface { 1032 Send(*SearchResult) 1033} 1034 1035// Streamer adds the method StreamSearch to the Searcher interface. 1036type Streamer interface { 1037 Searcher 1038 StreamSearch(ctx context.Context, q query.Q, opts *SearchOptions, sender Sender) (err error) 1039}