fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

score: introduce query.Boost to scale score (#728)

This commit introduces a new primitive, Boost, to our query language. It
allows boosting (or dampening) the contribution that a query atom's
matches make to the score.

To achieve this we introduce boostMatchTree which records this weight.
We then adjust visitMatches to take an initial score weight (1.0),
and each time we recurse through a boostMatchTree the score weight
is multiplied by the boost weight. Additionally, candidateMatch now has a
new field, scoreWeight, which records the weight at the time of candidate
collection. Without boosting in the query this value will always be 1.

Finally when scoring a candidateMatch we take the final score for it and
multiply it by scoreWeight.

Note: we do not expose a way to set this in the query language, only the
query API.

Test Plan: Manual testing against webserver via the new phrase-boost URL
param. Additionally updated ranking tests to use the phrase booster.

+230 -52
+1 -1
api_test.go
··· 152 152 size: 112, 153 153 }, { 154 154 v: candidateMatch{}, 155 - size: 72, 155 + size: 80, 156 156 }, { 157 157 v: candidateChunk{}, 158 158 size: 40,
+5
bits.go
··· 16 16 17 17 import ( 18 18 "encoding/binary" 19 + "math" 19 20 "sort" 20 21 "unicode" 21 22 "unicode/utf8" ··· 391 392 func (m runeOffsetMap) sizeBytes() int { 392 393 return 8 * len(m) 393 394 } 395 + 396 + func epsilonEqualsOne(scoreWeight float64) bool { 397 + return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9 398 + }
+8
contentprovider.go
··· 660 660 } 661 661 } 662 662 663 + // scoreWeight != 1 means it affects score 664 + if !epsilonEqualsOne(m.scoreWeight) { 665 + score.score = score.score * m.scoreWeight 666 + if debug { 667 + score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) 668 + } 669 + } 670 + 663 671 if score.score > maxScore.score { 664 672 maxScore.score = score.score 665 673 maxScore.what = score.what
+30 -7
eval.go
··· 420 420 // whether there's an exact match on a symbol, the number of query clauses that matched, etc. 421 421 func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 422 422 atomMatchCount := 0 423 - visitMatches(mt, known, func(mt matchTree) { 423 + visitMatchAtoms(mt, known, func(mt matchTree) { 424 424 atomMatchCount++ 425 425 }) 426 426 ··· 544 544 return m[i].byteOffset < m[j].byteOffset 545 545 } 546 546 547 + // setScoreWeight is a helper used by gatherMatches to set the weight based on 548 + // the score weight of the matchTree. 549 + func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { 550 + for _, m := range cm { 551 + m.scoreWeight = scoreWeight 552 + } 553 + return cm 554 + } 555 + 547 556 // Gather matches from this document. This never returns a mixture of 548 557 // filename/content matches: if there are content matches, all 549 558 // filename matches are trimmed from the result. The matches are ··· 554 563 // but adjacent matches will remain. 555 564 func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { 556 565 var cands []*candidateMatch 557 - visitMatches(mt, known, func(mt matchTree) { 566 + visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { 558 567 if smt, ok := mt.(*substrMatchTree); ok { 559 - cands = append(cands, smt.current...) 568 + cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) 560 569 } 561 570 if rmt, ok := mt.(*regexpMatchTree); ok { 562 - cands = append(cands, rmt.found...) 571 + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 563 572 } 564 573 if rmt, ok := mt.(*wordMatchTree); ok { 565 - cands = append(cands, rmt.found...) 574 + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) 566 575 } 567 576 if smt, ok := mt.(*symbolRegexpMatchTree); ok { 568 - cands = append(cands, smt.found...) 
577 + cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) 569 578 } 570 579 }) 571 580 ··· 590 599 // are non-overlapping. 591 600 sort.Sort((sortByOffsetSlice)(cands)) 592 601 res = cands[:0] 602 + mergeRun := 1 593 603 for i, c := range cands { 594 604 if i == 0 { 595 605 res = append(res, c) ··· 599 609 lastEnd := last.byteOffset + last.byteMatchSz 600 610 end := c.byteOffset + c.byteMatchSz 601 611 if lastEnd >= c.byteOffset { 612 + mergeRun++ 613 + 614 + // Average out the score across the merged candidates. Only do it if 615 + // we are boosting to avoid floating point funkiness in the normal 616 + // case. 617 + if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) { 618 + last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun) 619 + } 620 + 621 + // latest candidate goes further, update our end 602 622 if end > lastEnd { 603 623 last.byteMatchSz = end - last.byteOffset 604 624 } 625 + 605 626 continue 627 + } else { 628 + mergeRun = 1 606 629 } 607 630 608 631 res = append(res, c) ··· 649 672 // returns all branches containing docID. 650 673 func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { 651 674 var mask uint64 652 - visitMatches(mt, known, func(mt matchTree) { 675 + visitMatchAtoms(mt, known, func(mt matchTree) { 653 676 bq, ok := mt.(*branchQueryMatchTree) 654 677 if !ok { 655 678 return
+5 -1
internal/e2e/e2e_rank_test.go
··· 118 118 t.Fatal(err) 119 119 } 120 120 121 + // q is marshalled as part of the test, so avoid our rewrites for 122 + // ranking. 123 + qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{}) 124 + 121 125 sOpts := zoekt.SearchOptions{ 122 126 // Use the same options sourcegraph has by default 123 127 ChunkMatches: true, ··· 128 132 129 133 DebugScore: *debugScore, 130 134 } 131 - result, err := ss.Search(context.Background(), q, &sOpts) 135 + result, err := ss.Search(context.Background(), qSearch, &sOpts) 132 136 if err != nil { 133 137 t.Fatal(err) 134 138 }
+2 -2
internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt
··· 3 3 targetRank: 1 4 4 5 5 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go** 6 + 30: return nil, errors.New("assets are not configured for this binary, please see ui/assets") 7 + 34: panic("assets are not configured for this binary, please see ui/assets") 6 8 33:func (p FailingAssetsProvider) Assets() http.FileSystem { 7 - 14: Assets() http.FileSystem 8 - 1:package assets 9 9 hidden 12 more line matches 10 10 11 11 github.com/sourcegraph/sourcegraph/schema/schema.go
+25 -25
internal/e2e/testdata/generate_unit_test.txt
··· 1 1 queryString: generate unit test 2 2 query: (and substr:"generate" substr:"unit" substr:"test") 3 - targetRank: 11 3 + targetRank: 1 4 + 5 + **github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts** 6 + 16: public title = 'Generate Unit Test' 7 + 14:export class GenerateTest implements Recipe { 8 + 15: public id: RecipeID = 'generate-unit-test' 9 + hidden 3 more line matches 10 + 11 + github.com/sourcegraph/sourcegraph/client/jetbrains/README.md 12 + 40:- Generate unit test 13 + 41:- Generate docstring 14 + 61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code. 15 + hidden 7 more line matches 16 + 17 + github.com/sourcegraph/cody/vscode/CHANGELOG.md 18 + 298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602) 19 + 218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907) 20 + 264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. 
[pull/907](https://github.com/sourcegraph/cody/pull/907) 21 + hidden 17 more line matches 22 + 23 + github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md 24 + 158:- Generate unit test 25 + 138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://<your-instance>.sourcegraph.com/users/<your-instance>/settings/tokens`). From here, generate a new access token. 26 + 159:- Generate docstring 27 + hidden 3 more line matches 4 28 5 29 github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go 6 30 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) { ··· 13 37 187:func (rpt *Report) selectOutputUnit(g *graph.Graph) { 14 38 75: SampleUnit string // Unit for the sample data from the profile. 15 39 hidden 48 more line matches 16 - 17 - github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua 18 - 9: generate = function(_, paths) 19 - 6: patterns = { pattern.new_path_basename "sg-test" }, 20 - 8: -- Invoked as part of unit tests for the autoindexing service 21 - hidden 1 more line matches 22 - 23 - github.com/golang/go/src/cmd/internal/testdir/testdir_test.go 24 - 273:type test struct { 25 - 74:func Test(t *testing.T) { 26 - 263:type testCommon struct { 27 - hidden 120 more line matches 28 - 29 - github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go 30 - 65: Unit string // seconds, nanoseconds, bytes, etc 31 - 77: NumUnit map[string][]string 32 - 68: unitX int64 33 - hidden 44 more line matches 34 - 35 - github.com/golang/go/src/cmd/link/internal/loader/loader.go 36 - 79: unit *sym.CompilationUnit 37 - 1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit { 38 - 228: generatedSyms Bitmap // symbols that generate their 
content, indexed by ext sym idx 39 - hidden 50 more line matches 40 40 41 41 hidden 245 more file matches
+3 -3
internal/e2e/testdata/rank_stats.txt
··· 1 1 queries: 14 2 - recall@1: 7 (50%) 3 - recall@5: 9 (64%) 4 - mrr: 0.579471 2 + recall@1: 9 (64%) 3 + recall@5: 11 (79%) 4 + mrr: 0.710733
+6 -7
internal/e2e/testdata/sourcegraphserver_docker_image_build.txt
··· 1 1 queryString: sourcegraph/server docker image build 2 2 query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build") 3 - targetRank: 14 3 + targetRank: 1 4 + 5 + **github.com/sourcegraph/sourcegraph/dev/tools.go** 6 + 7: // zoekt-* used in sourcegraph/server docker image build 7 + 1://go:build tools 8 + 2:// +build tools 4 9 5 10 github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go 6 11 458: Build int ··· 31 36 45: latestReleaseKubernetesBuild = newPingResponse("5.1.8") 32 37 50: latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8") 33 38 hidden 19 more line matches 34 - 35 - github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md 36 - 1:# Docker Single Container Deployment 37 - 294:### Insiders build 38 - 238:### File system performance on Docker for Mac 39 - hidden 52 more line matches 40 39 41 40 hidden 15 more file matches
+2
matchiter.go
··· 27 27 substrBytes []byte 28 28 substrLowered []byte 29 29 30 + scoreWeight float64 31 + 30 32 file uint32 31 33 symbolIdx uint32 32 34
+50 -6
matchtree.go
··· 170 170 child matchTree 171 171 } 172 172 173 + type boostMatchTree struct { 174 + child matchTree 175 + boost float64 176 + } 177 + 173 178 // Don't visit this subtree for collecting matches. 174 179 type noVisitMatchTree struct { 175 180 matchTree ··· 392 397 t.child.prepare(doc) 393 398 } 394 399 400 + func (t *boostMatchTree) prepare(doc uint32) { 401 + t.child.prepare(doc) 402 + } 403 + 395 404 func (t *substrMatchTree) prepare(nextDoc uint32) { 396 405 t.matchIterator.prepare(nextDoc) 397 406 t.current = t.matchIterator.candidates() ··· 455 464 return t.child.nextDoc() 456 465 } 457 466 467 + func (t *boostMatchTree) nextDoc() uint32 { 468 + return t.child.nextDoc() 469 + } 470 + 458 471 func (t *branchQueryMatchTree) nextDoc() uint32 { 459 472 var start uint32 460 473 if t.firstDone { ··· 513 526 514 527 func (t *fileNameMatchTree) String() string { 515 528 return fmt.Sprintf("f(%v)", t.child) 529 + } 530 + 531 + func (t *boostMatchTree) String() string { 532 + return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) 516 533 } 517 534 518 535 func (t *substrMatchTree) String() string { ··· 556 573 visitMatchTree(s.child, f) 557 574 case *fileNameMatchTree: 558 575 visitMatchTree(s.child, f) 576 + case *boostMatchTree: 577 + visitMatchTree(s.child, f) 559 578 case *symbolSubstrMatchTree: 560 579 visitMatchTree(s.substrMatchTree, f) 561 580 case *symbolRegexpMatchTree: ··· 575 594 }) 576 595 } 577 596 597 + func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { 598 + visitMatches(t, known, 1, func(mt matchTree, _ float64) { 599 + f(mt) 600 + }) 601 + } 602 + 578 603 // visitMatches visits all atoms which can contribute matches. Note: This 579 604 // skips noVisitMatchTree. 
580 - func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { 605 + func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { 581 606 switch s := t.(type) { 582 607 case *andMatchTree: 583 608 for _, ch := range s.children { 584 609 if known[ch] { 585 - visitMatches(ch, known, f) 610 + visitMatches(ch, known, weight, f) 586 611 } 587 612 } 588 613 case *andLineMatchTree: 589 - visitMatches(&s.andMatchTree, known, f) 614 + visitMatches(&s.andMatchTree, known, weight, f) 590 615 case *orMatchTree: 591 616 for _, ch := range s.children { 592 617 if known[ch] { 593 - visitMatches(ch, known, f) 618 + visitMatches(ch, known, weight, f) 594 619 } 595 620 } 621 + case *boostMatchTree: 622 + visitMatches(s.child, known, weight*s.boost, f) 596 623 case *symbolSubstrMatchTree: 597 - visitMatches(s.substrMatchTree, known, f) 624 + visitMatches(s.substrMatchTree, known, weight, f) 598 625 case *notMatchTree: 599 626 case *noVisitMatchTree: 600 627 // don't collect into negative trees. 601 628 case *fileNameMatchTree: 602 629 // We will just gather the filename if we do not visit this tree. 
603 630 default: 604 - f(s) 631 + f(s, weight) 605 632 } 606 633 } 607 634 ··· 876 903 return evalMatchTree(cp, cost, known, t.child) 877 904 } 878 905 906 + func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 907 + return evalMatchTree(cp, cost, known, t.child) 908 + } 909 + 879 910 func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 880 911 if t.contEvaluated { 881 912 return matchesStateForSlice(t.current) ··· 995 1026 996 1027 return &fileNameMatchTree{ 997 1028 child: ct, 1029 + }, nil 1030 + 1031 + case *query.Boost: 1032 + ct, err := d.newMatchTree(s.Child, opt) 1033 + if err != nil { 1034 + return nil, err 1035 + } 1036 + 1037 + return &boostMatchTree{ 1038 + child: ct, 1039 + boost: s.Boost, 998 1040 }, nil 999 1041 1000 1042 case *query.Substring: ··· 1287 1329 return nil, nil 1288 1330 } 1289 1331 case *fileNameMatchTree: 1332 + mt.child, err = pruneMatchTree(mt.child) 1333 + case *boostMatchTree: 1290 1334 mt.child, err = pruneMatchTree(mt.child) 1291 1335 case *andLineMatchTree: 1292 1336 child, err := pruneMatchTree(&mt.andMatchTree)
+61
query/boost.go
··· 1 + package query 2 + 3 + type ExperimentalPhraseBoostOptions struct { 4 + // The phrase needs to contain atleast this many terms. This is based on the 5 + // parsed query. 6 + // 7 + // Defaults to 3. 8 + MinTerms int 9 + 10 + // Boost is how much to multiply the phrase match scores by. 11 + // 12 + // Defaults to 20. 13 + Boost float64 14 + } 15 + 16 + // ExperimentalPhraseBoost transforms q into a query containing exact matches 17 + // to phrase boosted. opts control how and when the boosting is done. 18 + // 19 + // Note: This is a temporary API and will be removed in future commits. 20 + func ExpirementalPhraseBoost(q Q, phrase string, opts ExperimentalPhraseBoostOptions) Q { 21 + if opts.MinTerms == 0 { 22 + opts.MinTerms = 3 23 + } 24 + if opts.Boost == 0 { 25 + opts.Boost = 20 26 + } 27 + 28 + contentAtoms := 0 29 + caseSensitive := false 30 + VisitAtoms(q, func(q Q) { 31 + switch s := q.(type) { 32 + case *Regexp: 33 + // Check atom is for content 34 + if s.Content || (s.Content == s.FileName) { 35 + caseSensitive = s.CaseSensitive 36 + contentAtoms++ 37 + } 38 + case *Substring: 39 + if s.Content || (s.Content == s.FileName) { 40 + caseSensitive = s.CaseSensitive 41 + contentAtoms++ 42 + } 43 + } 44 + }) 45 + 46 + if contentAtoms < opts.MinTerms { 47 + return q 48 + } 49 + 50 + return NewOr( 51 + &Boost{ 52 + Boost: opts.Boost, 53 + Child: &Substring{ 54 + Pattern: phrase, 55 + Content: true, 56 + CaseSensitive: caseSensitive, 57 + }, 58 + }, 59 + q, 60 + ) 61 + }
+25
query/query.go
··· 386 386 } 387 387 } 388 388 389 + // Boost scales the contribution to score of descendents. 390 + type Boost struct { 391 + Child Q 392 + // Boost will multiply the score of its descendents. Values less than 1 will 393 + // give less importance while values greater than 1 will give more 394 + // importance. 395 + Boost float64 396 + } 397 + 398 + func (q *Boost) String() string { 399 + return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child) 400 + } 401 + 389 402 // Substring is the most basic query: a query for a substring. 390 403 type Substring struct { 391 404 Pattern string ··· 609 622 case *Type: 610 623 child, changed := flatten(s.Child) 611 624 return &Type{Child: child, Type: s.Type}, changed 625 + case *Boost: 626 + child, changed := flatten(s.Child) 627 + return &Boost{Child: child, Boost: s.Boost}, changed 612 628 default: 613 629 return q, false 614 630 } ··· 680 696 return ch 681 697 } 682 698 return &Type{Child: ch, Type: s.Type} 699 + case *Boost: 700 + ch := evalConstants(s.Child) 701 + if _, ok := ch.(*Const); ok { 702 + return ch 703 + } 704 + return &Boost{Boost: s.Boost, Child: ch} 683 705 case *Substring: 684 706 if len(s.Pattern) == 0 { 685 707 return &Const{true} ··· 728 750 q = &Not{Child: Map(s.Child, f)} 729 751 case *Type: 730 752 q = &Type{Type: s.Type, Child: Map(s.Child, f)} 753 + case *Boost: 754 + q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)} 731 755 } 732 756 return f(q) 733 757 } ··· 768 792 case *Or: 769 793 case *Not: 770 794 case *Type: 795 + case *Boost: 771 796 default: 772 797 v(iQ) 773 798 }
+7
web/server.go
··· 241 241 return nil, err 242 242 } 243 243 244 + // Experimental: The query string and boost exact phrases of it. 245 + if phraseBoost, err := strconv.ParseFloat(qvals.Get("phrase-boost"), 64); err == nil { 246 + q = query.ExpirementalPhraseBoost(q, queryStr, query.ExperimentalPhraseBoostOptions{ 247 + Boost: phraseBoost, 248 + }) 249 + } 250 + 244 251 repoOnly := true 245 252 query.VisitAtoms(q, func(q query.Q) { 246 253 _, ok := q.(*query.Repo)