fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Ranking: adapt to new ranking service API (#553)

We are updating the ranking service API to return only a single rank per
file, representing the log of that file's reference count. This PR adapts
zoekt-indexserver to the new API shape and updates the normalization strategy.
For now, it doesn't change the on-disk format to keep the change simple.

+59 -43
+9 -9
build/e2e_test.go
··· 1175 1175 1176 1176 cases := []struct { 1177 1177 name string 1178 - documentRanks []float64 1178 + documentRank float64 1179 1179 documentRanksWeight float64 1180 1180 wantScore float64 1181 1181 }{ ··· 1185 1185 wantScore: 7012.00, 1186 1186 }, 1187 1187 { 1188 - name: "score with document ranks", 1189 - documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1190 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 7200 (file rank) + 10 (file order) 1191 - wantScore: 14212.00, 1188 + name: "score with document ranks", 1189 + documentRank: 0.8, 1190 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 225 (file rank) + 10 (file order) 1191 + wantScore: 7237.00, 1192 1192 }, 1193 1193 { 1194 1194 name: "score with custom document ranks weight", 1195 - documentRanks: []float64{0, 0, 0, 0, 0.8, 0, 0}, 1195 + documentRank: 0.8, 1196 1196 documentRanksWeight: 1000.0, 1197 - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 800 (file rank) + 10 (file order) 1198 - wantScore: 7812.00, 1197 + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 25.00 (file rank) + 10 (file order) 1198 + wantScore: 7037.00, 1199 1199 }, 1200 1200 } 1201 1201 ··· 1206 1206 t.Fatalf("NewBuilder: %v", err) 1207 1207 } 1208 1208 1209 - err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: c.documentRanks}) 1209 + err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: []float64{c.documentRank}}) 1210 1210 if err != nil { 1211 1211 t.Fatal(err) 1212 1212 }
+26 -28
cmd/zoekt-sourcegraph-indexserver/sg.go
··· 78 78 // GetDocumentRanks returns a map from paths within the given repo to their 79 79 // rank vectors. Paths are assumed to be ordered by each pairwise component of 80 80 // the resulting vector, higher ranks coming earlier 81 - GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) 81 + GetDocumentRanks(ctx context.Context, repoName string) (RepoPathRanks, error) 82 82 83 83 // UpdateIndexStatus sends a request to Sourcegraph to confirm that the 84 84 // given repositories have been indexed. 85 85 UpdateIndexStatus(repositories []indexStatus) error 86 + } 87 + 88 + type RepoPathRanks struct { 89 + MeanRank float64 `json:"mean_reference_count"` 90 + Paths map[string]float64 `json:"paths"` 86 91 } 87 92 88 93 func newSourcegraphClient(rootURL *url.URL, hostname string, batchSize int) *sourcegraphClient { ··· 166 171 167 172 // GetDocumentRanks asks Sourcegraph for a mapping of file paths to rank 168 173 // vectors. 169 - func (s *sourcegraphClient) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) { 174 + func (s *sourcegraphClient) GetDocumentRanks(ctx context.Context, repoName string) (RepoPathRanks, error) { 170 175 u := s.Root.ResolveReference(&url.URL{ 171 176 Path: "/.internal/ranks/" + strings.Trim(repoName, "/") + "/documents", 172 177 }) 173 178 174 179 b, err := s.get(ctx, u) 175 180 if err != nil { 176 - return nil, err 181 + return RepoPathRanks{}, err 177 182 } 178 183 179 - ranks := make(map[string][]float64) 184 + ranks := RepoPathRanks{} 180 185 err = json.Unmarshal(b, &ranks) 181 186 if err != nil { 182 - return nil, err 183 - } 184 - 185 - // Invariant: All rank vectors have the same length. 
186 - first := true 187 - wantLen := -1 188 - for _, v := range ranks { 189 - if first { 190 - first = false 191 - wantLen = len(v) 192 - continue 193 - } 194 - if len(v) != wantLen { 195 - return nil, fmt.Errorf("found rank vectors of different length %d<>%d\n", wantLen, len(v)) 196 - } 187 + return RepoPathRanks{}, err 197 188 } 198 189 199 190 return ranks, nil ··· 517 508 } 518 509 519 510 // GetDocumentRanks expects a file where each line has the following format: 520 - // path<tab>rank... where rank is a float64 in [0,1]. Multiple ranks are 521 - // separated by a comma. Each line must have the same number of ranks. 522 - func (sf sourcegraphFake) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, error) { 511 + // path<tab>rank... where rank is a float64. 512 + func (sf sourcegraphFake) GetDocumentRanks(ctx context.Context, repoName string) (RepoPathRanks, error) { 523 513 dir := filepath.Join(sf.RootDir, filepath.FromSlash(repoName)) 524 514 525 515 fd, err := os.Open(filepath.Join(dir, "SG_DOCUMENT_RANKS")) 526 516 if err != nil { 527 - return nil, err 517 + return RepoPathRanks{}, err 528 518 } 529 519 530 - ranks := make(map[string][]float64) 520 + ranks := RepoPathRanks{} 531 521 522 + sum := 0.0 523 + count := 0 532 524 scanner := bufio.NewScanner(fd) 533 525 for scanner.Scan() { 534 526 s := scanner.Text() 535 527 pathRanks := strings.Split(s, "\t") 536 - ranks[pathRanks[0]] = floats64(pathRanks[1]) 528 + if rank, err := strconv.ParseFloat(pathRanks[1], 64); err == nil { 529 + ranks.Paths[pathRanks[0]] = rank 530 + sum += rank 531 + count++ 532 + } 537 533 } 534 + 538 535 if err := scanner.Err(); err != nil { 539 - return nil, err 536 + return RepoPathRanks{}, err 540 537 } 541 538 539 + ranks.MeanRank = sum / float64(count) 542 540 return ranks, nil 543 541 } 544 542 ··· 790 788 return nil, nil 791 789 } 792 790 793 - func (s sourcegraphNop) GetDocumentRanks(ctx context.Context, repoName string) (map[string][]float64, 
error) { 794 - return nil, nil 791 + func (s sourcegraphNop) GetDocumentRanks(ctx context.Context, repoName string) (RepoPathRanks, error) { 792 + return RepoPathRanks{}, nil 795 793 } 796 794 797 795 func (s sourcegraphNop) UpdateIndexStatus(repositories []indexStatus) error {
+9 -4
eval.go
··· 18 18 "context" 19 19 "fmt" 20 20 "log" 21 + "math" 21 22 "regexp/syntax" 22 23 "sort" 23 24 "strings" ··· 377 378 } 378 379 379 380 ranks := d.ranks[nextDoc] 380 - // This is a temporary workaround -- we only really want the PageRank score, and ignore 381 - // everything else. In a follow-up we'll simplify the rank format and remove this hack. 382 - if len(ranks) > 4 { 383 - fileMatch.addScore("file-rank", weight*d.ranks[nextDoc][4], opts.DebugScore) 381 + // The ranks slice always contains one entry representing the file rank (unless it's empty since the 382 + // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 383 + // and we plan to clean this up. 384 + if len(ranks) > 0 { 385 + // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 386 + // cap it just in case to ensure it falls in the range [0, 1]. 387 + normalized := math.Min(1.0, ranks[0]/32.0) 388 + fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 384 389 } 385 390 } 386 391
+15 -2
gitindex/index.go
··· 494 494 return fmt.Errorf("build.NewBuilder: %w", err) 495 495 } 496 496 497 - var ranks map[string][]float64 497 + var ranks repoPathRanks 498 498 if opts.BuildOptions.DocumentRanksPath != "" { 499 499 data, err := os.ReadFile(opts.BuildOptions.DocumentRanksPath) 500 500 if err != nil { ··· 554 554 if err != nil { 555 555 return err 556 556 } 557 + 558 + var pathRank []float64 559 + if rank, ok := ranks.Paths[keyFullPath]; ok { 560 + pathRank = []float64{rank} 561 + } 562 + fmt.Printf("path: %s, ranks length: %d", keyFullPath, len(pathRank)) 563 + 557 564 if err := builder.Add(zoekt.Document{ 558 565 SubRepositoryPath: key.SubRepoPath, 559 566 Name: keyFullPath, 560 567 Content: contents, 561 568 Branches: brs, 562 - Ranks: ranks[keyFullPath], 569 + Ranks: pathRank, 563 570 }); err != nil { 564 571 return fmt.Errorf("error adding document with name %s: %w", keyFullPath, err) 565 572 } 566 573 } 567 574 } 575 + 568 576 return builder.Finish() 577 + } 578 + 579 + type repoPathRanks struct { 580 + MeanRank float64 `json:"mean_reference_count"` 581 + Paths map[string]float64 `json:"paths"` 569 582 } 570 583 571 584 func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {