fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

Pull scoring into own function (#582)

The `indexData.Search` method is very long. This PR pulls most of the scoring
logic into its own method, `indexData.scoreFileMatch`, which will make it easier to modify in future PRs.

Testing:
* All Zoekt tests still pass
* Ran example queries with and without the change; the search-debug output was
the same

+67 -62
+67 -62
eval.go
··· 264 264 cp.setDocument(nextDoc) 265 265 266 266 known := make(map[matchTree]bool) 267 - 268 267 md := d.repoMetaData[d.repos[nextDoc]] 269 268 270 269 for cost := costMin; cost <= costMax; cost++ { ··· 306 305 } 307 306 } 308 307 309 - atomMatchCount := 0 310 - visitMatches(mt, known, func(mt matchTree) { 311 - atomMatchCount++ 312 - }) 313 308 shouldMergeMatches := !opts.ChunkMatches 314 309 finalCands := gatherMatches(mt, known, shouldMergeMatches) 315 310 ··· 334 329 fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) 335 330 } 336 331 337 - maxFileScore := 0.0 338 - repetitions := 0 339 - for i := range fileMatch.LineMatches { 340 - if maxFileScore < fileMatch.LineMatches[i].Score { 341 - maxFileScore = fileMatch.LineMatches[i].Score 342 - repetitions = 0 343 - } else if maxFileScore == fileMatch.LineMatches[i].Score { 344 - repetitions += 1 345 - } 346 - 347 - // Order by ordering in file. 348 - fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 349 - } 350 - 351 - for i := range fileMatch.ChunkMatches { 352 - if maxFileScore < fileMatch.ChunkMatches[i].Score { 353 - maxFileScore = fileMatch.ChunkMatches[i].Score 354 - } 355 - 356 - // Order by ordering in file. 357 - fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 358 - } 359 - 360 - // Maintain ordering of input files. This 361 - // strictly dominates the in-file ordering of 362 - // the matches. 363 - fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 364 - 365 - // Prefer docs with several top-scored matches. 366 - fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 367 - 368 - // atom-count boosts files with matches from more than 1 atom. The 369 - // maximum boost is scoreFactorAtomMatch. 
370 - if atomMatchCount > 0 { 371 - fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 372 - } 373 - 374 - if opts.UseDocumentRanks && len(d.ranks) > int(nextDoc) { 375 - weight := scoreFileRankFactor 376 - if opts.DocumentRanksWeight > 0.0 { 377 - weight = opts.DocumentRanksWeight 378 - } 379 - 380 - ranks := d.ranks[nextDoc] 381 - // The ranks slice always contains one entry representing the file rank (unless it's empty since the 382 - // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 383 - // and we plan to clean this up. 384 - if len(ranks) > 0 { 385 - // The file rank represents a log (base 2) count. The log ranks should be bounded at 32, but we 386 - // cap it just in case to ensure it falls in the range [0, 1]. 387 - normalized := math.Min(1.0, ranks[0]/32.0) 388 - fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 389 - } 390 - } 391 - 392 - fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(nextDoc)/float64(len(d.boundaries))), opts.DebugScore) 393 - fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 332 + d.scoreFileMatch(&fileMatch, nextDoc, mt, known, opts) 394 333 395 334 fileMatch.Branches = d.gatherBranches(nextDoc, mt, known) 396 335 sortMatchesByScore(fileMatch.LineMatches) ··· 442 381 } 443 382 444 383 return &res, nil 384 + } 385 + 386 + func (d *indexData) scoreFileMatch(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { 387 + atomMatchCount := 0 388 + visitMatches(mt, known, func(mt matchTree) { 389 + atomMatchCount++ 390 + }) 391 + 392 + // atom-count boosts files with matches from more than 1 atom. The 393 + // maximum boost is scoreFactorAtomMatch. 
394 + if atomMatchCount > 0 { 395 + fileMatch.addScore("atom", (1.0-1.0/float64(atomMatchCount))*scoreFactorAtomMatch, opts.DebugScore) 396 + } 397 + 398 + maxFileScore := 0.0 399 + repetitions := 0 400 + for i := range fileMatch.LineMatches { 401 + if maxFileScore < fileMatch.LineMatches[i].Score { 402 + maxFileScore = fileMatch.LineMatches[i].Score 403 + repetitions = 0 404 + } else if maxFileScore == fileMatch.LineMatches[i].Score { 405 + repetitions += 1 406 + } 407 + 408 + // Order by ordering in file. 409 + fileMatch.LineMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.LineMatches)))) 410 + } 411 + 412 + for i := range fileMatch.ChunkMatches { 413 + if maxFileScore < fileMatch.ChunkMatches[i].Score { 414 + maxFileScore = fileMatch.ChunkMatches[i].Score 415 + } 416 + 417 + // Order by ordering in file. 418 + fileMatch.ChunkMatches[i].Score += scoreLineOrderFactor * (1.0 - (float64(i) / float64(len(fileMatch.ChunkMatches)))) 419 + } 420 + 421 + // Maintain ordering of input files. This 422 + // strictly dominates the in-file ordering of 423 + // the matches. 424 + fileMatch.addScore("fragment", maxFileScore, opts.DebugScore) 425 + 426 + // Prefer docs with several top-scored matches. 427 + fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore) 428 + 429 + if opts.UseDocumentRanks && len(d.ranks) > int(doc) { 430 + weight := scoreFileRankFactor 431 + if opts.DocumentRanksWeight > 0.0 { 432 + weight = opts.DocumentRanksWeight 433 + } 434 + 435 + ranks := d.ranks[doc] 436 + // The ranks slice always contains one entry representing the file rank (unless it's empty since the 437 + // file doesn't have a rank). This is left over from when documents could have multiple rank signals, 438 + // and we plan to clean this up. 439 + if len(ranks) > 0 { 440 + // The file rank represents a log (base 2) count. 
The log ranks should be bounded at 32, but we 441 + // cap it just in case to ensure it falls in the range [0, 1]. 442 + normalized := math.Min(1.0, ranks[0]/32.0) 443 + fileMatch.addScore("file-rank", weight*normalized, opts.DebugScore) 444 + } 445 + } 446 + 447 + md := d.repoMetaData[d.repos[doc]] 448 + fileMatch.addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries))), opts.DebugScore) 449 + fileMatch.addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16, opts.DebugScore) 445 450 } 446 451 447 452 func addRepo(res *SearchResult, repo *Repository) {