"""
Evaluation-metric ranking methods.
These methods map each model's responses to a scalar score and then convert
scores to ranks with :func:`scorio.utils.rank_scores`.
Notation
--------
Let :math:`R \\in \\{0,1,\\ldots,C\\}^{L \\times M \\times N}` denote model
outcomes, and define per-question correct-count summaries
:math:`k_{lm}=\\sum_{n=1}^{N} R_{lmn}` when outcomes are binary.
The module follows the score template

.. math::

    s_l = \\frac{1}{M}\\sum_{m=1}^{M} g_m(k_{lm}, N; \\psi),

where :math:`g_m` depends on the selected evaluation metric.
"""
import numpy as np
from scipy.stats import norm
from scorio import eval
from scorio.utils import rank_scores
from ._base import validate_input
from ._types import RankMethod, RankResult
def avg(
    R: np.ndarray,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models by their overall empirical accuracy.

    Method context:
        The plainest pointwise baseline: every model is summarised by a
        single number, the fraction of its ``M * N`` recorded outcomes that
        are successes, and models are ordered by that number.

    Args:
        R: Binary outcomes, either a tensor of shape ``(L, M, N)`` or a
            matrix of shape ``(L, M)``, which is handled as ``N=1``.
        method: Which tie-handling variant of ``rank_scores`` to return:
            ``"competition"``, ``"dense"``, ``"avg"`` or
            ``"competition_max"``.
        return_scores: When ``True`` the result is the pair
            ``(ranking, scores)`` instead of the ranking alone.

    Returns:
        The ranking, an array of shape ``(L,)``.
        With ``return_scores=True`` the per-model ``scores`` (shape
        ``(L,)``) are returned alongside it.

    Notation:
        ``R[l, m, n]`` denotes the binary outcome of model ``l`` on question
        ``m`` in trial ``n``.

    Formula:
        .. math::

            s_l^{\\mathrm{avg}} = \\frac{1}{MN}
            \\sum_{m=1}^{M}\\sum_{n=1}^{N} R_{lmn}

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1], [0, 1]],
        ...     [[1, 0], [0, 0]],
        ... ])
        >>> ranks, scores = rank.avg(R, return_scores=True)
        >>> scores.round(3).tolist()
        [0.75, 0.25]
        >>> ranks.tolist()
        [1, 2]
    """
    outcomes = validate_input(R)
    num_models, _, _ = outcomes.shape

    per_model = []
    for idx in range(num_models):
        # eval.avg returns a sequence; only its first entry is the score.
        per_model.append(eval.avg(outcomes[idx])[0])
    scores = np.array(per_model)

    ranking = rank_scores(scores)[method]
    if return_scores:
        return ranking, scores
    return ranking
def bayes(
    R: np.ndarray,
    w: np.ndarray | None = None,
    R0: np.ndarray | None = None,
    quantile: float | None = None,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models with Bayes@N posterior statistics.

    Method context:
        For each model, this method computes Bayes@N posterior summary
        statistics ``(mu_l, sigma_l)`` from categorical outcomes. Ranking can
        be based on posterior mean (default) or a Normal-quantile conservative
        score.

    References:
        Hariri, M., Samandar, A., Hinczewski, M., & Chaudhary, V. (2026).
        Don't Pass@k: A Bayesian Framework for Large Language Model Evaluation.
        *ICLR 2026*, *arXiv:2510.04265*.
        https://arxiv.org/abs/2510.04265

    Args:
        R: Categorical outcome tensor with shape ``(L, M, N)`` or matrix
            ``(L, M)`` (treated as ``N=1``). Entries must be integers in
            ``{0, ..., C}``.
        w: Weight vector of shape ``(C+1,)`` mapping categories to scores;
            it is forwarded unchanged to :func:`scorio.eval.bayes`.
            If not provided and R is binary (contains only 0 and 1), defaults
            to ``[0, 1]`` (category 0 scores 0, category 1 scores 1).
            For non-binary R, w is required.
        R0: Optional prior outcomes, converted to an integer array.
            Supported shapes:

            - ``(M, D)``: one shared prior matrix reused for all models.
            - ``(L, M, D)``: model-specific prior outcomes.
        quantile: Optional quantile ``q`` in ``[0, 1]``. If ``None``, rank by
            posterior mean. Otherwise rank by ``mu_l + Phi^{-1}(q) sigma_l``.
        method: Tie-handling rule for score-to-rank conversion.
        return_scores: If ``True``, return ``(ranking, scores)``.

    Returns:
        Ranking array of shape ``(L,)``.
        If ``return_scores=True``, also returns per-model scores used for
        ranking (posterior means or quantile scores), shape ``(L,)``.

    Raises:
        ValueError: If ``quantile`` lies outside ``[0, 1]``, or if ``R0`` is
            neither 2-D with ``M`` rows nor 3-D with leading shape
            ``(L, M)``.

    Notation:
        ``mu_l, sigma_l`` are Bayes@N posterior mean and uncertainty for model
        ``l`` computed by :func:`scorio.eval.bayes`.

    Formula:
        .. math::

            s_l =
            \\begin{cases}
            \\mu_l, & \\text{if } q\\text{ is None} \\\\
            \\mu_l + \\Phi^{-1}(q)\\,\\sigma_l, & \\text{otherwise}
            \\end{cases}

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 0], [1, 1], [0, 0]],
        ...     [[0, 0], [1, 0], [1, 1]],
        ... ])
        >>> w = np.array([0.0, 1.0])
        >>> R0 = np.array([[1, 1], [0, 1], [0, 0]])  # shared prior
        >>> ranks, scores = rank.bayes(R, w=w, R0=R0, return_scores=True)
        >>> ranks.shape, scores.shape
        ((2,), (2,))

    Notes:
        Lower quantiles (for example ``q=0.05``) implement conservative ranking
        by penalizing posterior uncertainty.

        The endpoints ``q=0`` and ``q=1`` pass validation, but
        ``Phi^{-1}`` is infinite there, so the resulting scores are not
        finite; use an interior quantile for a meaningful ranking.
    """
    R = validate_input(R, binary_only=False)
    L, M, _ = R.shape

    if quantile is not None and not (0.0 <= quantile <= 1.0):
        raise ValueError(f"quantile must be in [0, 1]; got {quantile}")

    # Validate the prior up front so a bad shape fails before any scoring.
    if R0 is not None:
        R0 = np.asarray(R0, dtype=int)
        if R0.ndim == 2:
            if R0.shape[0] != M:
                raise ValueError(
                    f"Shared R0 must have shape (M={M}, D), got {R0.shape}"
                )
        elif R0.ndim == 3:
            if R0.shape[0] != L or R0.shape[1] != M:
                raise ValueError(
                    f"Model-specific R0 must have shape (L={L}, M={M}, D), got {R0.shape}"
                )
        else:
            raise ValueError(
                "R0 must be shape (M, D) or (L, M, D); "
                f"got ndim={R0.ndim} with shape {R0.shape}"
            )

    # Standard-normal quantile multiplier; None means "rank by the mean".
    z = norm.ppf(quantile) if quantile is not None else None

    scores = np.zeros(L)
    for model in range(L):
        # A 3-D prior is sliced per model; a 2-D prior is shared as-is.
        if R0 is None:
            model_R0 = None
        elif R0.ndim == 3:
            model_R0 = R0[model]
        else:
            model_R0 = R0

        mu, sigma = eval.bayes(R[model], w, R0=model_R0)
        scores[model] = mu if z is None else mu + z * sigma

    ranking = rank_scores(scores)[method]
    return (ranking, scores) if return_scores else ranking
def pass_at_k(
    R: np.ndarray,
    k: int,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models according to their Pass@k score.

    Method context:
        For one question, Pass@k is the probability that ``k`` samples drawn
        without replacement contain at least one correct answer. A model's
        score is the mean of that probability over its questions.

    References:
        Chen, M., Tworek, J., Jun, H., et al. (2021).
        Evaluating Large Language Models Trained on Code.
        *arXiv:2107.03374*.
        https://arxiv.org/abs/2107.03374

    Args:
        R: Binary outcomes as a tensor of shape ``(L, M, N)`` or a matrix of
            shape ``(L, M)``.
        k: How many samples are selected, with ``1 <= k <= N``.
        method: Tie-handling rule, used to pick a result of ``rank_scores``.
        return_scores: When ``True`` the result is ``(ranking, scores)``.

    Returns:
        The ranking, an array of shape ``(L,)``.
        With ``return_scores=True`` the per-model Pass@k scores are returned
        as well.

    Notation:
        ``nu_lm = sum_{n=1}^N R_lmn`` counts the successes of model ``l`` on
        question ``m``.

    Formula:
        .. math::

            s_l^{\\mathrm{Pass@}k}
            = \\frac{1}{M} \\sum_{m=1}^{M}
            \\left(1 - \\frac{\\binom{N-\\nu_{lm}}{k}}{\\binom{N}{k}}\\right)

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1, 0], [0, 1, 0]],
        ...     [[1, 0, 0], [0, 0, 0]],
        ... ])
        >>> ranks, scores = rank.pass_at_k(R, k=2, return_scores=True)
        >>> ranks.tolist()
        [1, 2]
    """
    outcomes = validate_input(R)
    num_models, _, _ = outcomes.shape

    # One Pass@k value per model, each computed by eval.pass_at_k.
    per_model_scores = [
        eval.pass_at_k(outcomes[idx], k) for idx in range(num_models)
    ]
    scores = np.array(per_model_scores)

    ranking = rank_scores(scores)[method]
    if return_scores:
        return ranking, scores
    return ranking
def pass_hat_k(
    R: np.ndarray,
    k: int,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models according to Pass-hat@k (G-Pass@k).

    Method context:
        For one question, Pass-hat@k is the probability that every one of
        the ``k`` selected samples is correct. A model's score is the mean
        of that probability over its questions.

    References:
        Yao, S., Shinn, N., Razavi, P., & Narasimhan, K. (2024).
        tau-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains.
        *arXiv:2406.12045*.
        https://arxiv.org/abs/2406.12045

    Args:
        R: Binary outcomes as a tensor of shape ``(L, M, N)`` or a matrix of
            shape ``(L, M)``.
        k: How many samples are selected, with ``1 <= k <= N``.
        method: Tie-handling rule, used to pick a result of ``rank_scores``.
        return_scores: When ``True`` the result is ``(ranking, scores)``.

    Returns:
        The ranking, an array of shape ``(L,)``.
        With ``return_scores=True`` the per-model Pass-hat@k scores are
        returned as well.

    Notation:
        ``nu_lm = sum_{n=1}^N R_lmn``.

    Formula:
        .. math::

            s_l^{\\widehat{\\mathrm{Pass@}k}}
            = \\frac{1}{M} \\sum_{m=1}^{M}
            \\frac{\\binom{\\nu_{lm}}{k}}{\\binom{N}{k}}

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1, 0], [0, 1, 0]],
        ...     [[1, 0, 0], [0, 0, 0]],
        ... ])
        >>> rank.pass_hat_k(R, k=1).tolist()
        [1, 2]
    """
    outcomes = validate_input(R)
    num_models, _, _ = outcomes.shape

    # Collect one Pass-hat@k value per model before ranking.
    collected = []
    for idx in range(num_models):
        collected.append(eval.pass_hat_k(outcomes[idx], k))
    scores = np.array(collected)

    rankings_by_rule = rank_scores(scores)
    ranking = rankings_by_rule[method]
    if return_scores:
        return ranking, scores
    return ranking
def g_pass_at_k_tau(
    R: np.ndarray,
    k: int,
    tau: float,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models according to the generalized G-Pass@k_tau score.

    Method context:
        For one question, G-Pass@k_tau is the probability that ``k`` draws
        without replacement contain at least ``ceil(tau * k)`` successes.
        Small ``tau`` approaches Pass@k, while ``tau=1`` gives Pass-hat@k.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2025).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: Binary outcomes as a tensor of shape ``(L, M, N)`` or a matrix of
            shape ``(L, M)``.
        k: How many samples are selected, with ``1 <= k <= N``.
        tau: Threshold parameter in ``[0, 1]``.
        method: Tie-handling rule, used to pick a result of ``rank_scores``.
        return_scores: When ``True`` the result is ``(ranking, scores)``.

    Returns:
        The ranking, an array of shape ``(L,)``.
        With ``return_scores=True`` the per-model G-Pass@k_tau scores are
        returned as well.

    Notation:
        ``X_lm ~ Hypergeom(N, nu_lm, k)``, where ``nu_lm`` is the success
        count of model ``l`` on question ``m``.

    Formula:
        .. math::

            s_l^{\\mathrm{G\\text{-}Pass@}k_\\tau}
            = \\frac{1}{M} \\sum_{m=1}^{M}
            \\Pr\\left(X_{lm} \\ge \\lceil \\tau k \\rceil\\right)

        .. math::

            \\Pr\\left(X_{lm} \\ge \\lceil \\tau k \\rceil\\right)
            =
            \\sum_{j=\\lceil \\tau k \\rceil}^{k}
            \\frac{\\binom{\\nu_{lm}}{j}\\binom{N-\\nu_{lm}}{k-j}}
            {\\binom{N}{k}}

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1, 0], [0, 1, 0]],
        ...     [[1, 0, 0], [0, 0, 0]],
        ... ])
        >>> rank.g_pass_at_k_tau(R, k=2, tau=1.0).tolist() == rank.pass_hat_k(R, 2).tolist()
        True
    """
    outcomes = validate_input(R)
    num_models, _, _ = outcomes.shape

    def score_model(idx):
        # G-Pass@k_tau of a single model, delegated to the eval module.
        return eval.g_pass_at_k_tau(outcomes[idx], k, tau)

    scores = np.array([score_model(idx) for idx in range(num_models)])

    ranking = rank_scores(scores)[method]
    if return_scores:
        return ranking, scores
    return ranking
def mg_pass_at_k(
    R: np.ndarray,
    k: int,
    method: RankMethod = "competition",
    return_scores: bool = False,
) -> RankResult:
    """
    Rank models according to mG-Pass@k, the mean generalized pass metric.

    Method context:
        mG-Pass@k combines G-Pass@k_tau over ``tau in [0.5, 1]`` using the
        discrete summation proposed in the G-Pass literature, which yields a
        score that emphasises stability.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2025).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: Binary outcomes as a tensor of shape ``(L, M, N)`` or a matrix of
            shape ``(L, M)``.
        k: How many samples are selected, with ``1 <= k <= N``.
        method: Tie-handling rule, used to pick a result of ``rank_scores``.
        return_scores: When ``True`` the result is ``(ranking, scores)``.

    Returns:
        The ranking, an array of shape ``(L,)``.
        With ``return_scores=True`` the per-model mG-Pass@k scores are
        returned as well.

    Notation:
        ``X_lm ~ Hypergeom(N, nu_lm, k)``, and
        ``m0 = ceil(k/2)``.

    Formula:
        .. math::

            s_l^{\\mathrm{mG\\text{-}Pass@}k}
            = \\frac{1}{M} \\sum_{m=1}^{M}
            \\frac{2}{k} \\sum_{i=m_0+1}^{k}
            \\Pr(X_{lm} \\ge i)

        .. math::

            \\frac{2}{k} \\sum_{i=m_0+1}^{k}
            \\Pr(X_{lm} \\ge i)
            =
            \\frac{2}{k} \\, \\mathbb{E}\\left[(X_{lm}-m_0)_+\\right]

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1, 0], [0, 1, 0]],
        ...     [[1, 0, 0], [0, 0, 0]],
        ... ])
        >>> ranks, scores = rank.mg_pass_at_k(R, k=2, return_scores=True)
        >>> ranks.tolist()
        [1, 2]
    """
    outcomes = validate_input(R)
    num_models, _, _ = outcomes.shape

    # Evaluate each model separately, then turn the scores into ranks.
    model_scores = []
    for idx in range(num_models):
        model_score = eval.mg_pass_at_k(outcomes[idx], k)
        model_scores.append(model_score)
    scores = np.array(model_scores)

    ranking = rank_scores(scores)[method]
    if return_scores:
        return ranking, scores
    return ranking
# Public API of this module: one ranking entry point per evaluation metric.
__all__ = [
    # Accuracy and Bayesian posterior scores.
    "avg",
    "bayes",
    # Pass-family metrics.
    "pass_at_k",
    "pass_hat_k",
    "g_pass_at_k_tau",
    "mg_pass_at_k",
]