# Source code for scorio.eval.pass_at_k

"""Pass family metrics and uncertainty estimators for binary outcomes.

Quantify performance under test-time sampling by evaluating what happens when
``k`` responses are selected per question. This module provides the classic
Pass@k point estimators for finite observed trials and Bayesian uncertainty
estimators under a Beta posterior model for per-question success
probabilities.

Methods
-------
- ``pass_at_k``: probability that at least one selected trial is successful.
- ``pass_hat_k``: probability that all selected trials are successful.

Each metric has a companion ``*_ci`` function that returns
``(mu, sigma, lo, hi)`` under the Bayesian uncertainty model used in this
module. Generalized G-Pass variants live in :mod:`scorio.eval.gpass`.
"""

import math

import numpy as np
from scipy.special import betaln, comb

from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval


def pass_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Unbiased Pass@k estimator.

    Computes the probability that at least one of *k* randomly selected
    samples is correct, averaged over all *M* questions.

    References:
        Chen, M., Tworek, J., Jun, H., et al. (2021).
        Evaluating Large Language Models Trained on Code.
        *arXiv preprint arXiv:2107.03374*.
        https://arxiv.org/abs/2107.03374

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).

    Returns:
        float: The average Pass@k score across all *M* questions.

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.

    Notation:
        For each row :math:`\alpha`:

        .. math::
            \nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}
            \quad \text{(number of correct samples)}

        :math:`C(a, b)` denotes the binomial coefficient :math:`\binom{a}{b}`.

    Formula:
        .. math::
            \text{Pass@k}_\alpha = 1 - \frac{C(N - \nu_\alpha, k)}{C(N, k)}

        .. math::
            \text{Pass@k} = \frac{1}{M} \sum_{\alpha=1}^{M} \text{Pass@k}_\alpha

        The ratio of binomial coefficients is evaluated as the product
        :math:`\prod_{j=0}^{k-1} \frac{N - \nu_\alpha - j}{N - j}` so that
        large :math:`N` and :math:`k` cannot overflow to ``inf / inf``.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(pass_at_k(R, 1), 6)
        0.7
        >>> round(pass_at_k(R, 2), 6)
        0.95
    """
    R = _as_2d_int_matrix(R)
    _validate_binary(R)
    _, N = R.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Number of failed trials per question, shape (M,).
    failures = N - np.sum(R, axis=1)

    # Probability that all k draws (without replacement) are failures.
    # Each factor lies in [0, 1], so the running product never overflows.
    # Clamping at zero makes the product exactly 0 when fewer than k
    # failures exist, matching C(N - nu, k) == 0.
    all_fail = np.ones(failures.shape, dtype=float)
    for j in range(int(k)):
        all_fail *= np.maximum(failures - j, 0) / (N - j)

    return float(np.mean(1.0 - all_fail))
def pass_hat_k(R: np.ndarray, k: int) -> float:
    r"""
    Pass^k (Pass-hat@k): probability that all *k* selected trials are correct.

    Computes the probability that *k* randomly selected samples are ALL
    correct, averaged over all *M* questions. Also known as G-Pass@k.

    References:
        Yao, S., Shinn, N., Razavi, P., & Narasimhan, K. (2024).
        :math:`\tau`-bench: A Benchmark for Tool-Agent-User Interaction
        in Real-World Domains.
        *arXiv preprint arXiv:2406.12045*.
        https://arxiv.org/abs/2406.12045

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).

    Returns:
        float: The average Pass^k score across all *M* questions.

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.

    Notation:
        For each row :math:`\alpha`:

        .. math::
            \nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}
            \quad \text{(number of correct samples)}

        :math:`C(a, b)` denotes the binomial coefficient :math:`\binom{a}{b}`.

    Formula:
        .. math::
            \hat{\text{Pass@k}}_\alpha = \frac{C(\nu_\alpha, k)}{C(N, k)}

        .. math::
            \hat{\text{Pass@k}} = \frac{1}{M} \sum_{\alpha=1}^{M}
            \hat{\text{Pass@k}}_\alpha

        The ratio of binomial coefficients is evaluated as the product
        :math:`\prod_{j=0}^{k-1} \frac{\nu_\alpha - j}{N - j}` so that
        large :math:`N` and :math:`k` cannot overflow to ``inf / inf``.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(pass_hat_k(R, 1), 6)
        0.7
        >>> round(pass_hat_k(R, 2), 6)
        0.45
    """
    R = _as_2d_int_matrix(R)
    _validate_binary(R)
    _, N = R.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Number of successful trials per question, shape (M,).
    successes = np.sum(R, axis=1)

    # Probability that all k draws (without replacement) are successes.
    # Each factor lies in [0, 1], so the running product never overflows.
    # Clamping at zero makes the product exactly 0 when fewer than k
    # successes exist, matching C(nu, k) == 0.
    all_pass = np.ones(successes.shape, dtype=float)
    for j in range(int(k)):
        all_pass *= np.maximum(successes - j, 0) / (N - j)

    return float(np.mean(all_pass))
def _beta_ratio(alpha: float, beta: float, a: int, b: int) -> float:
    """Compute Beta(alpha+a, beta+b) / Beta(alpha, beta) stably.

    The ratio is formed in log space with ``betaln`` and exponentiated once,
    so neither Beta function value is materialized on its own.
    """
    log_ratio = betaln(alpha + a, beta + b) - betaln(alpha, beta)
    return float(math.exp(log_ratio))


def _binary_beta_posterior_params(
    R: np.ndarray, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[np.ndarray, np.ndarray]:
    """Per-row Beta posterior parameters for binary outcomes with Beta(alpha0,beta0) prior.

    Returns ``(alpha, beta)`` where, for each row with ``c`` successes out of
    ``N`` trials, ``alpha = alpha0 + c`` and ``beta = beta0 + (N - c)``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    _, N = Rm.shape
    c = np.sum(Rm, axis=1).astype(float)
    alpha = alpha0 + c
    beta = beta0 + (N - c)
    return alpha, beta


def _beta_power_moments(
    a: np.ndarray, b: np.ndarray, k: int
) -> tuple[np.ndarray, np.ndarray]:
    """Row-wise ``E[x^k]`` and ``Var[x^k]`` for independent ``x ~ Beta(a, b)``.

    Uses ``E[x^m] = B(a + m, b) / B(a, b)`` evaluated in log space. The
    variance is clamped at zero to absorb floating-point round-off.
    """
    log_norm = betaln(a, b)
    first = np.exp(betaln(a + k, b) - log_norm)  # E[x^k]
    second = np.exp(betaln(a + 2 * k, b) - log_norm)  # E[x^(2k)]
    return first, np.maximum(0.0, second - first * first)


def _pass_at_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. Pass@k quantity: 1 - (1 - p)^k.

    Returns ``(mu, sigma)``: ``mu`` is the average of the per-row posterior
    means and ``sigma`` is ``sqrt(sum of per-row variances) / M``.

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    # With p ~ Beta(alpha, beta), q = 1 - p ~ Beta(beta, alpha).
    # g(p) = 1 - q^k, so E[g] = 1 - E[q^k] and Var[g] = Var[q^k].
    # Taking the variance of q^k directly avoids subtracting two numbers
    # close to 1 when E[q^k] is tiny.
    miss_mean, variances = _beta_power_moments(beta, alpha, k)

    mu = float(np.mean(1.0 - miss_mean))
    sigma = float(math.sqrt(float(np.sum(variances))) / M)
    return mu, sigma


def _pass_hat_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. Pass^k quantity: p^k.

    Returns ``(mu, sigma)``: ``mu`` is the average of the per-row posterior
    means and ``sigma`` is ``sqrt(sum of per-row variances) / M``.

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    # g(p) = p^k with p ~ Beta(alpha, beta).
    means, variances = _beta_power_moments(alpha, beta, k)

    mu = float(np.mean(means))
    sigma = float(math.sqrt(float(np.sum(variances))) / M)
    return mu, sigma
def pass_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Bayesian :math:`\mu`, :math:`\sigma`, and credible interval for i.i.d. Pass@k.

    Each question's underlying correctness probability :math:`p` is treated
    as latent with a :math:`\text{Beta}(\alpha_0, \beta_0)` prior; the
    resulting per-question posterior uncertainty is propagated to the
    dataset-level metric.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).
        confidence: credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Notation:
        Per-question posterior:
        :math:`p_\alpha \mid R \sim \text{Beta}(\alpha_0 + c_\alpha,\;
        \beta_0 + N - c_\alpha)` where :math:`c_\alpha = \sum_i R_{\alpha i}`.

    Formula:
        The per-question i.i.d. quantity is:

        .. math::
            g(p) = 1 - (1 - p)^k

        Its posterior mean and variance are:

        .. math::
            \mathbb{E}[g(p_\alpha)] &= 1 - \frac{B(\alpha_\alpha,\;
            \beta_\alpha + k)}{B(\alpha_\alpha, \beta_\alpha)}

            \text{Var}[g(p_\alpha)] &= \mathbb{E}[g(p_\alpha)^2]
            - \mathbb{E}[g(p_\alpha)]^2

        Dataset-level aggregation:

        .. math::
            \mu &= \frac{1}{M} \sum_{\alpha} \mathbb{E}[g(p_\alpha)]

            \sigma &= \frac{1}{M} \sqrt{\sum_{\alpha} \text{Var}[g(p_\alpha)]}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = pass_at_k_ci(R, 1)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.642857, 0.118451, 0.4107, 0.875)
        >>> mu, sigma, lo, hi = pass_at_k_ci(R, 2)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.839286, 0.097263, 0.6487, 1.0)
    """
    # Dataset-level posterior mean and standard deviation.
    post_mean, post_std = _pass_at_k_bayes(R, k, alpha0=alpha0, beta0=beta0)

    # Two-sided interval around the mean, clipped to ``bounds``.
    lower, upper = normal_credible_interval(
        post_mean,
        post_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    return float(post_mean), float(post_std), float(lower), float(upper)
def pass_hat_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Bayesian :math:`\mu`, :math:`\sigma`, and credible interval for i.i.d. Pass^k.

    Each question's underlying correctness probability :math:`p` is treated
    as latent with a :math:`\text{Beta}(\alpha_0, \beta_0)` prior; the
    resulting per-question posterior uncertainty is propagated to the
    dataset-level metric.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).
        confidence: credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Notation:
        Per-question posterior:
        :math:`p_\alpha \mid R \sim \text{Beta}(\alpha_0 + c_\alpha,\;
        \beta_0 + N - c_\alpha)` where :math:`c_\alpha = \sum_i R_{\alpha i}`.

    Formula:
        The per-question i.i.d. quantity is:

        .. math::
            g(p) = p^k

        Its posterior mean and variance are:

        .. math::
            \mathbb{E}[g(p_\alpha)] &= \frac{B(\alpha_\alpha + k,\;
            \beta_\alpha)}{B(\alpha_\alpha, \beta_\alpha)}

            \text{Var}[g(p_\alpha)] &= \mathbb{E}[g(p_\alpha)^2]
            - \mathbb{E}[g(p_\alpha)]^2

        Dataset-level aggregation:

        .. math::
            \mu &= \frac{1}{M} \sum_{\alpha} \mathbb{E}[g(p_\alpha)]

            \sigma &= \frac{1}{M} \sqrt{\sum_{\alpha} \text{Var}[g(p_\alpha)]}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = pass_hat_k_ci(R, 1)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.642857, 0.118451, 0.4107, 0.875)
        >>> mu, sigma, lo, hi = pass_hat_k_ci(R, 2)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.446429, 0.146167, 0.1599, 0.7329)
    """
    # Dataset-level posterior mean and standard deviation.
    post_mean, post_std = _pass_hat_k_bayes(R, k, alpha0=alpha0, beta0=beta0)

    # Two-sided interval around the mean, clipped to ``bounds``.
    lower, upper = normal_credible_interval(
        post_mean,
        post_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    return float(post_mean), float(post_std), float(lower), float(upper)
# Public API: the two point estimators and their Bayesian interval companions.
__all__ = [
    "pass_at_k",
    "pass_hat_k",
    "pass_at_k_ci",
    "pass_hat_k_ci",
]