Source code for scorio.eval.gpass

r"""Generalized pass-family evaluation metrics for binary outcomes.

Estimate stability-style pass metrics from a binary outcome matrix
:math:`R \in \{0,1\}^{M \times N}`. ``G-Pass@k`` is the all-success
threshold, ``G-Pass@k``\ :sub:`\tau` requires at least
:math:`\lceil \tau k \rceil` successes, and ``mG-Pass@k`` averages over
thresholds :math:`\tau \in [0.5, 1.0]`. Bayesian summaries compute posterior
``mu`` and ``sigma`` under a Beta model for each question's latent success
rate.

Available API
-------------------
- ``g_pass_at_k`` returns the :math:`\tau = 1` alias of ``pass_hat_k``.
- ``g_pass_at_k_tau`` returns the thresholded generalized pass metric.
- ``mg_pass_at_k`` returns the mean generalized pass metric.
- companion ``*_ci`` functions return ``(mu, sigma, lo, hi)`` using a
  normal-approximation credible interval around ``mu``.
"""

import math

import numpy as np
from scipy.special import comb

from .pass_at_k import (
    _beta_ratio,
    _binary_beta_posterior_params,
    _pass_at_k_bayes,
    _pass_hat_k_bayes,
    pass_at_k,
    pass_hat_k,
    pass_hat_k_ci,
)
from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval


def g_pass_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Compute G-Pass@k, the all-success (:math:`\tau = 1`) pass metric.

    This function is a naming alias: the arguments are forwarded unchanged
    to :func:`~scorio.eval.pass_hat_k` and its result is returned as-is. It
    exists for literature that uses the G-Pass@k name for the
    :math:`\tau = 1` threshold.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in
            :math:`\{0, 1\}`; :math:`R_{\alpha i} = 1` when trial
            :math:`i` of question :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).

    Returns:
        float: G-Pass@k averaged over all :math:`M` questions.

    Notation:
        :math:`\nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}` is the number of
        correct samples in row :math:`\alpha`, and :math:`C(a, b)` is the
        binomial coefficient :math:`\binom{a}{b}`.

    Formula:
        .. math::

            \mathrm{G\text{-}Pass@}k_\alpha = \frac{C(\nu_\alpha, k)}{C(N, k)}

        .. math::

            \mathrm{G\text{-}Pass@}k =
                \frac{1}{M} \sum_{\alpha=1}^{M} \mathrm{G\text{-}Pass@}k_\alpha

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(g_pass_at_k(R, 1), 6)
        0.7
        >>> round(g_pass_at_k(R, 2), 6)
        0.45
    """
    # Pure delegation: validation and the computation live in pass_hat_k.
    all_success_score = pass_hat_k(R, k)
    return all_success_score
def g_pass_at_k_tau(R: np.ndarray, k: int, tau: float) -> float:
    r"""
    Compute the thresholded generalized pass metric G-Pass@k\ :sub:`τ`.

    For each question, this is the hypergeometric probability that a draw of
    ``k`` of its ``N`` recorded trials (without replacement) contains at
    least :math:`\lceil \tau k \rceil` successes; the per-question values
    are then averaged.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in
            :math:`\{0, 1\}`; :math:`R_{\alpha i} = 1` when trial
            :math:`i` of question :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).
        tau: Threshold :math:`\tau \in [0, 1]`. ``tau <= 0`` is delegated to
            ``pass_at_k`` (Pass@k); ``tau = 1`` requires all ``k`` samples
            to be correct (Pass^k).

    Returns:
        float: G-Pass@k\ :sub:`τ` averaged over all :math:`M` questions.

    Raises:
        ValueError: If ``tau`` is outside ``[0, 1]`` (NaN included) or ``k``
            is outside ``[1, N]``.

    Notation:
        :math:`\nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}` is the number of
        correct samples in row :math:`\alpha`, :math:`C(a, b)` is the
        binomial coefficient, and :math:`j_0 = \lceil \tau \cdot k \rceil`
        is the minimum number of successes required.

    Formula:
        .. math::

            \mathrm{G\text{-}Pass@}k_{\tau, \alpha} =
                \sum_{j=j_0}^{k}
                \frac{C(\nu_\alpha, j) \cdot C(N - \nu_\alpha, k - j)}{C(N, k)}

        .. math::

            \mathrm{G\text{-}Pass@}k_\tau =
                \frac{1}{M} \sum_{\alpha=1}^{M}
                \mathrm{G\text{-}Pass@}k_{\tau, \alpha}

    Note:
        :math:`j_0` is evaluated as ``math.ceil(tau * k)`` in floating
        point. A product that lands marginally above an integer is rounded
        up to the next one: ``0.07 * 100`` evaluates to
        ``7.000000000000001``, so the threshold becomes 8 rather than 7.
        Thresholds that are exact binary fractions (0.25, 0.5, 0.75, 1.0)
        are not affected.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(g_pass_at_k_tau(R, 2, 0.5), 6)
        0.95
        >>> round(g_pass_at_k_tau(R, 2, 1.0), 6)
        0.45
    """
    outcomes = _as_2d_int_matrix(R)
    _validate_binary(outcomes)
    M, N = outcomes.shape

    # Keep the negated chained comparison: it is also true for NaN, so NaN
    # thresholds are rejected instead of slipping through.
    if not 0.0 <= tau <= 1.0:
        raise ValueError(f"tau must be in [0, 1]; got {tau}")
    if not 1 <= k <= N:
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # A zero threshold is defined as plain Pass@k rather than the (trivial)
    # "at least zero successes" event.
    if tau <= 0.0:
        return pass_at_k(outcomes, k)

    successes = np.sum(outcomes, axis=1)
    total_draws = comb(N, k)
    min_hits = int(math.ceil(tau * k))
    if min_hits > k:
        return 0.0

    # Upper tail of the hypergeometric pmf, accumulated for all rows at once
    # in increasing order of the hit count.
    tail_terms = (
        comb(successes, hits) * comb(N - successes, k - hits) / total_draws
        for hits in range(min_hits, k + 1)
    )
    per_question = sum(tail_terms, np.zeros(M, dtype=float))
    return float(np.mean(per_question))
def mg_pass_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Compute mG-Pass@k, the mean of G-Pass@k\ :sub:`τ` over
    :math:`\tau \in [0.5, 1.0]`.

    The average over thresholds is evaluated in closed form as
    :math:`(2/k)\,E[(X - m)_+]`, where :math:`X` is the number of successes
    among ``k`` trials drawn without replacement and
    :math:`m = \lceil k/2 \rceil`.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in
            :math:`\{0, 1\}`; :math:`R_{\alpha i} = 1` when trial
            :math:`i` of question :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).

    Returns:
        float: mG-Pass@k averaged over all :math:`M` questions. The result
        is ``0.0`` when :math:`\lceil k/2 \rceil \ge k` (i.e. ``k = 1``),
        because no success count exceeds the majority threshold.

    Raises:
        ValueError: If ``k`` is outside ``[1, N]``.

    Notation:
        :math:`\nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}` is the number of
        correct samples in row :math:`\alpha`;
        :math:`m = \lceil k/2 \rceil` is the majority threshold; and
        :math:`X_\alpha \sim \mathrm{Hypergeometric}(N, \nu_\alpha, k)` is
        the number of correct samples among :math:`k` selections.

    Formula:
        .. math::

            \mathrm{mG\text{-}Pass@}k_\alpha =
                2 \int_{0.5}^{1.0} \mathrm{G\text{-}Pass@}k_{\tau, \alpha} \, d\tau

        .. math::

            \mathrm{mG\text{-}Pass@}k_\alpha =
                \frac{2}{k} \sum_{j=m+1}^{k} (j - m) \cdot P(X_\alpha = j)

        .. math::

            P(X_\alpha = j) =
                \frac{C(\nu_\alpha, j) \cdot C(N - \nu_\alpha, k - j)}{C(N, k)}

        .. math::

            \mathrm{mG\text{-}Pass@}k =
                \frac{1}{M} \sum_{\alpha=1}^{M} \mathrm{mG\text{-}Pass@}k_\alpha

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(mg_pass_at_k(R, 2), 6)
        0.45
        >>> round(mg_pass_at_k(R, 3), 6)
        0.166667
    """
    outcomes = _as_2d_int_matrix(R)
    _validate_binary(outcomes)
    M, N = outcomes.shape
    if not 1 <= k <= N:
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    half = int(math.ceil(0.5 * k))
    if half >= k:
        # No success count can exceed the majority threshold.
        return 0.0

    successes = np.sum(outcomes, axis=1)
    total_draws = comb(N, k)

    # Each hit count above the threshold contributes its excess over the
    # threshold times its hypergeometric probability: E[(X - half)_+].
    weighted_terms = (
        (hits - half)
        * (comb(successes, hits) * comb(N - successes, k - hits) / total_draws)
        for hits in range(half + 1, k + 1)
    )
    expected_excess = sum(weighted_terms, np.zeros(M, dtype=float))
    per_question = expected_excess * (2.0 / k)
    return float(np.mean(per_question))
def _beta_polynomial_moments(
    alpha: np.ndarray,
    beta: np.ndarray,
    M: int,
    k: int,
    js: list[int],
    coeff: list[float],
) -> tuple[float, float]:
    """Aggregate posterior mean/std of ``g(p) = Σ_j c_j p^j (1-p)^(k-j)``.

    Shared by the thresholded and mean G-Pass@k Bayesian summaries, which
    differ only in the exponents ``js`` and the coefficients ``coeff``.

    Args:
        alpha: Per-question posterior Beta ``alpha`` parameters (indexed
            ``0..M-1``).
        beta: Per-question posterior Beta ``beta`` parameters (indexed
            ``0..M-1``).
        M: Number of questions.
        k: Number of selected samples (the polynomial degree).
        js: Success counts ``j`` appearing in the polynomial.
        coeff: Coefficient ``c_j`` for each entry of ``js`` (same length).

    Returns:
        tuple[float, float]: ``(mu, sigma)`` where ``mu`` is the mean of the
        per-question posterior means and ``sigma`` is
        ``sqrt(sum of per-question variances) / M``.
    """
    # NOTE(review): `_beta_ratio(a, b, r, s)` is used here as
    # E[p^r (1-p)^s] under Beta(a, b) -- confirm against its definition in
    # the pass_at_k module.
    terms = list(zip(coeff, js, strict=True))
    means = np.empty(M, dtype=float)
    vars_ = np.empty(M, dtype=float)

    for i in range(M):
        a_i = float(alpha[i])
        b_i = float(beta[i])

        # First moment E[g(p)].
        first = 0.0
        for c_j, j in terms:
            first += c_j * _beta_ratio(a_i, b_i, j, k - j)

        # Second moment E[g(p)^2]: the product of two terms is again a
        # monomial p^(j+j2) (1-p)^(2k-(j+j2)).
        second = 0.0
        for c_j, j in terms:
            for c_j2, j2 in terms:
                total = j + j2
                second += c_j * c_j2 * _beta_ratio(a_i, b_i, total, 2 * k - total)

        means[i] = first
        # Clamp tiny negative values produced by floating-point cancellation.
        vars_[i] = max(0.0, second - first * first)

    mu = float(np.mean(means))
    sigma = float(math.sqrt(float(np.sum(vars_))) / M)
    return mu, sigma


def _g_pass_at_k_tau_bayes(
    R: np.ndarray, k: int, tau: float, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. G-Pass@k_τ quantity.

    The per-question target is
    ``g(p) = Σ_{j=j0..k} C(k,j) p^j (1-p)^(k-j)`` with
    ``j0 = ceil(tau * k)``, i.e. the probability of at least ``j0``
    successes in ``k`` i.i.d. draws with success rate ``p``.

    Args:
        R: ``M x N`` binary outcome matrix.
        k: Number of selected samples with ``1 <= k <= N``.
        tau: Threshold in ``[0, 1]``. ``tau <= 0`` is delegated to
            ``_pass_at_k_bayes`` and ``tau >= 1`` to ``_pass_hat_k_bayes``.
        alpha0: Beta prior parameter forwarded to the posterior helper.
        beta0: Beta prior parameter forwarded to the posterior helper.

    Returns:
        tuple[float, float]: ``(mu, sigma)`` aggregated over questions.

    Raises:
        ValueError: If ``tau`` is outside ``[0, 1]`` or ``k`` is outside
            ``[1, N]``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    if not (0.0 <= tau <= 1.0):
        raise ValueError(f"tau must be in [0, 1]; got {tau}")
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # The two endpoints have dedicated implementations.
    if tau <= 0.0:
        return _pass_at_k_bayes(Rm, k, alpha0=alpha0, beta0=beta0)
    if tau >= 1.0:
        return _pass_hat_k_bayes(Rm, k, alpha0=alpha0, beta0=beta0)

    j0 = int(math.ceil(tau * k))
    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    # g(p) = Σ_{j=j0..k} C(k,j) p^j (1-p)^{k-j}
    js = list(range(j0, k + 1))
    coeff = [float(comb(k, j)) for j in js]
    return _beta_polynomial_moments(alpha, beta, M, k, js, coeff)


def _mg_pass_at_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. mG-Pass@k quantity.

    The per-question target is
    ``g(p) = (2/k) Σ_{j=m+1..k} (j - m) C(k,j) p^j (1-p)^(k-j)`` with
    ``m = ceil(k / 2)``.

    Args:
        R: ``M x N`` binary outcome matrix.
        k: Number of selected samples with ``1 <= k <= N``.
        alpha0: Beta prior parameter forwarded to the posterior helper.
        beta0: Beta prior parameter forwarded to the posterior helper.

    Returns:
        tuple[float, float]: ``(mu, sigma)`` aggregated over questions, or
        ``(0.0, 0.0)`` when ``ceil(k / 2) >= k`` (the sum is empty).

    Raises:
        ValueError: If ``k`` is outside ``[1, N]``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Computed before the early return so that any checks performed by the
    # posterior helper still run for every k.
    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    majority = int(math.ceil(0.5 * k))
    if majority >= k:
        return 0.0, 0.0

    js = list(range(majority + 1, k + 1))
    coeff = [float((2.0 / k) * (j - majority) * comb(k, j)) for j in js]
    return _beta_polynomial_moments(alpha, beta, M, k, js, coeff)
def g_pass_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Summarize the Bayesian posterior of G-Pass@k.

    G-Pass@k is the all-success threshold, so its posterior target is the
    one handled by :func:`~scorio.eval.pass_hat_k_ci`; every argument is
    forwarded there unchanged.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    interval_options = {
        "confidence": confidence,
        "bounds": bounds,
        "alpha0": alpha0,
        "beta0": beta0,
    }
    return pass_hat_k_ci(R, k, **interval_options)
def g_pass_at_k_tau_ci(
    R: np.ndarray,
    k: int,
    tau: float,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Summarize the Bayesian posterior of thresholded G-Pass@k.

    The latent target is the probability that at least
    :math:`\lceil \tau k \rceil` of ``k`` i.i.d. samples are correct. The
    posterior mean and standard deviation come from
    ``_g_pass_at_k_tau_bayes``; the interval is the two-sided result of
    ``normal_credible_interval`` around that mean.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        tau: Threshold parameter in ``[0, 1]``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    posterior_mean, posterior_std = _g_pass_at_k_tau_bayes(
        R, k, tau, alpha0=alpha0, beta0=beta0
    )
    lower, upper = normal_credible_interval(
        posterior_mean,
        posterior_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    summary = (posterior_mean, posterior_std, lower, upper)
    return (
        float(summary[0]),
        float(summary[1]),
        float(summary[2]),
        float(summary[3]),
    )
def mg_pass_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Summarize the Bayesian posterior of mG-Pass@k.

    The latent target averages thresholded G-Pass@k over thresholds from
    ``0.5`` to ``1.0`` using the closed-form weighting in
    :func:`~scorio.eval.mg_pass_at_k`. The posterior mean and standard
    deviation come from ``_mg_pass_at_k_bayes``; the interval is the
    two-sided result of ``normal_credible_interval`` around that mean.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    posterior_mean, posterior_std = _mg_pass_at_k_bayes(
        R, k, alpha0=alpha0, beta0=beta0
    )
    lower, upper = normal_credible_interval(
        posterior_mean,
        posterior_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    summary = (posterior_mean, posterior_std, lower, upper)
    return (
        float(summary[0]),
        float(summary[1]),
        float(summary[2]),
        float(summary[3]),
    )
# Public API: the three point estimators followed by their ``*_ci``
# posterior-summary companions.
__all__ = [
    "g_pass_at_k",
    "g_pass_at_k_tau",
    "mg_pass_at_k",
    "g_pass_at_k_ci",
    "g_pass_at_k_tau_ci",
    "mg_pass_at_k_ci",
]