# Source code for scorio.eval.auc

r"""AUC@K evaluation metrics for binary outcomes.

Estimate normalized area under the Pass@j curve for budgets
:math:`j = 1, \ldots, k`. For a binary outcome matrix
:math:`R \in \{0,1\}^{M \times N}`, :math:`AUC@K` averages per-question
Pass@j values using trapezoidal weights matching Eq. (7) of Hu et al. (2026).
The Bayesian summary computes posterior ``mu`` and ``sigma`` under a Beta
model for each question's latent success rate.

Available API
-------------------
- ``auc_at_k`` returns the point estimate.
- ``auc_at_k_ci`` returns ``(mu, sigma, lo, hi)`` using a normal-approximation
  credible interval around ``mu``.
"""

import math

import numpy as np
from scipy.special import comb

from .pass_at_k import (
    _beta_ratio,
    _binary_beta_posterior_params,
    pass_at_k,
    pass_at_k_ci,
)
from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval


def _validate_k(N: int, k: int) -> None:
    """Ensure the sampling budget ``k`` lies in the closed range ``[1, N]``.

    Args:
        N: Number of trials available per question.
        k: Requested sampling budget.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than ``N``.
    """
    budget_ok = 1 <= k <= N
    if budget_ok:
        return
    raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")


def _pass_at_k_values_from_counts(nu: np.ndarray, N: int, k: int) -> np.ndarray:
    r"""Vectorized Pass@k values from per-row success counts.

    Evaluates :math:`1 - \binom{N - \nu}{k} / \binom{N}{k}` for every entry of
    ``nu`` using the equivalent product form

    .. math::
        \frac{\binom{N - \nu}{k}}{\binom{N}{k}}
        = \prod_{i=0}^{k-1} \frac{N - \nu - i}{N - i}

    Every factor lies in ``[0, 1]``, so the computation cannot overflow.
    Forming the two binomial coefficients separately in floating point
    overflows to ``inf`` for large ``N`` and turns the ratio into ``nan``.

    Args:
        nu: Number of successful trials per row (scalar or array).
        N: Total number of trials per row.
        k: Sampling budget; callers are expected to ensure ``1 <= k <= N``
            (larger ``k`` would put a zero in the denominator).

    Returns:
        Array of Pass@k values with the same shape as ``nu``. The intermediate
        array has shape ``nu.shape + (k,)``.
    """
    failures = N - np.asarray(nu, dtype=float)
    draw_index = np.arange(k, dtype=float)
    # Clamp at zero: once more draws are requested than there are failing
    # trials, C(N - nu, k) is 0 and the whole product must vanish.
    remaining_failures = np.clip(failures[..., np.newaxis] - draw_index, 0.0, None)
    all_fail_prob = np.prod(remaining_failures / (N - draw_index), axis=-1)
    return 1.0 - all_fail_prob


def _auc_at_k_coefficients(k: int) -> np.ndarray:
    """Build the Eq. (7) trapezoidal weights applied to Pass@1..Pass@K.

    For ``k == 1`` the single weight is 1. Otherwise the two endpoints get
    ``0.5 / (k - 1)`` and every interior point gets ``1 / (k - 1)``, so the
    weights sum to one.

    Args:
        k: Number of budgets (length of the returned weight vector).

    Returns:
        Float array of length ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1; got {k}")
    if k == 1:
        return np.array([1.0], dtype=float)

    intervals = k - 1
    weights = np.empty(k, dtype=float)
    weights[1:-1] = 1.0 / intervals  # interior nodes (empty slice when k == 2)
    weights[[0, -1]] = 0.5 / intervals  # both endpoints carry half weight
    return weights


def auc_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Performance evaluation using AUC@K.

    Computes, per question, a trapezoidal-rule weighted sum of the Pass@j
    estimates for budgets :math:`j = 1, \ldots, k`, then averages over all
    questions.

    References:
        Hu, Z., et al. (2026). Rewarding the Rare: Uniqueness-Aware RL for
        Creative Problem Solving in LLMs. *arXiv:2601.08763*.
        https://arxiv.org/abs/2601.08763

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Maximum sampling budget (:math:`1 \le k \le N`).

    Returns:
        float: The average AUC@K score across all :math:`M` questions.

    Notation:
        For each row :math:`\alpha`:

        .. math::
            \nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}
            \quad \text{(number of correct samples)}

            \mathrm{Pass@}j_\alpha
            = 1 - \frac{\binom{N - \nu_\alpha}{j}}{\binom{N}{j}}

        For :math:`k > 1`, define trapezoidal coefficients
        :math:`c_1 = c_k = \frac{1}{2(k-1)}` and
        :math:`c_j = \frac{1}{k-1}` for :math:`2 \le j \le k-1`.
        For :math:`k = 1`, :math:`\mathrm{AUC@1} = \mathrm{Pass@1}`.

    Formula:
        .. math::
            \mathrm{AUC@}k_\alpha = \sum_{j=1}^{k} c_j \, \mathrm{Pass@}j_\alpha

        .. math::
            \mathrm{AUC@}k = \frac{1}{M} \sum_{\alpha=1}^{M} \mathrm{AUC@}k_\alpha

        Equivalently, for :math:`k > 1`,

        .. math::
            \mathrm{AUC@}k = \frac{1}{k - 1} \sum_{j=1}^{k-1}
            \frac{\mathrm{Pass@}j + \mathrm{Pass@}(j + 1)}{2}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(auc_at_k(R, 1), 6)
        0.7
        >>> round(auc_at_k(R, 2), 6)
        0.825
        >>> round(auc_at_k(R, 3), 6)
        0.9
    """
    outcomes = _as_2d_int_matrix(R)
    _validate_binary(outcomes)
    num_questions, num_trials = outcomes.shape
    _validate_k(num_trials, k)

    if k == 1:
        # A single budget has no interval to integrate over: AUC@1 is Pass@1.
        return pass_at_k(outcomes, 1)

    successes = np.sum(outcomes, axis=1)
    weights = _auc_at_k_coefficients(k)

    # Accumulate the weighted Pass@j curve question by question, budget by budget.
    per_question = np.zeros(num_questions, dtype=float)
    for budget in range(1, k + 1):
        pass_at_budget = _pass_at_k_values_from_counts(successes, num_trials, budget)
        per_question += weights[budget - 1] * pass_at_budget

    return float(np.mean(per_question))
def _auc_at_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for :func:`auc_at_k`.

    For each question the per-row Beta posterior parameters are obtained from
    ``_binary_beta_posterior_params``. With ``q = 1 - p`` and
    ``Pass@j(p) = 1 - q**j``, the per-question AUC@K target is
    ``1 - sum_j c_j q**j`` (the Eq. (7) weights ``c_j`` sum to one), so its
    first two posterior moments only need the values
    ``_beta_ratio(a, b, 0, t)`` for ``t = 1..2k``.
    NOTE(review): this presumes ``_beta_ratio(a, b, 0, t)`` is the posterior
    expectation of ``q**t`` and is a pure function -- confirm against
    ``pass_at_k``.

    Those values are tabulated once per question. The double sum over
    ``(j, l)`` only ever needs ``t = j + l``, so this replaces ``k**2`` helper
    calls with ``2k`` while keeping the original accumulation order.

    Args:
        R: Binary outcome matrix (questions x trials).
        k: Maximum sampling budget with ``1 <= k <= N``.
        alpha0: Beta prior parameter forwarded to the posterior helper.
        beta0: Beta prior parameter forwarded to the posterior helper.

    Returns:
        ``(mu, sigma)``: the mean over questions of the per-question posterior
        means, and ``sqrt(sum of per-question variances) / M``.

    Raises:
        ValueError: If ``k`` is outside ``[1, N]`` (via ``_validate_k``).
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    _validate_k(N, k)

    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)
    coeff = _auc_at_k_coefficients(k)
    budgets = range(1, k + 1)

    means = np.empty(M, dtype=float)
    vars_ = np.empty(M, dtype=float)

    # Eq. (7) becomes a weighted sum of Pass@j terms, and for Bernoulli
    # success rate p we use Pass@j(p) = 1 - (1 - p)^j.
    for i in range(M):
        a_i = float(alpha[i])
        b_i = float(beta[i])

        # moments[t] holds _beta_ratio(a_i, b_i, 0, t); slot 0 is padding so
        # the list can be indexed directly by the exponent t.
        moments = [0.0]
        moments.extend(_beta_ratio(a_i, b_i, 0, t) for t in range(1, 2 * k + 1))

        eq = np.array(moments[1 : k + 1], dtype=float)
        weighted_eq = float(np.dot(coeff, eq))

        # First moment: E[1 - sum_j c_j q^j].
        m = 1.0 - weighted_eq

        # Second moment: E[(1 - sum_j c_j q^j)^2]
        #   = 1 - 2 sum_j c_j E[q^j] + sum_{j,l} c_j c_l E[q^(j+l)].
        e2 = 1.0
        e2 -= 2.0 * weighted_eq
        for j in budgets:
            c_j = float(coeff[j - 1])
            for other in budgets:
                c_other = float(coeff[other - 1])
                e2 += c_j * c_other * moments[j + other]

        # Guard against tiny negative values caused by floating-point cancellation.
        v = max(0.0, e2 - m * m)
        means[i] = m
        vars_[i] = v

    mu = float(np.mean(means))
    sigma = float(math.sqrt(float(np.sum(vars_))) / M)
    return mu, sigma
def auc_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Bayesian posterior summary for the latent AUC@K target.

    Each question's success probability is modelled as a latent Bernoulli
    parameter with a Beta prior, and that uncertainty is propagated through
    the AUC@K weighted sum of i.i.d. Pass@j targets. The interval is obtained
    from ``normal_credible_interval`` centred on the posterior mean.

    For ``k = 1``, AUC@1 is Pass@1, so the call is delegated unchanged to
    :func:`pass_at_k_ci` with ``k = 1``.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Maximum sampling budget with ``1 <= k <= N``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    if k != 1:
        posterior_mean, posterior_std = _auc_at_k_bayes(
            R, k, alpha0=alpha0, beta0=beta0
        )
        lower, upper = normal_credible_interval(
            posterior_mean,
            posterior_std,
            credibility=confidence,
            two_sided=True,
            bounds=bounds,
        )
        return (
            float(posterior_mean),
            float(posterior_std),
            float(lower),
            float(upper),
        )

    # Degenerate budget: reuse the Pass@1 summary as-is.
    return pass_at_k_ci(
        R,
        1,
        confidence=confidence,
        bounds=bounds,
        alpha0=alpha0,
        beta0=beta0,
    )
# Public API of this module; the underscore-prefixed helpers stay internal.
__all__ = [
    "auc_at_k",
    "auc_at_k_ci",
]