# Source code for scorio.eval.auc

r"""AUC@K evaluation metrics for binary outcomes.

Estimate normalized area under the Pass@j curve for budgets
:math:`j = 1, \ldots, k`. For a binary outcome matrix
:math:`R \in \{0,1\}^{M \times N}`, :math:`AUC@K` averages per-question
Pass@j values using trapezoidal weights matching Eq. (7) of Hu et al. (2026).
The Bayesian summary computes posterior ``mu`` and ``sigma`` under a Beta
model for each question's latent success rate.

Available API
-------------------
- ``auc_at_k`` returns the point estimate.
- ``auc_at_k_ci`` returns ``(mu, sigma, lo, hi)`` using a normal-approximation
  credible interval around ``mu``.
"""

import math

import numpy as np
from scipy.special import comb

from .pass_at_k import (
    _beta_ratio,
    _binary_beta_posterior_params,
    pass_at_k,
    pass_at_k_ci,
)
from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval


def _validate_k(N: int, k: int) -> None:
    """Check that the budget ``k`` lies in the closed range ``[1, N]``.

    Args:
        N: Upper bound for the budget (callers pass the number of trials
            per question).
        k: Requested sampling budget.

    Raises:
        ValueError: If ``1 <= k <= N`` does not hold.
    """
    budget_in_range = 1 <= k <= N
    if budget_in_range:
        return
    raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")


def _pass_at_k_values_from_counts(nu: np.ndarray, N: int, k: int) -> np.ndarray:
    """Vectorized Pass@k values from per-row success counts.

    Evaluates ``1 - C(N - nu, k) / C(N, k)`` element-wise over ``nu``.

    Args:
        nu: Per-row success counts.
        N: Number of trials per row.
        k: Sampling budget.

    Returns:
        Pass@k values with the same shape as ``nu``.
    """
    denom = comb(N, k)
    if not np.isinf(denom):
        # Fast path: the binomial coefficient is representable in float64.
        return 1.0 - comb(N - nu, k) / denom

    # ``C(N, k)`` overflowed to ``inf`` (large N with k near N / 2), so the
    # plain ratio would be ``inf / inf = nan``. Use the equivalent product
    #     C(N - nu, k) / C(N, k) = prod_{i=0}^{k-1} (N - nu - i) / (N - i),
    # whose factors all lie in [0, 1]. Clipping the numerator at zero makes
    # the product vanish whenever fewer than ``k`` failures exist.
    # An infinite ``C(N, k)`` implies 0 < k < N, so ``N - i`` is never zero.
    draws = np.arange(k, dtype=float)
    failures = np.asarray(N - nu, dtype=float)[..., np.newaxis]
    factors = np.clip(failures - draws, 0.0, None) / (N - draws)
    return 1.0 - np.prod(factors, axis=-1)


def _auc_at_k_coefficients(k: int) -> np.ndarray:
    """Trapezoidal-rule weights (Eq. (7)) for AUC@K over Pass@1..Pass@K.

    Args:
        k: Number of budgets covered by the curve.

    Returns:
        Length-``k`` float array. For ``k == 1`` it is ``[1.0]``; otherwise
        the two end weights are ``0.5 / (k - 1)`` and every interior weight
        is ``1 / (k - 1)``.

    Raises:
        ValueError: If ``k < 1``.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1; got {k}")
    if k == 1:
        return np.array([1.0], dtype=float)

    # Start from unit weights, halve both endpoints, then normalise by the
    # number of trapezoid intervals.
    n_intervals = k - 1
    weights = np.ones(k, dtype=float)
    weights[[0, -1]] = 0.5
    return weights / n_intervals



# [docs]
def auc_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Mean AUC@K over the questions of a binary outcome matrix.

    References:
        Hu, Z., et al. (2026).
        Rewarding the Rare: Uniqueness-Aware RL for Creative Problem Solving in
        LLMs. *arXiv:2601.08763*.
        https://arxiv.org/abs/2601.08763

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`;
           :math:`R_{\alpha i} = 1` when trial :math:`i` of question
           :math:`\alpha` passed and 0 otherwise.
        k: Maximum sampling budget (:math:`1 \le k \le N`).

    Returns:
        float: AUC@K averaged over all :math:`M` questions.

    Notation:
        For each row :math:`\alpha`:

        .. math::

            \nu_\alpha = \sum_{i=1}^{N} R_{\alpha i} \quad \text{(number of correct samples)}

            \mathrm{Pass@}j_\alpha = 1 - \frac{\binom{N - \nu_\alpha}{j}}{\binom{N}{j}}

        For :math:`k > 1` the trapezoidal coefficients are
        :math:`c_1 = c_k = \frac{1}{2(k-1)}` and
        :math:`c_j = \frac{1}{k-1}` for :math:`2 \le j \le k-1`.
        For :math:`k = 1`, :math:`\mathrm{AUC@1} = \mathrm{Pass@1}`.

    Formula:
        .. math::

            \mathrm{AUC@}k_\alpha = \sum_{j=1}^{k} c_j \, \mathrm{Pass@}j_\alpha

        .. math::

            \mathrm{AUC@}k = \frac{1}{M} \sum_{\alpha=1}^{M} \mathrm{AUC@}k_\alpha

        Equivalently, for :math:`k > 1`,

        .. math::

            \mathrm{AUC@}k =
            \frac{1}{k - 1} \sum_{j=1}^{k-1}
            \frac{\mathrm{Pass@}j + \mathrm{Pass@}(j + 1)}{2}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(auc_at_k(R, 1), 6)
        0.7
        >>> round(auc_at_k(R, 2), 6)
        0.825
        >>> round(auc_at_k(R, 3), 6)
        0.9
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    n_questions, n_trials = Rm.shape
    _validate_k(n_trials, k)

    # AUC@1 is just Pass@1, so hand off to the dedicated estimator.
    if k == 1:
        return pass_at_k(Rm, 1)

    successes = np.sum(Rm, axis=1)
    weights = _auc_at_k_coefficients(k)

    # Accumulate the weighted Pass@j curve budget by budget (j = 1..k).
    per_question = np.zeros(n_questions, dtype=float)
    for budget in range(1, k + 1):
        pass_at_budget = _pass_at_k_values_from_counts(successes, n_trials, budget)
        per_question += weights[budget - 1] * pass_at_budget
    return float(np.mean(per_question))



def _auc_at_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for :func:`auc_at_k`.

    Args:
        R: Binary outcome matrix (one row per question, one column per trial).
        k: Maximum sampling budget; must satisfy ``1 <= k <= N``.
        alpha0: Beta prior parameter forwarded to
            ``_binary_beta_posterior_params``.
        beta0: Beta prior parameter forwarded to
            ``_binary_beta_posterior_params``.

    Returns:
        ``(mu, sigma)``: ``mu`` is the mean of the per-question posterior
        means; ``sigma`` is ``sqrt(sum of per-question variances) / M``.

    Raises:
        ValueError: If ``k`` is outside ``[1, N]``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    _validate_k(N, k)

    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)
    coeff = _auc_at_k_coefficients(k)

    means = np.empty(M, dtype=float)
    vars_ = np.empty(M, dtype=float)

    # Eq. (7) becomes a weighted sum of Pass@j terms, and for Bernoulli
    # success rate p we use Pass@j(p) = 1 - (1 - p)^j.
    for i in range(M):
        a_i = float(alpha[i])
        b_i = float(beta[i])

        # The second-moment double sum only needs the ratio at exponent
        # j + l, so there are just 2k distinct values per question. Evaluate
        # each once instead of k**2 times. Index 0 is padding so that
        # ``ratios[s]`` holds the exponent-``s`` term.
        # NOTE(review): ``_beta_ratio(a, b, 0, s)`` is presumably the
        # posterior moment E[(1 - p)^s]; confirm against ``.pass_at_k``.
        ratios = [0.0]
        ratios.extend(_beta_ratio(a_i, b_i, 0, s) for s in range(1, 2 * k + 1))

        eq = np.array(ratios[1 : k + 1], dtype=float)
        weighted_first = float(np.dot(coeff, eq))
        m = 1.0 - weighted_first

        # Second moment: 1 - 2 * sum_j c_j r_j + sum_{j,t} c_j c_t r_{j+t},
        # accumulated in the same order as before so results are unchanged.
        e2 = 1.0
        e2 -= 2.0 * weighted_first
        for j in range(1, k + 1):
            c_j = float(coeff[j - 1])
            for t in range(1, k + 1):
                c_t = float(coeff[t - 1])
                e2 += c_j * c_t * ratios[j + t]

        # Clamp tiny negative values produced by floating-point cancellation.
        v = max(0.0, e2 - m * m)
        means[i] = m
        vars_[i] = v

    mu = float(np.mean(means))
    sigma = float(math.sqrt(float(np.sum(vars_))) / M)
    return mu, sigma



# [docs]
def auc_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Posterior mean, standard deviation and credible interval for AUC@K.

    Each question's success probability is modelled as a latent Bernoulli
    parameter with a Beta prior, and that uncertainty is carried through the
    AUC@K weighted sum of Pass@j targets. Because AUC@1 equals Pass@1, the
    case ``k = 1`` is answered by :func:`pass_at_k_ci` with ``k = 1``.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Maximum sampling budget with ``1 <= k <= N``.
        confidence: credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
            :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    if k != 1:
        post_mean, post_std = _auc_at_k_bayes(R, k, alpha0=alpha0, beta0=beta0)
        lower, upper = normal_credible_interval(
            post_mean,
            post_std,
            credibility=confidence,
            two_sided=True,
            bounds=bounds,
        )
        return float(post_mean), float(post_std), float(lower), float(upper)

    # k == 1: reuse the Pass@1 posterior summary directly.
    return pass_at_k_ci(
        R,
        1,
        confidence=confidence,
        bounds=bounds,
        alpha0=alpha0,
        beta0=beta0,
    )



# Public names of this module; the underscore-prefixed helpers above are
# intentionally left out.
__all__ = [
    "auc_at_k",
    "auc_at_k_ci",
]