Source code for scorio.eval.gpass

r"""Generalized pass-family evaluation metrics for binary outcomes.

Estimate stability-style pass metrics from a binary outcome matrix
:math:`R \in \{0,1\}^{M \times N}`. ``G-Pass@k`` is the all-success
threshold, ``G-Pass@k``\ :sub:`\tau` requires at least
:math:`\lceil \tau k \rceil` successes, and ``mG-Pass@k`` averages over
thresholds :math:`\tau \in [0.5, 1.0]`. Bayesian summaries compute posterior
``mu`` and ``sigma`` under a Beta model for each question's latent success
rate.

Available API
-------------------
- ``g_pass_at_k`` returns the :math:`\tau = 1` alias of ``pass_hat_k``.
- ``g_pass_at_k_tau`` returns the thresholded generalized pass metric.
- ``mg_pass_at_k`` returns the mean generalized pass metric.
- companion ``*_ci`` functions return ``(mu, sigma, lo, hi)`` using a
  normal-approximation credible interval around ``mu``.
"""

import math

import numpy as np
from scipy.special import comb

from .pass_at_k import (
    _beta_ratio,
    _binary_beta_posterior_params,
    _pass_at_k_bayes,
    _pass_hat_k_bayes,
    pass_at_k,
    pass_hat_k,
    pass_hat_k_ci,
)
from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval



[docs]
def g_pass_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Compute G-Pass@k, the all-success (:math:`\tau = 1`) stability metric.

    This function is a naming alias: it forwards its arguments unchanged to
    :func:`~scorio.eval.pass_hat_k` and exists for readers coming from the
    literature that calls this quantity G-Pass@k.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`;
           :math:`R_{\alpha i} = 1` when trial :math:`i` of question
           :math:`\alpha` passed and 0 otherwise.
        k: Number of samples drawn per question (:math:`1 \le k \le N`).

    Returns:
        float: G-Pass@k averaged over the :math:`M` questions.

    Notation:
        :math:`\nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}` is the number of
        correct samples in row :math:`\alpha`, and :math:`C(a, b)` is the
        binomial coefficient :math:`\binom{a}{b}`.

    Formula:
        .. math::

            \mathrm{G\text{-}Pass@}k_\alpha = \frac{C(\nu_\alpha, k)}{C(N, k)}

        .. math::

            \mathrm{G\text{-}Pass@}k = \frac{1}{M} \sum_{\alpha=1}^{M}
                \mathrm{G\text{-}Pass@}k_\alpha

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(g_pass_at_k(R, 1), 6)
        0.7
        >>> round(g_pass_at_k(R, 2), 6)
        0.45
    """
    # The tau = 1 case is exactly Pass^k, so delegate instead of duplicating it.
    score = pass_hat_k(R, k)
    return score




[docs]
def g_pass_at_k_tau(R: np.ndarray, k: int, tau: float) -> float:
    r"""
    Performance evaluation using G-Pass@k\ :sub:`τ`.

    For each question, this is the probability that a draw of ``k`` of its
    ``N`` recorded trials (without replacement) contains at least
    :math:`\lceil \tau \cdot k \rceil` successes, averaged over questions.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`.
           :math:`R_{\alpha i} = 1` if trial :math:`i` for question
           :math:`\alpha` passed, 0 otherwise.
        k: Number of samples to select (:math:`1 \le k \le N`).
        tau: Threshold parameter :math:`\tau \in [0, 1]`. Requires at
             least :math:`\lceil \tau \cdot k \rceil` successes.
             When :math:`\tau = 0`, the call is delegated to Pass@k.
             When :math:`\tau = 1`, only the all-success term remains (Pass^k).

    Returns:
        float: The average G-Pass@k\ :sub:`τ` score across all :math:`M` questions.

    Raises:
        ValueError: If ``tau`` is outside ``[0, 1]`` (including NaN) or ``k``
            is outside ``[1, N]``. Coercion of ``R`` and the binary-entry
            check are delegated to ``_as_2d_int_matrix`` and
            ``_validate_binary``.

    Notation:
        For each row :math:`\alpha`:

        .. math::

            \nu_\alpha = \sum_{i=1}^{N} R_{\alpha i} \quad \text{(number of correct samples)}

        :math:`C(a, b)` denotes the binomial coefficient :math:`\binom{a}{b}`.

        :math:`j_0 = \lceil \tau \cdot k \rceil` is the minimum number of
        successes required.

    Formula:
        .. math::

            \mathrm{G\text{-}Pass@}k_{\tau, \alpha} = \sum_{j=j_0}^{k}
                \frac{C(\nu_\alpha, j) \cdot C(N - \nu_\alpha, k - j)}{C(N, k)}

        .. math::

            \mathrm{G\text{-}Pass@}k_\tau = \frac{1}{M} \sum_{\alpha=1}^{M}
                \mathrm{G\text{-}Pass@}k_{\tau, \alpha}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(g_pass_at_k_tau(R, 2, 0.5), 6)
        0.95
        >>> round(g_pass_at_k_tau(R, 2, 1.0), 6)
        0.45
    """
    R = _as_2d_int_matrix(R)
    _validate_binary(R)
    M, N = R.shape

    # Written as ``not (lo <= x <= hi)`` so that a NaN tau is rejected too.
    if not (0.0 <= tau <= 1.0):
        raise ValueError(f"tau must be in [0, 1]; got {tau}")
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    if tau <= 0.0:
        return pass_at_k(R, k)

    nu = np.sum(R, axis=1)
    denom = comb(N, k)

    # Here 0 < tau <= 1 and k >= 1, hence 1 <= j0 <= k: the summation range
    # below is never empty and no separate "j0 > k" guard is required.
    # NOTE: tau * k is a floating-point product, so a tau that is not exactly
    # representable can land just above an integer (0.07 * 100 evaluates to
    # 7.000000000000001) and then rounds up to the next threshold.
    j0 = int(math.ceil(tau * k))

    # Hypergeometric upper tail P(X >= j0), accumulated per question.
    vals = np.zeros(M, dtype=float)
    for j in range(j0, k + 1):
        vals += comb(nu, j) * comb(N - nu, k - j) / denom
    return float(np.mean(vals))




[docs]
def mg_pass_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Compute mG-Pass@k, the mean generalized pass metric.

    The score summarises G-Pass@k\ :sub:`τ` over thresholds
    :math:`\tau \in [0.5, 1.0]`. It is evaluated in closed form as a weighted
    hypergeometric tail: successes beyond the majority threshold are weighted
    by how far they exceed it.

    References:
        Liu, J., Liu, H., Xiao, L., et al. (2024).
        Are Your LLMs Capable of Stable Reasoning?
        *arXiv preprint arXiv:2412.13147*.
        https://arxiv.org/abs/2412.13147

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0, 1\}`;
           :math:`R_{\alpha i} = 1` when trial :math:`i` of question
           :math:`\alpha` passed and 0 otherwise.
        k: Number of samples drawn per question (:math:`1 \le k \le N`).

    Returns:
        float: mG-Pass@k averaged over the :math:`M` questions. The result is
        ``0.0`` whenever :math:`\lceil k/2 \rceil \ge k` (i.e. ``k = 1``).

    Raises:
        ValueError: If ``k`` is outside ``[1, N]``.

    Notation:
        :math:`\nu_\alpha = \sum_{i=1}^{N} R_{\alpha i}` counts the correct
        samples of row :math:`\alpha`; :math:`m = \lceil k/2 \rceil` is the
        majority threshold; and
        :math:`X_\alpha \sim \mathrm{Hypergeometric}(N, \nu_\alpha, k)` is the
        number of correct samples among the :math:`k` selected ones.

    Formula:
        The reference definition

        .. math::

            \mathrm{mG\text{-}Pass@}k_\alpha = 2 \int_{0.5}^{1.0}
                \mathrm{G\text{-}Pass@}k_{\tau, \alpha} \, d\tau

        is evaluated here through the discrete sum

        .. math::

            \mathrm{mG\text{-}Pass@}k_\alpha = \frac{2}{k} \sum_{j=m+1}^{k}
                (j - m) \cdot P(X_\alpha = j)

        with

        .. math::

            P(X_\alpha = j) = \frac{C(\nu_\alpha, j) \cdot C(N - \nu_\alpha, k - j)}{C(N, k)}

        and finally

        .. math::

            \mathrm{mG\text{-}Pass@}k = \frac{1}{M} \sum_{\alpha=1}^{M}
                \mathrm{mG\text{-}Pass@}k_\alpha

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(mg_pass_at_k(R, 2), 6)
        0.45
        >>> round(mg_pass_at_k(R, 3), 6)
        0.166667
    """
    outcomes = _as_2d_int_matrix(R)
    _validate_binary(outcomes)
    n_questions, n_trials = outcomes.shape

    if not (1 <= k <= n_trials):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={n_trials}); got k={k}")

    half = int(math.ceil(0.5 * k))
    if half >= k:
        # No success count in 0..k can exceed the majority threshold.
        return 0.0

    n_correct = np.sum(outcomes, axis=1)
    total_draws = comb(n_trials, k)

    # Accumulates E[(X - half)_+] per question, X ~ Hypergeom(N, nu, k);
    # ``surplus`` is (hits - half) and runs 1, 2, ..., k - half.
    expected_excess = np.zeros(n_questions, dtype=float)
    for surplus, hits in enumerate(range(half + 1, k + 1), start=1):
        pmf = comb(n_correct, hits) * comb(n_trials - n_correct, k - hits) / total_draws
        expected_excess += surplus * pmf

    expected_excess *= 2.0 / k
    return float(np.mean(expected_excess))



def _beta_polynomial_mean_std(
    alpha: np.ndarray,
    beta: np.ndarray,
    M: int,
    k: int,
    js: list[int],
    coeff: list[float],
) -> tuple[float, float]:
    """Posterior mean/std of the question-averaged g(p) = Σ_j coeff_j p^j (1-p)^(k-j).

    Shared by the thresholded and mean G-Pass@k Bayesian summaries, which
    differ only in the exponents ``js`` and weights ``coeff`` they supply.

    Args:
        alpha: Per-question posterior parameters as returned by
            ``_binary_beta_posterior_params`` (assumed indexable by question
            index ``0..M-1``).
        beta: Companion per-question posterior parameters, same layout.
        M: Number of questions.
        k: Number of i.i.d. samples per question.
        js: Success counts ``j`` appearing in ``g``; must be non-empty.
        coeff: Weight of each ``j`` in ``js`` (same length, same order).

    Returns:
        ``(mu, sigma)``: the mean over questions of the per-question first
        moments, and ``sqrt(sum of per-question variances) / M``.
    """
    means = np.empty(M, dtype=float)
    vars_ = np.empty(M, dtype=float)

    # g(p)^2 only involves exponent totals s = j + l, so one _beta_ratio call
    # per distinct total is enough (2n - 1 calls per question instead of n^2).
    totals = sorted({j + j2 for j in js for j2 in js})

    for i in range(M):
        a_i = float(alpha[i])
        b_i = float(beta[i])

        # First moment. _beta_ratio(a, b, r, s) is presumably E[p^r (1-p)^s]
        # under Beta(a, b) -- confirm against its definition in pass_at_k.
        m = 0.0
        for c_j, j in zip(coeff, js, strict=True):
            m += c_j * _beta_ratio(a_i, b_i, j, k - j)

        # Assumes _beta_ratio is deterministic in its arguments, so looking a
        # value up is equivalent to recomputing it.
        ratio_by_total = {s: _beta_ratio(a_i, b_i, s, 2 * k - s) for s in totals}

        # Second moment, accumulated in the same (j, l) order as a plain
        # double loop so the floating-point result is unchanged.
        e2 = 0.0
        for c_j, j in zip(coeff, js, strict=True):
            for c_j2, j2 in zip(coeff, js, strict=True):
                e2 += c_j * c_j2 * ratio_by_total[j + j2]

        means[i] = m
        # Clamp: cancellation in e2 - m^2 can yield a tiny negative value.
        vars_[i] = max(0.0, e2 - m * m)

    mu = float(np.mean(means))
    sigma = float(math.sqrt(float(np.sum(vars_))) / M)
    return mu, sigma


def _g_pass_at_k_tau_bayes(
    R: np.ndarray, k: int, tau: float, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. G-Pass@k_τ quantity.

    ``tau <= 0`` and ``tau >= 1`` are delegated to the Pass@k and Pass^k
    Bayesian helpers; otherwise the target is
    g(p) = Σ_{j=j0..k} C(k,j) p^j (1-p)^{k-j} with j0 = ceil(tau * k).

    Raises:
        ValueError: If ``tau`` is outside ``[0, 1]`` or ``k`` is outside
            ``[1, N]``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape

    if not (0.0 <= tau <= 1.0):
        raise ValueError(f"tau must be in [0, 1]; got {tau}")
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    if tau <= 0.0:
        return _pass_at_k_bayes(Rm, k, alpha0=alpha0, beta0=beta0)
    if tau >= 1.0:
        return _pass_hat_k_bayes(Rm, k, alpha0=alpha0, beta0=beta0)

    # 0 < tau < 1 here, so 1 <= j0 <= k and ``js`` is never empty.
    j0 = int(math.ceil(tau * k))
    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    # g(p) = Σ_{j=j0..k} C(k,j) p^j (1-p)^{k-j}
    js = list(range(j0, k + 1))
    coeff = [float(comb(k, j)) for j in js]

    return _beta_polynomial_mean_std(alpha, beta, M, k, js, coeff)


def _mg_pass_at_k_bayes(
    R: np.ndarray, k: int, alpha0: float = 1.0, beta0: float = 1.0
) -> tuple[float, float]:
    """Posterior mean/std for the i.i.d. mG-Pass@k quantity.

    The target is g(p) = (2/k) Σ_{j=m+1..k} (j - m) C(k,j) p^j (1-p)^{k-j}
    with m = ceil(k / 2). Returns ``(0.0, 0.0)`` when ``m >= k``, i.e. when
    the sum has no terms.

    Raises:
        ValueError: If ``k`` is outside ``[1, N]``.
    """
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, N = Rm.shape
    if not (1 <= k <= N):
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Computed before the early return so that any error raised for the prior
    # parameters still surfaces when k == 1.
    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    majority = int(math.ceil(0.5 * k))
    if majority >= k:
        return 0.0, 0.0

    js = list(range(majority + 1, k + 1))
    coeff = [float((2.0 / k) * (j - majority) * comb(k, j)) for j in js]

    return _beta_polynomial_mean_std(alpha, beta, M, k, js, coeff)



[docs]
def g_pass_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Posterior summary (mean, spread, interval) for G-Pass@k.

    Because G-Pass@k is the all-success threshold, its posterior target is
    the one summarised by :func:`~scorio.eval.pass_hat_k_ci`; every argument
    is forwarded to that function unchanged.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
            :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    summary = pass_hat_k_ci(
        R,
        k,
        confidence=confidence,
        bounds=bounds,
        alpha0=alpha0,
        beta0=beta0,
    )
    return summary




[docs]
def g_pass_at_k_tau_ci(
    R: np.ndarray,
    k: int,
    tau: float,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Posterior summary (mean, spread, interval) for thresholded G-Pass@k.

    The quantity being summarised is the probability that, out of ``k``
    i.i.d. samples, at least :math:`\lceil \tau k \rceil` are correct. The
    posterior mean and standard deviation come from
    ``_g_pass_at_k_tau_bayes``; the interval is then obtained from
    ``normal_credible_interval`` (two-sided, clipped to ``bounds``).

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        tau: Threshold parameter in ``[0, 1]``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
            :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    posterior_mean, posterior_std = _g_pass_at_k_tau_bayes(
        R, k, tau, alpha0=alpha0, beta0=beta0
    )
    interval = normal_credible_interval(
        posterior_mean,
        posterior_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    lower, upper = interval
    return float(posterior_mean), float(posterior_std), float(lower), float(upper)




[docs]
def mg_pass_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Posterior summary (mean, spread, interval) for mG-Pass@k.

    The quantity being summarised averages thresholded G-Pass@k over
    thresholds between ``0.5`` and ``1.0``, using the same closed-form
    weighting as :func:`~scorio.eval.mg_pass_at_k`. The posterior mean and
    standard deviation come from ``_mg_pass_at_k_bayes``; the interval is
    then obtained from ``normal_credible_interval`` (two-sided, clipped to
    ``bounds``).

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Number of selected samples with ``1 <= k <= N``.
        confidence: Credibility level of the interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
            :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.
    """
    posterior_mean, posterior_std = _mg_pass_at_k_bayes(
        R, k, alpha0=alpha0, beta0=beta0
    )
    interval = normal_credible_interval(
        posterior_mean,
        posterior_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    lower, upper = interval
    return float(posterior_mean), float(posterior_std), float(lower), float(upper)



# Names exported by ``from <module> import *``: the three point estimators
# followed by their ``*_ci`` companions. The underscore-prefixed Bayesian
# helpers defined above are not listed.
__all__ = [
    "g_pass_at_k",
    "g_pass_at_k_tau",
    "mg_pass_at_k",
    "g_pass_at_k_ci",
    "g_pass_at_k_tau_ci",
    "mg_pass_at_k_ci",
]