# Source code for scorio.eval.maj

r"""Majority-vote metrics and uncertainty estimators for binary outcomes.

This module evaluates whether a sampled subset contains a strict majority of
correct traces. It is a binary-outcome wrapper around the generalized
threshold-pass family.

Methods
-------
- ``maj_at_k``: probability that more than half of the ``k`` selected traces
  are correct.

Each metric has a companion ``*_ci`` function that returns
``(mu, sigma, lo, hi)`` under the Bayesian uncertainty model used here.
"""

import numpy as np

from .gpass import g_pass_at_k_tau, g_pass_at_k_tau_ci
from .utils import _as_2d_int_matrix


def _majority_tau(k: int) -> float:
    """Return the threshold τ such that ceil(τ k) is a strict majority."""
    return ((k // 2) + 1) / k


def maj_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Maj@k: chance that a size-``k`` sample holds a strict majority of correct traces.

    For every prompt (row of ``R``) a subset of ``k`` observed traces is drawn
    uniformly at random, and the metric is the probability that strictly more
    than half of the drawn traces are correct. It serves as a binary-outcome
    proxy for the majority-vote metrics commonly reported in reasoning papers
    when only correctness labels are available.

    Args:
        R: :math:`M \times N` binary outcome matrix.
        k: Number of sampled traces, with ``1 <= k <= N``.

    Returns:
        float: Average strict-majority success probability across prompts.

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.

    Formula:
        Let :math:`\nu_\alpha = \sum_i R_{\alpha i}` and
        :math:`j_0 = \lfloor k/2 \rfloor + 1`. Then

        .. math::

            \mathrm{Maj@k}_\alpha =
            \sum_{j=j_0}^{k}
            \frac{\binom{\nu_\alpha}{j}\binom{N-\nu_\alpha}{k-j}}
                 {\binom{N}{k}}.

        Equivalently, this is :math:`\mathrm{G\text{-}Pass@k}_{\tau}` with
        :math:`\tau = j_0 / k`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(maj_at_k(R, 1), 6)
        0.7
        >>> round(maj_at_k(R, 2), 6)
        0.45
        >>> round(maj_at_k(R, 3), 6)
        0.85
    """
    outcomes = _as_2d_int_matrix(R)
    _num_prompts, N = outcomes.shape

    # Validate here so the error message speaks in terms of this metric's k.
    k_in_range = 1 <= k <= N
    if not k_in_range:
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Delegate to the generalized threshold-pass metric at the majority threshold.
    tau = _majority_tau(k)
    return g_pass_at_k_tau(outcomes, k, tau=tau)
def maj_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Posterior mean, spread, and interval for :func:`maj_at_k`.

    The work is delegated to the generalized threshold-pass posterior, evaluated
    at the strict-majority threshold :math:`j_0 = \lfloor k/2 \rfloor + 1`.

    Args:
        R: :math:`M \times N` binary outcome matrix.
        k: Number of sampled traces, with ``1 <= k <= N``.
        confidence: Credibility level for the normal-approximation interval.
        bounds: ``(lo, hi)`` clipping bounds for the interval.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Raises:
        ValueError: If ``k`` does not satisfy ``1 <= k <= N``.

    Formula:
        This is exactly :func:`~scorio.eval.g_pass_at_k_tau_ci` evaluated at

        .. math::

            \tau = \frac{\lfloor k/2 \rfloor + 1}{k}.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = maj_at_k_ci(R, 2)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.446429, 0.146167, 0.1599, 0.7329)
        >>> mu, sigma, lo, hi = maj_at_k_ci(R, 3)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.684524, 0.151958, 0.3867, 0.9824)
    """
    outcomes = _as_2d_int_matrix(R)
    _num_prompts, N = outcomes.shape

    # Validate here so the error message speaks in terms of this metric's k.
    k_in_range = 1 <= k <= N
    if not k_in_range:
        raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")

    # Forward every uncertainty option unchanged; only tau is fixed by this wrapper.
    tau = _majority_tau(k)
    return g_pass_at_k_tau_ci(
        outcomes,
        k,
        tau=tau,
        confidence=confidence,
        bounds=bounds,
        alpha0=alpha0,
        beta0=beta0,
    )
# Public API of this module: the metric and its companion ``*_ci`` function.
__all__ = [
    "maj_at_k",
    "maj_at_k_ci",
]