# Source code for scorio.eval.bayes

r"""Bayes evaluation metrics for categorical outcomes.

Estimate :math:`\mu` and uncertainty (:math:`\sigma`) from repeated outcomes with optional
prior observations. :math:`Bayes@N` supports binary and multi-category outcomes
through a category-weight vector.

Let :math:`R \in \{0,\ldots,C\}^{M \times N}` be observed outcomes,
:math:`w \in \mathbb{R}^{C+1}` be category weights, and optional
:math:`R^0 \in \{0,\ldots,C\}^{M \times D}` be prior outcomes.
For each question :math:`\alpha` and class :math:`k`, Bayes@N forms counts
from :math:`R` and :math:`R^0`, adds one Dirichlet pseudo-count to every
class, and computes closed-form posterior moments ``mu`` and ``sigma``.

Available API
-------------------
- ``bayes`` returns ``(mu, sigma)``.
- ``bayes_ci`` returns ``(mu, sigma, lo, hi)`` using a normal-approximation
  credible interval around ``mu``.
"""

import numpy as np

from .utils import (
    _as_2d_int_matrix,
    _validate_matrix_range,
    normal_credible_interval,
)



# [docs]
def bayes(
    R: np.ndarray,
    w: np.ndarray | None = None,
    R0: np.ndarray | None = None,
) -> tuple[float, float]:
    r"""
    Performance evaluation using the Bayes@N framework.

    References:
        Hariri, M., Samandar, A., Hinczewski, M., & Chaudhary, V. (2026).
        Don't Pass@k: A Bayesian Framework for Large Language Model Evaluation.
        *ICLR 2026*, *arXiv:2510.04265*.
        https://arxiv.org/abs/2510.04265

    Args:
        R: :math:`M \times N` int matrix with entries in :math:`\{0,\ldots,C\}`.
           Row :math:`\alpha` holds the N outcomes for question :math:`\alpha`.
        w: length :math:`(C+1)` weight vector :math:`(w_0,\ldots,w_C)` that maps
           category k to score :math:`w_k`. If omitted, ``R`` must contain only
           the values 0 and 1, and ``w = [0, 1]`` is used.
        R0: optional :math:`M \times D` int matrix supplying D prior outcomes per row.
             If omitted, :math:`D=0`. A 1-D array is reshaped row-major to
             ``(M, -1)``.

    Returns:
        tuple[float, float]: :math:`(\mu, \sigma)` performance metric estimate and its uncertainty.

    Raises:
        ValueError: if ``w`` is omitted and ``R`` is not a 0/1 matrix, if ``w``
            is not a non-empty 1-D vector, if ``R0`` is not 1-D or 2-D, or if
            ``R0`` does not have the same number of rows as ``R``.

    Notation:
        :math:`\delta_{a,b}` is the Kronecker delta. For each row :math:`\alpha` and class :math:`k \in \{0,\ldots,C\}`:

        .. math::

            n_{\alpha k} &= \sum_{i=1}^N \delta_{k, R_{\alpha i}} \quad \text{(counts in R)}

            n^0_{\alpha k} &= 1 + \sum_{i=1}^D \delta_{k, R^0_{\alpha i}} \quad \text{(Dirichlet(+1) prior)}

            \nu_{\alpha k} &= n_{\alpha k} + n^0_{\alpha k}

        Effective sample size: :math:`T = 1 + C + D + N` (scalar)

    Formula:
        .. math::

            \mu = w_0 + \frac{1}{M \cdot T} \sum_{\alpha=1}^M \sum_{j=0}^C \nu_{\alpha j} (w_j - w_0)

        .. math::

            \sigma = \sqrt{ \frac{1}{M^2(T+1)} \sum_{\alpha=1}^M \left[
                \sum_j \frac{\nu_{\alpha j}}{T} (w_j - w_0)^2
                - \left( \sum_j \frac{\nu_{\alpha j}}{T} (w_j - w_0) \right)^2 \right] }

    Examples:
        >>> import numpy as np
        >>> R  = np.array([[0, 1, 2, 2, 1],
        ...                [1, 1, 0, 2, 2]])
        >>> w  = np.array([0.0, 0.5, 1.0])
        >>> R0 = np.array([[0, 2],
        ...                [1, 2]])

        With prior (D=2 → T=10):

        >>> mu, sigma = bayes(R, w, R0)
        >>> round(mu, 6), round(sigma, 6)
        (0.575, 0.084275)

        Without prior (D=0 → T=8):

        >>> mu2, sigma2 = bayes(R, w)
        >>> round(mu2, 6), round(sigma2, 6)
        (0.5625, 0.091998)

    """
    R = _as_2d_int_matrix(R)

    # Default weights are only well defined for a 0/1 matrix; anything else
    # needs an explicit score per category.
    if w is None:
        unique_vals = np.unique(R)
        is_binary = bool(np.isin(unique_vals, (0, 1)).all())

        if is_binary:
            w = np.array([0.0, 1.0])
        else:
            unique_str = ", ".join(map(str, unique_vals.tolist()))
            # Categories are indexed 0..C, so the weight vector must reach the
            # largest observed label, not merely match the number of distinct
            # labels that happen to occur in R.
            needed_len = max(int(unique_vals.max()), 1) + 1
            raise ValueError(
                f"R is not a binary 0/1 matrix (unique values: {unique_str}), so weight vector 'w' must be provided. "
                f"Please specify a weight vector of length at least {needed_len} "
                f"to map each category 0..{needed_len - 1} to a score."
            )

    w = np.asarray(w, dtype=float)
    if w.ndim != 1 or w.size == 0:
        raise ValueError("w must be a non-empty 1-D weight vector of length C + 1.")

    M, N = R.shape
    C = w.size - 1

    if R0 is None:
        R0m = np.zeros((M, 0), dtype=int)
    else:
        R0m = np.asarray(R0, dtype=int)
        if R0m.ndim == 1:
            R0m = R0m.reshape(M, -1)
        if R0m.ndim != 2:
            raise ValueError("R0 must be a 1-D or 2-D array of prior outcomes.")
        if R0m.shape[0] != M:
            raise ValueError("R0 must have the same number of rows (M) as R.")
    D = R0m.shape[1]

    # Validate value ranges
    _validate_matrix_range(R, 0, C, "R")
    _validate_matrix_range(R0m, 0, C, "R0")

    T = 1 + C + D + N

    def row_bincount(A: np.ndarray, length: int) -> np.ndarray:
        """Count occurrences of 0..length-1 in each row of A."""
        out = np.zeros((A.shape[0], length), dtype=int)
        if A.shape[1]:
            # Unbuffered scatter-add: repeated (row, value) pairs accumulate.
            rows = np.repeat(np.arange(A.shape[0]), A.shape[1])
            np.add.at(out, (rows, A.ravel()), 1)
        return out

    # n_{αk} and n^0_{αk}
    n_counts = row_bincount(R, C + 1)
    n0_counts = row_bincount(R0m, C + 1) + 1  # add 1 to every class (Dirichlet prior)

    # ν_{αk} = n_{αk} + n^0_{αk}
    nu = n_counts + n0_counts  # shape: (M, C+1)

    # μ = w0 + (1/(M T)) * Σ_α Σ_j ν_{αj} (w_j - w0)
    delta_w = w - w[0]
    mu = w[0] + (nu @ delta_w).sum() / (M * T)

    # σ = [ (1/(M^2 (T+1))) * Σ_α { Σ_j (ν_{αj}/T)(w_j-w0)^2
    #       - ( Σ_j (ν_{αj}/T)(w_j-w0) )^2 } ]^{1/2}
    nu_over_T = nu / T
    termA = (nu_over_T * (delta_w**2)).sum(axis=1)
    termB = (nu_over_T @ delta_w) ** 2
    sigma = np.sqrt(((termA - termB).sum()) / (M**2 * (T + 1)))

    return float(mu), float(sigma)




# [docs]
def bayes_ci(
    R: np.ndarray,
    w: np.ndarray | None = None,
    R0: np.ndarray | None = None,
    confidence: float = 0.95,
    bounds: tuple[float, float] | None = None,
) -> tuple[float, float, float, float]:
    r"""
    Bayes@N point estimate together with a credible interval.

    Delegates to :func:`bayes` for the posterior mean :math:`\mu` and
    uncertainty :math:`\sigma`, then asks ``normal_credible_interval`` for a
    two-sided normal-approximation interval around :math:`\mu`.

    Args:
        R: :math:`M \times N` int matrix with entries in
           :math:`\{0,\ldots,C\}`.
        w: optional length :math:`(C+1)` weight vector
           :math:`(w_0,\ldots,w_C)`. If omitted, ``R`` must be binary and
           ``w = [0, 1]`` is used.
        R0: optional :math:`M \times D` int matrix of prior outcomes, one
            row per row of ``R``.
        confidence: credibility level of the interval; forwarded as
            ``credibility``.
        bounds: optional ``(lo, hi)`` clipping bounds, forwarded unchanged.

    Returns:
        tuple[float, float, float, float]:
            :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = bayes_ci(R, bounds=(0.0, 1.0))
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.642857, 0.118451, 0.4107, 0.875)
    """
    posterior_mean, posterior_std = bayes(R, w, R0)

    interval = normal_credible_interval(
        posterior_mean,
        posterior_std,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    lower, upper = interval

    return (
        float(posterior_mean),
        float(posterior_std),
        float(lower),
        float(upper),
    )



# Public API of this module: the names exported by a star-import.
__all__ = [
    "bayes",
    "bayes_ci",
]