Source code for scorio.eval.bayes

r"""Bayes evaluation metrics for categorical outcomes.

Estimate :math:`\mu` and uncertainty (:math:`\sigma`) from repeated outcomes with optional
prior observations. :math:`Bayes@N` supports binary and multi-category outcomes
through a category-weight vector.

Let :math:`R \in \{0,\ldots,C\}^{M \times N}` be observed outcomes,
:math:`w \in \mathbb{R}^{C+1}` be category weights, and optional
:math:`R^0 \in \{0,\ldots,C\}^{M \times D}` be prior outcomes.
For each question :math:`\alpha` and class :math:`k`, Bayes@N forms counts
from :math:`R` and :math:`R^0`, adds one Dirichlet pseudo-count to every
class, and computes closed-form posterior moments ``mu`` and ``sigma``.

Available API
-------------------
- ``bayes`` returns ``(mu, sigma)``.
- ``bayes_ci`` returns ``(mu, sigma, lo, hi)`` using a normal-approximation
  credible interval around ``mu``.
"""

import numpy as np

from .utils import (
    _as_2d_int_matrix,
    _validate_matrix_range,
    normal_credible_interval,
)


def _row_bincount(A: np.ndarray, length: int) -> np.ndarray:
    """Count occurrences of ``0..length-1`` in each row of the 2-D int array ``A``."""
    out = np.zeros((A.shape[0], length), dtype=int)
    if A.shape[1] == 0:
        # No columns: every class count is zero.
        return out
    rows = np.repeat(np.arange(A.shape[0]), A.shape[1])
    # Unbuffered add so repeated (row, class) pairs accumulate correctly.
    np.add.at(out, (rows, A.ravel()), 1)
    return out


def bayes(
    R: np.ndarray,
    w: np.ndarray | None = None,
    R0: np.ndarray | None = None,
) -> tuple[float, float]:
    r"""
    Performance evaluation using the Bayes@N framework.

    References:
        Hariri, M., Samandar, A., Hinczewski, M., & Chaudhary, V. (2026).
        Don't Pass@k: A Bayesian Framework for Large Language Model Evaluation.
        *ICLR 2026*, *arXiv:2510.04265*.
        https://arxiv.org/abs/2510.04265

    Args:
        R: :math:`M \times N` int matrix with entries in :math:`\{0,\ldots,C\}`.
            Row :math:`\alpha` holds the N outcomes for question :math:`\alpha`.
        w: optional length :math:`(C+1)` weight vector :math:`(w_0,\ldots,w_C)`
            that maps category k to score :math:`w_k`. If omitted, ``R`` must
            contain only the values 0 and 1 and ``w = [0, 1]`` is used.
        R0: optional :math:`M \times D` int matrix supplying D prior outcomes
            per row. A 1-D input is reshaped to ``(M, -1)``. If omitted,
            :math:`D=0`.

    Returns:
        tuple[float, float]: :math:`(\mu, \sigma)` performance metric estimate
        and its uncertainty.

    Raises:
        ValueError: if ``w`` is omitted and ``R`` is not binary, if ``w`` is
            not a non-empty 1-D vector, or if ``R0`` is not 2-D with the same
            number of rows as ``R``. Range validation of ``R`` and ``R0``
            against ``0..C`` is delegated to ``_validate_matrix_range``.

    Notation:
        :math:`\delta_{a,b}` is the Kronecker delta. For each row
        :math:`\alpha` and class :math:`k \in \{0,\ldots,C\}`:

        .. math::

            n_{\alpha k} &= \sum_{i=1}^N \delta_{k, R_{\alpha i}}
            \quad \text{(counts in R)}

            n^0_{\alpha k} &= 1 + \sum_{i=1}^D \delta_{k, R^0_{\alpha i}}
            \quad \text{(Dirichlet(+1) prior)}

            \nu_{\alpha k} &= n_{\alpha k} + n^0_{\alpha k}

        Effective sample size: :math:`T = 1 + C + D + N` (scalar)

    Formula:
        .. math::

            \mu = w_0 + \frac{1}{M \cdot T}
            \sum_{\alpha=1}^M \sum_{j=0}^C \nu_{\alpha j} (w_j - w_0)

        .. math::

            \sigma = \sqrt{ \frac{1}{M^2(T+1)} \sum_{\alpha=1}^M \left[
            \sum_j \frac{\nu_{\alpha j}}{T} (w_j - w_0)^2
            - \left( \sum_j \frac{\nu_{\alpha j}}{T} (w_j - w_0) \right)^2
            \right] }

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 2, 2, 1],
        ...               [1, 1, 0, 2, 2]])
        >>> w = np.array([0.0, 0.5, 1.0])
        >>> R0 = np.array([[0, 2],
        ...                [1, 2]])

        With prior (D=2 → T=10):

        >>> mu, sigma = bayes(R, w, R0)
        >>> round(mu, 6), round(sigma, 6)
        (0.575, 0.084275)

        Without prior (D=0 → T=8):

        >>> mu2, sigma2 = bayes(R, w)
        >>> round(mu2, 6), round(sigma2, 6)
        (0.5625, 0.091998)
    """
    R = _as_2d_int_matrix(R)

    # Default weights are only well-defined for binary outcomes.
    if w is None:
        unique_vals = np.unique(R)
        is_binary = len(unique_vals) <= 2 and np.all(np.isin(unique_vals, [0, 1]))
        if not is_binary:
            unique_str = ", ".join(map(str, sorted(unique_vals)))
            # Categories are indexed 0..C, so the required length is set by the
            # largest observed category, not by how many distinct values occur.
            min_len = max(int(unique_vals.max()), 1) + 1
            raise ValueError(
                f"R contains non-binary values ({unique_str}), so weight vector 'w' must be provided. "
                f"Please specify a weight vector of length C+1 (at least {min_len} for this R) "
                f"to map each category 0..C to a score."
            )
        w = np.array([0.0, 1.0])

    w = np.asarray(w, dtype=float)
    if w.ndim != 1 or w.size == 0:
        # Fail early instead of hitting an opaque indexing/broadcast error below.
        raise ValueError("w must be a non-empty 1-D weight vector of length C+1.")

    M, N = R.shape
    C = w.size - 1

    if R0 is None:
        R0m = np.zeros((M, 0), dtype=int)
    else:
        R0m = np.asarray(R0, dtype=int)
        if R0m.ndim == 1:
            R0m = R0m.reshape(M, -1)
        if R0m.ndim != 2:
            raise ValueError("R0 must be a 2-D (M x D) matrix of prior outcomes.")
        if R0m.shape[0] != M:
            raise ValueError("R0 must have the same number of rows (M) as R.")
    D = R0m.shape[1]

    # Validate value ranges before the entries are used as column indices.
    _validate_matrix_range(R, 0, C, "R")
    _validate_matrix_range(R0m, 0, C, "R0")

    T = 1 + C + D + N

    # n_{αk} and n^0_{αk}; the +1 is the Dirichlet pseudo-count for every class.
    n_counts = _row_bincount(R, C + 1)
    n0_counts = _row_bincount(R0m, C + 1) + 1

    # ν_{αk} = n_{αk} + n^0_{αk}, shape (M, C+1)
    nu = n_counts + n0_counts

    # μ = w0 + (1/(M T)) * Σ_α Σ_j ν_{αj} (w_j - w0)
    delta_w = w - w[0]
    mu = w[0] + (nu @ delta_w).sum() / (M * T)

    # σ = [ (1/(M^2 (T+1))) * Σ_α { Σ_j (ν_{αj}/T)(w_j-w0)^2
    #                               - ( Σ_j (ν_{αj}/T)(w_j-w0) )^2 } ]^{1/2}
    nu_over_T = nu / T
    termA = (nu_over_T * (delta_w**2)).sum(axis=1)
    termB = (nu_over_T @ delta_w) ** 2
    sigma = np.sqrt(((termA - termB).sum()) / (M**2 * (T + 1)))

    return float(mu), float(sigma)
def bayes_ci(
    R: np.ndarray,
    w: np.ndarray | None = None,
    R0: np.ndarray | None = None,
    confidence: float = 0.95,
    bounds: tuple[float, float] | None = None,
) -> tuple[float, float, float, float]:
    r"""
    Bayes@N posterior mean, uncertainty, and credible interval.

    Interval-valued companion to :func:`bayes`: the posterior moments
    :math:`\mu` and :math:`\sigma` are obtained from :func:`bayes`, and
    ``normal_credible_interval`` is then asked for a two-sided interval
    around :math:`\mu` at the requested credibility.

    Args:
        R: :math:`M \times N` int matrix with entries in :math:`\{0,\ldots,C\}`.
        w: optional length :math:`(C+1)` weight vector :math:`(w_0,\ldots,w_C)`.
            If omitted, ``R`` must be binary and ``w = [0, 1]`` is used.
        R0: optional :math:`M \times D` int matrix supplying prior outcomes
            for each row.
        confidence: credibility level of the interval, forwarded as
            ``credibility``.
        bounds: optional ``(lo, hi)`` clipping bounds for the interval,
            forwarded unchanged.

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = bayes_ci(R, bounds=(0.0, 1.0))
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.642857, 0.118451, 0.4107, 0.875)
    """
    # Point estimate and its posterior standard deviation.
    posterior_mean, posterior_sd = bayes(R, w, R0)

    # Two-sided interval; any clipping to `bounds` happens inside the helper.
    interval = normal_credible_interval(
        posterior_mean,
        posterior_sd,
        credibility=confidence,
        two_sided=True,
        bounds=bounds,
    )
    lower, upper = interval

    return (
        float(posterior_mean),
        float(posterior_sd),
        float(lower),
        float(upper),
    )
# Names exported by ``from ... import *`` for this module.
__all__ = [
    "bayes",
    "bayes_ci",
]