r"""Geometric pass/spectrum metrics for binary outcomes.
This module implements finite-bank geometric and threshold-spectrum metrics
together with approximate Beta-Bernoulli posterior summaries for latent
resampling quantities. The paper ``Geom@k: Fast to Converge, Slow to Drift``
defines a dataset-level endpoint blend; ``scorio`` also exposes a
questionwise Geom@k variant as the primary ``geom_at_k`` metric.
Notation
--------
For a binary matrix :math:`R \in \{0,1\}^{M \times N}`, fixed budget
:math:`k`, and threshold weights :math:`w = (w_1, \ldots, w_k)` with
non-negative entries and :math:`\sum_r w_r \le 1`, define the
threshold-spectrum summary
.. math::
S_{w,k}(R) = \sum_{r=1}^k w_r T_{r,k}(R),
where :math:`T_{r,k}(R)` is the dataset-level probability that a uniformly
sampled subset of size :math:`k` without replacement contains at least
:math:`r` correct trials.
The GeoSpectrum family is then
.. math::
\mathrm{GeoSpectrum}_{\lambda,w}@k(R)
= P_k(R)^\lambda \, S_{w,k}(R)^{1-\lambda},
where :math:`P_k(R)` is dataset-level Pass@k. The endpoint conventions are
:math:`\lambda = 0 \to S_{w,k}` and :math:`\lambda = 1 \to P_k`. The named
operating points are:
- ``geom_ds_at_k``: dataset-level endpoint blend with
:math:`\lambda = 1/2` and :math:`w_r = 1\{r = k\}`.
- ``geom_at_k``: questionwise endpoint blend, computed before averaging
across questions.
- ``GeoSpectrum*@k``: :math:`\lambda = 1/2` with upper-half weights
:math:`w_r = (2/k)\,1\{r \ge \lceil k/2 \rceil + 1\}`.
The ``*_ci`` functions implement the approximate posterior
credible intervals for the corresponding latent i.i.d. quantities under a
Beta-Bernoulli model.
Available API
-------------
- ``geom_at_k`` and ``geom_at_k_ci`` for the questionwise Pass/Unanimous
geometric blend.
- ``geom_ds_at_k`` and ``geom_ds_at_k_ci`` for the dataset-level
Pass/Unanimous blend.
- ``geo_spectrum_at_k`` and ``geo_spectrum_at_k_ci`` for
:math:`\mathrm{GeoSpectrum}_{\lambda,w}@k`.
- ``geo_spectrum_star_at_k`` and ``geo_spectrum_star_at_k_ci`` for the default
upper-half operating point.
- ``threshold_spectrum_at_k`` and ``threshold_spectrum_at_k_ci`` for
:math:`S_{w,k}`.
"""
import math
import numpy as np
from scipy.special import comb
from .pass_at_k import (
_beta_ratio,
_binary_beta_posterior_params,
)
from .pass_at_k import (
pass_at_k as _pass_at_k,
)
from .pass_at_k import (
pass_hat_k as _pass_hat_k,
)
from .utils import _as_2d_int_matrix, _validate_binary, normal_credible_interval
def _weighted_geometric_mean(
x: float, y: float, x_weight: float, y_weight: float
) -> float:
if x_weight == 0.0 and y_weight == 0.0:
raise ValueError("at least one power must be non-zero")
if x == 0.0 and x_weight < 0.0:
if y == 0.0 and y_weight > 0.0:
return 0.0
raise ValueError(
f"x_power must be non-negative when x is zero; got x_power={x_weight}"
)
if y == 0.0 and y_weight < 0.0:
if x == 0.0 and x_weight > 0.0:
return 0.0
raise ValueError(
f"y_power must be non-negative when y is zero; got y_power={y_weight}"
)
return float((x**x_weight) * (y**y_weight))
def _validate_beta_prior(alpha0: float, beta0: float) -> None:
if alpha0 <= 0.0 or beta0 <= 0.0:
raise ValueError(
f"alpha0 and beta0 must both be > 0 for a Beta prior; got {alpha0}, {beta0}"
)
def _validate_finite_bank_k(N: int, k: int) -> None:
if not (1 <= k <= N):
raise ValueError(f"k must satisfy 1 <= k <= N (N={N}); got k={k}")
def _validate_latent_k(k: int) -> None:
if k < 1:
raise ValueError(f"k must be >= 1; got k={k}")
def _resolve_lambda(lam: float, lambda_: float | None = None) -> float:
if lambda_ is not None:
if lam != 0.5:
raise TypeError("Specify at most one of 'lam' and 'lambda_'.")
lam = lambda_
if not (0.0 <= lam <= 1.0):
raise ValueError(f"lam must be in [0, 1]; got {lam}")
return float(lam)
def _unanimous_spectrum_weights(k: int) -> np.ndarray:
    r"""Build the endpoint weight vector :math:`w_r = 1\{r = k\}`.

    Args:
        k: Budget; validated with ``_validate_latent_k``.

    Returns:
        np.ndarray: Length-``k`` float vector that is zero everywhere except
        for a ``1.0`` in the final slot.
    """
    _validate_latent_k(k)
    indicator = np.zeros(k, dtype=float)
    # Slot ``k - 1`` corresponds to the threshold r = k (all draws correct).
    indicator[k - 1] = 1.0
    return indicator
def _mg_spectrum_weights(k: int) -> np.ndarray:
    r"""Build the upper-half weight vector behind ``GeoSpectrum*@k``.

    The weights follow

    .. math::

        w^{mG}_{r,k} = \frac{2}{k} 1\{r \ge \lceil k/2 \rceil + 1\}.

    Args:
        k: Budget; validated with ``_validate_latent_k``.

    Returns:
        np.ndarray: Length-``k`` float vector whose entries from index
        ``ceil(k / 2)`` onward equal ``2 / k`` and are zero before that.
    """
    _validate_latent_k(k)
    # Threshold r = ceil(k/2) + 1 lives at zero-based index ceil(k/2).
    first_active = int(math.ceil(k / 2.0))
    mass = 2.0 / k
    upper_half = np.zeros(k, dtype=float)
    upper_half[first_active:] = mass
    return upper_half
def _validate_spectrum_weights(
weights: np.ndarray | list[float] | tuple[float, ...],
k: int,
) -> np.ndarray:
w = np.asarray(weights, dtype=float)
if w.ndim != 1 or w.shape[0] != k:
raise ValueError(f"weights must be a length-{k} 1D array; got shape {w.shape}")
if not np.all(np.isfinite(w)):
raise ValueError("weights must be finite")
if np.any(w < 0.0):
raise ValueError("weights must be non-negative")
weight_sum = float(np.sum(w))
if weight_sum > 1.0 + 1e-12:
raise ValueError(
f"weights must satisfy sum(weights) <= 1; got sum={weight_sum}"
)
return w
def _event_score_levels(weights: np.ndarray) -> np.ndarray:
r"""Return :math:`A_j = \sum_{r \le j} w_r` with :math:`A_0 = 0`.
:math:`A_j` is the credit assigned to a sampled subset of size :math:`k`
that contains exactly :math:`j` correct trials.
"""
return np.concatenate(([0.0], np.cumsum(weights, dtype=float)))
[docs]
def threshold_spectrum_at_k(
    R: np.ndarray,
    k: int,
    weights: np.ndarray | list[float] | tuple[float, ...],
) -> float:
    r"""Finite-bank threshold-spectrum summary :math:`S_{w,k}(R)`.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Sampling budget with :math:`1 \le k \le N`.
        weights: Non-negative length-:math:`k` weights with
            :math:`\sum_r w_r \le 1`.

    Returns:
        float: :math:`S_{w,k}(R)` averaged across questions.

    Notes:
        The summary is defined by

        .. math::

            S_{w,k}(R) = \sum_{r=1}^k w_r T_{r,k}(R).

        The implementation uses the equivalent event-score form (Appendix
        C.4 of the paper): each count :math:`j` of correct trials in the
        sampled subset is weighted by its cumulative credit
        :math:`A_j = \sum_{r \le j} w_r` and by the hypergeometric
        probability of drawing exactly :math:`j` correct trials.
    """
    matrix = _as_2d_int_matrix(R)
    _validate_binary(matrix)
    _, bank_size = matrix.shape
    _validate_finite_bank_k(bank_size, k)
    checked = _validate_spectrum_weights(weights, k)

    correct = np.sum(matrix, axis=1)
    credits = _event_score_levels(checked)
    total_subsets = float(comb(bank_size, k))

    per_question = np.zeros_like(correct, dtype=float)
    for j, raw_credit in enumerate(credits[1:], start=1):
        credit = float(raw_credit)
        if credit == 0.0:
            # Zero credit contributes nothing; skip the comb evaluations.
            continue
        per_question += (
            credit * comb(correct, j) * comb(bank_size - correct, k - j) / total_subsets
        )
    return float(np.mean(per_question))
[docs]
def geom_ds_at_k(
    R: np.ndarray, k: int, pass_power: float = 0.5, unanimous_power: float = 0.5
) -> float:
    r"""
    Dataset-level geometric blend of Pass@k and Unanimous@k.

    This is the paper's endpoint GeoSpectrum operating point: Pass@k and
    Unanimous@k (equivalently Pass^k) are each averaged over questions first,
    and the geometric blend is applied to the two averages. Use
    :func:`geom_at_k` for the questionwise variant that blends before
    averaging.

    With the default exponents the result is the geometric mean of the two
    dataset-level scores; other exponents give nearby operating points.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
            :math:`R_{\alpha i} = 1` if trial :math:`i` for question
            :math:`\alpha` passed, 0 otherwise.
        k: Sampling budget with :math:`1 \le k \le N`.
        pass_power: Exponent applied to ``Pass@k``.
        unanimous_power: Exponent applied to ``Unanimous@k``.

    Returns:
        float: The dataset-level endpoint score.

    Formula:
        .. math::

            G_{\mathrm{ds},k}(R; a, b)
            = \mathrm{Pass}@k(R)^a\,
              \mathrm{Unanimous}@k(R)^b

        with the default operating point :math:`a = b = 1/2`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(geom_ds_at_k(R, 2), 6)
        0.653835
    """
    dataset_pass = _pass_at_k(R, k)
    dataset_unanimous = _pass_hat_k(R, k)
    blended = _weighted_geometric_mean(
        dataset_pass, dataset_unanimous, pass_power, unanimous_power
    )
    return blended
[docs]
def geom_at_k(
    R: np.ndarray, k: int, pass_power: float = 0.5, unanimous_power: float = 0.5
) -> float:
    r"""
    Questionwise Geom@k, averaged over questions.

    This is ``scorio``'s primary Geom@k metric. Where :func:`geom_ds_at_k`
    blends the dataset-level Pass@k and Unanimous@k, this function computes
    for every question :math:`\alpha` with :math:`\nu_\alpha` correct trials

    .. math::

        P_{\alpha,k} =
        1 - \frac{\binom{N - \nu_\alpha}{k}}{\binom{N}{k}}

    .. math::

        U_{\alpha,k} =
        \frac{\binom{\nu_\alpha}{k}}{\binom{N}{k}}

    blends them as

    .. math::

        G_{\alpha,k} = P_{\alpha,k}^{a}\,U_{\alpha,k}^{b},

    and only then takes the mean over questions.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Sampling budget with :math:`1 \le k \le N`.
        pass_power: Exponent :math:`a` applied to per-question ``Pass@k``.
        unanimous_power: Exponent :math:`b` applied to per-question
            ``Unanimous@k``.

    Returns:
        float: The average questionwise ``Geom@k`` score.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(geom_at_k(R, 2), 6)
        0.647106
    """
    matrix = _as_2d_int_matrix(R)
    _validate_binary(matrix)
    _, bank_size = matrix.shape
    _validate_finite_bank_k(bank_size, k)

    correct = np.sum(matrix, axis=1)
    n_subsets = float(comb(bank_size, k))
    pass_rows = 1.0 - comb(bank_size - correct, k) / n_subsets
    unanimous_rows = comb(correct, k) / n_subsets

    # Blend each question separately; the scalar helper handles zero bases.
    blended = np.array(
        [
            _weighted_geometric_mean(
                float(p_row), float(u_row), pass_power, unanimous_power
            )
            for p_row, u_row in zip(pass_rows, unanimous_rows)
        ],
        dtype=float,
    )
    return float(np.mean(blended))
[docs]
def geo_spectrum_at_k(
    R: np.ndarray,
    k: int,
    lam: float = 0.5,
    weights: np.ndarray | list[float] | tuple[float, ...] | None = None,
    lambda_: float | None = None,
) -> float:
    r"""
    :math:`\mathrm{GeoSpectrum}_{\lambda,w}@k` on the observed finite bank.

    Leaving ``weights=None`` selects the upper-half ``mG`` weights, so the
    two-argument call ``geo_spectrum_at_k(R, k)`` is the special case

    .. math::

        \mathrm{GeoSpectrum}^*@k(R)
        = \sqrt{\mathrm{Pass}@k(R)\,\mathrm{mG\text{-}Pass}@k(R)}.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Sampling budget with :math:`1 \le k \le N`.
        lam: The coupling parameter :math:`\lambda` in :math:`[0,1]`.
        weights: Spectrum weights :math:`w`. If omitted, uses the built-in
            upper-half mG weights. Custom weights must be length-:math:`k`,
            non-negative, finite, and satisfy :math:`\sum_r w_r \le 1`.
            They are not consulted when :math:`\lambda = 1`.
        lambda_: Keyword alias for ``lam`` for callers who prefer the
            mathematical symbol. Supply it only while ``lam`` is left at its
            default.

    Returns:
        float: :math:`\mathrm{GeoSpectrum}_{\lambda,w}@k(R)`.

    Formula:
        .. math::

            \mathrm{GeoSpectrum}_{\lambda,w}@k(R)
            = \mathrm{Pass}@k(R)^\lambda \, S_{w,k}(R)^{1-\lambda}

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> round(geo_spectrum_at_k(R, 3), 6)
        0.408248
        >>> round(geo_spectrum_at_k(R, 3, lam=1.0), 6)
        1.0
    """
    coupling = _resolve_lambda(lam, lambda_)
    pass_term = _pass_at_k(R, k)
    if coupling == 1.0:
        # Pure Pass@k endpoint: the spectrum factor has exponent zero.
        return pass_term

    if weights is None:
        spectrum_weights = _mg_spectrum_weights(k)
    else:
        spectrum_weights = _validate_spectrum_weights(weights, k)

    spectrum_term = threshold_spectrum_at_k(R, k, spectrum_weights)
    return _weighted_geometric_mean(
        pass_term, spectrum_term, coupling, 1.0 - coupling
    )
def _pass_and_spectrum_row_posterior_moments(
    R: np.ndarray,
    k: int,
    weights: np.ndarray,
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    r"""Per-question posterior moments for latent Pass@k and spectrum scores.

    For each question the latent success probability :math:`p` gets a Beta
    posterior (parameters from ``_binary_beta_posterior_params``). The two
    latent targets are

    .. math::

        \mathrm{Pass}(p) = 1 - (1 - p)^k, \qquad
        g(p) = \sum_{j=1}^{k} A_j \binom{k}{j} p^j (1 - p)^{k - j},

    with :math:`A_j = \sum_{r \le j} w_r`. All moments are assembled from
    calls to ``_beta_ratio(a, b, i, j)``, which is presumed to return the
    posterior expectation :math:`\mathbb{E}[p^i (1-p)^j]` (defined in
    ``pass_at_k``; confirm there).

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget, :math:`k \ge 1`.
        weights: Length-:math:`k` spectrum weights.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        ``(mean_pass, var_pass, mean_spectrum, var_spectrum, cov_pass_spectrum)``
        arrays, one entry per question.

    Raises:
        ValueError: Propagated from the ``k``, prior and weight validators
            (and from the matrix validators imported from ``utils``).

    Notes:
        Unlike the observed finite-bank metrics, these latent quantities are
        defined for any integer :math:`k \ge 1`. The implementation therefore
        does *not* restrict :math:`k` by the observed trial count :math:`N`.
    """
    _validate_latent_k(k)
    _validate_beta_prior(alpha0, beta0)
    Rm = _as_2d_int_matrix(R)
    _validate_binary(Rm)
    M, _ = Rm.shape
    w = _validate_spectrum_weights(weights, k)
    alpha, beta = _binary_beta_posterior_params(Rm, alpha0=alpha0, beta0=beta0)

    # coeff[j] = A_j * C(k, j): coefficient of p^j (1-p)^(k-j) in g(p).
    levels = _event_score_levels(w)
    coeff = np.zeros(k + 1, dtype=float)
    for j in range(1, k + 1):
        coeff[j] = float(levels[j] * comb(k, j))
    # Only non-zero coefficients contribute to any of the sums below.
    active_js = [j for j in range(1, k + 1) if coeff[j] != 0.0]

    mean_pass = np.empty(M, dtype=float)
    var_pass = np.empty(M, dtype=float)
    mean_spec = np.empty(M, dtype=float)
    var_spec = np.empty(M, dtype=float)
    cov_ps = np.empty(M, dtype=float)

    for i in range(M):
        a_i = float(alpha[i])
        b_i = float(beta[i])

        # Pass@k moments from E[(1-p)^k] and E[(1-p)^(2k)].
        eqk = _beta_ratio(a_i, b_i, 0, k)
        eq2k = _beta_ratio(a_i, b_i, 0, 2 * k)
        m_pass = 1.0 - eqk
        v_pass = max(0.0, eq2k - eqk * eqk)

        m_spec = 0.0
        e2_spec = 0.0
        e_ps = 0.0
        # The second-moment term for the pair (j, j2) depends only on
        # j + j2, so each distinct sum is evaluated once per question
        # instead of once per pair. Accumulation order is unchanged.
        pair_moment: dict[int, float] = {}
        for j in active_js:
            c_j = float(coeff[j])
            moment_j = _beta_ratio(a_i, b_i, j, k - j)
            m_spec += c_j * moment_j
            # E[Pass * g] term: E[p^j (1-p)^(k-j)] - E[p^j (1-p)^(2k-j)].
            e_ps += c_j * (moment_j - _beta_ratio(a_i, b_i, j, 2 * k - j))
            for j2 in active_js:
                c_j2 = float(coeff[j2])
                total = j + j2
                if total not in pair_moment:
                    pair_moment[total] = _beta_ratio(
                        a_i, b_i, total, 2 * k - total
                    )
                e2_spec += c_j * c_j2 * pair_moment[total]

        # Clamp at zero: the difference of moments can dip slightly negative.
        v_spec = max(0.0, e2_spec - m_spec * m_spec)
        cov = e_ps - m_pass * m_spec

        mean_pass[i] = m_pass
        var_pass[i] = v_pass
        mean_spec[i] = m_spec
        var_spec[i] = v_spec
        cov_ps[i] = cov

    return mean_pass, var_pass, mean_spec, var_spec, cov_ps
def _pass_and_spectrum_posterior_moments(
    R: np.ndarray,
    k: int,
    weights: np.ndarray,
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float, float]:
    r"""Dataset-level posterior moments for latent Pass@k and spectrum scores.

    Per-question means are averaged over the :math:`M` questions;
    per-question variances and covariances are summed and divided by
    :math:`M^2`, the scaling for the mean of :math:`M` terms.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget.
        weights: Length-:math:`k` spectrum weights.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        ``(mu_pass, var_pass, mu_spectrum, var_spectrum, cov_pass_spectrum)``
        as plain floats.
    """
    row_moments = _pass_and_spectrum_row_posterior_moments(
        R,
        k,
        weights,
        alpha0=alpha0,
        beta0=beta0,
    )
    row_mean_pass, row_var_pass, row_mean_spec, row_var_spec, row_cov = row_moments

    n_questions = row_mean_pass.size
    scale = n_questions**2

    return (
        float(np.mean(row_mean_pass)),
        float(np.sum(row_var_pass) / scale),
        float(np.mean(row_mean_spec)),
        float(np.sum(row_var_spec) / scale),
        float(np.sum(row_cov) / scale),
    )
def _geo_spectrum_at_k_bayes(
    R: np.ndarray,
    k: int,
    lam: float,
    weights: np.ndarray,
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float]:
    r"""Approximate posterior mean/std for latent :math:`\mathrm{GeoSpectrum}_{\lambda,w}@k`.

    At the endpoints :math:`\lambda \in \{0, 1\}` the spectrum (resp. Pass@k)
    moments are returned directly. Otherwise the mean is the blend
    :math:`x^\lambda y^{1-\lambda}` evaluated at the posterior means and the
    standard deviation comes from first-order delta propagation.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget.
        lam: Coupling parameter :math:`\lambda` in :math:`[0,1]`.
        weights: Length-:math:`k` spectrum weights.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float]: ``(mu, sigma)``.
    """
    coupling = _resolve_lambda(lam)
    moments = _pass_and_spectrum_posterior_moments(
        R,
        k,
        weights,
        alpha0=alpha0,
        beta0=beta0,
    )
    pass_mean, pass_var, spec_mean, spec_var, pass_spec_cov = moments

    if coupling == 0.0 or coupling == 1.0:
        # Endpoints reduce to a single factor; no delta method needed.
        if coupling == 0.0:
            end_mean, end_var = spec_mean, spec_var
        else:
            end_mean, end_var = pass_mean, pass_var
        return end_mean, float(math.sqrt(max(0.0, end_var)))

    blend = _weighted_geometric_mean(pass_mean, spec_mean, coupling, 1.0 - coupling)
    if blend == 0.0:
        # A vanishing factor makes the gradients below undefined.
        return 0.0, 0.0

    # Partial derivatives of g(x, y) = x**lam * y**(1 - lam) at the means.
    d_pass = coupling * (pass_mean ** (coupling - 1.0)) * (spec_mean ** (1.0 - coupling))
    d_spec = (1.0 - coupling) * (pass_mean**coupling) * (spec_mean ** (-coupling))
    variance = (
        (d_pass**2) * pass_var
        + (d_spec**2) * spec_var
        + 2.0 * d_pass * d_spec * pass_spec_cov
    )
    return float(blend), float(math.sqrt(max(0.0, variance)))
def _geom_at_k_bayes(
    R: np.ndarray,
    k: int,
    pass_power: float = 0.5,
    unanimous_power: float = 0.5,
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float]:
    r"""Approximate posterior mean/std for latent questionwise Geom@k.

    Every question is blended separately at its posterior means and its
    variance is obtained by first-order delta propagation through
    :math:`g(x, y) = x^a y^b`. The per-question blends are averaged, and the
    standard deviation is :math:`\sqrt{\sum_\alpha \sigma_\alpha^2} / M`.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget.
        pass_power: Exponent :math:`a` on each question's latent Pass@k.
        unanimous_power: Exponent :math:`b` on each question's latent
            Unanimous@k.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float]: ``(mu, sigma)``.
    """
    # Unanimous@k is the spectrum with all weight on the threshold r = k.
    row_moments = _pass_and_spectrum_row_posterior_moments(
        R,
        k,
        _unanimous_spectrum_weights(k),
        alpha0=alpha0,
        beta0=beta0,
    )
    n_questions = row_moments[0].size
    blend_means = np.empty_like(row_moments[0], dtype=float)
    blend_vars = np.empty_like(row_moments[0], dtype=float)

    for idx, row in enumerate(zip(*row_moments)):
        p_mean, p_var, u_mean, u_var, pu_cov = (float(entry) for entry in row)

        blend = _weighted_geometric_mean(
            p_mean,
            u_mean,
            pass_power,
            unanimous_power,
        )
        blend_means[idx] = blend
        if blend == 0.0:
            # Gradients are undefined at a vanishing blend; report no spread.
            blend_vars[idx] = 0.0
            continue

        # A zero exponent removes that factor, so its gradient is zero.
        d_pass = (
            pass_power * (p_mean ** (pass_power - 1.0)) * (u_mean**unanimous_power)
            if pass_power != 0.0
            else 0.0
        )
        d_unanimous = (
            unanimous_power
            * (p_mean**pass_power)
            * (u_mean ** (unanimous_power - 1.0))
            if unanimous_power != 0.0
            else 0.0
        )
        blend_vars[idx] = max(
            0.0,
            (d_pass**2) * p_var
            + (d_unanimous**2) * u_var
            + 2.0 * d_pass * d_unanimous * pu_cov,
        )

    overall_mean = float(np.mean(blend_means))
    overall_std = float(math.sqrt(float(np.sum(blend_vars))) / n_questions)
    return overall_mean, overall_std
def _geom_ds_at_k_bayes(
    R: np.ndarray,
    k: int,
    pass_power: float = 0.5,
    unanimous_power: float = 0.5,
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float]:
    r"""Approximate posterior mean/std for latent dataset-level Geom@k.

    The blend :math:`g(x, y) = x^a y^b` is evaluated at the dataset-level
    posterior means of Pass@k and Unanimous@k, and the standard deviation
    follows from first-order delta propagation with the dataset-level
    variances and covariance.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget.
        pass_power: Exponent :math:`a` on latent dataset-level Pass@k.
        unanimous_power: Exponent :math:`b` on latent dataset-level
            Unanimous@k.
        alpha0: Beta prior parameter :math:`\alpha_0`.
        beta0: Beta prior parameter :math:`\beta_0`.

    Returns:
        tuple[float, float]: ``(mu, sigma)``.
    """
    # Unanimous@k is the spectrum with all weight on the threshold r = k.
    moments = _pass_and_spectrum_posterior_moments(
        R,
        k,
        _unanimous_spectrum_weights(k),
        alpha0=alpha0,
        beta0=beta0,
    )
    p_mean, p_var, u_mean, u_var, pu_cov = moments

    blend = _weighted_geometric_mean(
        p_mean,
        u_mean,
        pass_power,
        unanimous_power,
    )
    if blend == 0.0:
        # Gradients are undefined at a vanishing blend; report no spread.
        return 0.0, 0.0

    # A zero exponent removes that factor, so its gradient is zero.
    d_pass = (
        pass_power * (p_mean ** (pass_power - 1.0)) * (u_mean**unanimous_power)
        if pass_power != 0.0
        else 0.0
    )
    d_unanimous = (
        unanimous_power * (p_mean**pass_power) * (u_mean ** (unanimous_power - 1.0))
        if unanimous_power != 0.0
        else 0.0
    )
    variance = (
        (d_pass**2) * p_var
        + (d_unanimous**2) * u_var
        + 2.0 * d_pass * d_unanimous * pu_cov
    )
    return float(blend), float(math.sqrt(max(0.0, variance)))
[docs]
def threshold_spectrum_at_k_ci(
    R: np.ndarray,
    k: int,
    weights: np.ndarray | list[float] | tuple[float, ...],
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Approximate posterior summary for the latent spectrum :math:`S_{w,k}(p)`.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget. Once the posterior is defined, any
            integer :math:`k \ge 1` is allowed.
        weights: Non-negative length-:math:`k` weights with
            :math:`\sum_r w_r \le 1`.
        confidence: Credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Notes:
        Unlike :func:`threshold_spectrum_at_k`, the posterior target is
        defined for latent i.i.d. resampling and therefore does not require
        :math:`k \le N`.

    Formula:
        Let :math:`A_j = \sum_{r \le j} w_r`. The per-question latent target
        is

        .. math::

            g(p) = \sum_{j=1}^{k} A_j \binom{k}{j} p^j (1-p)^{k-j}.

        Dataset-level aggregation uses

        .. math::

            \mu = \frac{1}{M} \sum_{\alpha=1}^{M} \mathbb{E}[g(p_\alpha)]

        .. math::

            \sigma = \frac{1}{M} \sqrt{
                \sum_{\alpha=1}^{M} \mathrm{Var}[g(p_\alpha)]
            }.
    """
    checked = _validate_spectrum_weights(weights, k)
    moments = _pass_and_spectrum_posterior_moments(
        R,
        k,
        checked,
        alpha0=alpha0,
        beta0=beta0,
    )
    # Only the spectrum mean/variance (positions 2 and 3) are needed here.
    spectrum_mean, spectrum_var = moments[2], moments[3]
    spread = float(math.sqrt(max(0.0, spectrum_var)))
    lower, upper = normal_credible_interval(
        spectrum_mean, spread, credibility=confidence, two_sided=True, bounds=bounds
    )
    return float(spectrum_mean), spread, float(lower), float(upper)
[docs]
def geom_at_k_ci(
    R: np.ndarray,
    k: int,
    pass_power: float = 0.5,
    unanimous_power: float = 0.5,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Approximate posterior summary for the questionwise Geom@k target.

    Uncertainty counterpart of :func:`geom_at_k`: a first-order delta method
    is applied to each question's latent Pass@k and Unanimous@k, and the
    resulting question-level geometric blends are averaged.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget. Once the posterior is defined, any
            integer :math:`k \ge 1` is allowed.
        pass_power: Exponent applied to each question's latent ``Pass@k``.
        unanimous_power: Exponent applied to each question's latent
            ``Unanimous@k``.
        confidence: Credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Formula:
        Let :math:`\mu_{P,\alpha}` and :math:`\mu_{U,\alpha}` denote the
        posterior means of question :math:`\alpha`'s latent Pass@k and
        Unanimous@k quantities. Then

        .. math::

            \mu \approx \frac{1}{M}\sum_\alpha
            \mu_{P,\alpha}^{a}\,\mu_{U,\alpha}^{b}

        and :math:`\sigma` is computed by per-question first-order delta
        propagation through :math:`g(x, y) = x^a y^b`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = geom_at_k_ci(R, 2)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.610666, 0.133107, 0.3498, 0.8716)
    """
    center, spread = _geom_at_k_bayes(
        R,
        k,
        pass_power=pass_power,
        unanimous_power=unanimous_power,
        alpha0=alpha0,
        beta0=beta0,
    )
    interval = normal_credible_interval(
        center, spread, credibility=confidence, two_sided=True, bounds=bounds
    )
    lower, upper = interval
    return float(center), float(spread), float(lower), float(upper)
[docs]
def geom_ds_at_k_ci(
    R: np.ndarray,
    k: int,
    pass_power: float = 0.5,
    unanimous_power: float = 0.5,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Approximate posterior summary for the dataset-level Geom@k target.

    Uncertainty counterpart of :func:`geom_ds_at_k`; with
    ``pass_power = unanimous_power = 0.5`` it matches the dataset-level
    latent quantity introduced in the paper.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget. Once the posterior is defined, any
            integer :math:`k \ge 1` is allowed.
        pass_power: Exponent applied to latent dataset-level ``Pass@k``.
        unanimous_power: Exponent applied to latent dataset-level
            ``Unanimous@k``.
        confidence: Credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Formula:
        Let :math:`\mu_P` and :math:`\mu_U` denote the posterior means of the
        latent dataset-level Pass@k and Unanimous@k quantities. Then

        .. math::

            \mu \approx \mu_P^a\,\mu_U^b

        and :math:`\sigma` is computed by first-order delta propagation
        through :math:`g(x, y) = x^a y^b`.

    Examples:
        >>> import numpy as np
        >>> R = np.array([[0, 1, 1, 0, 1],
        ...               [1, 1, 0, 1, 1]])
        >>> mu, sigma, lo, hi = geom_ds_at_k_ci(R, 2)
        >>> round(mu, 6), round(sigma, 6), round(lo, 4), round(hi, 4)
        (0.612112, 0.132755, 0.3519, 0.8723)
    """
    center, spread = _geom_ds_at_k_bayes(
        R,
        k,
        pass_power=pass_power,
        unanimous_power=unanimous_power,
        alpha0=alpha0,
        beta0=beta0,
    )
    interval = normal_credible_interval(
        center, spread, credibility=confidence, two_sided=True, bounds=bounds
    )
    lower, upper = interval
    return float(center), float(spread), float(lower), float(upper)
[docs]
def geo_spectrum_at_k_ci(
    R: np.ndarray,
    k: int,
    lam: float = 0.5,
    weights: np.ndarray | list[float] | tuple[float, ...] | None = None,
    lambda_: float | None = None,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Approximate posterior summary for latent
    :math:`\mathrm{GeoSpectrum}_{\lambda,w}@k`.

    As in :func:`geo_spectrum_at_k`, omitting ``weights`` selects the
    ``GeoSpectrum*@k`` operating point.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget. Once the posterior is defined, any
            integer :math:`k \ge 1` is allowed.
        lam: The coupling parameter :math:`\lambda` in :math:`[0,1]`.
        weights: Spectrum weights :math:`w`. If omitted, uses the built-in
            upper-half mG weights unless :math:`\lambda = 1`, in which case
            the spectrum term is irrelevant. Custom weights must be
            length-:math:`k`, non-negative, finite, and satisfy
            :math:`\sum_r w_r \le 1`.
        lambda_: Keyword alias for ``lam`` for callers who prefer the
            mathematical symbol. Supply it only while ``lam`` is left at its
            default.
        confidence: Credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`

    Formula:
        Let :math:`x` denote latent Pass@k and :math:`y` denote the latent
        spectrum :math:`S_{w,k}`. The posterior mean is approximated by

        .. math::

            \mu \approx x^\lambda y^{1-\lambda}

        evaluated at the posterior means of :math:`x` and :math:`y`, and
        :math:`\sigma` is obtained by first-order delta propagation through
        :math:`g(x, y) = x^\lambda y^{1-\lambda}`.
    """
    coupling = _resolve_lambda(lam, lambda_)

    if coupling == 1.0:
        # GeoSpectrum_{1,w}@k is exactly Pass@k, so ``w`` is irrelevant; any
        # valid placeholder keeps the moment helper happy.
        spectrum_weights = _unanimous_spectrum_weights(k)
    elif weights is None:
        spectrum_weights = _mg_spectrum_weights(k)
    else:
        spectrum_weights = _validate_spectrum_weights(weights, k)

    center, spread = _geo_spectrum_at_k_bayes(
        R,
        k,
        coupling,
        spectrum_weights,
        alpha0=alpha0,
        beta0=beta0,
    )
    interval = normal_credible_interval(
        center, spread, credibility=confidence, two_sided=True, bounds=bounds
    )
    lower, upper = interval
    return float(center), float(spread), float(lower), float(upper)
[docs]
def geo_spectrum_star_at_k(R: np.ndarray, k: int) -> float:
    r"""
    Named entry point for the default ``GeoSpectrum*@k`` operating point.

    Calls :func:`geo_spectrum_at_k` with :math:`\lambda = 1/2` and the
    upper-half ``mG`` spectrum weights passed explicitly.

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Sampling budget with :math:`1 \le k \le N`.

    Returns:
        float: The ``GeoSpectrum*@k`` score.
    """
    upper_half_weights = _mg_spectrum_weights(k)
    return geo_spectrum_at_k(R, k, lam=0.5, weights=upper_half_weights)
[docs]
def geo_spectrum_star_at_k_ci(
    R: np.ndarray,
    k: int,
    confidence: float = 0.95,
    bounds: tuple[float, float] = (0.0, 1.0),
    alpha0: float = 1.0,
    beta0: float = 1.0,
) -> tuple[float, float, float, float]:
    r"""
    Approximate posterior summary for latent ``GeoSpectrum*@k``.

    Delegates to :func:`geo_spectrum_at_k_ci`, leaving ``lam`` and
    ``weights`` at their defaults (:math:`\lambda = 1/2`, upper-half ``mG``
    weights).

    Args:
        R: :math:`M \times N` binary matrix with entries in :math:`\{0,1\}`.
        k: Latent resampling budget. Once the posterior is defined, any
            integer :math:`k \ge 1` is allowed.
        confidence: Credibility level of the interval (default 0.95).
        bounds: ``(lo, hi)`` clipping bounds for the interval
            (default ``(0, 1)``).
        alpha0: Beta prior parameter :math:`\alpha_0` (default 1).
        beta0: Beta prior parameter :math:`\beta_0` (default 1).

    Returns:
        tuple[float, float, float, float]:
        :math:`(\mu,\; \sigma,\; \text{lo},\; \text{hi})`
    """
    interval_options = {
        "confidence": confidence,
        "bounds": bounds,
        "alpha0": alpha0,
        "beta0": beta0,
    }
    return geo_spectrum_at_k_ci(R, k, **interval_options)
# Public API of this module: each finite-bank metric is listed next to its
# ``*_ci`` posterior-summary counterpart. Underscore-prefixed helpers are
# deliberately left out.
__all__ = [
"geom_at_k",
"geom_at_k_ci",
"geom_ds_at_k",
"geom_ds_at_k_ci",
"geo_spectrum_at_k",
"geo_spectrum_at_k_ci",
"geo_spectrum_star_at_k",
"geo_spectrum_star_at_k_ci",
"threshold_spectrum_at_k",
"threshold_spectrum_at_k_ci",
]