Source code for scorio.rank.pointwise

"""
Pointwise ranking methods.

Pointwise methods aggregate each model's item-level performance into one scalar
without explicit head-to-head modeling.

Notation
--------

Let :math:`R \\in \\{0,1\\}^{L \\times M \\times N}` and define
:math:`k_{lm}=\\sum_{n=1}^{N} R_{lmn}`,
:math:`\\widehat{p}_{lm}=k_{lm}/N`.

The pointwise score template is

.. math::
    s_l = \\sum_{m=1}^{M} w_m \\, g\\!\\left(\\widehat{p}_{lm}, \\phi_m\\right),

where :math:`\\phi_m` is an item statistic, :math:`g` is a fixed transform,
and :math:`w_m` are nonnegative weights.
"""

import numpy as np

from scorio.utils import rank_scores

from ._base import validate_input
from ._types import RankMethod, RankResult



[docs]
def inverse_difficulty(
    R: np.ndarray,
    method: RankMethod = "competition",
    return_scores: bool = False,
    clip_range: tuple = (0.01, 0.99),
) -> RankResult:
    """
    Rank models by inverse-difficulty-weighted per-question accuracy.

    Method context:
        This pointwise method upweights hard questions by assigning each
        question an inverse weight proportional to the reciprocal global solve
        rate. It emphasizes rare successes while still aggregating all
        questions into one scalar score per model.

    References:
        Inverse probability weighting (Wikipedia):
        https://en.wikipedia.org/wiki/Inverse_probability_weighting

    Args:
        R: Binary outcome tensor with shape ``(L, M, N)`` or matrix
            ``(L, M)`` (treated as ``N=1``).
        method: Tie-handling rule passed to :func:`scorio.utils.rank_scores`.
            One of ``"competition"``, ``"competition_max"``, ``"dense"``,
            or ``"avg"``.
        return_scores: If ``True``, return ``(ranking, scores)``.
        clip_range: Two-sided clipping interval ``(a, b)`` applied to global
            solve rates before inversion. Must satisfy
            ``0 < a < b <= 1``.

    Returns:
        Ranking array of shape ``(L,)``.
        If ``return_scores=True``, also returns inverse-difficulty scores of
        shape ``(L,)``.

    Notation:
        ``k_lm = sum_{n=1}^N R_lmn`` and
        ``p_hat_lm = k_lm / N``.
        The global per-question solve rate is
        ``p_bar_m = (1/L) * sum_l p_hat_lm``.

    Formula:
        .. math::
            w_m \\propto \\frac{1}{\\operatorname{clip}(\\bar p_m, a, b)},
            \\quad \\sum_{m=1}^{M} w_m = 1

        .. math::
            s_l^{\\mathrm{inv\\text{-}diff}} =
            \\sum_{m=1}^{M} w_m \\, \\widehat{p}_{lm}

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([
        ...     [[1, 1], [0, 0], [0, 0]],
        ...     [[0, 0], [1, 1], [0, 0]],
        ... ])
        >>> ranks, scores = rank.inverse_difficulty(R, return_scores=True)
        >>> ranks.tolist()
        [1, 1]
        >>> scores.shape
        (2,)

        >>> # tighter clipping is allowed
        >>> rank.inverse_difficulty(R, clip_range=(0.05, 0.95)).shape
        (2,)

    Notes:
        Very small clip lower bounds can make the weighting highly sensitive
        to a few rare solves. This implementation is a simple inverse-weighted
        pointwise scorer.
    """
    R = validate_input(R)

    if len(clip_range) != 2:
        raise ValueError("clip_range must be a length-2 tuple (low, high).")
    low = float(clip_range[0])
    high = float(clip_range[1])
    if not np.isfinite(low) or not np.isfinite(high):
        raise ValueError("clip_range values must be finite.")
    if not (0.0 < low < high <= 1.0):
        raise ValueError("clip_range must satisfy 0 < low < high <= 1.")

    # Global difficulty: average across all models and trials for each question
    question_difficulty = R.mean(axis=(0, 2))  # Shape: (M,)

    # Clip to avoid extreme weights
    question_difficulty = np.clip(question_difficulty, low, high)

    # Inverse difficulty weights (normalized)
    weights = 1.0 / question_difficulty
    total_weight = float(np.sum(weights))
    if not np.isfinite(total_weight) or total_weight <= 0.0:
        raise ValueError(
            "inverse-difficulty weights are not finite; choose a stricter clip_range."
        )
    weights = weights / total_weight

    # Per-model accuracy on each question
    model_question_accuracy = R.mean(axis=2)  # Shape: (L, M)

    scores = model_question_accuracy @ weights

    ranking = rank_scores(scores)[method]

    return (ranking, scores) if return_scores else ranking



__all__ = ["inverse_difficulty"]