Source code for scorio.rank.hodge_rank

r"""
HodgeRank: statistical ranking via combinatorial Hodge theory.

This module implements the weighted :math:`\ell_2` HodgeRank estimator from
Jiang, Lim, Yao, and Ye (2009).

Notation
--------

Let :math:`R \in \{0,1\}^{L \times M \times N}` and define decisive wins
:math:`W_{ij}` and ties :math:`T_{ij}` from pairwise model comparisons.
From these counts, construct a skew-symmetric edge flow :math:`Y` and
nonnegative symmetric edge weights :math:`w_{ij}`.

HodgeRank estimates a potential vector :math:`s \in \mathbb{R}^L` by

.. math::
    s^\star
    \in \arg\min_{s}
    \sum_{i<j} w_{ij}\left((s_j-s_i)-Y_{ij}\right)^2.

This yields the weighted Laplacian normal equations

.. math::
    \Delta_0 s^\star = -\operatorname{div}(Y), \qquad
    s^\star = -\Delta_0^{\dagger}\operatorname{div}(Y),

where :math:`\Delta_0^{\dagger}` is the Moore-Penrose pseudoinverse.
Only score differences are identifiable; adding a constant to :math:`s`
does not change the ranking.
"""

import numpy as np

from scorio.utils import rank_scores

from ._base import build_pairwise_counts, validate_input
from ._types import RankMethod


def _pairwise_flow_binary(wins: np.ndarray, ties: np.ndarray) -> np.ndarray:
    """
    Paper (Section 2.2.1), binary comparison statistic:

        Ȳ_ij = Pr{j > i} - Pr{i > j}

    For scorio's binary outcomes, we treat:
        j > i  ⇔  j correct and i incorrect.
    """
    total = wins + wins.T + ties
    Y = np.zeros_like(wins, dtype=float)
    mask = total > 0
    # wins[j,i] = wins.T[i,j]
    Y[mask] = (wins.T[mask] - wins[mask]) / total[mask]
    np.fill_diagonal(Y, 0.0)
    return Y


def _pairwise_flow_log_odds(
    wins: np.ndarray, ties: np.ndarray, epsilon: float = 0.5
) -> np.ndarray:
    """
    Paper (Section 2.2.1), logarithmic odds ratio statistic:

        Ȳ_ij = log( Pr{j >= i} / Pr{j <= i} )

    For binary outcomes, j >= i fails only when (i=1, j=0).
    """
    epsilon = float(epsilon)
    if (not np.isfinite(epsilon)) or epsilon <= 0:
        raise ValueError("epsilon must be > 0 for log-odds smoothing")

    total = wins + wins.T + ties
    Y = np.zeros_like(wins, dtype=float)

    # Pr{j >= i} = 1 - Pr{i > j} = (total - wins[i,j]) / total
    # Pr{j <= i} = 1 - Pr{j > i} = (total - wins[j,i]) / total
    # => ratio = (total - wins[i,j]) / (total - wins[j,i])
    mask = total > 0
    numerator = (total - wins + epsilon)[mask]
    denom = (total - wins.T + epsilon)[mask]
    Y[mask] = np.log(numerator / denom)

    np.fill_diagonal(Y, 0.0)
    return Y


def _weights_from_counts(
    wins: np.ndarray,
    ties: np.ndarray,
    weight_method: str,
) -> np.ndarray:
    total = wins + wins.T + ties

    weight_method = str(weight_method)
    if weight_method == "total":
        w = total.astype(float)
    elif weight_method == "decisive":
        w = (wins + wins.T).astype(float)
    elif weight_method == "uniform":
        w = (total > 0).astype(float)
    else:
        raise ValueError('weight_method must be one of: "total", "decisive", "uniform"')

    np.fill_diagonal(w, 0.0)
    return w


def _laplacian_from_weights(w: np.ndarray) -> np.ndarray:
    L = -w.astype(float).copy()
    np.fill_diagonal(L, 0.0)
    np.fill_diagonal(L, -L.sum(axis=1))
    return L


def _divergence(w: np.ndarray, Y: np.ndarray) -> np.ndarray:
    # (div Y)(i) = Σ_j w_ij Y_ij
    return (w * Y).sum(axis=1)


def _grad(scores: np.ndarray) -> np.ndarray:
    # (grad s)_ij = s_j - s_i
    s = np.asarray(scores, dtype=float)
    return s[None, :] - s[:, None]



[docs]
def hodge_rank(
    R: np.ndarray,
    pairwise_stat: str = "binary",
    weight_method: str = "total",
    epsilon: float = 0.5,
    method: RankMethod = "competition",
    return_scores: bool = False,
    return_diagnostics: bool = False,
) -> (
    np.ndarray
    | tuple[np.ndarray, np.ndarray]
    | tuple[np.ndarray, np.ndarray, dict[str, float]]
):
    """
    Rank models with l2 HodgeRank on pairwise-comparison graphs.

    Method context:
        HodgeRank treats aggregated pairwise outcomes as an edge flow and finds
        global potentials whose gradient best matches that flow in weighted least
        squares. The minimum-norm solution is obtained from a graph Laplacian
        pseudoinverse.

    References:
        Jiang, X., Lim, L.-H., Yao, Y., & Ye, Y. (2009).
        Statistical Ranking and Combinatorial Hodge Theory.
        https://arxiv.org/abs/0811.1067

    Args:
        R: Binary tensor of shape (L, M, N) (or (L, M) treated as N=1).
        pairwise_stat:
            - "binary": Ȳ_ij = P(j>i) - P(i>j)
            - "log_odds": Ȳ_ij = log(P(j>=i)/P(j<=i)) with additive smoothing
        weight_method:
            - "total": w_ij = #comparisons (including ties)
            - "decisive": w_ij = #non-tie comparisons (wins+losses)
            - "uniform": w_ij = 1 if comparable else 0
        epsilon: Additive smoothing (counts) used only for pairwise_stat="log_odds".
        method: Ranking method passed to `scorio.utils.rank_scores`.
        return_scores: If True, return (ranking, scores) where scores are the
            HodgeRank potentials s (higher ⇒ better).
        return_diagnostics: If True, also returns a small diagnostics dict with
            least-squares residual norms.

    Returns:
        Ranking array of shape ``(L,)`` by default.
        If ``return_scores=True``, returns ``(ranking, scores)``.
        If ``return_diagnostics=True``, returns
        ``(ranking, scores, diagnostics)``.

    Notation:
        ``Y``: skew-symmetric observed edge flow from pairwise outcomes.
        ``w_ij``: symmetric nonnegative edge weights.
        ``(grad s)_ij = s_j - s_i``.
        ``(div Y)_i = \\sum_j w_{ij} Y_{ij}``.
        ``\\Delta_0``: weighted graph Laplacian.

    Formula:
        .. math::
            s^{\\star} \\in
            \\arg\\min_{s\\in\\mathbb{R}^{L}}
            \\sum_{i<j} w_{ij}\\left((s_j-s_i)-Y_{ij}\\right)^2

        .. math::
            \\Delta_0 s^{\\star} = -\\operatorname{div}(Y),
            \\qquad
            s^{\\star} = -\\Delta_0^{\\dagger}\\operatorname{div}(Y)

    Examples:
        >>> import numpy as np
        >>> from scorio import rank
        >>> R = np.array([[[1, 1], [1, 1]], [[0, 0], [0, 0]]])
        >>> rank.hodge_rank(R).tolist()
        [1, 2]

    Notes:
        Scores are identifiable only up to an additive constant; rankings depend
        on score differences. Diagnostics report weighted residual norms
        ``||Y - grad(s)||_{2,w}``.
    """
    R = validate_input(R)
    L = R.shape[0]

    wins, ties = build_pairwise_counts(R)

    pairwise_stat = str(pairwise_stat)
    if pairwise_stat == "binary":
        Y = _pairwise_flow_binary(wins, ties)
    elif pairwise_stat == "log_odds":
        Y = _pairwise_flow_log_odds(wins, ties, epsilon=epsilon)
    else:
        raise ValueError('pairwise_stat must be one of: "binary", "log_odds"')

    w = _weights_from_counts(wins, ties, weight_method=weight_method)
    if not np.any(w > 0):
        scores = np.ones(L, dtype=float) / L
        ranking = rank_scores(scores)[method]
        if not return_scores:
            return ranking
        if not return_diagnostics:
            return ranking, scores
        return ranking, scores, {"residual_l2": 0.0, "relative_residual_l2": 0.0}

    Lap = _laplacian_from_weights(w)
    div = _divergence(w, Y)

    # Minimum-norm solution to Lap s = -div via Moore–Penrose inverse.
    scores = -np.linalg.pinv(Lap) @ div

    ranking = rank_scores(scores)[method]

    if not return_diagnostics and not return_scores:
        return ranking
    if not return_diagnostics and return_scores:
        return ranking, scores

    # Diagnostics: weighted residual norms for grad(s) vs Y.
    grad_s = _grad(scores)
    resid = Y - grad_s
    mask = np.triu(np.ones((L, L), dtype=bool), 1) & (w > 0)
    w_half = w[mask]
    r_half = resid[mask]
    y_half = Y[mask]

    residual_l2 = float(np.sqrt(np.sum(w_half * (r_half**2))))
    denom = float(np.sqrt(np.sum(w_half * (y_half**2))))
    rel = residual_l2 / denom if denom > 0 else 0.0

    return ranking, scores, {"residual_l2": residual_l2, "relative_residual_l2": rel}



__all__ = ["hodge_rank"]