Source code for scorio.rank.hodge_rank

r"""
HodgeRank: statistical ranking via combinatorial Hodge theory.

This module implements the weighted :math:`\ell_2` HodgeRank estimator from
Jiang, Lim, Yao, and Ye (2009).

Notation
--------

Let :math:`R \in \{0,1\}^{L \times M \times N}` and define decisive wins
:math:`W_{ij}` and ties :math:`T_{ij}` from pairwise model comparisons.
From these counts, construct a skew-symmetric edge flow :math:`Y` and
nonnegative symmetric edge weights :math:`w_{ij}`.

HodgeRank estimates a potential vector :math:`s \in \mathbb{R}^L` by

.. math::
    s^\star
    \in \arg\min_{s}
    \sum_{i<j} w_{ij}\left((s_j-s_i)-Y_{ij}\right)^2.

This yields the weighted Laplacian normal equations

.. math::
    \Delta_0 s^\star = -\operatorname{div}(Y), \qquad
    s^\star = -\Delta_0^{\dagger}\operatorname{div}(Y),

where :math:`\Delta_0^{\dagger}` is the Moore-Penrose pseudoinverse.
Only score differences are identifiable; adding a constant to :math:`s`
does not change the ranking.
"""

import numpy as np

from scorio.utils import rank_scores

from ._base import build_pairwise_counts, validate_input
from ._types import RankMethod


def _pairwise_flow_binary(wins: np.ndarray, ties: np.ndarray) -> np.ndarray:
    """
    Paper (Section 2.2.1), binary comparison statistic:

        Ȳ_ij = Pr{j > i} - Pr{i > j}

    For scorio's binary outcomes, we treat:
        j > i  ⇔  j correct and i incorrect.
    """
    total = wins + wins.T + ties
    Y = np.zeros_like(wins, dtype=float)
    mask = total > 0
    # wins[j,i] = wins.T[i,j]
    Y[mask] = (wins.T[mask] - wins[mask]) / total[mask]
    np.fill_diagonal(Y, 0.0)
    return Y


def _pairwise_flow_log_odds(
    wins: np.ndarray, ties: np.ndarray, epsilon: float = 0.5
) -> np.ndarray:
    """
    Paper (Section 2.2.1), logarithmic odds ratio statistic:

        Ȳ_ij = log( Pr{j >= i} / Pr{j <= i} )

    For binary outcomes, j >= i fails only when (i=1, j=0).
    """
    epsilon = float(epsilon)
    if (not np.isfinite(epsilon)) or epsilon <= 0:
        raise ValueError("epsilon must be > 0 for log-odds smoothing")

    total = wins + wins.T + ties
    Y = np.zeros_like(wins, dtype=float)

    # Pr{j >= i} = 1 - Pr{i > j} = (total - wins[i,j]) / total
    # Pr{j <= i} = 1 - Pr{j > i} = (total - wins[j,i]) / total
    # => ratio = (total - wins[i,j]) / (total - wins[j,i])
    mask = total > 0
    numerator = (total - wins + epsilon)[mask]
    denom = (total - wins.T + epsilon)[mask]
    Y[mask] = np.log(numerator / denom)

    np.fill_diagonal(Y, 0.0)
    return Y


def _weights_from_counts(
    wins: np.ndarray,
    ties: np.ndarray,
    weight_method: str,
) -> np.ndarray:
    total = wins + wins.T + ties

    weight_method = str(weight_method)
    if weight_method == "total":
        w = total.astype(float)
    elif weight_method == "decisive":
        w = (wins + wins.T).astype(float)
    elif weight_method == "uniform":
        w = (total > 0).astype(float)
    else:
        raise ValueError('weight_method must be one of: "total", "decisive", "uniform"')

    np.fill_diagonal(w, 0.0)
    return w


def _laplacian_from_weights(w: np.ndarray) -> np.ndarray:
    L = -w.astype(float).copy()
    np.fill_diagonal(L, 0.0)
    np.fill_diagonal(L, -L.sum(axis=1))
    return L


def _divergence(w: np.ndarray, Y: np.ndarray) -> np.ndarray:
    # (div Y)(i) = Σ_j w_ij Y_ij
    return (w * Y).sum(axis=1)


def _grad(scores: np.ndarray) -> np.ndarray:
    # (grad s)_ij = s_j - s_i
    s = np.asarray(scores, dtype=float)
    return s[None, :] - s[:, None]


[docs] def hodge_rank( R: np.ndarray, pairwise_stat: str = "binary", weight_method: str = "total", epsilon: float = 0.5, method: RankMethod = "competition", return_scores: bool = False, return_diagnostics: bool = False, ) -> ( np.ndarray | tuple[np.ndarray, np.ndarray] | tuple[np.ndarray, np.ndarray, dict[str, float]] ): """ Rank models with l2 HodgeRank on pairwise-comparison graphs. Method context: HodgeRank treats aggregated pairwise outcomes as an edge flow and finds global potentials whose gradient best matches that flow in weighted least squares. The minimum-norm solution is obtained from a graph Laplacian pseudoinverse. References: Jiang, X., Lim, L.-H., Yao, Y., & Ye, Y. (2009). Statistical Ranking and Combinatorial Hodge Theory. https://arxiv.org/abs/0811.1067 Args: R: Binary tensor of shape (L, M, N) (or (L, M) treated as N=1). pairwise_stat: - "binary": Ȳ_ij = P(j>i) - P(i>j) - "log_odds": Ȳ_ij = log(P(j>=i)/P(j<=i)) with additive smoothing weight_method: - "total": w_ij = #comparisons (including ties) - "decisive": w_ij = #non-tie comparisons (wins+losses) - "uniform": w_ij = 1 if comparable else 0 epsilon: Additive smoothing (counts) used only for pairwise_stat="log_odds". method: Ranking method passed to `scorio.utils.rank_scores`. return_scores: If True, return (ranking, scores) where scores are the HodgeRank potentials s (higher ⇒ better). return_diagnostics: If True, also returns a small diagnostics dict with least-squares residual norms. Returns: Ranking array of shape ``(L,)`` by default. If ``return_scores=True``, returns ``(ranking, scores)``. If ``return_diagnostics=True``, returns ``(ranking, scores, diagnostics)``. Notation: ``Y``: skew-symmetric observed edge flow from pairwise outcomes. ``w_ij``: symmetric nonnegative edge weights. ``(grad s)_ij = s_j - s_i``. ``(div Y)_i = \\sum_j w_{ij} Y_{ij}``. ``\\Delta_0``: weighted graph Laplacian. Formula: .. math:: s^{\\star} \\in \\arg\\min_{s\\in\\mathbb{R}^{L}} \\sum_{i<j} w_{ij}\\left((s_j-s_i)-Y_{ij}\\right)^2 .. math:: \\Delta_0 s^{\\star} = -\\operatorname{div}(Y), \\qquad s^{\\star} = -\\Delta_0^{\\dagger}\\operatorname{div}(Y) Examples: >>> import numpy as np >>> from scorio import rank >>> R = np.array([[[1, 1], [1, 1]], [[0, 0], [0, 0]]]) >>> rank.hodge_rank(R).tolist() [1, 2] Notes: Scores are identifiable only up to an additive constant; rankings depend on score differences. Diagnostics report weighted residual norms ``||Y - grad(s)||_{2,w}``. """ R = validate_input(R) L = R.shape[0] wins, ties = build_pairwise_counts(R) pairwise_stat = str(pairwise_stat) if pairwise_stat == "binary": Y = _pairwise_flow_binary(wins, ties) elif pairwise_stat == "log_odds": Y = _pairwise_flow_log_odds(wins, ties, epsilon=epsilon) else: raise ValueError('pairwise_stat must be one of: "binary", "log_odds"') w = _weights_from_counts(wins, ties, weight_method=weight_method) if not np.any(w > 0): scores = np.ones(L, dtype=float) / L ranking = rank_scores(scores)[method] if not return_scores: return ranking if not return_diagnostics: return ranking, scores return ranking, scores, {"residual_l2": 0.0, "relative_residual_l2": 0.0} Lap = _laplacian_from_weights(w) div = _divergence(w, Y) # Minimum-norm solution to Lap s = -div via Moore–Penrose inverse. scores = -np.linalg.pinv(Lap) @ div ranking = rank_scores(scores)[method] if not return_diagnostics and not return_scores: return ranking if not return_diagnostics and return_scores: return ranking, scores # Diagnostics: weighted residual norms for grad(s) vs Y. grad_s = _grad(scores) resid = Y - grad_s mask = np.triu(np.ones((L, L), dtype=bool), 1) & (w > 0) w_half = w[mask] r_half = resid[mask] y_half = Y[mask] residual_l2 = float(np.sqrt(np.sum(w_half * (r_half**2)))) denom = float(np.sqrt(np.sum(w_half * (y_half**2)))) rel = residual_l2 / denom if denom > 0 else 0.0 return ranking, scores, {"residual_l2": residual_l2, "relative_residual_l2": rel}
__all__ = ["hodge_rank"]