Source code for sys_mapping.diagnostics

"""Systematic mapping diagnostics: null tests, SNR ranking, footprint masking.

Implements:
  - Null test cross-correlations (Ross+2011)
  - SNR-based template ranking with three estimators (Tanidis+2026)
  - Footprint masking sensitivity diagnostics (Rodríguez-Monroy+2025)
"""

from __future__ import annotations

import numpy as np



[docs]
def null_test_cross_correlations(
    weights: np.ndarray,
    delta_t: np.ndarray,
    n_bootstrap: int = 100,
    seed: int = 0,
) -> dict[str, np.ndarray]:
    """Test for residual systematic contamination via weight × template correlations.

    After decontamination, the per-pixel systematic weights should be
    statistically independent of all template maps.  A significant
    Pearson correlation between ``weights`` and ``delta_t[i]`` indicates
    residual contamination from template ``i``.

    Parameters
    ----------
    weights:
        Per-pixel systematic weights (shape ``(n_pix,)``).
    delta_t:
        Template maps (shape ``(n_sys, n_pix)``).
    n_bootstrap:
        Number of permutation resamples for computing p-values.
    seed:
        Random seed for reproducibility.

    Returns
    -------
    result:
        Dictionary with:

        - ``"correlations"``: Pearson :math:`r(w, t_i)` for each template
          (shape ``(n_sys,)``).
        - ``"p_values"``: two-sided permutation p-value for each template
          (shape ``(n_sys,)``).  Small values indicate significant residual
          contamination.

    Notes
    -----
    The Pearson correlation is:

    .. math::

        r_i = \\frac{\\sum_p (w_p - \\bar w)(t_{i,p} - \\bar t_i)}
                    {\\sqrt{\\sum_p (w_p-\\bar w)^2 \\sum_p (t_{i,p}-\\bar t_i)^2}}

    A well-corrected field satisfies :math:`|r_i| \\approx 0` for all
    templates.  Permutation p-values are computed by shuffling ``weights``
    and recomputing the correlation ``n_bootstrap`` times.

    Examples
    --------
    >>> import numpy as np
    >>> from sys_mapping import null_test_cross_correlations
    >>> rng = np.random.default_rng(0)
    >>> n_pix, n_sys = 500, 3
    >>> weights = rng.standard_normal(n_pix) + 1.0
    >>> delta_t = rng.standard_normal((n_sys, n_pix))
    >>> result = null_test_cross_correlations(weights, delta_t, n_bootstrap=50)
    >>> result["correlations"].shape
    (3,)
    >>> result["p_values"].shape
    (3,)
    >>> np.all(result["p_values"] >= 0) and np.all(result["p_values"] <= 1)
    True

    References
    ----------
    Ross et al. 2011, MNRAS 417, 1350.
    """
    rng = np.random.default_rng(seed)
    n_sys = delta_t.shape[0]

    w_centered = weights - weights.mean()
    w_norm = np.sqrt(np.sum(w_centered**2))

    correlations = np.empty(n_sys)
    for i, t_i in enumerate(delta_t):
        t_centered = t_i - t_i.mean()
        t_norm = np.sqrt(np.sum(t_centered**2))
        if w_norm < 1e-30 or t_norm < 1e-30:
            correlations[i] = 0.0
        else:
            correlations[i] = float(np.sum(w_centered * t_centered) / (w_norm * t_norm))

    # Permutation p-values
    p_values = np.empty(n_sys)
    for i, t_i in enumerate(delta_t):
        t_centered = t_i - t_i.mean()
        t_norm = np.sqrt(np.sum(t_centered**2))
        obs_r = abs(correlations[i])
        count_extreme = 0
        for _ in range(n_bootstrap):
            w_shuffled = rng.permutation(w_centered)
            if w_norm > 1e-30 and t_norm > 1e-30:
                r_perm = abs(float(np.sum(w_shuffled * t_centered) / (w_norm * t_norm)))
            else:
                r_perm = 0.0
            if r_perm >= obs_r:
                count_extreme += 1
        p_values[i] = (count_extreme + 1) / (n_bootstrap + 1)

    return {"correlations": correlations, "p_values": p_values}




[docs]
def snr_template_ranking(
    delta_g_obs: np.ndarray,
    delta_t: np.ndarray,
    *,
    method: str = "template",
) -> np.ndarray:
    """Rank systematic templates by signal-to-noise ratio of their contamination.

    Three SNR definitions adapted from Tanidis et al. 2026 (Sec. 3) for galaxy
    clustering:

    - ``"template"``: :math:`|\\hat\\alpha_i| / \\sigma_{\\hat\\alpha_i}` from OLS.
    - ``"data"``: :math:`|\\mathrm{Corr}(\\delta_g, t_i)|`, the absolute
      cross-correlation between the galaxy field and each template.
    - ``"peak"``: peak pseudo-:math:`C_\\ell` cross-spectrum between
      :math:`\\delta_g` and :math:`t_i`, divided by the noise level.

    Parameters
    ----------
    delta_g_obs:
        Observed galaxy overdensity (shape ``(n_pix,)``).
    delta_t:
        Template maps (shape ``(n_sys, n_pix)``).
    method:
        One of ``"template"``, ``"data"``, ``"peak"``.

    Returns
    -------
    snr:
        SNR value for each template (shape ``(n_sys,)``).  Higher values
        indicate a more contaminating template.

    Notes
    -----
    Templates can be sorted by ``np.argsort(snr)[::-1]`` to obtain a
    ranking from most to least contaminating.  The ``"peak"`` method
    requires ``healpy`` (already a package dependency) and assumes the
    overdensity map covers the full sky (or has zeros outside the mask).

    Examples
    --------
    >>> import numpy as np
    >>> from sys_mapping import snr_template_ranking
    >>> rng = np.random.default_rng(0)
    >>> n_pix, n_sys = 3000, 4
    >>> delta_t = rng.standard_normal((n_sys, n_pix))
    >>> # Inject strong contamination from template 2
    >>> delta_g = 0.5 * delta_t[2] + rng.standard_normal(n_pix) * 0.2
    >>> snr = snr_template_ranking(delta_g, delta_t, method="data")
    >>> snr.shape
    (4,)
    >>> np.argmax(snr) == 2
    True

    References
    ----------
    Tanidis et al. 2026, MNRAS 547.
    """
    n_sys, n_pix = delta_t.shape

    if method == "template":
        X = delta_t.T  # (n_pix, n_sys)
        alpha_hat, res, rank, sv = np.linalg.lstsq(X, delta_g_obs, rcond=None)
        # OLS variance estimate: sigma^2 = RSS / (n - p)
        residual = delta_g_obs - X @ alpha_hat
        dof = max(n_pix - n_sys, 1)
        sigma2 = float(np.sum(residual**2) / dof)
        XtX_inv = np.linalg.pinv(X.T @ X)
        param_var = sigma2 * np.diag(XtX_inv)
        param_var = np.maximum(param_var, 1e-30)
        snr = np.abs(alpha_hat) / np.sqrt(param_var)
        return snr

    elif method == "data":
        g_centered = delta_g_obs - delta_g_obs.mean()
        g_norm = np.sqrt(np.sum(g_centered**2))
        snr = np.empty(n_sys)
        for i, t_i in enumerate(delta_t):
            t_centered = t_i - t_i.mean()
            t_norm = np.sqrt(np.sum(t_centered**2))
            if g_norm < 1e-30 or t_norm < 1e-30:
                snr[i] = 0.0
            else:
                snr[i] = abs(float(np.sum(g_centered * t_centered) / (g_norm * t_norm)))
        return snr

    elif method == "peak":
        import healpy as hp

        nside = hp.npix2nside(n_pix)
        lmax = 3 * nside - 1
        cl_g = hp.anafast(delta_g_obs, lmax=lmax)
        noise_level = np.median(np.abs(cl_g)) + 1e-30
        snr = np.empty(n_sys)
        for i, t_i in enumerate(delta_t):
            cl_cross = hp.anafast(delta_g_obs, map2=t_i, lmax=lmax)
            snr[i] = float(np.max(np.abs(cl_cross))) / noise_level
        return snr

    else:
        raise ValueError(f"method must be 'template', 'data', or 'peak', got '{method}'")




[docs]
def footprint_mask_diagnostics(
    delta_g_obs: np.ndarray,
    delta_t: np.ndarray,
    mask_fractions: np.ndarray,
    good_pixels: np.ndarray,
) -> dict[str, np.ndarray]:
    """Quantify sensitivity of systematic parameters to the masking threshold.

    For each masking fraction in ``mask_fractions``, the most poorly
    connected pixels (those with the lowest overdensity variance from the
    template model) are excluded, and OLS contamination amplitudes are
    re-estimated.  Large variation of the amplitudes with masking threshold
    indicates sensitivity to the survey boundary or to systematic outliers.

    This is the footprint masking sensitivity analysis of
    Rodríguez-Monroy et al. 2025, Sec. 4.

    Parameters
    ----------
    delta_g_obs:
        Observed galaxy overdensity at unmasked pixels (shape ``(n_pix,)``).
    delta_t:
        Template maps at unmasked pixels (shape ``(n_sys, n_pix)``).
    mask_fractions:
        Array of additional pixel fractions to mask on top of the baseline,
        in increasing order (e.g. ``[0.05, 0.10, 0.15, 0.20]``).
    good_pixels:
        Boolean mask over the full footprint (shape ``(n_full_pix,)``).
        Used only for computing a pixel-quality ranking (template variance
        at each pixel).

    Returns
    -------
    result:
        Dictionary with:

        - ``"alpha_hat"``: OLS amplitudes per masking level
          (shape ``(n_thresholds, n_sys)``).
        - ``"scatter"``: standard deviation of ``alpha_hat`` across masking
          levels (shape ``(n_sys,)``).  Near-zero scatter means the
          amplitude estimates are robust to masking.

    Notes
    -----
    Pixels are ranked by the RMS template value
    :math:`\\sigma_p = \\sqrt{\\sum_i t_{i,p}^2 / n_{\\rm sys}}`.  Pixels
    with the **lowest** :math:`\\sigma_p` (fewest systematic features) are
    masked first, so that the remaining pixels have progressively higher
    systematic leverage.

    Examples
    --------
    >>> import numpy as np
    >>> from sys_mapping import footprint_mask_diagnostics
    >>> rng = np.random.default_rng(0)
    >>> n_pix, n_sys = 4000, 3
    >>> delta_t = rng.standard_normal((n_sys, n_pix))
    >>> delta_g = 0.1 * delta_t[0] + rng.standard_normal(n_pix) * 0.5
    >>> good = np.ones(n_pix, dtype=bool)
    >>> fracs = np.array([0.0, 0.05, 0.10, 0.15])
    >>> result = footprint_mask_diagnostics(delta_g, delta_t, fracs, good)
    >>> result["alpha_hat"].shape
    (4, 3)
    >>> result["scatter"].shape
    (3,)

    References
    ----------
    Rodríguez-Monroy et al. 2025, arXiv:2509.07943.
    """
    n_sys, n_pix = delta_t.shape
    n_thresholds = len(mask_fractions)

    # Pixel-quality metric: RMS template value
    rms_per_pixel = np.sqrt(np.mean(delta_t**2, axis=0))  # (n_pix,)
    rank_order = np.argsort(rms_per_pixel)  # ascending — low rms pixels first

    alpha_all = np.empty((n_thresholds, n_sys))

    for k, frac in enumerate(mask_fractions):
        n_drop = int(np.floor(frac * n_pix))
        keep = np.ones(n_pix, dtype=bool)
        if n_drop > 0:
            drop_idx = rank_order[:n_drop]
            keep[drop_idx] = False

        dg_k = delta_g_obs[keep]
        dt_k = delta_t[:, keep]
        X = dt_k.T  # (n_keep, n_sys)
        alpha_k, *_ = np.linalg.lstsq(X, dg_k, rcond=None)
        alpha_all[k] = alpha_k

    scatter = np.std(alpha_all, axis=0)
    return {"alpha_hat": alpha_all, "scatter": scatter}