Source code for yaw.redshifts

"""This module implements a special containers that compute and describe
redshift distributions. These containers provide methods for error and
covariance estimation, plotting and computing mean redshifts.

True redshift distributions can be expressed through the :obj:`HistData` class
and can be computed from the :meth:`~yaw.catalogs.BaseCatalog.true_redshifts`
method of :obj:`~yaw.catalogs.BaseCatalog`.

Clustering redshift estimates can be expressed through the :obj:`RedshiftData`
class. Its different constructor methods, :meth:`~RedshiftData.from_corrdata`
and :meth:`~RedshiftData.from_corrfuncs`, provide easy interfaces to compute the
clustering redshifts and to mitigate the evolving galaxy bias, if the
corresponding autocorrelation functions (e.g. of the reference sample) are
known.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
import scipy.optimize
from deprecated import deprecated

from yaw.config import ResamplingConfig
from yaw.core.containers import SampledValue
from yaw.core.logging import TimedLog
from yaw.core.math import rebin, shift_histogram
from yaw.core.utils import TypePathStr
from yaw.correlation import CorrData

if TYPE_CHECKING:  # pragma: no cover
    from numpy.typing import NDArray

    from yaw.correlation import CorrFunc

__all__ = ["RedshiftData", "HistData"]


logger = logging.getLogger(__name__)



[docs]
@dataclass(frozen=True, repr=False, eq=False)
class RedshiftData(CorrData):
    """Container class object for redshift estimates.

    Contains the redshift binning, estimated fraction of galaxies at the given
    redshift (**not** a density), and resampled fractions (e.g. from jackknife
    or bootstrap). The resampled values are used to compute error estimates and
    covariance/correlation matrices. Provides some plotting methods for
    convenience.

    This container holds data in the form of
    :math:`\\frac{w_\\rm{sp}(z)}{\\sqrt{\\Delta z \\, w_\\rm{ss}(z) \\, w_\\rm{pp}(z)}}`,
    where :math:`w_\\rm{sp}` is the crosscorrelation function, and
    :math:`w_\\rm{ss}` and :math:`w_\\rm{pp}` are autocorrelation functions that
    account for the evolving galaxy bias. If no autocorrelation is provided, the
    data is still scaled by :math:`1/\\Delta z` compared to the crosscorrelation
    data in :obj:`~yaw.correlation.CorrData`.

    .. Note::
        This container should be constructed from a crosscorrelation measurement
        with one of the preferred methods :meth:`from_corrdata` or
        :meth:`from_corrfuncs`. These additionally allow galaxy bias mitigation
        by specifying any additionally measured autocorrelation functions.

    The comparison, addition and subtraction and indexing rules are inherited
    from :obj:`~yaw.core.containers.SampledData`, check the examples there.

    .. rubric:: Examples

    Create a redshift estimate from a crosscorrelation function and correct for
    the evolving bias of the reference sample using its autocorrelation
    function:

    >>> from yaw.examples import w_sp  # crosscorrelation
    >>> from yaw.examples import w_ss  # reference sample autocorrelation
    >>> nz = yaw.yaw.RedshiftData.from_corrfuncs(w_sp, ref_corr=w_ss)
    RedshiftData(n_bins=30, z='0.070...1.420', n_samples=64, method='jackknife')

    Use a different estimator when sampling the autocorrelation function, e.g.
    the Peebles-Hauser estimator:

    >>> nz = yaw.RedshiftData.from_corrfuncs(w_sp, ref_corr=w_ss, ref_est="PH")
    RedshiftData(n_bins=30, z='0.070...1.420', n_samples=64, method='jackknife')

    View the data for a subset of the redshift bins:

    >>> nz.bins[5:9].data
    array([2.5234212 , 1.96617211, 1.05342   , 0.67866257])

    View the same subset as series:

    >>> nz.bins[5:9].get_data()
    (0.295, 0.34]    2.523421
    (0.34, 0.385]    1.966172
    (0.385, 0.43]    1.053420
    (0.43, 0.475]    0.678663
    dtype: float64

    Get the redshift bin centers for these bins:

    >>> nz.bins[5:9].mids
    array([0.3175, 0.3625, 0.4075, 0.4525])

    Plot the redshift distribution, indicating a zero-line

    >>> nz.plot(zero_line=True)
    <Axes: >

    .. figure:: ../../_static/ncc_example.png
        :width: 400
        :alt: example clustering redshfit estimate

    Args:
        binning (:obj:`pandas.IntervalIndex`):
            The redshift bin edges used for this correlation function.
        data (:obj:`NDArray`):
            The correlation function values.
        samples (:obj:`NDArray`):
            The resampled correlation function values.
        method (:obj:`str`):
            The resampling method used, see :class:`~yaw.ResamplingConfig` for
            available options.
        info (:obj:`str`, optional):
            Descriptive text included in the headers of output files produced
            by :func:`CorrData.to_files`.
    """


[docs]
    @classmethod
    @deprecated(reason="renamed to RedshiftData.from_corrdata", version="2.3.2")
    def from_correlation_data(cls, *args, **kwargs):
        """
        .. deprecated:: 2.3.2
            Renamed to :meth:`from_corrdata`.
        """
        return cls.from_corrdata(*args, **kwargs)  # pragma: no cover



[docs]
    @classmethod
    def from_corrdata(
        cls,
        cross_data: CorrData,
        ref_data: CorrData | None = None,
        unk_data: CorrData | None = None,
        info: str | None = None,
    ) -> RedshiftData:
        """Compute redshift estimate from readily sampled function data.

        The required argument is a crosscorrelation measurement, additional
        parameters can specify sample autocorrelation measurements that are used
        to mitigate the evolving galaxy bias.

        Args:
            cross_corr (:obj:`CorrData`):
                Data from the sampled cross-correlation function.
            ref_corr (:obj:`CorrData`, optional):
                Data from the sampled reference sample autocorrelation function.
                Used to mitigate reference bias evolution.
            unk_corr (:obj:`CorrData`, optional):
                Data from the sampled unknown sample autocorrelation function.
                Used to mitigate unknown bias evolution.

        Returns:
            :obj:`RedshiftData`
        """
        logger.debug("computing clustering redshifts from correlation function samples")
        w_sp_data = cross_data.data
        w_sp_samp = cross_data.samples
        mitigate = []

        if ref_data is None:
            w_ss_data = np.float64(1.0)
            w_ss_samp = np.float64(1.0)
        else:
            try:
                ref_data.is_compatible(cross_data, require=True)
            except ValueError as e:
                raise ValueError(
                    "'ref_corr' correlation data is not compatible with 'cross_data'"
                ) from e
            w_ss_data = ref_data.data
            w_ss_samp = ref_data.samples
            mitigate.append("reference")

        if unk_data is None:
            w_pp_data = np.float64(1.0)
            w_pp_samp = np.float64(1.0)
        else:
            try:
                unk_data.is_compatible(cross_data, require=True)
            except ValueError as e:
                raise ValueError(
                    "'unk_data' correlation data is not compatible with " "'cross_data'"
                ) from e
            w_pp_data = unk_data.data
            w_pp_samp = unk_data.samples
            mitigate.append("unknown")

        if len(mitigate) > 0:
            logger.debug("mitigating %s sample bias", " and ".join(mitigate))
        N = cross_data.n_samples
        dzsq_data = cross_data.dz**2
        dzsq_samp = np.tile(dzsq_data, N).reshape((N, -1))
        nz_data = w_sp_data / np.sqrt(dzsq_data * w_ss_data * w_pp_data)
        nz_samp = w_sp_samp / np.sqrt(dzsq_samp * w_ss_samp * w_pp_samp)
        return cls(
            binning=cross_data.binning,
            data=nz_data,
            samples=nz_samp,
            method=cross_data.method,
            info=info,
        )



[docs]
    @classmethod
    @deprecated(reason="renamed to RedshiftData.from_corrfuncs", version="2.3.2")
    def from_correlation_functions(cls, *args, **kwargs):
        """
        .. deprecated:: 2.3.2
            Renamed to :meth:`from_corrfuncs`.
        """
        return cls.from_corrfuncs(*args, **kwargs)  # pragma: no cover



[docs]
    @classmethod
    def from_corrfuncs(
        cls,
        cross_corr: CorrFunc,
        ref_corr: CorrFunc | None = None,
        unk_corr: CorrFunc | None = None,
        *,
        cross_est: str | None = None,
        ref_est: str | None = None,
        unk_est: str | None = None,
        config: ResamplingConfig | None = None,
        info: str | None = None,
    ) -> RedshiftData:
        """Sample correlation functions to compute a redshift estimate.

        The required argument is a crosscorrelation measurement, additional
        parameters can specify sample autocorrelation measurements that are used
        to mitigate the evolving galaxy bias.

        Args:
            cross_corr (:obj:`CorrFunc`):
                The measured cross-correlation function.
            ref_corr (:obj:`CorrFunc`, optional):
                The measured reference sample autocorrelation function. Used to
                mitigate reference bias evolution.
            unk_corr (:obj:`CorrFunc`, optional):
                The measured unknown sample autocorrelation function. Used to
                mitigate unknown bias evolution.

        Returns:
            :obj:`RedshiftData`
        """
        if config is None:
            config = ResamplingConfig()
        with TimedLog(
            logger.debug,
            f"estimating clustering redshifts with method '{config.method}'",
        ):
            # check compatibilty before sampling anything
            if ref_corr is not None:
                try:
                    cross_corr.is_compatible(ref_corr, require=True)
                except ValueError as e:
                    raise ValueError(
                        "'ref_corr' correlation function is not compatible "
                        "with 'cross_corr'"
                    ) from e
            if unk_corr is not None:
                try:
                    cross_corr.is_compatible(unk_corr, require=True)
                except ValueError as e:
                    raise ValueError(
                        "'unk_corr' correlation function is not compatible "
                        "with 'cross_corr'"
                    ) from e
            # sample pair counts and evaluate estimator
            cross_data = cross_corr.sample(config, estimator=cross_est)
            if ref_corr is not None:
                ref_data = ref_corr.sample(config, estimator=ref_est)
            else:
                ref_data = None
            if unk_corr is not None:
                unk_data = unk_corr.sample(config, estimator=unk_est)
            else:
                unk_data = None
            return cls.from_corrdata(
                cross_data=cross_data, ref_data=ref_data, unk_data=unk_data, info=info
            )


    @property
    def _dat_desc(self) -> str:
        return "# n(z) estimate with symmetric 68% percentile confidence"

    @property
    def _smp_desc(self) -> str:
        return f"# {self.n_samples} {self.method} n(z) samples"

    @property
    def _cov_desc(self) -> str:
        return f"# n(z) estimate covariance matrix ({self.n_bins}x{self.n_bins})"


[docs]
    def normalised(self, to: CorrData | None = None) -> RedshiftData:
        """Obtain a normalised copy of the data.

        Either attempts to normalise the data by integration along the redshift
        axis or by fitting it to a provided reference data container (e.g. a
        known redshift distribution in a :obj:`HistData` container).

        .. Note::
            The fit does not use the uncertainties but weights the data points
            inversely to their amplitude.

        Args:
            to (:obj:`CorrData`, optional):
                Reference data to which the stored values are normalised by
                fitting.

        Returns:
            :obj:`RedshiftData`
        """
        if to is None:
            norm = np.nansum(self.dz * self.data)
        else:
            y_from = self.data
            y_to = to.data
            mask = np.isfinite(y_from) & np.isfinite(y_to) & (y_to > 0.0)
            norm = scipy.optimize.curve_fit(
                lambda x, norm: y_from[mask] / norm,  # x is a dummy variable
                xdata=to.mids[mask],
                ydata=y_to[mask],
                p0=[1.0],
                sigma=1 / y_to[mask],
            )[0][0]
        return self.__class__(
            binning=self.get_binning(),
            data=self.data / norm,
            samples=self.samples / norm,
            method=self.method,
            info=self.info,
        )



[docs]
    def mean(self):
        """Attempts to compute a mean redshift.

        .. Warning::
            This should be just considered an estimate since the redshift
            estimate is not a true probability density, due to residual negative
            correlation amplitudes.

        Returns:
            :obj:`~yaw.core.SampledValue`:
                Mean redshift for the redshift data and its samples in a data
                container.
        """
        norm = np.nansum(self.data)
        mean = np.nansum(self.data * self.mids) / norm
        samples = np.nansum(self.samples * self.mids, axis=1) / norm
        return SampledValue(value=mean, samples=samples, method=self.method)



[docs]
    def rebin(self, bins: NDArray) -> RedshiftData:
        """Attempts recomute the data for a different redshift binning.

        .. Warning::
            The result may be inaccurate since the redshift estimate is not a
            true probability density, due to residual negative correlation
            amplitudes.

        Args:
            bins (:obj:`NDArray`):
                Edges of the new redshift bins. May exceed or cover just a
                fraction of the original redshift range.

        Returns:
            :obj:`RedshiftData`:
        """
        old_bins = self.edges
        # shift main values
        data = rebin(bins, old_bins, self.data)
        # shift the value samples
        samples = np.empty([self.n_samples, len(bins) - 1], data.dtype)
        for i, sample in enumerate(self.samples):
            samples[i] = rebin(bins, old_bins, sample)

        return self.__class__(
            binning=pd.IntervalIndex.from_breaks(bins),
            data=data,
            samples=samples,
            method=self.method,
            info=self.info,
        )



[docs]
    def shift(
        self, dz: float | SampledValue = 0.0, *, amplitude: float | SampledValue = 1.0
    ) -> RedshiftData:
        """Attempts shift the data along the redshift axis.

        The shifting is performed by recomputing the redshift estimate with its
        original redshift bins which are shifted by some amount.

        .. Warning::
            The result may be inaccurate since the redshift estimate is not a
            true probability density, due to residual negative correlation
            amplitudes.

        Args:
            dz (:obj:`SampledValue` or :obj:`float`):
                The amplitude of the shift along the redshift axis. If the input
                provides samples of the shift, each redshift estimate sample is
                shifted individually to obtain a more accurate error estimate.
            amplitude (:obj:`SampledValue` or :obj:`float`):
                An optional ampltude factor applied to the redshift estimate.
                Same rules as for the ``dz`` parameter.

        Returns:
            :obj:`RedshiftData`:
        """
        if isinstance(amplitude, SampledValue):
            A_samples = amplitude.samples
            amplitude = amplitude.value
        else:
            A_samples = [amplitude] * self.n_samples
        if isinstance(dz, SampledValue):
            dz_samples = dz.samples
            dz = dz.value
        else:
            dz_samples = [dz] * self.n_samples

        bins = self.edges
        # shift main values
        data = shift_histogram(bins, self.data, A=amplitude, dx=dz)
        # shift the value samples
        samples = np.empty_like(self.samples)
        iter_samples = zip(self.samples, A_samples, dz_samples)
        for i, (sample, amplitude, dz) in enumerate(iter_samples):
            samples[i] = shift_histogram(bins, sample, A=amplitude, dx=dz)

        return self.__class__(
            binning=self.get_binning(),
            data=data,
            samples=samples,
            method=self.method,
            info=self.info,
        )





[docs]
@dataclass(frozen=True, repr=False)
class HistData(RedshiftData):
    """Container for histogram data.

    Contains the redshift binning, histogram counts, and resampled counts (e.g.
    from jackknife or bootstrap). The resampled values are used to compute error
    estimates and covariance/correlation matrices. Provides some plotting
    methods for convenience.

    Args:
        binning (:obj:`pandas.IntervalIndex`):
            The redshift bin edges used for this correlation function.
        data (:obj:`NDArray`):
            The correlation function values.
        samples (:obj:`NDArray`):
            The resampled correlation function values.
        method (:obj:`str`):
            The resampling method used, see :class:`~yaw.ResamplingConfig` for
            available options.
        info (:obj:`str`, optional):
            Descriptive text included in the headers of output files produced
            by :func:`CorrData.to_files`.
        density (:obj:`bool`):
            Whether the data is normalised, i.e. a density estimate.
    """

    density: bool = field(default=False)
    """Whether the data is normalised, i.e. a density estimate."""

    def __eq__(self, other: object) -> bool:
        parent_eq = super().__eq__(other)
        if isinstance(other, self.__class__):
            return self.density == other.density
        return parent_eq

    @property
    def _dat_desc(self) -> str:
        n = "normalised " if self.density else " "
        return f"# n(z) {n}histogram with symmetric 68% percentile confidence"

    @property
    def _smp_desc(self) -> str:
        n = "normalised " if self.density else " "
        return f"# {self.n_samples} {self.method} n(z) {n}histogram samples"

    @property
    def _cov_desc(self) -> str:
        n = "normalised " if self.density else " "
        return f"# n(z) {n}histogram covariance matrix ({self.n_bins}x{self.n_bins})"


[docs]
    @classmethod
    def from_files(cls, path_prefix: TypePathStr) -> HistData:
        new = super().from_files(path_prefix)
        with open(f"{path_prefix}.dat") as f:
            line = f.readline()
            density = "normalised" in line
        return cls(
            binning=new.get_binning(),
            data=new.data,
            samples=new.samples,
            method=new.method,
            density=density,
        )



[docs]
    def normalised(self, *args, **kwargs) -> HistData:
        """Obtain a normalised copy of the data.

        Normalises the data by integration along the redshift axis. This sets
        the containers :obj:`density` flag to ``True``.

        Returns:
            :obj:`HistData`
        """
        if self.density:  # guard from repeatedly altering the data
            return self
        zmin, zmax = self.edges[[0, -1]]
        width_correction = (zmax - zmin) / (self.n_bins * self.dz)
        data = self.data * width_correction
        samples = self.samples * width_correction
        norm = np.nansum(self.dz * data)
        return self.__class__(
            binning=self.get_binning(),
            data=data / norm,
            samples=samples / norm,
            method=self.method,
            info=self.info,
            density=True,
        )



[docs]
    def mean(self) -> SampledValue:
        """Compute the mean redshift.

        Returns:
            :obj:`~yaw.core.SampledValue`:
                Mean redshift for the redshift data and its samples in a data
                container.
        """
        normed = self.normalised()
        norm = np.nansum(normed.data)
        mean = np.nansum(normed.data * normed.mids) / norm
        samples = np.nansum(normed.samples * normed.mids, axis=1) / norm
        return SampledValue(value=mean, samples=samples, method=normed.method)



[docs]
    def rebin(self, bins: NDArray) -> HistData:
        """Recomute the data for a different redshift binning.

        .. Warning::
            The result may be inaccurate since the result is interpolated
            step-wise.

        Args:
            bins (:obj:`NDArray`):
                Edges of the new redshift bins. May exceed or cover just a
                fraction of the original redshift range.

        Returns:
            :obj:`HistData`:
        """
        result = super().rebin(bins)
        object.__setattr__(self, "density", self.density)
        return result



[docs]
    def shift(
        self, dz: float | SampledValue = 0.0, *, amplitude: float | SampledValue = 1.0
    ) -> HistData:
        """Shifts the data along the redshift axis.

        The shifting is performed by recomputing the histogram with its original
        redshift bins which are shifted by some amount.

        .. Warning::
            The result may be inaccurate since the result is interpolated
            step-wise.

        Args:
            dz (:obj:`SampledValue` or :obj:`float`):
                The amplitude of the shift along the redshift axis. If the input
                provides samples of the shift, each redshift estimate sample is
                shifted individually to obtain a more accurate error estimate.
            amplitude (:obj:`SampledValue` or :obj:`float`):
                An optional ampltude factor applied to the redshift estimate.
                Same rules as for the ``dz`` parameter.

        Returns:
            :obj:`HistData`:
        """
        result = super().shift(dz, amplitude=amplitude)
        if amplitude == 1.0:
            object.__setattr__(self, "density", self.density)
        else:
            object.__setattr__(self, "density", False)
        return result