"""This module implements a special containers that compute and describe
redshift distributions. These containers provide methods for error and
covariance estimation, plotting and computing mean redshifts.
True redshift distributions can be expressed through the :obj:`HistData` class
and can be computed from the :meth:`~yaw.catalogs.BaseCatalog.true_redshifts`
method of :obj:`~yaw.catalogs.BaseCatalog`.
Clustering redshift estimates can be expressed through the :obj:`RedshiftData`
class. Its different constructor methods, :meth:`~RedshiftData.from_corrdata`
and :meth:`~RedshiftData.from_corrfuncs`, provide easy interfaces to compute the
clustering redshifts and to mitigate the evolving galaxy bias, if the
corresponding autocorrelation functions (e.g. of the reference sample) are
known.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
import scipy.optimize
from deprecated import deprecated
from yaw.config import ResamplingConfig
from yaw.core.containers import SampledValue
from yaw.core.logging import TimedLog
from yaw.core.math import rebin, shift_histogram
from yaw.core.utils import TypePathStr
from yaw.correlation import CorrData
if TYPE_CHECKING: # pragma: no cover
from numpy.typing import NDArray
from yaw.correlation import CorrFunc
__all__ = ["RedshiftData", "HistData"]
logger = logging.getLogger(__name__)
[docs]
@dataclass(frozen=True, repr=False, eq=False)
class RedshiftData(CorrData):
"""Container class object for redshift estimates.
Contains the redshift binning, estimated fraction of galaxies at the given
redshift (**not** a density), and resampled fractions (e.g. from jackknife
or bootstrap). The resampled values are used to compute error estimates and
covariance/correlation matrices. Provides some plotting methods for
convenience.
This container holds data in the form of
:math:`\\frac{w_\\rm{sp}(z)}{\\sqrt{\\Delta z \\, w_\\rm{ss}(z) \\, w_\\rm{pp}(z)}}`,
where :math:`w_\\rm{sp}` is the crosscorrelation function, and
:math:`w_\\rm{ss}` and :math:`w_\\rm{pp}` are autocorrelation functions that
account for the evolving galaxy bias. If no autocorrelation is provided, the
data is still scaled by :math:`1/\\Delta z` compared to the crosscorrelation
data in :obj:`~yaw.correlation.CorrData`.
.. Note::
This container should be constructed from a crosscorrelation measurement
with one of the preferred methods :meth:`from_corrdata` or
:meth:`from_corrfuncs`. These additionally allow galaxy bias mitigation
by specifying any additionally measured autocorrelation functions.
The comparison, addition and subtraction and indexing rules are inherited
from :obj:`~yaw.core.containers.SampledData`, check the examples there.
.. rubric:: Examples
Create a redshift estimate from a crosscorrelation function and correct for
the evolving bias of the reference sample using its autocorrelation
function:
>>> from yaw.examples import w_sp # crosscorrelation
>>> from yaw.examples import w_ss # reference sample autocorrelation
>>> nz = yaw.yaw.RedshiftData.from_corrfuncs(w_sp, ref_corr=w_ss)
RedshiftData(n_bins=30, z='0.070...1.420', n_samples=64, method='jackknife')
Use a different estimator when sampling the autocorrelation function, e.g.
the Peebles-Hauser estimator:
>>> nz = yaw.RedshiftData.from_corrfuncs(w_sp, ref_corr=w_ss, ref_est="PH")
RedshiftData(n_bins=30, z='0.070...1.420', n_samples=64, method='jackknife')
View the data for a subset of the redshift bins:
>>> nz.bins[5:9].data
array([2.5234212 , 1.96617211, 1.05342 , 0.67866257])
View the same subset as series:
>>> nz.bins[5:9].get_data()
(0.295, 0.34] 2.523421
(0.34, 0.385] 1.966172
(0.385, 0.43] 1.053420
(0.43, 0.475] 0.678663
dtype: float64
Get the redshift bin centers for these bins:
>>> nz.bins[5:9].mids
array([0.3175, 0.3625, 0.4075, 0.4525])
Plot the redshift distribution, indicating a zero-line
>>> nz.plot(zero_line=True)
<Axes: >
.. figure:: ../../_static/ncc_example.png
:width: 400
:alt: example clustering redshfit estimate
Args:
binning (:obj:`pandas.IntervalIndex`):
The redshift bin edges used for this correlation function.
data (:obj:`NDArray`):
The correlation function values.
samples (:obj:`NDArray`):
The resampled correlation function values.
method (:obj:`str`):
The resampling method used, see :class:`~yaw.ResamplingConfig` for
available options.
info (:obj:`str`, optional):
Descriptive text included in the headers of output files produced
by :func:`CorrData.to_files`.
"""
[docs]
@classmethod
@deprecated(reason="renamed to RedshiftData.from_corrdata", version="2.3.2")
def from_correlation_data(cls, *args, **kwargs):
"""
.. deprecated:: 2.3.2
Renamed to :meth:`from_corrdata`.
"""
return cls.from_corrdata(*args, **kwargs) # pragma: no cover
[docs]
@classmethod
def from_corrdata(
cls,
cross_data: CorrData,
ref_data: CorrData | None = None,
unk_data: CorrData | None = None,
info: str | None = None,
) -> RedshiftData:
"""Compute redshift estimate from readily sampled function data.
The required argument is a crosscorrelation measurement, additional
parameters can specify sample autocorrelation measurements that are used
to mitigate the evolving galaxy bias.
Args:
cross_corr (:obj:`CorrData`):
Data from the sampled cross-correlation function.
ref_corr (:obj:`CorrData`, optional):
Data from the sampled reference sample autocorrelation function.
Used to mitigate reference bias evolution.
unk_corr (:obj:`CorrData`, optional):
Data from the sampled unknown sample autocorrelation function.
Used to mitigate unknown bias evolution.
Returns:
:obj:`RedshiftData`
"""
logger.debug("computing clustering redshifts from correlation function samples")
w_sp_data = cross_data.data
w_sp_samp = cross_data.samples
mitigate = []
if ref_data is None:
w_ss_data = np.float64(1.0)
w_ss_samp = np.float64(1.0)
else:
try:
ref_data.is_compatible(cross_data, require=True)
except ValueError as e:
raise ValueError(
"'ref_corr' correlation data is not compatible with 'cross_data'"
) from e
w_ss_data = ref_data.data
w_ss_samp = ref_data.samples
mitigate.append("reference")
if unk_data is None:
w_pp_data = np.float64(1.0)
w_pp_samp = np.float64(1.0)
else:
try:
unk_data.is_compatible(cross_data, require=True)
except ValueError as e:
raise ValueError(
"'unk_data' correlation data is not compatible with " "'cross_data'"
) from e
w_pp_data = unk_data.data
w_pp_samp = unk_data.samples
mitigate.append("unknown")
if len(mitigate) > 0:
logger.debug("mitigating %s sample bias", " and ".join(mitigate))
N = cross_data.n_samples
dzsq_data = cross_data.dz**2
dzsq_samp = np.tile(dzsq_data, N).reshape((N, -1))
nz_data = w_sp_data / np.sqrt(dzsq_data * w_ss_data * w_pp_data)
nz_samp = w_sp_samp / np.sqrt(dzsq_samp * w_ss_samp * w_pp_samp)
return cls(
binning=cross_data.binning,
data=nz_data,
samples=nz_samp,
method=cross_data.method,
info=info,
)
[docs]
@classmethod
@deprecated(reason="renamed to RedshiftData.from_corrfuncs", version="2.3.2")
def from_correlation_functions(cls, *args, **kwargs):
"""
.. deprecated:: 2.3.2
Renamed to :meth:`from_corrfuncs`.
"""
return cls.from_corrfuncs(*args, **kwargs) # pragma: no cover
[docs]
@classmethod
def from_corrfuncs(
cls,
cross_corr: CorrFunc,
ref_corr: CorrFunc | None = None,
unk_corr: CorrFunc | None = None,
*,
cross_est: str | None = None,
ref_est: str | None = None,
unk_est: str | None = None,
config: ResamplingConfig | None = None,
info: str | None = None,
) -> RedshiftData:
"""Sample correlation functions to compute a redshift estimate.
The required argument is a crosscorrelation measurement, additional
parameters can specify sample autocorrelation measurements that are used
to mitigate the evolving galaxy bias.
Args:
cross_corr (:obj:`CorrFunc`):
The measured cross-correlation function.
ref_corr (:obj:`CorrFunc`, optional):
The measured reference sample autocorrelation function. Used to
mitigate reference bias evolution.
unk_corr (:obj:`CorrFunc`, optional):
The measured unknown sample autocorrelation function. Used to
mitigate unknown bias evolution.
Returns:
:obj:`RedshiftData`
"""
if config is None:
config = ResamplingConfig()
with TimedLog(
logger.debug,
f"estimating clustering redshifts with method '{config.method}'",
):
# check compatibilty before sampling anything
if ref_corr is not None:
try:
cross_corr.is_compatible(ref_corr, require=True)
except ValueError as e:
raise ValueError(
"'ref_corr' correlation function is not compatible "
"with 'cross_corr'"
) from e
if unk_corr is not None:
try:
cross_corr.is_compatible(unk_corr, require=True)
except ValueError as e:
raise ValueError(
"'unk_corr' correlation function is not compatible "
"with 'cross_corr'"
) from e
# sample pair counts and evaluate estimator
cross_data = cross_corr.sample(config, estimator=cross_est)
if ref_corr is not None:
ref_data = ref_corr.sample(config, estimator=ref_est)
else:
ref_data = None
if unk_corr is not None:
unk_data = unk_corr.sample(config, estimator=unk_est)
else:
unk_data = None
return cls.from_corrdata(
cross_data=cross_data, ref_data=ref_data, unk_data=unk_data, info=info
)
@property
def _dat_desc(self) -> str:
return "# n(z) estimate with symmetric 68% percentile confidence"
@property
def _smp_desc(self) -> str:
return f"# {self.n_samples} {self.method} n(z) samples"
@property
def _cov_desc(self) -> str:
return f"# n(z) estimate covariance matrix ({self.n_bins}x{self.n_bins})"
[docs]
def normalised(self, to: CorrData | None = None) -> RedshiftData:
"""Obtain a normalised copy of the data.
Either attempts to normalise the data by integration along the redshift
axis or by fitting it to a provided reference data container (e.g. a
known redshift distribution in a :obj:`HistData` container).
.. Note::
The fit does not use the uncertainties but weights the data points
inversely to their amplitude.
Args:
to (:obj:`CorrData`, optional):
Reference data to which the stored values are normalised by
fitting.
Returns:
:obj:`RedshiftData`
"""
if to is None:
norm = np.nansum(self.dz * self.data)
else:
y_from = self.data
y_to = to.data
mask = np.isfinite(y_from) & np.isfinite(y_to) & (y_to > 0.0)
norm = scipy.optimize.curve_fit(
lambda x, norm: y_from[mask] / norm, # x is a dummy variable
xdata=to.mids[mask],
ydata=y_to[mask],
p0=[1.0],
sigma=1 / y_to[mask],
)[0][0]
return self.__class__(
binning=self.get_binning(),
data=self.data / norm,
samples=self.samples / norm,
method=self.method,
info=self.info,
)
[docs]
def mean(self):
"""Attempts to compute a mean redshift.
.. Warning::
This should be just considered an estimate since the redshift
estimate is not a true probability density, due to residual negative
correlation amplitudes.
Returns:
:obj:`~yaw.core.SampledValue`:
Mean redshift for the redshift data and its samples in a data
container.
"""
norm = np.nansum(self.data)
mean = np.nansum(self.data * self.mids) / norm
samples = np.nansum(self.samples * self.mids, axis=1) / norm
return SampledValue(value=mean, samples=samples, method=self.method)
[docs]
def rebin(self, bins: NDArray) -> RedshiftData:
"""Attempts recomute the data for a different redshift binning.
.. Warning::
The result may be inaccurate since the redshift estimate is not a
true probability density, due to residual negative correlation
amplitudes.
Args:
bins (:obj:`NDArray`):
Edges of the new redshift bins. May exceed or cover just a
fraction of the original redshift range.
Returns:
:obj:`RedshiftData`:
"""
old_bins = self.edges
# shift main values
data = rebin(bins, old_bins, self.data)
# shift the value samples
samples = np.empty([self.n_samples, len(bins) - 1], data.dtype)
for i, sample in enumerate(self.samples):
samples[i] = rebin(bins, old_bins, sample)
return self.__class__(
binning=pd.IntervalIndex.from_breaks(bins),
data=data,
samples=samples,
method=self.method,
info=self.info,
)
[docs]
def shift(
self, dz: float | SampledValue = 0.0, *, amplitude: float | SampledValue = 1.0
) -> RedshiftData:
"""Attempts shift the data along the redshift axis.
The shifting is performed by recomputing the redshift estimate with its
original redshift bins which are shifted by some amount.
.. Warning::
The result may be inaccurate since the redshift estimate is not a
true probability density, due to residual negative correlation
amplitudes.
Args:
dz (:obj:`SampledValue` or :obj:`float`):
The amplitude of the shift along the redshift axis. If the input
provides samples of the shift, each redshift estimate sample is
shifted individually to obtain a more accurate error estimate.
amplitude (:obj:`SampledValue` or :obj:`float`):
An optional ampltude factor applied to the redshift estimate.
Same rules as for the ``dz`` parameter.
Returns:
:obj:`RedshiftData`:
"""
if isinstance(amplitude, SampledValue):
A_samples = amplitude.samples
amplitude = amplitude.value
else:
A_samples = [amplitude] * self.n_samples
if isinstance(dz, SampledValue):
dz_samples = dz.samples
dz = dz.value
else:
dz_samples = [dz] * self.n_samples
bins = self.edges
# shift main values
data = shift_histogram(bins, self.data, A=amplitude, dx=dz)
# shift the value samples
samples = np.empty_like(self.samples)
iter_samples = zip(self.samples, A_samples, dz_samples)
for i, (sample, amplitude, dz) in enumerate(iter_samples):
samples[i] = shift_histogram(bins, sample, A=amplitude, dx=dz)
return self.__class__(
binning=self.get_binning(),
data=data,
samples=samples,
method=self.method,
info=self.info,
)
[docs]
@dataclass(frozen=True, repr=False)
class HistData(RedshiftData):
"""Container for histogram data.
Contains the redshift binning, histogram counts, and resampled counts (e.g.
from jackknife or bootstrap). The resampled values are used to compute error
estimates and covariance/correlation matrices. Provides some plotting
methods for convenience.
Args:
binning (:obj:`pandas.IntervalIndex`):
The redshift bin edges used for this correlation function.
data (:obj:`NDArray`):
The correlation function values.
samples (:obj:`NDArray`):
The resampled correlation function values.
method (:obj:`str`):
The resampling method used, see :class:`~yaw.ResamplingConfig` for
available options.
info (:obj:`str`, optional):
Descriptive text included in the headers of output files produced
by :func:`CorrData.to_files`.
density (:obj:`bool`):
Whether the data is normalised, i.e. a density estimate.
"""
density: bool = field(default=False)
"""Whether the data is normalised, i.e. a density estimate."""
def __eq__(self, other: object) -> bool:
parent_eq = super().__eq__(other)
if isinstance(other, self.__class__):
return self.density == other.density
return parent_eq
@property
def _dat_desc(self) -> str:
n = "normalised " if self.density else " "
return f"# n(z) {n}histogram with symmetric 68% percentile confidence"
@property
def _smp_desc(self) -> str:
n = "normalised " if self.density else " "
return f"# {self.n_samples} {self.method} n(z) {n}histogram samples"
@property
def _cov_desc(self) -> str:
n = "normalised " if self.density else " "
return f"# n(z) {n}histogram covariance matrix ({self.n_bins}x{self.n_bins})"
[docs]
@classmethod
def from_files(cls, path_prefix: TypePathStr) -> HistData:
new = super().from_files(path_prefix)
with open(f"{path_prefix}.dat") as f:
line = f.readline()
density = "normalised" in line
return cls(
binning=new.get_binning(),
data=new.data,
samples=new.samples,
method=new.method,
density=density,
)
[docs]
def normalised(self, *args, **kwargs) -> HistData:
"""Obtain a normalised copy of the data.
Normalises the data by integration along the redshift axis. This sets
the containers :obj:`density` flag to ``True``.
Returns:
:obj:`HistData`
"""
if self.density: # guard from repeatedly altering the data
return self
zmin, zmax = self.edges[[0, -1]]
width_correction = (zmax - zmin) / (self.n_bins * self.dz)
data = self.data * width_correction
samples = self.samples * width_correction
norm = np.nansum(self.dz * data)
return self.__class__(
binning=self.get_binning(),
data=data / norm,
samples=samples / norm,
method=self.method,
info=self.info,
density=True,
)
[docs]
def mean(self) -> SampledValue:
"""Compute the mean redshift.
Returns:
:obj:`~yaw.core.SampledValue`:
Mean redshift for the redshift data and its samples in a data
container.
"""
normed = self.normalised()
norm = np.nansum(normed.data)
mean = np.nansum(normed.data * normed.mids) / norm
samples = np.nansum(normed.samples * normed.mids, axis=1) / norm
return SampledValue(value=mean, samples=samples, method=normed.method)
[docs]
def rebin(self, bins: NDArray) -> HistData:
"""Recomute the data for a different redshift binning.
.. Warning::
The result may be inaccurate since the result is interpolated
step-wise.
Args:
bins (:obj:`NDArray`):
Edges of the new redshift bins. May exceed or cover just a
fraction of the original redshift range.
Returns:
:obj:`HistData`:
"""
result = super().rebin(bins)
object.__setattr__(self, "density", self.density)
return result
[docs]
def shift(
self, dz: float | SampledValue = 0.0, *, amplitude: float | SampledValue = 1.0
) -> HistData:
"""Shifts the data along the redshift axis.
The shifting is performed by recomputing the histogram with its original
redshift bins which are shifted by some amount.
.. Warning::
The result may be inaccurate since the result is interpolated
step-wise.
Args:
dz (:obj:`SampledValue` or :obj:`float`):
The amplitude of the shift along the redshift axis. If the input
provides samples of the shift, each redshift estimate sample is
shifted individually to obtain a more accurate error estimate.
amplitude (:obj:`SampledValue` or :obj:`float`):
An optional ampltude factor applied to the redshift estimate.
Same rules as for the ``dz`` parameter.
Returns:
:obj:`HistData`:
"""
result = super().shift(dz, amplitude=amplitude)
if amplitude == 1.0:
object.__setattr__(self, "density", self.density)
else:
object.__setattr__(self, "density", False)
return result