Source code for yaw.redshifts

"""
Implements two primary classes to store estimates of redshift distributions.

HistData is a tool to measure a redshift distribution histogram from a catalog
with redshift data. Stores histogram counts, jackknife samples (from spatial
patches) and a covariance matrix, similar to yaw.CorrData.

RedshiftData is a container for the final clustering redshift estimate. Can be
constructed directly from cross- and optional autocorrelation functions. Similar
to yaw.CorrData, but normalises all data by the width of the redshift bins.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import numpy as np
import scipy.optimize

from yaw.config import Configuration
from yaw.correlation.corrfunc import CorrData
from yaw.options import PlotStyle
from yaw.utils import parallel
from yaw.utils.logging import Indicator

if TYPE_CHECKING:
    from numpy.typing import NDArray
    from typing_extensions import Self

    from yaw.binning import Binning
    from yaw.catalog import Catalog, Patch
    from yaw.config import BinningConfig
    from yaw.correlation.corrfunc import CorrFunc, TypeCorrData

__all__ = [
    "HistData",
    "RedshiftData",
]

logger = logging.getLogger(__name__)


def _redshift_histogram(patch: Patch, binning: Binning) -> NDArray:
    """Worker function that computes a redshift histgram from a given patch and
    binning."""
    redshifts = patch.redshifts
    # numpy histogram uses the bin edges as closed intervals on both sides
    if binning.closed == "right":
        mask = redshifts > binning.edges[0]
    else:
        mask = redshifts < binning.edges[-1]

    weights = patch.weights[mask] if patch.has_weights else None

    counts, _ = np.histogram(redshifts[mask], binning.edges, weights=weights)
    return counts.astype(np.float64)


def resample_jackknife(observations: NDArray, patch_rows: bool = True) -> NDArray:
    """
    Compute jackknife samples from an array of histogram counts with shape
    (``num_bins``, ``num_patches``) per patch and redshift and the
    corresponding bin edges.
    """
    if not patch_rows:
        observations = observations.T
    num_patches = observations.shape[0]

    idx_range = np.arange(0, num_patches)
    idx_samples_full = np.tile(idx_range, num_patches)

    idx_jackknife = np.delete(idx_samples_full, idx_range).reshape((num_patches, -1))
    return observations[idx_jackknife].sum(axis=1)


[docs] class HistData(CorrData): """ Container for a redshift histogram. Implements convenience methods to compute a redshift histogram from a :obj:`~yaw.Catalog`, with jackknife samples constructed from the catalogs spatial patches. There are methods to estimate the standard error, covariance and correlation matrix, normalisation of the pair counts. Provides plotting methods and additionally implements ``len()``, comparison with the ``==`` operator and addition with ``+``/``-``. Args: binning: The redshift :obj:`~yaw.Binning` used to compute the histogram. data: Array containing the bin counts in each of the redshift bin. samples: 2-dim array containing jackknife samples of the histogram counts, expected to have shape (:obj:`num_samples`, :obj:`num_bins`). """ __slots__ = ("binning", "data", "samples")
[docs] @classmethod def from_catalog( cls: type[Self], catalog: Catalog, config: Configuration | BinningConfig, progress: bool = False, max_workers: int | None = None, ) -> Self: """ Compute a redshift histogram from a data catalog. Args: catalog: Data :obj:`~yaw.Catalog` with attached redshift data. config: :obj:`~yaw.Configuration` defining the redshift binning. Keyword Args: progress: Show a progress on the terminal (disabled by default). max_workers: Limit the number of parallel workers for this operation (all by default, only multiprocessing). """ if parallel.on_root(): logger.info("computing redshift histogram") if isinstance(config, Configuration): max_workers = max_workers or config.max_workers config = config.binning max_workers = parallel.ignore_max_workers_mpi(max_workers) patch_count_iter = parallel.iter_unordered( _redshift_histogram, catalog.values(), func_kwargs=dict(binning=config.binning), max_workers=max_workers, ) if progress: patch_count_iter = Indicator(patch_count_iter, len(catalog)) counts = np.empty((len(catalog), config.num_bins)) for i, patch_count in enumerate(patch_count_iter): counts[i] = patch_count parallel.bcast_auto(counts, root=0) return cls( config.binning.copy(), counts.sum(axis=0), resample_jackknife(counts), )
@property def _description_data(self) -> str: """Descriptive comment for header of .dat file.""" return "n(z) histogram with symmetric 68% percentile confidence" @property def _description_samples(self) -> str: """Descriptive comment for header of .smp file.""" return f"{self.num_samples} n(z) histogram jackknife samples" @property def _description_covariance(self) -> str: """Descriptive comment for header of .cov file.""" return f"n(z) histogram covariance matrix ({self.num_bins}x{self.num_bins})" _default_plot_style = PlotStyle.step
[docs] def normalised(self, *args, **kwargs) -> Self: """ Normalises the redshift histogram to a probability density. Any function arguments are discarded. Returns: A new instance with a normalisation factor applied to the counts and jackknife samples. """ if parallel.on_root(): logger.debug("normalising %s", type(self).__name__) edges = self.binning.edges dz = self.binning.dz width_correction = (edges.min() - edges.max()) / (self.num_bins * dz) data = self.data * width_correction samples = self.samples * width_correction norm = np.nansum(dz * data) data /= norm samples /= norm return type(self)(self.binning, data, samples)
[docs] class RedshiftData(CorrData): """ Container for a clustering redshift estimate. Implements conveniences methods to estimate the standard error, covariance and correlation matrix, normalisation, and plotting methods. Additionally implements ``len()``, comparison with the ``==`` operator and addition with ``+``/``-``. Args: binning: The redshift :obj:`~yaw.Binning` applied to the data. data: Array containing the values in each of the `N` redshift bin. samples: 2-dim array containing `M` jackknife samples of the data, expected to have shape (:obj:`num_samples`, :obj:`num_bins`). """ __slots__ = ("binning", "data", "samples")
[docs] @classmethod def from_corrdata( cls: type[Self], cross_data: CorrData, ref_data: CorrData | None = None, unk_data: CorrData | None = None, ) -> Self: """ Compute a redshift estimate from a set of cross- and autocorrelation functions. Computes the clustering redshift estimate with optional correction for galaxy sample biases: .. math:: n(z) = \\frac{w_{sp}(z)}{\\sqrt{\\Delta z^2 \\, w_{ss}(z) \\, w_{pp}(z)}} Args: cross_data: The cross-correlation function amplitude (:math:`w_{sp}`), must be a :obj:`~yaw.CorrData` instance. Keyword Args: ref_data: Optional autocorrelation function amplitude of the reference sample (:math:`w_{ss}`), must be a :obj:`~yaw.CorrData` nstance. unk_data: Optional autocorrelation function amplitude of the unknown sample (:math:`w_{pp}`), must be a :obj:`~yaw.CorrData` instance. Typically unknown quantity. Returns: The redshift estimate as :obj:`~yaw.RedshiftData`. """ if parallel.on_root(): logger.debug( "computing clustering redshifts from correlation function samples" ) w_sp_data = cross_data.data w_sp_samp = cross_data.samples used_autocorrs = [] if ref_data is None: w_ss_data = np.float64(1.0) w_ss_samp = np.float64(1.0) else: ref_data.is_compatible(cross_data, require=True) w_ss_data = ref_data.data w_ss_samp = ref_data.samples used_autocorrs.append("reference") if unk_data is None: w_pp_data = np.float64(1.0) w_pp_samp = np.float64(1.0) else: unk_data.is_compatible(cross_data, require=True) w_pp_data = unk_data.data w_pp_samp = unk_data.samples used_autocorrs.append("unknown") if parallel.on_root(): if len(used_autocorrs) > 0: bias_info_str = " and ".join(used_autocorrs) else: bias_info_str = "no" logger.debug("mitigating %s sample bias", bias_info_str) N = cross_data.num_samples dz2_data = cross_data.binning.dz**2 dz2_samples = np.tile(dz2_data, N).reshape((N, -1)) nz_data = w_sp_data / np.sqrt(dz2_data * w_ss_data * w_pp_data) nz_samples = w_sp_samp / np.sqrt(dz2_samples * w_ss_samp * w_pp_samp) return cls(cross_data.binning, nz_data, nz_samples)
[docs] @classmethod def from_corrfuncs( cls: type[Self], cross_corr: CorrFunc, ref_corr: CorrFunc | None = None, unk_corr: CorrFunc | None = None, ) -> Self: """ Compute a redshift estimate from a set of cross- and autocorrelation pair counts. Computes the correlation functions from the input pair counts and calls :meth:`from_corrdata()`. Args: cross_corr: The cross-correlation function pair counts (:math:`w_{sp}`), must be a :obj:`~yaw.CorrFunc` instance. Keyword Args: ref_corr: Optional autocorrelation function pair counts of the reference sample (:math:`w_{ss}`), must be a :obj:`~yaw.CorrFunc` instance. unk_corr: Optional autocorrelation function pair counts of the unknown sample (:math:`w_{pp}`), must be a :obj:`~yaw.CorrFunc` instance. Typically an unknown quantity. Returns: The redshift estimate as :obj:`~yaw.RedshiftData`. """ if ref_corr is not None: cross_corr.is_compatible(ref_corr, require=True) if unk_corr is not None: cross_corr.is_compatible(unk_corr, require=True) cross_data = cross_corr.sample() ref_data = ref_corr.sample() if ref_corr else None unk_data = unk_corr.sample() if unk_corr else None return cls.from_corrdata(cross_data, ref_data, unk_data)
@property def _description_data(self) -> str: """Descriptive comment for header of .dat file.""" return "n(z) estimate with symmetric 68% percentile confidence" @property def _description_samples(self) -> str: """Descriptive comment for header of .smp file.""" return f"{self.num_samples} n(z) jackknife samples" @property def _description_covariance(self) -> str: """Descriptive comment for header of .cov file.""" return f"n(z) estimate covariance matrix ({self.num_bins}x{self.num_bins})" _default_plot_style = PlotStyle.point
[docs] def normalised(self, target: TypeCorrData | None = None) -> Self: """ Attempts to normalise the redshift estimate to a probability density. By default rescales data and jackknife samples by computing a normalisation factor obtained from integrating the data over the redshift range of the binning. Alternatively, the normalisation may be optained by fitting to another data container to achieve a relative normalisation. .. warning:: Both approaches are inaccuarte due to noise fluctions (in particular negative correlation amplitudes). Keyword Args: target: Optional, when provided used as reference to fit the normalisation factor, must be a subclass of :obj:`~yaw.CorrData`. Returns: A new instance with a normalisation factor applied to the data and jackknife samples. """ if parallel.on_root(): msg = "normalising %s" if target is not None: msg += " to target distribution" logger.debug(msg, type(self).__name__) if target is None: norm = np.nansum(self.binning.dz * self.data) else: y_from = self.data y_target = target.data mask = np.isfinite(y_from) & np.isfinite(y_target) & (y_target > 0.0) popt, _ = scipy.optimize.curve_fit( lambda _, norm: y_from[mask] / norm, xdata=target.binning.mids[mask], ydata=y_target[mask], p0=[1.0], sigma=1 / y_target[mask], # usually works better for noisy data ) norm = popt[0] data = self.data / norm samples = self.samples / norm return type(self)(self.binning, data, samples)