Source code for yaw.core.containers

"""This module defines a few containers used throughout other modules.

Most importantly, they implement two containers for data with attached samples
(e.g. jackknife or bootstrap). Scalar values are implemented in
:obj:`SampledValue`, data that has been computed from redshift bins in
:obj:`SampledData`, which also serves as base class for most other containers
with redshift binning.
"""

from __future__ import annotations

import warnings
from collections.abc import Iterator, Sequence
from dataclasses import dataclass, field, fields
from typing import TYPE_CHECKING, Callable, Generic, NamedTuple, TypeVar

import numpy as np
import pandas as pd

from yaw.config import OPTIONS
from yaw.core.abc import BinnedQuantity, concatenate_bin_edges
from yaw.core.math import cov_from_samples

if TYPE_CHECKING:  # pragma: no cover
    from numpy.typing import NDArray
    from pandas import DataFrame, IntervalIndex, Series

__all__ = ["Indexer", "PatchIDs", "PatchCorrelationData", "SampledValue", "SampledData"]


_TK = TypeVar("_TK")
_TV = TypeVar("_TV")


class Indexer(Generic[_TK, _TV], Iterator):
    """Helper class to implemented a class attribute that can be used as
    indexer and iterator for the classes stored data (e.g. indexing patches or
    redshift bins).
    """

    def __init__(self, inst: _TV, builder: Callable[[_TV, _TK], _TV]) -> None:
        """Construct a new indexer.

        Args:
            inst:
                Class instance on which the indexing operations are applied.
            builder:
                Callable signature ``builder(inst, item) -> inst`` that
                constructs a new class instance with the indexing specified from
                ``item`` applied.


        The resulting indexer supports indexing and slicing (depending on the
        subclass implementation), as well as iteration, where instances holding
        individual items are yielded.
        """
        self._inst = inst
        self._builder = builder
        self._iter_loc = 0

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self._inst.__class__.__name__})"

    def __getitem__(self, item: _TK) -> _TV:
        return self._builder(self._inst, item)

    def __next__(self) -> _TV:
        """Returns the next value and increments the iterator location index."""
        try:
            val = self[self._iter_loc]
        except IndexError:
            raise StopIteration
        else:
            self._iter_loc += 1
            return val

    def __iter__(self) -> Iterator[_TV]:
        """Returns a new instance of this class to have an independent iterator
        location index"""
        return self.__class__(inst=self._inst, builder=self._builder)


class PatchIDs(NamedTuple):
    """Named tuple that can hold a pair of patch indices."""

    id1: int
    """First patch index."""
    id2: int
    """Second patch index."""


@dataclass(frozen=True)
class PatchCorrelationData:
    """Container to hold the result of a pair counting operation between two
    spatial patches.

    Args:
        patches (:obj:`PatchIDs`):
            The indices of used the patches.
        totals1 (:obj:`NDArray`):
            Total number of objects after binning by redshift in first patch.
        totals1 (:obj:`NDArray`):
            Total number of objects after binning by redshift in second patch.
        counts (:obj:`dict`):
            Dictionary listing the number of counted pairs after binning by
            redshift. Each item represents results from a different scale.
    """

    patches: PatchIDs
    totals1: NDArray
    totals2: NDArray
    counts: dict[str, NDArray]


_Tscalar = TypeVar("_Tscalar", bound=np.number)


[docs] @dataclass(frozen=True) class SampledValue(Generic[_Tscalar]): """Container to hold a scalar value with an empirically estimated uncertainty from resampling. Supports comparison of the values and samples with ``==`` and ``!=``. .. rubric:: Examples Create a value container with 100 assumed jackknife samples that scatter around zero with a standard deviation of 0.1: >>> from numpy.random import normal >>> samples = normal(loc=0.0, scale=0.01, size=101) >>> value = yaw.core.SampledValue(0.0, samples, method="jackknife") >>> value SampledValue(value=0, error=0.963, n_samples=100, method='jackknife') Args: value: Numerical, scalar value. samples (:obj:`NDArray`): Samples of ``value`` obtained from resampling methods. method (:obj:`str`): Resampling method used to obtain the data samples, see :class:`~yaw.ResamplingConfig` for available options. """ value: _Tscalar """Numerical, scalar value.""" samples: NDArray[_Tscalar] """Samples of ``value`` obtained from resampling methods.""" method: str """Resampling method used to obtain the data samples, see :class:`~yaw.ResamplingConfig` for available options.""" error: _Tscalar = field(init=False) """The uncertainty (standard error) of the value.""" def __post_init__(self) -> None: if self.method not in OPTIONS.method: raise ValueError(f"unknown sampling method '{self.method}'") if self.method == "bootstrap": error = np.std(self.samples, ddof=1, axis=0) else: # jackknife error = np.std(self.samples, ddof=0, axis=0) * (self.n_samples - 1) object.__setattr__(self, "error", error) def __repr__(self) -> str: string = self.__class__.__name__ value = self.value error = self.error n_samples = self.n_samples method = self.method return f"{string}({value=:.3g}, {error=:.3g}, {n_samples=}, {method=})" def __str__(self) -> str: return f"{self.value:+.3g}+/-{self.error:.3g}" def __eq__(self, other: object) -> bool: if isinstance(other, SampledValue): return ( self.samples.shape == other.samples.shape and self.method == other.method and self.value == other.value and np.all(self.samples == other.samples) ) return NotImplemented @property def n_samples(self) -> int: """Number of samples used for error estimate.""" return len(self.samples)
_Tdata = TypeVar("_Tdata", bound="SampledData")
[docs] @dataclass(frozen=True, repr=False) class SampledData(BinnedQuantity): """Container for data and resampled data with redshift binning. Contains the redshift binning, data vector, and resampled data vector (e.g. jackknife or bootstrap samples). The resampled values are used to compute error estimates and covariance/correlation matrices. Args: binning (:obj:`pandas.IntervalIndex`): The redshift binning applied to the data. data (:obj:`NDArray`): The data values, one for each redshift bin. samples (:obj:`NDArray`): The resampled data values (e.g. jackknife or bootstrap samples). method (:obj:`str`): The resampling method used, see :class:`~yaw.ResamplingConfig` for available options. The container supports addition and subtraction, which return a new instance of the container, holding the modified data. This requires that both operands are compatible (same binning and same sampling). The operands are applied to the ``data`` and ``samples`` attribtes. Furthermore, the container supports indexing and iteration over the redshift bins using the :obj:`SampledData.bins` attribute. This attribute yields instances of :obj:`SampledData` containing a single bin when iterating. Slicing and indexing follows the same rules as the underlying ``data`` :obj:`NDArray`. Refer to :obj:`~yaw.correlation.CorrData` for some indexing and iteration examples. .. rubric:: Examples Create a redshift binning: >>> import pandas as pd >>> bins = pd.IntervalIndex.from_breaks([0.1, 0.2, 0.3]) >>> bins IntervalIndex([(0.1, 0.2], (0.2, 0.3]], dtype='interval[float64, right]') Create some sample data for the bins with value 1 and five assumed jackknife samples normal-distributed around 1. >>> import numpy as np >>> n_bins, n_samples = len(bins), 5 >>> data = np.ones(n_bins) >>> samples = np.random.normal(1.0, size=(n_samples, n_bins)) Create the container: >>> values = yaw.core.SampledData(bins, data, samples, method="jackknife") >>> values SampledData(n_bins=2, z='0.100...0.300', n_samples=10, method='jackknife') Add the container to itself and verify that the values are doubled: >>> summed = values + values >>> summed.data array([2., 2.]) The same applies to the samples: >>> summed.samples / values.samples array([[2., 2.], [2., 2.], [2., 2.], [2., 2.], [2., 2.]]) """ binning: IntervalIndex """The redshift bin intervals.""" data: NDArray """The data values, one for each redshift bin.""" samples: NDArray """Samples of the data values, shape (# samples, # bins).""" method: str """The resampling method used.""" covariance: NDArray = field(init=False) """Covariance matrix automatically computed from the resampled values.""" def __post_init__(self) -> None: if self.data.shape != (self.n_bins,): raise ValueError("unexpected shape of 'data' array") if not self.samples.shape[1] == self.n_bins: raise ValueError("number of bins for 'data' and 'samples' do not match") if self.method not in OPTIONS.method: raise ValueError(f"unknown sampling method '{self.method}'") covmat = cov_from_samples(self.samples, self.method) object.__setattr__(self, "covariance", np.atleast_2d(covmat)) def __repr__(self) -> str: string = super().__repr__()[:-1] n_samples = self.n_samples method = self.method return f"{string}, {n_samples=}, {method=})" def __eq__(self, other: object) -> bool: if isinstance(other, self.__class__): return ( self.samples.shape == other.samples.shape and self.method == other.method and np.all(self.data == other.data) and np.all(self.samples == other.samples) and (self.binning == other.binning).all() ) return NotImplemented def __add__(self, other: object) -> _Tdata: if not isinstance(other, self.__class__): self.is_compatible(other, require=True) return self.__class__( binning=self.get_binning(), data=self.data + other.data, samples=self.samples + other.samples, method=self.method, ) return NotImplemented def __sub__(self, other: object) -> _Tdata: if not isinstance(other, self.__class__): self.is_compatible(other, require=True) return self.__class__( binning=self.get_binning(), data=self.data - other.data, samples=self.samples - other.samples, method=self.method, ) return NotImplemented @property def bins(self: _Tdata) -> Indexer[int | slice | Sequence, _Tdata]: def builder(inst: _Tdata, item: int | slice | Sequence) -> _Tdata: if isinstance(item, int): item = [item] # try to take subsets along bin axis binning = inst.get_binning()[item] data = inst.data[item] samples = inst.samples[:, item] # determine which extra attributes need to be copied init_attrs = {field.name for field in fields(inst) if field.init} copy_attrs = init_attrs - {"binning", "data", "samples"} kwargs = dict(binning=binning, data=data, samples=samples) kwargs.update({attr: getattr(inst, attr) for attr in copy_attrs}) return inst.__class__(**kwargs) return Indexer(self, builder) @property def n_samples(self) -> int: """Number of samples used for error estimate.""" return len(self.samples) @property def error(self) -> NDArray: """The uncertainty (standard error) of the data. Returns: :obj:`NDArray` """ return np.sqrt(np.diag(self.covariance))
[docs] def get_binning(self) -> IntervalIndex: return self.binning
[docs] def is_compatible(self, other: SampledData, require: bool = False) -> bool: """Check whether this instance is compatible with another instance. Ensures that both objects are instances of the same class, that the redshift binning is identical, that the number of samples agree, and that the resampling method is identical. Args: other (:obj:`BinnedQuantity`): Object instance to compare to. require (:obj:`bool`, optional) Raise a ValueError if any of the checks fail. Returns: :obj:`bool` """ if not super().is_compatible(other, require): return False if self.n_samples != other.n_samples: if require: raise ValueError("number of samples do not agree") return False if self.method != other.method: if require: raise ValueError("resampling method does not agree") return False return True
[docs] def concatenate_bins(self: _Tdata, *data: _Tdata) -> _Tdata: for other in data: self.is_compatible(other, require=True) all_data: list[_Tdata] = [self, *data] binning = concatenate_bin_edges(*all_data) # concatenate data data = np.concatenate([d.data for d in all_data]) samples = np.concatenate([d.samples for d in all_data], axis=1) # determine which extra attributes need to be copied init_attrs = {field.name for field in fields(self) if field.init} copy_attrs = init_attrs - {"binning", "data", "samples"} kwargs = dict(binning=binning, data=data, samples=samples) kwargs.update({attr: getattr(self, attr) for attr in copy_attrs}) return self.__class__(**kwargs)
[docs] def get_data(self) -> Series: """Get the data as :obj:`pandas.Series` with the binning as index.""" return pd.Series(self.data, index=self.binning)
[docs] def get_samples(self) -> DataFrame: """Get the data as :obj:`pandas.DataFrame` with the binning as index. The columns are labelled numerically and each represent one of the samples.""" return pd.DataFrame(self.samples.T, index=self.binning)
[docs] def get_error(self) -> Series: """Get value error estimate (diagonal of covariance matrix) as series with its corresponding redshift bin intervals as index. Returns: :obj:`pandas.Series` """ return pd.Series(self.error, index=self.binning)
[docs] def get_covariance(self) -> DataFrame: """Get value covariance matrix as data frame with its corresponding redshift bin intervals as index and column labels. Returns: :obj:`pandas.DataFrame` """ return pd.DataFrame( data=self.covariance, index=self.binning, columns=self.binning )
[docs] def get_correlation(self) -> DataFrame: """Get value correlation matrix as data frame with its corresponding redshift bin intervals as index and column labels. Returns: :obj:`pandas.DataFrame` """ stdev = np.sqrt(np.diag(self.covariance)) with warnings.catch_warnings(): warnings.simplefilter("ignore") corr = self.covariance / np.outer(stdev, stdev) corr[self.covariance == 0] = 0 return pd.DataFrame(data=corr, index=self.binning, columns=self.binning)