Source code for yaw.core.containers

"""This module defines a few containers used throughout other modules.

Most importantly, they implement two containers for data with attached samples
(e.g. jackknife or bootstrap). Scalar values are implemented in
:obj:`SampledValue`, data that has been computed from redshift bins in
:obj:`SampledData`, which also serves as base class for most other containers
with redshift binning.
"""

from __future__ import annotations

import warnings
from collections.abc import Iterator, Sequence
from dataclasses import dataclass, field, fields
from typing import TYPE_CHECKING, Callable, Generic, NamedTuple, TypeVar

import numpy as np
import pandas as pd

from yaw.config import OPTIONS
from yaw.core.abc import BinnedQuantity, concatenate_bin_edges
from yaw.core.math import cov_from_samples

if TYPE_CHECKING:  # pragma: no cover
    from numpy.typing import NDArray
    from pandas import DataFrame, IntervalIndex, Series

__all__ = ["Indexer", "PatchIDs", "PatchCorrelationData", "SampledValue", "SampledData"]


_TK = TypeVar("_TK")
_TV = TypeVar("_TV")


class Indexer(Generic[_TK, _TV], Iterator):
    """Helper class to implemented a class attribute that can be used as
    indexer and iterator for the classes stored data (e.g. indexing patches or
    redshift bins).
    """

    def __init__(self, inst: _TV, builder: Callable[[_TV, _TK], _TV]) -> None:
        """Construct a new indexer.

        Args:
            inst:
                Class instance on which the indexing operations are applied.
            builder:
                Callable signature ``builder(inst, item) -> inst`` that
                constructs a new class instance with the indexing specified from
                ``item`` applied.


        The resulting indexer supports indexing and slicing (depending on the
        subclass implementation), as well as iteration, where instances holding
        individual items are yielded.
        """
        self._inst = inst
        self._builder = builder
        self._iter_loc = 0

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self._inst.__class__.__name__})"

    def __getitem__(self, item: _TK) -> _TV:
        return self._builder(self._inst, item)

    def __next__(self) -> _TV:
        """Returns the next value and increments the iterator location index."""
        try:
            val = self[self._iter_loc]
        except IndexError:
            raise StopIteration
        else:
            self._iter_loc += 1
            return val

    def __iter__(self) -> Iterator[_TV]:
        """Returns a new instance of this class to have an independent iterator
        location index"""
        return self.__class__(inst=self._inst, builder=self._builder)


class PatchIDs(NamedTuple):
    """Named tuple that can hold a pair of patch indices."""

    id1: int
    """First patch index."""
    id2: int
    """Second patch index."""


@dataclass(frozen=True)
class PatchCorrelationData:
    """Container to hold the result of a pair counting operation between two
    spatial patches.

    Args:
        patches (:obj:`PatchIDs`):
            The indices of used the patches.
        totals1 (:obj:`NDArray`):
            Total number of objects after binning by redshift in first patch.
        totals1 (:obj:`NDArray`):
            Total number of objects after binning by redshift in second patch.
        counts (:obj:`dict`):
            Dictionary listing the number of counted pairs after binning by
            redshift. Each item represents results from a different scale.
    """

    patches: PatchIDs
    totals1: NDArray
    totals2: NDArray
    counts: dict[str, NDArray]


_Tscalar = TypeVar("_Tscalar", bound=np.number)



[docs]
@dataclass(frozen=True)
class SampledValue(Generic[_Tscalar]):
    """Container to hold a scalar value with an empirically estimated
    uncertainty from resampling.

    Supports comparison of the values and samples with ``==`` and ``!=``.

    .. rubric:: Examples

    Create a value container with 100 assumed jackknife samples that scatter
    around zero with a standard deviation of 0.1:

    >>> from numpy.random import normal
    >>> samples = normal(loc=0.0, scale=0.01, size=101)
    >>> value = yaw.core.SampledValue(0.0, samples, method="jackknife")
    >>> value
    SampledValue(value=0, error=0.963, n_samples=100, method='jackknife')

    Args:
        value:
            Numerical, scalar value.
        samples (:obj:`NDArray`):
            Samples of ``value`` obtained from resampling methods.
        method (:obj:`str`):
            Resampling method used to obtain the data samples, see
            :class:`~yaw.ResamplingConfig` for available options.
    """

    value: _Tscalar
    """Numerical, scalar value."""
    samples: NDArray[_Tscalar]
    """Samples of ``value`` obtained from resampling methods."""
    method: str
    """Resampling method used to obtain the data samples, see
    :class:`~yaw.ResamplingConfig` for available options."""
    error: _Tscalar = field(init=False)
    """The uncertainty (standard error) of the value."""

    def __post_init__(self) -> None:
        if self.method not in OPTIONS.method:
            raise ValueError(f"unknown sampling method '{self.method}'")
        if self.method == "bootstrap":
            error = np.std(self.samples, ddof=1, axis=0)
        else:  # jackknife
            error = np.std(self.samples, ddof=0, axis=0) * (self.n_samples - 1)
        object.__setattr__(self, "error", error)

    def __repr__(self) -> str:
        string = self.__class__.__name__
        value = self.value
        error = self.error
        n_samples = self.n_samples
        method = self.method
        return f"{string}({value=:.3g}, {error=:.3g}, {n_samples=}, {method=})"

    def __str__(self) -> str:
        return f"{self.value:+.3g}+/-{self.error:.3g}"

    def __eq__(self, other: object) -> bool:
        if isinstance(other, SampledValue):
            return (
                self.samples.shape == other.samples.shape
                and self.method == other.method
                and self.value == other.value
                and np.all(self.samples == other.samples)
            )
        return NotImplemented

    @property
    def n_samples(self) -> int:
        """Number of samples used for error estimate."""
        return len(self.samples)



_Tdata = TypeVar("_Tdata", bound="SampledData")



[docs]
@dataclass(frozen=True, repr=False)
class SampledData(BinnedQuantity):
    """Container for data and resampled data with redshift binning.

    Contains the redshift binning, data vector, and resampled data vector (e.g.
    jackknife or bootstrap samples). The resampled values are used to compute
    error estimates and covariance/correlation matrices.

    Args:
        binning (:obj:`pandas.IntervalIndex`):
            The redshift binning applied to the data.
        data (:obj:`NDArray`):
            The data values, one for each redshift bin.
        samples (:obj:`NDArray`):
            The resampled data values (e.g. jackknife or bootstrap samples).
        method (:obj:`str`):
            The resampling method used, see :class:`~yaw.ResamplingConfig` for
            available options.

    The container supports addition and subtraction, which return a new instance
    of the container, holding the modified data. This requires that both
    operands are compatible (same binning and same sampling). The operands are
    applied to the ``data`` and ``samples`` attribtes.

    Furthermore, the container supports indexing and iteration over the redshift
    bins using the :obj:`SampledData.bins` attribute. This attribute yields
    instances of :obj:`SampledData` containing a single bin when iterating.
    Slicing and indexing follows the same rules as the underlying ``data``
    :obj:`NDArray`. Refer to :obj:`~yaw.correlation.CorrData` for some indexing
    and iteration examples.

    .. rubric:: Examples

    Create a redshift binning:

    >>> import pandas as pd
    >>> bins = pd.IntervalIndex.from_breaks([0.1, 0.2, 0.3])
    >>> bins
    IntervalIndex([(0.1, 0.2], (0.2, 0.3]], dtype='interval[float64, right]')

    Create some sample data for the bins with value 1 and five assumed jackknife
    samples normal-distributed around 1.

    >>> import numpy as np
    >>> n_bins, n_samples = len(bins), 5
    >>> data = np.ones(n_bins)
    >>> samples = np.random.normal(1.0, size=(n_samples, n_bins))

    Create the container:

    >>> values = yaw.core.SampledData(bins, data, samples, method="jackknife")
    >>> values
    SampledData(n_bins=2, z='0.100...0.300', n_samples=10, method='jackknife')

    Add the container to itself and verify that the values are doubled:

    >>> summed = values + values
    >>> summed.data
    array([2., 2.])

    The same applies to the samples:

    >>> summed.samples / values.samples
    array([[2., 2.],
           [2., 2.],
           [2., 2.],
           [2., 2.],
           [2., 2.]])
    """

    binning: IntervalIndex
    """The redshift bin intervals."""
    data: NDArray
    """The data values, one for each redshift bin."""
    samples: NDArray
    """Samples of the data values, shape (# samples, # bins)."""
    method: str
    """The resampling method used."""
    covariance: NDArray = field(init=False)
    """Covariance matrix automatically computed from the resampled values."""

    def __post_init__(self) -> None:
        if self.data.shape != (self.n_bins,):
            raise ValueError("unexpected shape of 'data' array")
        if not self.samples.shape[1] == self.n_bins:
            raise ValueError("number of bins for 'data' and 'samples' do not match")
        if self.method not in OPTIONS.method:
            raise ValueError(f"unknown sampling method '{self.method}'")

        covmat = cov_from_samples(self.samples, self.method)
        object.__setattr__(self, "covariance", np.atleast_2d(covmat))

    def __repr__(self) -> str:
        string = super().__repr__()[:-1]
        n_samples = self.n_samples
        method = self.method
        return f"{string}, {n_samples=}, {method=})"

    def __eq__(self, other: object) -> bool:
        if isinstance(other, self.__class__):
            return (
                self.samples.shape == other.samples.shape
                and self.method == other.method
                and np.all(self.data == other.data)
                and np.all(self.samples == other.samples)
                and (self.binning == other.binning).all()
            )
        return NotImplemented

    def __add__(self, other: object) -> _Tdata:
        if not isinstance(other, self.__class__):
            self.is_compatible(other, require=True)
            return self.__class__(
                binning=self.get_binning(),
                data=self.data + other.data,
                samples=self.samples + other.samples,
                method=self.method,
            )
        return NotImplemented

    def __sub__(self, other: object) -> _Tdata:
        if not isinstance(other, self.__class__):
            self.is_compatible(other, require=True)
            return self.__class__(
                binning=self.get_binning(),
                data=self.data - other.data,
                samples=self.samples - other.samples,
                method=self.method,
            )
        return NotImplemented

    @property
    def bins(self: _Tdata) -> Indexer[int | slice | Sequence, _Tdata]:
        def builder(inst: _Tdata, item: int | slice | Sequence) -> _Tdata:
            if isinstance(item, int):
                item = [item]
            # try to take subsets along bin axis
            binning = inst.get_binning()[item]
            data = inst.data[item]
            samples = inst.samples[:, item]
            # determine which extra attributes need to be copied
            init_attrs = {field.name for field in fields(inst) if field.init}
            copy_attrs = init_attrs - {"binning", "data", "samples"}

            kwargs = dict(binning=binning, data=data, samples=samples)
            kwargs.update({attr: getattr(inst, attr) for attr in copy_attrs})
            return inst.__class__(**kwargs)

        return Indexer(self, builder)

    @property
    def n_samples(self) -> int:
        """Number of samples used for error estimate."""
        return len(self.samples)

    @property
    def error(self) -> NDArray:
        """The uncertainty (standard error) of the data.

        Returns:
            :obj:`NDArray`
        """
        return np.sqrt(np.diag(self.covariance))


[docs]
    def get_binning(self) -> IntervalIndex:
        return self.binning



[docs]
    def is_compatible(self, other: SampledData, require: bool = False) -> bool:
        """Check whether this instance is compatible with another instance.

        Ensures that both objects are instances of the same class, that the
        redshift binning is identical, that the number of samples agree, and
        that the resampling method is identical.

        Args:
            other (:obj:`BinnedQuantity`):
                Object instance to compare to.
            require (:obj:`bool`, optional)
                Raise a ValueError if any of the checks fail.

        Returns:
            :obj:`bool`
        """
        if not super().is_compatible(other, require):
            return False
        if self.n_samples != other.n_samples:
            if require:
                raise ValueError("number of samples do not agree")
            return False
        if self.method != other.method:
            if require:
                raise ValueError("resampling method does not agree")
            return False
        return True



[docs]
    def concatenate_bins(self: _Tdata, *data: _Tdata) -> _Tdata:
        for other in data:
            self.is_compatible(other, require=True)
        all_data: list[_Tdata] = [self, *data]
        binning = concatenate_bin_edges(*all_data)
        # concatenate data
        data = np.concatenate([d.data for d in all_data])
        samples = np.concatenate([d.samples for d in all_data], axis=1)
        # determine which extra attributes need to be copied
        init_attrs = {field.name for field in fields(self) if field.init}
        copy_attrs = init_attrs - {"binning", "data", "samples"}

        kwargs = dict(binning=binning, data=data, samples=samples)
        kwargs.update({attr: getattr(self, attr) for attr in copy_attrs})
        return self.__class__(**kwargs)



[docs]
    def get_data(self) -> Series:
        """Get the data as :obj:`pandas.Series` with the binning as index."""
        return pd.Series(self.data, index=self.binning)



[docs]
    def get_samples(self) -> DataFrame:
        """Get the data as :obj:`pandas.DataFrame` with the binning as index.
        The columns are labelled numerically and each represent one of the
        samples."""
        return pd.DataFrame(self.samples.T, index=self.binning)



[docs]
    def get_error(self) -> Series:
        """Get value error estimate (diagonal of covariance matrix) as series
        with its corresponding redshift bin intervals as index.

        Returns:
            :obj:`pandas.Series`
        """
        return pd.Series(self.error, index=self.binning)



[docs]
    def get_covariance(self) -> DataFrame:
        """Get value covariance matrix as data frame with its corresponding
        redshift bin intervals as index and column labels.

        Returns:
            :obj:`pandas.DataFrame`
        """
        return pd.DataFrame(
            data=self.covariance, index=self.binning, columns=self.binning
        )



[docs]
    def get_correlation(self) -> DataFrame:
        """Get value correlation matrix as data frame with its corresponding
        redshift bin intervals as index and column labels.

        Returns:
            :obj:`pandas.DataFrame`
        """
        stdev = np.sqrt(np.diag(self.covariance))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            corr = self.covariance / np.outer(stdev, stdev)
        corr[self.covariance == 0] = 0
        return pd.DataFrame(data=corr, index=self.binning, columns=self.binning)