# Source code for ctapipe.io.tableio

"""
Base functionality for reading and writing tabular data
"""
import re
from abc import ABCMeta, abstractmethod
from collections import defaultdict

import numpy as np
from astropy.time import Time
from astropy.units import Quantity

from ..core import Component
from ..instrument import SubarrayDescription

# Names exported by ``from ... import *``: only the reader and writer base
# classes are listed here, the column transform classes are not.
__all__ = ["TableReader", "TableWriter"]


class TableWriter(Component, metaclass=ABCMeta):
    """
    Base class for writing tabular data stored in `~ctapipe.core.Container`

    A single write will add a new row to the output table,
    where each `~ctapipe.core.Field` becomes a column.
    Subclasses of this implement specific output types.

    Values can be modified before they are written by registering column
    transforms, either for explicit table/column names
    (`add_column_transform`) or for regular expressions
    (`add_column_transform_regexp`). Columns can be marked for exclusion
    with `exclude`.

    Instances can be used as context managers; leaving the ``with`` block
    calls `close`.

    See Also
    --------
    ctapipe.io.HDF5TableWriter: Implementation of this base class for writing HDF5 files
    """

    def __init__(self, parent=None, add_prefix=False, **kwargs):
        super().__init__(parent=parent, **kwargs)
        # table pattern (str) -> {column pattern (str) -> transform}
        self._transform_regexps = defaultdict(dict)  # user requested, may be regexps
        # table name -> {column name -> transform}
        self._transforms = defaultdict(dict)  # fully expanded explicit names
        # compiled table pattern -> list of compiled column patterns
        self._exclusions = defaultdict(list)  # columns to exclude
        self.add_prefix = add_prefix

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # returns None, so an exception raised inside the block propagates
        self.close()

    def exclude(self, table_regexp, pattern):
        """Exclude any columns (Fields) matching the pattern from being written. The
        patterns are matched with re.fullmatch.

        Parameters
        ----------
        table_regexp: str
            regular expression matching the name of the table(s) the
            exclusion applies to; leading slashes are stripped
        pattern: str
            regular expression string to match column name in the table

        """
        table_regexp = table_regexp.lstrip("/")
        self._exclusions[re.compile(table_regexp)].append(re.compile(pattern))

    def _is_column_excluded(self, table_name, col_name):
        """Return True if ``col_name`` of ``table_name`` matches a registered exclusion"""
        for table_regexp, col_regexp_list in self._exclusions.items():
            if not table_regexp.fullmatch(table_name):
                continue
            if any(col_regexp.fullmatch(col_name) for col_regexp in col_regexp_list):
                return True
        return False

    def add_column_transform(self, table_name, col_name, transform):
        """Add a transformation function for a column. This function will be called on
        the value in the container before it is written to the output file.

        Parameters
        ----------
        table_name: str
            identifier of table being written; leading slashes are stripped
        col_name: str
            name of column in the table (or item in the Container).
        transform: ColumnTransform
            function that transforms input into output

        """
        # allow leading slash
        self._transforms[table_name.lstrip("/")][col_name] = transform
        # lazy %-style arguments: the message is only built if debug is enabled
        self.log.debug("Added transform: %s/%s -> %s", table_name, col_name, transform)

    def add_column_transform_regexp(self, table_regexp, col_regexp, transform):
        """Add a transformation function for a set of columns and tables that match the
        given regular expressions. Each requested transform pattern will be
        turned into an explicit column transform when the table schema is built.

        Parameters
        ----------
        table_regexp: str
            pattern matching the table name (via re.fullmatch); leading
            slashes are stripped
        col_regexp: str
            pattern matching the column name if the table name also matches
            (via re.fullmatch)
        transform: ColumnTransform
            function that transforms input value into output value

        """
        # allow leading slash
        self._transform_regexps[table_regexp.lstrip("/")][col_regexp] = transform
        self.log.debug(
            "Requested transform for pattern: %s/%s -> %s",
            table_regexp,
            col_regexp,
            transform,
        )

    def _realize_regexp_transforms(self, table_name, containers):
        """Loops through all requested transform regexps, checks if they apply to the
        given table, if so, checks each Field in the given Container, and if
        that matches, calls `self.add_column_transform` to
        create an explicit (non-regexp) transform. This is done for speed
        reasons: if we called the regexp for each table and each column, it adds
        a significant overhead.

        This should be called when building the table schema.

        Parameters
        ----------
        table_name: str
            table name
        containers: List[Container]
            List of containers to check
        """
        for table_regexp, column_regexp_dict in self._transform_regexps.items():
            if not re.fullmatch(table_regexp, table_name):
                continue

            self.log.debug("Table '%s' matched pattern '%s'", table_name, table_regexp)

            for column_regexp, transform in column_regexp_dict.items():
                # compile once per pattern rather than once per column
                column_pattern = re.compile(column_regexp)

                for container in containers:
                    for col_name, _ in container.items(add_prefix=self.add_prefix):
                        if not column_pattern.fullmatch(col_name):
                            continue

                        self.log.debug(
                            "Column '%s' matched pattern '%s'",
                            col_name,
                            column_regexp,
                        )

                        self.add_column_transform(
                            table_name=table_name,
                            col_name=col_name,
                            transform=transform,
                        )

    @abstractmethod
    def write(self, table_name, containers, **kwargs):
        """
        Write the contents of the given container or containers to a table.
        The first call to write  will create a schema and initialize the table
        within the file.
        The shape of data within the container must not change between calls,
        since variable-length arrays are not supported.

        Parameters
        ----------
        table_name: str
            name of table to write to
        containers: ctapipe.core.Container or iterable thereof
            container instance(s) to write
        **kwargs:
            may be passed to a lower level implementation to set options
        """

    @abstractmethod
    def open(self, filename, **kwargs):
        """
        open an output file

        Parameters
        ----------
        filename: str
            output file name
        kwargs:
            any extra args to pass to the subclass open method
        """

    @abstractmethod
    def close(self):
        """Close open writer"""

    def _apply_col_transform(self, table_name, col_name, value):
        """
        apply value transform function if it exists for this column

        Returns the transformed value, or ``value`` unchanged if no transform
        is registered for ``table_name`` / ``col_name``.
        """
        table_transforms = self._transforms[table_name]
        if col_name in table_transforms:
            return table_transforms[col_name](value)
        return value


class TableReader(Component, metaclass=ABCMeta):
    """
    Base class for row-wise table readers. Generally methods that read a
    full table at once are preferred to this method, since they are faster,
    but this can be used to re-play a table row by row into a
    `ctapipe.core.Container` class (the opposite of TableWriter)

    Instances can be used as context managers; leaving the ``with`` block
    calls `close`.
    """

    def __init__(self, parent=None, **kwargs):
        # accept and forward parent / kwargs like TableWriter does, the
        # defaults keep the previous zero-argument call working
        super().__init__(parent=parent, **kwargs)
        # table name -> {column name -> transform}
        self._transforms = defaultdict(dict)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # returns None, so an exception raised inside the block propagates
        self.close()

    def add_column_transform(self, table_name, col_name, transform):
        """
        Add a transformation for a column. The ``inverse`` method of the
        transform will be called on values of this column when they are
        passed through ``_apply_col_transform``.

        Parameters
        ----------
        table_name: str
            identifier of table being read
        col_name: str
            name of column in the table (or item in the Container)
        transform: ColumnTransform
            object providing an ``inverse`` method that takes a value and
            returns a new one
        """
        self._transforms[table_name][col_name] = transform
        # lazy %-style arguments: the message is only built if debug is enabled
        self.log.debug("Added transform: %s/%s -> %s", table_name, col_name, transform)

    def _apply_col_transform(self, table_name, col_name, value):
        """
        apply the inverse of the transform registered for this column, if any

        Returns ``value`` unchanged if no transform is registered.
        """
        table_transforms = self._transforms[table_name]
        if col_name in table_transforms:
            return table_transforms[col_name].inverse(value)
        return value

    @abstractmethod
    def read(self, table_name, containers, prefixes, **kwargs):
        """
        Returns a generator that reads the next row from the table into the
        given container.  The generator returns the same container. Note that
        no containers are copied, the data are overwritten inside.

        Parameters
        ----------
        table_name: str
            name of table to read from
        containers: ctapipe.core.Container or iterable thereof
            Container instance(s) to fill
        prefixes: bool, str or iterable of str
            prefixes used during writing of the table
        """

    @abstractmethod
    def open(self, filename, **kwargs):
        """
        open an input file

        Parameters
        ----------
        filename: str
            input file name
        kwargs:
            any extra args to pass to the subclass open method
        """

    @abstractmethod
    def close(self):
        """Close open reader"""


class ColumnTransform(metaclass=ABCMeta):
    """
    Interface for value transformations used around (de)serialization.

    A ``TableWriter`` calls the transform object itself on the data to be
    stored, a ``TableReader`` calls ``.inverse`` to undo the transformation
    when a transformation is detected in the file through metadata.

    ``get_meta`` has to provide the metadata that is needed to invert the
    transformation on reading.

    All three methods are abstract and must be overridden by subclasses.
    """

    @abstractmethod
    def __call__(self, value):
        """Apply the transformation; the base implementation returns None"""

    @abstractmethod
    def inverse(self, value):
        """Undo what ``__call__`` did; the base implementation is the identity"""
        return value

    @abstractmethod
    def get_meta(self, colname):
        """Metadata to be stored in the header information of the table

        Has to describe the transformation completely, so that it can be
        inverted from this metadata alone when reading. The base
        implementation returns an empty dict.
        """
        return dict()


class TimeColumnTransform(ColumnTransform):
    """
    A Column transformation that converts astropy time objects to a plain
    value in the time scale and format given at construction.
    """

    def __init__(self, scale, format):
        # both are used as attribute names on the time object in __call__
        self.format = format
        self.scale = scale

    def __call__(self, value: Time):
        """
        Look up the ``scale`` attribute of the time object and return the
        ``format`` attribute of the result (e.g. ``value.tai.mjd``)
        """
        in_scale = getattr(value, self.scale)
        return getattr(in_scale, self.format)

    def inverse(self, value):
        """Build a ``Time`` from a stored value using the same scale and format"""
        return Time(value, scale=self.scale, format=self.format, copy=False)

    def get_meta(self, colname):
        """Header entries recording the transform type, time format and time scale"""
        meta = {}
        meta[f"CTAFIELD_{colname}_TRANSFORM"] = "time"
        meta[f"CTAFIELD_{colname}_TIME_FORMAT"] = self.format
        meta[f"CTAFIELD_{colname}_TIME_SCALE"] = self.scale
        return meta


class QuantityColumnTransform(ColumnTransform):
    """Column transform that stores quantities as their plain value in a fixed unit"""

    def __init__(self, unit):
        self.unit = unit

    def __call__(self, value):
        """Return the value of the quantity expressed in ``self.unit``"""
        target_unit = self.unit
        return value.to_value(target_unit)

    def inverse(self, value):
        """Attach ``self.unit`` to a stored value, requesting no copy of the data"""
        return Quantity(value, self.unit, copy=False)

    def get_meta(self, colname):
        """Header entries recording the transform type and the unit as a vounit string"""
        meta = {}
        meta[f"CTAFIELD_{colname}_TRANSFORM"] = "quantity"
        meta[f"CTAFIELD_{colname}_UNIT"] = self.unit.to_string("vounit")
        return meta


class FixedPointColumnTransform(ColumnTransform):
    """
    Apply a scale, offset and dtype conversion.

    Can be used to store values as fixed point by using an integer dtype
    and a scale that is a power of 10.

    The transforms reserves 3 integers to represent -inf, nan and inf.
    Underflowing values are converted to -inf and overflowing to inf.

    For unsigned target dtype:
    -inf: maxval - 2
    nan: maxval - 1
    inf: maxval

    For signed target dtype:
    -inf: minval
    nan: minval + 1
    inf: maxval

    When reading, these special values must not be interpreted as valid integer
    values but be transformed back into -inf, nan, inf respectively.

    This is a lossy transformation.

    Parameters
    ----------
    scale: number
        factor the input is multiplied with before rounding
    offset: number
        value added after scaling and rounding
    source_dtype: dtype-like
        dtype of the untransformed data, also used for the result of `inverse`
    target_dtype: dtype-like
        integer dtype used for storage (passed to ``np.iinfo``)
    """

    def __init__(self, scale, offset, source_dtype, target_dtype):
        self.scale = scale
        self.offset = offset
        self.source_dtype = np.dtype(source_dtype)
        self.target_dtype = np.dtype(target_dtype)
        self.unsigned = self.target_dtype.kind == "u"

        iinfo = np.iinfo(self.target_dtype)

        # use three highest values for nan markers for unsigned case
        if self.unsigned:
            self.neginf = iinfo.max - 2
            self.nan = iinfo.max - 1
            self.posinf = iinfo.max

            # this leaves this inclusive range for the valid values
            self.minval = 0
            self.maxval = iinfo.max - 3
        else:
            self.neginf = iinfo.min
            self.nan = iinfo.min + 1
            self.posinf = iinfo.max

            # this leaves this inclusive range for the valid values
            self.minval = iinfo.min + 2
            self.maxval = iinfo.max - 1

    def __call__(self, value):
        """
        Convert ``value`` to the fixed point representation.

        Scalar input yields a 0-dimensional array, everything else an array
        of ``target_dtype`` with the shape of the input.
        """
        # np.ndim accepts scalars, sequences and arrays alike;
        # np.array(value, copy=False) raises on numpy >= 2 when a copy is
        # unavoidable (e.g. for python scalars or lists)
        is_scalar = np.ndim(value) == 0
        value = np.atleast_1d(value).astype(self.source_dtype, copy=False)

        scaled = np.round(value * self.scale) + self.offset

        # convert under/overflow values to -inf/inf
        scaled[scaled > self.maxval] = np.inf
        scaled[scaled < self.minval] = -np.inf

        nans = np.isnan(scaled)
        pos_inf = np.isposinf(scaled)
        neg_inf = np.isneginf(scaled)

        # casting nan / inf to an integer dtype triggers an "invalid value"
        # warning; all affected entries are replaced by the markers below
        with np.errstate(invalid="ignore"):
            result = scaled.astype(self.target_dtype)

        result[nans] = self.nan
        result[neg_inf] = self.neginf
        result[pos_inf] = self.posinf

        if is_scalar:
            return np.squeeze(result)

        return result

    def inverse(self, value):
        """
        Convert stored fixed point values back to ``source_dtype``.

        The three marker values are mapped back to nan, -inf and inf.
        Scalar input yields a 0-dimensional array.
        """
        is_scalar = np.ndim(value) == 0
        value = np.atleast_1d(value)

        # value is at least 1d here, so result is a fresh, writable array
        result = (value.astype(self.source_dtype) - self.offset) / self.scale

        # markers are identified on the stored integers, not on the result
        nans = value == self.nan
        pos_inf = value == self.posinf
        neg_inf = value == self.neginf

        result[nans] = np.nan
        result[neg_inf] = -np.inf
        result[pos_inf] = np.inf

        if is_scalar:
            return np.squeeze(result)

        return result

    def get_meta(self, colname: str):
        """Header entries recording scale, offset, source dtype and the marker values"""
        return {
            f"CTAFIELD_{colname}_TRANSFORM": "fixed_point",
            f"CTAFIELD_{colname}_TRANSFORM_SCALE": self.scale,
            f"CTAFIELD_{colname}_TRANSFORM_DTYPE": str(self.source_dtype),
            f"CTAFIELD_{colname}_TRANSFORM_OFFSET": self.offset,
            f"CTAFIELD_{colname}_NAN_VALUE": self.nan,
            f"CTAFIELD_{colname}_POSINF_VALUE": self.posinf,
            f"CTAFIELD_{colname}_NEGINF_VALUE": self.neginf,
        }


class EnumColumnTransform(ColumnTransform):
    """Column transform that stores the ``value`` of an enum member"""

    def __init__(self, enum):
        # enum class, called with the stored value in ``inverse``
        self.enum = enum

    @staticmethod
    def __call__(value):
        """Return the ``value`` attribute of the given enum member"""
        raw = value.value
        return raw

    def inverse(self, value):
        """Look up the enum member for a stored value by calling the enum class"""
        member = self.enum(value)
        return member

    def get_meta(self, colname):
        """Header entries recording the transform type and the enum class"""
        meta = {}
        meta[f"CTAFIELD_{colname}_TRANSFORM"] = "enum"
        meta[f"CTAFIELD_{colname}_ENUM"] = self.enum
        return meta


def encode_utf8_max_len(string, max_length):
    """Encode a string to utf-8, truncated to at most ``max_length`` bytes

    The cut is never placed inside a multi-byte codepoint, so the result
    is always valid utf-8 and may be shorter than ``max_length``.

    Truncation can still produce odd results for sequences made of several
    codepoints (combining diacritics, some emoji), but
    a) the result will always successfully decode as utf-8
    b) we can probably live with not supporting all emojis
    when truncating strings in hdf5 data of ctapipe

    See https://stackoverflow.com/a/56724327/3838691
    """
    encoded = string.encode("utf-8")

    # nothing to truncate
    if len(encoded) <= max_length:
        return encoded

    cut = max_length
    # utf-8 continuation bytes have the form 0b10xxxxxx; as long as the byte
    # right after the cut is one of them, the cut is inside a codepoint
    while encoded[cut] >> 6 == 0b10:
        cut -= 1

    return encoded[:cut]


class StringTransform(ColumnTransform):
    """
    Encode strings as utf-8 bytes using a max_length

    Values longer than ``max_length`` bytes are truncated with
    ``encode_utf8_max_len``, which does not cut inside a multi-byte
    codepoint. The inverse decodes the stored bytes as utf-8.

    Should not be used anymore when tables starts supporting
    variable length strings in tables.
    """

    def __init__(self, max_length):
        self.max_length = max_length
        # fixed-width numpy bytes dtype used for arrays of encoded strings
        self.dtype = f"S{max_length:d}"

    def __call__(self, value):
        """
        Encode a single ``str`` to ``bytes``, or an iterable of strings to
        an array of dtype ``S<max_length>``
        """
        if isinstance(value, str):
            return encode_utf8_max_len(value, self.max_length)

        return np.array(
            [encode_utf8_max_len(v, self.max_length) for v in value]
        ).astype(self.dtype)

    def inverse(self, value):
        """
        Decode a single ``bytes`` object to ``str``, or an iterable of bytes
        to an array of strings
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")

        # astropy table columns somehow try to handle byte columns as strings
        # when iterating, this does not work here, convert to np.array
        # np.asarray only copies when needed; np.array(..., copy=False)
        # raises on numpy >= 2 if a copy cannot be avoided
        value = np.asarray(value)
        return np.array([v.decode("utf-8") for v in value])

    def get_meta(self, colname):
        """Header entries recording the transform type and the maximum length"""
        return {
            f"CTAFIELD_{colname}_TRANSFORM": "string",
            f"CTAFIELD_{colname}_MAXLEN": self.max_length,
        }


class TelListToMaskTransform(ColumnTransform):
    """Convert a variable-length list of tel_ids to a fixed-length mask and back"""

    def __init__(self, subarray: SubarrayDescription):
        # keep only the two bound conversion methods of the subarray
        self._inverse = subarray.tel_mask_to_tel_ids
        self._forward = subarray.tel_ids_to_mask

    def __call__(self, value):
        """Apply ``tel_ids_to_mask`` of the subarray; ``None`` is passed through"""
        return None if value is None else self._forward(value)

    def inverse(self, value):
        """Apply ``tel_mask_to_tel_ids`` of the subarray; ``None`` is passed through"""
        return None if value is None else self._inverse(value)

    def get_meta(self, colname):
        """Header entry recording the transform type"""
        meta = {f"CTAFIELD_{colname}_TRANSFORM": "tel_list_to_mask"}
        return meta