# Source code for edflow.data.believers.meta

import os
import numpy as np
import yaml
import re

from edflow.data.dataset_mixin import DatasetMixin
from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath
from edflow.util import walk, set_value, edprint
from edflow.data.believers.meta_loaders import DEFAULT_LOADERS

try:
    from IPython import get_ipython
    from IPython.display import display, Markdown

    __COULD_HAVE_IPYTHON__ = True
except ImportError:
    __COULD_HAVE_IPYTHON__ = False


class MetaDataset(DatasetMixin):
    """
    The :class:`MetaDataset` allows for easy data reading using a simple
    interface.

    All you need to do is hand the constructor a path and it will look for all
    data in a special format and load it as numpy arrays. If further specified
    in a meta data file or the name of the label array, when calling the
    getitem method of the dataset, a special loader function will be called.

    Let's take a look at an example data folder of the following structure
    (the label arrays are read from the ``labels`` sub-folder of the root):

    .. code-block:: bash

        root/
        ├ meta.yaml
        ├ images/
        │  ├ image_1.png
        │  ├ image_2.png
        │  ├ image_3.png
        ...
        │  └ image_10000.png
        └ labels/
           ├ image:image-*-10000-*-str.npy
           ├ attr1-*-10000-*-int.npy
           ├ attr2-*-10000x2-*-int.npy
           └ kps-*-10000x17x3-*-int.npy


    The ``meta.yaml`` file looks like this:

    .. code-block:: yaml

        description: |
            This is a dataset which loads images.
            All paths to the images are in the label `image`.

        loader_kwargs:
            image:
                support: "-1->1"

    The resulting dataset has the following labels:

        - ``image_``: the paths to the images. Note the extra ``_`` at the end.
        - ``attr1``
        - ``attr2``
        - ``kps``

    When using the ``__getitem__`` method of the dataset, the image loader will
    be applied to the image label at the given index and the image will be
    loaded from the given path.

    As we have specified loader keyword arguments, we will get the images with
    a support of ``[-1, 1]``.

    """

    def __init__(self, root):
        """
        Parameters
        ----------
        root : str
            Where to look for all the data. ``meta.yaml`` is read from
            ``root`` and the labels from ``root/labels``.

        Raises
        ------
        ValueError
            If the labels do not all have the same length.
        """
        meta_path = os.path.join(root, "meta.yaml")
        # Use a context manager so the file handle is closed deterministically.
        with open(meta_path, "r") as meta_file:
            self.meta = meta = yaml.safe_load(meta_file)

        labels = load_labels(os.path.join(root, "labels"))
        self.loaders, self.loader_kwargs = setup_loaders(labels, meta)
        self.labels = clean_keys(labels, self.loaders)

        class Lenner:
            """Visitor which checks that all visited labels share one length."""

            def __init__(self):
                self.length = None
                self.visited = []

            def __call__(self, key, label):
                if self.length is None:
                    # The first label seen defines the reference length.
                    self.length = len(label)
                elif len(label) != self.length:
                    raise ValueError(
                        f"Label {key} has a different length "
                        "than the other labels.\n"
                        f"Already seen: {self.visited}"
                    )
                self.visited.append(key)

        lenner = Lenner()
        walk(self.labels, lenner, pass_key=True)

        # Stays ``None`` if ``walk`` never invoked the visitor (no labels).
        self.num_examples = lenner.length

        self.append_labels = True

    def __len__(self):
        return self.num_examples

    def get_example(self, idx):
        """Loads all loadable data from the labels.

        For every registered loader, the label stored under ``<key>_`` is
        indexed at ``idx`` and handed to the loader together with the
        loader's keyword arguments.

        Parameters
        ----------
        idx : int
            The index of the example to load

        Returns
        -------
        example : dict
            Maps each loader key to the value returned by its loader.
        """
        example = {}

        for key, loader in self.loaders.items():
            kwargs = self.loader_kwargs[key]
            example[key] = loader(self.labels[key + "_"][idx], **kwargs)

        return example

    @staticmethod
    def _in_ipython_kernel():
        """Returns ``True`` if IPython is importable and the active shell's
        config contains ``IPKernelApp``."""
        if not __COULD_HAVE_IPYTHON__:
            return False
        shell = get_ipython()
        return hasattr(shell, "config") and "IPKernelApp" in shell.config

    def __repr__(self):
        label_str = pp2mkdtable(self.labels, self._in_ipython_kernel())

        descr = self.meta.get("description", "MetaDataset")

        return f"{descr}\n\n# Labels\n{label_str}"

    def show(self):
        """Prints the dataset description, its labels and example ``0``.

        Inside an IPython kernel the text is displayed as Markdown, otherwise
        it is printed.
        """
        repr_str = self.__repr__()

        # ``expand`` is not defined in this class - presumably provided by
        # ``DatasetMixin``; TODO confirm. It is forced to ``True`` while the
        # example is fetched and restored afterwards, even on errors.
        expand_ = self.expand
        self.expand = True

        try:
            # NOTE(review): ``True`` is passed as second argument in both the
            # kernel and the plain-text case, unlike in ``__repr__`` - confirm
            # this is intended.
            example_str = pp2mkdtable(self.__getitem__(0), True)
            repr_str += f"\n\n# Example 0\n{example_str}"

            if self._in_ipython_kernel():
                display(Markdown(repr_str))
            else:
                print(repr_str)
        finally:
            self.expand = expand_


def setup_loaders(labels, meta_dict):
    """Creates a map of key -> function pairs, which can be used to postprocess
    label values at each ``__getitem__`` call.

    Loaders defined in :attr:`meta_dict` supersede those defined in the label
    keys.

    Parameters
    ----------
    labels : dict(str, numpy.memmap)
        Labels contain all load-easy dataset relevant data. If the key follows
        the pattern ``name:loader``, this function will try to find the
        corresponding loader in :attr:`DEFAULT_LOADERS`.
    meta_dict : dict
        A dictionary containing all dataset relevant information, which is the
        same for all examples. This function will try to find the entry
        ``loaders`` in the dictionary, which must contain another ``dict`` with
        ``name:loader`` pairs. Here ``loader`` must be either an entry in
        :attr:`DEFAULT_LOADERS` or a loadable import path.
        You can additionally define an entry ``loader_kwargs``, which must
        contain ``name:dict`` pairs. The dictionary is passed as keyword
        arguments to the loader corresponding to ``name``.

    Returns
    -------
    loaders : dict
        Name, function pairs, to apply loading logic based on the labels with
        the specified names.
    loader_kwargs : dict
        Name, dict pairs. The dicts are passed to the loader functions as
        keyword arguments.
    """

    # Step 1: collect loader *names*, first from the label keys ...
    loader_names = {}
    for key in labels:
        name, loader_name = loader_from_key(key)
        if loader_name is not None:
            loader_names[name] = loader_name

    meta_loaders = retrieve(meta_dict, "loaders", default={})
    meta_loader_kwargs = retrieve(meta_dict, "loader_kwargs", default={})

    # ... then from the meta data, which overrides the label keys.
    loader_names.update(meta_loaders)

    # Step 2: resolve every name to a callable and attach its kwargs. A
    # separate result dict keeps names and callables from mixing in one
    # mapping while it is being iterated.
    loaders = {}
    loader_kwargs = {}
    for name, loader_name in loader_names.items():
        if loader_name in DEFAULT_LOADERS:
            loaders[name] = DEFAULT_LOADERS[loader_name]
        else:
            # Not a default loader: treat the value as an import path.
            loaders[name] = get_obj_from_str(loader_name)

        if name in meta_loader_kwargs:
            loader_kwargs[name] = meta_loader_kwargs[name]
        else:
            loader_kwargs[name] = {}

    return loaders, loader_kwargs


def load_labels(root):
    """Loads all label arrays found below :attr:`root`.

    Label files must be named ``<key>-*-<shape>-*-<dtype>.npy``, where
    ``<shape>`` is a list of integers joined by ``x`` (e.g. ``10000x17x3``).
    Files in sub-folders end up under nested keys joined by ``/``.

    Parameters
    ----------
    root : str
        Where to look for the labels.

    Returns
    -------
    labels : dict
        All labels as ``np.memmap`` s.
    """

    regex = re.compile(r".*-\*-.*-\*-.*\.npy")

    label_files = _get_label_files(root)

    class Loader:
        """Visitor which memory-maps every label file it is called with."""

        def __init__(self):
            self.labels = {}

        def __call__(self, key_path, path):
            if not isinstance(path, str):
                return

            file_name = os.path.basename(path)
            # Match the whole file name: the naming scheme applies to the
            # file, not to its parent folders, and the name has to *end* in
            # ``.npy`` for the suffix stripping below to be valid.
            if not regex.fullmatch(file_name):
                return

            stem = file_name[: -len(".npy")]
            # Split from the right, so that only the last two fields are
            # interpreted as shape and dtype, even if the key itself
            # contains the separator.
            key_, shape, dtype = stem.rsplit("-*-", 2)
            shape = tuple(int(s) for s in shape.split("x"))

            # Swap the file name at the end of the key path for the bare key.
            parents = key_path.split("/")[:-1]
            key = "/".join(parents + [key_])

            mmap = np.memmap(path, mode="c", shape=shape, dtype=dtype)

            set_value(self.labels, key, mmap)

    loader = Loader()
    walk(label_files, loader, pass_key=True)

    return loader.labels


def clean_keys(labels, loaders):
    """Removes all loader information from the keys.

    Every label stored under a ``name:loader`` key is moved to ``name_``.
    Additionally, every top-level label whose key is also a key of
    :attr:`loaders` is renamed from ``name`` to ``name_``.

    Parameters
    ----------
    labels : dict(str, numpy.memmap)
        Labels contain all load-easy dataset relevant data. The top-level
        renaming modifies this dict in place.
    loaders : dict
        Name, loader pairs. Only the keys are used.

    Returns
    -------
    labels : dict(str, numpy.memmap)
        The original labels, with keys without the ``:loader`` part.
    """

    class Cleaner:
        """Visitor which records the keys that carry loader information."""

        def __init__(self):
            self.to_delete = []
            self.to_set = []

        def __call__(self, key, val):
            name, loader = loader_from_key(key)
            if loader is not None:
                self.to_set.append((name + "_", retrieve(labels, key)))
                self.to_delete.append(key)

    # Only collect during the walk and apply the changes afterwards, so the
    # structure is not modified while it is being traversed.
    cleaner = Cleaner()
    walk(labels, cleaner, pass_key=True)

    for key, val in cleaner.to_set:
        set_value(labels, key, val)

    for key in cleaner.to_delete:
        pop_keypath(labels, key)

    # Labels which got their loader via the meta data have no ``:loader``
    # suffix, but are renamed as well. Only top-level keys are checked here.
    for name in list(loaders.keys()):
        if name in labels:
            labels[name + "_"] = labels.pop(name)

    return labels


def loader_from_key(key):
    """Returns the name, loader pair given a key.

    Parameters
    ----------
    key : str
        Either a plain ``name`` or a string following the pattern
        ``name:loader``.

    Returns
    -------
    name : str
        The part of the key before the last ``:``, or the whole key if it
        contains no ``:``.
    loader : str or None
        The part of the key after the last ``:``, or ``None`` if the key
        contains no ``:``.
    """

    # ``rpartition`` always yields exactly one name and one loader, even if
    # the key contains several colons, so callers can safely unpack the
    # result into two variables.
    name, sep, loader = key.rpartition(":")
    if sep:
        return name, loader
    return key, None


def _get_label_files(root):

    regex = re.compile(r".*-\*-.*-\*-.*\.npy")

    def f(path, regex):
        d = {}

        name_ = os.path.basename(path)

        if os.path.isdir(path):
            for name in os.listdir(path):
                d[name] = f(os.path.join(path, name), regex)
        else:
            if regex.match(path):
                d = path
            else:
                d = None
        return d

    root_, name = os.path.split(root)
    structure = f(root, regex)

    return structure