Source code for edflow.data.believers.meta
import os
import numpy as np
import yaml
import re
from edflow.data.dataset_mixin import DatasetMixin
from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath
from edflow.util import walk, set_value, edprint
from edflow.data.believers.meta_loaders import DEFAULT_LOADERS
try:
from IPython import get_ipython
from IPython.display import display, Markdown
__COULD_HAVE_IPYTHON__ = True
except ImportError:
__COULD_HAVE_IPYTHON__ = False
[docs]class MetaDataset(DatasetMixin):
"""
The :class:`MetaDataset` allows for easy data reading using a simple
interface.
All you need to do is hand the constructor a path and it will look for all
data in a special format and load it as numpy arrays. If further specified
in a meta data file or the name of the label array, when calling the
getitem method of the dataset, a special loader function will be called.
Let's take a look at an example data folder of the following structure:
.. code-block:: bash
root/
├ meta.yaml
├ images/
│ ├ image_1.png
│ ├ image_2.png
│ ├ image_3.png
...
│ └ image_10000.png
├ image:image-*-10000-*-str.npy
├ attr1-*-10000-*-int.npy
├ attr2-*-10000x2-*-int.npy
└ kps-*-10000x17x3-*-int.npy
The ``meta.yaml`` file looks like this:
.. code-block:: yaml
description: |
This is a dataset which loads images.
All paths to the images are in the label `image`.
loader_kwargs:
image:
support: "-1->1"
The resulting dataset has the following labels:
- ``image_``: the paths to the images. Note the extra ``_`` at the end.
- ``attr1``
- ``attr2``
- ``kps``
When using the ``__getitem__`` method of the dataset, the image loader will
be applied to the image label at the given index and the image will be
loaded from the given path.
As we have specifed loader kweyword arguments, we will get the images with
a support of ``[-1, 1]``.
"""
[docs] def __init__(self, root):
"""
Parameters
----------
root : str
Where to look for all the data.
"""
meta_path = os.path.join(root, "meta.yaml")
self.meta = meta = yaml.safe_load(open(meta_path, "r"))
labels = load_labels(os.path.join(root, "labels"))
self.loaders, self.loader_kwargs = setup_loaders(labels, meta)
self.labels = clean_keys(labels, self.loaders)
class Lenner:
def __init__(self):
self.l = None
self.visited = []
def __call__(self, key, label):
if self.l is None:
self.l = len(label)
else:
if len(label) != self.l:
raise ValueError(
f"Label {key} has a different length "
"than the other labels.\n"
f"Already seen: {self.visited}"
)
self.visited += [key]
L = Lenner()
walk(self.labels, L, pass_key=True)
self.num_examples = L.l
self.append_labels = True
def __len__(self):
return self.num_examples
[docs] def get_example(self, idx):
"""Loads all loadable data from the labels.
Parameters
----------
idx : int
The index of the example to load
"""
example = {}
for key, loader in self.loaders.items():
kwargs = self.loader_kwargs[key]
example[key] = loader(self.labels[key + "_"][idx], **kwargs)
return example
def __repr__(self):
if (
__COULD_HAVE_IPYTHON__
and hasattr(get_ipython(), "config")
and "IPKernelApp" in get_ipython().config
):
label_str = pp2mkdtable(self.labels, True)
else:
label_str = pp2mkdtable(self.labels, False)
descr = self.meta.get("description", "MetaDataset")
repr_str = f"{descr}\n\n# Labels\n{label_str}"
return repr_str
[docs] def show(self):
repr_str = self.__repr__()
expand_ = self.expand
self.expand = True
if (
__COULD_HAVE_IPYTHON__
and hasattr(get_ipython(), "config")
and "IPKernelApp" in get_ipython().config
):
repr_str += f"\n\n# Example 0\n{pp2mkdtable(self.__getitem__(0), True)}"
display(Markdown(repr_str))
else:
repr_str += f"\n\n# Example 0\n{pp2mkdtable(self.__getitem__(0), True)}"
print(repr_str)
self.expand = expand_
[docs]def setup_loaders(labels, meta_dict):
"""Creates a map of key -> function pairs, which can be used to postprocess
label values at each ``__getitem__`` call.
Loaders defined in :attr:`meta_dict` supersede those definde in the label
keys.
Parameters
----------
labels : dict(str, numpy.memmap)
Labels contain all load-easy dataset relevant data. If the key follows
the pattern ``name:loader``, this function will try to finde the
corresponding loader in :attr:`DEFAULT_LOADERS`.
meta_dict : dict
A dictionary containing all dataset relevent information, which is the
same for all examples. This function will try to find the entry
``loaders`` in the dictionary, which must contain another ``dict`` with
``name:loader`` pairs. Here ``loader`` must be either an entry in
:attr:`DEFAULT_LOADERS` or a loadable import path.
You can additionally define an entry ``loader_kwargs``, which must
contain ``name:dict`` pairs. The dictionary is passed as keyword
arguments to the loader corresponding to ``name``.
Returns
-------
loaders : dict
Name, function pairs, to apply loading logic based on the labels with
the specified names.
loader_kwargs : dict
Name, dict pairs. The dicts are passed to the loader functions as
keyword arguments.
"""
loaders = {}
loader_kwargs = {}
for k in labels.keys():
k, l = loader_from_key(k)
if l is not None:
loaders[k] = l
meta_loaders = retrieve(meta_dict, "loaders", default={})
meta_loader_kwargs = retrieve(meta_dict, "loader_kwargs", default={})
loaders.update(meta_loaders)
for k, l in loaders.items():
if l in DEFAULT_LOADERS:
loaders[k] = DEFAULT_LOADERS[l]
else:
loaders[k] = get_obj_from_str(l)
if k in meta_loader_kwargs:
loader_kwargs[k] = meta_loader_kwargs[k]
else:
loader_kwargs[k] = {}
return loaders, loader_kwargs
[docs]def load_labels(root):
"""
Parameters
----------
root : str
Where to look for the labels.
Returns
-------
labels : dict
All labels as ``np.memmap`` s.
"""
regex = re.compile(r".*-\*-.*-\*-.*\.npy")
label_files = _get_label_files(root)
class Loader:
def __init__(self):
self.labels = {}
def __call__(self, key_path, path):
if isinstance(path, str) and regex.match(path):
f = os.path.basename(path)
f_ = f[: -len(".npy")]
key_, shape, dtype = f_.split("-*-")
shape = tuple([int(s) for s in shape.split("x")])
key_path = key_path.split("/")
if len(key_path) == 1:
key = key_
else:
key = "/".join(key_path[:-1] + [key_])
mmap = np.memmap(path, mode="c", shape=shape, dtype=dtype)
set_value(self.labels, key, mmap)
L = Loader()
walk(label_files, L, pass_key=True)
return L.labels
[docs]def clean_keys(labels, loaders):
"""Removes all loader information from the keys.
Parameters
----------
labels : dict(str, numpy.memmap)
Labels contain all load-easy dataset relevant data.
Returns
-------
labels : dict(str, numpy.memmap)
The original labels, with keys without the ``:loader`` part.
"""
class Cleaner:
def __init__(self):
self.to_delete = []
self.to_set = []
def __call__(self, key, val):
k, l = loader_from_key(key)
if l is not None:
self.to_set += [[k + "_", retrieve(labels, key)]]
self.to_delete += [key]
C = Cleaner()
walk(labels, C, pass_key=True)
for key, val in C.to_set:
set_value(labels, key, val)
for key in C.to_delete:
pop_keypath(labels, key)
for k_ in list(loaders.keys()):
if k_ in labels:
k = k_ + "_"
labels[k] = labels[k_]
del labels[k_]
return labels
[docs]def loader_from_key(key):
"""Returns the name, loader pair given a key."""
if ":" in key:
return key.split(":")
return key, None
def _get_label_files(root):
regex = re.compile(r".*-\*-.*-\*-.*\.npy")
def f(path, regex):
d = {}
name_ = os.path.basename(path)
if os.path.isdir(path):
for name in os.listdir(path):
d[name] = f(os.path.join(path, name), regex)
else:
if regex.match(path):
d = path
else:
d = None
return d
root_, name = os.path.split(root)
structure = f(root, regex)
return structure