# Source code for edflow.data.agnostics.csv_dset

from edflow.data.dataset_mixin import DatasetMixin
import warnings
import pandas as pd
import numpy as np


class CsvDataset(DatasetMixin):
    """Using a csv file as index, this Dataset returns only the entries in the
    csv file, but can be easily extended to load other data using the
    :class:`ProcessedDatasets`.

    Attributes
    ----------
    root : str
        The ``csv_root`` path given at construction.
    data : pandas.DataFrame
        The parsed csv file; one row per example.
    labels : dict
        Maps every column name of :attr:`data` to a :class:`numpy.ndarray`
        holding all values of that column.
    """

    def __init__(self, csv_root, **pandas_kwargs):
        """
        Parameters
        ----------
        csv_root : str
            Path/to/the/csv containing all datapoints. The
            first line in the file should contain the names for the
            attributes in the corresponding columns.
        pandas_kwargs : kwargs
            Passed to :func:`pandas.read_csv` when loading the csv file.

        Raises
        ------
        Exception
            Whatever :func:`pandas.read_csv` raises for an unreadable or
            malformed file is propagated unchanged.
        """

        self.root = csv_root
        self.data = pd.read_csv(csv_root, **pandas_kwargs)

        # Stacking allows to also contain higher dimensional data in the csv
        # file like bounding boxes or keypoints.
        # Just make sure to load the data correctly, e.g. by passing the
        # converter ast.literal_eval for the corresponding column.
        with warnings.catch_warnings():
            # UserWarnings raised while building the labels are silenced.
            # NOTE(review): the labels are stored on `self`, not on the
            # DataFrame, so this filter may no longer be needed - confirm
            # before removing it.
            warnings.simplefilter("ignore", category=UserWarning)

            self.labels = {}
            for k in self.data:
                values = self.data[k].values
                if len(values) > 0:
                    self.labels[k] = np.stack(values)
                else:
                    # `np.stack` raises a ValueError when given no arrays,
                    # which would make a csv file without any data rows
                    # crash here. Keep the (empty) column as it is instead.
                    self.labels[k] = np.asarray(values)

    def get_example(self, idx):
        """Returns all entries in row :attr:`idx` of the csv file.

        Parameters
        ----------
        idx : int
            Position of the row to return.

        Returns
        -------
        dict
            ``column_name: row_entry`` pairs of the requested row.
        """

        # `self.data` is a pandas dataframe. `.iloc[idx]` returns the row at
        # position idx. Converting to dict results in column_name: row_entry
        # pairs.
        # NOTE(review): the row is taken from `self.data`, not from the
        # stacked `self.labels`; presumably entries of mixed-dtype rows are
        # converted to one common dtype by pandas - verify if exact dtypes
        # matter to callers.
        return dict(self.data.iloc[idx])