# Source code for allensdk.brain_observatory.data_release_utils.metadata_utils.utils
from typing import Optional
import pandas as pd
import pathlib
import warnings
from allensdk.brain_observatory.data_release_utils \
.metadata_utils.id_generator import (
FileIDGenerator)
def add_file_paths_to_metadata_table(
        metadata_table: pd.DataFrame,
        id_generator: FileIDGenerator,
        file_dir: pathlib.Path,
        file_prefix: Optional[str],
        index_col: str,
        data_dir_col: Optional[str],
        on_missing_file: str,
        file_suffix: str = 'nwb',
        file_stem: Optional[str] = None) -> pd.DataFrame:
    """
    Add file_id and file_path columns to session dataframe.

    Parameters
    ----------
    metadata_table: pd.DataFrame
        The dataframe to which we are adding
        file_id and file_path
    id_generator: FileIDGenerator
        For maintaining a unique mapping between file_path and file_id
    file_dir: pathlib.Path
        Directory where files will be found
    file_prefix: Optional[str]
        Prefix of file names; if None, no prefix is prepended
    index_col: str
        Column in metadata_table used to index files
    data_dir_col: Optional[str]
        Column in metadata_table denoting directory structure of data.
        For example, if data is stored under each session_id
            <session_id> / file_a
            <session_id> / file_b
            ...
        then give the name of the session_id col here.
        If None, data is assumed to be stored flat in file_dir.
    on_missing_file: str
        Specifies how to handle missing files
            'error' -> raise an exception
            'warn' -> assign dummy file_id and warn
            'skip' -> drop that row from the table and warn
    file_suffix: str
        File extension appended after the stem (default 'nwb')
    file_stem: Optional[str]
        Explicit file stem. Overrides dynamic naming of files.

    Returns
    -------
    metadata_table: pd.DataFrame
        The same as the input dataframe but with file_id and file_path
        columns added

    Raises
    ------
    ValueError
        If on_missing_file is not one of ('error', 'warn', 'skip')
    RuntimeError
        If any files are missing and on_missing_file == 'error'

    Notes
    -----
    Files are assumed to be named like
    {file_dir}/{file_prefix}_{metadata_table.index_col}.{file_suffix}
    (with an intermediate {data_dir} directory when data_dir_col is given).
    """
    if on_missing_file not in ('error', 'warn', 'skip'):
        msg = ("on_missing_file must be one of ('error', "
               "'warn', or 'skip'); you passed in "
               f"{on_missing_file}")
        raise ValueError(msg)

    new_data = []
    missing_files = []
    metadata_table = metadata_table.set_index(index_col)
    for row in metadata_table.itertuples():
        # BUGFIX: getattr(row, None, ...) raises TypeError, so the
        # documented "data_dir_col is None => flat layout" path could
        # never run. Only look the column up when one was supplied.
        if data_dir_col is not None:
            # Fall back to the row's index value if the attribute is absent
            data_dir = getattr(row, data_dir_col, row.Index)
        else:
            data_dir = None

        if file_stem is None:
            # Derive the stem from the prefix and the row's index value
            file_stem_ = (f'{file_prefix}_{row.Index}'
                          if file_prefix is not None
                          else f'{row.Index}')
        else:
            file_stem_ = file_stem

        if data_dir is None:
            # Flat layout: files live directly in file_dir
            file_path = file_dir / f'{file_stem_}.{file_suffix}'
        else:
            # Nested layout: files live under file_dir / data_dir
            file_path = (file_dir / f'{data_dir}'
                         / f'{file_stem_}.{file_suffix}')

        if not file_path.exists():
            # Record the dummy id now; how the miss is handled is decided
            # after the loop based on on_missing_file.
            file_id = id_generator.dummy_value
            missing_files.append(file_path.resolve().absolute())
        else:
            file_id = id_generator.id_from_path(file_path=file_path)

        new_data.append(
            {'file_id': file_id,
             'file_path': str(file_path.resolve().absolute()),
             index_col: row.Index})

    if len(missing_files) > 0:
        msg = "The following files do not exist:"
        for file_path in missing_files:
            msg += f"\n{file_path}"
        if on_missing_file == 'error':
            raise RuntimeError(msg)
        else:
            warnings.warn(msg)

    new_df = pd.DataFrame(data=new_data)
    metadata_table = metadata_table.join(
        new_df.set_index(index_col),
        on=index_col,
        how='left')

    if on_missing_file == 'skip' and len(missing_files) > 0:
        # Drop every row that ended up with the dummy (missing-file) id
        metadata_table = metadata_table.drop(
            metadata_table.loc[
                metadata_table.file_id == id_generator.dummy_value].index)

    return metadata_table.reset_index()