# Source code for allensdk.brain_observatory.data_release_utils.metadata_utils.utils
from typing import Optional
import pandas as pd
import pathlib
import warnings
from allensdk.brain_observatory.data_release_utils \
.metadata_utils.id_generator import (
FileIDGenerator)
def add_file_paths_to_metadata_table(
        metadata_table: pd.DataFrame,
        id_generator: FileIDGenerator,
        file_dir: pathlib.Path,
        file_prefix: Optional[str],
        index_col: str,
        data_dir_col: Optional[str],
        on_missing_file: str,
        file_suffix: str = 'nwb',
        file_stem: Optional[str] = None) -> pd.DataFrame:
    """
    Add file_id and file_path columns to session dataframe.

    Parameters
    ----------
    metadata_table: pd.DataFrame
        The dataframe to which we are adding
        file_id and file_path
    id_generator: FileIDGenerator
        For maintaining a unique mapping between file_path and file_id
    file_dir: pathlib.Path
        Directory where files will be found
    file_prefix: Optional[str]
        Prefix of file names; if None, no prefix is prepended
    index_col: str
        Column in metadata_table used to index files
    data_dir_col: Optional[str]
        Column in metadata_table denoting directory structure of data.
        For example, if data is stored under each session_id
            <session_id> / file_a
            <session_id> / file_b
            ...
        then give the name of the session_id col here.
        If None, data is assumed to be stored flat in file_dir.
    on_missing_file: str
        Specifies how to handle missing files
            'error' -> raise an exception
            'warn' -> assign dummy file_id and warn
            'skip' -> drop that row from the table and warn
    file_suffix: str
        File extension appended after the stem (default 'nwb')
    file_stem: Optional[str]
        Explicit file stem. Overrides dynamic naming of files.

    Returns
    -------
    metadata_table: pd.DataFrame
        The same as the input dataframe but with file_id and file_path
        columns added

    Raises
    ------
    ValueError
        If on_missing_file is not one of ('error', 'warn', 'skip')
    RuntimeError
        If any files are missing and on_missing_file == 'error'

    Notes
    -----
    Files are assumed to be named like
    {file_dir}/{file_prefix}_{metadata_table.index_col}.{file_suffix}
    (with an intermediate {data_dir} directory when data_dir_col is given).
    """
    if on_missing_file not in ('error', 'warn', 'skip'):
        msg = ("on_missing_file must be one of ('error', "
               "'warn', or 'skip'); you passed in "
               f"{on_missing_file}")
        raise ValueError(msg)

    new_data = []
    missing_files = []
    metadata_table = metadata_table.set_index(index_col)
    for row in metadata_table.itertuples():
        # BUGFIX: getattr(row, None, ...) raises TypeError, so the
        # documented "data_dir_col is None => flat layout" path could
        # never run. Only look the column up when one was supplied.
        if data_dir_col is not None:
            # Fall back to the row's index value if the attribute is absent
            data_dir = getattr(row, data_dir_col, row.Index)
        else:
            data_dir = None

        if file_stem is None:
            # Derive the stem from the prefix and the row's index value
            file_stem_ = (f'{file_prefix}_{row.Index}'
                          if file_prefix is not None
                          else f'{row.Index}')
        else:
            file_stem_ = file_stem

        if data_dir is None:
            # Flat layout: files live directly in file_dir
            file_path = file_dir / f'{file_stem_}.{file_suffix}'
        else:
            # Nested layout: files live under file_dir / data_dir
            file_path = (file_dir / f'{data_dir}'
                         / f'{file_stem_}.{file_suffix}')

        if not file_path.exists():
            # Record the dummy id now; how the miss is handled is decided
            # after the loop based on on_missing_file.
            file_id = id_generator.dummy_value
            missing_files.append(file_path.resolve().absolute())
        else:
            file_id = id_generator.id_from_path(file_path=file_path)

        new_data.append(
            {'file_id': file_id,
             'file_path': str(file_path.resolve().absolute()),
             index_col: row.Index})

    if len(missing_files) > 0:
        msg = "The following files do not exist:"
        for file_path in missing_files:
            msg += f"\n{file_path}"
        if on_missing_file == 'error':
            raise RuntimeError(msg)
        else:
            warnings.warn(msg)

    new_df = pd.DataFrame(data=new_data)
    metadata_table = metadata_table.join(
        new_df.set_index(index_col),
        on=index_col,
        how='left')

    if on_missing_file == 'skip' and len(missing_files) > 0:
        # Drop every row that ended up with the dummy (missing-file) id
        metadata_table = metadata_table.drop(
            metadata_table.loc[
                metadata_table.file_id == id_generator.dummy_value].index)

    return metadata_table.reset_index()