Source code for allensdk.brain_observatory.data_release_utils.metadata_utils.utils
import pandas as pd
import pathlib
import warnings
from allensdk.brain_observatory.data_release_utils \
.metadata_utils.id_generator import (
FileIDGenerator)
def add_file_paths_to_metadata_table(
        metadata_table: pd.DataFrame,
        id_generator: "FileIDGenerator",
        file_dir: pathlib.Path,
        file_prefix: str,
        index_col: str,
        on_missing_file: str) -> pd.DataFrame:
    """
    Add file_id and file_path columns to a metadata dataframe.

    Parameters
    ----------
    metadata_table: pd.DataFrame
        The dataframe to which we are adding
        file_id and file_path

    id_generator: FileIDGenerator
        For maintaining a unique mapping between file_path and file_id.
        Only its ``dummy_value`` attribute and its
        ``id_from_path(file_path=...)`` method are used here.

    file_dir: pathlib.Path
        directory where files will be found

    file_prefix: str
        Prefix of file names

    index_col: str
        Column in metadata_table used to index files

    on_missing_file: str
        Specifies how to handle missing files
            'error' -> raise an exception
            'warn' -> assign dummy file_id and warn
            'skip' -> drop that row from the table and warn

    Returns
    -------
    metadata_table:
        A copy of the input dataframe (the input is not modified) with
        file_id and file_path columns added. Row order and the index
        are preserved; with on_missing_file='skip' the rows whose
        files do not exist are removed.

    Raises
    ------
    ValueError
        If on_missing_file is not one of 'error', 'warn', 'skip', or
        if metadata_table already has a file_id or file_path column.

    RuntimeError
        If on_missing_file is 'error' and at least one file is missing.

    Notes
    -----
    Files are assumed to be named like
    {file_dir}/{file_prefix}_{metadata_table.index_col}.nwb
    """
    if on_missing_file not in ('error', 'warn', 'skip'):
        msg = ("on_missing_file must be one of ('error', "
               "'warn', or 'skip'); you passed in "
               f"{on_missing_file}")
        raise ValueError(msg)

    file_suffix = 'nwb'

    # One entry per row of metadata_table, in row order.
    file_ids = []
    file_paths = []
    file_exists = []
    missing_files = []

    for file_index in metadata_table[index_col].values:
        file_path = file_dir / f'{file_prefix}_{file_index}.{file_suffix}'
        resolved_path = file_path.resolve().absolute()
        this_exists = file_path.exists()
        if this_exists:
            file_id = id_generator.id_from_path(file_path=file_path)
        else:
            file_id = id_generator.dummy_value
            missing_files.append(resolved_path)

        file_ids.append(file_id)
        file_paths.append(str(resolved_path))
        file_exists.append(this_exists)

    if missing_files:
        msg = "\n".join(
            ["The following files do not exist:",
             *(str(missing_path) for missing_path in missing_files)])
        if on_missing_file == 'error':
            raise RuntimeError(msg)
        warnings.warn(msg)

    # Refuse to silently overwrite columns the caller already populated.
    overlap = [col for col in ('file_id', 'file_path')
               if col in metadata_table.columns]
    if overlap:
        raise ValueError(
            "columns overlap: metadata_table already contains "
            f"{overlap}")

    # Assign positionally rather than joining on index_col: a join
    # multiplies rows when index_col holds repeated values and fails on
    # an empty table because the lookup frame has no columns at all.
    result = metadata_table.copy()
    result['file_id'] = file_ids
    result['file_path'] = file_paths

    if on_missing_file == 'skip' and missing_files:
        # Filter with a per-row boolean mask instead of dropping by
        # index label, so a non-unique index cannot remove rows whose
        # files exist, and so the result does not depend on how
        # dummy_value compares with itself or with real ids.
        result = result.loc[file_exists]

    return result