# Source code for allensdk.brain_observatory.data_release_utils.metadata_utils.utils

from typing import Optional

import pandas as pd
import pathlib
import warnings

from allensdk.brain_observatory.data_release_utils \
    .metadata_utils.id_generator import (
        FileIDGenerator)


def add_file_paths_to_metadata_table(
        metadata_table: pd.DataFrame,
        id_generator: FileIDGenerator,
        file_dir: pathlib.Path,
        file_prefix: Optional[str],
        index_col: str,
        data_dir_col: Optional[str],
        on_missing_file: str,
        file_suffix: str = 'nwb',
        file_stem: Optional[str] = None
) -> pd.DataFrame:
    """
    Add file_id and file_path columns to a metadata dataframe.

    Parameters
    ----------
    metadata_table: pd.DataFrame
        The dataframe to which we are adding file_id and file_path.
        The input dataframe itself is not modified.

    id_generator: FileIDGenerator
        Supplies the file_id of every file that exists (through
        ``id_from_path(file_path=...)``) and the placeholder
        ``dummy_value`` used for files that do not exist

    file_dir: pathlib.Path
        directory where files will be found

    file_prefix: Optional[str]
        Prefix of file names. If None, the file stem is just the
        value of index_col

    index_col: str
        Column in metadata_table used to index files

    data_dir_col: Optional[str]
        Column in metadata_table denoting directory structure of data.
        For example if data is stored under each session_id

            <session_id> / file_a
            <session_id> / file_b
            ...

        then give the name of the session_id col here.
        If the named column cannot be read from the row (for instance
        because it is index_col itself), the value of index_col is
        used as the directory name.
        If None, data is assumed to be stored flat

    on_missing_file: str
        Specifies how to handle missing files
            'error' -> raise an exception
            'warn' -> assign dummy file_id and warn
            'skip' -> drop that row from the table and warn

    file_suffix: str
        Suffix (extension) of the files, without the leading '.'

    file_stem: Optional[str]
        Explicit file stem. Overrides dynamic naming of files

    Returns
    -------
    metadata_table:
        The same as the input dataframe but with file_id and file_path
        columns added (file_path is the absolute path as a str)

    Raises
    ------
    ValueError
        If on_missing_file is not one of 'error', 'warn', 'skip'
    RuntimeError
        If on_missing_file == 'error' and at least one file is missing

    Notes
    -----
    Files are assumed to be named like
    {file_dir}/{file_prefix}_{metadata_table.index_col}.{file_suffix}
    or, when a data directory applies,
    {file_dir}/{data_dir}/{file_prefix}_{metadata_table.index_col}.{file_suffix}
    """
    if on_missing_file not in ('error', 'warn', 'skip'):
        msg = ("on_missing_file must be one of ('error', "
               "'warn', or 'skip'); you passed in "
               f"{on_missing_file}")
        raise ValueError(msg)

    # set_index returns a new dataframe, so the caller's table is untouched
    metadata_table = metadata_table.set_index(index_col)

    file_ids = []
    file_paths = []
    missing_files = []

    for row in metadata_table.itertuples():
        if data_dir_col is None:
            # getattr() raises TypeError on a non-string attribute name,
            # so the flat layout has to be handled explicitly
            data_dir = None
        else:
            data_dir = getattr(row, data_dir_col, row.Index)

        if file_stem is not None:
            file_stem_ = file_stem
        elif file_prefix is not None:
            file_stem_ = f'{file_prefix}_{row.Index}'
        else:
            file_stem_ = f'{row.Index}'

        file_name = f'{file_stem_}.{file_suffix}'
        if data_dir is None:
            # no data directory: assume files stored flat
            file_path = file_dir / file_name
        else:
            # assume files stored under data_dir
            file_path = file_dir / f'{data_dir}' / file_name

        abs_path = file_path.resolve().absolute()
        if file_path.exists():
            file_id = id_generator.id_from_path(file_path=file_path)
        else:
            file_id = id_generator.dummy_value
            missing_files.append(abs_path)

        file_ids.append(file_id)
        file_paths.append(str(abs_path))

    if missing_files:
        msg = "\n".join(
            ["The following files do not exist:"]
            + [f"{file_path}" for file_path in missing_files])
        if on_missing_file == 'error':
            raise RuntimeError(msg)
        warnings.warn(msg)

    # The lists were filled in row order, so assign them positionally.
    # (Joining on the index would multiply rows whenever index_col holds
    # duplicate values and fails outright on an empty table.)
    metadata_table['file_id'] = file_ids
    metadata_table['file_path'] = file_paths

    if on_missing_file == 'skip' and missing_files:
        # boolean mask rather than drop-by-label, so only the rows whose
        # own file is missing are removed
        has_file = metadata_table['file_id'] != id_generator.dummy_value
        metadata_table = metadata_table[has_file]

    return metadata_table.reset_index()