Source code for allensdk.core.dataframe_utils

from typing import List

import pandas as pd

# Sentinel used in place of missing values in integer pandas.Series objects.
# NWB currently doesn't support the nullable pandas Int64 dtype (which has an
# explicit N/A value), so enforce_df_int_typing fills missing entries with
# -99 instead.
INT_NULL = -99

"""A collection of utilities to manipulate pandas DataFrames."""


def patch_df_from_other(
    target_df: pd.DataFrame,
    source_df: pd.DataFrame,
    columns_to_patch: List[str],
    index_column: str,
) -> pd.DataFrame:
    """
    Return a copy of target_df with selected columns overwritten by the
    values found in source_df, matching rows on index_column.

    Parameters
    ----------
    target_df: pd.DataFrame
        The dataframe whose columns will get overwritten
    source_df: pd.DataFrame
        The dataframe from which correct values are to be read.
        Its index_column values must be unique.
    columns_to_patch: List[str]
        The columns to be overwritten
    index_column: str
        The column to join the dataframes on. It must be a column of
        source_df and either a column or the named index of target_df,
        and it may not itself appear in columns_to_patch.

    Returns
    -------
    patched_df: pd.DataFrame
        target_df except with the specified columns
        and rows overwritten. If target_df had a named index,
        that index is restored on the result.

    Raises
    ------
    ValueError
        If any of the preconditions above is violated. All detected
        problems are reported together, one per line.

    Notes
    -----
    If any of the columns_to_patch are not in target_df,
    they will be added (initialised to None).

    The patch is applied with DataFrame.update(join="left"), so only
    rows already present in target_df are affected; update copies
    non-NA values from source_df.

    This function works on a deep copy of target_df,
    so it will not alter the argument in-place.
    """
    patched = target_df.copy(deep=True)

    # A named index is temporarily turned into an ordinary column so that
    # index_column may refer to it; it is reinstated just before returning.
    saved_index_name = patched.index.name
    if saved_index_name is not None:
        patched = patched.reset_index()

    # Gather every problem first so the caller sees them all at once.
    problems = []

    if index_column not in patched.columns:
        problems.append(f"{index_column} not in target_df")

    if index_column in source_df.columns:
        join_keys = source_df[index_column].values
        if len(set(join_keys)) != len(join_keys):
            problems.append(
                f"{index_column} values in source_df are not unique"
            )
    else:
        problems.append(f"{index_column} not in source_df")

    for name in columns_to_patch:
        if name not in source_df:
            problems.append(f"{name} not in source_df")
        if name not in patched:
            # Create the column so that update() has something to fill.
            patched[name] = None

    if index_column in columns_to_patch:
        problems.append(
            f"{index_column} is in the list of "
            f"columns to patch {columns_to_patch}; "
            "unsure how to handle that case"
        )

    if problems:
        details = "".join(f"{line}\n" for line in problems)
        raise ValueError(f"failures in patch_df_from_other:\n{details}")

    # Align both frames on the join key and let pandas do the overwrite.
    patched = patched.set_index(index_column)
    replacement = source_df[columns_to_patch + [index_column]].set_index(
        index_column
    )
    patched.update(replacement, join="left", overwrite=True)
    patched = patched.reset_index()

    if saved_index_name is None:
        return patched
    return patched.set_index(saved_index_name)
def enforce_df_column_order(
    input_df: pd.DataFrame, column_order: List[str]
) -> pd.DataFrame:
    """Return the data frame but with columns ordered.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data frame with columns to be ordered.
    column_order : list of str
        Ordering of column names to enforce. Columns not specified are
        shifted to the end of the order but retain their order amongst
        others not specified. If a specified column is not in the DataFrame
        it is ignored.

    Returns
    -------
    output_df : pandas.DataFrame
        DataFrame the same as the input but with columns reordered.
    """
    # Use only the requested columns that actually exist in the data frame.
    leading = [col for col in column_order if col in input_df.columns]

    # Append the remaining columns in their original left-to-right order.
    # A set is used only for membership tests; iterating a set here would
    # make the order of the trailing columns arbitrary (and, for string
    # names, different from one interpreter run to the next).
    # dict.fromkeys keeps one entry per distinct column name, in order.
    placed = set(leading)
    trailing = [
        col for col in dict.fromkeys(input_df.columns) if col not in placed
    ]

    return input_df[leading + trailing]
def enforce_df_int_typing(
    input_df: pd.DataFrame,
    int_columns: List[str],
    use_pandas_type: object = False
) -> pd.DataFrame:
    """Cast the requested columns of a DataFrame to an integer type.

    Intended for columns that may have lost their int typing (for example
    because missing values were introduced) when combined into the final
    DataFrame.

    Parameters
    ----------
    input_df : pandas.DataFrame
        DataFrame with typing to enforce. The columns are reassigned on
        this object directly; no copy is made.
    int_columns : list of str
        Columns to enforce int typing on. Requested columns not in the
        dataframe are ignored.
    use_pandas_type : bool
        Evaluated for truthiness. When false (the default), NaN/None
        values are filled with the module-level INT_NULL value and the
        column is cast with ``astype(int)``. When true, the column is cast
        to the pandas type Int64 instead, which keeps missing values as
        N/A. This type can have issues converting to numpy/array type
        values.

    Returns
    -------
    output_df : pandas.DataFrame
        The same ``input_df`` object, with the requested columns that
        exist converted to an integer dtype.
    """
    # Resolve up front which of the requested columns actually exist.
    present = [name for name in int_columns if name in input_df.columns]

    for name in present:
        series = input_df[name]
        if use_pandas_type:
            converted = series.astype("Int64")
        else:
            converted = series.fillna(INT_NULL).astype(int)
        input_df[name] = converted

    return input_df
def return_one_dataframe_row_only(
    input_table: pd.DataFrame, index_value: int, table_name: str
) -> pd.Series:
    """Fetch exactly one row of a DataFrame by index label.

    The lookup is done with ``input_table.loc[index_value]``. If the label
    is absent, or if it selects more than one row, a RuntimeError with an
    informative message is raised instead of the raw pandas result. The
    original docstring describes the main use as retrieving metadata from
    the behavior cache metadata tables.

    Parameters
    ----------
    input_table : pandas.DataFrame
        Input dataframe to retrieve row from.
    index_value : int
        Index of the row to return. Must match an index in the input
        dataframe/table. i.e. in the case of ecephys_session_table or
        behavior_session_table.
    table_name : str
        Name of the table being returned. Used to output the table name
        in case of error.

    Returns
    -------
    row : pandas.Series
        Row corresponding to the input index.

    Raises
    ------
    RuntimeError
        If no row, or more than one row, is indexed by ``index_value``.
    """

    def _complaint(detail: str) -> RuntimeError:
        # Both failure modes share the same opening sentence.
        return RuntimeError(
            f"The {table_name} should have 1 and only 1 entry for a given "
            f"{input_table.index.name}. {detail}"
        )

    try:
        match = input_table.loc[index_value]
    except KeyError:
        raise _complaint(f"No indexed rows found for id={index_value}")

    # .loc yields a Series for a single matching row; anything else means
    # the label was duplicated in the index.
    if isinstance(match, pd.Series):
        return match

    raise _complaint(f"For {index_value}  there are {len(match)} entries.")