Source code for allensdk.core.dataframe_utils

from typing import List
import pandas as pd


def patch_df_from_other(
        target_df: pd.DataFrame,
        source_df: pd.DataFrame,
        columns_to_patch: List[str],
        index_column: str) -> pd.DataFrame:
    """
    Overwrite column values in target_df from column values in
    source_df in rows where the two dataframes share a value of
    index_column.

    Parameters
    ----------
    target_df: pd.DataFrame
        The dataframe whose columns will get overwritten

    source_df: pd.DataFrame
        The dataframe from which correct values are to be read

    columns_to_patch: List[str]
        The columns to be overwritten. Any sequence of column names
        (list, tuple, ...) is accepted.

    index_column: str
        The column to join the dataframes on. Its values must be
        unique in source_df.

    Returns
    -------
    patched_df: pd.DataFrame
        target_df except with the specified columns and rows
        overwritten.

    Raises
    ------
    ValueError
        If index_column is missing from either dataframe, if its
        values are not unique in source_df, if any of columns_to_patch
        is missing from source_df, or if index_column is itself listed
        in columns_to_patch. Every problem found is reported in a
        single message.

    Notes
    -----
    If any of the columns_to_patch are not in target_df, they will
    be added (filled with None in rows that are not patched).

    This function starts by creating a copy of target_df, so it will
    not alter the argument in-place.

    Patching is done with pd.DataFrame.update, which only copies
    non-NA values; an NA value in source_df leaves the corresponding
    value of target_df untouched.

    A named index on target_df is restored on the result. An unnamed
    index is discarded and replaced by a default integer index.
    Column order is not preserved: the round trip through
    set_index/reset_index moves index_column to the front.
    """
    # Materialize once so that tuples and other sequences behave like
    # lists (list concatenation below would otherwise raise TypeError).
    patch_columns = list(columns_to_patch)

    target_df = target_df.copy(deep=True)

    # Temporarily demote a named index to a regular column so that it
    # survives the re-indexing on index_column below.
    original_index = target_df.index.name
    if original_index is not None:
        target_df = target_df.reset_index()

    # Collect every validation failure so the caller sees them all at
    # once instead of fixing them one by one.
    failures = []
    if index_column not in target_df.columns:
        failures.append(f"{index_column} not in target_df")

    if index_column not in source_df.columns:
        failures.append(f"{index_column} not in source_df")
    elif not source_df[index_column].is_unique:
        # A duplicated key would make the patch ambiguous.
        failures.append(
            f"{index_column} values in source_df are not unique")

    for column in patch_columns:
        if column not in source_df:
            failures.append(f"{column} not in source_df")
        if column not in target_df:
            # DataFrame.update only touches columns that already exist
            # in the target, so create the missing ones up front.
            target_df[column] = None

    if index_column in patch_columns:
        failures.append(
            f"{index_column} is in the list of "
            f"columns to patch {columns_to_patch}; "
            "unsure how to handle that case")

    if failures:
        details = "".join(f"{failure}\n" for failure in failures)
        raise ValueError(f"failures in patch_df_from_other:\n{details}")

    # Align both frames on index_column and let pandas overwrite the
    # matching cells in place.
    target_df = target_df.set_index(index_column)
    patch_df = source_df[patch_columns + [index_column]]
    patch_df = patch_df.set_index(index_column)
    target_df.update(
        patch_df,
        join='left',
        overwrite=True)

    target_df = target_df.reset_index()
    if original_index is not None:
        target_df = target_df.set_index(original_index)
    return target_df