Source code for allensdk.core.dataframe_utils

from typing import List
import pandas as pd


def patch_df_from_other(
        target_df: pd.DataFrame,
        source_df: pd.DataFrame,
        columns_to_patch: List[str],
        index_column: str) -> pd.DataFrame:
    """
    Overwrite column values in target_df from column
    values in source_df in rows where the two dataframes
    share a value of index_column.

    Parameters
    ----------
    target_df: pd.DataFrame
        The dataframe whose columns will get overwritten

    source_df: pd.DataFrame
        The dataframe from which correct values are to be read.
        Its values of index_column must be unique.

    columns_to_patch: List[str]
        The columns to be overwritten

    index_column: str
        The column to join the dataframes on. May also be the
        name of target_df's (single-level, named) index.

    Returns
    -------
    patched_df: pd.DataFrame
        target_df except with the specified columns and rows
        overwritten. The index of target_df is preserved.

    Raises
    ------
    ValueError
        If index_column is missing from either dataframe, if the
        values of index_column in source_df are not unique, if any
        of columns_to_patch is missing from source_df, or if
        index_column is itself listed in columns_to_patch. All
        problems found are reported in a single message.

    Notes
    -----
    If any of the columns_to_patch are not in target_df, they
    will be added (filled with None in rows that are not patched).

    Patching is done with pd.DataFrame.update, so null values in
    source_df do not overwrite existing values in target_df.

    As a side effect of the join, index_column ends up as the
    first column of the result (or as its index, if it was the
    index of target_df).

    This function starts by creating a copy of target_df, so
    it will not alter the argument in-place.
    """
    patched_df = target_df.copy(deep=True)

    # Keep hold of the caller's index. A named index is turned into a
    # regular column (so that it can serve as index_column) and is put
    # back by name at the end; an unnamed index is re-attached as-is,
    # since it would otherwise be discarded by set_index() below.
    saved_index = patched_df.index
    original_index = saved_index.name
    if original_index is not None:
        patched_df = patched_df.reset_index()

    # Collect every validation failure so that the caller sees all of
    # them at once rather than one per call.
    errors = []
    if index_column not in patched_df.columns:
        errors.append(f"{index_column} not in target_df\n")

    if index_column not in source_df.columns:
        errors.append(f"{index_column} not in source_df\n")
    else:
        index_values = source_df[index_column].values
        if len(set(index_values)) != len(index_values):
            errors.append(
                f"{index_column} values in source_df are not unique\n")

    for column in columns_to_patch:
        if column not in source_df:
            errors.append(f"{column} not in source_df\n")
        if column not in patched_df:
            # Create the column so that update() has somewhere to write.
            patched_df[column] = None

    if index_column in columns_to_patch:
        errors.append(f"{index_column} is in the list of "
                      f"columns to patch {columns_to_patch}; "
                      "unsure how to handle that case\n")

    if errors:
        msg = f"failures in patch_df_from_other:\n{''.join(errors)}"
        raise ValueError(msg)

    # Align both frames on index_column so that update() matches rows
    # by that key rather than by position.
    patched_df = patched_df.set_index(index_column)

    # list() lets any sequence of column names (e.g. a tuple) be used.
    patch_df = source_df[list(columns_to_patch) + [index_column]]
    patch_df = patch_df.set_index(index_column)

    patched_df.update(
        patch_df,
        join='left',
        overwrite=True)

    # None of the steps above reorder or drop rows, so the original
    # index can be restored positionally.
    patched_df = patched_df.reset_index()
    if original_index is not None:
        patched_df = patched_df.set_index(original_index)
    else:
        patched_df.index = saved_index
    return patched_df
Source code for allensdk.core.dataframe_utils

Contents

Questions