# Source code for allensdk.core.dataframe_utils
from typing import List
import pandas as pd
# Null value fill for integer Pandas.Series objects. NWB currently doesn't
# support using the new Int64 type that has explicit N/A values so we fill
# instead with -99.
INT_NULL = -99
"""A collection of utilities to manipulate pandas DataFrames."""
def patch_df_from_other(
    target_df: pd.DataFrame,
    source_df: pd.DataFrame,
    columns_to_patch: List[str],
    index_column: str,
) -> pd.DataFrame:
    """
    Overwrite column values in target_df with the corresponding
    values from source_df, matching rows on a shared index_column.

    Parameters
    ----------
    target_df: pd.DataFrame
        The dataframe whose columns will get overwritten
    source_df: pd.DataFrame
        The dataframe from which correct values are to be read
    columns_to_patch: List[str]
        The columns to be overwritten
    index_column: str
        The column to join the dataframes on

    Returns
    -------
    patched_df: pd.DataFrame
        target_df except with the specified columns and rows
        overwritten.

    Raises
    ------
    ValueError
        If index_column is missing from either dataframe, if its
        values in source_df are not unique, if any patch column is
        missing from source_df, or if index_column itself appears in
        columns_to_patch. All such problems are reported in a single
        message.

    Notes
    -----
    If any of the columns_to_patch are not in target_df, they
    will be added.

    This function starts by creating a copy of target_df, so
    it will not alter the argument in-place.

    Per pandas.DataFrame.update semantics, NaN values in source_df
    do not overwrite existing target values.
    """
    patched = target_df.copy(deep=True)

    # Remember a named index so it can be restored at the end; the
    # join itself happens on index_column instead.
    saved_index_name = patched.index.name
    if saved_index_name is not None:
        patched = patched.reset_index()

    # Collect every validation failure before raising, so the caller
    # sees all problems at once.
    problems = []
    if index_column not in patched.columns:
        problems.append(f"{index_column} not in target_df\n")
    if index_column not in source_df.columns:
        problems.append(f"{index_column} not in source_df\n")
    else:
        source_keys = source_df[index_column].values
        if len(set(source_keys)) != len(source_keys):
            problems.append(
                f"{index_column} values in source_df are not unique\n"
            )
    for col in columns_to_patch:
        if col not in source_df:
            problems.append(f"{col} not in source_df\n")
        if col not in patched:
            # Column will be created so update() has somewhere to write.
            patched[col] = None
    if index_column in columns_to_patch:
        problems.append(
            f"{index_column} is in the list of "
            f"columns to patch {columns_to_patch}; "
            "unsure how to handle that case\n"
        )
    if problems:
        raise ValueError(
            "failures in patch_df_from_other:\n" + "".join(problems)
        )

    # Align both frames on index_column and let pandas do the
    # row-wise overwrite.
    patched = patched.set_index(index_column)
    overlay = source_df[columns_to_patch + [index_column]]
    overlay = overlay.set_index(index_column)
    patched.update(overlay, join="left", overwrite=True)
    patched = patched.reset_index()

    if saved_index_name is not None:
        patched = patched.set_index(saved_index_name)
    return patched
def enforce_df_column_order(
    input_df: pd.DataFrame,
    column_order: List[str]
) -> pd.DataFrame:
    """Return the data frame but with columns ordered.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data frame with columns to be ordered.
    column_order : list of str
        Ordering of column names to enforce. Columns not specified are shifted
        to the end of the order but retain their order amongst others not
        specified. If a specified column is not in the DataFrame it is ignored.

    Returns
    -------
    output_df : pandas.DataFrame
        DataFrame the same as the input but with columns reordered.
    """
    # Use only columns that are in the input dataframe's columns.
    pruned_order = [col for col in column_order if col in input_df.columns]
    # BUG FIX: the previous implementation appended
    # set(input_df.columns).difference(pruned_order), which discards the
    # original relative order of the unspecified columns (set iteration
    # order is arbitrary). Iterate input_df.columns instead so the
    # unspecified columns keep their order, as documented above.
    specified = set(pruned_order)
    pruned_order.extend(
        col for col in input_df.columns if col not in specified
    )
    return input_df[pruned_order]
def enforce_df_int_typing(
    input_df: pd.DataFrame,
    int_columns: List[str],
    use_pandas_type: bool = False
) -> pd.DataFrame:
    """Enforce integer typing for columns that may have lost int typing when
    combined into the final DataFrame.

    Parameters
    ----------
    input_df : pandas.DataFrame
        DataFrame with typing to enforce.
    int_columns : list of str
        Columns to enforce int typing and fill any NaN/None values with the
        value set in INT_NULL in this file. Requested columns not in the
        dataframe are ignored.
    use_pandas_type : bool
        Instead of filling with the value INT_NULL to enforce integer typing,
        use the pandas type Int64. This type can have issues converting to
        numpy/array type values.

    Returns
    -------
    output_df : pandas.DataFrame
        DataFrame specific columns hard typed to Int64 to allow NA values
        without resorting to float type.

    Notes
    -----
    Columns are reassigned on input_df itself, so the input is modified
    in place in addition to being returned.
    """
    # Annotation fixed from ``object`` to ``bool``: the parameter is a
    # plain flag with a boolean default.
    for col in int_columns:
        if col not in input_df.columns:
            # Missing columns are silently skipped by contract.
            continue
        if use_pandas_type:
            # Nullable pandas integer dtype keeps explicit <NA> values.
            input_df[col] = input_df[col].astype("Int64")
        else:
            # NWB cannot store the nullable Int64 dtype, so N/A values
            # are filled with the INT_NULL sentinel instead.
            input_df[col] = input_df[col].fillna(INT_NULL).astype(int)
    return input_df
def return_one_dataframe_row_only(
    input_table: pd.DataFrame, index_value: int, table_name: str
) -> pd.Series:
    """Lookup and return one and only one row from the DataFrame returning
    an informative error if no or multiple rows are returned for a given
    index.

    This method is used mainly to return a more informative error when
    attempting to retrieve metadata from the values behavior cache metadata
    tables.

    Parameters
    ----------
    input_table : pandas.DataFrame
        Input dataframe to retrieve row from.
    index_value : int
        Index of the row to return. Must match an index in the input
        dataframe/table. i.e. in the case of ecephys_session_table or
        behavior_session_table.
    table_name : str
        Name of the table being returned. Used to output the table name
        in case of error.

    Returns
    -------
    row : pandas.Series
        Row corresponding to the input index.

    Raises
    ------
    RuntimeError
        If the index matches no row, or matches more than one row.
    """
    try:
        row = input_table.loc[index_value]
    except KeyError as err:
        # Chain the KeyError so the original lookup failure stays
        # visible in the traceback (previously it was dropped).
        raise RuntimeError(
            f"The {table_name} should have "
            "1 and only 1 entry for a given "
            f"{input_table.index.name}. No indexed rows found for "
            f"id={index_value}"
        ) from err
    if not isinstance(row, pd.Series):
        # .loc returned a DataFrame, meaning the index value matched
        # several rows. (Fixed a doubled space in the message below.)
        raise RuntimeError(
            f"The {table_name} should have "
            "1 and only 1 entry for a given "
            f"{input_table.index.name}. For "
            f"{index_value} "
            f"there are {len(row)} entries."
        )
    return row