Source code for allensdk.core.pickle_utils
from typing import Union, Any
import pickle
import gzip
import pathlib
def load_and_sanitize_pickle(
        pickle_path: Union[str, pathlib.Path]) -> Any:
    """
    Read a (possibly gzipped) pickle file and hand its contents to
    _sanitize_pickle_data, which casts bytes in the data to str.

    Parameters
    ----------
    pickle_path: Union[str, pathlib.Path]
        Location of the pickle file. The file name must end in
        'gz' (opened with gzip.open) or 'pkl' (opened with the
        builtin open).

    Returns
    -------
    pickle_data: Any
        The sanitized contents of the pickle file

    Raises
    ------
    ValueError
        If the file name ends in neither 'gz' nor 'pkl'

    Notes
    -----
    Sanitizing modifies the loaded data in-place, so loading and
    sanitizing are bundled into a single call; callers never hold
    a reference that they might mistake for a pre-sanitization copy.

    Unpickling can execute arbitrary code. Only point this function
    at files that come from a trusted source.
    """
    if isinstance(pickle_path, str):
        pickle_path = pathlib.Path(pickle_path)

    # Choose how to open the file from the ending of its name;
    # 'gz' is checked first, exactly as a name like 'x.pkl.gz' needs.
    file_name = pickle_path.name
    opener = None
    for ending, candidate in (('gz', gzip.open), ('pkl', open)):
        if file_name.endswith(ending):
            opener = candidate
            break

    if opener is None:
        raise ValueError("Can open .pkl and .gz files; "
                         f"you gave {pickle_path.resolve().absolute()}")

    with opener(pickle_path, 'rb') as pickle_file:
        # encoding='bytes' makes 8-bit strings stored in the pickle
        # come back as bytes; the sanitizer decodes them afterwards.
        contents = pickle.load(pickle_file, encoding='bytes')

    return _sanitize_pickle_data(contents)
def _sanitize_pickle_data(
        raw_data: Any) -> Any:
    """
    Sometimes data read from the pickle file comes with keys and
    values that are strings; sometimes they come as bytes. This
    method dispatches on the type of the top-level object and casts
    the bytes it contains to strings.

    Parameters
    ----------
    raw_data: Any
        The object loaded from the pickle file

    Returns
    -------
    Any
        For a dict or list: the same object, sanitized in-place.
        For a tuple: the result of _sanitize_list_or_tuple.
        For bytes: the utf-8 decoded str.
        Anything else is returned untouched.

    Note
    ----
    Alters raw_data in-place when it is a dict or a list
    """
    if isinstance(raw_data, dict):
        return _sanitize_dict(raw_data)

    # Tuples are handled here as well as lists so that a top-level
    # tuple is treated the same way as a tuple nested inside a
    # list or dict.
    if isinstance(raw_data, (list, tuple)):
        return _sanitize_list_or_tuple(raw_data)

    # A bare bytes object at the top level is decoded the same way
    # nested bytes are.
    if isinstance(raw_data, bytes):
        return raw_data.decode('utf-8')

    return raw_data
def _sanitize_list(
        raw_data: list) -> list:
    """
    Walk a list read from the pickle file and replace every bytes
    element with its utf-8 decoded str. Nested lists, tuples and
    dicts are handed to the matching sanitizer and the slot is
    assigned whatever that sanitizer returns.

    Parameters
    ----------
    raw_data: list
        The list to sanitize

    Returns
    -------
    list
        The same list object that was passed in

    Note
    ----
    Alters raw_data in place
    """
    for position, item in enumerate(raw_data):
        if isinstance(item, bytes):
            raw_data[position] = item.decode('utf-8')
        elif isinstance(item, dict):
            raw_data[position] = _sanitize_dict(item)
        elif isinstance(item, (list, tuple)):
            raw_data[position] = _sanitize_list_or_tuple(item)
        # any other kind of element is left exactly as it is
    return raw_data
def _sanitize_tuple(
        raw_data: tuple) -> tuple:
    """
    Sanitize a tuple read from the pickle file, casting bytes
    into str and returning the sanitized tuple.

    Parameters
    ----------
    raw_data: tuple
        The tuple to sanitize

    Returns
    -------
    tuple
        A new tuple holding the sanitized elements

    Note
    ----
    The elements are copied into a temporary list, that list is
    run through _sanitize_list, and the result is converted back
    into a tuple.
    """
    return tuple(_sanitize_list(list(raw_data)))
def _sanitize_list_or_tuple(
        raw_data: Union[list, tuple]) -> Union[list, tuple]:
    """
    Dispatch a list or a tuple read from the pickle file to
    _sanitize_list or _sanitize_tuple respectively, and return
    what that sanitizer returns.

    Parameters
    ----------
    raw_data: Union[list, tuple]
        The iterable to sanitize

    Returns
    -------
    Union[list, tuple]
        The sanitized iterable

    Raises
    ------
    ValueError
        If raw_data is neither a list nor a tuple

    Note
    ----
    Alters raw_data in place (if a list)
    """
    if isinstance(raw_data, tuple):
        return _sanitize_tuple(raw_data)

    if isinstance(raw_data, list):
        return _sanitize_list(raw_data)

    raise ValueError("Can only process lists or tuples; "
                     f"you gave {type(raw_data)}")
def _sanitize_dict(
        raw_data: dict) -> dict:
    """
    Walk a dict read from the pickle file, decoding bytes keys and
    bytes values into str. Values that are lists, tuples or dicts
    are handed to the matching sanitizer.

    Parameters
    ----------
    raw_data: dict
        The dict to sanitize

    Returns
    -------
    dict
        The same dict object that was passed in

    Note
    ----
    Alters raw_data in-place
    """
    # Work from a snapshot of the keys: every entry is removed and
    # re-added below, which a live iterator over the dict would not
    # tolerate.
    for original_key in tuple(raw_data.keys()):
        value = raw_data.pop(original_key)

        if isinstance(original_key, bytes):
            clean_key = original_key.decode('utf-8')
        else:
            clean_key = original_key

        if isinstance(value, bytes):
            value = value.decode('utf-8')
        elif isinstance(value, dict):
            value = _sanitize_dict(value)
        elif isinstance(value, (list, tuple)):
            value = _sanitize_list_or_tuple(value)

        raw_data[clean_key] = value

    return raw_data