Source code for allensdk.api.cloud_cache.manifest
from typing import Dict, List, Any
import json
import pathlib
from typing import Union
from allensdk.api.cloud_cache.utils import relative_path_from_url # noqa: E501
from allensdk.api.cloud_cache.file_attributes import CacheFileAttributes # noqa: E501
class Manifest(object):
    """
    A class for loading and manipulating the online manifest.json associated
    with a dataset release

    Each Manifest instance should represent the data for 1 and only 1
    manifest.json file.

    Parameters
    ----------
    cache_dir: str or pathlib.Path
        The path to the directory where local copies of files will be stored
    json_input:
        A ``.read()``-supporting file-like object containing
        a JSON document to be deserialized (i.e. same as the
        first argument to json.load)
    use_static_project_dir: bool
        When determining what the local path of a remote resource
        (data or metadata file) should be, the Manifest class will typically
        create a versioned project subdirectory under the user provided
        `cache_dir` (e.g. f"{cache_dir}/{project_name}-{manifest_version}")
        to allow the possibility of multiple manifest (and data) versions to be
        used. In certain cases, like when using a project's s3 bucket
        directly as the cache_dir, the project directory name needs to be
        static (e.g. f"{cache_dir}/{project_name}"). When set to True,
        the Manifest class will use a static project directory to determine
        local paths for remote resources. Defaults to False.

    Raises
    ------
    ValueError
        If cache_dir is neither a str nor a pathlib.Path, or if the JSON
        document does not deserialize into a dict
    KeyError
        If the deserialized manifest lacks one of the top-level keys read
        by the constructor: 'project_name', 'manifest_version',
        'metadata_file_id_column_name', 'data_pipeline', 'metadata_files'
        or 'data_files'
    """

    def __init__(
        self,
        cache_dir: Union[str, pathlib.Path],
        json_input,
        use_static_project_dir: bool = False
    ):
        # Store the cache directory as a resolved (absolute) path so that
        # every local path derived from it is absolute as well.
        if isinstance(cache_dir, str):
            self._cache_dir = pathlib.Path(cache_dir).resolve()
        elif isinstance(cache_dir, pathlib.Path):
            self._cache_dir = cache_dir.resolve()
        else:
            raise ValueError("cache_dir must be either a str "
                             "or a pathlib.Path; "
                             f"got {type(cache_dir)}")

        self._use_static_project_dir = use_static_project_dir

        self._data: Dict[str, Any] = json.load(json_input)
        if not isinstance(self._data, dict):
            raise ValueError("Expected to deserialize manifest into a dict; "
                             f"instead got {type(self._data)}")

        self._project_name: str = self._data["project_name"]
        self._version: str = self._data['manifest_version']
        self._file_id_column: str = self._data['metadata_file_id_column_name']
        self._data_pipeline: str = self._data["data_pipeline"]

        # Keep sorted copies of the metadata file names and of the data
        # file ids so that the corresponding properties have a
        # deterministic order.
        self._metadata_file_names: List[str] = sorted(
            self._data['metadata_files']
        )
        self._file_id_values: List[Any] = sorted(
            self._data['data_files'].keys()
        )

    @property
    def project_name(self):
        """
        The name of the project whose data and metadata files this
        manifest tracks.
        """
        return self._project_name

    @property
    def version(self):
        """
        The version of the dataset currently loaded
        """
        return self._version

    @property
    def file_id_column(self):
        """
        The column in the metadata files used to uniquely
        identify data files
        """
        return self._file_id_column

    @property
    def metadata_file_names(self):
        """
        List of metadata file names associated with this dataset
        (sorted)
        """
        return self._metadata_file_names

    @property
    def file_id_values(self):
        """
        List of valid file_id values (sorted)
        """
        return self._file_id_values

    def _create_file_attributes(self,
                                remote_path: str,
                                version_id: str,
                                file_hash: str) -> "CacheFileAttributes":
        """
        Create the cache_file_attributes describing a file.
        This method does the work of assigning a local_path for a remote file.

        Parameters
        ----------
        remote_path: str
            The full URL to a file
        version_id: str
            The string specifying the version of the file
        file_hash: str
            The (hexadecimal) file hash of the file

        Returns
        -------
        CacheFileAttributes
        """
        if self._use_static_project_dir:
            # If we only want to support 1 version of the project on disk
            # like when mounting the project S3 bucket as a file system
            project_dir_name = f"{self._project_name}"
        else:
            # If we want to support multiple versions of the project on disk
            # paths should be built like:
            # {cache_dir} / {project_name}-{manifest_version} / relative_path
            # Example:
            # my_cache_dir/visual-behavior-ophys-1.0.0/behavior_sessions/etc...
            project_dir_name = f"{self._project_name}-{self._version}"

        project_dir = self._cache_dir / project_dir_name

        # The convention of the data release tool is to have all
        # relative_paths from remote start with the project name which
        # we want to remove since we already specified a project_dir_name
        relative_path = relative_path_from_url(remote_path)
        shaved_rel_path = "/".join(relative_path.split("/")[1:])

        local_path = project_dir / shaved_rel_path

        return CacheFileAttributes(
            remote_path,
            version_id,
            file_hash,
            local_path
        )

    def data_file_attributes(self, file_id) -> "CacheFileAttributes":
        """
        Return the CacheFileAttributes associated with a data file

        Parameters
        ----------
        file_id:
            The identifier of the data file whose attributes are to be
            returned. Must be a key in self._data['data_files']

        Returns
        -------
        CacheFileAttributes

        Raises
        ------
        RuntimeError
            If you try to run this method when self._data is None (meaning
            you haven't yet loaded a manifest.json file)
        ValueError
            If the file_id is not a valid option
        """
        # Defensive guard: __init__ always leaves self._data as a dict (or
        # raises), so this only triggers if self._data was reset afterwards.
        if self._data is None:
            raise RuntimeError("You cannot retrieve data_file_attributes;\n"
                               "you have not yet loaded a manifest.json file")

        if file_id not in self._data['data_files']:
            # List the valid ids in sorted order so the message is stable.
            valid_keys = sorted(self._data['data_files'].keys())
            raise ValueError(f"file_id: {file_id}\n"
                             "Is not a data file listed in manifest:\n"
                             f"{valid_keys}")

        file_data = self._data['data_files'][file_id]
        return self._create_file_attributes(file_data['url'],
                                            file_data['version_id'],
                                            file_data['file_hash'])