Source code for allensdk.api.cloud_cache.utils
from typing import Optional, Union
from pathlib import Path
import warnings
import re
import urllib.parse as url_parse
import hashlib

def bucket_name_from_url(url: str) -> Optional[str]:
    """
    Read in a URL and return the name of the AWS S3 bucket it points towards.

    Parameters
    ----------
    url: str
        A generic URL, suitable for retrieving an S3 object via an
        HTTP GET request.

    Returns
    -------
    str
        An AWS S3 bucket name. Note: if 's3.amazonaws.com' does not occur in
        the URL, this method will return None and emit a warning.

    Note
    ----
    URLs passed to this method should conform to the "new" scheme as described
    here
    https://aws.amazon.com/blogs/aws/amazon-s3-path-deprecation-plan-the-rest-of-the-story/
    """
    s3_pattern = re.compile(r'\.s3[.a-z0-9-]*\.amazonaws\.com')
    url_params = url_parse.urlparse(url)
    raw_location = url_params.netloc
    s3_match = s3_pattern.search(raw_location)
    if s3_match is None:
        warnings.warn(f"{s3_pattern} does not occur in url {url}")
        return None
    s3_match = raw_location[s3_match.start():s3_match.end()]
    return url_params.netloc.replace(s3_match, '')
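

# A minimal usage sketch, not part of the original module. The bucket name and
# region below are hypothetical; any virtual-hosted-style S3 URL should behave
# the same way.
def _example_bucket_name_usage() -> None:
    # the endpoint suffix ('.s3.us-west-2.amazonaws.com') is stripped from the
    # hostname, leaving only the bucket name
    url = 'https://my-bucket.s3.us-west-2.amazonaws.com/some/key.nwb'
    assert bucket_name_from_url(url) == 'my-bucket'
    # a URL that does not point at S3 returns None (and emits a warning)
    assert bucket_name_from_url('https://example.com/file.txt') is None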


def relative_path_from_url(url: str) -> str:
    """
    Read in a URL and return the relative path of the object.

    Parameters
    ----------
    url: str
        The URL of the object whose path you want

    Returns
    -------
    str:
        Relative path of the object

    Notes
    -----
    This method returns a str rather than a pathlib.Path because
    it is used to get the S3 object Key from a URL. If using
    pathlib.Path on a Windows system, the '/' will get transformed
    into '\\', confusing S3.
    """
    url_params = url_parse.urlparse(url)
    return url_params.path[1:]
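

# A minimal usage sketch, not part of the original module; the URL below is
# hypothetical. The leading '/' of the URL path is dropped so the result can
# be used directly as an S3 object key.
def _example_relative_path_usage() -> None:
    url = ('https://my-bucket.s3.us-west-2.amazonaws.com'
           '/visual-behavior/manifest.json')
    assert relative_path_from_url(url) == 'visual-behavior/manifest.json'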


def file_hash_from_path(file_path: Union[str, Path]) -> str:
    """
    Return the hexadecimal file hash for a file.

    Parameters
    ----------
    file_path: Union[str, Path]
        Path to a file

    Returns
    -------
    str:
        The file hash (Blake2b; hexadecimal) of the file
    """
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as in_file:
        # read and hash the file in 1 MB chunks so that large files
        # never have to be held in memory all at once
        chunk = in_file.read(1000000)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(1000000)
    return hasher.hexdigest()
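

# A minimal usage sketch, not part of the original module. It checks that the
# streamed digest matches hashlib.blake2b run over the same bytes; tmp_dir is
# a caller-supplied scratch directory (hypothetical).
def _example_file_hash_usage(tmp_dir: Union[str, Path]) -> None:
    data = b'example contents'
    tmp_path = Path(tmp_dir) / 'example.bin'
    tmp_path.write_bytes(data)
    assert file_hash_from_path(tmp_path) == hashlib.blake2b(data).hexdigest()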