Source code for allensdk.api.cloud_cache.utils

from typing import Optional, Union
from pathlib import Path
import warnings
import re
import urllib.parse as url_parse
import hashlib


def bucket_name_from_url(url: str) -> Optional[str]:
    """
    Read in a URL and return the name of the AWS S3 bucket it
    points towards.

    Parameters
    ----------
    url: str
        A generic URL, suitable for retrieving an S3 object via
        an HTTP GET request.

    Returns
    -------
    str
        An AWS S3 bucket name. Note: if 's3.amazonaws.com' does
        not occur in the URL, this method will return None and
        emit a warning.

    Notes
    -----
    URLs passed to this method should conform to the "new" scheme
    as described here
    https://aws.amazon.com/blogs/aws/amazon-s3-path-deprecation-plan-the-rest-of-the-story/
    """
    # Matches the S3 endpoint portion of a virtual-hosted-style URL,
    # e.g. '.s3.us-west-2.amazonaws.com' or '.s3.amazonaws.com'
    s3_pattern = re.compile(r'\.s3[.a-z0-9-]*\.amazonaws\.com')
    url_params = url_parse.urlparse(url)
    raw_location = url_params.netloc
    s3_match = s3_pattern.search(raw_location)
    if s3_match is None:
        warnings.warn(f"{s3_pattern} does not occur in url {url}")
        return None
    # Strip the S3 endpoint suffix; what remains is the bucket name
    s3_suffix = raw_location[s3_match.start():s3_match.end()]
    return raw_location.replace(s3_suffix, '')
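
# A minimal usage sketch (not part of the original module), assuming a
# hypothetical virtual-hosted-style URL; the bucket name is whatever
# precedes the '.s3...amazonaws.com' endpoint in the hostname:
#
#     >>> bucket_name_from_url(
#     ...     'https://my-bucket.s3.us-west-2.amazonaws.com/data/file.json')
#     'my-bucket'
#
# A path-style ("old" scheme) URL such as
# 'https://s3.amazonaws.com/my-bucket/data/file.json' places the bucket
# in the path rather than the hostname, so it would return None with a
# warning.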


def relative_path_from_url(url: str) -> str:
    """
    Read in a URL and return the relative path of the object

    Parameters
    ----------
    url: str
        The URL of the object whose path you want

    Returns
    -------
    str:
        Relative path of the object

    Notes
    -----
    This method returns a str rather than a pathlib.Path because it
    is used to get the S3 object Key from a URL. If using
    pathlib.Path on a Windows system, the '/' would get transformed
    into '\\', confusing S3.
    """
    url_params = url_parse.urlparse(url)
    # Drop the leading '/' so the result is a valid S3 object Key
    return url_params.path[1:]
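
# A minimal usage sketch (not part of the original module), reusing the
# hypothetical URL from above; the leading '/' is stripped so the
# result can be used directly as an S3 object Key:
#
#     >>> relative_path_from_url(
#     ...     'https://my-bucket.s3.us-west-2.amazonaws.com/data/file.json')
#     'data/file.json'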


def file_hash_from_path(file_path: Union[str, Path]) -> str:
    """
    Return the hexadecimal file hash for a file

    Parameters
    ----------
    file_path: Union[str, Path]
        path to a file

    Returns
    -------
    str:
        The file hash (Blake2b; hexadecimal) of the file
    """
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as in_file:
        # Read in 1 MB chunks so that large files never have to be
        # held in memory all at once
        chunk = in_file.read(1000000)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(1000000)
    return hasher.hexdigest()
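
# A minimal usage sketch (not part of the original module), assuming a
# hypothetical local file; blake2b's default digest is 64 bytes, so the
# returned string is 128 hexadecimal characters:
#
#     >>> digest = file_hash_from_path('/tmp/example.nwb')  # hypothetical
#     >>> len(digest)
#     128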