Source code for allensdk.api.cloud_cache.utils

from typing import Optional, Union
from pathlib import Path
import warnings
import re
import urllib.parse as url_parse
import hashlib


def bucket_name_from_url(url: str) -> Optional[str]:
    """
    Read in a URL and return the name of the AWS S3 bucket it
    points towards.

    Parameters
    ----------
    url: str
        A generic URL, suitable for retrieving an S3 object via
        an HTTP GET request.

    Returns
    -------
    str
        An AWS S3 bucket name. Note: if 's3.amazonaws.com' does
        not occur in the URL, this method will return None and
        emit a warning.

    Notes
    -----
    URLs passed to this method should conform to the "new" scheme
    as described here
    https://aws.amazon.com/blogs/aws/amazon-s3-path-deprecation-plan-the-rest-of-the-story/
    """
    # Matches the S3 endpoint portion of a virtual-hosted-style URL,
    # e.g. '.s3.us-west-2.amazonaws.com' or '.s3.amazonaws.com'
    s3_pattern = re.compile(r'\.s3[.a-z0-9-]*\.amazonaws\.com')
    url_params = url_parse.urlparse(url)
    raw_location = url_params.netloc
    s3_match = s3_pattern.search(raw_location)
    if s3_match is None:
        warnings.warn(f"{s3_pattern} does not occur in url {url}")
        return None
    # Strip the S3 endpoint suffix; what remains is the bucket name
    s3_suffix = raw_location[s3_match.start():s3_match.end()]
    return raw_location.replace(s3_suffix, '')
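
# A minimal usage sketch (not part of the original module), assuming a
# hypothetical virtual-hosted-style URL; the bucket name is whatever
# precedes the '.s3...amazonaws.com' endpoint in the hostname:
#
#     >>> bucket_name_from_url(
#     ...     'https://my-bucket.s3.us-west-2.amazonaws.com/data/file.json')
#     'my-bucket'
#
# A path-style ("old" scheme) URL such as
# 'https://s3.amazonaws.com/my-bucket/data/file.json' places the bucket
# in the path rather than the hostname, so it would return None with a
# warning.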


def relative_path_from_url(url: str) -> str:
    """
    Read in a URL and return the relative path of the object

    Parameters
    ----------
    url: str
        The URL of the object whose path you want

    Returns
    -------
    str:
        Relative path of the object

    Notes
    -----
    This method returns a str rather than a pathlib.Path because it
    is used to get the S3 object Key from a URL. If using
    pathlib.Path on a Windows system, the '/' would get transformed
    into '\\', confusing S3.
    """
    url_params = url_parse.urlparse(url)
    # Drop the leading '/' so the result is a valid S3 object Key
    return url_params.path[1:]
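
# A minimal usage sketch (not part of the original module), reusing the
# hypothetical URL from above; the leading '/' is stripped so the
# result can be used directly as an S3 object Key:
#
#     >>> relative_path_from_url(
#     ...     'https://my-bucket.s3.us-west-2.amazonaws.com/data/file.json')
#     'data/file.json'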


def file_hash_from_path(file_path: Union[str, Path]) -> str:
    """
    Return the hexadecimal file hash for a file

    Parameters
    ----------
    file_path: Union[str, Path]
        path to a file

    Returns
    -------
    str:
        The file hash (Blake2b; hexadecimal) of the file
    """
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as in_file:
        # Read in 1 MB chunks so that large files never have to be
        # held in memory all at once
        chunk = in_file.read(1000000)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(1000000)
    return hasher.hexdigest()
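
# A minimal usage sketch (not part of the original module), assuming a
# hypothetical local file; blake2b's default digest is 64 bytes, so the
# returned string is 128 hexadecimal characters:
#
#     >>> digest = file_hash_from_path('/tmp/example.nwb')  # hypothetical
#     >>> len(digest)
#     128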