Skip to content

Commit b13741e

Browse files
julien-c and mpariente authored
[hub] Support for huggingface model hub 🎉 (#377)
Co-authored-by: Pariente Manuel <pariente.mnl@gmail.com>
1 parent 77d1b74 commit b13741e

File tree

4 files changed

+295
-7
lines changed

4 files changed

+295
-7
lines changed

asteroid/utils/hub_utils.py

Lines changed: 248 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
1+
import fnmatch
2+
import io
3+
import json
14
import os
2-
from torch import hub
5+
import sys
6+
import tempfile
7+
from contextlib import contextmanager
8+
from functools import partial
39
from hashlib import sha256
10+
from typing import BinaryIO, Dict, Optional, Union
11+
from urllib.parse import urlparse
12+
13+
import requests
14+
import torch
15+
from filelock import FileLock
16+
from torch import hub
417

518

619
CACHE_DIR = os.getenv(
@@ -23,14 +36,18 @@
2336

2437
SR_HASHTABLE = {k: 8000.0 if not "DeMask" in k else 16000.0 for k in MODELS_URLS_HASHTABLE}
2538

# Name of the weights file inside a model repository on the Hugging Face model hub.
HF_WEIGHTS_NAME = "pytorch_model.bin"
# URL template resolving a (model_id, revision, filename) triple on huggingface.co.
# The placeholders match the keyword arguments passed by ``hf_bucket_url``.
HUGGINGFACE_CO_PREFIX = "https://huggingface.co/{model_id}/resolve/{revision}/{filename}"
2642

2743
def cached_download(filename_or_url):
28-
"""Download from URL with torch.hub and cache the result in ASTEROID_CACHE.
44+
"""Download from URL and cache the result in ASTEROID_CACHE.
2945
3046
Args:
3147
filename_or_url (str): Name of a model as named on the Zenodo Community
32-
page (ex: ``"mpariente/ConvTasNet_WHAM!_sepclean"``), or an URL to a model
33-
file (ex: ``"https://zenodo.org/.../model.pth"``), or a filename
48+
page (ex: ``"mpariente/ConvTasNet_WHAM!_sepclean"``), or model id from
49+
the Hugging Face model hub (ex: ``"julien-c/DPRNNTasNet-ks16_WHAM_sepclean"``),
50+
or a URL to a model file (ex: ``"https://zenodo.org/.../model.pth"``), or a filename
3451
that exists locally (ex: ``"local/tmp_model.pth"``)
3552
3653
Returns:
@@ -39,11 +56,22 @@ def cached_download(filename_or_url):
3956
if os.path.isfile(filename_or_url):
4057
return filename_or_url
4158

42-
if filename_or_url in MODELS_URLS_HASHTABLE:
59+
if urlparse(filename_or_url).scheme in ("http", "https"):
60+
url = filename_or_url
61+
elif filename_or_url in MODELS_URLS_HASHTABLE:
4362
url = MODELS_URLS_HASHTABLE[filename_or_url]
4463
else:
45-
# Give a chance to direct URL, torch.hub will handle exceptions
46-
url = filename_or_url
64+
# Finally, let's try to find it on Hugging Face model hub
65+
# e.g. julien-c/DPRNNTasNet-ks16_WHAM_sepclean is a valid model id
66+
# and julien-c/DPRNNTasNet-ks16_WHAM_sepclean@main supports specifying a commit/branch/tag.
67+
if "@" in filename_or_url:
68+
model_id = filename_or_url.split("@")[0]
69+
revision = filename_or_url.split("@")[1]
70+
else:
71+
model_id = filename_or_url
72+
revision = None
73+
url = hf_bucket_url(model_id=model_id, filename=HF_WEIGHTS_NAME, revision=revision)
74+
return hf_get_from_cache(url, cache_dir=get_cache_dir())
4775
cached_filename = url_to_filename(url)
4876
cached_dir = os.path.join(get_cache_dir(), cached_filename)
4977
cached_path = os.path.join(cached_dir, "model.pth")
@@ -68,3 +96,216 @@ def url_to_filename(url):
6896
def get_cache_dir():
    """Return the Asteroid cache directory, creating it on first use."""
    cache_dir = CACHE_DIR
    # exist_ok makes repeated calls (and concurrent callers) harmless.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir
99+
100+
101+
def hf_bucket_url(
    model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None
) -> str:
    """
    Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting
    to Cloudfront (a Content Delivery Network, or CDN) for large files.

    Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
    bandwidth costs).

    Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
    because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront
    in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
    can't ever be stale.

    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is:
    its sha1 if stored in git, or its sha256 if stored in git-lfs.

    Args:
        model_id (str): Model id on the hub (e.g. ``"julien-c/DPRNNTasNet-ks16_WHAM_sepclean"``).
        filename (str): Name of the file to resolve inside the repo.
        subfolder (Optional[str]): Repo subfolder containing the file, if any.
        revision (Optional[str]): Commit sha / branch / tag; defaults to ``"main"``.

    Returns:
        str: Fully-resolved huggingface.co URL for the requested file.
    """
    if subfolder is not None:
        # Files in a subfolder are addressed as "<subfolder>/<filename>" in the repo.
        filename = f"{subfolder}/{filename}"

    if revision is None:
        revision = "main"
    return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
125+
126+
127+
def hf_url_to_filename(url: str, etag: Optional[str] = None) -> str:
    """
    Map ``url`` (and optionally ``etag``) to a deterministic cache filename.

    The name is the sha256 hex digest of the URL; when an etag is given, a
    period and the sha256 hex digest of the etag are appended, so different
    versions of the same URL get distinct cache entries.
    """
    name_parts = [sha256(url.encode("utf-8")).hexdigest()]
    if etag:
        name_parts.append(sha256(etag.encode("utf-8")).hexdigest())
    return ".".join(name_parts)
140+
141+
142+
def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
    """
    Build the user-agent header string sent with hub requests.

    Always includes the asteroid, python and torch versions; ``user_agent``
    (a string, or a dict rendered as ``key/value`` pairs) is appended when given.
    """
    from .. import __version__ as asteroid_version  # Avoid circular imports

    parts = [
        "asteroid/{}".format(asteroid_version),
        "python/{}".format(sys.version.split()[0]),
        "torch/{}".format(torch.__version__),
    ]
    if isinstance(user_agent, dict):
        parts.extend("{}/{}".format(k, v) for k, v in user_agent.items())
    elif isinstance(user_agent, str):
        parts.append(user_agent)
    return "; ".join(parts)
155+
156+
157+
def http_get(
    url: str,
    temp_file: BinaryIO,
    proxies=None,
    resume_size=0,
    user_agent: Union[Dict, str, None] = None,
):
    """
    Download a remote file into ``temp_file``. Do not gobble up errors:
    any HTTP error status raises via ``raise_for_status``.
    """
    headers = {"user-agent": http_user_agent(user_agent)}
    if resume_size > 0:
        # Ask the server for only the remainder of a partially-downloaded file.
        headers["Range"] = "bytes=%d-" % (resume_size,)
    response = requests.get(url, stream=True, proxies=proxies, headers=headers)
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=1024):
        # Empty chunks are keep-alive heartbeats, not payload data.
        if not chunk:
            continue
        temp_file.write(chunk)
175+
176+
177+
def hf_get_from_cache(
    url: str,
    cache_dir: str,
    force_download=False,
    proxies=None,
    etag_timeout=10,
    resume_download=False,
    user_agent: Union[Dict, str, None] = None,
    local_files_only=False,
) -> Optional[str]:  # pragma: no cover
    """
    Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
    path to the cached file.

    Args:
        url (str): URL of the file to fetch.
        cache_dir (str): Directory in which downloads are cached.
        force_download (bool): If True, re-download even when a cached copy exists.
        proxies: Optional proxies mapping forwarded to ``requests``.
        etag_timeout (int): Timeout (seconds) for the HEAD request fetching the ETag.
        resume_download (bool): If True, resume a previously interrupted download
            from a ``*.incomplete`` file instead of starting over.
        user_agent (Union[Dict, str, None]): Extra info appended to the user-agent header.
        local_files_only (bool): If True, never hit the network; serve only from cache.

    Return:
        Local path (string) of file or if networking is off, last version of file cached on disk.

    Raises:
        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
    """

    os.makedirs(cache_dir, exist_ok=True)

    url_to_download = url
    etag = None
    if not local_files_only:
        try:
            # HEAD request only: we need the ETag for cache addressing, not the payload.
            headers = {"user-agent": http_user_agent(user_agent)}
            r = requests.head(
                url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout
            )
            r.raise_for_status()
            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
            # We favor a custom header indicating the etag of the linked resource, and
            # we fallback to the regular etag header.
            # If we don't have any of those, raise an error.
            if etag is None:
                raise OSError(
                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
                )
            # In case of a redirect,
            # save an extra redirect on the request.get call,
            # and ensure we download the exact atomic version even if it changed
            # between the HEAD and the GET (unlikely, but hey).
            if 300 <= r.status_code <= 399:
                url_to_download = r.headers["Location"]
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # etag is already None
            pass

    filename = hf_url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None == we don't have a connection or we passed local_files_only.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path):
            return cache_path
        else:
            # Fall back to any earlier download of this URL: same URL-hash prefix,
            # any ETag suffix — skipping metadata (.json) and lock (.lock) files.
            matching_files = [
                file
                for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if len(matching_files) > 0:
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False
                # Notify the user about that
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and outgoing traffic has been"
                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                        " to False."
                    )
                else:
                    raise ValueError(
                        "Connection error, and we cannot find the requested files in the cached path."
                        " Please try again or make sure your Internet connection is on."
                    )

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager() -> "io.BufferedWriter":
                # Append mode so a partial download keeps growing across attempts.
                with open(incomplete_path, "ab") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(
                tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
            )
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            http_get(
                url_to_download,
                temp_file,
                proxies=proxies,
                resume_size=resume_size,
                user_agent=user_agent,
            )

        # Single rename into place so readers never observe a partial file.
        os.replace(temp_file.name, cache_path)

        # Record provenance (url + etag) next to the cached file.
        meta = {"url": url, "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
"torch_stoi",
3030
"asteroid-filterbanks",
3131
"librosa",
32+
"filelock",
33+
"requests",
3234
],
3335
extras_require={
3436
"tests": ["pytest"],

tests/models/models_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
from asteroid.models.base_models import BaseModel
2323

2424

HF_EXAMPLE_MODEL_IDENTIFER = "julien-c/DPRNNTasNet-ks16_WHAM_sepclean"
# An actual model hosted on huggingface.co; downloading it exercises the hub code path.
28+
2529
def test_set_sample_rate_raises_warning():
2630
model = BaseModel(sample_rate=8000.0)
2731
with pytest.warns(UserWarning):
@@ -90,6 +94,11 @@ def test_dprnntasnet_sep():
9094
assert isinstance(out, np.ndarray)
9195

9296

97+
def test_dprnntasnet_sep_from_hf():
    # Integration test: downloads a real checkpoint from the Hugging Face hub
    # (requires network access) and checks it loads as the right model class.
    model = DPRNNTasNet.from_pretrained(HF_EXAMPLE_MODEL_IDENTIFER)
    assert isinstance(model, DPRNNTasNet)
100+
101+
93102
@pytest.mark.parametrize("fb", ["free", "stft", "analytic_free", "param_sinc"])
94103
def test_save_and_load_dprnn(fb):
95104
_default_test_model(

tests/utils/hub_utils_test.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,46 @@
22
from asteroid.utils import hub_utils
33

44

5+
HF_EXAMPLE_MODEL_IDENTIFER = "julien-c/DPRNNTasNet-ks16_WHAM_sepclean"
# An actual model hosted on huggingface.co

REVISION_ID_ONE_SPECIFIC_COMMIT = "8ab5ef18ef2eda141dd11a5d037a8bede7804ce4"
# One particular commit (not the top of `main`), used to test the `model_id@revision` syntax.
10+
11+
512
def test_download():
    # Integration test (requires network): first call downloads from Zenodo...
    path1 = hub_utils.cached_download("mpariente/ConvTasNet_WHAM!_sepclean")
    assert os.path.isfile(path1)
    # ...second call must hit the cache and return the same path.
    path2 = hub_utils.cached_download("mpariente/ConvTasNet_WHAM!_sepclean")
    assert path1 == path2
19+
20+
21+
def test_hf_download():
    # Integration test (requires network): first call downloads from the HF hub...
    path1 = hub_utils.cached_download(HF_EXAMPLE_MODEL_IDENTIFER)
    assert os.path.isfile(path1)
    # ...second call must hit the cache and return the same path.
    path2 = hub_utils.cached_download(HF_EXAMPLE_MODEL_IDENTIFER)
    assert path1 == path2
    # However if specifying a particular commit,
    # file will be different (distinct ETag => distinct cache entry).
    path3 = hub_utils.cached_download(
        f"{HF_EXAMPLE_MODEL_IDENTIFER}@{REVISION_ID_ONE_SPECIFIC_COMMIT}"
    )
    assert path3 != path1
34+
35+
36+
def test_http_user_agent():
    """String and dict forms of the extra user-agent info must render identically."""
    from_string = hub_utils.http_user_agent("foobar/1")
    assert "foobar/1" in from_string
    from_dict = hub_utils.http_user_agent({"foobar": 1})
    assert from_string == from_dict
41+
42+
43+
def test_hf_bucket_url():
    """hf_bucket_url must accept all optional arguments and produce a string URL."""
    resolved = hub_utils.hf_bucket_url(
        model_id="julien-c/foo",
        filename="model.bin",
        subfolder="folder",
        revision="v1.0.0",
    )
    assert isinstance(resolved, str)

0 commit comments

Comments
 (0)