import logging
import os
from pathlib import Path
from urllib.parse import urlparse
import requests
from tqdm.auto import tqdm
from .filelock import FileLock
# Names exported by ``from <this module> import *``.
__all__ = ["download_file", "download_cached", "download_file_cached"]
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def download_file(url, path, auth=None, chunk_size=10240, progress=False):
    """
    Download a file. Will write to ``path + '.part'`` while downloading
    and move it to the final name after a successful download.

    If anything goes wrong while the data is being written, the ``.part``
    file is removed again and the original exception is re-raised.

    Parameters
    ----------
    url: str or url
        The URL to download
    path: pathlib.Path or str
        Where to store the downloaded data.
    auth: None or tuple of (username, password) or a request.AuthBase instance.
        Authentication data, passed on to ``requests.get``.
    chunk_size: int
        Chunk size for writing the data file, 10 kB by default.
    progress: bool
        If true, show a progress bar while downloading.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status
        (raised by ``Response.raise_for_status``).
    """
    log.info(f"Downloading {url} to {path}")
    name = urlparse(url).path.split("/")[-1]
    path = Path(path)

    # download into a .part file to avoid creating
    # a broken file at the intended location
    part_file = path.with_suffix(path.suffix + ".part")

    with requests.get(url, stream=True, auth=auth, timeout=5) as r:
        # make sure the request is successful
        r.raise_for_status()

        # the total size is only known if the server sends Content-Length,
        # ``None`` tells tqdm that the total is unknown
        content_length = r.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None

        try:
            part_file.parent.mkdir(parents=True, exist_ok=True)

            # both the progress bar and the file are closed on exit,
            # no matter whether the download succeeded or not
            with tqdm(
                total=total,
                disable=not progress,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {name}",
            ) as pbar, part_file.open("wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    pbar.update(len(chunk))
        except BaseException:  # we really want to catch everything here
            # cleanup part file if something goes wrong
            if part_file.is_file():
                part_file.unlink()
            raise

    # when successful, move to intended location; ``replace`` overwrites an
    # existing target on all platforms, ``rename`` fails on Windows if the
    # target already exists
    part_file.replace(path)
def get_cache_path(url, cache_name="ctapipe", env_override="CTAPIPE_CACHE"):
    """
    Build the path in the local cache that corresponds to ``url``.

    The path is ``<base>/<netloc of url>/<path of url>``. ``<base>`` is the
    value of the environment variable named by ``env_override`` if that
    variable is set to a non-empty value, otherwise
    ``$HOME/.cache/<cache_name>``.

    Nothing is created on disk by this function.

    Parameters
    ----------
    url: str
        The URL for which the cache location is requested.
    cache_name: str
        Name of the cache directory below ``$HOME/.cache``.
    env_override: str
        Name of the environment variable that overrides the base directory.

    Returns
    -------
    path: pathlib.Path
        Location of the cached file for ``url``.
    """
    # read the variable that was actually requested by the caller,
    # not a hard-coded name
    override = os.getenv(env_override)
    if override:
        base = Path(override)
    else:
        base = Path.home() / ".cache" / cache_name

    url = urlparse(url)
    # strip the slashes so the URL path cannot be interpreted as absolute,
    # which would discard the host part when joining
    path = os.path.join(url.netloc.rstrip("/"), url.path.lstrip("/"))
    return base / path
def download_cached(
    url, cache_name="ctapipe", auth=None, env_prefix="CTAPIPE_DATA_", progress=False
):
    """
    Download ``url`` into the local cache unless it is already there.

    The cache location is determined by ``get_cache_path``. A lock file
    (cache path + ``'.lock'``) is held while checking for and downloading
    the file.

    Parameters
    ----------
    url: str
        The URL to download.
    cache_name: str
        What name to use for the cache directory.
    auth: True, None or tuple of (username, password)
        Authentication data, passed on to ``download_file``.
        If ``True``, username and password are read from the env variables
        ``env_prefix + 'USER'`` and ``env_prefix + 'PASSWORD'``.
    env_prefix: str
        Prefix of the env variables holding username and password.
    progress: bool
        Passed on to ``download_file``.

    Returns
    -------
    path: pathlib.Path
        Path of the file in the cache.

    Raises
    ------
    KeyError
        If ``auth`` is ``True``, the file is not cached yet and one of the
        credential env variables is not set.
    """
    cached = get_cache_path(url, cache_name=cache_name)
    cached.parent.mkdir(parents=True, exist_ok=True)
    lock = cached.with_suffix(cached.suffix + ".lock")

    with FileLock(lock):
        if not cached.is_file():
            # credentials are only looked up when a download is really needed
            if auth is True:
                try:
                    user = os.environ[env_prefix + "USER"]
                    password = os.environ[env_prefix + "PASSWORD"]
                except KeyError:
                    raise KeyError(
                        f'You need to set the env variables "{env_prefix}USER"'
                        f' and "{env_prefix}PASSWORD" to download test files.'
                    ) from None
                auth = (user, password)

            download_file(url=url, path=cached, auth=auth, progress=progress)
        else:
            # the file was downloaded before, just use it
            log.debug(f"{url} is available in cache.")

    return cached
def download_file_cached(
    name,
    cache_name="ctapipe",
    auth=None,
    env_prefix="CTAPIPE_DATA_",
    default_url="http://cccta-dataserver.in2p3.fr/data/",
    progress=False,
):
    """
    Downloads a file from a dataserver and caches the result locally
    below ``$HOME/.cache/<cache_name>`` (or below the directory given by
    the env variable ``CTAPIPE_CACHE``, if it is set).
    If the file is found in the cache, no new download is performed.

    Parameters
    ----------
    name: str or pathlib.Path
        the name of the file, relative to the data server url
    cache_name: str
        What name to use for the cache directory
    env_prefix: str
        Prefix for the environment variables used for overriding the URL,
        and providing username and password in case authentication is required.
    auth: True, None or tuple of (username, password)
        Authentication data for the request. Will be passed to ``requests.get``.
        If ``True``, read username and password for the request from
        the env variables ``env_prefix + 'USER'`` and ``env_prefix + 'PASSWORD'``
    default_url: str
        The default url from which to download ``name``, can be overridden
        by setting the env variable ``env_prefix + 'URL'``
    progress: bool
        If true, show a progress bar while downloading.

    Returns
    -------
    path: pathlib.Path
        the full path to the downloaded data.
    """
    # normalize the slashes so that exactly one separates base url and name
    base_url = os.environ.get(env_prefix + "URL", default_url).rstrip("/")
    url = base_url + "/" + str(name).lstrip("/")

    # the cache is only checked inside download_cached, so at this point
    # it is not known whether a download is needed
    log.debug(f"Requesting file {name} from {url}, using the cache if available.")

    return download_cached(
        url, cache_name=cache_name, auth=auth, env_prefix=env_prefix, progress=progress
    )