"""Download code from OSF storage."""

import base64
import collections.abc
import contextlib
import fnmatch
import glob
import hashlib
import json
import numbers
import os
import pickle
import re
import shutil
import sys
import tarfile
import time
import urllib.error
import urllib.parse
import urllib.request
import warnings
import zipfile
from io import BytesIO

import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.utils import Bunch, deprecated


def _fetch_file(url, data_dir, resume=True, overwrite=False,
md5sum=None, username=None, password=None, handlers=None,
verbose=1):
"""Load requested file, downloading it if needed or requested.
Parameters
----------
url: string
Contains the url of the file to be downloaded.
data_dir: string
Path of the data directory. Used for data storage in the specified
location.
resume: bool, optional
If true, try to resume partially downloaded files
overwrite: bool, optional
If true and file already exists, delete it.
md5sum: string, optional
MD5 sum of the file. Checked if download of the file is required
username: string, optional
Username used for basic HTTP authentication
password: string, optional
Password used for basic HTTP authentication
handlers: list of BaseHandler, optional
urllib handlers passed to urllib.request.build_opener. Used by
advanced users to customize request handling.
verbose: int, optional
verbosity level (0 means no message).
Returns
-------
files: string
Absolute path of downloaded file.
Notes
-----
If, for any reason, the download procedure fails, all downloaded files are
removed.
"""
handlers = handlers if handlers else []
# Determine data path
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Determine filename using URL
parse = urllib.parse.urlparse(url)
file_name = os.path.basename(parse.path)
if file_name == '':
file_name = md5_hash(parse.path)
temp_file_name = file_name + ".part"
full_name = os.path.join(data_dir, file_name)
temp_full_name = os.path.join(data_dir, temp_file_name)
if os.path.exists(full_name):
if overwrite:
os.remove(full_name)
else:
return full_name
if os.path.exists(temp_full_name):
if overwrite:
os.remove(temp_full_name)
t0 = time.time()
local_file = None
initial_size = 0
try:
# Download data
url_opener = urllib.request.build_opener(*handlers)
request = urllib.request.Request(url)
request.add_header('Connection', 'Keep-Alive')
if username is not None and password is not None:
if not url.startswith('https'):
                raise ValueError(
                    'Authentication was requested on a non-secured URL (%s). '
                    'Request has been blocked for security reasons.' % url)
            # Note: HTTPBasicAuthHandler is not used here because it relies
# on the fact that the server will return a 401 error with proper
# www-authentication header, which is not the case of most
# servers.
encoded_auth = base64.b64encode(
(username + ':' + password).encode())
request.add_header(b'Authorization', b'Basic ' + encoded_auth)
if verbose > 0:
displayed_url = url.split('?')[0] if verbose == 1 else url
print('Downloading data from %s ...' % displayed_url)
if resume and os.path.exists(temp_full_name):
# Download has been interrupted, we try to resume it.
local_file_size = os.path.getsize(temp_full_name)
# If the file exists, then only download the remainder
request.add_header("Range", "bytes=%s-" % (local_file_size))
try:
data = url_opener.open(request)
content_range = data.info().get('Content-Range')
if (content_range is None or not content_range.startswith(
'bytes %s-' % local_file_size)):
raise IOError('Server does not support resuming')
except Exception:
# A wide number of errors can be raised here. HTTPError,
# URLError... I prefer to catch them all and rerun without
# resuming.
if verbose > 0:
print('Resuming failed, try to download the whole file.')
return _fetch_file(
url, data_dir, resume=False, overwrite=overwrite,
md5sum=md5sum, username=username, password=password,
handlers=handlers, verbose=verbose)
local_file = open(temp_full_name, "ab")
initial_size = local_file_size
else:
data = url_opener.open(request)
local_file = open(temp_full_name, "wb")
_chunk_read_(data, local_file, report_hook=(verbose > 0),
initial_size=initial_size, verbose=verbose)
# temp file must be closed prior to the move
if not local_file.closed:
local_file.close()
shutil.move(temp_full_name, full_name)
dt = time.time() - t0
if verbose > 0:
# Complete the reporting hook
sys.stderr.write(' ...done. ({0:.0f} seconds, {1:.0f} min)\n'
.format(dt, dt // 60))
except (urllib.error.HTTPError, urllib.error.URLError):
sys.stderr.write("Error while fetching file %s; dataset "
"fetching aborted." % (file_name))
raise
finally:
if local_file is not None:
if not local_file.closed:
local_file.close()
if md5sum is not None:
        if _md5_sum_file(full_name) != md5sum:
            raise ValueError("File %s checksum verification has failed."
                             " Dataset fetching aborted." % full_name)
return full_name
def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None,
initial_size=0, total_size=None, verbose=1):
"""Download a file chunk by chunk and show advancement
Parameters
----------
response: urllib.response.addinfourl
Response to the download request in order to get file size
local_file: file
Hard disk file where data should be written
chunk_size: int, optional
Size of downloaded chunks. Default: 8192
report_hook: bool
Whether or not to show downloading advancement. Default: None
initial_size: int, optional
If resuming, indicate the initial size of the file
total_size: int, optional
Expected final size of download (None means it is unknown).
verbose: int, optional
verbosity level (0 means no message).
    Returns
    -------
    None
"""
try:
if total_size is None:
total_size = response.info().get('Content-Length').strip()
total_size = int(total_size) + initial_size
except Exception as e:
if verbose > 2:
print("Warning: total size could not be determined.")
if verbose > 3:
print("Full stack trace: %s" % e)
total_size = None
bytes_so_far = initial_size
t0 = time_last_display = time.time()
while True:
chunk = response.read(chunk_size)
bytes_so_far += len(chunk)
time_last_read = time.time()
if (report_hook and
# Refresh report every second or when download is
# finished.
(time_last_read > time_last_display + 1. or not chunk)):
_chunk_report_(bytes_so_far,
total_size, initial_size, t0)
time_last_display = time_last_read
if chunk:
local_file.write(chunk)
else:
break
return
def _get_dataset_descr(ds_name):
    """Return the description of a dataset, read from its .rst file."""
module_path = os.path.dirname(os.path.abspath(__file__))
fname = ds_name
try:
with open(os.path.join(module_path, 'description', fname + '.rst'),
'rb') as rst_file:
descr = rst_file.read()
except IOError:
descr = ''
if descr == '':
print("Warning: Could not find dataset description.")
    return descr
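

# `_fetch_files` below calls `_uncompress_file`, which is not defined in this
# file (it comes from the nilearn-style fetcher utilities this module is based
# on). The function below is a minimal sketch, assuming only zip, tar(.gz/.bz2)
# and plain gzip archives need to be handled; it is not the original
# implementation.
def _uncompress_file(file_, delete_archive=True, verbose=1):
    """Uncompress an archive in its own directory, optionally delete it."""
    if verbose > 0:
        print('Extracting data from %s...' % file_)
    data_dir = os.path.dirname(file_)
    if zipfile.is_zipfile(file_):
        with zipfile.ZipFile(file_, 'r') as zf:
            zf.extractall(path=data_dir)
    elif tarfile.is_tarfile(file_):
        with tarfile.open(file_, 'r') as tf:
            tf.extractall(path=data_dir)
    elif file_.endswith('.gz'):
        import gzip
        uncompressed = os.path.splitext(file_)[0]
        with gzip.open(file_, 'rb') as gz, open(uncompressed, 'wb') as out:
            shutil.copyfileobj(gz, out)
    else:
        raise IOError('Unknown archive format: %s' % file_)
    if delete_archive:
        os.remove(file_)
    if verbose > 0:
        print('   ...done.')
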
def _fetch_files(data_dir, files, resume=True, mock=False, verbose=1):
"""Load requested dataset, downloading it if needed or requested.

    This function retrieves files from the hard drive or downloads them from
    the given urls. Note to developers: all the files will first be downloaded
    in a sandbox and, if everything goes well, they will be moved into the
    folder of the dataset. This prevents corrupting previously downloaded
    data. In case of a big dataset, do not hesitate to make several calls if
    needed.

Parameters
----------
data_dir: string
Path of the data directory. Used for data storage in a specified
location.
    files: list of (string, string, dict)
        List of files and their corresponding url, with a dictionary that
        contains options regarding the files. E.g. (file_path, url, opt). If
        a file_path is not found in data_dir (i.e. at data_dir/file_path),
        the file is downloaded; if a requested file is still missing after
        the download step, the whole fetch is aborted and the downloaded
        files are deleted.
Options supported are:
* 'move' if renaming the file or moving it to a subfolder is needed
* 'uncompress' to indicate that the file is an archive
* 'md5sum' to check the md5 sum of the file
* 'overwrite' if the file should be re-downloaded even if it exists
resume: bool, optional
If true, try resuming download if possible
mock: boolean, optional
If true, create empty files if the file cannot be downloaded. Test use
only.
verbose: int, optional
verbosity level (0 means no message).
Returns
-------
files: list of string
Absolute paths of downloaded files on disk
"""
# There are two working directories here:
# - data_dir is the destination directory of the dataset
# - temp_dir is a temporary directory dedicated to this fetching call. All
# files that must be downloaded will be in this directory. If a corrupted
# file is found, or a file is missing, this working directory will be
# deleted.
files = list(files)
files_pickle = pickle.dumps([(file_, url) for file_, url, _ in files])
files_md5 = hashlib.md5(files_pickle).hexdigest()
temp_dir = os.path.join(data_dir, files_md5)
# Create destination dir
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Abortion flag, in case of error
abort = None
files_ = []
for file_, url, opts in files:
        # 3 possibilities:
        # - the file exists in data_dir, nothing to do.
        # - the file does not exist: we download it in temp_dir
        # - the file exists in temp_dir: this can happen if an archive has
        #   been downloaded. There is nothing to do
# Target file in the data_dir
target_file = os.path.join(data_dir, file_)
# Target file in temp dir
temp_target_file = os.path.join(temp_dir, file_)
# Whether to keep existing files
overwrite = opts.get('overwrite', False)
if (abort is None and (overwrite or (not os.path.exists(target_file) and not
os.path.exists(temp_target_file)))):
# We may be in a global read-only repository. If so, we cannot
# download files.
if not os.access(data_dir, os.W_OK):
raise ValueError('Dataset files are missing but dataset'
' repository is read-only. Contact your data'
' administrator to solve the problem')
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
md5sum = opts.get('md5sum', None)
dl_file = _fetch_file(url, temp_dir, resume=resume,
verbose=verbose, md5sum=md5sum,
username=opts.get('username', None),
password=opts.get('password', None),
handlers=opts.get('handlers', []),
overwrite=overwrite)
if 'move' in opts:
# XXX: here, move is supposed to be a dir, it can be a name
move = os.path.join(temp_dir, opts['move'])
move_dir = os.path.dirname(move)
if not os.path.exists(move_dir):
                    os.makedirs(move_dir, exist_ok=True)
shutil.move(dl_file, move)
dl_file = move
if 'uncompress' in opts:
try:
if not mock or os.path.getsize(dl_file) != 0:
_uncompress_file(dl_file, verbose=verbose)
else:
os.remove(dl_file)
except Exception as e:
abort = str(e)
if (abort is None and not os.path.exists(target_file) and not
os.path.exists(temp_target_file)):
if not mock:
                warnings.warn('An error occurred while fetching %s' % file_)
abort = ("Dataset has been downloaded but requested file was "
"not provided:\nURL: %s\n"
"Target file: %s\nDownloaded: %s" %
(url, target_file, dl_file))
else:
if not os.path.exists(os.path.dirname(temp_target_file)):
os.makedirs(os.path.dirname(temp_target_file))
open(temp_target_file, 'w').close()
if abort is not None:
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
raise IOError('Fetching aborted: ' + abort)
files_.append(target_file)
    # If needed, move files from the temp directory to the final directory.
    if os.path.exists(temp_dir):
        # XXX We could move only the requested files
        # XXX Movetree can go wrong
movetree(temp_dir, data_dir)
shutil.rmtree(temp_dir)
return files_
def movetree(src, dst):
"""Move an entire tree to another directory. Any existing file is
overwritten"""
names = os.listdir(src)
# Create destination dir if it does not exist
if not os.path.exists(dst):
os.makedirs(dst)
errors = []
for name in names:
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
try:
if os.path.isdir(srcname) and os.path.isdir(dstname):
movetree(srcname, dstname)
os.rmdir(srcname)
else:
shutil.move(srcname, dstname)
except (IOError, os.error) as why:
errors.append((srcname, dstname, str(why)))
# catch the Error from the recursive movetree so that we can
# continue with other files
except Exception as err:
errors.extend(err.args[0])
if errors:
raise Exception(errors)
def md5_hash(string):
    """Return the hexadecimal MD5 digest of a string."""
m = hashlib.md5()
m.update(string.encode('utf-8'))
return m.hexdigest()
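

# `_fetch_file` above verifies downloads against an optional `md5sum` using
# `_md5_sum_file`, which is not defined in this file. The helper below is a
# minimal sketch (chunked read, so large archives do not have to fit in
# memory); it is an assumption, not the original implementation.
def _md5_sum_file(path):
    """Compute the MD5 checksum of a file, reading it in 8 KiB chunks."""
    m = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            data = f.read(8192)
            if not data:
                break
            m.update(data)
    return m.hexdigest()
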
def _chunk_report_(bytes_so_far, total_size, initial_size, t0):
"""Show downloading percentage.
Parameters
----------
bytes_so_far: int
Number of downloaded bytes
    total_size: int
        Total size of the file (may be 0/None, depending on download method).
    initial_size: int
        If resuming, indicate the initial size of the file.
        If not resuming, set to zero.
    t0: int
        The time in seconds (as returned by time.time()) at which the
        download was resumed / started.
"""
if not total_size:
sys.stderr.write("\rDownloaded %d of ? bytes." % (bytes_so_far))
else:
# Estimate remaining download time
total_percent = float(bytes_so_far) / total_size
current_download_size = bytes_so_far - initial_size
bytes_remaining = total_size - bytes_so_far
dt = time.time() - t0
download_rate = current_download_size / max(1e-8, float(dt))
# Minimum rate of 0.01 bytes/s, to avoid dividing by zero.
time_remaining = bytes_remaining / max(0.01, download_rate)
# Trailing whitespace is to erase extra char when message length
# varies
sys.stderr.write(
"\rDownloaded %d of %d bytes (%.1f%%, %s remaining)"
% (bytes_so_far, total_size, total_percent * 100,
_format_time(time_remaining)))
def _format_time(t):
if t > 60:
return "%4.1fmin" % (t / 60.)
else:
return " %5.1fs" % (t)
if __name__ == "__main__":
# Build data URLs that will be fetched
files = {}
# Download from the relevant OSF project, using hashes generated
# from the OSF API. Note the trailing slash. For more info, see:
# https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74
n_sites = 399
site_mask = np.arange(1, n_sites + 1)
print("site_mask : {}".format(site_mask))
site_id_max = "Site%02d" % n_sites
print("site_id_max : {}".format(site_id_max))
data_dir = '/home/brain/test_download'
index_path = '/home/brain/audio-tagging-silent-cities/index.json'
with open(index_path, "rt") as of:
index = json.load(of)
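    # Assumed structure of index.json (inferred from the loop further down):
    # each zero-padded site id maps to a list of [relative_file_path, osf_hash]
    # pairs, e.g.
    #     {"0048": [["0048/recording_001.zip", "abcde"], ...], ...}
    # The hash is substituted into root_url to build the file's download URL.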
n_sites = len(site_mask)
    # Zero-pad each site number to four digits (e.g. 1 -> '0001').
    site_ids = ['{:04d}'.format(i) for i in site_mask]
print("site_ids : {}".format(site_ids))
datatype = "zip"
#Could be used for site locations.
files = {}
filenames = []
root_url = "https://osf.io/download{0}/"
site_ids = ['0048']
data_type = "zip"
for site_id in site_ids:
sitelist = index[str(site_id)]
len_site = len(sitelist)
for i in range(len_site):
file_path = sitelist[i][0]
opts = {"move": file_path}
file_url = sitelist[i][1]
file_url = root_url.format(file_url)
filenames.append((file_path, file_url, opts))
files.setdefault(data_type, []).append(file_path)
print("files : {}".format(files))
print("filenames : {}".format(filenames))
_fetch_files(data_dir, filenames, verbose=0)
for key, value in files.items():
files[key] = [os.path.join(data_dir, val) for val in value]