diff --git a/download.py b/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..5546f5be21f96742a259b2dac52f31da44bb1c2d
--- /dev/null
+++ b/download.py
@@ -0,0 +1,525 @@
+import base64
+import fnmatch
+import glob
+import warnings
+import os
+import re
+import json
+
+import hashlib
+import numpy as np
+import numbers
+import pickle
+import urllib.error
+import urllib.parse
+import urllib.request
+import time
+import sys
+import zipfile
+import tarfile
+import shutil
+import collections.abc
+import contextlib
+
+
+from io import BytesIO
+
+
+import pandas as pd
+from scipy.io import loadmat
+from scipy.io.matlab.miobase import MatReadError
+from sklearn.utils import Bunch, deprecated
+
+
+def _fetch_file(url, data_dir, resume=True, overwrite=False,
+                md5sum=None, username=None, password=None, handlers=None,
+                verbose=1):
+    """Load requested file, downloading it if needed or requested.
+
+    Parameters
+    ----------
+    url: string
+        Contains the url of the file to be downloaded.
+    data_dir: string
+        Path of the data directory. Used for data storage in the specified
+        location.
+    resume: bool, optional
+        If true, try to resume partially downloaded files.
+    overwrite: bool, optional
+        If true and the file already exists, delete it.
+    md5sum: string, optional
+        MD5 sum of the file. Checked if download of the file is required.
+    username: string, optional
+        Username used for basic HTTP authentication.
+    password: string, optional
+        Password used for basic HTTP authentication.
+    handlers: list of BaseHandler, optional
+        urllib handlers passed to urllib.request.build_opener. Used by
+        advanced users to customize request handling.
+    verbose: int, optional
+        Verbosity level (0 means no message).
+
+    Returns
+    -------
+    files: string
+        Absolute path of the downloaded file.
+
+    Notes
+    -----
+    If, for any reason, the download procedure fails, all downloaded files are
+    removed.
+    """
+    handlers = handlers if handlers else []
+    # Determine data path
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+
+    # Determine filename using URL
+    parse = urllib.parse.urlparse(url)
+    file_name = os.path.basename(parse.path)
+    if file_name == '':
+        file_name = md5_hash(parse.path)
+
+    temp_file_name = file_name + ".part"
+    full_name = os.path.join(data_dir, file_name)
+    temp_full_name = os.path.join(data_dir, temp_file_name)
+    if os.path.exists(full_name):
+        if overwrite:
+            os.remove(full_name)
+        else:
+            return full_name
+    if os.path.exists(temp_full_name):
+        if overwrite:
+            os.remove(temp_full_name)
+    t0 = time.time()
+    local_file = None
+    initial_size = 0
+
+    try:
+        # Download data
+        url_opener = urllib.request.build_opener(*handlers)
+        request = urllib.request.Request(url)
+        request.add_header('Connection', 'Keep-Alive')
+        if username is not None and password is not None:
+            if not url.startswith('https'):
+                raise ValueError(
+                    'Authentication was requested on a non secured URL (%s). '
+                    'Request has been blocked for security reasons.' % url)
+            # Note: HTTPBasicAuthHandler is not fitted here because it relies
+            # on the fact that the server will return a 401 error with proper
+            # www-authentication header, which is not the case of most
+            # servers.
+            encoded_auth = base64.b64encode(
+                (username + ':' + password).encode())
+            request.add_header(b'Authorization', b'Basic ' + encoded_auth)
+        if verbose > 0:
+            displayed_url = url.split('?')[0] if verbose == 1 else url
+            print('Downloading data from %s ...' % displayed_url)
+        if resume and os.path.exists(temp_full_name):
+            # Download has been interrupted, we try to resume it.
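+            # To resume, request the remaining bytes with an HTTP Range
+            # header and check that the server answers with a matching
+            # Content-Range; otherwise fall back to a full re-download.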
+            local_file_size = os.path.getsize(temp_full_name)
+            # If the file exists, then only download the remainder
+            request.add_header("Range", "bytes=%s-" % (local_file_size))
+            try:
+                data = url_opener.open(request)
+                content_range = data.info().get('Content-Range')
+                if (content_range is None or not content_range.startswith(
+                        'bytes %s-' % local_file_size)):
+                    raise IOError('Server does not support resuming')
+            except Exception:
+                # A wide number of errors can be raised here. HTTPError,
+                # URLError... I prefer to catch them all and rerun without
+                # resuming.
+                if verbose > 0:
+                    print('Resuming failed, try to download the whole file.')
+                return _fetch_file(
+                    url, data_dir, resume=False, overwrite=overwrite,
+                    md5sum=md5sum, username=username, password=password,
+                    handlers=handlers, verbose=verbose)
+            local_file = open(temp_full_name, "ab")
+            initial_size = local_file_size
+        else:
+            data = url_opener.open(request)
+            local_file = open(temp_full_name, "wb")
+        _chunk_read_(data, local_file, report_hook=(verbose > 0),
+                     initial_size=initial_size, verbose=verbose)
+        # temp file must be closed prior to the move
+        if not local_file.closed:
+            local_file.close()
+        shutil.move(temp_full_name, full_name)
+        dt = time.time() - t0
+        if verbose > 0:
+            # Complete the reporting hook
+            sys.stderr.write(' ...done. ({0:.0f} seconds, {1:.0f} min)\n'
+                             .format(dt, dt // 60))
+    except (urllib.error.HTTPError, urllib.error.URLError):
+        sys.stderr.write("Error while fetching file %s; dataset "
+                         "fetching aborted." % (file_name))
+        raise
+    finally:
+        if local_file is not None:
+            if not local_file.closed:
+                local_file.close()
+    if md5sum is not None:
+        if (_md5_sum_file(full_name) != md5sum):
+            raise ValueError("File %s checksum verification has failed."
+                             " Dataset fetching aborted." % full_name)
+    return full_name
+
+
+def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None,
+                 initial_size=0, total_size=None, verbose=1):
+    """Download a file chunk by chunk and show advancement.
+
+    Parameters
+    ----------
+    response: urllib.response.addinfourl
+        Response to the download request in order to get file size.
+    local_file: file
+        Hard disk file where data should be written.
+    chunk_size: int, optional
+        Size of downloaded chunks. Default: 8192.
+    report_hook: bool
+        Whether or not to show downloading advancement. Default: None.
+    initial_size: int, optional
+        If resuming, indicate the initial size of the file.
+    total_size: int, optional
+        Expected final size of download (None means it is unknown).
+    verbose: int, optional
+        Verbosity level (0 means no message).
+
+    Returns
+    -------
+    None
+    """
+    try:
+        if total_size is None:
+            total_size = response.info().get('Content-Length').strip()
+        total_size = int(total_size) + initial_size
+    except Exception as e:
+        if verbose > 2:
+            print("Warning: total size could not be determined.")
+            if verbose > 3:
+                print("Full stack trace: %s" % e)
+        total_size = None
+    bytes_so_far = initial_size
+
+    t0 = time_last_display = time.time()
+    while True:
+        chunk = response.read(chunk_size)
+        bytes_so_far += len(chunk)
+        time_last_read = time.time()
+        if (report_hook and
+                # Refresh report every second or when download is
+                # finished.
+                (time_last_read > time_last_display + 1.
+                 or not chunk)):
+            _chunk_report_(bytes_so_far,
+                           total_size, initial_size, t0)
+            time_last_display = time_last_read
+        if chunk:
+            local_file.write(chunk)
+        else:
+            break
+
+    return
+
+
+def _get_dataset_descr(ds_name):
+    module_path = os.path.dirname(os.path.abspath(__file__))
+
+    fname = ds_name
+
+    try:
+        with open(os.path.join(module_path, 'description', fname + '.rst'),
+                  'rb') as rst_file:
+            descr = rst_file.read()
+    except IOError:
+        descr = ''
+
+    if not descr:
+        print("Warning: Could not find dataset description.")
+
+    return descr
+
+
+def _fetch_files(data_dir, files, resume=True, mock=False, verbose=1):
+    """Load requested dataset, downloading it if needed or requested.
+
+    This function retrieves files from the hard drive or downloads them from
+    the given urls. Note to developers: all the files will first be
+    downloaded in a sandbox and, if everything goes well, they will be moved
+    into the folder of the dataset. This prevents corrupting previously
+    downloaded data. In case of a big dataset, do not hesitate to make several
+    calls if needed.
+
+    Parameters
+    ----------
+    data_dir: string
+        Path of the data directory. Used for data storage in a specified
+        location.
+    files: list of (string, string, dict)
+        List of files and their corresponding url, with a dictionary that
+        contains options regarding the files. E.g. (file_path, url, opts).
+        If a file_path is not found in data_dir (as data_dir/file_path), the
+        download will be immediately cancelled and any downloaded files will
+        be deleted. Options supported are:
+        * 'move' if renaming the file or moving it to a subfolder is needed
+        * 'uncompress' to indicate that the file is an archive
+        * 'md5sum' to check the md5 sum of the file
+        * 'overwrite' if the file should be re-downloaded even if it exists
+    resume: bool, optional
+        If true, try resuming download if possible.
+    mock: boolean, optional
+        If true, create empty files if the file cannot be downloaded. Test use
+        only.
+    verbose: int, optional
+        Verbosity level (0 means no message).
+
+    Returns
+    -------
+    files: list of string
+        Absolute paths of downloaded files on disk.
+    """
+    # There are two working directories here:
+    # - data_dir is the destination directory of the dataset
+    # - temp_dir is a temporary directory dedicated to this fetching call. All
+    #   files that must be downloaded will be in this directory. If a
+    #   corrupted file is found, or a file is missing, this working directory
+    #   will be deleted.
+    files = list(files)
+    files_pickle = pickle.dumps([(file_, url) for file_, url, _ in files])
+    files_md5 = hashlib.md5(files_pickle).hexdigest()
+    temp_dir = os.path.join(data_dir, files_md5)
+
+    # Create destination dir
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+
+    # Abortion flag, in case of error
+    abort = None
+
+    files_ = []
+    for file_, url, opts in files:
+        # 3 possibilities:
+        # - the file exists in data_dir, nothing to do.
+        # - the file does not exist: we download it in temp_dir
+        # - the file exists in temp_dir: this can happen if an archive has
+        #   been downloaded. There is nothing to do.
+
+        # Target file in the data_dir
+        target_file = os.path.join(data_dir, file_)
+        # Target file in temp dir
+        temp_target_file = os.path.join(temp_dir, file_)
+        # Whether to keep existing files
+        overwrite = opts.get('overwrite', False)
+        if (abort is None and (overwrite or
+                               (not os.path.exists(target_file) and
+                                not os.path.exists(temp_target_file)))):
+
+            # We may be in a global read-only repository. If so, we cannot
+            # download files.
+            if not os.access(data_dir, os.W_OK):
+                raise ValueError('Dataset files are missing but dataset'
+                                 ' repository is read-only. Contact your data'
+                                 ' administrator to solve the problem')
+
+            if not os.path.exists(temp_dir):
+                os.mkdir(temp_dir)
+            md5sum = opts.get('md5sum', None)
+
+            dl_file = _fetch_file(url, temp_dir, resume=resume,
+                                  verbose=verbose, md5sum=md5sum,
+                                  username=opts.get('username', None),
+                                  password=opts.get('password', None),
+                                  handlers=opts.get('handlers', []),
+                                  overwrite=overwrite)
+            if 'move' in opts:
+                # XXX: here, move is supposed to be a dir, it can be a name
+                move = os.path.join(temp_dir, opts['move'])
+                move_dir = os.path.dirname(move)
+                if not os.path.exists(move_dir):
+                    os.makedirs(move_dir, exist_ok=True)
+                shutil.move(dl_file, move)
+                dl_file = move
+            if 'uncompress' in opts:
+                try:
+                    if not mock or os.path.getsize(dl_file) != 0:
+                        _uncompress_file(dl_file, verbose=verbose)
+                    else:
+                        os.remove(dl_file)
+                except Exception as e:
+                    abort = str(e)
+
+        if (abort is None and not os.path.exists(target_file) and not
+                os.path.exists(temp_target_file)):
+            if not mock:
+                warnings.warn('An error occurred while fetching %s' % file_)
+                abort = ("Dataset has been downloaded but requested file was "
+                         "not provided:\nURL: %s\n"
+                         "Target file: %s\nDownloaded: %s" %
+                         (url, target_file, dl_file))
+            else:
+                if not os.path.exists(os.path.dirname(temp_target_file)):
+                    os.makedirs(os.path.dirname(temp_target_file))
+                open(temp_target_file, 'w').close()
+        if abort is not None:
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+            raise IOError('Fetching aborted: ' + abort)
+        files_.append(target_file)
+    # If needed, move files from the temp directory to the final directory.
+    if os.path.exists(temp_dir):
+        # XXX We could move only the requested files
+        # XXX Movetree can go wrong
+        movetree(temp_dir, data_dir)
+        shutil.rmtree(temp_dir)
+    return files_
+
+
+def movetree(src, dst):
+    """Move an entire tree to another directory. Any existing file is
+    overwritten."""
+    names = os.listdir(src)
+
+    # Create destination dir if it does not exist
+    if not os.path.exists(dst):
+        os.makedirs(dst)
+    errors = []
+
+    for name in names:
+        srcname = os.path.join(src, name)
+        dstname = os.path.join(dst, name)
+        try:
+            if os.path.isdir(srcname) and os.path.isdir(dstname):
+                movetree(srcname, dstname)
+                os.rmdir(srcname)
+            else:
+                shutil.move(srcname, dstname)
+        except (IOError, os.error) as why:
+            errors.append((srcname, dstname, str(why)))
+        # catch the Error from the recursive movetree so that we can
+        # continue with other files
+        except Exception as err:
+            errors.extend(err.args[0])
+    if errors:
+        raise Exception(errors)
+
+
+def md5_hash(string):
+    m = hashlib.md5()
+    m.update(string.encode('utf-8'))
+    return m.hexdigest()
+
+
+def _chunk_report_(bytes_so_far, total_size, initial_size, t0):
+    """Show downloading percentage.
+
+    Parameters
+    ----------
+    bytes_so_far: int
+        Number of downloaded bytes.
+    total_size: int
+        Total size of the file (may be 0/None, depending on download method).
+    t0: int
+        The time in seconds (as returned by time.time()) at which the
+        download was resumed / started.
+    initial_size: int
+        If resuming, indicate the initial size of the file.
+        If not resuming, set to zero.
+    """
+    if not total_size:
+        sys.stderr.write("\rDownloaded %d of ? bytes."
+                         % (bytes_so_far))
+
+    else:
+        # Estimate remaining download time
+        total_percent = float(bytes_so_far) / total_size
+
+        current_download_size = bytes_so_far - initial_size
+        bytes_remaining = total_size - bytes_so_far
+        dt = time.time() - t0
+        download_rate = current_download_size / max(1e-8, float(dt))
+        # Minimum rate of 0.01 bytes/s, to avoid dividing by zero.
+        time_remaining = bytes_remaining / max(0.01, download_rate)
+
+        # Trailing whitespace is to erase extra char when message length
+        # varies
+        sys.stderr.write(
+            "\rDownloaded %d of %d bytes (%.1f%%, %s remaining)"
+            % (bytes_so_far, total_size, total_percent * 100,
+               _format_time(time_remaining)))
+
+
+def _format_time(t):
+    if t > 60:
+        return "%4.1fmin" % (t / 60.)
+    else:
+        return " %5.1fs" % (t)
+
+
+if __name__ == "__main__":
+    # Build data URLs that will be fetched.
+    # Download from the relevant OSF project, using hashes generated
+    # from the OSF API. Note the trailing slash. For more info, see:
+    # https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74
+
+    n_sites = 399
+    site_mask = np.arange(1, n_sites + 1)
+    print("site_mask : {}".format(site_mask))
+    site_id_max = "Site%02d" % n_sites
+    print("site_id_max : {}".format(site_id_max))
+
+    data_dir = '/home/brain/test_download'
+    index_path = '/home/brain/audio-tagging-silent-cities/index.json'
+    with open(index_path, "rt") as of:
+        index = json.load(of)
+
+    # Zero-pad the site numbers to four digits, e.g. 1 -> '0001'.
+    site_ids = ["%04d" % i for i in site_mask]
+    print("site_ids : {}".format(site_ids))
+
+    # Could be used for site locations.
+    files = {}
+    filenames = []
+    root_url = "https://osf.io/download{0}/"
+    # Restrict this run to a single site.
+    site_ids = ['0048']
+    data_type = "zip"
+    for site_id in site_ids:
+        sitelist = index[str(site_id)]
+        len_site = len(sitelist)
+        for i in range(len_site):
+            file_path = sitelist[i][0]
+            opts = {"move": file_path}
+            file_url = sitelist[i][1]
+            file_url = root_url.format(file_url)
+
+            filenames.append((file_path, file_url, opts))
+            files.setdefault(data_type, []).append(file_path)
+
+    print("files : {}".format(files))
+    print("filenames : {}".format(filenames))
+
+    _fetch_files(data_dir, filenames, verbose=0)
+    for key, value in files.items():
+        files[key] = [os.path.join(data_dir, val) for val in value]
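
Note: _md5_sum_file (called by _fetch_file when md5sum is given) and _uncompress_file (called by _fetch_files when the 'uncompress' option is set) are referenced above but are not defined in this file; the __main__ block only uses the 'move' option, so neither code path is exercised here. A minimal sketch of what such helpers could look like, assuming MD5 checksums and zip/tar archives, with names and signatures inferred from the call sites above rather than taken from an existing library:

import hashlib
import os
import tarfile
import zipfile


def _md5_sum_file(path, chunk_size=8192):
    # Compute the MD5 checksum of a file, reading it chunk by chunk so that
    # large downloads do not have to fit in memory.
    m = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            m.update(chunk)
    return m.hexdigest()


def _uncompress_file(file_, verbose=1):
    # Extract a zip or tar(.gz) archive into its own directory, then remove
    # the archive so only the extracted members remain (assumed behaviour).
    data_dir = os.path.dirname(file_)
    if zipfile.is_zipfile(file_):
        with zipfile.ZipFile(file_, 'r') as archive:
            archive.extractall(data_dir)
    elif tarfile.is_tarfile(file_):
        with tarfile.open(file_, 'r') as archive:
            archive.extractall(data_dir)
    else:
        raise IOError('Uncompress: unknown archive format: %s' % file_)
    os.remove(file_)
    if verbose > 0:
        print('Extracted %s into %s' % (file_, data_dir))

With helpers like these in place, each (file_path, file_url, opts) tuple built in __main__ is downloaded into the per-call sandbox directory, moved to data_dir/file_path via the 'move' option, and _fetch_files returns the absolute paths of the files on disk.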