"""Download code from OSF storage."""

import base64
import collections.abc
import contextlib
import fnmatch
import glob
import hashlib
import json
import numbers
import os
import pickle
import re
import shutil
import sys
import tarfile
import time
import urllib.error
import urllib.parse
import urllib.request
import warnings
import zipfile
from io import BytesIO

import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.utils import Bunch, deprecated


def _fetch_file(url, data_dir, resume=True, overwrite=False,
md5sum=None, username=None, password=None, handlers=None,
verbose=1):
"""Load requested file, downloading it if needed or requested.
Parameters
----------
url: string
Contains the url of the file to be downloaded.
data_dir: string
Path of the data directory. Used for data storage in the specified
location.
resume: bool, optional
If true, try to resume partially downloaded files
overwrite: bool, optional
If true and file already exists, delete it.
md5sum: string, optional
MD5 sum of the file. Checked if download of the file is required
username: string, optional
Username used for basic HTTP authentication
password: string, optional
Password used for basic HTTP authentication
handlers: list of BaseHandler, optional
urllib handlers passed to urllib.request.build_opener. Used by
advanced users to customize request handling.
verbose: int, optional
verbosity level (0 means no message).
Returns
-------
files: string
Absolute path of downloaded file.
Notes
-----
If, for any reason, the download procedure fails, all downloaded files are
removed.
"""
handlers = handlers if handlers else []
# Determine data path
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Determine filename using URL
parse = urllib.parse.urlparse(url)
file_name = os.path.basename(parse.path)
if file_name == '':
file_name = md5_hash(parse.path)
temp_file_name = file_name + ".part"
full_name = os.path.join(data_dir, file_name)
temp_full_name = os.path.join(data_dir, temp_file_name)
if os.path.exists(full_name):
if overwrite:
os.remove(full_name)
else:
return full_name
if os.path.exists(temp_full_name):
if overwrite:
os.remove(temp_full_name)
t0 = time.time()
local_file = None
initial_size = 0
try:
# Download data
url_opener = urllib.request.build_opener(*handlers)
request = urllib.request.Request(url)
request.add_header('Connection', 'Keep-Alive')
if username is not None and password is not None:
if not url.startswith('https'):
                raise ValueError(
                    'Authentication was requested on a non-secured URL (%s). '
                    'Request has been blocked for security reasons.' % url)
            # Note: HTTPBasicAuthHandler is not used here because it relies
# on the fact that the server will return a 401 error with proper
# www-authentication header, which is not the case of most
# servers.
encoded_auth = base64.b64encode(
(username + ':' + password).encode())
request.add_header(b'Authorization', b'Basic ' + encoded_auth)
if verbose > 0:
displayed_url = url.split('?')[0] if verbose == 1 else url
print('Downloading data from %s ...' % displayed_url)
if resume and os.path.exists(temp_full_name):
# Download has been interrupted, we try to resume it.
local_file_size = os.path.getsize(temp_full_name)
# If the file exists, then only download the remainder
request.add_header("Range", "bytes=%s-" % (local_file_size))
try:
data = url_opener.open(request)
content_range = data.info().get('Content-Range')
if (content_range is None or not content_range.startswith(
'bytes %s-' % local_file_size)):
raise IOError('Server does not support resuming')
except Exception:
# A wide number of errors can be raised here. HTTPError,
# URLError... I prefer to catch them all and rerun without
# resuming.
if verbose > 0:
print('Resuming failed, try to download the whole file.')
return _fetch_file(
url, data_dir, resume=False, overwrite=overwrite,
md5sum=md5sum, username=username, password=password,
handlers=handlers, verbose=verbose)
local_file = open(temp_full_name, "ab")
initial_size = local_file_size
else:
data = url_opener.open(request)
local_file = open(temp_full_name, "wb")
_chunk_read_(data, local_file, report_hook=(verbose > 0),
initial_size=initial_size, verbose=verbose)
# temp file must be closed prior to the move
if not local_file.closed:
local_file.close()
shutil.move(temp_full_name, full_name)
dt = time.time() - t0
if verbose > 0:
# Complete the reporting hook
sys.stderr.write(' ...done. ({0:.0f} seconds, {1:.0f} min)\n'
.format(dt, dt // 60))
except (urllib.error.HTTPError, urllib.error.URLError):
sys.stderr.write("Error while fetching file %s; dataset "
"fetching aborted." % (file_name))
raise
finally:
if local_file is not None:
if not local_file.closed:
local_file.close()
if md5sum is not None:
        if _md5_sum_file(full_name) != md5sum:
            raise ValueError("File %s checksum verification has failed."
                             " Dataset fetching aborted." % full_name)
return full_name
def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None,
initial_size=0, total_size=None, verbose=1):
"""Download a file chunk by chunk and show advancement
Parameters
----------
response: urllib.response.addinfourl
Response to the download request in order to get file size
local_file: file
Hard disk file where data should be written
chunk_size: int, optional
Size of downloaded chunks. Default: 8192
report_hook: bool
Whether or not to show downloading advancement. Default: None
initial_size: int, optional
If resuming, indicate the initial size of the file
total_size: int, optional
Expected final size of download (None means it is unknown).
verbose: int, optional
verbosity level (0 means no message).
    Returns
    -------
    None
"""
try:
if total_size is None:
total_size = response.info().get('Content-Length').strip()
total_size = int(total_size) + initial_size
except Exception as e:
if verbose > 2:
print("Warning: total size could not be determined.")
if verbose > 3:
print("Full stack trace: %s" % e)
total_size = None
bytes_so_far = initial_size
t0 = time_last_display = time.time()
while True:
chunk = response.read(chunk_size)
bytes_so_far += len(chunk)
time_last_read = time.time()
if (report_hook and
# Refresh report every second or when download is
# finished.
(time_last_read > time_last_display + 1. or not chunk)):
_chunk_report_(bytes_so_far,
total_size, initial_size, t0)
time_last_display = time_last_read
if chunk:
local_file.write(chunk)
else:
break
return
def _get_dataset_descr(ds_name):
    """Return the description of a dataset, read from its .rst file."""
module_path = os.path.dirname(os.path.abspath(__file__))
fname = ds_name
try:
with open(os.path.join(module_path, 'description', fname + '.rst'),
'rb') as rst_file:
descr = rst_file.read()
except IOError:
descr = ''
if descr == '':
print("Warning: Could not find dataset description.")
    return descr
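

# `_fetch_files` below calls `_uncompress_file`, which is not defined in this
# file (it comes from the nilearn-style fetcher utilities this module is based
# on). The function below is a minimal sketch, assuming only zip, tar(.gz/.bz2)
# and plain gzip archives need to be handled; it is not the original
# implementation.
def _uncompress_file(file_, delete_archive=True, verbose=1):
    """Uncompress an archive in its own directory, optionally delete it."""
    if verbose > 0:
        print('Extracting data from %s...' % file_)
    data_dir = os.path.dirname(file_)
    if zipfile.is_zipfile(file_):
        with zipfile.ZipFile(file_, 'r') as zf:
            zf.extractall(path=data_dir)
    elif tarfile.is_tarfile(file_):
        with tarfile.open(file_, 'r') as tf:
            tf.extractall(path=data_dir)
    elif file_.endswith('.gz'):
        import gzip
        uncompressed = os.path.splitext(file_)[0]
        with gzip.open(file_, 'rb') as gz, open(uncompressed, 'wb') as out:
            shutil.copyfileobj(gz, out)
    else:
        raise IOError('Unknown archive format: %s' % file_)
    if delete_archive:
        os.remove(file_)
    if verbose > 0:
        print('   ...done.')
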
def _fetch_files(data_dir, files, resume=True, mock=False, verbose=1):
"""Load requested dataset, downloading it if needed or requested.

    This function retrieves files from the hard drive or downloads them from
    the given urls. Note to developers: all the files will first be downloaded
    in a sandbox and, if everything goes well, they will be moved into the
    folder of the dataset. This prevents corrupting previously downloaded
    data. In case of a big dataset, do not hesitate to make several calls if
    needed.

Parameters
----------
data_dir: string
Path of the data directory. Used for data storage in a specified
location.
    files: list of (string, string, dict)
        List of files and their corresponding url, with a dictionary that
        contains options regarding the files. E.g. (file_path, url, opt). If
        a file_path is not found in data_dir (i.e. at data_dir/file_path),
        the file is downloaded; if a requested file is still missing after
        the download step, the whole fetch is aborted and the downloaded
        files are deleted.
Options supported are:
* 'move' if renaming the file or moving it to a subfolder is needed
* 'uncompress' to indicate that the file is an archive
* 'md5sum' to check the md5 sum of the file
* 'overwrite' if the file should be re-downloaded even if it exists
resume: bool, optional
If true, try resuming download if possible
mock: boolean, optional
If true, create empty files if the file cannot be downloaded. Test use
only.
verbose: int, optional
verbosity level (0 means no message).
Returns
-------
files: list of string
Absolute paths of downloaded files on disk
"""
# There are two working directories here:
# - data_dir is the destination directory of the dataset
# - temp_dir is a temporary directory dedicated to this fetching call. All
# files that must be downloaded will be in this directory. If a corrupted
# file is found, or a file is missing, this working directory will be
# deleted.
files = list(files)
files_pickle = pickle.dumps([(file_, url) for file_, url, _ in files])
files_md5 = hashlib.md5(files_pickle).hexdigest()
temp_dir = os.path.join(data_dir, files_md5)
# Create destination dir
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Abortion flag, in case of error
abort = None
files_ = []
for file_, url, opts in files:
        # 3 possibilities:
        # - the file exists in data_dir, nothing to do.
        # - the file does not exist: we download it in temp_dir
        # - the file exists in temp_dir: this can happen if an archive has
        #   been downloaded. There is nothing to do
# Target file in the data_dir
target_file = os.path.join(data_dir, file_)
# Target file in temp dir
temp_target_file = os.path.join(temp_dir, file_)
# Whether to keep existing files
overwrite = opts.get('overwrite', False)
if (abort is None and (overwrite or (not os.path.exists(target_file) and not
os.path.exists(temp_target_file)))):
# We may be in a global read-only repository. If so, we cannot
# download files.
if not os.access(data_dir, os.W_OK):
raise ValueError('Dataset files are missing but dataset'
' repository is read-only. Contact your data'
' administrator to solve the problem')
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
md5sum = opts.get('md5sum', None)
dl_file = _fetch_file(url, temp_dir, resume=resume,
verbose=verbose, md5sum=md5sum,
username=opts.get('username', None),
password=opts.get('password', None),
handlers=opts.get('handlers', []),
overwrite=overwrite)
if 'move' in opts:
# XXX: here, move is supposed to be a dir, it can be a name
move = os.path.join(temp_dir, opts['move'])
move_dir = os.path.dirname(move)
if not os.path.exists(move_dir):
                    os.makedirs(move_dir, exist_ok=True)
shutil.move(dl_file, move)
dl_file = move
if 'uncompress' in opts:
try:
if not mock or os.path.getsize(dl_file) != 0:
_uncompress_file(dl_file, verbose=verbose)
else:
os.remove(dl_file)
except Exception as e:
abort = str(e)
if (abort is None and not os.path.exists(target_file) and not
os.path.exists(temp_target_file)):
if not mock:
                warnings.warn('An error occurred while fetching %s' % file_)
abort = ("Dataset has been downloaded but requested file was "
"not provided:\nURL: %s\n"
"Target file: %s\nDownloaded: %s" %
(url, target_file, dl_file))
else:
if not os.path.exists(os.path.dirname(temp_target_file)):
os.makedirs(os.path.dirname(temp_target_file))
open(temp_target_file, 'w').close()
if abort is not None:
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
raise IOError('Fetching aborted: ' + abort)
files_.append(target_file)
    # If needed, move files from the temp directory to the final directory.
    if os.path.exists(temp_dir):
        # XXX We could move only the requested files
        # XXX Movetree can go wrong
movetree(temp_dir, data_dir)
shutil.rmtree(temp_dir)
return files_
def movetree(src, dst):
"""Move an entire tree to another directory. Any existing file is
overwritten"""
names = os.listdir(src)
# Create destination dir if it does not exist
if not os.path.exists(dst):
os.makedirs(dst)
errors = []
for name in names:
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
try:
if os.path.isdir(srcname) and os.path.isdir(dstname):
movetree(srcname, dstname)
os.rmdir(srcname)
else:
shutil.move(srcname, dstname)
except (IOError, os.error) as why:
errors.append((srcname, dstname, str(why)))
# catch the Error from the recursive movetree so that we can
# continue with other files
except Exception as err:
errors.extend(err.args[0])
if errors:
raise Exception(errors)
def md5_hash(string):
    """Return the hexadecimal MD5 digest of a string."""
m = hashlib.md5()
m.update(string.encode('utf-8'))
return m.hexdigest()
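

# `_fetch_file` above verifies downloads against an optional `md5sum` using
# `_md5_sum_file`, which is not defined in this file. The helper below is a
# minimal sketch (chunked read, so large archives do not have to fit in
# memory); it is an assumption, not the original implementation.
def _md5_sum_file(path):
    """Compute the MD5 checksum of a file, reading it in 8 KiB chunks."""
    m = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            data = f.read(8192)
            if not data:
                break
            m.update(data)
    return m.hexdigest()
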
def _chunk_report_(bytes_so_far, total_size, initial_size, t0):
"""Show downloading percentage.
Parameters
----------
bytes_so_far: int
Number of downloaded bytes
    total_size: int
        Total size of the file (may be 0/None, depending on download method).
    initial_size: int
        If resuming, indicate the initial size of the file.
        If not resuming, set to zero.
    t0: int
        The time in seconds (as returned by time.time()) at which the
        download was resumed / started.
"""
if not total_size:
sys.stderr.write("\rDownloaded %d of ? bytes." % (bytes_so_far))
else:
# Estimate remaining download time
total_percent = float(bytes_so_far) / total_size
current_download_size = bytes_so_far - initial_size
bytes_remaining = total_size - bytes_so_far
dt = time.time() - t0
download_rate = current_download_size / max(1e-8, float(dt))
# Minimum rate of 0.01 bytes/s, to avoid dividing by zero.
time_remaining = bytes_remaining / max(0.01, download_rate)
# Trailing whitespace is to erase extra char when message length
# varies
sys.stderr.write(
"\rDownloaded %d of %d bytes (%.1f%%, %s remaining)"
% (bytes_so_far, total_size, total_percent * 100,
_format_time(time_remaining)))
def _format_time(t):
if t > 60:
return "%4.1fmin" % (t / 60.)
else:
return " %5.1fs" % (t)
if __name__ == "__main__":
# Build data URLs that will be fetched
files = {}
# Download from the relevant OSF project, using hashes generated
# from the OSF API. Note the trailing slash. For more info, see:
# https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74
n_sites = 399
site_mask = np.arange(1, n_sites + 1)
print("site_mask : {}".format(site_mask))
site_id_max = "Site%02d" % n_sites
print("site_id_max : {}".format(site_id_max))
data_dir = '/home/brain/test_download'
index_path = '/home/brain/audio-tagging-silent-cities/index.json'
with open(index_path, "rt") as of:
index = json.load(of)
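    # Assumed structure of index.json (inferred from the loop further down):
    # each zero-padded site id maps to a list of [relative_file_path, osf_hash]
    # pairs, e.g.
    #     {"0048": [["0048/recording_001.zip", "abcde"], ...], ...}
    # The hash is substituted into root_url to build the file's download URL.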
n_sites = len(site_mask)
    # Zero-pad each site number to four digits (e.g. 1 -> '0001').
    site_ids = ['{:04d}'.format(i) for i in site_mask]
print("site_ids : {}".format(site_ids))
datatype = "zip"
#Could be used for site locations.
files = {}
filenames = []
root_url = "https://osf.io/download{0}/"
site_ids = ['0048']
data_type = "zip"
for site_id in site_ids:
sitelist = index[str(site_id)]
len_site = len(sitelist)
for i in range(len_site):
file_path = sitelist[i][0]
opts = {"move": file_path}
file_url = sitelist[i][1]
file_url = root_url.format(file_url)
filenames.append((file_path, file_url, opts))
files.setdefault(data_type, []).append(file_path)
print("files : {}".format(files))
print("filenames : {}".format(filenames))
_fetch_files(data_dir, filenames, verbose=0)
for key, value in files.items():
files[key] = [os.path.join(data_dir, val) for val in value]