Background
For our research we currently need to download ~15,000 files. While the database has its own command-line tool that supports "bulk" downloading, running it sequentially for 15,000 runs is completely unfeasible (which is what the command-line tool currently does).
Simple math
I downloaded a couple of runs with the currently available command-line tool and took the average runtime: roughly 20 minutes per file (if not more). Doing this for all 15,000 files would therefore take 15,000 * 20 / 60 / 24 = ~208 days, which would only be nice if you got paid per hour of script runtime ;)
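For reference, the same back-of-the-envelope calculation as a quick Python check (the 20 minutes is my measured average, not an exact figure):

minutes_per_file = 20      # rough measured average per download
total_files = 15000
print(total_files * minutes_per_file / 60 / 24)   # ~208.3 days of sequential downloading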
Bulk command-line script
This is the sub-script that lets users download bulk data (not written by me):
Note:
I changed it slightly so that I can run it straight from my IDE (and don't have to start it from the command line for every little change); a sketch of the kind of command-line wrapper this replaces follows right after the script.
'''
Created on 27/10/2015
@author: Maxim Scheremetjew
amended 07/11/2016 by Maxim Scheremetjew
version: 1.1
'''
import sys
import argparse
import csv
import os
import urllib.request, urllib.parse, urllib.error
from urllib.error import URLError
from io import StringIO
def _download_resource_by_url(url, output_file_name):
"""Kicks off a download and stores the file at the given path.
Arguments:
'url' -- Resource location.
'output_file_name' -- Path of the output file.
"""
print("Starting the download of the following file...")
print(url)
print("Saving file in:\n" + output_file_name)
try:
urllib.request.urlretrieve(url, output_file_name)
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
print("Download finished.")
def _get_number_of_chunks(url_template, study_id, sample_id, run_id, version, domain, file_type):
"""
Returns the number of chunks for the given set of parameters (study, sample and run identifier).
"""
print("Getting the number of chunks from the following URL...")
url_get_number_of_chunks = url_template % (
study_id, sample_id, run_id, version, domain, file_type)
print(url_get_number_of_chunks)
try:
file_stream_handler = urllib.request.urlopen(url_get_number_of_chunks)
result = int(file_stream_handler.read())
print("Retrieved " + str(result) + " chunks.")
return result
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
except ValueError as e:
print(e)
print("Skipping this run! Could not retrieve the number of chunks for this URL. " \
"Check the version number in the URL and check if the run is available online.")
return 0
def _get_file_stream_handler(url_template, study_id):
"""
Returns a file stream handler for the given URL.
"""
print("Getting the list of project runs...")
url_get_project_runs = url_template % (study_id)
try:
req = urllib.request.Request(url=url_get_project_runs, headers={'Content-Type': 'text/plain'})
res = urllib.request.urlopen(req)
dec_res = res.read().decode()
sys.stderr.write(str(dec_res))
return dec_res
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
except ValueError as e:
print(e)
print("Could not retrieve any runs. Open the retrieval URL further down in your browser and see if you get any results back. Program will exit now.")
print(url_get_project_runs)
raise
def _print_program_settings(project_id, version, selected_file_types_list, output_path, root_url):
print("Running the program with the following setting...")
print("Project: " + project_id)
print("Pipeline version: " + version)
print("Selected file types: " + ",".join(selected_file_types_list))
print("Root URL: " + root_url)
print("Writing result to: " + output_path)
def start(args):
    function_file_type_list = ["InterProScan", "GOAnnotations", "GOSlimAnnotations"]
    sequences_file_type_list = ["ProcessedReads", "ReadsWithPredictedCDS", "ReadsWithMatches", "ReadsWithoutMatches",
                                "PredictedCDS", "PredictedCDSWithoutAnnotation", "PredictedCDSWithAnnotation",
                                "PredictedORFWithoutAnnotation", "ncRNA-tRNA-FASTA"]
    taxonomy_file_type_list = ["5S-rRNA-FASTA", "16S-rRNA-FASTA", "23S-rRNA-FASTA", "OTU-TSV", "OTU-BIOM",
                               "OTU-table-HDF5-BIOM", "OTU-table-JSON-BIOM", "NewickTree", "NewickPrunedTree"]
    # Default list of available file types
    default_file_type_list = sequences_file_type_list + function_file_type_list + taxonomy_file_type_list
    # Parse script parameters
    # Parse the project accession
    study_id = args['project_id']
    # Parse the values for the file type parameter
    selected_file_types_list = []
    if not args['file_type']:
        # If not specified use the default set of file types
        selected_file_types_list = default_file_type_list
    else:
        # Remove whitespaces
        selected_file_types_str = args['file_type'].replace(" ", "")
        # Set all functional result file types
        if selected_file_types_str == "AllFunction":
            selected_file_types_list = function_file_type_list
        elif selected_file_types_str == "AllTaxonomy":
            selected_file_types_list = taxonomy_file_type_list
        elif selected_file_types_str == "AllSequences":
            selected_file_types_list = sequences_file_type_list
        # Set defined file types
        elif len(selected_file_types_str.split(",")) > 1:
            selected_file_types_list = selected_file_types_str.split(",")
        # Set single file type
        else:
            selected_file_types_list.append(selected_file_types_str)
    # Parse the analysis version
    version = args['version']
    root_url = "https://www.ebi.ac.uk"
    study_url_template = root_url + "/metagenomics/projects/%s/runs"
    number_of_chunks_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s/chunks"
    chunk_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s/chunks/%s"
    download_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s"
    # Print out the program settings
    _print_program_settings(study_id, version, selected_file_types_list, args['output_path'], root_url)
    # Iterating over all file types
    for file_type in selected_file_types_list:
        domain = None
        fileExtension = None
        # Boolean flag to indicate if a file type is chunked or not
        is_chunked = True
        # Set the result file domain (sequences, function or taxonomy) dependent on the file type
        # Set output file extension (tsv, faa or fasta) dependent on the file type
        if file_type == 'InterProScan':
            domain = "function"
            fileExtension = ".tsv.gz"
        elif file_type == 'GOSlimAnnotations' or file_type == 'GOAnnotations':
            domain = "function"
            fileExtension = ".csv"
            is_chunked = False
        # PredictedCDS is version 1.0 and 2.0 only, from version 3.0 on this file type was replaced by
        # PredictedCDSWithAnnotation (PredictedCDS can be gained by concatenation of the 2 sequence file types now)
        elif file_type == 'PredictedCDS' or file_type == 'PredictedCDSWithoutAnnotation' or file_type == \
                'PredictedCDSWithAnnotation':
            if file_type == 'PredictedCDSWithAnnotation' and (version == '1.0' or version == '2.0'):
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            elif file_type == 'PredictedCDS' and version == '3.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            domain = "sequences"
            fileExtension = ".faa.gz"
        elif file_type == 'ncRNA-tRNA-FASTA':
            domain = "sequences"
            fileExtension = ".fasta"
            is_chunked = False
        elif file_type == '5S-rRNA-FASTA' or file_type == '16S-rRNA-FASTA' or file_type == '23S-rRNA-FASTA':
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".fasta"
        # NewickPrunedTree is version 2 only
        # NewickTree is version 1 only
        elif file_type == 'NewickPrunedTree' or file_type == 'NewickTree':
            if file_type == 'NewickPrunedTree' and version == '1.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            if file_type == 'NewickTree' and version == '2.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".tree"
        elif file_type == 'OTU-TSV':
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".tsv"
        # OTU-BIOM is version 1 only
        # OTU-table-HDF5-BIOM and OTU-table-JSON-BIOM are version 2 only
        elif file_type == 'OTU-BIOM' or file_type == 'OTU-table-HDF5-BIOM' or file_type == 'OTU-table-JSON-BIOM':
            if file_type == 'OTU-BIOM' and version == '2.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            if (file_type == 'OTU-table-HDF5-BIOM' or file_type == 'OTU-table-JSON-BIOM') and version == '1.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".biom"
        else:
            domain = "sequences"
            fileExtension = ".fasta.gz"
        # Retrieve a file stream handler from the given URL and iterate over each line (each run)
        # and build the download link using the variables from above
        file_stream_handler = _get_file_stream_handler(study_url_template, study_id)
        reader = csv.reader(StringIO(file_stream_handler), delimiter=',')
        for study_id, sample_id, run_id in reader:
            print(study_id + ", " + sample_id + ", " + run_id)
            output_path = args['output_path'] + "/" + study_id + "/" + file_type
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            if is_chunked:
                number_of_chunks = _get_number_of_chunks(number_of_chunks_url_template, study_id, sample_id,
                                                         run_id, version, domain, file_type)
                for chunk in range(1, number_of_chunks + 1):
                    output_file_name = output_path + "/" + run_id.replace(" ", "").replace(",", "-") + \
                                       "_" + file_type + "_" + str(chunk) + fileExtension
                    rootUrl = chunk_url_template % (study_id, sample_id, run_id, version, domain, file_type, chunk)
                    _download_resource_by_url(rootUrl, output_file_name)
            else:
                output_file_name = output_path + "/" + run_id.replace(" ", "").replace(",", "-") + \
                                   "_" + file_type + fileExtension
                rootUrl = download_url_template % (study_id, sample_id, run_id, version, domain, file_type)
                _download_resource_by_url(rootUrl, output_file_name)
    print("Program finished.")
start({'project_id':'ERP001736',
       'file_type': 'ProcessedReads,16S-rRNA-FASTA,OTU-TSV',
       'version': '2.0',
       'output_path': ''})
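For reference, the direct start(...) call at the end is what I swapped in for the original command-line handling; if run from a terminal instead, a minimal argparse wrapper along these lines could replace that call (a hypothetical sketch, the flag names are my own and not necessarily the tool's actual interface):

# Hypothetical replacement for the direct start(...) call above, if run from a terminal.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Bulk download of project result files")
    parser.add_argument("-p", "--project_id", required=True, help="Project accession, e.g. ERP001736")
    parser.add_argument("-t", "--file_type", default=None, help="Comma-separated list of file types")
    parser.add_argument("-v", "--version", default="2.0", help="Pipeline version")
    parser.add_argument("-o", "--output_path", required=True, help="Directory for the downloaded files")
    start(vars(parser.parse_args()))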
What I was thinking
I have (a little) experience with multithreading / multiprocessing / asynchronous requests, but can't figure out what I should do in this case. My Linux server has 20 CPUs, so I could do some MP (~208 / 20 = 10+ days), but from my previous experience the CPUs would only be used at ~1-5%, which seems like a waste of capacity. I haven't used the other two approaches for this kind of problem yet; I have only used them for simple http requests (just requesting a page and getting the result, not downloading a file in chunks).
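To make the direction I am considering concrete: since the downloads are I/O-bound rather than CPU-bound, a thread pool around the per-file download helper seems more natural than multiprocessing. A minimal sketch, assuming download_jobs is a list of (url, output_file_name) pairs built from the same URL templates as in the script above (the pool size of 16 is just a guess):

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(download_jobs, max_workers=16):
    # Each job is a (url, output_file_name) tuple; the pool keeps max_workers downloads in flight.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_download_resource_by_url, url, out): url
                   for url, out in download_jobs}
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from the download
            except Exception as error:
                print("Failed: " + futures[future] + " (" + str(error) + ")")

Whether threads, processes or async requests actually saturate the connection here is exactly what I am unsure about.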
What would be the fastest way to download all these 15,000 files (sequential is definitely not an option)?
If it is not too time-consuming, please provide a code example (or a reference) of what you mean.
Update
I used nload to measure the data flow / bandwidth usage while the script was downloading 1 file (there are of course background processes, but those seem negligible, only a few Mb's). I did this at 4 points in time and averaged the numbers:
Curr: 110 Mbit/s
Min: 30 Mbit/s
Avg: 90 Mbit/s
Max: 159.75 Mbit/s
Ttl: 752.41 GByte