Background
For our research we currently need to download ~15,000 files. While the database has its own command-line tool that supports "bulk" downloading, running it sequentially for 15,000 runs is completely unfeasible (which is what the command-line tool currently does).
Simple math
I downloaded a couple of runs with the currently available command-line tool and took the average runtime: roughly 20 minutes per file (if not more). Doing this for all 15,000 files would therefore take 15,000 * 20 / 60 / 24 = ~208 days, which would only be nice if you got paid per hour of script runtime ;)
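For reference, the same back-of-the-envelope calculation as a quick Python check (the 20 minutes is my measured average, not an exact figure):

minutes_per_file = 20      # rough measured average per download
total_files = 15000
print(total_files * minutes_per_file / 60 / 24)   # ~208.3 days of sequential downloading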
Bulk command-line script
This is the sub-script that lets users download bulk data (not written by me):
Note:
I changed it slightly so that I can run it straight from my IDE (and don't have to start it from the command line for every little change); a sketch of the kind of command-line wrapper this replaces follows right after the script.
'''
Created on 27/10/2015
@author: Maxim Scheremetjew
amended 07/11/2016 by Maxim Scheremetjew
version: 1.1
'''
import sys
import argparse
import csv
import os
import urllib.request, urllib.parse, urllib.error
from urllib.error import URLError
from io import StringIO
def _download_resource_by_url(url, output_file_name):
"""Kicks off a download and stores the file at the given path.
Arguments:
'url' -- Resource location.
'output_file_name' -- Path of the output file.
"""
print("Starting the download of the following file...")
print(url)
print("Saving file in:\n" + output_file_name)
try:
urllib.request.urlretrieve(url, output_file_name)
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
print("Download finished.")
def _get_number_of_chunks(url_template, study_id, sample_id, run_id, version, domain, file_type):
"""
Returns the number of chunks for the given set of parameters (study, sample and run identifier).
"""
print("Getting the number of chunks from the following URL...")
url_get_number_of_chunks = url_template % (
study_id, sample_id, run_id, version, domain, file_type)
print(url_get_number_of_chunks)
try:
file_stream_handler = urllib.request.urlopen(url_get_number_of_chunks)
result = int(file_stream_handler.read())
print("Retrieved " + str(result) + " chunks.")
return result
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
except ValueError as e:
print(e)
print("Skipping this run! Could not retrieve the number of chunks for this URL. " \
"Check the version number in the URL and check if the run is available online.")
return 0
def _get_file_stream_handler(url_template, study_id):
"""
Returns a file stream handler for the given URL.
"""
print("Getting the list of project runs...")
url_get_project_runs = url_template % (study_id)
try:
req = urllib.request.Request(url=url_get_project_runs, headers={'Content-Type': 'text/plain'})
res = urllib.request.urlopen(req)
dec_res = res.read().decode()
sys.stderr.write(str(dec_res))
return dec_res
except URLError as url_error:
print(url_error)
raise
except IOError as io_error:
print(io_error)
raise
except ValueError as e:
print(e)
print("Could not retrieve any runs. Open the retrieval URL further down in your browser and see if you get any results back. Program will exit now.")
print(url_get_project_runs)
raise
def _print_program_settings(project_id, version, selected_file_types_list, output_path, root_url):
print("Running the program with the following setting...")
print("Project: " + project_id)
print("Pipeline version: " + version)
print("Selected file types: " + ",".join(selected_file_types_list))
print("Root URL: " + root_url)
print("Writing result to: " + output_path)
def start(args):
    function_file_type_list = ["InterProScan", "GOAnnotations", "GOSlimAnnotations"]
    sequences_file_type_list = ["ProcessedReads", "ReadsWithPredictedCDS", "ReadsWithMatches", "ReadsWithoutMatches",
                                "PredictedCDS", "PredictedCDSWithoutAnnotation", "PredictedCDSWithAnnotation",
                                "PredictedORFWithoutAnnotation", "ncRNA-tRNA-FASTA"]
    taxonomy_file_type_list = ["5S-rRNA-FASTA", "16S-rRNA-FASTA", "23S-rRNA-FASTA", "OTU-TSV", "OTU-BIOM",
                               "OTU-table-HDF5-BIOM", "OTU-table-JSON-BIOM", "NewickTree", "NewickPrunedTree"]
    # Default list of available file types
    default_file_type_list = sequences_file_type_list + function_file_type_list + taxonomy_file_type_list
    # Parse script parameters
    # Parse the project accession
    study_id = args['project_id']
    # Parse the values for the file type parameter
    selected_file_types_list = []
    if not args['file_type']:
        # If not specified use the default set of file types
        selected_file_types_list = default_file_type_list
    else:
        # Remove whitespaces
        selected_file_types_str = args['file_type'].replace(" ", "")
        # Set all functional result file types
        if selected_file_types_str == "AllFunction":
            selected_file_types_list = function_file_type_list
        elif selected_file_types_str == "AllTaxonomy":
            selected_file_types_list = taxonomy_file_type_list
        elif selected_file_types_str == "AllSequences":
            selected_file_types_list = sequences_file_type_list
        # Set defined file types
        elif len(selected_file_types_str.split(",")) > 1:
            selected_file_types_list = selected_file_types_str.split(",")
        # Set single file type
        else:
            selected_file_types_list.append(selected_file_types_str)
    # Parse the analysis version
    version = args['version']
    root_url = "https://www.ebi.ac.uk"
    study_url_template = root_url + "/metagenomics/projects/%s/runs"
    number_of_chunks_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s/chunks"
    chunk_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s/chunks/%s"
    download_url_template = root_url + "/metagenomics/projects/%s/samples/%s/runs/%s/results/versions/%s/%s/%s"
    # Print out the program settings
    _print_program_settings(study_id, version, selected_file_types_list, args['output_path'], root_url)
    # Iterating over all file types
    for file_type in selected_file_types_list:
        domain = None
        fileExtension = None
        # Boolean flag to indicate if a file type is chunked or not
        is_chunked = True
        # Set the result file domain (sequences, function or taxonomy) dependent on the file type
        # Set output file extension (tsv, faa or fasta) dependent on the file type
        if file_type == 'InterProScan':
            domain = "function"
            fileExtension = ".tsv.gz"
        elif file_type == 'GOSlimAnnotations' or file_type == 'GOAnnotations':
            domain = "function"
            fileExtension = ".csv"
            is_chunked = False
        # PredictedCDS is version 1.0 and 2.0 only, from version 3.0 on this file type was replaced by
        # PredictedCDSWithAnnotation (PredictedCDS can be gained by concatenation of the 2 sequence file types now)
        elif file_type == 'PredictedCDS' or file_type == 'PredictedCDSWithoutAnnotation' or file_type == \
                'PredictedCDSWithAnnotation':
            if file_type == 'PredictedCDSWithAnnotation' and (version == '1.0' or version == '2.0'):
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            elif file_type == 'PredictedCDS' and version == '3.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            domain = "sequences"
            fileExtension = ".faa.gz"
        elif file_type == 'ncRNA-tRNA-FASTA':
            domain = "sequences"
            fileExtension = ".fasta"
            is_chunked = False
        elif file_type == '5S-rRNA-FASTA' or file_type == '16S-rRNA-FASTA' or file_type == '23S-rRNA-FASTA':
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".fasta"
        # NewickPrunedTree is version 2 only
        # NewickTree is version 1 only
        elif file_type == 'NewickPrunedTree' or file_type == 'NewickTree':
            if file_type == 'NewickPrunedTree' and version == '1.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            if file_type == 'NewickTree' and version == '2.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".tree"
        elif file_type == 'OTU-TSV':
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".tsv"
        # OTU-BIOM is version 1 only
        # OTU-table-HDF5-BIOM and OTU-table-JSON-BIOM are version 2 only
        elif file_type == 'OTU-BIOM' or file_type == 'OTU-table-HDF5-BIOM' or file_type == 'OTU-table-JSON-BIOM':
            if file_type == 'OTU-BIOM' and version == '2.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            if (file_type == 'OTU-table-HDF5-BIOM' or file_type == 'OTU-table-JSON-BIOM') and version == '1.0':
                print("File type '" + file_type + "' is not available for version " + version + "!")
                continue
            is_chunked = False
            domain = "taxonomy"
            fileExtension = ".biom"
        else:
            domain = "sequences"
            fileExtension = ".fasta.gz"
        # Retrieve a file stream handler from the given URL and iterate over each line (each run)
        # and build the download link using the variables from above
        file_stream_handler = _get_file_stream_handler(study_url_template, study_id)
        reader = csv.reader(StringIO(file_stream_handler), delimiter=',')
        for study_id, sample_id, run_id in reader:
            print(study_id + ", " + sample_id + ", " + run_id)
            output_path = args['output_path'] + "/" + study_id + "/" + file_type
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            if is_chunked:
                number_of_chunks = _get_number_of_chunks(number_of_chunks_url_template, study_id, sample_id,
                                                         run_id, version, domain, file_type)
                for chunk in range(1, number_of_chunks + 1):
                    output_file_name = output_path + "/" + run_id.replace(" ", "").replace(",", "-") + \
                                       "_" + file_type + "_" + str(chunk) + fileExtension
                    rootUrl = chunk_url_template % (study_id, sample_id, run_id, version, domain, file_type, chunk)
                    _download_resource_by_url(rootUrl, output_file_name)
            else:
                output_file_name = output_path + "/" + run_id.replace(" ", "").replace(",", "-") + \
                                   "_" + file_type + fileExtension
                rootUrl = download_url_template % (study_id, sample_id, run_id, version, domain, file_type)
                _download_resource_by_url(rootUrl, output_file_name)
    print("Program finished.")
start({'project_id':'ERP001736',
       'file_type': 'ProcessedReads,16S-rRNA-FASTA,OTU-TSV',
       'version': '2.0',
       'output_path': ''})
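For reference, the direct start(...) call at the end is what I swapped in for the original command-line handling; if run from a terminal instead, a minimal argparse wrapper along these lines could replace that call (a hypothetical sketch, the flag names are my own and not necessarily the tool's actual interface):

# Hypothetical replacement for the direct start(...) call above, if run from a terminal.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Bulk download of project result files")
    parser.add_argument("-p", "--project_id", required=True, help="Project accession, e.g. ERP001736")
    parser.add_argument("-t", "--file_type", default=None, help="Comma-separated list of file types")
    parser.add_argument("-v", "--version", default="2.0", help="Pipeline version")
    parser.add_argument("-o", "--output_path", required=True, help="Directory for the downloaded files")
    start(vars(parser.parse_args()))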
What I was thinking
I have (a little) experience with multithreading / multiprocessing / asynchronous requests, but can't figure out what I should do in this case. My Linux server has 20 CPUs, so I could do some MP (~208 / 20 = 10+ days), but from my previous experience the CPUs would only be used at ~1-5%, which seems like a waste of capacity. I haven't used the other two approaches for this kind of problem yet; I have only used them for simple http requests (just requesting a page and getting the result, not downloading a file in chunks).
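To make the direction I am considering concrete: since the downloads are I/O-bound rather than CPU-bound, a thread pool around the per-file download helper seems more natural than multiprocessing. A minimal sketch, assuming download_jobs is a list of (url, output_file_name) pairs built from the same URL templates as in the script above (the pool size of 16 is just a guess):

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(download_jobs, max_workers=16):
    # Each job is a (url, output_file_name) tuple; the pool keeps max_workers downloads in flight.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_download_resource_by_url, url, out): url
                   for url, out in download_jobs}
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from the download
            except Exception as error:
                print("Failed: " + futures[future] + " (" + str(error) + ")")

Whether threads, processes or async requests actually saturate the connection here is exactly what I am unsure about.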
What would be the fastest way to download all these 15,000 files (sequential is definitely not an option)?
If it is not too time-consuming, please provide a code example (or a reference) of what you mean.
Update
I used nload to measure the data flow / bandwidth usage while the script was downloading 1 file (there are of course background processes, but those seem negligible, only a few Mb's). I did this at 4 points in time and averaged the numbers:
Curr: 110 Mbit/s
Min: 30 Mbit/s
Avg: 90 Mbit/s
Max: 159.75 Mbit/s
Ttl: 752.41 GByte