So I am trying to write a very simple Internet Download Manager knock-off in Python 2.7.
It should query the file's HTTP headers for the byte range, split that range across a number of threads (for simplicity I hard-coded 2), download the pieces in parallel, and then join the file parts back together.
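For context, the whole scheme leans on two response headers: Content-Length for the total size and Accept-Ranges for whether the server honours partial requests. A minimal, standalone sketch of that check (using the same sample URL as the script further down) might look like this:

import urllib2

url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
info = urllib2.urlopen(url).info()
# Content-Length is the number of bytes to split between the threads;
# "Accept-Ranges: bytes" means the server will honour Range requests
print "Content-Length: " + str(info.getheader('Content-Length'))
print "Accept-Ranges: " + str(info.getheader('Accept-Ranges', 'not advertised'))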
The script downloads CSV and text files without losing any integrity; the MD5 checksums stay the same.
The problem is that for anything a bit more complex, such as bin, zip, video, and music files, the integrity is lost for some reason. I think the bytes end up out of order.
Example:
Python source code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time

threads = []
parts = {}

# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)

# define file
file_name = "test.mp3"
f = open(file_name, 'wb')

# open url and get header info
def get_file_size(url):
    stream_size = u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
# get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start': 0, 'end': (int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

# algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread = {'start': int(full_stream_size)/2, 'end': int(full_stream_size)}
    print(second_thread)
    return second_thread

# download function
def download_thread(url, id, start, end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print("Start at_" + str(current_size) + "Ends at_" + str(total_size))
    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        status = "Thread ID_" + str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" + str(total_size)
        print(status)

# starts 2 threads
def start_threads():
    for i in range(2):
        # if first loop, start thread 1
        if(i==0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
        # if second loop, start thread 2
        if(i==1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url, i, start, end))
            t.start()
            threads.append(t)
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

# start benchmarking
start_time = time.clock()
start_threads()
print("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print("Download took_" + benchmark)
f.close()
So after some insight from Mark (thanks!) I got the script working, and it downloads the file perfectly. I learned that every single byte matters! Here is the working code:
import urllib
import urllib2
import threading
import time

f = open("newfile.zip", "wb")
url = "http://greenbookhymns.s3.amazonaws.com/245to257.zip"
parts = {}
threads = []

# gets file size
d = urllib.urlopen(url)
file_size = d.info()['Content-Length']
print ("File Size = " + str(file_size))

# get thread_no
thread_no = int(file_size) / 1000000

# urllib2 range download function
def download(thread_no, start_point, end_point):
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start_point, end_point)
    f = urllib2.urlopen(req)
    parts[thread_no] = f.read()

# launch threads targeting download function
def thread_launcher(thread_no):
    for i in range(thread_no):
        if i == 0:
            t = threading.Thread(target=download, args=(i, 0, 1000000,))
            t.start()
            threads.append(t)
            print "iteration 0starting_point0ending_point1000000"
        elif i > 0:
            start_point = (i*1000000)+1
            end_point = (i*1000000)+1000000
            t = threading.Thread(target=download, args=(i, start_point, end_point,))
            t.start()
            threads.append(t)
            print "iteration" + str(i) + "starting_point" + str(start_point) + "end_point" + str(end_point)
    last_file_part_start_point = (thread_no * 1000000) + 1
    remaining_bytes = int(file_size) - int(thread_no*1000000)
    print str(remaining_bytes)
    last_file_part_end_point = (thread_no*1000000) + remaining_bytes
    print "iteration" + str(thread_no) + "starting_point" + str(last_file_part_start_point) + "end_point" + str(last_file_part_end_point)
    t = threading.Thread(target=download, args=(thread_no, last_file_part_start_point, last_file_part_end_point,))
    t.start()
    threads.append(t)

thread_launcher(thread_no)

# Join threads back (order doesn't matter, you just want them all)
for i in threads:
    i.join()

# Sort parts and you're done
result = ''
for i in range(thread_no+1):
    result += parts[i]
f.write(result)
f.close()
exit()
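One detail worth spelling out about the byte arithmetic above (a side note, not part of the script): HTTP Range headers are inclusive on both ends, which is why each part has to start at the previous part's end plus one.

# minimal sketch: 'bytes=0-1000000' returns 1,000,001 bytes, so the next
# part must start at 1000001 or one byte gets fetched twice and everything
# after it shifts
ranges = [(0, 1000000), (1000001, 2000000), (2000001, 2999999)]
total = sum(end - start + 1 for start, end in ranges)
assert total == 3000000  # every byte covered exactly once, no overlap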
Answer 0 (score: 0)
Your download thread function writes the data to the file f as soon as it receives it. You have two threads running in parallel, so the data they receive gets interleaved into the file out of order.
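A minimal sketch of one way around that (assumed names, using the question's sample URL, not necessarily how the asker ended up doing it): either collect each part in memory keyed by its index and concatenate afterwards, as the updated script above does, or give every thread its own file handle and seek() to its part's offset so the parallel writes cannot collide:

import threading
import urllib2

url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
out_path = "test.mp3"

def fetch_part(start, end):
    # ask the server for just this slice of the file (Range is inclusive)
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    data = urllib2.urlopen(req).read()
    # a separate handle per thread, positioned at this part's offset,
    # so the two writers never step on each other
    out = open(out_path, 'r+b')
    out.seek(start)
    out.write(data)
    out.close()

size = int(urllib2.urlopen(url).info()['Content-Length'])
open(out_path, 'wb').close()          # create the empty output file first
half = size // 2
threads = [threading.Thread(target=fetch_part, args=(0, half)),
           threading.Thread(target=fetch_part, args=(half + 1, size - 1))]
for t in threads:
    t.start()
for t in threads:
    t.join()

This sketch still reads each part in one go; swapping the single read() for a loop over smaller chunks would keep memory flat for large files, which the dict-of-parts approach cannot do.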