分段下载期间二进制文件损坏

时间:2015-12-05 16:13:04

标签: python multithreading python-2.7 for-loop urllib

所以我试图使用Python 2.7编写一个非常简单的Internet Download Manager欺骗

应该根据字节范围查询文件HTTP头,获取字节范围并在no.of线程(为简单起见,我硬编码2)之间传播下载,然后再将文件部分连接在一起。

该脚本可以轻松下载csv文件和文本文件,而不会丢失文件的完整性。 MD5校验和保持不变

问题是对于具有一点复杂性的文件,例如bin文件,zip文件,视频文件和音乐文件,由于某种原因,完整性会丢失。我认为字节的顺序是混乱的。

示例:

  1. mp3 downloaded through Chrome

  2. mp3 downloaded through my script with 2 threads

  3. Python源代码:

    from __future__ import print_function
    
    import threading
    import urllib
    import urllib2
    
    import time
    
    threads = []
    parts = {}
    
    # url to open
    url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
    u = urllib.urlopen(url)
    
    # define file
    file_name = "test.mp3"
    f = open(file_name, 'wb')
    
    
    # open url and get header info
    def get_file_size(url):
        stream_size =  u.info()['Content-Length']
        file_size = stream_size
        return file_size
    
    start = 0
    #get stream size
    end = get_file_size(url)
    # specify block size
    block_sz = 512
    
    #algo to divide work among 2 threads
    def calculate_no_of_bytes_for_thread1():
        full_stream_size = end
        first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
        print(first_thread)
        return first_thread
    
    #algo to divide work among 2 threads
    def calculate_no_of_bytes_for_thread2():
        full_stream_size = end
        second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
        print(second_thread)
        return second_thread
    
    
    
    # download function
    def download_thread(url ,id,start,end):
        current_size = int(float(start)/1024)
        total_size = int(float(end)/1024)
        print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
    
        # specify request range and init stream
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes=%s-%s' % (start, end)
    
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            start += len(buffer)
            f.write(buffer)
            thread_id = id
            status =  "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
            print (status)
    
    #starts 2 threads
    def start_threads():
        for i in range(2):
            #if first loop, start thread 1
            if(i==0):
                start = calculate_no_of_bytes_for_thread1().get('start')
                end = calculate_no_of_bytes_for_thread1().get('end')
                print("Thread 1 started")
                t = threading.Thread(target=download_thread, args=(url,i,start,end))
                t.start()
                threads.append( t)
            #if second loop, start thread 2
            if(i==1):
                start = calculate_no_of_bytes_for_thread2().get('start')
                end = calculate_no_of_bytes_for_thread2().get('end')
                print("Thread 2 started")
                t = threading.Thread(target=download_thread, args=(url,i,start,end))
                t.start()
                threads.append( t)
    
        # Join threads back (order doesn't matter, you just want them all)
        for i in threads:
           i.join()
    
        # Sort parts and you're done
        # result = ''
        # for i in range(2):
        #     result += parts[i*block_sz]
    
    #start benchmarking
    start_time = time.clock()
    
    start_threads()
    
    print ("Finito!")
    
    end_time = time.clock()
    benchmark = str(end_time - start_time)
    print ("Download took_" +benchmark)
    
    f.close()
    

    所以经过Mark的一些见解,(谢谢)我得到了脚本工作,它完美地下载了文件。我了解到每一个字节都很重要!所以这是工作代码

    import urllib
    import urllib2
    import threading
    import time
    
    f = open("newfile.zip", "wb")
    url = "http://greenbookhymns.s3.amazonaws.com/245to257.zip"
    parts = {}
    threads = []
    
    #gets file size
    d = urllib.urlopen(url)
    file_size = d.info()['Content-Length']
    print ("File Size = " + str(file_size))
    
    #get thread_no
    thread_no = int(file_size) / 1000000
    
    #urllib2 range download function
    def download(thread_no,start_point, end_point):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes=%s-%s' % (start_point, end_point)
        f = urllib2.urlopen(req)
        parts[thread_no] = f.read()
    
    #launch threads targeting download function
    def thread_launcher(thread_no):
        for i in range(thread_no):
            if i == 0:
                t = threading.Thread(target=download, args=(i,0,1000000,))
                t.start()
                threads.append( t)
                print "iteration 0starting_point0ending_point1000000"
            elif i > 0:
                start_point = (i*1000000)+1
                end_point = (i*1000000)+1000000
                t = threading.Thread(target=download, args=(i,start_point,end_point,))
                t.start()
                threads.append( t)
                print "iteration" + str(i) + "starting_point" + str(start_point) + "end_point" + str(end_point)
    
            last_file_part_start_point = (thread_no * 1000000) +1
            remaining_bytes= int(file_size) - int(thread_no*1000000)
            print str(remaining_bytes)
            last_file_part_end_point = (thread_no*1000000) + remaining_bytes
            print "iteration" + str(thread_no) + "starting_point" + str(last_file_part_start_point) + "end_point" + str(last_file_part_end_point)
            t = threading.Thread(target=download, args=(thread_no,(last_file_part_start_point), last_file_part_end_point,))
            t.start()
            threads.append(t)
    
    thread_launcher(thread_no)
    
    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
        i.join()
    
    # Sort parts and you're done
    result = ''
    for i in range(thread_no+1):
        result += parts[i]
    
    f.write(result)
    
    f.close()
    
    exit()
    

1 个答案:

答案 0 :(得分:0)

您的下载线程函数会在收到数据时将数据写入文件f。你有两个并行运行的线程,因此它将收到的数据混淆到文件中。