Python文件在文件中寻找+写入输出奇怪的“NUL”

时间:2015-05-31 10:16:08

标签: python

我正在编写一个下载程序,它会将url拆分为部分并使用线程下载,可能我不会使用“join”因为join =无法流式传输(如果所有线程都未完成则无法写入文件)

但问题是f.seek和写输出真的很奇怪的文件,文件的内容总是有“NUL”字符(在Notepad ++中),文件中的文本只是整个文件的1/3。

大家好,感谢大家的帮助,这是我的2.0版代码,感谢Padraic Cunningham提出的建议和解释,我修改我的代码几乎就像你建议的那样: 所以请帮我查一下代码,我觉得需要你帮忙把它转换成http.server文件流的方法:

import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0
#if os.path.exists(fileName):
 #   os.remove(fileName)

def defwrite(filename,data,offset):
  f = open(filename,'wb')
  f.seek(offset)
  f.write(data)
  f.close()

def buildRange(url, numsplits):
    global pool
    value = int(requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None))
    print("Fullsize: ", value)
    print("Try devide with 3 :", value / 3)
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (i * value//numsplits + 1, i * value//numsplits + 1 + (value - (i * value//numsplits + 1))))
        if i == 0:
            lst.append('%s-%s' % (0, value//numsplits))
        else:
            lst.append('%s-%s' % (i * value//numsplits + 1, (i + 1) * value//numsplits))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool, countsize
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"

    #print("%s bytes to download." % sizeInBytes)
   # if not sizeInBytes:
    #    print("Size cannot be determined.")
     #   return
    #sinzeInBytes = buildRange(url, 
    dataDict = {}
    f = open(fileName,'wb')

    # split total num bytes into ranges
    #ranges = buildRange(url,int(sizeInBytes), splitBy)
    ranges = buildRange(url, splitBy)
    print(ranges)
    def downloadChunk(idx, irange):
        print(idx)
        #time.sleep(1*idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        #print(data)
        #print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        print(offset)
       # print(irange)
        f.seek(offset, 0)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        countsize = countsize + offset


        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx,irange in enumerate(ranges)
        ]


    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
        #th.join()
        #print(th.join)
    print(countsize)
    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum( (
    ##        len(chunk) for chunk in list(dataDict.values())
     #   ) )
    #))

    #print("--- %s seconds ---" % str(time.time() - start_time))

#    if os.path.exists(fileName):
 #       os.remove(fileName)
     #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx,chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #  while True:
    #      for _idx,chunk in sorted(dataDict.items()):
            #fh.write(chunk)
     #       chunking = chunk.read(stream_chunk)
      #      if not chunk:
       #         break
        #    fp.write(chunking)


   # print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
   if os.path.exists(fileName):
     os.remove(fileName)
   main(URL, splitBy=16)

这是我的代码,请帮我修复它:版本1.0,忽略它,上面的版本2.0:

import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
#if os.path.exists(fileName):
 #   os.remove(fileName)

def defwrite(filename,data,offset):
  f = open(filename,'wb')
  f.seek(offset)
  f.write(data)
  f.close()

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(value - round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeInBytes)
    if not sizeInBytes:
        print("Size cannot be determined.")
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        print(idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        print(data)
        print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        #print(offset)
       # print(irange)
        f = open(fileName,'wb')
        f.seek(offset)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()



        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx,irange in enumerate(ranges)
        ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
        #th.join()
        #print(th.join)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum( (
    ##        len(chunk) for chunk in list(dataDict.values())
     #   ) )
    #))

    #print("--- %s seconds ---" % str(time.time() - start_time))

#    if os.path.exists(fileName):
 #       os.remove(fileName)
     #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx,chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #  while True:
    #      for _idx,chunk in sorted(dataDict.items()):
            #fh.write(chunk)
     #       chunking = chunk.read(stream_chunk)
      #      if not chunk:
       #         break
        #    fp.write(chunking)


   # print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    main(URL, splitBy=3)

1 个答案:

答案 0 :(得分:1)

您使用目标函数为downloadChunk的三个主题,使用wb打开文件三次覆盖,这样您就获得了1/3的内容。你也没有明显的理由打电话寻求。如果要附加到文件,则每次都使用a打开,或者只在函数外打开一次文件。 您正试图寻找使用空文件并写入,以便空字节来自。

如果要打开文件进行读写,可以使用行缓冲进行搜索:

 with open("whatever.file", "r+b",buffering=1) as f

然后使用该文件写入,不要在函数中保持打开并覆盖,文件也必须存在。