Python:如何以流/管道的方式输出 gzip 压缩后的数据?

时间:2012-01-11 16:35:28

标签: python stream gzip

我需要做这样的事情,但是在python:

dd if=/dev/sdb | gzip -c | curl ftp upload

我无法直接用 Popen 执行整条命令,因为:

  1. 我需要非阻塞操作
  2. 我需要进度信息(尝试循环读取 proc.stderr 没有效果)
  3. 另一件大事是我无法在上传之前在内存或磁盘上创建压缩的gzip文件。

    下面是我想要实现的代码,其中 gzip_stream_of_strings(input) 该如何实现还不清楚:

    import os, pycurl
    # Path of the (very large) source image to upload; placeholder value.
    filename = '/path/to/super/large/file.img'
    # Size in bytes of the uncompressed source file, computed when this line runs.
    filesize = os.path.getsize(filename)
    
    def progress(dl_left, dl_completed, ul_left, ul_completed):
        """Progress callback for pycurl's PROGRESSFUNCTION option.

        Writes the upload percentage to stderr and returns 0. Under the
        pycurl/libcurl callback contract a non-zero return value aborts the
        transfer, so the percentage is reported as a side effect instead of
        being returned.

        Only ``ul_completed`` (bytes uploaded so far) is used; the other
        arguments are accepted because pycurl passes all four positionally.

        NOTE(review): the percentage is relative to the module-level
        ``filesize`` (the uncompressed size). When the upload is compressed
        on the fly the reported value will presumably stay below 100%.

        @return: always 0, which tells libcurl to continue the transfer
        """
        import sys  # local import keeps this snippet self-contained

        # float() avoids Python 2 integer division; guard against an empty file.
        if filesize:
            percent = (float(ul_completed) / filesize) * 100
        else:
            percent = 0.0
        sys.stderr.write('\rupload: %5.1f%%' % percent)
        return 0
    
    def main():
        """Upload ``filename`` to the FTP server, compressing it while it streams.

        Configures a pycurl handle for an FTP upload with progress reporting
        and feeds it from ``gzip_stream_of_strings(source)``, the helper this
        question asks about. It is expected to return a callable suitable
        for pycurl's READFUNCTION.

        The source file is opened in binary mode because it is a raw disk
        image, and the curl handle is closed in a ``finally`` clause so it is
        released even when the transfer raises.
        """
        c = pycurl.Curl()
        try:
            c.setopt(c.URL, 'ftp://IP/save_as.img.gz')
            c.setopt(pycurl.NOPROGRESS, 0)
            c.setopt(pycurl.PROGRESSFUNCTION, progress)
            c.setopt(pycurl.UPLOAD, 1)
            # NOTE(review): this is the uncompressed size, but the bytes actually
            # sent are compressed. Confirm libcurl tolerates the mismatch for
            # FTP, or drop this option.
            c.setopt(pycurl.INFILESIZE, filesize)
            c.setopt(pycurl.USERPWD, 'user:passwd')
            with open(filename, 'rb') as source:
                c.setopt(pycurl.READFUNCTION, gzip_stream_of_strings(source))
                # perform() must run while the source file is still open.
                c.perform()
        finally:
            c.close()
    

    非常感谢任何帮助!

    修改: 这是解决方案:

    from gzip import GzipFile
    from StringIO import StringIO
    
    CHUNCK_SIZE = 1024  # bytes pulled from the source per compression step

    class GZipPipe(StringIO):
        """Compression pipe that gzips a source stream lazily, on demand.

        The pipe passes itself to ``GzipFile`` as the output file object, so
        every compressed fragment arrives through ``write()`` and is
        collected in ``self.buffer``. ``read(size)`` pulls only as much raw
        data from ``source`` as is needed to satisfy the request, which makes
        the class usable as a streaming read callback.
        Credit to cdvddt @ http://snippets.dzone.com/posts/show/5644

        @param source: binary file-like object to compress (needs ``read(n)``)
        @param name: this is stored as the name in the gzip header
        @function read: call this to read(size) chunks from the gzip stream
        """
        def __init__(self, source = None, name = "data"):
            StringIO.__init__(self)

            self.source = source
            self.source_eof = False
            # Must exist before GzipFile is created: its constructor already
            # writes the gzip header into this pipe.
            self.buffer = b""
            self.zipfile = GzipFile(name, 'wb', 9, self)

        def write(self, data):
            """Collect compressed bytes emitted by the GzipFile."""
            self.buffer += data

        def read(self, size = -1):
            """Return up to ``size`` bytes of compressed data.

            @param size: maximum number of bytes to return; ``None`` or any
                negative value means "compress the rest of the source and
                return everything". ``0`` returns an empty string and leaves
                the buffered data untouched.
            @return: compressed bytes; empty once the stream is exhausted
            """
            read_all = size is None or size < 0

            while (read_all or len(self.buffer) < size) and not self.source_eof:
                if self.source is None:
                    break
                chunk = self.source.read(CHUNCK_SIZE)
                if chunk:
                    self.zipfile.write(chunk)
                else:
                    # Only an empty read means EOF; a short read (pipe, socket)
                    # does not. close() flushes the compressor and writes the
                    # gzip trailer into self.buffer.
                    self.source_eof = True
                    self.zipfile.close()

            if read_all:
                result = self.buffer
                self.buffer = b""
            else:
                result = self.buffer[:size]
                self.buffer = self.buffer[size:]

            return result
    

    像这样使用:

    # Open in binary mode: the source is raw image data, not text.
    with open(filename, 'rb') as source:
        c.setopt(pycurl.READFUNCTION, GZipPipe(source).read)
        # The transfer has to run inside the with-block; once the block exits
        # the file is closed and the read callback could no longer pull data.
        c.perform()
    

1 个答案:

答案 0 :(得分:2)

内置的zlib库允许使用任何文件类型对象,包括文本流。

import os, pycurl, zlib
from cStringIO import StringIO
# Path of the (very large) source image to upload; placeholder value.
filename = '/path/to/super/large/file.img'
# Size in bytes of the uncompressed source file, computed when this line runs.
filesize = os.path.getsize(filename)

def progress(dl_left, dl_completed, ul_left, ul_completed):
    """Progress callback for pycurl's PROGRESSFUNCTION option.

    Writes the upload percentage to stderr and returns 0. Under the
    pycurl/libcurl callback contract a non-zero return value aborts the
    transfer, so the percentage is reported as a side effect instead of
    being returned.

    Only ``ul_completed`` (bytes uploaded so far) is used; the other
    arguments are accepted because pycurl passes all four positionally.

    NOTE(review): the percentage is relative to the module-level
    ``filesize`` (the uncompressed size). When the upload is compressed on
    the fly the reported value will presumably stay below 100%.

    Returns:
        Always 0, which tells libcurl to continue the transfer.
    """
    import sys  # local import keeps this snippet self-contained

    # float() avoids Python 2 integer division; guard against an empty file.
    if filesize:
        percent = (float(ul_completed) / filesize) * 100
    else:
        percent = 0.0
    sys.stderr.write('\rupload: %5.1f%%' % percent)
    return 0

def _gzip_read_callback(source, level=9):
    """Return a ``read(size)`` callable that yields ``source`` gzip-compressed.

    The source is consumed in fixed-size chunks and compressed
    incrementally, so neither the raw file nor the compressed result is
    ever held in memory as a whole.

    Args:
        source: binary file-like object providing ``read(n)``.
        level: zlib compression level (0-9).

    Returns:
        A function ``read(size)`` returning at most ``size`` compressed
        bytes, and an empty byte string once the stream is exhausted.
    """
    # wbits = 16 + MAX_WBITS makes zlib emit a gzip header and trailer, which
    # matches the '.gz' name used for the upload.
    compressor = zlib.compressobj(level, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    # Mutable container instead of ``nonlocal`` so this also runs on Python 2.
    state = {'pending': b'', 'done': False}

    def read(size):
        while len(state['pending']) < size and not state['done']:
            chunk = source.read(64 * 1024)
            if chunk:
                state['pending'] += compressor.compress(chunk)
            else:
                # An empty read means EOF: emit whatever the compressor still holds.
                state['pending'] += compressor.flush()
                state['done'] = True
        data = state['pending'][:size]
        state['pending'] = state['pending'][size:]
        return data

    return read


def main():
    """Upload ``filename`` to the FTP server as a gzip stream.

    Configures a pycurl handle for an FTP upload with progress reporting and
    installs a read callback that compresses the file chunk by chunk while
    libcurl sends it. The curl handle is closed in a ``finally`` clause so
    it is released even when the transfer raises.
    """
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, 'ftp://IP/save_as.img.gz')
        c.setopt(pycurl.NOPROGRESS, 0)
        c.setopt(pycurl.PROGRESSFUNCTION, progress)
        c.setopt(pycurl.UPLOAD, 1)
        # NOTE(review): this is the uncompressed size, but the bytes actually
        # sent are compressed. Confirm libcurl tolerates the mismatch for FTP,
        # or drop this option.
        c.setopt(pycurl.INFILESIZE, filesize)
        c.setopt(pycurl.USERPWD, 'user:passwd')
        # Binary mode: the source is raw image data, not text.
        with open(filename, 'rb') as source:
            c.setopt(pycurl.READFUNCTION, _gzip_read_callback(source))
            # perform() must run while the source file is still open.
            c.perform()
    finally:
        c.close()

我没有测试过这段代码。更多信息请参阅这个 SO 问题。