为什么python zipfile模块比C更快?

时间:2015-09-15 00:50:19

标签: python zip extraction

我正在编写一个模块,需要能够快速处理大量的zip文件。因此,我打算使用用C而不是Python实现的提取器(并从Python中调用它)。为了测试哪种方法最快,我写了一个测试脚本,比较linux的'unzip'命令和czipfile python模块(一个C语言zip提取器的包装器)。作为对照,我使用了原生的python zipfile模块。

该脚本用100个约1MB的文件创建一个大约100MB的zip文件。它考察3个场景:A)文件都只是随机的字节串;B)文件只是随机的十六进制字符;C)文件是带有换行符、均匀随机生成的句子。

在所有情况下,zipfile(在python中实现)的性能与c中实现的两个提取器相当或明显更好。

有人知道为什么会出现这种情况吗?脚本附在下面。运行它需要安装czipfile,并且shell中要有'unzip'命令。

from datetime import datetime
import zipfile
import czipfile
import os, binascii, random

class ZipTestError(Exception):
    """Error raised by the zip benchmark.

    For example, it is raised when an unknown file type is requested.
    """



class ZipTest:
    """Benchmark zip extraction with zipfile, czipfile and the ``unzip`` command.

    Instantiating the class runs the whole benchmark: for each file type it
    builds a zip archive from freshly generated files, extracts it once with
    every method named in ``procs`` and prints the elapsed time of each.

    File types understood by ``make_file``:

    * ``'r'`` -- random noise straight out of ``os.urandom``.
    * ``'h'`` -- ``os.urandom`` noise converted to ASCII hex characters.
    * ``'s'`` -- randomly constructed sentences with line breaks.

    Every file is created in, and extracted to, the current working
    directory.
    """

    procs = ['zipfile', 'czipfile', 'os']
    type_map = {'r': 'Random', 'h': 'Random Hex', 's': 'Sentences'}

    # Workload size; override on an instance or subclass for a smaller run.
    file_count = 100        # number of files put into each archive
    file_size = 1048576     # bytes read from os.urandom per 'r'/'h' file
    sentence_count = 76260  # sentences written per 's' file

    def __init__(self):
        """Run the benchmark once for each of the three file types."""
        # print is called in function form with a single argument so the
        # statement is valid both as a Python 2 print statement and as a
        # Python 3 function call.
        print("Testing Random Byte Files:\n")
        self.test('r')
        self.test('h')
        self.test('s')

    @staticmethod
    def rand_name():
        """Return a random 20-character lowercase hex string.

        The value is used as a file name, so it is converted to the native
        ``str`` type (``b2a_hex`` returns bytes on Python 3).
        """
        return str(binascii.b2a_hex(os.urandom(10)).decode('ascii'))

    @staticmethod
    def _elapsed_ms(start, end):
        """Return the whole milliseconds elapsed between two datetimes."""
        delta = end - start
        # Integer arithmetic keeps the result an int on every Python version;
        # the days component is included so long runs are not truncated.
        seconds = delta.days * 86400 + delta.seconds
        return (seconds * 1000000 + delta.microseconds) // 1000

    @staticmethod
    def _remove_if_present(path):
        """Delete *path*, doing nothing when it does not exist."""
        if os.path.exists(path):
            os.remove(path)

    def make_file(self, t):
        """Create a randomly named file of type *t* and return its name.

        t: 'r' (random bytes), 'h' (random hex) or 's' (random sentences).

        Raises ZipTestError for any other value of *t*; no file is created
        in that case.
        """
        if t not in ('r', 'h', 's'):
            # Reject bad input before touching the file system.
            raise ZipTestError('Invalid Type')
        f_name = self.rand_name()
        # Binary mode: the payload is raw bytes for every file type.
        with open(f_name, 'wb') as f:
            if t == 'r':
                f.write(os.urandom(self.file_size))
            elif t == 'h':
                f.write(binascii.b2a_hex(os.urandom(self.file_size)))
            else:
                ops = ['dog', 'cat', 'rat']
                ops2 = ['meat', 'wood', 'fish']
                # random.choice picks each word with equal probability.
                f.writelines(
                    "The {0} eats {1}\n    ".format(
                        random.choice(ops), random.choice(ops2)
                    ).encode('ascii')
                    for _ in range(self.sentence_count)
                )
        return f_name

    def create_zip_test(self, t):
        """Create ``file_count`` files of type *t* and zip them together.

        With the default settings this is a ~100MB archive to test
        extraction on.  Sets ``file_names``, ``output_names`` and
        ``zip_name`` on the instance.
        """
        self.file_names = []
        self.output_names = []
        # Append one by one so that files already created stay tracked for
        # clean_up() even if a later make_file() call fails.
        for _ in range(self.file_count):
            self.file_names.append(self.make_file(t))
        self.zip_name = self.rand_name()
        output = zipfile.ZipFile(self.zip_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            for f in self.file_names:
                output.write(f)
        finally:
            output.close()

    def clean_up(self, rem_zip=False):
        """Delete the generated and extracted files.

        rem_zip: when true, also delete the archive and reset ``zip_name``
        to False.
        """
        for f in self.file_names:
            self._remove_if_present(f)
        self.file_names = []
        for f in self.output_names:
            self._remove_if_present(f)
        self.output_names = []
        if rem_zip:
            if getattr(self, 'zip_name', False):
                self._remove_if_present(self.zip_name)
            self.zip_name = False

    def display_res(self, res, t):
        """Print the timings in *res* (method name -> ms) for file type *t*."""
        print("\n{0} results:\n".format(self.type_map[t]))
        for p in self.procs:
            print("\n{0} = {1} milliseconds".format(p, str(res[p])))

    def test(self, t):
        """Build an archive of type *t*, time every extractor, then tidy up."""
        self.create_zip_test(t)
        res = self.unzip()
        self.display_res(res, t)
        self.clean_up(rem_zip=True)

    def unzip(self):
        """Run every extractor in ``procs``; return {name: elapsed ms}."""
        res = dict()
        for p in self.procs:
            # Remove the source files / previous output so each extractor
            # writes into a directory that does not contain its targets.
            self.clean_up()
            res[p] = getattr(self, "unzip_with_{0}".format(p))()
        return res

    def unzip_with_zipfile(self):
        """Time extraction with the standard library ``zipfile`` module."""
        return self.unzip_with_python(zipfile)

    def unzip_with_czipfile(self):
        """Time extraction with the ``czipfile`` module."""
        return self.unzip_with_python(czipfile)

    def unzip_with_python(self, mod):
        """Extract the archive with ``mod.ZipFile``; return elapsed ms.

        mod: a module exposing a ``ZipFile`` class with ``namelist``,
        ``extract`` and ``close``.  Only the extraction loop is timed; the
        archive is opened before the clock starts.
        """
        with open(self.zip_name, 'rb') as f:
            zf = mod.ZipFile(f)
            # try/finally rather than "with": only close() is relied upon.
            try:
                start = datetime.now()
                for name in zf.namelist():
                    zf.extract(name, './')
                    self.output_names.append(name)
                end = datetime.now()
            finally:
                zf.close()
        return self._elapsed_ms(start, end)

    def unzip_with_os(self):
        """Extract the archive with the ``unzip`` command; return elapsed ms.

        Raises ZipTestError when the command exits with a non-zero status.
        """
        # Record the member names up front so clean_up() can delete whatever
        # unzip writes; this bookkeeping is kept outside the timed section.
        zf = zipfile.ZipFile(self.zip_name)
        try:
            self.output_names.extend(zf.namelist())
        finally:
            zf.close()
        command = "unzip -qq {0}".format(self.zip_name)
        start = datetime.now()
        status = os.system(command)
        end = datetime.now()
        if status != 0:
            # A timing for a failed extraction would be meaningless.
            raise ZipTestError(
                'unzip failed with status {0}'.format(status))
        return self._elapsed_ms(start, end)






if __name__ == "__main__":
    # Constructing ZipTest is what runs the benchmark: its __init__ calls
    # test() for each of the three file types.
    ZipTest()

2 个答案:

答案 0 :(得分:1)

如上所述,在python中完成的是解密,而不是解压缩。所以zipfile的解压缩和其他两个一样,使用的是C实现。

答案 1 :(得分:1)

即使在算法相同的情况下C通常比解释型语言更快,不同的缓冲策略也会造成差异。这里有一些证据:

我对您的脚本进行了一些更改。差异在下面。

我把计时的起点移到了紧挨os.system调用之前。这一更改的影响并不明显,因为从中央目录中读取条目很快。于是我保存了zip文件,并在Python之外用shell内置的time命令来测量解压缩时间。结果表明,启动新进程的开销并不显著。

更有趣的变化是添加了libarchive。我得到的结果是这样的(毫秒):

             Random     Hex  Sentences
zipfile         368    1909        604
czipfile        241    1600       2313
os              707    2225        784
shell-measured  797    2272        737
libarchive      248    1513        451
         EXTRACTION METHOD

请注意,每次运行的结果都会有几毫秒的波动。shell测量的是real、user和sys三种时间(参见What do 'real', 'user' and 'sys' mean in the output of time(1)?)。上表给出的是real时间,以便与其他测量值保持一致。

用strace -c -w分析unzip发出的系统调用,可以更好地看清问题所在。它显示 Hex 场景下read调用的次数出现峰值:

             Random     Hex  Sentences
read            805   14597      12816
write          2600    3200       1600
         SYSTEM CALLS ISSUED BY unzip

下面是diff(它假定原始脚本名为ziptest.py,并位于您运行patch < _diff_的同一目录中):

--- ziptest.py.orig 2017-05-25 10:36:03.106994889 +0200
+++ ziptest.py  2017-05-25 11:30:42.032598259 +0200
@@ -2,6 +2,7 @@
 import zipfile
 import czipfile
 import os, binascii, random
+import libarchive.public

 class ZipTestError(Exception):
     pass
@@ -10,7 +11,7 @@

 class ZipTest:

-    procs = ['zipfile', 'czipfile', 'os']
+    procs = ['zipfile', 'czipfile', 'os', 'libarchive']
     type_map = {'r':'Random', 'h':'Random Hex', 's':'Sentences'}

     # three types. t=='r' is random noise files directly out of urandom. t=='h' is urandom noise converted to ascii characters. t=='s' are randomly constructed sentences with line breaks.
@@ -119,10 +120,10 @@

     def unzip_with_os(self):
         f = open(self.zip_name)
-        start = datetime.now()
         zf = zipfile.ZipFile(f)
         for name in zf.namelist():
             self.output_names.append(name)   
+        start = datetime.now()
         os.system("unzip -qq {0}".format(f.name))
         end = datetime.now()
         total = end-start
@@ -130,7 +131,15 @@
         ms += (total.seconds) * 1000000
         return ms /1000

-
+    def unzip_with_libarchive(self):
+        start = datetime.now()
+        for entry in libarchive.public.file_pour(self.zip_name):
+            self.output_names.append(str(entry))
+        end = datetime.now()
+        total = end-start
+        ms = total.microseconds
+        ms += (total.seconds) * 1000000
+        return ms /1000