我正在编写一个模块,需要能够快速处理大量的zip文件。因此,我打算使用用C而不是Python实现的提取器(并从Python中调用它)。为了测试哪种方法最快,我写了一个测试脚本,比较Linux的'unzip'命令和czipfile Python模块(一个C语言zip提取器的包装器)。作为对照,我使用了原生的Python zipfile模块。
该脚本用100个约1MB的文件创建一个大约100MB的zip文件,并考察3种场景:A)文件内容是随机字节串;B)文件内容是随机的十六进制字符;C)文件内容是带换行符的均匀随机句子。
在所有情况下,zipfile(在python中实现)的性能与c中实现的两个提取器相当或明显更好。
对于为什么会出现这种情况,有什么想法吗?脚本附在下面。运行它需要czipfile模块以及shell中的'unzip'命令。
from datetime import datetime
import zipfile
import czipfile
import os, binascii, random
class ZipTestError(Exception):
    """Raised when the zip benchmark is asked to do something invalid.

    The benchmark raises it with the message ``'Invalid Type'`` when a
    file type code other than the supported ones is requested.
    """
class ZipTest(object):
    """Benchmark how fast different extractors unpack the same zip archive.

    For each file type in ``type_map`` the benchmark creates ``FILE_COUNT``
    files in the current working directory, packs them into a deflated zip
    archive, extracts that archive once with every extractor listed in
    ``procs`` and prints the elapsed time in milliseconds.

    Instantiating the class runs the complete benchmark (see ``__init__``).
    The code is written to run on both Python 2 and Python 3.
    """

    # Extractors to time; each entry ``p`` is dispatched to the method
    # named ``unzip_with_<p>``.
    procs = ['zipfile', 'czipfile', 'os']
    type_map = {'r': 'Random', 'h': 'Random Hex', 's': 'Sentences'}
    # three types. t=='r' is random noise files directly out of urandom.
    # t=='h' is urandom noise converted to ascii characters. t=='s' are
    # randomly constructed sentences with line breaks.

    # Number of files packed into each test archive.
    FILE_COUNT = 100
    # Number of urandom bytes per file ('h' files are twice as large on
    # disk because every byte becomes two hex characters).
    FILE_SIZE = 1048576
    # Number of sentences written to each 's' file.
    SENTENCE_COUNT = 76260

    def __init__(self):
        """Run the benchmark for all three file types and print the results."""
        print("Testing Random Byte Files:\n")
        for t in ('r', 'h', 's'):
            self.test(t)

    @staticmethod
    def rand_name():
        """Return a random 20-character lowercase hex string.

        The value is converted to the native ``str`` type so it can be used
        as a file name on both Python 2 and Python 3.
        """
        return str(binascii.b2a_hex(os.urandom(10)).decode('ascii'))

    @staticmethod
    def _elapsed_ms(start, end):
        """Return the whole milliseconds elapsed between two datetimes."""
        delta = end - start
        # Include ``days`` so that very long runs are not truncated.
        whole_seconds = delta.days * 86400 + delta.seconds
        return whole_seconds * 1000 + delta.microseconds // 1000

    @staticmethod
    def _remove_if_exists(path):
        """Delete ``path`` unless it is already gone."""
        # A failed extraction can leave a name recorded with no file behind.
        if os.path.exists(path):
            os.remove(path)

    def make_file(self, t):
        """Create one randomly named test file in the working directory.

        Args:
            t: File type code, one of the keys of ``type_map``.

        Returns:
            The name of the file that was written.

        Raises:
            ZipTestError: If ``t`` is not a known type code.
        """
        # Validate first so an invalid type does not leave an empty file.
        if t not in self.type_map:
            raise ZipTestError('Invalid Type')
        f_name = self.rand_name()
        # Binary mode: the payload is raw bytes for every file type.
        with open(f_name, 'wb') as f:
            if t == 'r':
                f.write(os.urandom(self.FILE_SIZE))
            elif t == 'h':
                f.write(binascii.b2a_hex(os.urandom(self.FILE_SIZE)))
            else:
                ops = ['dog', 'cat', 'rat']
                ops2 = ['meat', 'wood', 'fish']
                # random.choice picks uniformly; ``int(random()*10) % 3``
                # favoured the first word (4 of the 10 outcomes).
                sentences = [
                    "The {0} eats {1}\n".format(random.choice(ops),
                                                random.choice(ops2))
                    for _ in range(self.SENTENCE_COUNT)
                ]
                f.write("".join(sentences).encode('ascii'))
        return f_name

    # create a ~100MB zip file to test extraction on.
    def create_zip_test(self, t):
        """Create the test files of type ``t`` and pack them into a zip.

        Sets ``self.file_names`` (the source files), ``self.output_names``
        (reset to empty) and ``self.zip_name`` (the archive).
        """
        self.file_names = []
        self.output_names = []
        # Append one by one so clean_up can remove the files created so far
        # if a later one fails.
        for _ in range(self.FILE_COUNT):
            self.file_names.append(self.make_file(t))
        self.zip_name = self.rand_name()
        output = zipfile.ZipFile(self.zip_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            for f in self.file_names:
                output.write(f)
        finally:
            output.close()

    def clean_up(self, rem_zip=False):
        """Delete the source files and extracted files tracked so far.

        Args:
            rem_zip: When true, also delete the zip archive itself.
        """
        for f in self.file_names + self.output_names:
            self._remove_if_exists(f)
        self.file_names = []
        self.output_names = []
        if rem_zip:
            if getattr(self, 'zip_name', False):
                self._remove_if_exists(self.zip_name)
                self.zip_name = False

    def display_res(self, res, t):
        """Print the timing of every extractor for file type ``t``.

        Args:
            res: Mapping of extractor name (see ``procs``) to milliseconds.
            t: File type code used to look up the heading in ``type_map``.
        """
        print("\n{0} results:\n".format(self.type_map[t]))
        for p in self.procs:
            print("\n{0} = {1} milliseconds".format(p, str(res[p])))

    def test(self, t):
        """Build the archive for type ``t``, time all extractors, report."""
        try:
            self.create_zip_test(t)
            res = self.unzip()
            self.display_res(res, t)
        finally:
            # Remove every artifact even when an extractor fails.
            self.clean_up(rem_zip=True)

    def unzip(self):
        """Time every extractor in ``procs`` on the current archive.

        Returns:
            A dict mapping extractor name to elapsed milliseconds.
        """
        res = dict()
        for p in self.procs:
            # Start each run without the files the previous one extracted
            # (the first call removes the original source files).
            self.clean_up()
            res[p] = getattr(self, "unzip_with_{0}".format(p))()
        return res

    def unzip_with_zipfile(self):
        """Time extraction with the standard-library ``zipfile`` module."""
        return self.unzip_with_python(zipfile)

    def unzip_with_czipfile(self):
        """Time extraction with the third-party ``czipfile`` module."""
        return self.unzip_with_python(czipfile)

    def unzip_with_python(self, mod):
        """Time extraction through a module exposing ``ZipFile``.

        Args:
            mod: A module with a ``zipfile``-compatible ``ZipFile`` class.

        Returns:
            Elapsed milliseconds for extracting every member into ``./``.
            Opening the archive is not part of the measured interval.
        """
        with open(self.zip_name, 'rb') as f:
            zf = mod.ZipFile(f)
            try:
                start = datetime.now()
                for name in zf.namelist():
                    zf.extract(name, './')
                    self.output_names.append(name)
                end = datetime.now()
            finally:
                zf.close()
        return self._elapsed_ms(start, end)

    def unzip_with_os(self):
        """Time extraction with the external ``unzip`` command.

        Returns:
            Elapsed milliseconds of the ``unzip -qq`` run.

        Raises:
            ZipTestError: If ``unzip`` exits with a non-zero status, since
                the measured time would not describe a full extraction.
        """
        # Local import: keeps this block independent of the file's imports.
        import subprocess

        # Record the member names first so clean_up can delete the output.
        # This happens before the timer starts, so only the extraction is
        # measured, consistent with unzip_with_python.
        with open(self.zip_name, 'rb') as f:
            zf = zipfile.ZipFile(f)
            try:
                self.output_names.extend(zf.namelist())
            finally:
                zf.close()
        start = datetime.now()
        # Argument list instead of a shell string: no extra shell process
        # inside the timed region and no quoting concerns.
        status = subprocess.call(['unzip', '-qq', self.zip_name])
        end = datetime.now()
        if status != 0:
            raise ZipTestError(
                'unzip exited with status {0}'.format(status))
        return self._elapsed_ms(start, end)
if __name__ == '__main__':
    # Constructing ZipTest runs the benchmark for every file type and
    # prints the results; the instance itself is not needed afterwards.
    ZipTest()
答案 0 :(得分:1)
如上所述,zipfile中用Python实现的只是解密,而不是解压缩。所以在解压缩这一步,zipfile和另外两个一样使用的是C实现。
答案 1 :(得分:1)
即使在算法相同的前提下C通常比解释型语言更快,不同的缓冲策略也会造成差异。这里有一些证据:
我对您的脚本进行了一些更改。差异在下面。
我把秒表的启动移到了紧靠os.system调用之前。这一改动带来的差别并不明显,因为从中央目录读取条目很快。于是我保存了zip文件,并在Python之外用shell内置的time命令来测量解压缩时间。结果表明,启动新进程的开销并不重要。
更有趣的变化是添加了libarchive。我得到的结果是这样的(毫秒):
Random Hex Sentences
zipfile 368 1909 604
czipfile 241 1600 2313
os 707 2225 784
shell-measured 797 2272 737
libarchive 248 1513 451
EXTRACTION METHOD
请注意,每次运行的结果会有几毫秒的波动。shell测量的是 real、user 和 sys 时间(参见What do 'real', 'user' and 'sys' mean in the output of time(1)?)。上表中列出的是 real 时间,以便与其他测量值保持一致。
使用 strace -c -w 可以从系统调用的角度更好地分析解压缩过程。它显示 Hex 场景下 read 调用的数量出现峰值:
Random Hex Sentences
read 805 14597 12816
write 2600 3200 1600
SYSTEM CALLS ISSUED BY unzip
下面是diff(它假定原始脚本名为ziptest.py,并且位于您运行 patch < _diff_ 的同一目录中;请参阅patch、diff)
--- ziptest.py.orig 2017-05-25 10:36:03.106994889 +0200
+++ ziptest.py 2017-05-25 11:30:42.032598259 +0200
@@ -2,6 +2,7 @@
import zipfile
import czipfile
import os, binascii, random
+import libarchive.public
class ZipTestError(Exception):
pass
@@ -10,7 +11,7 @@
class ZipTest:
- procs = ['zipfile', 'czipfile', 'os']
+ procs = ['zipfile', 'czipfile', 'os', 'libarchive']
type_map = {'r':'Random', 'h':'Random Hex', 's':'Sentences'}
# three types. t=='r' is random noise files directly out of urandom. t=='h' is urandom noise converted to ascii characters. t=='s' are randomly constructed sentences with line breaks.
@@ -119,10 +120,10 @@
def unzip_with_os(self):
f = open(self.zip_name)
- start = datetime.now()
zf = zipfile.ZipFile(f)
for name in zf.namelist():
self.output_names.append(name)
+ start = datetime.now()
os.system("unzip -qq {0}".format(f.name))
end = datetime.now()
total = end-start
@@ -130,7 +131,15 @@
ms += (total.seconds) * 1000000
return ms /1000
-
+ def unzip_with_libarchive(self):
+ start = datetime.now()
+ for entry in libarchive.public.file_pour(self.zip_name):
+ self.output_names.append(str(entry))
+ end = datetime.now()
+ total = end-start
+ ms = total.microseconds
+ ms += (total.seconds) * 1000000
+ return ms /1000