实现以下逻辑的最快方法是什么:
def xor(data, key):
    """XOR every character of ``data`` with the repeating ``key``.

    Args:
        data: string whose items are accepted by ``ord``.
        key: string whose characters are cycled over ``data``.

    Returns:
        A string with the same number of characters as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    key_len = len(key)
    # Collect the pieces and join once instead of growing a string with +=,
    # which may copy the whole buffer on every iteration.
    return "".join(
        chr(ord(ch) ^ ord(key[pos % key_len])) for pos, ch in enumerate(data)
    )
在我的情况下, key 是20字节sha1摘要,而数据是20字节和几(1,2,3)兆字节之间的一些二进制数据
更新
# OK guys. Here is an implementation that is about 3.5 times faster: it splits
# the data and the key into 4-, 2- or 1-byte words (in my case a 4-byte long
# integer most of the time):
def xor(data, key):
    """XOR ``data`` with the repeating ``key`` one machine word at a time.

    The word width (4, 2 or 1 bytes) is chosen from ``len(data) % 4`` so that
    ``data`` splits into whole little-endian unsigned words.

    Args:
        data: byte string to transform.
        key: byte string whose length is a multiple of the chosen word width.

    Returns:
        A byte string of the same length as ``data``.

    Raises:
        struct.error: if ``key`` does not split into whole words.
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    import struct

    remainder = len(data) % 4
    word_size = (4, 1, 2, 1)[remainder]
    word_code = ('L', 'B', 'H', 'B')[remainder]
    # Floor division keeps the counts integral on Python 3 as well.
    key_words = len(key) // word_size
    data_words = len(data) // word_size
    key_fmt = "<%d%s" % (key_words, word_code)
    data_fmt = "<%d%s" % (data_words, word_code)
    key_list = struct.unpack(key_fmt, key)
    data_list = struct.unpack(data_fmt, data)
    result = [
        key_list[pos % key_words] ^ word for pos, word in enumerate(data_list)
    ]
    return struct.pack(data_fmt, *result)
使用大量内存,但就我而言,这并不是什么大问题。
有什么办法可以把速度再提高几倍吗? : - )
最终更新:
好的,好的... numpy完成了这项工作。那真是太快了:
def xor(data, key):
    """XOR ``data`` with the repeating ``key`` using numpy.

    Args:
        data: byte string to transform.
        key: byte string that is repeated/truncated to ``len(data)``.

    Returns:
        A byte string of the same length as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty.
    """
    import numpy

    data_len = len(data)
    # Key multiplication in order to match the data length; the ceiling is
    # computed with integers so no float rounding is involved.
    repeats = -(-data_len // len(key))
    key = (key * repeats)[:data_len]
    if not data_len:
        return b""
    # Select the widest word size (in bytes) that divides the data length.
    for width in (8, 4, 2, 1):
        if data_len % width == 0:
            break
    dt = numpy.dtype({8: '<u8', 4: '<u4', 2: '<u2', 1: 'u1'}[width])
    # frombuffer/tobytes replace the deprecated fromstring/tostring pair.
    return numpy.bitwise_xor(
        numpy.frombuffer(key, dtype=dt),
        numpy.frombuffer(data, dtype=dt),
    ).tobytes()
初始实现需要8分50秒来处理一个千兆字节,第二个 - 大约2分30秒,最后一个只需要...... 0分10秒。
感谢任何贡献想法和代码的人。你们太棒了!
答案 0 :(得分:1)
如果len(data)
很大,您可能会发现xrange
有显着改善。实际上,您可以使用enumerate
完全替换范围函数。改用列表收集结果、而不是不断向字符串追加,也会带来好处。
def xor(data, key):
    """XOR every character of ``data`` with the repeating ``key``.

    Args:
        data: string whose items are accepted by ``ord``.
        key: string whose characters are cycled over ``data``.

    Returns:
        A string with the same number of characters as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    key_len = len(key)
    # Build a list and join once; the closing parenthesis of append() that
    # was missing in the original snippet is restored here.
    buff = [
        chr(ord(val) ^ ord(key[idx % key_len])) for idx, val in enumerate(data)
    ]
    return ''.join(buff)
我还没有计时,但是我认为对于大量数据来说,它会更快一点。请务必对每一处改动都进行测量。
如果分析表明对ord()
的调用实际上需要时间,您可以提前在key
中的所有值上运行它以在循环中保存调用。
您也可以将循环转换为普通的旧列表理解,但它会对可读性产生负面影响。无论如何,试一试,看看它是否更快。
答案 1 :(得分:1)
未经测试
不知道它是否更快
假设len(mystring)是4的倍数
def xor(hash, mystring):
    """XOR ``mystring`` with the repeating digest ``hash``, 32 bits at a time.

    Args:
        hash: byte string whose length is a non-zero multiple of 4
            (e.g. a 20-byte SHA-1 digest).
        mystring: byte string whose length is a multiple of 4.

    Returns:
        A byte string of the same length as ``mystring``.

    Raises:
        ValueError: if either length requirement is not met.
    """
    import struct

    if not hash or len(hash) % 4:
        raise ValueError("hash length must be a non-zero multiple of 4")
    if len(mystring) % 4:
        raise ValueError("mystring length must be a multiple of 4")

    key_words = struct.unpack("<%dL" % (len(hash) // 4), hash)
    data_fmt = "<%dL" % (len(mystring) // 4)
    data_words = struct.unpack(data_fmt, mystring)
    cycle = len(key_words)
    # Cycling by index also covers a trailing, incomplete pass over the key.
    mixed = [
        word ^ key_words[pos % cycle] for pos, word in enumerate(data_words)
    ]
    return struct.pack(data_fmt, *mixed)
答案 2 :(得分:1)
此代码应适用于Python 2.6+,包括Py3k。
from binascii import hexlify as _hexlify
from binascii import unhexlify as _unhexlify
def packl(lnum, padmultiple=0):
    """Pack the non-negative integer ``lnum`` into a big-endian byte string.

    The result is left-padded with zero bytes to a multiple of
    ``padmultiple`` bytes.  A ``padmultiple`` of 0 means no padding at all,
    so packing 0 then yields an empty byte string.

    Raises:
        ValueError: if ``lnum`` is negative.
    """
    if lnum == 0:
        return b'\0' * padmultiple
    if lnum < 0:
        raise ValueError("Can only convert non-negative numbers.")

    # Drop the "0x" prefix and, on Python 2 longs, the trailing "L".
    digits = hex(lnum)[2:].rstrip('L')
    if len(digits) % 2:
        # unhexlify needs an even number of hex digits.
        digits = '0' + digits
    packed = _unhexlify(digits)

    if padmultiple not in (0, 1):
        overhang = len(packed) % padmultiple
        if overhang:
            packed = b'\0' * (padmultiple - overhang) + packed
    return packed
def unpackl(bytestr):
    """Interpret ``bytestr`` as a big-endian unsigned integer.

    Each byte is one base-256 digit; an empty byte string yields 0.
    """
    if len(bytestr) > 0:
        return int(_hexlify(bytestr), 16)
    return 0
def xor(data, key):
    """XOR ``data`` with the repeating ``key`` via one big-integer XOR.

    The key is repeated and truncated to the length of ``data``; both are
    converted with ``unpackl``, XORed, and converted back with ``packl``.
    The result is left-padded with zero bytes back to ``len(data)``.
    """
    data_size = len(data)
    key_size = len(key)
    if key_size < data_size:
        repeats = (data_size + key_size - 1) // key_size
        key = key * repeats
    key = key[:data_size]

    packed = packl(unpackl(data) ^ unpackl(key))
    # packl drops leading zero bytes, so restore them here.
    missing = data_size - len(packed)
    if missing > 0:
        packed = b'\0' * missing + packed
    return packed
这也适用于Python 2.7和3.x.它具有比前一个更简单的优点,同时在大约相同的时间内完成相同的操作:
from binascii import hexlify as _hexlify
from binascii import unhexlify as _unhexlify
def xor(data, key):
    """XOR ``data`` with the repeating ``key`` via a hex round trip.

    Works unchanged on Python 2.6+ and Python 3.3+; no lines need to be
    commented in or out per interpreter version.

    Args:
        data: byte string to transform.
        key: byte string that is repeated/truncated to ``len(data)``.

    Returns:
        A byte string of the same length as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    dlen = len(data)
    if dlen == 0:
        # hexlify(b'') is empty and cannot be parsed by int(..., 16).
        return b''
    klen = len(key)
    if dlen > klen:
        key = key * ((dlen + klen - 1) // klen)
    key = key[:dlen]
    data_int = int(_hexlify(data), 16)
    key_int = int(_hexlify(key), 16)
    # Set a guard bit just above the payload so hex() always starts with
    # "0x80" followed by exactly 2 * dlen digits; leading zero bytes of the
    # XOR result are therefore preserved.
    guarded = (data_int ^ key_int) | (1 << (dlen * 8 + 7))
    # Strip "0x80" in front and the "L" suffix Python 2 appends to longs.
    digits = hex(guarded)[4:].rstrip('L')
    return _unhexlify(digits)
答案 3 :(得分:1)
免责声明:正如其他发帖者所说,这是一种非常糟糕的文件加密方式。This article 演示了如何轻而易举地逆转这种混淆。
首先,一个简单的xor算法:
def xor(a, b, _xor8k=lambda a, b: struct.pack("!1000Q", *map(operator.xor,
                    struct.unpack("!1000Q", a),
                    struct.unpack("!1000Q", b)))
        ):
    """XOR two equally long byte buffers, 8000 bytes at a time.

    Args:
        a: first buffer.
        b: second buffer; ``struct.unpack`` requires each slice of it to
            match the size of the corresponding slice of ``a``.
        _xor8k: helper that XORs two 8000-byte blocks as 1000 unsigned
            64-bit words (bound once as a default argument).

    Returns:
        A byte string of the same length as ``a``.
    """
    if len(a) <= 8000:
        # Short input: as many 64-bit words as fit, then single bytes.
        fmt = "!%iQ%iB" % divmod(len(a), 8)
        return struct.pack(fmt, *map(operator.xor,
                                     struct.unpack(fmt, a),
                                     struct.unpack(fmt, b)))
    a = bytearray(a)
    for i in range(8000, len(a), 8000):
        a[i - 8000:i] = _xor8k(a[i - 8000:i], b[i - 8000:i])
    # After the loop ``i`` is the start of the last block (1..8000 bytes),
    # which the short-input branch handles.
    a[i:] = xor(a[i:], b[i:])
    # bytes() instead of str(): on Python 3 str(bytearray) returns the repr
    # text rather than the raw bytes; on Python 2 bytes is str.
    return bytes(a)
其次是包装xor算法:
def xor_wrap(data, key, _struct8k=struct.Struct("!1000Q")):
    """XOR ``data`` with the repeating ``key``, 8000 bytes at a time.

    Args:
        data: buffer to transform; it is copied into a new ``bytearray``.
        key: byte string cycled over ``data``.
        _struct8k: precompiled struct for one 8000-byte block of 1000
            unsigned 64-bit words (bound once as a default argument).

    Returns:
        A ``bytearray`` of the same length as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty.
    """
    key_len = len(key)
    # pack_into needs a writable buffer on the short-data path as well.
    data = bytearray(data)
    offset = 0
    modulo = 0  # offset used to access the repeated key
    if len(data) >= 8000:
        # This buffer is accessed with whatever offset is required for a
        # given 8k block; the expression creates at most one more copy of
        # the key than is needed.
        keyrpt = key * ((7999 + 2 * key_len) // key_len)
        for offset in range(0, len(data) - 7999, 8000):
            _struct8k.pack_into(data, offset, *map(operator.xor,
                _struct8k.unpack_from(data, offset),
                _struct8k.unpack_from(keyrpt, modulo)))
            modulo = (modulo + 8000) % key_len
        # Step past the last full block that was processed.
        offset += 8000
    else:
        # Simple calculation guaranteed to be enough.
        keyrpt = key * (len(data) // key_len + 1)
    rest = len(data) - offset
    if rest:
        # Remaining tail: 64-bit words first, then single bytes.
        srest = struct.Struct("!%iQ%iB" % divmod(rest, 8))
        srest.pack_into(data, offset, *map(operator.xor,
            srest.unpack_from(data, offset),
            srest.unpack_from(keyrpt, modulo)))
    return data
答案 4 :(得分:0)
这是一个只使用Python内置和标准模块的版本,看起来非常快 - 虽然我还没有将它与你的numpy版本进行比较。它使用了Python Cryptography Toolkit中的一些优化转换函数,如图所示。
# Part of the Python Cryptography Toolkit
# found here:
# http://www.google.com/codesearch/p?hl=en#Y_gnTlD6ECg/trunk/src/gdata/Crypto/Util/number.py&q=lang:python%20%22def%20long_to_bytes%22&sa=N&cd=1&ct=rc
# Improved conversion functions contributed by Barry Warsaw, after
# careful benchmarking
import struct
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string

    Convert a non-negative integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of
    the byte string with binary zeros so that the length is a multiple of
    blocksize.  Zero (and any value that is not positive) is encoded as a
    single zero byte before padding.
    """
    n = int(n)
    pack = struct.pack
    # Collect 32-bit chunks least-significant first and join once, instead
    # of prepending to a growing string on every iteration.
    chunks = []
    while n > 0:
        chunks.append(pack('>I', n & 0xffffffff))
        n >>= 32
    chunks.reverse()
    # Strip off leading zeros; an all-zero/empty result becomes one zero byte.
    s = b''.join(chunks).lstrip(b'\000')
    if not s:
        s = b'\000'
    # Add back pad bytes up to the next multiple of blocksize.
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long

    Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes().  An empty byte
    string yields 0.
    """
    length = len(s)
    if length % 4:
        # Left-pad with zero bytes to a whole number of 32-bit words.
        extra = 4 - length % 4
        s = b'\000' * extra + s
        length += extra
    acc = 0
    # Unpack every 32-bit word in one call, then fold them most
    # significant first.
    for word in struct.unpack('>%dI' % (length // 4), s):
        acc = (acc << 32) + word
    return acc
# reference implementation of the logic in the SO question
def xor_orig(data, key):
    """XOR every character of ``data`` with the repeating ``key``.

    Character-at-a-time baseline used to check the faster version.

    Raises:
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    key_len = len(key)
    # Join once instead of growing a string with += on every character.
    return "".join(
        chr(ord(ch) ^ ord(key[pos % key_len])) for pos, ch in enumerate(data)
    )
# faster pure python version
def xor_new(data, key):
    """XOR ``data`` with the repeating ``key`` via one big-integer XOR.

    Args:
        data: byte string to transform.
        key: byte string that is repeated/truncated to ``len(data)``.

    Returns:
        A byte string of the same length as ``data``.

    Raises:
        ZeroDivisionError: if ``key`` is empty while ``data`` is not.
    """
    data_len = len(data)
    if not data_len:
        # long_to_bytes(0) would return one zero byte for empty input.
        return b''
    # Key multiplication in order to match the data length (integer ceil).
    repeats = -(-data_len // len(key))
    key = (key * repeats)[:data_len]
    # Convert key and data to long integers.
    key_as_long = bytes_to_long(key)
    data_as_long = bytes_to_long(data)
    # XOR the numbers together and convert the result back to a byte string.
    # long_to_bytes strips leading zero bytes; passing data_len as the
    # blocksize pads the result back to the full length of the input.
    return long_to_bytes(data_as_long ^ key_as_long, data_len)
if __name__ == '__main__':
    # Self-test: the fast version must agree with the reference version on
    # random data keyed with the data's own SHA-1 digest.
    import hashlib  # replaces the ``sha`` module, which Python 3 removed
    import random

    TEST_DATA_LEN = 100000
    data = bytes(bytearray(random.randint(0, 255)
                           for _ in range(TEST_DATA_LEN)))
    key = hashlib.sha1(data).digest()
    assert xor_new(data, key) == xor_orig(data, key)
    print('done')
答案 5 :(得分:0)
根据我在第一篇文章中的评论,如果坚持使用 numpy 进行键填充和按位XOR运算,则可以相当快地处理大型文件,如下所示:
(原答案的代码示例在此处缺失)
答案 6 :(得分:-1)
你现有的实现已经接近Python所能达到的速度了。
如果您确实需要更快,请用C实现。