我正在尝试处理一种格式已知的二进制文件。该格式由若干块组成,每个块包含一个头部(header),后跟可变长度的数据。整个结构分为三个层级:组(组也可以作为子组出现在另一个组的数据中)、记录和字段(字段尚未在代码中体现)。
我的问题是:下面 Record 类型的 read 方法在性能分析(profiling)时,运行 870,000 次平均需要 2 秒;而如果我在一个单独的函数中像下面这样调用它:
# Benchmark: parse a record header 900,000 times from the same file.
cdef Record header
cdef unsigned char header_type[4]
cdef FILE *fileobject = fopen('filename', 'rb')
if fileobject == NULL:
    # fopen returns NULL on failure; reading from a NULL stream would crash.
    raise IOError("could not open 'filename' for reading")
try:
    for _ in range(900000):
        # Consume the 4-byte type tag; Record.read() starts right after it.
        fread(&header_type, 4, 1, fileobject)
        header = Record()
        header.read(fileobject)
        # NOTE(review): the pasted snippet lost its indentation; the rewind is
        # assumed to belong inside the loop so each pass re-reads from offset 0.
        fseek(fileobject, 0, SEEK_SET)
finally:
    # Always release the C stream, even if an iteration raises.
    fclose(fileobject)
则只需要大约 250 毫秒就能运行完。
我正在运行的代码是:
# cython: profile=True
import collections

from libc.stdint cimport int32_t, uint16_t, uint32_t
from libc.stdio cimport *
from libc.string cimport memcmp
cdef class Record:
    """Header of one record, parsed from an open C ``FILE*`` stream.

    ``read`` expects the stream to be positioned just after the record's
    4-byte type tag (the callers consume that tag themselves).
    """
    # Fixed-width types match the 4- and 2-byte on-disk fields exactly.
    # ``unsigned long`` is 8 bytes on LP64 platforms, so a 4-byte fread
    # would fill only part of the variable.
    cdef uint32_t flags, id, revision
    cdef uint16_t version, unknown

    def __cinit__(self):
        self.flags = 0
        self.id = 0
        self.revision = 0
        self.version = 0
        self.unknown = 0

    cdef void read(self, FILE* fileobject):
        """Read the header fields, then skip over the record's data.

        The fields are read in file order: data size, flags, id, revision,
        version, unknown. Afterwards the stream is advanced by ``data_size``
        bytes so it points at whatever follows this record.

        This method returns ``void`` and therefore cannot report errors; on a
        short read of the size field it returns without seeking.
        """
        cdef uint32_t data_size = 0
        # reading it all in a chunk gave the same speed
        # also tried skipping over the header, same speed
        if fread(&data_size, 1, 4, fileobject) != 4:
            # Truncated input: do not seek by a partially-read size.
            return
        fread(&self.flags, 1, 4, fileobject)
        fread(&self.id, 1, 4, fileobject)
        fread(&self.revision, 1, 4, fileobject)
        fread(&self.version, 1, 2, fileobject)
        fread(&self.unknown, 1, 2, fileobject)
        # here would be a similar loop as the one in the GRUP
        # read method to parse the fields.
        fseek(fileobject, data_size, SEEK_CUR)
cdef class GRUP:
    """A group: a fixed header followed by nested groups and records.

    ``data`` holds the parsed children (``GRUP`` or ``Record`` instances) in
    the order they appear in the stream.
    """
    # Fixed-width types match the on-disk field sizes. A 4-byte read into a
    # wider ``signed long`` would not sign-extend negative values.
    cdef int32_t group_type
    cdef uint16_t stamp, unknown1, version, unknown2
    cdef char label[4]
    cdef list data

    def __init__(self):
        self.label = [b'0', b'0', b'0', b'0']
        self.group_type = 0
        self.stamp = 0
        self.unknown1 = 0
        self.version = 0
        self.unknown2 = 0
        self.data = []

    cdef read(self, FILE* fileobject):
        """Read this group's header and all of its children.

        The stream must be positioned just after the group's 4-byte type tag.
        Children are appended to ``self.data``; a child tagged ``GRUP`` is
        parsed recursively, anything else is parsed as a ``Record``.

        Raises:
            EOFError: if the stream ends before the group's declared size has
                been consumed.
        """
        cdef uint32_t group_size = 0
        cdef long end_data_position = 0
        cdef char record_type[4]
        cdef char* group_tag = b'GRUP'
        cdef GRUP group
        cdef Record record
        # reading it all in a chunk gave the same speed
        # also tried skipping over the header, same speed
        fread(&group_size, 1, 4, fileobject)
        fread(&self.label, 1, 4, fileobject)
        fread(&self.group_type, 1, 4, fileobject)
        fread(&self.stamp, 1, 2, fileobject)
        fread(&self.unknown1, 1, 2, fileobject)
        fread(&self.version, 1, 2, fileobject)
        fread(&self.unknown2, 1, 2, fileobject)
        # group_size is treated as including the 24 header bytes (4-byte tag
        # read by the caller plus the 20 bytes read above), which the stream
        # has already passed.
        end_data_position = ftell(fileobject) + <long>group_size - 24
        while ftell(fileobject) < end_data_position:
            if fread(&record_type, 1, 4, fileobject) != 4:
                # Without this check a truncated file makes no progress and
                # the loop would append children forever.
                raise EOFError("unexpected end of file inside GRUP data")
            # memcmp compares the raw tag bytes without allocating a Python
            # bytes object for every child.
            if memcmp(record_type, group_tag, 4) == 0:
                group = GRUP()
                group.read(fileobject)
                self.data.append(group)
            else:
                record = Record()
                record.read(fileobject)
                self.data.append(record)
cdef class ESP(object):
    """Parsed file: one header ``Record`` plus its top-level groups.

    ``top_groups`` is an ordered mapping; ``header`` stays ``None`` until a
    record is assigned to it.
    """
    cdef object top_groups
    cdef Record header

    def __init__(self):
        # Start empty; the module-level ``read`` fills both attributes.
        self.top_groups = collections.OrderedDict()
        self.header = None
cpdef read(str source):
    """Parse the file at ``source`` and return an ``ESP`` instance.

    The file is read as: a 4-byte type tag, the header ``Record``, then
    repeated (4-byte tag, ``GRUP``) pairs until fewer than 4 bytes remain.
    Each top-level group is stored in ``result.top_groups`` under its decoded
    4-byte label.

    Args:
        source: path of the file to parse; it is encoded as UTF-8 for fopen.

    Returns:
        The populated ``ESP`` object.

    Raises:
        TypeError: if the file cannot be opened.
    """
    cdef unsigned char header_type[4]
    cdef unsigned char group_type[4]
    cdef size_t bytes_read
    # Explicit types so the cdef attributes and methods are reached directly
    # instead of relying on type inference.
    cdef ESP result = ESP()
    cdef GRUP group
    # Keep the encoded path alive in a named variable for the fopen call.
    cdef bytes encoded_source = source.encode('UTF-8')
    cdef FILE *fileobject = fopen(encoded_source, 'rb')
    if fileobject == NULL:
        raise TypeError("source argument is not a valid filename.")
    try:
        fread(&header_type, 1, 4, fileobject)
        result.header = Record()
        result.header.read(fileobject)
        while True:
            bytes_read = fread(&group_type, 1, sizeof(group_type), fileobject)
            if bytes_read < 4:
                break
            group = GRUP()
            group.read(fileobject)
            # label is a char[4] with no NUL terminator; slice it explicitly
            # so the conversion never reads past the array.
            result.top_groups[group.label[:4].decode()] = group
    finally:
        # Close the stream even when parsing raises.
        fclose(fileobject)
    return result
profiling data(性能分析数据)。我的目标是总运行时间低于一秒(其中包括解析字段的时间)。是我做错了什么导致它运行得如此之慢,还是我还能做些什么来改进?
谢谢。