我使用这段代码:
from struct import Struct
import struct
def read_chunk(fmt, fileobj):
chunk_struct = Struct(fmt)
chunk = fileobj.read(chunk_struct.size)
return chunk_struct.unpack(chunk)
def read_record(fileobj):
author_id, len_author_name = read_chunk('ii', f)
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f) # 's' or 'c' ?
record = { 'author_id': author_id,
'author_name': author_name,
'publications': [] }
for pub in range(nu_of_publ):
pub_id, len_pub_title = read_chunk('ii', f)
pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'si', f)
record['publications'].append({
'publication_id': pub_id,
'publication_title': pub_title,
'publication_authors': [] })
for auth in range(num_pub_auth):
len_pub_auth_name = read_chunk('i', f)
pub_auth_name = read_chunk(str(len_pub_auth_name)+'s', f)
record['publications']['publication_authors'].append({'name': pub_auth_name})
year_publ, nu_of_cit = read_chunk('ii', f)
# Finish building your record with the remaining fields...
for cit in range(nu_of_cit):
cit_id, len_cit_title = read_chunk('ii', f)
cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'si', f)
for cit_auth in range(num_cit_auth):
len_cit_auth_name = read_chunk('i', f)
cit_auth_name = read_chunk(str(len_cit_auth_name)+'s', f)
year_cit_publ = read_chunk('i', f)
return record
def parse_file(filename):
records = []
with open(filename, 'rb') as f:
while True:
try:
records.append(read_record(f))
except struct.error:
break
阅读此文件:
https://drive.google.com/open?id=0B3SYAHrxLP69NHlWc25KeXFHNVE
采用以下格式:
在函数read_record中,它读取了正确的变量author_id,len_author_name,author_name,但nu_of_publ及以下变量未被读取正确。
知道什么是错的吗?
当我运行这段代码时:
author_id, len_author_name = read_chunk('LL', f)
author_name, nu_of_publ= read_chunk(str(len_author_name)+'sL', f)
#nu_of_publ = read_chunk('I', f)# 's' or 'c' ?
record = { 'author_id': author_id,
'author_name': author_name,
'publications': [] }
print (record, nu_of_publ)
for pub in range(nu_of_publ):
pub_id, len_pub_title = read_chunk('LL', f)
print (pub_id, len_pub_title)
我取这个结果:
{' author_name':b' Scott Shenker',' author_id':1,' publications':[]} 256 15616 1953384704
但它会打印200而不是256,1而不是15616等。
答案 0 :(得分:0)
此格式不正确:
author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f)
您正在定义N个字符和整数的结构。这些结构是aligned,就像你在c中定义的结构一样:
struct {
char author_name[N];
int nu_of_publ;
};
对齐的作用是:它将每个int的开头放在一个4的倍数位置。这是完成的(在C中),因为CPU已经过优化以访问这些地址。
因此,如果作者姓名的长度为6,则在读取下一个整数之前将跳过接下来的两个字节。
分离结构的一种解决方案:
author_name = read_chunk(str(len_author_name)+'s', f)
nu_of_publ, = read_chunk('i', f)
注意: nu_of_publ(nu_of_publ,
)之后的逗号是解包read_chunk
返回的元组。
另一种解决方案是根据table from spec指定=
开头的结构:
author_name, nu_of_publ = read_chunk('={}si'.format(len_author_name), f)