我将文本文件分成以下格式的字符串列表:
['DATE', 'NAME', 'RT', '1A', '541', '09947', '199407', '552', '09949', 'BOON', '101C', 'SMITH', '00321', '1553678', '1851243', '561', '559', '004789', '1749201', 'ANDERSON']
我想用满足 item[0:-1].isdigit() 且 item[-1].isalpha() 的项作为键来创建一个 dict,因此在上面的示例中,键是 1A 和 101C。然后,我只想添加满足 item.isdigit() 且 int(item) > 100000 的项:通过 for 循环(或者 while 循环)把符合条件的项收集到一个新列表中,直到循环遇到下一个键为止。
结果将是dct = {'1A': ['199407'], '101C':['1553678','1851243','1749201']}
我目前遇到了索引错误(IndexError),尽管我设置了一个 while 循环,想让迭代次数达到键列表的长度时停止。在出现此错误之前,我用另一种方式索引这些值,得到的是一个空字典。我估计即使修复了索引错误,得到的仍会是一个空的 dict。
这是我的代码:
# Collect the dictionary keys: tokens in "1A" format, i.e. every character
# but the last is a digit and the last character is a letter.
# Duplicates are skipped (`line not in keys`) so each key is listed once
# and building the dict later cannot hit the same key twice.
# A list is used so it can be appended to and indexed by position.
# NOTE(review): `lines` and `keys` are created outside this excerpt.
for line in lines:
if line[0:-1].isdigit() and line[-1].isalpha() and line not in keys:
keys.append(line)
print str(keys) + " " + str(len(keys))
# Intended behavior: build a list of values for each item in keys. Find the
# current key in the text and, for each following item, check whether the
# string converted to a number is > 100000. If it is, the value is appended
# to valLst. When the next key is encountered the nested loop breaks and
# valLst is stored under the current key. The primary loop then moves to
# the next key, and the nested loop should only consider items between the
# current key and the next one.
#
# NOTE(review): indentation was lost in this listing, so the exact nesting
# below is an assumption; the notes describe what the statements do as
# written.
passes = 0
# NOTE(review): this guard is only re-evaluated after the whole
# `for key in keys` loop has finished, so it presumably cannot stop the
# IndexError raised inside that loop.
while passes <=len(keys): # exit loop before index error
for key in keys:
passes += 1
curKey = keys.index(key) # current primary iterable position
nextKey = curKey + 1 # next primary iterable position
print "Passes: " + str(passes)
valLst = [] # empty list for dct values--resets after nested loop break
for line in lines: #iterate through text
# NOTE(review): for the last key, nextKey == len(keys), so keys[nextKey]
# raises "IndexError: list index out of range".
if line == keys[nextKey]: # the next key value is encountered in text
break
# NOTE(review): the target here is the name `dict`; unless it was rebound
# to a dictionary earlier, this is the builtin type and cannot be assigned
# into. The expected result in the question calls the dictionary `dct`.
dict[key] = valLst # valList added to current dict key
# lines.index(line) returns the position of the FIRST occurrence of `line`.
curLine = lines.index(line) # start at current key value found in text
# NOTE(review): curLine is an integer position while key is an element of
# keys (presumably a string), so this comparison looks like it can never be
# true, which would leave valLst empty.
if curLine == key: # find current key in text
nextLine = curLine + 1 # get index of next value after current key in text
val = lines[nextLine] # next text value
if val.isdigit(): #append value to valLst if it is > 100000
num = int(val)
if num > 100000:
# NOTE(review): appends the int, whereas the expected result shows strings.
valLst.append(num)
这是我当前的错误:
Traceback (most recent call last):
File "C:\Python27\Lib\site-packages\pythonwin\pywin\framework\scriptutils.py", line 323, in RunScript
debugger.run(codeObject, __main__.__dict__, start_stepping=0)
File "C:\Python27\Lib\site-packages\pythonwin\pywin\debugger\__init__.py", line 60, in run
_GetCurrentDebugger().run(cmd, globals,locals, start_stepping)
File "C:\Python27\Lib\site-packages\pythonwin\pywin\debugger\debugger.py", line 654, in run
exec cmd in globals, locals
File "C:\Users\user\Desktop\Scripts\PDF_Extractor.py", line 1, in <module>
from cStringIO import StringIO
IndexError: list index out of range
我一直在研究列表推导式(list comprehension),但还没有掌握到能在这种情况下运用的程度。我上面的代码方向是否正确?或者是否可以采用类似下面的列表推导式方法:
# NOTE(review): this line is not valid Python as written -- the trailing
# `valLst.append(line)` cannot follow the comprehension's condition, and
# `key` is not bound by either `for` clause (they bind `keys` and `line`).
# `int(line.isdigit())` converts a bool, so it is at most 1 and can never
# exceed 100000.
valLst = {key for keys in lines for line in line if line == key and int(line.isdigit()) > 100000 valLst.append(line)}
答案 0 :(得分:0)
from collections import OrderedDict


def group_values_by_key(tokens, threshold=100000):
    """Group large numeric tokens under the key token that precedes them.

    A key is a token made of digits followed by exactly one trailing
    letter (e.g. ``'1A'``, ``'101C'``).  Every later token that parses as
    an integer strictly greater than *threshold* is stored, unchanged as a
    string, under the most recently seen key.

    Args:
        tokens: iterable of strings, in the order they appear in the file.
        threshold: a numeric token is kept only if it is greater than this.

    Returns:
        OrderedDict mapping each key (in first-seen order) to the list of
        value strings collected for it.
    """
    grouped = OrderedDict()
    current = None  # key whose section we are in; None before the first key
    for token in tokens:
        # ''[:-1].isdigit() is False, so empty tokens never reach token[-1].
        if token[:-1].isdigit() and token[-1].isalpha():
            current = token
            # setdefault keeps values already collected if a key repeats.
            grouped.setdefault(current, [])
            continue
        if current is None:
            # Numbers ahead of the first key have no section to join.
            continue
        try:
            number = int(token)
        except ValueError:
            continue  # plain text such as names
        if number > threshold:
            grouped[current].append(token)
    return grouped


# Sample token list from the question (the name `keys` is kept as-is even
# though it holds every token, not only the dictionary keys).
keys = ['DATE', 'NAME', 'RT', '1A', '541', '09947', '199407', '552', '09949', 'BOON',
        '101C', 'SMITH', '00321', '1553678', '1851243', '561', '559', '004789', '1749201', 'ANDERSON']

valList = group_values_by_key(keys)
# Single-argument print(...) gives the same output on Python 2 and 3.
print(valList)
输出(来自另一组更大的输入数据,而非上面的示例列表):
OrderedDict([('1Y', ['15538870', '15922112', '16037395', '16069918', '16116102', '16292996', '16658378', '16700710', '16783588', '16832641', '16944735', '16994444', '313132', '12722185', '11415965', '10966593', '9983979', '8573715', '11733178', '552204', '3150537', '552422', '8013132', '9298415', '8742458', '8626402', '4708497', '11687768', '12192686', '734061', '734171', '9896029', '8636757', '2662814', '10407886', '11730755', '4504371', '9187313', '2362896', '7891338', '3519990', '12293652', '9226220', '5984854', '3295145', '1068579', '2031247', '11242586', '8408050', '8440673', '2752194', '5843333', '1740045', '2584772']), ('2A', ['16174735', '16330036', '16334662', '16345573', '16350100', '16376985', '16397823', '16411821', '16435182', '16443451', '16449626', '16574945', '16590154', '16597759', '16615837', '16649016', '16756921', '16762759', '16795828', '16879043', '16887968', '16900090', '16900428', '16902522', '16910127']), ('3A', ['16320336', '16328934', '16331684', '16346347', '16360892', '16370045', '16407413', '16408287', '16444990', '16446211', '16453706', '16467695', '16468032', '11697249', '11843287', '1339389', '2435865', '10001948', '4760965', '2480063', '13588296', '1813233', '11741885', '8972714', '9688478', '16070245']), ('3Y', ['13226120', '13232404', '13233834', '13235601', '13238679', '13241985', '13247504', '13249817', '13262823', '13268442', '13269981', '13270318', '13272413', '13282003', '13284535', '13288943', '13294453'])])
或者逐项打印字典的条目,以确认得到了预期的键和值:
for d in valList.items():
print d
OrderedDict([
('1Y', ['15538870', '15922112', '16037395', '16069918', '16116102', '16292996', '16658378', '16700710', '16783588', '16832641', '16944735', '16994444', '313132', '12722185', '11415965', '10966593', '9983979', '8573715', '11733178', '552204', '3150537', '552422', '8013132', '9298415', '8742458', '8626402', '4708497', '11687768', '12192686', '734061', '734171', '9896029', '8636757', '2662814', '10407886', '11730755', '4504371', '9187313', '2362896', '7891338', '3519990', '12293652', '9226220', '5984854', '3295145', '1068579', '2031247', '11242586', '8408050', '8440673', '2752194', '5843333', '1740045', '2584772']), ('2A', ['16174735', '16330036', '16334662', '16345573', '16350100', '16376985', '16397823', '16411821', '16435182', '16443451', '16449626', '16574945', '16590154', '16597759', '16615837', '16649016', '16756921', '16762759', '16795828', '16879043', '16887968', '16900090', '16900428', '16902522', '16910127']), ('3A', ['16320336', '16328934', '16331684', '16346347', '16360892', '16370045', '16407413', '16408287', '16444990', '16446211', '16453706', '16467695', '16468032', '11697249', '11843287', '1339389', '2435865', '10001948', '4760965', '2480063', '13588296', '1813233', '11741885', '8972714', '9688478', '16070245']), ('3Y', ['13226120', '13232404', '13233834', '13235601', '13238679', '13241985', '13247504', '13249817', '13262823', '13268442', '13269981', '13270318', '13272413', '13282003', '13284535', '13288943', '13294453'])])
('1Y', ['15538870', '15922112', '16037395', '16069918', '16116102', '16292996', '16658378', '16700710', '16783588', '16832641', '16944735', '16994444', '313132', '12722185', '11415965', '10966593', '9983979', '8573715', '11733178', '552204', '3150537', '552422', '8013132', '9298415', '8742458', '8626402', '4708497', '11687768', '12192686', '734061', '734171', '9896029', '8636757', '2662814', '10407886', '11730755', '4504371', '9187313', '2362896', '7891338', '3519990', '12293652', '9226220', '5984854', '3295145', '1068579', '2031247', '11242586', '8408050', '8440673', '2752194', '5843333', '1740045', '2584772'])
('2A', ['16174735', '16330036', '16334662', '16345573', '16350100', '16376985', '16397823', '16411821', '16435182', '16443451', '16449626', '16574945', '16590154', '16597759', '16615837', '16649016', '16756921', '16762759', '16795828', '16879043', '16887968', '16900090', '16900428', '16902522', '16910127'])
('3A', ['16320336', '16328934', '16331684', '16346347', '16360892', '16370045', '16407413', '16408287', '16444990', '16446211', '16453706', '16467695', '16468032', '11697249', '11843287', '1339389', '2435865', '10001948', '4760965', '2480063', '13588296', '1813233', '11741885', '8972714', '9688478', '16070245'])
('3Y', ['13226120', '13232404', '13233834', '13235601', '13238679', '13241985', '13247504', '13249817', '13262823', '13268442', '13269981', '13270318', '13272413', '13282003', '13284535', '13288943', '13294453'])