我有两个轻量级的XML解析器(见下文)。第一个(1)效率更高,但其结果与pandas的交互方式很奇怪。第二个(2)较慢,但不会导致异常错误。问题是,我找不出两者的输出实际上有什么不同。
1)
# Parser 1: split every "<Record .../>" line on single quotes, so the pieces
# alternate between "name=" fragments (even positions) and attribute values
# (odd positions).
# NOTE(review): `inputfile`, `id17s` and `pandas` are not defined in this
# snippet; they are assumed to be set up earlier in the file.
keep = []
with open(inputfile, 'r') as handle:  # ensure the file is closed when done
    for a in handle:
        if not a.startswith("<Record"):
            continue
        line = a.strip().split("'")
        line[0] = line[0].replace("<Record ", '')
        line.pop()  # deletes the final "/>"
        store = {}
        for x in range(0, len(line), 2):
            if not line[x]:
                continue
            # Even pieces look like " name="; reduce them to the bare name.
            key = line[x].strip().rstrip("=")
            # The value is kept exactly as it appears between the quotes.
            value = line[x + 1]
            # Debug output retained from the original script.
            print(key)
            print(value)
            store[key] = value
            if key == "id17":
                id17s.append(value)
        keep.append(store)
# NOTE(review): the index only lines up with the rows if every record carries
# exactly one id17 attribute -- confirm against the input data.
doc = pandas.DataFrame(keep, index=id17s)  # contains all XML records.
2)
# Parser 2: split every "<Record .../>" line on "' " (closing quote followed
# by a space), so each piece holds one name='value fragment.
# NOTE(review): `inputfile`, `id17s`, `sys` and `pandas` are not defined in
# this snippet; they are assumed to be set up earlier in the file.
keep = []
with open(inputfile, 'r') as handle:  # ensure the file is closed when done
    for a in handle:
        if not a.startswith("<Record"):
            continue
        line = a.strip().split("' ")
        line[0] = line[0].replace("<Record ", '')
        # Drops the last piece, presumably the trailing "/>".
        # NOTE(review): if a line ends in "'/>" with no space before the
        # slash, the last piece still contains the final attribute and it is
        # discarded here as well.
        line.pop()
        store = {}
        for piece in line:
            if not piece:
                continue
            # Remove every single quote, then trim surrounding whitespace.
            cleaned = piece.replace("'", '').strip()
            try:
                trait, evalx = cleaned.split('=', 1)
            except ValueError:
                # No '=' in the fragment: show the offending record and stop.
                print(line)
                print(cleaned.split('='))
                sys.exit()
            store[trait] = evalx
            if trait == "id17":
                id17s.append(evalx)
        keep.append(store)
# NOTE(review): the index only lines up with the rows if every record carries
# exactly one id17 attribute -- confirm against the input data.
doc = pandas.DataFrame(keep, index=id17s)  # contains all XML records.
两个解析器之后都接着执行以下代码:
# Attempt a numeric conversion column by column. With errors='ignore',
# pandas.to_numeric hands back the input unchanged when it cannot be parsed.
doc_df = doc.apply(pandas.to_numeric, errors='ignore')
# Column names (header) as a plain list.
header = doc_df.columns.tolist()