我有一个程序,该程序需要452,793个xml文件,将其转换为pandas数据帧,然后将其转换为.csv文件...应该是这样。它经历了从特定标签中提取信息并将其插入数据框中特定位置的过程。但是在452,793的文件6,274中,它抛出此错误“ ParseError:格式不正确(无效令牌):第47行,第27列” error message picture.
我已经尝试添加 encoding =“ utf-8-sig” 和 encoding =“ utf-8-sig” ,但是没有运气。
这是我的代码。让我知道是否需要我解释任何一个问题。存在缩进错误。如果没有正确缩进,那是由于导入我的代码时堆栈溢出格式化。
import xml.etree.ElementTree as ET
import pandas as pd
import glob as glob
from urllib.request import urlopen as uReq
import urllib.request as ur
import shutil
import zipfile
pd.set_option('display.max_columns', 500)
run = 0
globfiles = glob.glob("C:/Users/bbrown/Projects/Database/NSF/Trash/*.xml")
glob.glob("C:/Users/bbrown/Projects/Database/NSF/Trash/*.xml")
for file in globfiles: #The top portion
run += 1
print('globbing ',run, file)
parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse(file, parser)
root = tree.getroot()
x = []
y = []
for child in root:
for child2 in child:
x.append(child2.tag)
y.append(child2.text)
for i in range(0, len(x)):
if x[i] == 'AwardTitle':
data = [y[i]]
df1 = pd.DataFrame(data, columns = ['AwardTitle'])
if x[i] == 'AwardEffectiveDate':
data = [y[i]]
df2 = pd.DataFrame(data, columns = ['AwardEffectiveDate'])
if x[i] == 'AwardExpirationDate':
data = [y[i]]
df3 = pd.DataFrame(data, columns = ['AwardExpirationDate'])
if x[i] == 'AwardTotalIntnAmount':
data = [y[i]]
df4 = pd.DataFrame(data, columns = ['AwardTotalIntnAmount'])
if x[i] == 'AwardAmount':
data = [y[i]]
df5 = pd.DataFrame(data, columns = ['AwardAmount'])
if x[i] == 'AbstractNarration':
data = [y[i]]
df6 = pd.DataFrame(data, columns = ['AbstractNarration'])
if x[i] == 'MinAmdLetterDate':
data = [y[i]]
df7 = pd.DataFrame(data, columns = ['MinAmdLetterDate'])
if x[i] == 'MaxAmdLetterDate': #index 10
data = [y[i]]
df8 = pd.DataFrame(data, columns = ['MaxAmdLetterDate'])
if x[i] == 'AwardID': #index 12
data = [y[i]]
df9 = pd.DataFrame(data, columns = ['AwardID'])
a = []
b = []
#0,1,6,7,8,9,10...22
for child in root:
for child2 in child:
for child3 in child2:
a.append(child3.tag)
b.append(child3.text)
for i in range(0, len(a)):
if a[i] == 'Value':
data = [b[i]]
df10 = pd.DataFrame(data, columns = ['Value'])
if a[i] == 'Code' and i == 1:
data = [b[i]]
df11 = pd.DataFrame(data, columns = ['Organization Code'])
if a[i] == 'SignBlockName':
data = [b[i]]
df12 = pd.DataFrame(data, columns = ['SignBlockName'])
if a[i] == 'FirstName':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'FirstName':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df13 = pd.DataFrame(data, columns = ['FirstName'])
if a[i] == 'LastName':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'LastName':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df14 = pd.DataFrame(data, columns = ['LastName'])
if a[i] == 'EmailAddress':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'EmailAddress':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df15 = pd.DataFrame(data, columns = ['EmailAddress'])
if a[i] == 'StartDate':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'StartDate':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df16 = pd.DataFrame(data, columns = ['StartDate'])
if a[i] == 'EndDate':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'EndDate':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df17 = pd.DataFrame(data, columns = ['EndDate'])
if a[i] == 'RoleCode':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'RoleCode':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df18 = pd.DataFrame(data, columns = ['RoleCode'])
if a[i] == 'Name':
data = [b[i]]
df19 = pd.DataFrame(data, columns = ['Name'])
if a[i] == 'CityName':
data = [b[i]]
df20 = pd.DataFrame(data, columns = ['CityName'])
if a[i] == 'ZipCode':
data = [b[i]]
df21 = pd.DataFrame(data, columns = ['ZipCode'])
if a[i] == 'PhoneNumber':
data = [b[i]]
df22 = pd.DataFrame(data, columns = ['PhoneNumber'])
if a[i] == 'StreetAddress':
data = [b[i]]
df23 = pd.DataFrame(data, columns = ['StreetAddress'])
if a[i] == 'StateName':
data = [b[i]]
df24 = pd.DataFrame(data, columns = ['StateName'])
if a[i] == 'CountryName':
data = [b[i]]
df25 = pd.DataFrame(data, columns = ['CountryName'])
if a[i] == 'StateCode':
data = [b[i]]
df26 = pd.DataFrame(data, columns = ['StateCode'])
if a[i] == 'Code' and i != 1:
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'Code':
iterlist.append(x)
x += 1
for eachnum in iterlist:
if eachnum != 1:
value = b[eachnum],b[eachnum+1]
namelist.append(value)
data = [str(namelist)]
df27 = pd.DataFrame(data, columns = ['Codes'])
aaa=[]
bbb=[]
for child in root:
for child2 in child:
aa.append(child2.tag)
bb.append(child2.text)
for child3 in child2:
a.append(child3.tag)
b.append(child3.text)
for child4 in child3:
aaa.append(child4.tag)
bbb.append(child4.text)
dfvar = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18,df19, df20, df21, df22, df23, df24, df25, df26, df27], axis=1)
df = df.append(dfvar)
**您可以从此处下载xml文件的示例*-https://www.nsf.gov/awardsearch/download?DownloadFileName=2019&All=true
我只需要它来停止抛出错误。通过修复引起问题的原因或忽略该错误。否则,这是金色的!