我正在解析一个只有3列的XML文件。第三列(叙述字段,即 CASE_NARRATIVE / narrative)的内容偶尔(大约每100条记录中有1条)会超过4000个字符。当长度超过4000个字符时,我想在字典中创建附加字段:每多出4000个字符,就再新增一个字段来存放这部分内容。
def getvalueofnode(node):
    """Return the cleaned-up text of an XML element.

    The element's text is passed through BeautifulSoup to drop any markup
    embedded in it, and the result is then normalised:

    * newlines and carriage returns become spaces,
    * runs of spaces collapse to a single space,
    * ``|`` becomes ``-``,
    * runs of underscores collapse to a single ``_``.

    Args:
        node: An object exposing a ``text`` attribute (e.g. an ElementTree
            element), or ``None``.

    Returns:
        ``None`` if ``node`` is ``None``; ``""`` if the element has no
        text; otherwise the cleaned string.
    """
    # Function-scope import so this block does not rely on a file-level
    # ``import re``.
    import re

    if node is None:
        return None
    if node.text is None:
        # Element without text: BeautifulSoup cannot parse ``None``, so
        # report an empty value instead of raising.
        return ""

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    text = BeautifulSoup(node.text, "html.parser").get_text()
    text = text.replace("\n", " ").replace("\r", " ")
    # str.replace() only matches literal text; collapsing runs of a
    # character needs a regular expression.
    text = re.sub(r" +", " ", text)
    text = text.replace("|", "-")
    text = re.sub(r"_+", "_", text)
    return text
# Flatten every child of the XML root into one dict and write the result
# as a tab-separated file.
#
# A narrative longer than NARRATIVE_CHUNK_SIZE characters is split: the
# first chunk stays in 'narrative' and each further chunk goes into its
# own field ('narrative_2', 'narrative_3', ...). Records whose narrative
# fits in one chunk are left unchanged.
NARRATIVE_CHUNK_SIZE = 4000

record = []
for node in parsedXML.getroot():
    narrative = getvalueofnode(node.find('CASE_NARRATIVE'))
    recordlineprocessed = {
        'casekey': getvalueofnode(node.find('CASE_KEY')),
        'description': getvalueofnode(node.find('DESCRIPTION')),
        'narrative': narrative,
    }
    # getvalueofnode() returns None when the element is missing.
    if narrative is not None and len(narrative) > NARRATIVE_CHUNK_SIZE:
        recordlineprocessed['narrative'] = narrative[:NARRATIVE_CHUNK_SIZE]
        overflow_starts = range(
            NARRATIVE_CHUNK_SIZE, len(narrative), NARRATIVE_CHUNK_SIZE
        )
        # Numbering starts at 2 because 'narrative' itself is chunk 1.
        for part, start in enumerate(overflow_starts, start=2):
            recordlineprocessed['narrative_%d' % part] = (
                narrative[start:start + NARRATIVE_CHUNK_SIZE]
            )
    record.append(recordlineprocessed)

# The plain constructor accepts a list of dicts directly; keys absent from
# a record (the overflow fields) become empty cells in the output.
df_xml = pd.DataFrame(record)
df_xml.to_csv(outfile, sep='\t', index=False)