我写了一个 Python 脚本,用来索引某个文件夹下 XML 文件中的数据,但只在索引其中一个文件夹时出现以下错误:xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 5933, column 23(格式不正确(无效令牌):第 5933 行,第 23 列)。我的 Python 代码如下:
import os
from xml.etree import ElementTree
from elasticsearch import Elasticsearch
# Module-wide Elasticsearch client, configured for a node at localhost:9200.
es = Elasticsearch(
    [
        {'host': 'localhost', 'port': 9200},
    ]
)
def _child_text(element, tag):
    """Return the text of the first ``tag`` child of ``element``, or None if absent."""
    child = element.find(tag)
    return None if child is None else child.text


def start(path):
    """Parse one XML file and index every ``DOC`` element it contains.

    For each ``DOC`` element a dict is built from the text of its ``TITLE``
    and ``TEXT`` children plus the text of the first ``DATE`` child whose
    ``calender`` attribute equals ``"Western"``, and that dict is passed to
    ``insert_to_elastic``. A missing ``TITLE``/``TEXT`` child yields ``None``
    for that field; when no matching ``DATE`` exists the date is ``''``.

    Args:
        path: Path of the XML file to parse.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
            The whole file is parsed before anything is indexed, so nothing
            from a malformed file reaches ``insert_to_elastic``.
    """
    tree = ElementTree.parse(path)
    root = tree.getroot()
    print(root)
    for doc in tree.findall('.//DOC'):
        # The attribute name is spelled 'calender' on purpose: it is the key
        # this code has always matched (presumably the corpus spelling --
        # verify against the data). .get() tolerates DATE elements that do
        # not carry the attribute at all.
        date = next(
            (d.text for d in doc.findall('DATE')
             if d.get('calender') == 'Western'),
            '',
        )
        # Use a separate name for the outgoing record instead of rebinding
        # the loop variable that still refers to the XML element.
        record = {
            'title': _child_text(doc, 'TITLE'),
            'text': _child_text(doc, 'TEXT'),
            'date': date,
        }
        insert_to_elastic(record)
def insert_to_elastic(doc):
    """Index ``doc`` into the ``hamshahri`` index through the module-level client.

    Args:
        doc: Request body handed to ``es.index`` unchanged.
    """
    # NOTE(review): ``doc_type`` is tied to older Elasticsearch mapping types;
    # confirm it is still accepted by the client version in use.
    request = {
        'index': 'hamshahri',
        'doc_type': 'document',
        'body': doc,
    }
    es.index(**request)
def xml_files(base_dir='/home/course/web/Hamshahri/',
              folders=('2002', '2003', '2004', '2005')):
    """Collect the paths of the ``.xml`` files located directly in each folder.

    Args:
        base_dir: Directory that contains the folders to scan. Defaults to the
            location this script has always used.
        folders: Names of the sub-folders of ``base_dir`` to scan, in order.
            Defaults to the four year folders this script has always used.

    Returns:
        A list of paths of the form ``base_dir/folder/filename`` for every
        entry whose name ends with ``.xml``. Paths are grouped by folder in
        the order given and sorted by file name within each folder.

    Raises:
        OSError: If one of the folders cannot be listed (propagated from
            ``os.listdir``).
    """
    xml_list = []
    for folder in folders:
        folder_path = os.path.join(base_dir, folder)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # indexing order reproducible from run to run.
        xml_list.extend(
            os.path.join(folder_path, filename)
            for filename in sorted(os.listdir(folder_path))
            if filename.endswith('.xml')
        )
    return xml_list
# Index every discovered XML file. A file that is not well-formed XML is
# reported and skipped, so one bad file no longer aborts the remaining ones.
xmls = xml_files()
for path in xmls:
    print(path)
    try:
        start(path)
    except ElementTree.ParseError as err:
        # The error text carries the line and column of the offending token,
        # which is what is needed to locate and repair the source file.
        print('skipping {}: {}'.format(path, err))