places = []
persons = []
unknown = []
newlist = []
filename = 'file.html'
tree = etree.parse(filename)
input_file = open(filename, 'rU')
def extract(tree):
<some code>
return places
return persons
return unknown
def change_class():
extract(tree)
for line in input_file:
for x in places:
for z in unknown:
if x+'</dfn>' in line:
newline = line.replace('"person"', '"place"')
newlist.append(newline)
elif z+'</dfn>' in line:
newline = line.replace('"person"','"undefined"')
newlist.append(newline)
else:
newlist.append(line)
break
break
for x in newlist:
print x
我有一个这样的html文件,错误的类值:
<html> <head></head> <body> <p class ='person'><dfn>New-York</dfn> <p class = 'place'><dfn>John Doe</dfn> <p class ='person'><dfn>Paris</dfn> <p class = 'place'><dfn>Jane Doe</dfn> </body> </html>
我的脚本允许我重新打印同一个文件,但它只替换两个列表中第一项(地点和未知)的类值:
<html> <head></head> <body> <p class ='place'><dfn>New-York</dfn> <p class = 'unknown'><dfn>John Doe</dfn> <p class ='person'><dfn>Paris</dfn> <p class = 'place'><dfn>Jane Doe</dfn> </body> </html>
然后它有点停止迭代两个列表并直接进入else-step并将所有其余部分添加到newlist而不进行替换。 Python yelds没有错误,list也是用extract()函数成功提取的,我检查了......
答案 0 :(得分:1)
known_places = #list of known places
unkowns = #list of unknown places and persons
newlist = []
for line in input_file:
if any(place in line for place in Known_places):
line = line.replace("person", "place")
elif any(unkown in line for unkown in unkowns):
line = line.replace("person","undefined")
newlist.append(line)
这样的事可能有用。
答案 1 :(得分:0)
我删除了我的其他答案,因为它试图解决你没有的问题。我看到你已经接受了答案,但也要看看BeautifulSoup解决方案。
from bs4 import BeautifulSoup
PLACES = ["New-York","Paris"] # etc
PEOPLE = ["John Doe","Jane Doe"] # etc
soup = BeautifulSoup(open('file.txt'))
paragraphs = soup("p") # grabs all the <p>...</p> elements
for p in paragraphs:
if p.dfn.string in PLACES:
p['class'] = 'place'
elif p.dfn.string in PEOPLE:
p['class'] = 'person'
str(soup)
现在是您的HTML文档,已根据要求进行了修改。