我是Beautiful Soup的新手。 我需要从HTML文件中获取数据。
<div class="ques_ans_block">
<div class="question">
<p>is this correct ?</p>
</div>
<p class="answer"></p>
<div class="moreinfo" style="display: block;">
<p class="answer"> </p>
<p class="answer"></p>
</div>
</div>
条件是:“moreinfo” div 可能存在,也可能不存在(absent)。
所以我需要为每个ques_ans_block获取问题和答案的内部文本(innertext),其中包括“moreinfo”中的答案(如果有的话)。应该怎么做?
答案 0 :(得分:0)
这会把结果输出为JSON文件,其中每条记录包含Question、Answer和FaqId三个字段。
import bs4
import json
import codecs
# Extract every question/answer pair from input.html and write them to
# output.json as a list of {"Question", "Answer", "FaqId"} records.
arrayList = []

# Open in binary mode so Beautiful Soup performs the encoding detection, and
# use a context manager so the file handle is closed. Naming the parser
# explicitly avoids the "no parser was explicitly specified" warning and keeps
# the parse tree from depending on which optional parsers are installed.
with open('input.html', 'rb') as html_file:
    bsp = bs4.BeautifulSoup(html_file, 'html.parser')

ques_ans_block = bsp.find_all("div", {"class": "ques_ans_block"})

for count, block in enumerate(ques_ans_block, 1):
    # Collect the question text first, then detach the question nodes from the
    # tree so their <p> children are not picked up again as answers below.
    question_nodes = block.select('.question')
    question = "".join(node.text + "\n" for node in question_nodes)
    for node in question_nodes:
        node.extract()

    # Every <p> and then every <li> still left in the block is treated as
    # answer text; this covers the optional "moreinfo" div when it is present.
    answer_nodes = list(block.select('p')) + list(block.select('li'))
    answer = "".join(node.text + "\n" for node in answer_nodes)

    # Parenthesised form prints the same blank lines on Python 2 and Python 3.
    print("\n")
    arrayList.append({"Question": question, "Answer": answer, "FaqId": count})

# Mode is 'w', not 'wt': codecs.open appends 'b' to the mode on its own, and
# the resulting 'wtb' is rejected by Python 3's open().
with codecs.open('output.json', 'w', 'utf-8') as outfile:
    json.dump(arrayList, outfile, indent=4)