我试图从以下网站抓取不同的值(即各个问题的答案):https://www.unpri.org/organisation/schroders-144205 ,更具体地说,是该页面所链接的报告:https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/6a23ed84-6bbf-4416-9d0b-6c49f63bc9ac/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1 。
如果某个问题没有被回答,我想在列表中添加一个空白元素;如果有回答,则把该答案添加到列表中。我已经尝试了很多不同的方法,下面贴出我目前的代码,尽管它非常混乱。
**所以问题是:我如何在报告页面上查找每个问题的答案,并在问题没有被回答时添加一个空白元素?** 所有答案或空白元素都应添加到同一个列表中。
# Scrape each PRI organisation page listed in `urls`, follow its link to the
# report page (which opens in a second browser window) and collect the
# answers found there into the result lists below.
#
# Relies on names defined elsewhere: `browser` (used via visit/windows/html/
# find_by_xpath — presumably a splinter Browser, confirm), `bs` (the
# BeautifulSoup package alias) and `time`.
urls = ['https://www.unpri.org/organisation/schroders-144205']

# The pasted code split this XPath literal across two lines, which is a
# syntax error; it is rebuilt here as a single string and reused.
REPORT_LINK_XPATH = '//*[@id="main-content"]/div[2]/div/div/div[2]/p/a'

# Lengths of the 'org-type' text for which one character is trimmed from each
# end; every other length gets the wider [40:-37] slice.
# NOTE(review): these magic lengths come straight from the original code and
# look tied to the site's whitespace padding — confirm, or consider
# get_text(strip=True) instead.
SHORT_ORG_TYPE_LENGTHS = (10, 11, 12, 13, 16)

# Response <span> classes harvested into text_values3, in this order.
RESPONSE_SPAN_CLASSES = ('n-text-p response', 'response number', 'response date')


def _option_label(img):
    """Return the text of the <span> beside a checked-option image, or ''."""
    label = img.parent.find("span")
    return label.get_text(strip=True) if label is not None else ''


def _first_checked_input_value(block):
    """Return ``data-original`` of the first checked <input> in *block*, or ''.

    ``select`` always returns a list (never None), so emptiness is the only
    "not answered" signal here.
    """
    checked_inputs = block.select("input[checked='checked']")
    if not checked_inputs:
        return ''
    return checked_inputs[0].attrs.get("data-original", '')


for i in urls:
    # Define every result list up front so the aggregation code after the
    # loop never hits an undefined name when no report window opens.
    temp_list = []
    header_values = []
    text_values1 = []
    text_values2 = []
    text_values3 = []

    browser.visit(i)
    window = browser.windows[0]
    window.is_current = True
    sourcenew = browser.html
    soupnew = bs.BeautifulSoup(sourcenew, 'lxml')
    temp_list.append(browser.url)

    for info in soupnew.find_all('span', class_='org-type'):
        string_com = str(info.text)
        original_length = len(string_com)
        # NOTE(review): as pasted, both arguments look like a plain space,
        # which makes this a no-op; the original probably replaced a newline
        # or non-breaking space — confirm.
        string_com = string_com.replace(' ', ' ')
        if original_length in SHORT_ORG_TYPE_LENGTHS:
            string_com = string_com[1:-1]
        else:
            string_com = string_com[40:-37]
        temp_list.append(string_com)

    report_links = browser.find_by_xpath(REPORT_LINK_XPATH)
    if len(report_links) == 0:
        continue
    report_links.click()
    time.sleep(2)  # give the report window time to open

    if len(browser.windows) <= 1:
        continue
    window = browser.windows[1]
    window.is_current = True
    sourcenew2 = browser.html
    soupnew2 = bs.BeautifulSoup(sourcenew2, 'lxml')
    # NOTE(review): selector kept exactly as written; the '^' inside the
    # quoted class value is matched literally — confirm it is intended.
    parent = soupnew2.select('div[class="indent type_^ parent_S"]')

    for r in parent:
        headers = r.find_all("h3")

        # Radio answer: the label next to the "checked" image if present,
        # otherwise a real checked <input>, otherwise blank.
        checked_radio = r.find("img", src="/Style/img/checkedradio.png")
        if checked_radio is not None:
            radio_answer = _option_label(checked_radio)
        else:
            radio_answer = _first_checked_input_value(r)

        # Checkbox answers: every checked image (text_values1) and only the
        # first one (text_values2), each with the same blank fallback. The
        # original's fallback for text_values1 was unreachable because it sat
        # inside the loop over the found images.
        checked_boxes = r.find_all("img", src="/Style/img/checkedcheckbox.png")
        if checked_boxes:
            all_box_answers = [_option_label(box) for box in checked_boxes]
        else:
            all_box_answers = [_first_checked_input_value(r)]
        first_box_answer = all_box_answers[0]

        # As in the original, one entry is recorded per <h3> in the block.
        for _ in headers:
            header_values.append(radio_answer)
            text_values1.extend(all_box_answers)
            text_values2.append(first_box_answer)

    # Free-text, numeric and date responses, grouped by class across all
    # blocks (same ordering as the original three loops).
    for response_class in RESPONSE_SPAN_CLASSES:
        for r in parent:
            text_values3.extend(
                span.get_text(strip=True)
                for span in r.find_all("span", class_=response_class)
            )

list_final = []
def f7(seq):
    """Return the items of *seq* as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order, so this is an
    # order-preserving de-duplication.
    return list(dict.fromkeys(seq))
# Gather the de-duplicated result lists into list_final, in a fixed order:
# page info, radio answers, all checkbox answers, first checkbox answers,
# then the free-text/number/date responses.
list_final.extend(
    f7(values)
    for values in (
        temp_list,
        header_values,
        text_values1,
        text_values2,
        text_values3,
    )
)
print(list_final)
答案 0 :(得分:0)
我查看了你的链接:那些复选框并不是真正的复选框元素,而是图片,不过你想要的效果仍然可以实现。查看页面源码可以看到,
这是已选中的单选按钮所用的图片标签:
<img src="/Style/img/checkedradio.png" class="readradio">
这是未选中的单选按钮所用的图片标签:
<img src="/Style/img/uncheckedradio.png" class="readradio">
因此,您可以根据以下内容选择已选中或未选中的答案:
# Split the parsed page (`soup_obj`, defined elsewhere) into question blocks
# and, for each one, pick out the checked and unchecked radio images.
# NOTE(review): the "question-block" class name is taken from the original
# snippet — confirm it against the actual report markup.
all_question_blocks = soup_obj.find_all("div", {"class": "question-block"})
for question_block in all_question_blocks:
    # The radio options are <img> tags distinguished by their src attribute.
    # All checked options; if this is empty the question was not answered.
    checked = question_block.find_all("img", src="/Style/img/checkedradio.png")
    unchecked = question_block.find_all("img", src="/Style/img/uncheckedradio.png")
然后,如果要提取其他属性,则可以在HTML树中向上移动并逐个获取父元素。
希望这有帮助!