我为TripAdvisor做了一个非常好的刮刀,它满足了我的所有需求,然后我试着在休息了四天之后使用它而出现问题,我很快意识到TA改变了一些标签,我做了适当的改变,我仍然无法像以前一样工作。我想抓住“内容”的价值。元素中的标记。 这是元素:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
这是代码:
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([img["content"] for img in bubs.select("img[content]")])
但现在它只给了我一个空的&#39; []&#39;而不是“&#39; 5”的内容。 有人知道可能会有什么变化吗?
这是我的其余代码
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
thepage = urllib
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')
for link in soup.findAll('a',href=re.compile('/Hotel_Review')):
urls.append(base_url + (link.get('href')).strip("#REVIEWS"))
def remove_duplicates(urls):
output= []
seen = set()
for line in urls:
if line not in seen:
output.append(line)
seen.add(line)
return output
urls2 = remove_duplicates(urls)
for url in urls2:
try:
driver = webdriver.Chrome()
driver.get(url)
element = driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
print("succesfull")
moreinfo = driver.page_source
moresoup = BeautifulSoup(moreinfo,"html.parser")
driver.close()
#moreinfo = urllib
#moreinfo = urllib.request.urlopen(url)
#moresoup = BeautifulSoup(moreinfo,"html.parser")
except:
print("none")
for data in moresoup.findAll('div', {"class":"heading_2014 hr_heading"}):
try:
for title in data.findAll('h1',{'id':"HEADING"}):
init_info.append(title.text.strip("\n")+ ",\t")
for add_data in data.findAll('span',{'class':'format_address'}):
print((add_data.find('span',{'class':'street-address'}).text +",\t"))
init_info.append(add_data.find('span',{'class':'street-address'}).text +",\t")
init_info.append(add_data.find('span',{'class':'locality'}).text + ",\t")
init_info.append(add_data.find('span',{'class':'country-name'}).text + ",\t")
for reviews in data.findAll('a',{'class':'more taLnk'}):
init_info.append(reviews.text).strip("\n")
init_info.append(", \t")
#init_info.append([img["alt"] for img in stars.select("img[alt]")])
#init_info.append([img["content"] for img in stars.select("img[content]")])
except :
init_info.append("N/A" + ", /t")
&#13;
答案 0 :(得分:1)
content="5"
属性的元素是span
,而不是img
。
这会得到你想要的吗?
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([elem["content"] for elem in bubs.select("span[content]")])