我正尝试在 beeradvocate.com 上抓取用户对啤酒的评论数据,以分析用户对不同啤酒类型的态度。但是我只能得到前几页的结果,之后的页面都是空白。
情况:
我的方法
def review_scrape(beer_link, number_of_ratings):
    """Scrape rating values and review text from a BeerAdvocate beer page.

    Args:
        beer_link: base URL of the beer's review listing (ends with '/').
        number_of_ratings: total number of ratings to page through;
            the site shows 25 results per page.

    Returns:
        (rate, reviews): two parallel lists with one entry per result page —
        the scraped rating strings and the text of every <div> respectively.
    """
    reviews = []
    rate = []
    # Log in ONCE, before the loop. The original re-created the session and
    # re-posted the login on every iteration, which can invalidate the
    # previous session server-side — the likely reason later pages came
    # back blank.
    session = requests.session()
    # NOTE(review): credentials should not be hard-coded in source.
    payload = {'login': 'suzie102', 'password': ''}
    session.post("https://www.beeradvocate.com/community/login/login", data=payload)
    for pages_i in range(0, int(number_of_ratings), 25):  # site shows 25 results/page
        # No leading '/': beer_link already ends with one, so '/?view=...'
        # produced a '//?view' double slash in the requested URL.
        url = beer_link + '?view=beer&sort=&start=%d' % (pages_i)
        page1 = session.get(url)
        time.sleep(3)  # be polite: throttle requests between pages
        soup1 = lxml.html.fromstring(page1.text)
        # Every third "muted" span starting at index 8 holds a rating value
        # in this page layout — fragile; re-check if the site markup changes.
        rate_i = soup1.xpath('//span[@class = "muted"]/text()')[8::3]
        print(url)
        reviews_i = soup1.xpath('//div/text()')
        reviews.append(reviews_i)
        print(len(reviews))
        rate.append(rate_i)
    return rate, reviews
结果:
答案 0 :(得分:0)
我只看到一个问题。
url = beer_link+'/?view=beer&sort=&start=%d'%(pages_i)
/ 是多余的,您需要的是 url = beer_link + '?view=beer&sort=&start=%d' % (pages_i)
这就是为什么您打印出的链接中会出现 //?view。
我可以看到有指向下一页的锚链接“下一个”。我建议使用while循环或递归。
除此之外,我看不到您的脚本中缺少什么。其他所有东西看起来都井井有条,而且应该可以正常工作。
如果您能提供更多详细信息,我们也许能进一步帮助您。
答案 1 :(得分:0)
更新:感谢大家的评论,我尝试改用 Selenium,现在可以正常工作了。
def webstite_scrape_p2(beer_link, number_of_ratings):
    """Scrape ratings, review text and average user scores for several beers.

    Logs in to BeerAdvocate once via Selenium, then pages through each
    beer's review listing 25 results at a time.

    Args:
        beer_link: iterable of beer-page URLs.
        number_of_ratings: iterable of rating counts, parallel to beer_link.

    Returns:
        (rate, reviews, avg_user): three lists with one entry per result
        page — rating strings, all <div> texts, and BA score strings.
    """
    driver = webdriver.Chrome('/home/sam/Downloads/chromedriver')
    try:
        driver.get('https://www.beeradvocate.com/community/login/')
        loginelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dd//input[@name ="login"]')))
        loginelement.send_keys('suzie102')
        pwelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class ="ctrlUnit"]//dd//ul//li[@id = "ctrl_pageLogin_registered_Disabler"]//input[@name ="password"]')))
        # NOTE(review): credentials should not be hard-coded in source.
        pwelement.send_keys('')
        page_click = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class ="ctrlUnit submitUnit"]//dd//input[@type ="submit"]')))
        page_click.click()

        rate = []
        reviews = []
        avg_user = []
        # Bug fix: the original iterated zip(beer_link, number_of_rev),
        # where `number_of_rev` is an undefined global — the
        # `number_of_ratings` parameter was never used.
        for link, count in zip(beer_link, number_of_ratings):
            for pages_i in tqdm(range(0, int(count), 25)):  # site shows 25 results/page
                new_url = link + '?view=beer&sort=&start=%d' % (pages_i)
                print(new_url)
                # Fetch each page once (the original called driver.get twice).
                driver.get(new_url)
                time.sleep(5)  # give the page time to render before scraping
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Every third "muted" span from index 8 holds a rating value
                # in this page layout — fragile; re-check if markup changes.
                rate_i = [i.get_text() for i in soup.find_all('span', class_="muted")][8::3]
                rate.append(rate_i)
                reviews_i = [i.get_text() for i in soup.find_all('div')]
                reviews.append(reviews_i)
                avg_i = [i.get_text() for i in soup.find_all('span', class_="BAscore_norm")]
                avg_user.append(avg_i)
        return rate, reviews, avg_user
    finally:
        driver.quit()  # always release the browser, even on errors