我正在抓取booking.com,并获取酒店名称,等级,酒店描述,许多评论,等级,地址,类别(检查它是酒店还是旅馆)和位置。
我的问题:
1)我刮了30页,到最后,我在许多评论,评分和类别中都缺少值。当我返回并检查站点时,所有值都可用于这些字段。我不确定为什么没有捕获。请指教吗?
2)如果我刮单个页面,则不会丢失任何值。是什么原因?
#Importing necessary library
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
category = []
name = []
address = []
reviews = []
review_title = []
ratings = []
description = []
facilit = []
driver = webdriver.Chrome(ChromeDriverManager().install())
for pageno in range(0,750, 25):
print(pageno)
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.booking.com/searchresults.en-gb.html?aid=397642&label=gog235jc-1FCAEoggI46AdIM1gDaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AEMiAIBqAIDuALm3eDyBcACAQ&sid=422b3ff3c0e98b522259ad1cad2505ea&tmpl=searchresults&ac_click_type=b&ac_position=0&checkin_month=7&checkin_monthday=1&checkin_year=2020&checkout_month=7&checkout_monthday=15&checkout_year=2020&class_interval=1&dest_id=-1506909&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=AKL&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=index&src_elem=sb&srpvid=0da61131394c0103&ss=Auckland%2C%20Auckland%20Region%2C%20New%20Zealand&ss_all=0&ss_raw=Auckland&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=0" + str(pageno))
time.sleep(1)
summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'hotel_name_link url')]")
job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]
for job_link in job_links:
driver.get(job_link)
time.sleep(1)
try:
job_title = driver.find_element_by_xpath("//*[@class='hp__hotel-type-badge']").text
category.append(job_title)
except:
job_title = "None"
try:
hotel = driver.find_element_by_id('hp_hotel_name').text.strip('Hotel')
name.append(hotel)
except:
hotel = "None"
try:
add = driver.find_element_by_id('showMap2').find_element_by_class_name('hp_address_subtitle').text
address.append(add)
except:
add = "None"
try:
reviews = driver.find_element_by_class_name('bui-review-score--end').find_element_by_class_name('bui-review-score__text').text
review_title.append(reviews)
except:
reviews = "None"
try:
rating = driver.find_element_by_class_name('bui-review-score--end').find_element_by_class_name('bui-review-score__badge').text
ratings.append(rating)
except:
rating = "None"
try:
desc = driver.find_element_by_xpath("//div[@id='property_description_content']").text
description.append(desc)
except:
rating = "None"
driver.close()
# Converting all the details into dataframe and csv file
final = []
for item in zip_longest(name, address, review_title, ratings, description, category):
final.append(item)
df4 = pd.DataFrame(
final, columns=['Hotel_name', 'Address', 'Number_of_review', 'Ratings', 'Description', 'Category' ])
#df.to_csv('booked.csv')