我的代码只显示了酒店列表的第一页,为什么没有显示后面的页面?
import csv
import requests
from bs4 import BeautifulSoup
# Question code, reproduced as posted: it fetches the first hotel-listing
# page, prints the title links, then tries to fetch further pages and to
# write a CSV file.
# NOTE(review): indentation was lost in this copy, so the nesting of the
# loops below (and which loop the final `break` belongs to) cannot be
# determined from the text alone.
hotels=[]
i=0
# NOTE(review): the URL literal contains a space after "Dubai-"; possibly a
# copy/paste artifact -- confirm against the real address.
url0 = 'https://www.tripadvisor.com/Hotels-g295424-Dubai_Emirate_of_Dubai- Hotels.html#EATERY_LIST_CONTENTS'
r = requests.get(url0)
data = r.text
# NOTE(review): two statements are fused on the next line (the
# BeautifulSoup(...) assignment and the `with open(...)` header).
# The file is opened in binary mode ('wb') and rows are .encode()d further
# down; presumably the Python 2 csv idiom -- on Python 3 csv.writer expects a
# text-mode file. Confirm the target interpreter.
soup = BeautifulSoup(r.text, "html.parser")with open('hotels_Data.csv','wb') as file:
# Print an address and the link text for every <a> matched by the
# 'property_title' filter on the first page.
for link in soup.findAll('a', {'property_title'}):
print('https://www.tripadvisor.com/Hotels-g295424-' + link.get('href'))
print(link.string)
# `i` comes from range(20), so `int(i) <= 20` is always true, and nothing
# below increments `i`; the `while` can only be left through the `break` at
# the end of this block.
for i in range(20):
while int(i) <= (20):
i = str(i)
# The loop counter itself is inserted after "oa" in the URL.
# NOTE(review): this literal also contains a space (after "'- ").
url1 = 'https://www.tripadvisor.com/Hotels-g295424-oa' + i + '- Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
r1 = requests.get(url1)
data1 = r1.text
soup1 = BeautifulSoup(data1, "html.parser")
for link in soup1.findAll('a', {'property_title','price'}):
# NOTE(review): this prefix uses g294212 while the other URLs in this block
# use g295424.
print('https://www.tripadvisor.com/Hotels-g294212-' + link.get('href'))
print(link.string)
# This loop selects from `soup` (the first page), not from `soup1`.
for link in soup.select("a.reference.internal"):
url1 = link["href"]
# NOTE(review): `urljoin` and `base_url` are not imported or defined anywhere
# in the visible code.
absolute_url = urljoin(base_url, url1)
print(url1, absolute_url)
writer = csv.writer(file)
# `hotels` is still the empty list created at the top -- nothing in the
# visible code appends to it -- so this loop writes no rows.
for row in hotels:
writer.writerow([s.encode("utf-8") for s in row])
break
答案 0 :(得分:0)
检查页面底部指向下一页的链接——该网站不使用页码(1、2、3 等),而是使用偏移量(0、30、60、90 等),因为每页显示 30 个条目。
因此,您必须在网址中使用偏移值 0、30、60、90 等:
"...-oa" + offset + "-Dubai_Emirate..."
例如,您可以使用 range(0, 250, 30) 来获取 0、30、60、90 等值。
import requests
from bs4 import BeautifulSoup
# Walk the paginated hotel listing. Pages are addressed by result offset
# (0, 30, 60, ...) placed after "oa" in the URL, not by page number.
PAGE_SIZE = 30    # step between consecutive page offsets
MAX_OFFSET = 250  # exclusive upper bound on the offsets requested
URL_TEMPLATE = 'https://www.tripadvisor.com/Hotels-g295424-oa{offset}-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'

for offset in range(0, MAX_OFFSET, PAGE_SIZE):
    print('--- page offset:', offset, '---')
    url = URL_TEMPLATE.format(offset=offset)
    # A timeout keeps one stalled request from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Print the text of every <a> carrying the `property_title` CSS class.
    for link in soup.find_all('a', class_='property_title'):
        print(link.text)
但是条目总数可能超过 250,因此您必须读取指向最后一页的链接,用从中得到的正确值来代替 250。
import requests
from bs4 import BeautifulSoup
# Read the pager's "last" link on the first results page to learn how far
# the listing extends, instead of hard-coding an upper bound.
PAGE_SIZE = 30  # multiplier that turns a page number into an offset
offset = 0
url = 'https://www.tripadvisor.com/Hotels-g295424-oa' + str(offset) + '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
# A timeout prevents the script from hanging on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

for link in soup.find_all('a', class_='last'):
    page_number = link.get('data-page-number')
    if page_number is None:
        # Without the attribute, int() below would raise a TypeError.
        continue
    # NOTE(review): if pages are numbered from 1, the last page's own offset
    # would be (page_number - 1) * PAGE_SIZE -- confirm against the site.
    last_offset = int(page_number) * PAGE_SIZE
    print('last offset:', last_offset)
然后在 range() 中使用 last_offset+1:
range(0, last_offset+1, 30)
编辑:餐馆页面使用 JavaScript 和 AJAX 加载数据。
import requests
from bs4 import BeautifulSoup
# Query the endpoint the restaurant page calls via AJAX and read paging
# information from the last <a> element of the response.
size = 30  # inserted into the `o=a<size>` query parameter and used as the paging step

# direct url - doesn't have expected information
#url = 'https://www.tripadvisor.com/Restaurants-g187791-Rome_Lazio.html'

# url used by AJAX
url = 'https://www.tripadvisor.com/RestaurantSearch?Action=PAGE&geo=187791&ajax=1&itags=10591&sortOrder=relevance&o=a' + str(size) + '&availSearchEnabled=true&eaterydate=2017_04_27&date=2017-04-28&time=20%3A00%3A00&people=2'

# A timeout prevents the script from hanging on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

links = soup.find_all('a')
if not links:
    # Indexing [-1] on an empty result would raise an opaque IndexError.
    raise SystemExit('no <a> elements found in the response')

# Presumably the last <a> is the pager's link to the final page -- confirm.
link = links[-1]
page_number = link.get('data-page-number')
offset = link.get('data-offset')
if page_number is None or offset is None:
    # int(None) would raise a TypeError with no hint about the cause.
    raise SystemExit('last <a> lacks data-page-number / data-offset')

last_offset = int(page_number) * size  # *30
print('last offset:', last_offset)

offset = int(offset) + size  # +30
print('offset:', offset)