TripAdvisor 酒店:如何抓取第 1 页到第 10 页所有页面上的酒店并保存?

时间:2016-12-08 15:36:37

标签: python csv web-scraping beautifulsoup python-3.5

我的代码只显示了第一页的酒店。为什么没有显示更多页面的酒店?

import csv

import requests

from bs4 import BeautifulSoup

# Question code, kept exactly as posted. It is not valid Python as shown;
# the NOTE(review) comments below point at the visible problems.
hotels=[]
i=0

# NOTE(review): the literal contains a run of spaces before "Hotels.html".
url0 = 'https://www.tripadvisor.com/Hotels-g295424-Dubai_Emirate_of_Dubai-     Hotels.html#EATERY_LIST_CONTENTS'

r = requests.get(url0)

# `data` is not read again in the code shown; the parser call uses r.text directly.
data = r.text
# NOTE(review): two statements are fused on the next line, and nothing after
# the `with ...:` header is indented as its body, so the script does not parse.
# 'wb' opens the file in binary mode; presumably a text-mode file is what
# csv.writer needs under Python 3 (the question is tagged python-3.5) - confirm.
soup = BeautifulSoup(r.text, "html.parser")with open('hotels_Data.csv','wb') as file:

# For each 'property_title' anchor on the first page, print a URL built from
# its href and then the anchor's string.
for link in soup.findAll('a', {'property_title'}):
    print('https://www.tripadvisor.com/Hotels-g295424-' + link.get('href'))
    print(link.string)


# NOTE(review): inside the `while`, `i` is converted to str but never
# incremented, so `int(i) <= 20` cannot become false. The only `break`
# (last line) sits at column 0, outside both loops.
for i in range(20):
   while int(i) <= (20):
    i = str(i)

    # `i` is used directly as the "-oa" value; the answer below explains that
    # the site expects offsets in steps of 30. The literal also contains
    # spaces before "Dubai".
    url1 = 'https://www.tripadvisor.com/Hotels-g295424-oa' + i + '-  Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
    r1 = requests.get(url1)
    data1 = r1.text
    soup1 = BeautifulSoup(data1, "html.parser")
    for link in soup1.findAll('a', {'property_title','price'}):
        # This prefix uses g294212, while the first-page loop above uses g295424.
        print('https://www.tripadvisor.com/Hotels-g294212-' +    link.get('href'))
        print(link.string)
        # NOTE(review): this inner loop rebinds `link` and `url1`, and iterates
        # `soup` (the first page) rather than `soup1`. `urljoin` and `base_url`
        # are not imported or defined in the code shown.
        for link in soup.select("a.reference.internal"):
            url1 = link["href"]
            absolute_url = urljoin(base_url, url1)

            print(url1, absolute_url)       
        # `hotels` is never appended to in the code shown, so this loop has no
        # rows to write. A new writer is also created on every iteration.
        writer = csv.writer(file)
        for row in hotels:
            # NOTE(review): encode() yields bytes; presumably csv.writer under
            # Python 3 should be given str values instead - confirm.
            writer.writerow([s.encode("utf-8") for s in row])                                                
break

1 个答案:

答案 0 :(得分:0)

检查页面底部“下一页”的链接:此网站不使用页码(1、2、3 等),而是使用偏移量(0、30、60、90 等),因为每页显示 30 条结果。

因此,您必须在网址中使用值 0、30、60、90 等:

"...-oa" + offset + "-Dubai_Emirate..."

例如,您可以使用 range(0, 250, 30) 来获取值 0、30、60、90 等。

import requests
from bs4 import BeautifulSoup

# Step through the listing pages by offset (0, 30, 60, ... below 250) and
# print the text of every 'property_title' link found on each page.
URL_HEAD = 'https://www.tripadvisor.com/Hotels-g295424-oa'
URL_TAIL = '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'

for offset in range(0, 250, 30):
    print('--- page offset:', offset, '---')

    # The offset is spliced into the URL between the two fixed parts.
    url = URL_HEAD + str(offset) + URL_TAIL

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    for anchor in soup.find_all('a', {'property_title'}):
        print(anchor.text)

但是结果可能超过 250 条,因此您必须读取指向最后一页的链接,用从中得到的正确值来代替 250。

import requests
from bs4 import BeautifulSoup

# Request the listing page at offset 0 and read the 'last' pager link(s)
# to work out how far the offsets go.
offset = 0
url = ''.join([
    'https://www.tripadvisor.com/Hotels-g295424-oa',
    str(offset),
    '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS',
])

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for anchor in soup.find_all('a', {'last'}):
    # The page number is read from the link's data-page-number attribute.
    page_number = anchor.get('data-page-number')
    # NOTE(review): if data-page-number is 1-based, this value is one page
    # step past the last listing offset - confirm against the live markup.
    last_offset = 30 * int(page_number)
    print('last offset:', last_offset)

然后在 range(0, last_offset+1, 30) 中使用 last_offset+1。

编辑:餐馆页面使用 JavaScript 和 AJAX 加载数据,因此需要请求 AJAX 实际使用的网址:

import requests
from bs4 import BeautifulSoup

# Step between page offsets; also used as the offset placed in the URL below.
size = 30

# The direct page URL does not contain the expected information:
#   https://www.tripadvisor.com/Restaurants-g187791-Rome_Lazio.html
# so the URL used by the page's AJAX request is fetched instead.
url = (
    'https://www.tripadvisor.com/RestaurantSearch?Action=PAGE&geo=187791&ajax=1&itags=10591&sortOrder=relevance&o=a'
    + str(size)
    + '&availSearchEnabled=true&eaterydate=2017_04_27&date=2017-04-28&time=20%3A00%3A00&people=2'
)

r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# The paging attributes are read from the last <a> element in the response.
link = soup.find_all('a')[-1]

page_number = link.get('data-page-number')
last_offset = size * int(page_number)  # i.e. page number * 30
print('last offset:', last_offset)

offset = size + int(link.get('data-offset'))  # i.e. data-offset + 30
print('offset:', offset)