来自tripadvisor的评论

时间:2017-12-17 14:49:55

标签: python python-3.x python-2.7 beautifulsoup

假设我正在从网址抓取评论

https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html

该页面包含我想要抓取的评论。那么,我怎样才能把后续所有分页上的评论也一并抓取下来?

我使用了以下代码,但仍然只显示第一页中的评论!

// Restrict every <input class="positive-numeric-only"> to positive decimal
// numbers and normalise the value to two fraction digits.
$(document).ready(function() {
  var $inputs = $("input.positive-numeric-only");

  // Block typing of characters other than digits, '.' and ','.
  // Only the FIRST disallowed character of the key name is removed, so a
  // single printable character becomes "" (and is blocked) while multi-letter
  // key names keep a non-empty remainder and pass through.
  // Ctrl/Meta combinations are always let through.
  $inputs.on("keydown", function(e) {
    var evt = e.originalEvent;
    // NOTE: '^' is no longer inside the class, so a typed caret is rejected.
    var remainder = evt.key.replace(/[^0-9.,]/, "");
    var hasModifier = evt.ctrlKey || evt.metaKey;
    if (remainder.length === 0 && !hasModifier) {
      e.preventDefault();
    }
  });

  // Replace the browser's paste with a sanitised one: strip everything except
  // digits, '.' and ',', parse the rest and, if it is positive, write it back
  // with two decimals. Non-positive or unparsable input leaves the field as is.
  // NOTE(review): parseFloat stops at the first ',', so "1,5" is read as 1.
  $inputs.on("paste", function(e) {
    var cleaned = e.originalEvent.clipboardData
      .getData("text")
      .replace(/[^0-9.,]/g, "");
    e.preventDefault();
    var parsed = parseFloat(cleaned);
    if (parsed > 0) {
      $(this).val(parsed.toFixed(2));
    }
  });

  // On leaving the field, store the absolute value with two decimals, or 0
  // when the field is empty or not numeric.
  $inputs.on("focusout", function() {
    if (!isNaN(this.value) && this.value.length !== 0) {
      this.value = Math.abs(parseFloat(this.value)).toFixed(2);
    } else {
      this.value = 0;
    }
  });
});

2 个答案:

答案 0 :(得分:3)

基于example for scrapy

服务器会在网址中(.html 之前的任意位置)添加一个偏移量后缀:

  • -or5获取第二页,
  • -or10获取第三页,

您甚至可以省略网址中的那些单词(它们只是为了 SEO),仅使用

https://www.tripadvisor.com/g562819-d289642-or5.html
https://www.tripadvisor.com/g562819-d289642-or10.html

获取评论的下一页。

from bs4 import BeautifulSoup
import requests
import re
#import webbrowser

def get_soup(url, timeout=30):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The request goes through the module-level session ``s`` and carries a
    desktop Firefox ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The parsed page, or ``None`` when the request fails or the server
        answers with a status other than 200 (a diagnostic is printed in
        both cases, so callers only need to test for a falsy result).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

    try:
        r = s.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report and carry on: one bad page should not abort the whole crawl.
        print('request failed:', url, exc)
        return None

    if r.status_code != 200:
        print('status code:', r.status_code)
        return None

    return BeautifulSoup(r.text, 'html.parser')

def parse(url, response, max_pages=1, page_size=5):
    """Read the review count from a hotel page and crawl its review pages.

    The page for a given offset is addressed by inserting ``-or<offset>``
    before ``.html`` in *url*; each such page is downloaded with
    ``get_soup`` and handed to ``parse_reviews``.

    Args:
        url: Address of the hotel's first review page (ends in ``.html``).
        response: Parsed first page, or a falsy value if it could not be
            loaded.
        max_pages: Maximum number of review pages to process (non-negative).
            The default of 1 keeps the original "first page only" test
            behaviour; pass ``None`` to crawl every page.
        page_size: Number of reviews per page, i.e. the offset step
            (must be positive).

    Returns:
        None. Nothing is crawled when *response* is falsy or when the
        review-count element is missing or contains no digits.
    """
    if not response:
        print('no response:', url)
        return

    # The counter is rendered like "(1,234)"; keep only the digits so the
    # parsing does not depend on the exact decoration around the number.
    header = response.find('span', class_='reviews_header_count')
    digits = re.sub(r'\D', '', header.text) if header is not None else ''
    if not digits:
        print('no review count:', url)
        return
    num_reviews = int(digits)
    print('num_reviews:', num_reviews)

    # create template for urls to pages with reviews
    template = url.replace('.html', '-or{}.html')
    print('template:', template)

    offsets = range(0, num_reviews, page_size)
    if max_pages is not None:
        offsets = offsets[:max_pages]

    # load pages with reviews
    for offset in offsets:
        page_url = template.format(offset)
        print('url:', page_url)
        parse_reviews(page_url, get_soup(page_url))

def _text_or_none(node):
    """Return ``node.text``, or ``None`` when the element was not found."""
    return node.text if node is not None else None


def parse_reviews(url, response, out=None):
    """Extract every review on one page and store each as a dict.

    Each ``div.review-container`` yields one item with the keys
    ``hotel_name``, ``review_title``, ``review_body``, ``review_date``,
    ``num_reviews_reviewer``, ``reviewer_name`` and ``bubble_rating``.
    A field whose element is missing from the page is stored as ``None``
    instead of raising ``AttributeError``.

    Args:
        url: Address of the page (used only for diagnostics).
        response: Parsed page, or a falsy value if it could not be loaded.
        out: List to append the items to. Defaults to the module-level
            ``results`` list.

    Returns:
        None. Items are appended to *out* and echoed to stdout.
    """
    print('review:', url)

    if not response:
        print('no response:', url)
        return

    sink = results if out is None else out

    # The hotel name is the same for every review, so look it up only once.
    hotel_name = _text_or_none(response.find('h1', class_='heading_title'))

    # get every review
    for review in response.find_all('div', class_='review-container'):
        date_node = review.find('span', class_='relativeDate')
        bubble = review.select_one('div.reviewItemInline span.ui_bubble_rating')
        # The second CSS class looks like "bubble_50"; drop the "bubble_" prefix.
        bubble_classes = (bubble.get('class') or []) if bubble is not None else []

        item = {
            'hotel_name': hotel_name,
            'review_title': _text_or_none(review.find('span', class_='noQuotes')),
            'review_body': _text_or_none(review.find('p', class_='partial_entry')),
            'review_date': date_node.get('title') if date_node is not None else None,
            'num_reviews_reviewer': _text_or_none(review.find('span', class_='badgetext')),
            'reviewer_name': _text_or_none(review.find('span', class_='scrname')),
            'bubble_rating': bubble_classes[1][7:] if len(bubble_classes) > 1 else None,
        }

        sink.append(item)

        for key, val in item.items():
            print(key, ':', val)
        print('----')


# --- main ---

# Shared HTTP session; get_soup() sends every request through it.
s = requests.Session()

# Hotel pages to crawl (the commented entries are alternative examples).
start_urls = [
    'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

# Global accumulator: parse_reviews() appends one dict per review here.
results = []

for start_url in start_urls:
    first_page = get_soup(start_url)
    parse(start_url, first_page)

import pandas as pd

# Turn the collected dicts into a table and write it to disk as CSV.
pd.DataFrame(results).to_csv('output.csv')

答案 1 :(得分:-1)

@furas 为什么我在运行上述代码时会出现下面这个错误?
AttributeError: 'NoneType' object has no attribute 'text' 
AttributeError   Traceback (most recent call last)
<ipython-input-15-40ebad987bb5> in <module>() 
 75 for url in start_urls:
-76     parse(url, get_soup(url))
<ipython-input-15-40ebad987bb5> in parse(url, response)
 46         print('url:', url.format(offset))
 47         url_ = url.format(offset)
 48         parse_reviews(url_, get_soup(url_))
 49         #return # for test only - to stop after first page
 50 
<ipython-input-15-40ebad987bb5> in parse_reviews(url, response)
 62 'review_body': review.find('p',     class_='partial_entry'),
 63 'review_date': review.find('span', class_='ratingDate relativeDate'['title'],#.text,#[idx],
 64 'num_reviews_reviewer': review.find('span',class_='badgetext').text,
 65 'reviewer_name': review.find('span', class_='scrname').text,
 66 'bubble_rating': review.select_one('div.reviewItemInline  span.ui_bubble_rating')['class'][1][7:],
AttributeError: 'NoneType' object has no attribute 'text'