Python crawling - Scraping Amazon reviews with BeautifulSoup

Date: 2021-01-07 14:03:03

Tags: python-3.x web-scraping beautifulsoup web-crawler amazon

I am trying to scrape review data from Amazon using a Jupyter notebook.

But the server responds with a 503.

Does anyone know what is going on?

This is the URL: https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=

This is my code.

import re, requests, csv 
from bs4 import BeautifulSoup 
from time import sleep

def reviews_info(div): 
    review_text = div.find("div", "a-row a-spacing-small review-data").get_text() 
    review_author = div.find("span", "a-profile-name").get_text()
    review_stars = div.find("span", "a-icon-alt").get_text() 
    on_review_date = div.find('span', 'a-size-base a-color-secondary review-date').get_text() 
    review_date = [x.strip() for x in re.sub("on ", "", on_review_date).split(",")] 

    return { "review_text" : review_text, 
            "review_author" : review_author, 
            "review_stars" : review_stars, 
            "review_date": review_date }
base_url = 'https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='


reviews = [] 

NUM_PAGES = 8

for page_num in range(1, NUM_PAGES + 1): 
    print("souping page", page_num, ",", len(reviews), "data collected") 
    url = base_url + str(page_num) 
    soup = BeautifulSoup(requests.get(url).text, 'lxml') 

    for div in soup('div', 'a-section review'): 
        reviews.append(reviews_info(div)) 
    
    sleep(30)

Finally, I tried

requests.get(url)

and the output is

<Response [503]>

I also tried

requests.get(url).text()

and the output is

TypeError: 'str' object is not callable
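
(As a side note, this TypeError is unrelated to the blocking: requests.Response.text is a property, not a method, so it is accessed without parentheses. A minimal check:)

resp = requests.get(url)
print(resp.status_code)   # 503 in this case
print(resp.text[:500])    # .text is a property, no parentheses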

Is Amazon blocking the scraping?

Thank you very much for your answers!

3 answers:

Answer 0: (score: 0)

Amazon blocks requests made to their servers with the Python requests library when you try to scrape. You can try using Selenium with the Chrome browser, which may solve the problem. Here is the Python version of Selenium: https://selenium-python.readthedocs.io/
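
A minimal sketch of that approach (assuming chromedriver is installed and available on the PATH; the 'a-section review' class name is taken from the question's code):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get('https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/')
soup = BeautifulSoup(driver.page_source, 'lxml')
print(len(soup.find_all('div', 'a-section review')), 'review blocks found')
driver.quit()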

Answer 1: (score: 0)

I tried the webdriver.

Here is my code.

from selenium import webdriver
import re
import requests 
import csv 
from bs4 import BeautifulSoup 
from time import sleep

review_list = []
NUM_PAGE = 8

base_url = 'https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='

for num_page in range(1, NUM_PAGE + 1):
    chrome_driver = '/Users/chromedriver'
    driver = webdriver.Chrome(chrome_driver)

    url = base_url + str(num_page)
    driver.get(url)

    src = driver.page_source
    source = BeautifulSoup(src, 'lxml', from_encoding='utf-8')

    driver.close()

    print("souping page", num_page, ",", len(source.find_all('div', 'a-section celwidget')), "의 data를 수집")

    for source in source.find_all('div', 'a-section celwidget'): 
        review_text = source.find("div", "a-row a-spacing-small review-data").get_text() 
        review_author = source.find("span", "a-profile-name").get_text()
        review_stars = source.find("span", "a-icon-alt").get_text() 
        on_review_date = source.find('span', 'a-size-base a-color-secondary review-date').get_text() 
        #review_date = [x.strip() for x in re.sub("on ", "", on_review_date).split(",")] 

        review = { "review_text" : review_text, 
                "review_author" : review_author, 
                "review_stars" : review_stars, 
                "review_date": on_review_date }

        review_list.append(review)
    
    sleep(10)
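
A possible refinement (my suggestion, not part of the original answer): create the driver once outside the loop and quit it at the end, instead of launching a new browser for every page:

driver = webdriver.Chrome('/Users/chromedriver')  # driver path from the answer above

for num_page in range(1, NUM_PAGE + 1):
    driver.get(base_url + str(num_page))
    source = BeautifulSoup(driver.page_source, 'lxml')
    # ... collect reviews exactly as in the loop above ...
    sleep(10)

driver.quit()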

Answer 2: (score: 0)

A solution that is much faster than using selenium/webdriver, but more expensive because it uses a proxy. I use proxycrawl - I am not affiliated with them at all other than being a customer. I also recommend using a scraping framework like Scrapy; among other features, it helps avoid detection by varying the time between requests.

You pay for each successful scrape - you do not pay for unsuccessful ones. This is the cheapest proxy solution I have found.

You use it like this:

import scrapy  # scraping framework to parse data
from proxycrawl.proxycrawl_api import ProxyCrawlAPI
from datetime import datetime  # used to convert review date string into datetime object. Useful if you plan to insert into an SQL db.

api = ProxyCrawlAPI({'token': 'NON-JS TOKEN'})
apijava = ProxyCrawlAPI({'token': 'JS TOKEN'})

def start_requests(self):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
    }
    url = 'https://www.amazon.com/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews'  # you don't need the product title in the url
    # build proxcrawl url
    pcurl = api.buildURL(url, {})
    yield scrapy.Request(pcurl, callback=self.parse, errback=self.errback_httpbin, headers=headers, meta={'asin': 'B07ZPC9QD4'})


def parse(self, response):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
    }
    reviews_count = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"]').getall()
    asin = response.meta['asin']
    asin_title = response.xpath('//*[@id="cm_cr-product_info"]/div/div[2]/div/div/div[2]/div[1]/h1/a/text()').get()
    if reviews_count is not None:  # review_count = number of reviews
        for review_index in range(len(reviews_count)):
            review_index += 1
            review_title = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"][' +
                                          str(review_index) + ']/div/div/div[2]/a[2]/span/text()').get()
            review_rating_string = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"][' +
                                                  str(review_index) + ']/div/div/div[2]/a[1]/@title').get()
            review_date_string = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"][' +
                                                str(
                                                    review_index) + ']/div/div/span[@data-hook="review-date"]/text()').get()
            review_body = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"][' +
                                         str(review_index) + ']/div/div/div[4]/span/span/text()').get()
            review_rating = str(review_rating_string).split(' ', 1)[0]
            # get rid of the 00:00:00 time
            review_date = str(datetime.strptime(review_date_string, '%B %d, %Y')).split(' ', 1)[0]
            date_of_cur_review = datetime.strptime(review_date, '%Y-%m-%d')
            
            # DO SOMETHING HERE. INSERT INTO A DB?
            #####
            
            # go to next page if there is one
            if review_index == 10:
                next_page = response.xpath('//*[@class="a-last"]/a/@href').get()
                if next_page is not None:
                    yield response.follow(api.buildURL('https://www.amazon.com' + next_page, {}),
                                          callback=self.parse, errback=self.errback_httpbin, headers=headers,
                                          meta={'asin': response.meta['asin']})
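
start_requests() and parse() take self and reference self.errback_httpbin, which is not shown, so they are meant to be methods of a Scrapy spider. A minimal skeleton, where the class name and the stand-in errback are my own assumptions:

import scrapy

class AmazonReviewSpider(scrapy.Spider):  # hypothetical name, not from the answer
    name = 'amazon_reviews'

    # start_requests() and parse() from above go here as methods

    def errback_httpbin(self, failure):
        # referenced above but not defined in the answer; a minimal stand-in
        self.logger.error(repr(failure))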