我正在抓取亚马逊网站的评论。
我觉得抓狂的评论。
有时,某些页面无法抓取。这是怎么回事?
我是否需要让程序在每次请求之间休眠(sleep)更长的时间?
是否因为评论中带有照片而无法抓取? 抱歉,我是排除了某些部分之后才抓取的。我不知道这条语句该怎么写。如果你不介意,可以教我吗?
还有其他原因吗?
import re, requests, csv
from bs4 import BeautifulSoup
from time import sleep
# Windows CMD: run `chcp 65001` first to switch the console code page to UTF-8.
def _node_text(node, child=None):
    """Return the text of *node*, or of its *child* tag, or None if missing.

    ``node`` is whatever ``find`` returned, so it may be None when the
    element is absent from the review markup.
    """
    if node is not None and child is not None:
        # Attribute access on a tag yields its first matching child, or
        # None when there is no such child.
        node = getattr(node, child, None)
    return None if node is None else node.text


def reviews_info(div):
    """Extract the fields of one review from its container element.

    Args:
        div: The element wrapping a single review (the caller passes each
            ``div`` with class ``a-section review``). It must offer
            ``find(name, css_class)``.

    Returns:
        A dict with the keys ``review_text``, ``review_author``,
        ``review_stars`` and ``review_date``. The first three hold the text
        of the matching element, or None when that element is missing.
        ``review_date`` is the date text with ``"on "`` removed, split on
        commas and stripped (e.g. ``["September 3", "2016"]``), or an empty
        list when no date element is found.

    A review whose markup lacks one of the elements (presumably the case
    for some layouts, e.g. reviews with photos -- TODO confirm against the
    live page) no longer raises AttributeError; the missing field is simply
    left empty.
    """
    review_text = _node_text(div.find("div", "a-row review-data"), "span")
    review_author = _node_text(div.find("a", "a-size-base a-link-normal author"))
    # The first link inside the first "a-row" div; presumably the star
    # rating text -- verify against the page markup.
    review_stars = _node_text(div.find("div", "a-row"), "a")
    on_review_date = _node_text(
        div.find('span', 'a-size-base a-color-secondary review-date')
    )

    if on_review_date is None:
        review_date = []
    else:
        # Drop the "on " prefix, then split "Month day, year" at the comma.
        without_prefix = re.sub("on ", "", on_review_date)
        review_date = [part.strip() for part in without_prefix.split(",")]

    return {
        "review_text": review_text,
        "review_author": review_author,
        "review_stars": review_stars,
        "review_date": review_date,
    }
# Crawl every review page of one product and save the reviews to a CSV file.
base_url = "https://www.amazon.com/GRACE-KARIN-BoatNeck-Sleeveless-Vintage/product-reviews/B016XUCYZO/ref=cm_cr_dp_d_show_all_top?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber="
reviews = []
NUM_PAGES = 472

# Seconds to wait for a response before giving up on a page; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30
# Pause between page requests, in seconds.
PAGE_DELAY_SECONDS = 30
# Send a browser-like User-Agent instead of the library default.
# NOTE(review): the site presumably rejects the default client string --
# confirm; adjust the value if pages still come back without reviews.
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
}

for page_num in range(1, NUM_PAGES + 1):
    print("souping page", page_num, ",", len(reviews), " data")
    url = base_url + str(page_num)
    try:
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
        # Turn HTTP error statuses into exceptions so that an error page is
        # reported instead of being parsed as if it held reviews.
        response.raise_for_status()
    except requests.RequestException as exc:
        print("failed to fetch page", page_num, ":", exc)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        review_divs = soup('div', 'a-section review')
        if not review_divs:
            # A successful response without review containers; possibly a
            # robot-check page or changed markup -- inspect response.text.
            print("no reviews found on page", page_num)
        for div in review_divs:
            try:
                reviews.append(reviews_info(div))
            except AttributeError as exc:
                # One review with unexpected markup must not abort the
                # whole multi-hour crawl; report it and move on.
                print("skipping malformed review on page", page_num, ":", exc)
    # Wait between pages regardless of the outcome to keep the request rate low.
    sleep(PAGE_DELAY_SECONDS)

###################################################
# Save dict data
if reviews:
    keys = reviews[0].keys()
    # newline="" lets the csv writer control line endings itself, so the
    # requested "\n" terminator is honoured and newlines inside review text
    # are written unchanged on every platform.
    with open('amazon_GRACE KARIN BoatNeck Sleeveless Vintage Tea Dress with Belt_review.csv', 'w', encoding="utf-8", newline="") as f:
        dict_writer = csv.DictWriter(f, delimiter=',', lineterminator='\n', fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(reviews)
else:
    # reviews[0] would raise IndexError when nothing was scraped.
    print("no reviews collected; nothing to save")