假设我正在从某个网址抓取评论,该页面包含我想要抓取的评论内容。那么,我怎样才能抓取后续所有页面上的评论?
我使用了以下代码,但结果仍然只包含第一页中的评论!
// Restrict every <input class="positive-numeric-only"> to positive decimal
// numbers: filter keystrokes, sanitise pasted text and normalise the value
// when the field loses focus.
$(document).ready(function() {
  // Matches anything that is NOT a digit, a dot or a comma.
  // The previous class `[^0-9^.^,]` also contained a literal `^`, so caret
  // characters were accepted as if they were numeric input.
  var DISALLOWED_FIRST = /[^0-9.,]/;
  var DISALLOWED_ALL = /[^0-9.,]/g;

  var $inputs = $("input.positive-numeric-only");

  $inputs.on("keydown", function(e) {
    var original = e.originalEvent;
    // Events without a string `key` cannot be classified; let them through
    // instead of throwing on `.replace`.
    if (!original || typeof original.key !== "string") {
      return;
    }
    // Only the FIRST disallowed character is removed (no `g` flag). A single
    // printable character therefore becomes "" and is blocked, while
    // multi-character key names such as "Backspace" stay non-empty and pass.
    var char = original.key.replace(DISALLOWED_FIRST, "");
    if (char.length == 0 && !(original.ctrlKey || original.metaKey)) {
      e.preventDefault();
    }
  });

  // `.on()` replaces the deprecated `.bind()`.
  $inputs.on("paste", function(e) {
    var numbers = e.originalEvent.clipboardData
      .getData("text")
      .replace(DISALLOWED_ALL, "");
    // The default paste is always cancelled; the field is only updated when
    // the cleaned text parses to a strictly positive number.
    e.preventDefault();
    var the_val = parseFloat(numbers);
    if (the_val > 0) {
      $(this).val(the_val.toFixed(2));
    }
  });

  $inputs.focusout(function(e) {
    // Numeric, non-empty content is shown as an absolute value with two
    // decimals; everything else is reset to 0.
    if (!isNaN(this.value) && this.value.length != 0) {
      this.value = Math.abs(parseFloat(this.value)).toFixed(2);
    } else {
      this.value = 0;
    }
  });
});
答案 0 :(得分:3)
服务器会在网址中(`.html` 之前的任意位置)加入 `-or5` 来表示第二页,加入 `-or10` 来表示第三页,依此类推。
您甚至可以省略网址中的单词(它们只是为了 SEO),只使用如下网址:
https://www.tripadvisor.com/g562819-d289642-or5.html
https://www.tripadvisor.com/g562819-d289642-or10.html
即可获取评论的后续页面。
from bs4 import BeautifulSoup
import requests
import re
#import webbrowser
def get_soup(url, timeout=30):
    """Fetch *url* through the shared session and parse the body as HTML.

    Uses the module-level ``requests.Session`` named ``s`` and sends a
    browser-like ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole scrape.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser``, or ``None``
        when the request fails or the status code is not 200. The reason is
        printed in both cases.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

    try:
        r = s.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # Report transport problems the same way as bad status codes so the
        # callers' existing "no response" handling takes over.
        print('request failed:', url, err)
        return None

    # Debug aid: write r.content to 'temp.html' and open it with webbrowser
    # to inspect exactly what the server returned.

    if r.status_code != 200:
        print('status code:', r.status_code)
        return None

    return BeautifulSoup(r.text, 'html.parser')
def parse(url, response, max_pages=None):
    """Walk every review page of one hotel and hand each to parse_reviews().

    The total review count is read from the first page, then one URL per
    page is built by inserting ``-or<offset>`` before ``.html``, with the
    offset growing in steps of 5.

    Args:
        url: Address of the hotel's first review page.
        response: Parsed first page as returned by ``get_soup``; a falsy
            value means the download failed.
        max_pages: Optional upper bound on the number of pages to fetch
            (useful while testing). ``None`` fetches all of them.

    Returns:
        None. Reviews are collected by ``parse_reviews``.
    """
    if not response:
        print('no response:', url)
        return

    # get number of reviews
    count_tag = response.find('span', class_='reviews_header_count')
    if count_tag is None:
        # find() returns None when the element is absent; bail out instead
        # of failing with "'NoneType' object has no attribute 'text'".
        print('no review count:', url)
        return

    # Keep only the digits, e.g. "(1,234)" -> "1234".
    digits = re.sub(r'\D', '', count_tag.text)
    if not digits:
        print('no review count:', url)
        return

    num_reviews = int(digits)
    print('num_reviews:', num_reviews, type(num_reviews))

    # create template for urls to pages with reviews
    template = url.replace('.html', '-or{}.html')
    print('template:', template)

    # load pages with reviews
    offset_step = 5
    for page, offset in enumerate(range(0, num_reviews, offset_step)):
        # No unconditional early exit here: every page is visited unless the
        # caller asked for fewer via max_pages.
        if max_pages is not None and page >= max_pages:
            break
        page_url = template.format(offset)
        print('url:', page_url)
        parse_reviews(page_url, get_soup(page_url))
def _text_or_none(tag):
    """Return ``tag.text``, or None when the lookup found no element."""
    return tag.text if tag is not None else None


def _bubble_rating(tag):
    """Return the second CSS class of *tag* minus its first 7 characters.

    Presumably this turns a class such as ``bubble_50`` into ``50``; verify
    against the live markup. None is returned when the element or its second
    class is missing.
    """
    if tag is None:
        return None
    classes = tag.get('class') or []
    return classes[1][7:] if len(classes) > 1 else None


def parse_reviews(url, response):
    """Extract every review on one page and append it to the global results.

    Each ``div.review-container`` becomes a dict that is appended to the
    module-level ``results`` list and printed field by field. A field whose
    element is missing from the page is stored as None, so one incomplete
    review no longer aborts the run with
    "'NoneType' object has no attribute 'text'".

    Args:
        url: Address of the page, used only for log output.
        response: Parsed page as returned by ``get_soup``; a falsy value
            means the download failed.

    Returns:
        None.
    """
    print('review:', url)

    if not response:
        print('no response:', url)
        return

    # The hotel name is page-wide, so look it up once rather than per review.
    hotel_name = _text_or_none(response.find('h1', class_='heading_title'))

    # get every review
    for review in response.find_all('div', class_='review-container'):
        date_tag = review.find('span', class_='relativeDate')
        rating_tag = review.select_one('div.reviewItemInline span.ui_bubble_rating')

        item = {
            'hotel_name': hotel_name,
            'review_title': _text_or_none(review.find('span', class_='noQuotes')),
            'review_body': _text_or_none(review.find('p', class_='partial_entry')),
            # The date is taken from the "title" attribute, not the text.
            'review_date': date_tag.get('title') if date_tag is not None else None,
            'num_reviews_reviewer': _text_or_none(review.find('span', class_='badgetext')),
            'reviewer_name': _text_or_none(review.find('span', class_='scrname')),
            'bubble_rating': _bubble_rating(rating_tag),
        }

        results.append(item)  # <--- add to global list

        for key, val in item.items():
            print(key, ':', val)
        print('----')
# --- main ---

# Imported before any scraping starts so that a missing pandas installation
# is reported immediately rather than after all pages have been downloaded.
import pandas as pd

start_urls = [
    'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

results = []  # <--- global list for items, filled by parse_reviews()

# `s` stays a module-level name because get_soup() reads it as a global.
# The `with` block closes the session once every start URL is processed.
with requests.Session() as s:
    for url in start_urls:
        parse(url, get_soup(url))

df = pd.DataFrame(results)  # <--- convert list to DataFrame
df.to_csv('output.csv')  # <--- save in file
答案 1 :(得分:-1)
@furas 为什么我在运行上述代码时会出现此错误:AttributeError: 'NoneType' object has no attribute 'text'?
AttributeError Traceback (most recent call last)
<ipython-input-15-40ebad987bb5> in <module>()
75 for url in start_urls:
-76 parse(url, get_soup(url))
<ipython-input-15-40ebad987bb5> in parse(url, response)
46 print('url:', url.format(offset))
47 url_ = url.format(offset)
48 parse_reviews(url_, get_soup(url_))
49 #return # for test only - to stop after first page
50
<ipython-input-15-40ebad987bb5> in parse_reviews(url, response)
62 'review_body': review.find('p', class_='partial_entry'),
63 'review_date': review.find('span', class_='ratingDate relativeDate'['title'],#.text,#[idx],
64 'num_reviews_reviewer': review.find('span',class_='badgetext').text,
65 'reviewer_name': review.find('span', class_='scrname').text,
66 'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
AttributeError: 'NoneType' object has no attribute 'text'