我正在为一个大学项目抓取(scrape)一个网站,网址是:https://www.influenster.com/reviews/samsung-galaxy-s9
我想获取每位用户给该产品的评分。这些评分不是以文本形式呈现的,如下所示;我希望从标签内容中提取出数值 4。
我尝试了几种方法,但每次要么报错,要么取不到正确的数据:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# Asker's attempt: download the product page and print every "avg-stars" div.
# NOTE(review): stars_comb is never used in the lines shown here.
stars_comb=[]
# Send an explicit User-Agent header with the request.
req = Request('https://www.influenster.com/reviews/samsung-galaxy-s9', headers={'User-Agent': 'Mozilla/5.0'})
# Read the raw response body.
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')
# Collect every <div> whose class is "avg-stars".
ratings = soup.find_all('div', class_='avg-stars')
# This prints the matched tags themselves; no rating value is extracted from them.
print(ratings)
请帮帮我,我是编程和 Python 的新手。
答案 0 :(得分:1)
您需要遍历全部 10 页评论,同时忽略每页底部同样使用 avg-stars 类的另外 10 条其他产品的评论。
可以尝试下面的做法:先把范围缩小到仅针对 Samsung Galaxy S9 手机的评论,
再在其中查找 avg-stars 类:
from bs4 import BeautifulSoup
import requests
def main(last_page_num=10):
    """Print the star rating of every review on the first review pages.

    Each page is fetched from the ``review_page`` URL, the search is narrowed
    to the ``review-item-stars`` containers, and the ``data-stars`` attribute
    of the nested ``avg-stars`` div is collected for each one.

    Args:
        last_page_num: Number of the last review page to fetch (pages are
            numbered from 1). Defaults to 10.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If a collected ``data-stars`` value is not an integer.
    """
    all_review_stars = []
    base_url = 'https://www.influenster.com/reviews/samsung-galaxy-s9?review_page='

    for page_num in range(1, last_page_num + 1):
        page_link = base_url + str(page_num)
        page_response = requests.get(page_link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=5)
        # Fail loudly on an HTTP error instead of parsing an error page.
        page_response.raise_for_status()
        page_content = BeautifulSoup(page_response.content, "html.parser")

        # Restrict the search to the per-review containers before looking
        # for the "avg-stars" class inside each of them.
        reviews_stars_for_page = page_content.find_all("div", class_="review-item-stars")
        for review_stars in reviews_stars_for_page:
            stars_tag = review_stars.find("div", class_="avg-stars")
            if stars_tag is None:
                # Container without a nested rating div: nothing to collect.
                continue
            all_review_stars.append(stars_tag['data-stars'])

        print(f"Got stars for page {page_num}")

    print(f"Retrived the stars given from {len(all_review_stars)} reviews")
    # The attribute values are strings; convert them for numeric use.
    all_review_stars = list(map(int, all_review_stars))
    print(all_review_stars)


if __name__ == '__main__':
    main()
输出:
Got stars for page 1
Got stars for page 2
Got stars for page 3
Got stars for page 4
Got stars for page 5
Got stars for page 6
Got stars for page 7
Got stars for page 8
Got stars for page 9
Got stars for page 10
Retrived the stars given from 94 reviews
[5, 5, 5, 4, 5, 5, 5, 4, 3, 5, 3, 5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 5, 5, 3, 5, 5, 4, 5, 5, 4, 2, 5, 5, 3, 5, 5, 4, 5, 5, 5, 5, 5, 4, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 4, 4, 4, 2, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 4, 4, 5, 5, 4, 5]
答案 1 :(得分:0)
# Print the data-stars attribute of every <div class="avg-stars"> in `soup`.
# NOTE(review): `soup` is presumably the BeautifulSoup object built in the
# question's code; it is not defined in this snippet.
ratings = soup.find_all('div', class_='avg-stars')
for rating in ratings:
    # .get() yields None rather than raising when the attribute is absent.
    print(rating.get('data-stars'))
4.6063829787234
4.6063829787234
5
5
...
如果需要获取所有页面上的用户评分:
import math
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

# Fetch the first page to read the total number of reviews.
req = Request('https://www.influenster.com/reviews/samsung-galaxy-s9', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')

# The review count is read from the data-reviews-count attribute of the
# "product-highlights-results" div.
total_review = soup.find('div', class_='product-highlights-results').get('data-reviews-count')
per_page_review_limit = 10
# Round up so a partially filled last page is still visited.
total_pages = math.ceil(int(total_review) / per_page_review_limit)

for page in range(1, total_pages + 1):
    req = Request('https://www.influenster.com/reviews/samsung-galaxy-s9?review_page={}'.format(page), headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    print('stars from review_page: {}'.format(page))
    for star in soup.find_all('div', class_='review-item-stars'):
        # The first child <div> of each container holds the data-stars value.
        print(star.div['data-stars'])
答案 2 :(得分:0)
您可以从某个 script 标签内 JSON 里的 reviewCount 计算出总页数,然后循环抓取各页。下面的代码会打印所有页面上的评论文本和星级。
import requests
import re
from bs4 import BeautifulSoup as bs
import math
# Print (review text, stars) pairs for every review page of the product.
baseUrl = 'https://www.influenster.com/reviews/samsung-galaxy-s9?review_page={}'
url = 'https://www.influenster.com/reviews/samsung-galaxy-s9'
reviewsPerPage = 10
headers = {'User-Agent' : 'Mozilla/5.0'}

# One Session is used for the first request and all follow-up page requests.
with requests.Session() as s:
    r = s.get(url, headers = headers)
    soup = bs(r.content, 'lxml')

    # The total review count is taken from the "reviewCount" entry found in
    # the text of one of the page's <script> tags.
    reg = re.compile(r'"reviewCount": "(\d+)"')
    # `string=` is the current name of the filter formerly spelled `text=`.
    data = soup.find('script', string=reg).string
    numReviews = int(reg.findall(data)[0])
    # Round up so a partially filled last page is still visited.
    numPages = math.ceil(numReviews/reviewsPerPage)

    # First page: it has already been downloaded above.
    stars = [item['data-stars'] for item in soup.select('.review-item [data-stars]')]
    reviewText = [item.text.strip().replace('\xa0','') for item in soup.select('.review-text')]
    results = list(zip(reviewText,stars))
    print(results)

    # Remaining pages; the range is empty when there is only one page, so no
    # separate "more than one page" check is needed.
    for page in range(2, numPages + 1):
        r = s.get(baseUrl.format(page), headers = headers)
        soup = bs(r.content, 'lxml')
        # NOTE(review): these pages read the rating from itemprop="ratingValue"
        # rather than data-stars as on the first page -- confirm both
        # selectors yield exactly one value per review.
        stars = [item['content'] for item in soup.select('[itemprop="ratingValue"]')]
        reviewText = [item.text.strip().replace('\xa0','') for item in soup.select('.review-text')]
        results = list(zip(reviewText,stars))
        print(results)