I want to extract the product ASIN from the page below. I can extract the other elements I need, but not the ASIN. The ASIN is stored in the 'data-asin' attribute in Amazon's HTML. I'd then like to print it out the same way as the other elements. Thanks in advance for your help.
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

# raw string so the backslashes in the Windows path are not treated as escapes
path = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(path)

def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.co.uk/s?k={}&ref=nb_sb_noss_2'
    search_term = search_term.replace(' ', '+')
    return template.format(search_term)

url = get_url('Ultrawide monitor')
print(url)
driver.get(url)

soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})

item = results[0]
atag = item.h2.a
description = atag.text.strip()
# note the '://' separator, and the same domain as the search template above
url = 'https://www.amazon.co.uk' + atag.get('href')
price_parent = item.find('span', 'a-price')
price = price_parent.find('span', 'a-offscreen').text
rating = item.i.text
review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text

print(description)
print(price)
print(rating)
print(review_count)
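
For the ASIN itself: since each 's-search-result' div carries the value in its data-asin attribute (as the question notes), it should be readable straight off item with BeautifulSoup's attribute access. A minimal sketch, assuming the attribute is present on the result div:

asin = item.get('data-asin')  # returns None if the attribute is missing
print(asin)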
Answer 0: (score: 1)
You can scrape the data directly from the JSON API (without BeautifulSoup), for example:
import requests
import pandas as pd

url = 'https://api-prod.footballindex.co.uk/football.allTradable24hrchanges?page={page}&per_page={per_page}&sort=asc'

page = 1
all_data = {}
while True:
    print('Processing page {}...'.format(page))
    data = requests.get(url.format(page=page, per_page=5000)).json()

    if data['count'] == 0:
        break

    # uncomment this to print all data:
    # from pprint import pprint
    # pprint(data)

    all_data.setdefault('id', []).extend(d['id'] for d in data['items'])
    all_data.setdefault('country', []).extend(d['country'] for d in data['items'])
    all_data.setdefault('nationalTeam', []).extend(d['nationalTeam'] for d in data['items'])
    all_data.setdefault('nationality', []).extend(d['nationality'] for d in data['items'])
    all_data.setdefault('team', []).extend(d['team'] for d in data['items'])
    all_data.setdefault('price', []).extend(d['price'] for d in data['items'])
    all_data.setdefault('scoreSell', []).extend(d['scoreSell'] for d in data['items'])
    all_data.setdefault('penceChange', []).extend(d['penceChange'] for d in data['items'])

    page += 1

df = pd.DataFrame(all_data)
print(df)
Prints:
...
Processing page 32...
Processing page 33...
id country nationalTeam nationality team price scoreSell penceChange
0 habib-diallo Senegal False Senegal Metz 1.19 0.71 0.09
1 sehrou-guirassy France False France Amiens 0.90 0.54 0.05
2 romain-del-castillo France False None Rennes 0.58 0.35 0.04
3 samuel-bastien Belgium False Belgium Standard Liège 0.57 0.34 0.04
4 jann-fiete-arp Germany False Germany FC Bayern München 1.43 0.86 0.03
... ... ... ... ... ... ... ... ...
3110 kieran-trippier England False England Atlético de Madrid 0.65 0.39 -0.01
3111 kevin-malcuit France False France Napoli 0.39 0.23 -0.01
3112 alen-halilovic Croatia False Croatia sc Heerenveen 0.36 0.22 -0.01
3113 bernardo-espinosa Colombia False Colombia Espanyol 0.18 0.11 -0.01
3114 johan-djourou Switzerland False Côte d'Ivoire Hamburger SV 0.12 0.07 -0.01
[3115 rows x 8 columns]
EDIT:

Screenshot of the URL in Firefox:
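
As an aside, the eight setdefault lines in the code above can be collapsed into a loop over the field names without changing behaviour:

fields = ['id', 'country', 'nationalTeam', 'nationality',
          'team', 'price', 'scoreSell', 'penceChange']
for field in fields:
    all_data.setdefault(field, []).extend(d[field] for d in data['items'])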
Answer 1: (score: 1)
Andrej's solution is correct, but you can change it slightly to read the total record count from the first response, fetch everything in one pass, and use json_normalize to build the DataFrame. Here is another way of doing it:
import requests
import math
from pandas.io.json import json_normalize

url = 'https://api-prod.footballindex.co.uk/football.allTradable24hrchanges'

per_page = 5000
page = 1
payload = {
    'page': '%s' % page,
    'per_page': '%s' % per_page,
    'sort': 'asc'}

print('Gathering page: %s' % page)
jsonData = requests.get(url, params=payload).json()

# the first response reports the total record count, so we can
# work out up front how many pages are left
total_pages = math.ceil(jsonData['total'] / per_page)

df = json_normalize(jsonData['items'])
cols = ['id', 'country', 'nationalTeam', 'nationality', 'team', 'price', 'scoreSell', 'penceChange']
df = df[cols]

if total_pages > 1:
    for page in range(2, total_pages + 1):
        print('Gathering page: %s' % page)
        payload = {
            'page': '%s' % page,
            'per_page': '%s' % per_page,
            'sort': 'asc'}
        jsonData = requests.get(url, params=payload).json()
        temp_df = json_normalize(jsonData['items'])
        df = df.append(temp_df[cols], sort=False).reset_index(drop=True)
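
One caveat: on pandas 1.0+ json_normalize is imported from the top level (pd.json_normalize), and DataFrame.append was removed in pandas 2.0. A minimal sketch of the same accumulation on a current pandas, assuming the same url, per_page and cols as above:

import math
import requests
import pandas as pd

url = 'https://api-prod.footballindex.co.uk/football.allTradable24hrchanges'
per_page = 5000
cols = ['id', 'country', 'nationalTeam', 'nationality', 'team',
        'price', 'scoreSell', 'penceChange']

# the first response reports the total record count
first = requests.get(url, params={'page': 1, 'per_page': per_page, 'sort': 'asc'}).json()
total_pages = math.ceil(first['total'] / per_page)

# collect one frame per page, then concatenate once at the end
frames = [pd.json_normalize(first['items'])[cols]]
for page in range(2, total_pages + 1):
    payload = {'page': page, 'per_page': per_page, 'sort': 'asc'}
    frames.append(pd.json_normalize(requests.get(url, params=payload).json()['items'])[cols])

df = pd.concat(frames, ignore_index=True)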
Answer 2: (score: 0)
Inspect the website. If the pages follow the pattern {https://www.footballindex.co.uk/players/1, https://www.footballindex.co.uk/players/2, https://www.footballindex.co.uk/players/3, ...}, iterate over all of them with the scraping code inside the loop:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

rows = []
for i in range(1, n):  # will iterate till page n-1
    # note the slash before the page number
    result = requests.get('https://www.footballindex.co.uk/players/' + str(i))
    src = result.content
    soup = BeautifulSoup(src, 'lxml')

    # the player data sits in a <script> tag as a JSON assignment
    players1 = soup.find("script").text
    players2 = players1.split('= ', 1)[1]
    players3 = json.loads(players2)

    # collect rows across pages instead of overwriting the frame each time
    rows.extend(
        [item['id'], item['country'], item['nationalTeam'],
         item['sector'], item['nationality'], item['team'],
         item['buyPrice'], item['sellPrice'], item['penceChange'],
         item['changePercent']]
        for item in players3['playersReducer']['players'])

df = pd.DataFrame(rows)
Something along those lines.
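
If you want named columns on that frame, pd.DataFrame accepts a columns= argument; a sketch using the same fields as the list comprehension above:

df = pd.DataFrame(rows, columns=[
    'id', 'country', 'nationalTeam', 'sector', 'nationality',
    'team', 'buyPrice', 'sellPrice', 'penceChange', 'changePercent'])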