我希望能够将来自多个New Egg网页的数据网络抓取到单个csv文件中。
我目前能够将一个New Egg网页中的数据网络抓取到一个csv文件中,但是我希望一次对多个页面进行网络抓取。
from bs4 import BeautifulSoup
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48?PageSize=96'
my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-1?PageSize=96'
#opening up connection and grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs each product
containers = page_soup.findAll("div", {"class":"item-container"})
filename = "99FINAL.csv"
f = open(filename, "w")
headers = "Brand, Title, Shipping, Price\n"
f.write(headers)
# only for information, not used in url
page = 0
while True:
print('---', page, '---')
r = requests.get(my_url)
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for container in containers:
brand_container = container.findAll("a", {"class":"item-title"})
brand = brand_container[0].text
title_container = container.findAll("a", {"class":"item-title"})
title = title_container[0].text
shipping_container = container.findAll("li", {"class":"price-ship"})
shipping = shipping_container[0].text.strip()
price = container.findAll("li", {"class":"price-current"})
price = price[0]
pricing_container = price.findAll("strong")
pricing = pricing_container[0].text
cents_container = price.findAll("sup")
centing = cents_container[0].text
print("brand: " + brand.partition(' ')[0])
print("title: " + title)
print("shipping: " + shipping)
print("pricing: " + pricing)
print("centing: " + centing)
f.write(brand.partition(' ')[0] + "," + title.replace(",","|") + "," + shipping.partition(' ')[0] + "," + pricing.replace(",","")+centing + "\n")
break
# link to next page
next_page = soup.find("button", {"title": "Next"})
if next_page:
my_url = next_page.get("href")
page += 1
else:
break # exit `while True`for "Page" in my_url
f.close()
该算法已成功对第一页进行网页抓取,但是切换到下一页时存在问题。
我得到的错误是: “ MissingSchema:无效的URL'None':未提供模式。也许您是说http://None?”
答案 0 :(得分:1)
使用pandas
库
import requests
from bs4 import BeautifulSoup
import pandas as pd
page = 1
data = []
url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{}?PageSize=96'
while True:
# format website url with updated page number
my_url = url.format(str(page))
r = requests.get(my_url)
soup = BeautifulSoup(r.content, "html.parser")
containers = soup.find_all("div", {"class": "item-container"})
for container in containers:
brand_container = container.findAll("a", {"class":"item-title"})
brand = brand_container[0].text
title_container = container.findAll("a", {"class":"item-title"})
title = title_container[0].text
shipping_container = container.findAll("li", {"class":"price-ship"})
shipping = shipping_container[0].text.strip()
price = container.findAll("li", {"class":"price-current"})
price = price[0]
pricing_container = price.findAll("strong")
pricing = pricing_container[0].text
cents_container = price.findAll("sup")
centing = cents_container[0].text
print("brand: " + brand.partition(' ')[0])
print("title: " + title)
print("shipping: " + shipping)
print("pricing: " + pricing)
print("centing: " + centing)
product = {"brand":brand,"title":title,"shipping":shipping,"pricing":pricing,"centing":centing}
data.append(product)
break
next_page = soup.find("button", {"title": "Next"})
if next_page:
page += 1
else:
break
print(data)
df = pd.DataFrame(data)
print(df)
# it will save csv file in current project directory with product.csv file name
df.to_csv("product.csv")
报废前五项产品详细信息:
[{'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}]
如果不安装熊猫,请安装它:
pip3 install pandas
答案 1 :(得分:1)
您可以使用while
循环来保持抓取,直到加载带有禁用的“下一步”按钮的页面:
from bs4 import BeautifulSoup as soup
import requests, csv, re
def get_products(d):
full_price = [i.text for i in getattr(d.find('li', {'class':'price-current'}), 'find_all', lambda _:[])(re.compile('strong|sup'))]
return [getattr(d.find('a', {'class':'item-brand'}), 'img', {'title':'N/A'})['title'], d.find('a', {'class':'item-title'}).text, *full_price, d.find('li', {'class':'price-ship'}).text]
rows, count, header = [], 1, ['brand', 'title', 'pricing', 'centing', 'shipping']
while True:
d = soup(requests.get(f'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{count}?PageSize=96').text, 'html.parser')
rows.extend([get_products(i) for i in d.find_all('div', {'class':re.compile('item\-container')})])
if 'disabled' in d.find('button', {'title':'Next'}).attrs:
break
count += 1
with open('graphics_cards.csv', 'w') as f:
write = csv.writer(f)
write.writerows([header, *rows])
输出(前五个产品)
brand,title,pricing,centing,shipping
ASUS,ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card,509,.99,"
Free Shipping
"
MSI,MSI Radeon RX 580 DirectX 12 RX 580 ARMOR 8G OC 8GB 256-Bit GDDR5 PCI Express x16 HDCP Ready CrossFireX Support Video Card,194,.99,"
Free Shipping
"
GIGABYTE,"GIGABYTE GeForce GTX 1660 GAMING OC 6G Graphics Card, 3 x WINDFORCE Fans, 6GB 192-Bit GDDR5, GV-N1660GAMING OC-6GD Video Card",229,.99,"
Free Shipping
"
Sapphire Tech,SAPPHIRE NITRO+ Radeon RX Vega 64 DirectX 12 100410NT+SR 8GB 2048-Bit HBM2 PCI Express 3.0 Video Card,399,.99,"
Free Shipping
"
ASUS,ASUS Radeon RX 580 O4G Dual-fan OC Edition GDDR5 DP HDMI DVI VR Ready AMD Graphics Card (DUAL-RX580-O4G),189,.99,"
Free Shipping
"