我抓取了一个PS4游戏网站,得到了品牌、产品名称、运费信息和价格。我想要的输出是两个表,每个表有两列:第一个表的列是product_name和brand,第二个表的列是product_name和shipping。然后我想把这两个数据框合并起来。
#!pip install beautifulsoup4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
from collections import defaultdict
import re
url = 'https://www.newegg.com/PS4-Video-Games/SubCategory/ID-3141'
# NOTE: `filename` is not referenced below; the CSV is written to
# 'dataframe.csv'. Kept so any other code relying on the name still works.
filename = "products.csv"

# Shipping labels that carry no price and are recorded as a cost of 0.
# 'Special Shipping' probably deserves dedicated handling eventually.
ZERO_COST_LABELS = ('Free Shipping', 'Special Shipping')

# Upper bound (exclusive) used when printing cheap-shipping rows.
MAX_SHIPPING = 5.99


def parse_shipping_cost(text):
    """Convert a scraped shipping label into a numeric cost.

    Args:
        text: The shipping label (e.g. ``'$4.99 Shipping'``), or ``None``
            when the listing had no shipping element.

    Returns:
        ``0.0`` for 'Free Shipping' / 'Special Shipping', the first decimal
        number found in the label otherwise, and ``nan`` when the label is
        missing or contains no number.
    """
    if text is None:
        return float("nan")
    label = text.strip()
    if label in ZERO_COST_LABELS:
        return 0.0
    # Keep the decimal point: stripping every non-digit would turn "$4.99"
    # into 499. Thousands separators are dropped before matching.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", label.replace(",", ""))
    return float(match.group()) if match else float("nan")


def first_text(container, tag, css_class):
    """Return the text of the first ``tag`` with class ``css_class``.

    Returns ``None`` instead of raising when the container has no such
    element.
    """
    matches = container.findAll(tag, {"class": css_class})
    return matches[0].text if matches else None


# Guarded so that importing this module (for example to reuse
# parse_shipping_cost) does not trigger a network request. When run as a
# script the names below are still created at module level.
if __name__ == "__main__":
    with uReq(url) as uClient:
        page = uClient.read()

    # parsing
    page_soup = soup(page, "html.parser")

    # grabs products
    containers = page_soup.findAll("div", {"class": "item-container"})

    d = defaultdict(list)

    # fill dict: one entry per container in every list so columns stay aligned
    for container in containers:
        try:
            brand = container.div.div.a.img["title"]
        except (AttributeError, TypeError, KeyError):
            # Some part of the div > div > a > img[title] chain is absent.
            brand = None
        product_name = first_text(container, "a", "item-title")
        shipping = first_text(container, "li", "price-ship")

        d['brand'].append(brand)
        d['product'].append(product_name)
        d['shipping'].append(shipping.strip() if shipping is not None else None)

    # create dataframe; explicit columns keep the frame usable when the page
    # yielded no containers at all
    df = pd.DataFrame(d, columns=['brand', 'product', 'shipping'])

    # clean shipping column
    df['shipping'] = df['shipping'].apply(parse_shipping_cost).astype(float)

    # save dataframe to csv file
    df.to_csv('dataframe.csv', index=False)

    # choose rows where shipping is less than 5.99 (rows with an unknown,
    # NaN, shipping cost never satisfy the comparison)
    print(df[df['shipping'] < MAX_SHIPPING])