我已经在这个问题上花了一段时间,但始终找不到答案,也没能自己弄明白。我正在从 Steam 抓取数据,需要把每个游戏支持的平台(例如 mac)转换成数字(字符串形式的数字)。例如,如果游戏支持 mac,它在我的列表中应显示为“1”;如果不支持,则应显示为“0”。我遇到的问题是:代码似乎只运行了一次,并把所有值都设置成了“1”。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import timedelta
import datetime
import time
import csv
# Scrape the first page of Steam's "specials" search and write one CSV line per
# product container found on it.
# NOTE(review): the indentation of this snippet has been lost, so which
# statements belong to which loop is an assumption throughout - confirm against
# the original script before running it.
my_url = 'https://store.steampowered.com/search/?specials=1&page=1'
# open the connection and download the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# parse the HTML
page_soup = soup(page_html, "html.parser")
# grab the product containers; each one is processed as a single product below
containers = page_soup.findAll("div", {"class":"responsive_search_name_combined"})
filename = "products.csv"
f = open(filename, "w", encoding='UTF-8')
# Header row: 11 column names, matching the 11 values concatenated in the
# f.write(...) call near the end of the script.
headers = "Titles, Release_date, Discount, Price before, Price after, Positive review, Reviewers, Win, Lin, Osx, Time \n"
f.write(headers)
#f.write(headers)
#len(containers)
#containers[1]
# Timestamp of this run; the same string is appended to every CSV line.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
for container in containers:
# Title: text of the first <span class="title"> in the container.
titles_container = container.findAll("span",{"class":"title"})
titl = titles_container[0].text
print(titl)
# Release date: text of the first <div class="search_released">.
product_container = container.findAll("div",{"class":"search_released"})
product_date = product_container[0].text
print(product_date)
# Discount: text of the first <div class="search_discount">.
product_discount_container = container.findAll("div",{"class":"search_discount"})
product_discount = product_discount_container[0].text
print(product_discount)
# Price before discount: the first two "digits + one non-word character"
# matches are concatenated. Indexing test[1] raises IndexError when the text
# yields fewer than two matches.
product_price_container_before = container.findAll("div",{"class":"search_price"})
product_price_before = product_price_container_before[0].text
test = re.findall('(\d+\W)',product_price_before)
testing = test[0] + test[1]
print(testing)
# Price after discount: the first nested <span> is removed from the element so
# that .text no longer contains it (presumably the original price - confirm).
product_price_container_after = container.findAll("div",{"class":"discounted"})
for product_price_after in product_price_container_after:
product_price_after.find("span").extract()
print(product_price_after.text)
# Reviews: parsed from the tooltip attribute. findall with two groups returns
# (percent, digits) tuples; a[0][0] and a[1][1] are written under the
# "Positive review" and "Reviewers" headers respectively.
product_review_container = container.findAll("span",{"class":"search_review_summary"})
for product_review in product_review_container:
prr = product_review.get('data-tooltip-html')
a = re.findall('(\d+%)|(\d+\d+)',prr)
c = a[1][1]
print(c)
# Platforms: the second CSS class of each platform icon is tested for a leading
# "w", "m" or "l". findall returns an empty list when there is no match.
product_platform_container = container.findAll("span",{"class":"platform_img"})
for product_platform in product_platform_container:
platform = product_platform.get('class')[1]
platt = re.findall('(\Aw)',platform)
plattt = re.findall('(\Am)',platform)
platttt = re.findall('(\Al)',platform)
print(platt)
print(plattt)
print(platttt)
# NOTE(review): macken is only assigned inside this loop. plattt is either
# ['m'] or [], so the elif branch can never run, and when plattt is empty the
# loop body is skipped and macken keeps the value from an earlier icon or
# product. This looks like the reason every row ends up with "1".
for p in plattt:
if "m" in p:
macken = "1"
elif "m" not in p:
macken = "0"
print(macken)
# NOTE(review): `y` is not defined anywhere in the visible code. By position,
# `y` lands under the "Win" header, macken under "Lin" and the literal "blah"
# under "Osx". titl is written unmodified, so a comma in a title shifts the
# columns of that line.
f.write(titl + "," + product_date.replace(",","") + "," + product_discount.replace("\n", "") + "," + testing.replace(",", ".") + "," + product_price_after.text.replace("\n","").replace(" ", "").replace(",",".").replace("\t\t\t\t\t\t\t","") + "," + a[0][0] + "," + c.replace(",","") + "," + y + "," + macken + "," + "blah" + "," + st + "\n")
f.close()
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# pandas 2.0; on_bad_lines='skip' is the replacement - confirm the pandas
# version in use.
pd.read_csv("products.csv", error_bad_lines=False)
我还把结果写入了 csv 文件。但写入之后,文件里显示的全是 1、1、1、1、1……
我正在从以下页面获取数据:'https://store.steampowered.com/search/?specials=1&page=1'
我知道这个问题有点令人困惑,因此希望您能提供帮助,如果您需要更多代码,请告诉我。
答案 0 :(得分:1)
Your statement was wrong; that is why you are always getting 1. See the code below!
import csv

import requests
from bs4 import BeautifulSoup

# Download and parse the first page of Steam's "specials" search.
req = requests.get('https://store.steampowered.com/search/?specials=1&page=1')
# Fail loudly on an HTTP error instead of silently writing an empty CSV.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

data = []
# Each matching <div> is handled as one game: the title span and the platform
# icon spans are looked up inside it.
for platform in soup.find_all('div', attrs={'class': 'col search_name ellipsis'}):
    title = platform.find('span', attrs={'class': 'title'}).text
    data.append({
        # Keep the title as str: encoding it to bytes here would make the csv
        # module write the literal b'...' representation under Python 3.
        'title': title,
        # A flag is '1' only when an icon span with that class exists inside
        # this game's own <div>, so nothing is carried over between games.
        'win': '1' if platform.find('span', attrs={'class': 'win'}) else '0',
        'mac': '1' if platform.find('span', attrs={'class': 'mac'}) else '0',
        'linux': '1' if platform.find('span', attrs={'class': 'linux'}) else '0',
    })

# newline='' is what the csv module expects for files it writes; the explicit
# encoding keeps non-ASCII titles intact regardless of the platform default.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['title', 'win', 'mac', 'linux']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
答案 1 :(得分:0)
我会这样做:
import csv
import re

# ...

# NOTE(review): `container` (one game's search result) and `f` (the opened
# output file) come from the surrounding script, which is not shown here.
# `rows` must be created once, before the loop over the games, so that rows
# accumulate across games.
rows = []

# Collect the platform class of every icon of this game first, so that the row
# describes the game as a whole rather than a single icon.
product_platform_container = container.findAll("span", {"class": "platform_img"})
platforms = [
    product_platform.get('class')[1]
    for product_platform in product_platform_container
]

# Raw strings keep "\A" (start of string) a regex escape instead of an invalid
# Python string escape. Each *_p list is non-empty only when the game has an
# icon whose class starts with that letter.
win_p = [p for p in platforms if re.findall(r'(\Aw)', p)]
mac_p = [p for p in platforms if re.findall(r'(\Am)', p)]
linux_p = [p for p in platforms if re.findall(r'(\Al)', p)]
print(win_p)
print(mac_p)
print(linux_p)

# Every flag is assigned for every game, so a missing platform is written as 0
# and no value can leak over from a previous game.
row = {
    "linux": 1 if linux_p else 0,
    "win": 1 if win_p else 0,
    "mac": 1 if mac_p else 0,
}
rows.append(row)

# After you parsed all entries...
fieldnames = ['mac', 'win', 'linux']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
说明:在用 `re` 识别出平台之后,我们创建 csv 行,其中 `mac`、`win` 和 `linux` 仅在它们对应的匹配结果(`mac_p`、`win_p` 和 `linux_p`)不为空时才为 1。`f` 是您已经打开的文件对象。
可以参考这篇文章,其中介绍了如何在 Python 中使用 csv 文件。