我不喜欢漂亮的汤,我正尝试从this网站提取数据。
import bs4
import requests as re
import pandas as pd
class roto_PlayerStats:
class roto_Player:
def __init__(self):
self.name = ""
self.team = ""
self.pos = ""
self.salary = 0
self.minutes = 0
self.reb = 0
self.ast = 0
self.stl = 0
self.blk = 0
self.to = 0
self.pts = 0
self.usg = 0
self.fpts = 0
def __init__(self):
self.players =[]
def load-data(self):
response = re.get("https://rotogrinders.com/game-stats/nba-player?site=draftkings&range=season")
soup = BeautifulSoup(response.content, "html.parser")
for x in soup.find_all('"id'):
#code to load the individual data?
来自的数据位于数组中,结构类似于以下内容。这是加载单个玩家数据的正确方法吗?
$(document).ready(function() {
var data = [{"id":915,"player":"J.R. Smith","team":"CLE","pos":"SHW","salary":null,"opp":"N\/A","gp":8,"min":"150.00","fgm":18,"fga":51,"ftm":8,"fta":8,"3pm":9,"3pa":27,"reb":13,"ast":13,"stl":10,"blk":2,"to":9,"pts":53,"usg":"18.08","pace":64,"fpts":"115.10"}, {}...]
来自同一域的其他网页的新功能
class grinder_Team:
def __init__(self):
self.name = ""
self.gp = 0
self.minutes = 0
self.reb = 0
self.ast = 0
self.stl = 0
self.blk = 0
self.to = 0
self.pts = 0
self.pace = 0
self.fpts = 0
class grinder_TeamStats:
def __init__(self):
self.teams = []
response = requests.get("https://rotogrinders.com/team-stats/nba-earned?site=draftkings&range=season")
soup = BeautifulSoup(response.content, 'html.parser')
proj_stats = soup.find('section', {'class': 'pag bdy'})
script = proj_stats.find('script')
data = re.search(r"data\s*=\s*(.*);", script.text).group(1)
stats = json.loads(data)
for team in stats:
# do x
print ("finished")
我在这条线上出现错误
data = re.search(r"data\s*=\s*(.*);", script.text).group(1)
说
AttributeError: 'NoneType' object has no attribute 'group'
我不确定为什么会这样,因为然后我为两个链接打印了脚本和script.text变量,它们的输出非常相似。
答案 0 :(得分:0)
响应中的数据在JavaScript <script>
标记中。加载数据的一种方法是使用regex
在脚本中找到var data
并将其加载到json
对象中:
import bs4
import requests
import pandas as pd
import re
proj_stats = soup.find('div', {'id': 'proj-stats'})
script = proj_stats.find('script')
data = re.search(r"data\s*=\s*(.*);", script.text).group(1)
stats = json.loads(data)
现在,您有一个词典列表,其中包含可以存储到玩家类别中的每个玩家的统计信息:
$ print(stats)
[{u'gp': 8, u'pos': u'SHW', u'player': u'J.R. Smith', u'pts': 53, u'id': 915, u'usg': u'18.08', u'min': u'150.00', u'fta': 8, u'to': 9, u'blk': 2, u'reb': 13, u'ftm': 8, u'opp': u'N/A', u'ast': 13, u'fgm': 18, u'3pm': 9, u'3pa': 27, u'fga': 51, u'salary': None, u'fpts': u'117.25', u'stl': 10, u'pace': 64, u'team': u'CLE'}, ...]
请注意:请勿import requests as re
,因为re
将覆盖re
正则表达式模块的导入。
答案 1 :(得分:0)
该脚本没有分号;
,将其从正则表达式中删除,您可以剪切BeautifulSoup并直接使用re
response = requests.get("https://rotogrinders.com/team-stats/nba-earned?site=draftkings&range=season")
data = re.search(r"data\s*=\s*(.*)", response.content).group(1)
stats = json.loads(data)