在Python中通过网络抓取添加数据

时间:2019-03-01 10:30:53

标签: python web-scraping

我有这个 Python 脚本,用于从 understat.com 获取 xG 值(特别感谢 @chitown88)。

我想在比赛中至少收到一张红牌的球队名称后面添加一个星号(*)。例如在 https://understat.com/match/9458 中,哈德斯菲尔德(Huddersfield)收到了红牌,因此希望在输出中其名称旁边带有 *,即 哈德斯菲尔德*(Huddersfield*)。

有什么想法吗?

这是我的python脚本:

# Scrape one understat.com match page and print the cumulative xG of both
# teams, minute by minute, as a wide table (one row per team).

# Standard library
import json
import re

# Third-party
import numpy as np
import pandas as pd
import requests

try:
    # pandas >= 1.0 exposes json_normalize at the top level.
    from pandas import json_normalize
except ImportError:
    # Fallback for older pandas, which only ships it under pandas.io.json.
    from pandas.io.json import json_normalize

MATCH_URL = 'https://understat.com/match/9458'


def _parse_embedded_json(html, var_name):
    """Return the object the page assigns to ``var_name`` via ``JSON.parse('...')``.

    Args:
        html: Full text of the match page.
        var_name: Name of the JavaScript variable to look for
            (e.g. ``'shotsData'``).

    Returns:
        The decoded JSON payload (a dict or list).

    Raises:
        ValueError: If no ``<var_name> = JSON.parse('...')`` assignment is
            found in ``html``.
    """
    # Raw string so that \s and \( reach the regex engine untouched.
    pattern = re.escape(var_name) + r"\s+=\s+JSON.parse\('([^']+)"
    found = re.search(pattern, html)
    if found is None:
        raise ValueError(
            "could not find '{}' in the page source".format(var_name))
    # Undo the backslash escapes of the captured literal before parsing it.
    decoded_string = bytes(found.group(1), 'utf-8').decode('unicode_escape')
    return json.loads(decoded_string)


response = requests.get(MATCH_URL, timeout=30)
# Fail early on an HTTP error instead of on a confusing regex miss later.
response.raise_for_status()

shotsObj = _parse_embedded_json(response.text, 'shotsData')
matchObj = _parse_embedded_json(response.text, 'match_info')
rostersObj = _parse_embedded_json(response.text, 'rostersData')


# Shots Data into a DataFrame ('a' = away side, 'h' = home side).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
away_shots_df = json_normalize(shotsObj['a'])
home_shots_df = json_normalize(shotsObj['h'])
shots_df = pd.concat([away_shots_df, home_shots_df], ignore_index=True)


# Rosters Data into a DataFrame: one row per entry of each side's roster.
away_rosters_df = pd.DataFrame(list(rostersObj['a'].values()))
home_rosters_df = pd.DataFrame(list(rostersObj['h'].values()))
rosters_df = pd.concat([away_rosters_df, home_rosters_df], ignore_index=True)

teams_dict = {'a': matchObj['team_a'], 'h': matchObj['team_h']}
match_title = matchObj['team_h'] + ' vs. ' + matchObj['team_a']


# Cumulative chart of xG from the shotsData

# Cast to numeric types so that sorting and summing behave numerically.
shots_df['minute'] = shots_df['minute'].astype(int)
shots_df['xG'] = shots_df['xG'].astype(float)

timing_chart_df = shots_df[['h_a', 'minute', 'xG']].sort_values('minute')
# Replace the 'a' / 'h' codes with the team names.
timing_chart_df['h_a'] = timing_chart_df['h_a'].map(teams_dict)

# Last minute in which a shot was recorded; upper bound of the minute grid.
max_value = timing_chart_df['minute'].max()

# Aggregate xG within the same minute
timing_chart_df = timing_chart_df.groupby(
    ['h_a', 'minute'], as_index=False)['xG'].sum()

# Full (team, minute) grid. Built from teams_dict rather than from the shot
# rows so that a team without any shot still gets an all-zero row.
min_idx = np.arange(max_value + 1)
m_idx = pd.MultiIndex.from_product(
    [sorted(teams_dict.values()), min_idx], names=['h_a', 'minute'])

# Minutes without a shot contribute 0 xG; then take the running sum per team.
timing_chart_df = (
    timing_chart_df.set_index(['h_a', 'minute'])
    .reindex(m_idx, fill_value=0)
    .reset_index()
)
timing_chart_df['running_sum_xG'] = (
    timing_chart_df.groupby('h_a')['xG'].cumsum())

# Wide layout: one row per team, one column per minute; the team-name column
# is headed by the match title.
timing_chart_T_df = timing_chart_df.pivot(
    index='h_a', columns='minute', values='running_sum_xG')
timing_chart_T_df = timing_chart_T_df.reset_index().rename(
    columns={timing_chart_T_df.index.name: match_title})

print (timing_chart_T_df.to_string())

1 个答案:

答案 0 :(得分:2)

再次见到你很高兴。

您可以简单地检查数据框中的红牌总数,然后把得到的标记拼接到球队名称后面。先为每支球队计算标记:

# A side is flagged with '*' when the red cards recorded in its roster sum to
# more than zero; otherwise its flag is the empty string.
away_red_cards = away_rosters_df['red_card'].astype(int).sum()
home_red_cards = home_rosters_df['red_card'].astype(int).sum()

a_red_card = '*' if away_red_cards > 0 else ''
h_red_card = '*' if home_red_cards > 0 else ''

然后在构建 teams_dict 时把标记拼接到球队名称后面,即:

# Map the 'a' / 'h' codes to team names, each followed by its red-card flag.
away_label = matchObj['team_a'] + a_red_card
home_label = matchObj['team_h'] + h_red_card
teams_dict = {
    'a': away_label,
    'h': home_label,
}

完整代码:

# Scrape one understat.com match page and build a one-row results table:
# match metadata followed by the minute-by-minute cumulative xG of the home
# and the away team. A team that received a red card gets '*' after its name.

# Standard library
import json
import re
from datetime import datetime

# Third-party
import numpy as np
import pandas as pd
import requests

try:
    # pandas >= 1.0 exposes json_normalize at the top level.
    from pandas import json_normalize
except ImportError:
    # Fallback for older pandas, which only ships it under pandas.io.json.
    from pandas.io.json import json_normalize

MATCH_URL = 'https://understat.com/match/9458'


def _parse_embedded_json(html, var_name):
    """Return the object the page assigns to ``var_name`` via ``JSON.parse('...')``.

    Args:
        html: Full text of the match page.
        var_name: Name of the JavaScript variable to look for
            (e.g. ``'shotsData'``).

    Returns:
        The decoded JSON payload (a dict or list).

    Raises:
        ValueError: If no ``<var_name> = JSON.parse('...')`` assignment is
            found in ``html``.
    """
    # Raw string so that \s and \( reach the regex engine untouched.
    pattern = re.escape(var_name) + r"\s+=\s+JSON.parse\('([^']+)"
    found = re.search(pattern, html)
    if found is None:
        raise ValueError(
            "could not find '{}' in the page source".format(var_name))
    # Undo the backslash escapes of the captured literal before parsing it.
    decoded_string = bytes(found.group(1), 'utf-8').decode('unicode_escape')
    return json.loads(decoded_string)


response = requests.get(MATCH_URL, timeout=30)
# Fail early on an HTTP error instead of on a confusing regex miss later.
response.raise_for_status()

shotsObj = _parse_embedded_json(response.text, 'shotsData')
matchObj = _parse_embedded_json(response.text, 'match_info')
rostersObj = _parse_embedded_json(response.text, 'rostersData')


# Shots Data into a DataFrame ('a' = away side, 'h' = home side).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
away_shots_df = json_normalize(shotsObj['a'])
home_shots_df = json_normalize(shotsObj['h'])
shots_df = pd.concat([away_shots_df, home_shots_df], ignore_index=True)


# Rosters Data into a DataFrame: one row per entry of each side's roster.
away_rosters_df = pd.DataFrame(list(rostersObj['a'].values()))
home_rosters_df = pd.DataFrame(list(rostersObj['h'].values()))
rosters_df = pd.concat([away_rosters_df, home_rosters_df], ignore_index=True)


# A side is flagged with '*' when the red cards recorded in its roster sum to
# more than zero; otherwise its flag is the empty string.
a_red_card = '*' if away_rosters_df['red_card'].astype(int).sum() > 0 else ''
h_red_card = '*' if home_rosters_df['red_card'].astype(int).sum() > 0 else ''

teams_dict = {
    'a': matchObj['team_a'] + a_red_card,
    'h': matchObj['team_h'] + h_red_card,
}
# The title keeps the plain team names (no red-card flag).
match_title = matchObj['team_h'] + ' vs. ' + matchObj['team_a']


#########################################################################
# Timing Chart is an aggregation (running sum) of xG from the shotsData
#########################################################################

# Cast to numeric types so that sorting and summing behave numerically.
shots_df['minute'] = shots_df['minute'].astype(int)
shots_df['xG'] = shots_df['xG'].astype(float)

timing_chart_df = shots_df[['h_a', 'minute', 'xG']].sort_values('minute')
# Replace the 'a' / 'h' codes with the (possibly flagged) team names.
timing_chart_df['h_a'] = timing_chart_df['h_a'].map(teams_dict)

# Last minute in which a shot was recorded; upper bound of the minute grid.
max_value = timing_chart_df['minute'].max()

# Aggregate xG within the same minute
timing_chart_df = timing_chart_df.groupby(
    ['h_a', 'minute'], as_index=False)['xG'].sum()

# Full (team, minute) grid. Built from teams_dict rather than from the shot
# rows so that a team without any shot still gets an all-zero row instead of
# breaking the per-team lookups further down.
min_idx = np.arange(max_value + 1)
m_idx = pd.MultiIndex.from_product(
    [sorted(teams_dict.values()), min_idx], names=['h_a', 'minute'])

# Minutes without a shot contribute 0 xG; then take the running sum per team.
timing_chart_df = (
    timing_chart_df.set_index(['h_a', 'minute'])
    .reindex(m_idx, fill_value=0)
    .reset_index()
)
timing_chart_df['running_sum_xG'] = (
    timing_chart_df.groupby('h_a')['xG'].cumsum())

# Wide layout: one row per team, one column per minute; the team-name column
# is headed by the match title.
timing_chart_T_df = timing_chart_df.pivot(
    index='h_a', columns='minute', values='running_sum_xG')
timing_chart_T_df = timing_chart_T_df.reset_index().rename(
    columns={timing_chart_T_df.index.name: match_title})


#########################################################################
# One-row results table: match metadata + running xG of both teams
#########################################################################
home_team = matchObj['team_h'] + h_red_card
away_team = matchObj['team_a'] + a_red_card

league = matchObj['league']
season = matchObj['season']
date = matchObj['date']
# Re-format the timestamp as e.g. 'Saturday, March 02, 2019'.
datetime_object = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
date = datetime_object.strftime('%A, %B %d, %Y')

# One-row frames holding each team's running xG, one column per minute.
home_xg_sum = timing_chart_df[timing_chart_df['h_a'] == home_team].pivot(
    index='h_a', columns='minute', values='running_sum_xG')
away_xg_sum = timing_chart_df[timing_chart_df['h_a'] == away_team].pivot(
    index='h_a', columns='minute', values='running_sum_xG')

# NOTE: the minute labels appear twice in `cols` (home values first, then
# away values), so results_df has duplicated column names by construction.
data = (
    [league, season, date, home_team, away_team]
    + home_xg_sum.values.tolist()[0]
    + away_xg_sum.values.tolist()[0]
)
cols = (
    ['League', 'Season', 'Date', 'Home team', 'Away team']
    + list(home_xg_sum.columns)
    + list(away_xg_sum.columns)
)

results_df = pd.DataFrame([data], columns = cols)