我正在尝试使用Python将多个网页中的数据导入到数据表中。 基本上,自2000年以来,我一直在尝试下载某些团队的出勤数据。
这是我到目前为止所拥有的:
import requests
import pandas as pd
import numpy as np
#What is the effect of a rival team's performance on a team's attendance
Teams = ['LAA', 'LAD', 'NYY', 'NYM', 'CHC', 'CHW', 'OAK', 'SFG']
Years = []
for year in range(2000,2020):
Years.append(str(year))
bbattend = pd.DataFrame(columns=['GM_Num','Date','Team','Home','Opp','W/L','R','RA','Inn','W-L','Rank','GB','Time','D/N','Attendance','Streak','Game_Win','Wins','Losses','Net_Wins'])
for team in Teams:
for year in Years:
url = 'https://www.baseball-reference.com/teams/' + team + '/' + year +'-schedule-scores.shtml'
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[-1]
#Formatting data table
df.rename(columns={"Gm#": "GM_Num", "Unnamed: 4": "Home", "Tm": "Team", "D/N": "Night"}, inplace = True)
df['Home'] = df['Home'].apply(lambda x: 0 if x == '@' else 1)
df['Game_Win'] = df['W/L'].astype(str).str[0]
df['Game_Win'] = df['Game_Win'].apply(lambda x: 0 if x == 'L' else 1)
df['Night'] = df['Night'].apply(lambda x: 1 if x == 'N' else 0)
df['Streak'] = df['Streak'].apply(lambda x: -1*len(x) if '-' in x else len(x))
df.drop('Unnamed: 2', axis=1, inplace = True)
df.drop('Orig. Scheduled', axis=1, inplace = True)
df.drop('Win', axis=1, inplace = True)
df.drop('Loss', axis=1, inplace = True)
df.drop('Save', axis=1, inplace = True)
#Drop rows that do not have data
df = df[df['GM_Num'].str.isdigit()]
WL = df["W-L"].str.split("-", n = 1, expand = True)
df["Wins"] = WL[0].astype(dtype=np.int64)
df["Losses"] = WL[1].astype(dtype=np.int64)
df['Net_Wins'] = df['Wins'] - df['Losses']
bbattend.append(df)
bbattend
当我通过使用特定链接而不是尝试使用串联来创建url来分别在循环中执行操作时,这似乎起作用。
但是,使用此代码,我得到了错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-77-997e6aeea77e> in <module>
16 url = 'https://www.baseball-reference.com/teams/' + team + '/' + year +'-schedule-scores.shtml'
17 html = requests.get(url).content
---> 18 df_list = pd.read_html(html)
19 df = df_list[-1]
20 #Formatting data table
~/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1092 decimal=decimal, converters=converters, na_values=na_values,
1093 keep_default_na=keep_default_na,
-> 1094 displayed_only=displayed_only)
~/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
914 break
915 else:
--> 916 raise_with_traceback(retained)
917
918 ret = []
~/anaconda3/lib/python3.7/site-packages/pandas/compat/__init__.py in raise_with_traceback(exc, traceback)
418 if traceback == Ellipsis:
419 _, _, traceback = sys.exc_info()
--> 420 raise exc.with_traceback(traceback)
421 else:
422 # this version of raise is a syntax error in Python 3
ValueError: No tables found
我不太明白错误消息在说什么。 我将不胜感激!