我想从一个网站上抓取一些表格。我已经能够得到一个 DataFrame 列表,其中每个 DataFrame 对应网站上的一张表。但是,当我尝试把它们连接(concat)成一个 DataFrame 时,第一张表的值是正常的,而其他表除了列索引之外,所有的值都是 NaN。代码如下:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
# Download the page and parse the HTML.
res = requests.get("https://www.atptour.com/en/players/rafael-nadal/n409/fedex-atp-win-loss")
soup = BeautifulSoup(res.content, 'lxml')

# Collect every element carrying the "mega-table" class; each one is
# parsed into its own DataFrame below.
table = soup.find_all(class_="mega-table")
all_data = []

# First table: use column 0 as the index and keep only the rows that have
# at least 4 non-NaN values.
data = pd.read_html(str(table[0]), header=None, index_col=0)
data = data[0].dropna(axis=0, thresh=4)
all_data.append(data)

# Remaining tables: parse them the same way, then try to give them the
# same column labels as the first table.
for i in range(1, len(table)):
    data = pd.read_html(str(table[i]), header=None, index_col=0, skiprows=0)
    data = data[0].dropna(axis=0, thresh=4)
    # NOTE: this line is the cause of the NaN values described in the
    # question. Wrapping the Index in a list makes pandas build a one-level
    # MultiIndex, so these labels no longer match the plain Index of the
    # first frame and pd.concat cannot align the columns.
    # Fix: data.columns = all_data[0].columns
    data.columns = [all_data[0].columns]
    all_data.append(data)

print(all_data)
df = pd.concat(all_data)
print(df)
输出(原帖中为两张截图):DataFrame 列表(list of dataframes)和连接后的 DataFrame(concatenated dataframe)。
如能提供任何帮助,我将不胜感激。
答案 0 :(得分:0)
我稍后会编辑这个回答,补充具体的解释;目前这段代码仍然需要做一些修改。
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pprint
# Labels assigned positionally to the data columns of every table, so that
# all frames share identical column names before they are concatenated.
col_names = ['YTD W/L', 'YTD Fedex Index', 'Career W/L', 'Career Fedex Index', 'Titles']

res = requests.get("https://www.atptour.com/en/players/rafael-nadal/n409/fedex-atp-win-loss")
# Fail with a clear HTTP error instead of trying to parse an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, features='lxml')
all_tables = soup.find_all(class_='mega-table')

all_data = []
for curr_table in all_tables:
    # read_html returns a list of frames; the single table is element 0.
    curr_data = pd.read_html(str(curr_table), header=None, index_col=0)[0]
    # Drop the rows whose first data column contains only '-'.
    curr_data = curr_data[curr_data.iloc[:, 0] != '-']
    # Rename by position. A non-inplace rename is used because curr_data is
    # a filtered selection, and renaming it in place triggers pandas'
    # SettingWithCopyWarning.
    curr_data = curr_data.rename(columns=dict(zip(curr_data.columns, col_names)))
    # Nullable integer dtype: keeps 'Titles' integer-typed even when some
    # entries are missing.
    curr_data = curr_data.astype(dtype={'Titles': 'Int64'})
    all_data.append(curr_data)

# ignore_index=True discards the per-table index (taken from column 0 via
# index_col=0) and numbers the rows 0..n-1 across all tables.
res_df = pd.concat(all_data, sort=False, ignore_index=True)
pprint.pprint(res_df)
最终输出:
YTD W/L YTD Fedex Index Career W/L Career Fedex Index Titles
0 51 - 6 0.895 970 - 196 0.832 84
1 24 - 2 0.923 271 - 38 0.877 19
2 22 - 2 0.917 384 - 79 0.829 35
3 10 - 4 0.714 236 - 150 0.611 NaN
4 7 - 4 0.636 169 - 89 0.655 NaN
5 4 - 1 0.800 84 - 37 0.694 NaN
6 4 - 2 0.667 159 - 73 0.685 NaN
7 1 - 0 1.000 22 - 12 0.647 NaN
8 21 - 3 0.875 436 - 39 0.918 59
9 5 - 1 0.833 71 - 20 0.780 4
10 25 - 2 0.926 461 - 131 0.779 21
11 0 - 0 0.000 2 - 6 0.250 0
12 4 - 0 1.000 82 - 37 0.689 2
13 47 - 6 0.887 888 - 159 0.848 82
14 49 - 1 0.980 858 - 45 0.950 NaN
15 2 - 5 0.286 112 - 151 0.426 NaN
16 46 - 6 0.885 869 - 181 0.828 NaN
17 5 - 0 1.000 101 - 15 0.871 NaN