网页抓取:如何从动态页面获取信息?

时间:2018-11-23 02:19:21

标签: html json python-3.x web-scraping beautifulsoup

我是网页抓取的新手。我知道如何从HTML或JSON中获取数据,但遇到了一种不知道该如何处理的情况。我想获取此页面的投篮图(shot chart)中显示的圆点和X标记的位置。

http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart

我该怎么做?

1 个答案:

答案 0 :(得分:3)

我也是新手,但一直在学习。看起来这个页面是动态的,因此需要先用Selenium加载页面,再用BeautifulSoup解析HTML,从“Made Shots”和“Missed Shots”中获取x和y坐标。我试了一下,成功得到了一个包含x、y坐标以及“made”(命中)或“missed”(未命中)标记的数据框。

随后我把它画了出来,只是为了检查是否吻合,结果发现图形似乎沿x轴上下翻转了。我认为这是因为以这种方式绘制图形时,左上角是原点(0,0),所以在绘图时y坐标的方向是相反的。我也可能弄错了。

不管怎样,这是我使用的代码。

import pandas as pd
import bs4 
from selenium import webdriver 

# Start a Chrome session and load the shot-chart page.
# The driver path is a raw string so the Windows backslashes are taken
# literally; in a normal string "\c" is an invalid escape sequence.
# NOTE(review): newer Selenium releases may expect the driver path through a
# Service object instead of a positional argument -- confirm for your version.
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart')

# Take the page source from the browser (not a plain HTTP fetch) and parse it.
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')

# Shot markers are <svg> tags; the class attribute tells made shots from misses.
made_shots = soup.findAll("svg", {"class": "shot-hit icon icon-point clickable"})
missed_shots = soup.findAll("svg", {"class": "shot-miss icon icon-miss clickable"})

def get_coordiantes(element, label):
    """Collect the x/y position of every shot marker into a DataFrame.

    Args:
        element: Iterable of tag-like objects exposing ``.get('x')`` and
            ``.get('y')`` whose values can be converted with ``float``.
        label: Value written to the ``marker`` column of every row.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``marker``, one row per item
        of ``element`` in input order. An empty ``element`` yields an empty
        frame that still carries those three columns.

    Raises:
        TypeError: If an item has no ``x`` or ``y`` value (``float(None)``).
        ValueError: If an ``x`` or ``y`` value is not a valid number.
    """
    # Gather plain rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    rows = [
        [float(point.get('x')), float(point.get('y')), label]
        for point in element
    ]
    return pd.DataFrame(rows, columns=['x', 'y', 'marker'])

# Extract coordinates separately so every row carries its outcome label.
made_results = get_coordiantes(made_shots, 'made')
missed_results = get_coordiantes(missed_shots, 'missed')

# Stack both outcomes into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results = pd.concat([made_results, missed_results], ignore_index=True)

# Negate y so the scatter plot is not mirrored vertically; presumably the page
# measures y downward from the top-left corner.
results['y'] = results['y'] * -1

# quit() ends the whole WebDriver session; close() would only close the window.
driver.quit()

提供以下输出:

In [6]:results.head(5)
Out[6]: 
       x      y marker
0   33.0 -107.0   made
1  159.0 -160.0   made
2  143.0 -197.0   made
3   38.0 -113.0   made
4   65.0 -130.0   made

当我绘制它时:

import seaborn as sns
import numpy as np

# Add a colour column driven by the shot outcome: "made" rows are green,
# every other row is red.
value = results['marker'] == 'made'
results['color'] = np.where(value, "green", "red")

# Scatter plot without a regression line, using the per-row colours.
sns.regplot(
    x="x",
    y="y",
    data=results,
    fit_reg=False,
    scatter_kws={'facecolors': results['color']},
)

enter image description here

补充:我敢肯定还有更好、更高效、更简洁的写法。不过这只是我临时写出来的,应该足够让你上手了。你可以随意研究这段代码并对照查看HTML源代码,了解它是如何抓取各种数据的。玩得开心。

import pandas as pd
import bs4 
from selenium import webdriver 

# Start a Chrome session and load the shot-chart page.
# The driver path is a raw string so the Windows backslashes are taken
# literally; in a normal string "\c" is an invalid escape sequence.
# NOTE(review): newer Selenium releases may expect the driver path through a
# Service object instead of a positional argument -- confirm for your version.
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart')

# Take the page source from the browser (not a plain HTTP fetch) and parse it.
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')


###############################################################################

# Every shot is a <g class="shot-item"> element: its data-* attributes describe
# the play and its nested <svg> holds the marker position.
shots = soup.findAll("g", {"class": "shot-item"})

# Collect plain rows and build the frame once. DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
shot_rows = []
for point in shots:
    marker = point.find('svg')  # looked up once, used for both coordinates
    shot_rows.append([
        point.get('data-play-by-play-action-hit'),
        point.get('data-play-by-play-action-id'),
        point.get('data-play-by-play-action-period'),
        point.get('data-play-by-play-action-player-id'),
        point.get('data-play-by-play-action-team-id'),
        float(marker.get('x')),
        float(marker.get('y')),
    ])

# Fixed column list, so the frame has its schema even when no shot was found.
results = pd.DataFrame(
    shot_rows,
    columns=['hit', 'action_id', 'period', 'player_id', 'team_id', 'x', 'y'],
)

# Negate y so the scatter plot is not mirrored vertically; presumably the page
# measures y downward from the top-left corner.
results['y'] = results['y'] * -1



###############################################################################

# Build a player_id -> player_name lookup table from the <label> elements.
player_ids = soup.findAll('label', {"class": "item-label"})

# Collect plain rows and build the frame once. DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
player_rows = []
for player in player_ids:
    input_tag = player.find('input')
    # Skip labels that hold no <input> at all instead of failing on None.
    if input_tag is None:
        continue

    player_id = input_tag.get('data-play-by-play-action-player-id')
    # Labels whose input carries no player id are not player entries.
    if player_id is None:
        continue

    player_rows.append([player_id, player.find('span').text])

# Fixed column list, so the later merge on 'player_id' works even with no rows.
players = pd.DataFrame(player_rows, columns=['player_id', 'player_name'])

###############################################################################

# Build a two-row team_id -> team_name lookup table from the first
# "header-scores_desktop" block, which holds one container per team.
team_ids = soup.findAll('div', {"class": "header-scores_desktop"})
scoreboard = team_ids[0]
teams_A = scoreboard.find('div', {"class": "team-A"})
teams_B = scoreboard.find('div', {"class": "team-B"})

# The team id is taken from the last path segment of each team's <img> src.
team_id_A = teams_A.find('img').get('src').rsplit('/', 1)[-1]
team_id_B = teams_B.find('img').get('src').rsplit('/', 1)[-1]

# The team name is the text of the first <span> in each container.
team_name_A = teams_A.find('span').text
team_name_B = teams_B.find('span').text

team_rows = [
    [team_id_A, team_name_A],
    [team_id_B, team_name_B],
]
teams = pd.DataFrame(team_rows, columns=['team_id', 'team_name']).reset_index(drop=True)

###############################################################################

# Build one row per play-by-play overlay: clock, player, result and score.
action_ids = soup.findAll('div', {"class": "overlay-wrapper"})

# Both score columns use the same "<team name>_score" pattern; the original
# used "_score" for team A but " score" for team B.
action_columns = [
    'action_id', 'time_remaining', 'full_name', 'result_of_action',
    team_name_A + '_score', team_name_B + '_score', 'action-description',
]

# Collect plain rows and build the frame once. DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
action_rows = []
for action in action_ids:
    action_id = action.get('data-play-by-play-action-id')

    # All detail spans are searched inside the overlay's first <div>.
    details = action.find('div')
    time_remaining = details.find('span', {'class': 'time'}).text
    full_name = details.find('span', {'class': 'athlete-name'}).text

    # Overlays without an action-code span are recorded as '+0'.
    code_tag = details.find('span', {'class': 'action-code'})
    result_of_action = '+0' if code_tag is None else code_tag.text

    action_description = details.find('span', {'class': 'action-description'}).text

    team_A_score = details.find('span', {'class': 'team-A'}).text
    team_B_score = details.find('span', {'class': 'team-B'}).text

    action_rows.append([
        action_id, time_remaining, full_name, result_of_action,
        team_A_score, team_B_score, action_description,
    ])

# Fixed column list, so the later merge on 'action_id' works even with no rows.
actions = pd.DataFrame(action_rows, columns=action_columns)


###############################################################################

# Enrich each shot with player name, team name and play-by-play details.
# Left joins keep every shot row even when a lookup table has no match.
results = pd.merge(results, players, how='left', on='player_id')
results = pd.merge(results, teams, how='left', on='team_id')
results = pd.merge(results, actions, how='left', on='action_id')

# quit() ends the whole WebDriver session; close() would only close the window.
driver.quit()

为了稍微整理一下,你可以对行进行排序,使它们按比赛进程(play-by-play)从头到尾依次排列:

# Order the shots by period (ascending), then by time remaining (descending),
# and renumber the rows 0..n-1.
# NOTE(review): both keys appear to be scraped strings, so the ordering is
# presumably lexical rather than numeric -- confirm it matches game order.
results = (
    results
    .sort_values(['period', 'time_remaining'], ascending=[True, False])
    .reset_index(drop=True)
)