尝试自动进行某些数据抓取,但需要了解如何在抓取和导出之间合并数据

时间:2020-10-13 09:25:15

标签: python selenium beautifulsoup

代码如下:

import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

from selenium import webdriver

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# Start a Chrome browser session.
# NOTE(review): the positional string is presumably the path/name of a local
# chromedriver binary — confirm against the installed Selenium version.
driver = webdriver.Chrome("chromedriver 3")

# Team/player ids to visit and the round number requested for each of them.
player_ids = ['114']
rd_id = 1

# Iterate through player_ids; one CSV file is written per id (see the end of
# the loop body).

for i in player_ids:
    team_id = i
    # The team page URL is assembled as: base + team id + "?roundNo=" + round + "&holeNo=1".
    base_url = "https://bet365.apps.imgarena.com/golf/3.18.0/full/?eventId=183&language=en&options=eyJ2aWRlb1BsYXliYWNrRW5hYmxlZCI6ZmFsc2V9#/leaderboard/team/"
    mid_url = "?roundNo="
    end_url = "&holeNo=1"
    team_url = base_url + str(team_id) + mid_url + str(rd_id) + end_url

    driver.get(team_url)


    # Wait up to 10 s for the element at this XPath to exist in the DOM, then
    # click it.  NOTE(review): judging by the variable name this is the
    # "shots" tab/button — confirm on the live page.
    shotsbtn = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*[@id="root"]/div[1]/div[1]/div[1]/div[1]/div[3]/div[3]/span[1]')))
    shotsbtn.click()

    # Collected entries for the current player; reset for every team_id.
    results = []

#   Iterate through the holes.  NOTE: range(1, 3) yields only 1 and 2, not
#   holes 1-18.  The loop variable reuses the name `i` from the outer loop;
#   team_id was already captured above, so the outer value is not needed here.

    for i in range(1, 3):
        hole_id = i
        # XPath of the hole button: the hole number is spliced in as the index
        # of the last div[...] step.
        base_xpath = '//*[@id="root"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/div[1]/div['
        end_xpath = ']/span[1]'
        full_xpath = base_xpath + str(hole_id) + end_xpath

        # Random 2-5 s pause before interacting with the page.
        sleep(randint(2,5))

        holebtn = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH,full_xpath)))
        holebtn.click()

        # Random 2-5 s pause after the click, before the page source is read.
        sleep(randint(2,5))

        # Parse a snapshot of the current page HTML.
        content = driver.page_source
        soup = BeautifulSoup(content, "html.parser")

#   Collect the shot entries found on the page for this hole.

        for a in soup.findAll('div', {'class': 'text__42f569fe no-name__42f569fe'}):
            # NOTE(review): `a.find` is referenced but not called, so the bound
            # method object itself (not the element's text) is appended.
            name = a.find
            results.append(name)


#   Export everything collected for this player to player_<team_id>.csv
#   (single column named 'Player').

    df = pd.DataFrame({'Player': results})
    df.to_csv('player_' + str(team_id) + '.csv', index=False, encoding='utf-8')

我想做的是提取“球员姓名”和/或“team_id”以及每个“hole_id”,以便每一杆(shot)都能被唯一标识。

目前,我将结果导出为 .csv 文件:

目前的输出看起来像这样:

Shot 4: </span <span in the hole for Par</span</div
Shot 3: </span <span stroke 3, 41 yds to Green, 9 ft 6 in. left to pin</span</div
Shot 2: </span <span stroke 2, 135 yds to Rough, 43 yds left to pin</span</div
Shot 1: </span <span stroke 1, 281 yds to Rough, 177 yds left to pin</span</div

理想情况下,我希望它看起来像这样:

Hole 1, 114, 'Hatton, Tyrrell', Shot 4: </span <span in the hole for Par</span</div
Hole 1, 114, 'Hatton, Tyrrell', Shot 3: </span <span stroke 3, 41 yds to Green, 9 ft 6 in. left to pin</span</div
Hole 1, 114, 'Hatton, Tyrrell', Shot 2: </span <span stroke 2, 135 yds to Rough, 43 yds left to pin</span</div
Hole 1, 114, 'Hatton, Tyrrell', Shot 1: </span <span stroke 1, 281 yds to Rough, 177 yds left to pin</span</div

1 个答案:

答案 0 :(得分:1)

经过数小时的努力,我得出了最终的代码。在这里:

import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

from selenium import webdriver

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# Team/player ids to scrape and the round number requested for each of them.
player_ids = ['114', '115']
rd_id = 1

# Holes visited for every player.  range(1, 3) covers holes 1 and 2 only;
# widen it (e.g. range(1, 19)) to walk a full round.
hole_ids = range(1, 3)

# Column order of the exported CSV.
COLUMNS = ['Hole Number', 'Id', 'Player name', 'Shot number', 'Shot details']

BASE_URL = "https://bet365.apps.imgarena.com/golf/3.18.0/full/?eventId=183&language=en&options=eyJ2aWRlb1BsYXliYWNrRW5hYmxlZCI6ZmFsc2V9#/leaderboard/team/"
SHOTS_TAB_XPATH = '//*[@id="root"]/div[1]/div[1]/div[1]/div[1]/div[3]/div[3]/span[1]'
# The hole number is spliced in as the index of the last div[...] step.
HOLE_XPATH_TEMPLATE = '//*[@id="root"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/div[1]/div[{}]/span[1]'


def split_shot(text):
    """Split one shot line into ``(label, details)`` at the first colon.

    Splitting at the *first* colon keeps any later colon inside the details.
    Both parts are stripped of surrounding whitespace.  If there is no colon
    the whole text becomes the label and the details are empty.
    """
    label, _, details = text.partition(':')
    return label.strip(), details.strip()


def build_team_url(team_id, round_no):
    """Return the leaderboard URL for *team_id* in round *round_no* (hole 1)."""
    return BASE_URL + str(team_id) + "?roundNo=" + str(round_no) + "&holeNo=1"


def click_when_ready(driver, xpath, timeout):
    """Wait up to *timeout* seconds for the element at *xpath* to be clickable, then click it.

    Waiting for clickability (rather than mere presence in the DOM) avoids
    clicking an element that exists but cannot be interacted with yet.
    """
    button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    button.click()


def scrape_player(driver, team_id):
    """Return one row dict per shot for *team_id* over ``hole_ids``.

    A row of empty strings is appended after each hole so that holes are
    visually separated in the exported CSV.
    """
    rows = []
    driver.get(build_team_url(team_id, rd_id))
    click_when_ready(driver, SHOTS_TAB_XPATH, 10)

    for hole_id in hole_ids:
        # Random pauses around the click: before interacting, and again
        # before the page source is read.
        sleep(randint(2, 5))
        click_when_ready(driver, HOLE_XPATH_TEMPLATE.format(hole_id), 5)
        sleep(randint(2, 5))

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # NOTE(review): these class names carry hash-like suffixes and are
        # presumably regenerated when the site is rebuilt — verify before use.
        name = soup.find('div', class_="player-name__691c96bc").text
        hole_num = int(soup.find('span', class_="inner__-3c5fde4a active__-3c5fde4a").text)
        shot_items = soup.find('ul', class_="list__3ed248d4").find_all('li')

        for shot in shot_items:
            label, details = split_shot(shot.text)
            rows.append({
                'Hole Number': hole_num,
                'Id': team_id,
                'Player name': name,
                'Shot number': label,
                'Shot details': details,
            })

        # Blank separator row between holes.
        rows.append(dict.fromkeys(COLUMNS, ''))

    return rows


def main():
    """Scrape every player in ``player_ids`` and write all shots to Shots.csv."""
    driver = webdriver.Chrome()
    rows = []
    try:
        for team_id in player_ids:
            rows.extend(scrape_player(driver, team_id))
    finally:
        # quit() ends the whole browser session, even if scraping failed.
        driver.quit()

    df = pd.DataFrame(rows, columns=COLUMNS)
    df.to_csv('Shots.csv', index=False)


if __name__ == "__main__":
    main()

输出:

enter image description here

该代码几乎是不言自明的,因此我认为没有太多要解释的内容。希望这会有所帮助!