Better way to web scrape?

Time: 2018-08-09 03:53:44

Tags: python-3.x selenium web-scraping

I'm just curious if anyone can help me out with a problem. I'm currently scraping, starting at this link (https://www.basketball-reference.com/leagues/NBA_1981_games-october.html). I scrape all of the monthly "schedule" tables and then move on to the next year. From 1989 through 2001 (every month) the job completes successfully, in the format I expect. But my code is so brittle... I'm curious whether there is a better approach someone could walk me through, rather than pulling the schedule table in as one huge mass of text and then splicing it up to fit my needs. For example, here is my code:

from selenium import webdriver as wd
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
import os


chromeDriverPath = r'path of chromeDriver used by Selenium'
browser = wd.Chrome(executable_path= chromeDriverPath)

#Create the links needed
link ="https://www.basketball-reference.com/leagues/NBA_"
years = range(1989,2018,1)
months = ['october', 'november', 'december', 'january', 'february', 'march', 
          'april', 'may', 'june', 'july', 'august', 'september']

# tip-off time tokens to strip from each schedule row
hour_list = ['1:00','1:30', '1:40','2:00','2:30','3:00','3:30','4:00','4:30','5:00',
             '5:30','6:00','6:30','7:00','7:30','8:00','8:30','9:00','9:30',
             '10:00','10:30','11:00','11:30','12:00', '12:30','12:40'] 
ampm = ['pm', 'am']

def scrape(url):
    try:
        browser.get(url) 
        # wait up to 5 seconds for the "schedule" table to be present
        schedule = WebDriverWait(browser,5).until(EC.presence_of_all_elements_located((By.ID, "schedule")))
    except TimeoutException:
        print(str(url) + ' does not exist!')
        return       
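    # grab the table's visible text and split it into one line per game row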
    o_players = ''.join(el.text for el in schedule)
    o_players = o_players.splitlines()
    o_players = o_players[1:]
    o_players = [x.replace(',','') for x in o_players]
    o_players = [x.split(' ') for x in o_players]

    l0 = []
    l1 = []
    l2 = []
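    # bucket each row: keep the tokens before "at" or "Game", or the whole row otherwise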
    for x in o_players:
        if "at" in x:
            l1.append(x[:x.index("at")])
        elif 'Game' in x:
            l0.append(x[:x.index("Game")])
        else:
            l2.append(x)

    l3 = l1 + l2 + l0

    # iterate over a copy of each row: calling remove() while looping over
    # the same list skips the element right after every removal
    for x in l3:
        for y in x[:]:
            if y in hour_list or y in ampm:
                x.remove(y)

    # pull any overtime marker to the front of the row and pad the
    # remaining slots with 'N/A' placeholders
    ot = ['OT','2OT', '3OT', '4OT','5OT']
    for x in l3:
        x.insert(0,'N/A')
        if x[-1] != 'Score' and x[-1] not in ot:
            x.insert(1,x[-1])
        else:
            x.insert(1,'N/A')
        for y in ot:
            if y in x:
                x.remove('N/A')
                x.remove(y)
                x.insert(0,y) 
    l3 = [t for t in l3 if 'Playoffs' not in t]

    # rebuild multi-word team names from the space-split tokens; the slice
    # offsets depend on how many tokens each row produced, which is what
    # makes this block so brittle
    for x in l3:
        if len(x) == 17:
            x.insert(0,' '.join(x[6:9]))
            x.insert(1,' '.join(x[11:14]))
            x.insert(1, x[11])
            x.insert(3, x[16])

        if len(x) == 16 and x[-1] != 'Score':
            if x[8].isdigit():
                x.insert(0,' '.join(x[6:8]))
                x.insert(1,' '.join(x[10:13]))
                x.insert(1, x[10])
                x.insert(3, x[15])
            else:
                x.insert(0,' '.join(x[6:9]))
                x.insert(1,' '.join(x[11:13]))
                x.insert(1, x[11])
                x.insert(3, x[15])

        if len(x) == 16 and x[-1] == 'Score':
            x.insert(0,' '.join(x[6:9]))
            x.insert(1, ' '.join(x[11:14]))
            x.insert(1, x[11])
            x.insert(3, x[16])

        if len(x) == 15 and x[-1] != 'Score':
            x.insert(0,' '.join(x[6:8]))
            x.insert(1,' '.join(x[10:12]))
            x.insert(1, x[10])
            x.insert(3, x[14])

        if len(x) == 15 and x[-1] == 'Score':
            if x[8].isdigit():
                x.insert(0,' '.join(x[6:8]))
                x.insert(1,' '.join(x[10:13]))
                x.insert(1, x[10])
                x.insert(3, x[15])
            else:
                x.insert(0,' '.join(x[6:9]))
                x.insert(1,' '.join(x[11:13]))
                x.insert(1, x[11])
                x.insert(3, x[15])

        if len(x) == 14:
            x.insert(0,' '.join(x[6:8]))
            x.insert(1,' '.join(x[10:12]))
            x.insert(1, x[10])
            x.insert(3, x[14])
    # keep only the first ten rebuilt fields of each row
    l4 = [x[:10] for x in l3]

    #Working With Pandas to Standardize Data
    df = pd.DataFrame(l4)
    df['Date'] = df[7] + ' '+ df[8] + ', ' + df[9]
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=['Date'])
    headers =  ['Visitor', 'Visitor Points', 'Home', 'Home Points', 'OT', 
                'Attendance','Weekday', 'Month', 'Day', 'Year', 'Date' ]
    headers_order = ['Date', 'Weekday', 'Day', 'Month', 'Year', 'Visitor', 'Visitor Points',
                     'Home', 'Home Points', 'OT', 'Attendance']
    df.columns = headers
    df = df[headers_order]


    # append to the CSV, writing the header only when the file is first created
    file_exists = os.path.isfile("NBA_Scrape.csv")
    df.to_csv('NBA_Scrape.csv', mode='a', header=not file_exists, index=False)

for x in years:
    link0 = link + str(x) + '_games-'
    for y in months:
        final_link = link0 + y + '.html'
        scrape(final_link)

I believe my code starts returning errors at 2001, and I'd like to pick up from there. Please help me scrape better. I imagine there is a more fluent way to do this, like iterating through each element in the "schedule" table and appending each element to a separate list, or to separate pandas columns? Something like the sketch after this paragraph is what I have in mind. Please help me out.
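For instance, this untested sketch (it assumes each row's date sits in a th header cell and that the repeated header rows inside the table carry a class attribute, so they can be filtered out):

rows = browser.find_elements(By.CSS_SELECTOR, "#schedule tbody tr:not([class])")
records = []
for row in rows:
    # the date appears to live in the row's header cell
    date = row.find_element(By.TAG_NAME, "th").text
    # every remaining cell becomes its own field, one per table column
    cells = [td.text for td in row.find_elements(By.TAG_NAME, "td")]
    records.append([date] + cells)
games = pd.DataFrame(records)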

Thanks, Joe

1 answer:

Answer 0: (score: 1)

Your target is completely static, so there is no need to run Selenium at all. I'd suggest the Scrapy Python library. It is built for exactly this kind of web-scraping work and is an incredibly fast and flexible tool. With XPath you can extract each element from the page individually instead of treating the table as one huge mass of text.
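For example, here is a minimal sketch of such a spider (the XPath assumes the table keeps its id "schedule" and that the repeated header rows inside its tbody carry a class attribute; the output field names are just illustrative; run it with scrapy runspider):

import scrapy

class ScheduleSpider(scrapy.Spider):
    name = "nba_schedule"
    start_urls = [
        f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
        for year in range(1989, 2018)
        for month in ('october', 'november', 'december', 'january', 'february',
                      'march', 'april', 'may', 'june', 'july', 'august', 'september')
    ]

    def parse(self, response):
        # one item per data row; every cell is extracted separately via XPath
        for row in response.xpath('//table[@id="schedule"]/tbody/tr[not(@class)]'):
            yield {
                'date': row.xpath('./th//text()').extract_first(),
                'cells': row.xpath('./td//text()').extract(),
            }

Month pages that don't exist for a given season simply return 404, and Scrapy drops non-200 responses by default, so you don't need the TimeoutException bookkeeping either.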