I'm just curious whether anyone can help me with a problem. I'm currently scraping, starting at this link (https://www.basketball-reference.com/leagues/NBA_1981_games-october.html). I scrape the entire "schedule" table for each month and then move on to the next year. From 1989 through 2001 (every month) the job completes successfully and in the format I expect, but my code is so brittle... I'm curious whether there's a better approach someone could explain to me, rather than pulling the schedule table in as one huge block of text and then splicing it apart to get what I need. For example, here is my code:
from selenium import webdriver as wd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
import os

chromeDriverPath = r'path of chromeDriver used by Selenium'
browser = wd.Chrome(executable_path=chromeDriverPath)

# Create the links needed
link = "https://www.basketball-reference.com/leagues/NBA_"
years = range(1989, 2018, 1)
months = ['october', 'november', 'december', 'january', 'february', 'march',
          'april', 'may', 'june', 'july', 'august', 'september']
hour_list = ['1:00', '1:30', '1:40', '2:00', '2:30', '3:00', '3:30', '4:00',
             '4:30', '5:00', '5:30', '6:00', '6:30', '7:00', '7:30', '8:00',
             '8:30', '9:00', '9:30', '10:00', '10:30', '11:00', '11:30',
             '12:00', '12:30', '12:40']
ampm = ['pm', 'am']

def scrape(url):
    try:
        browser.get(url)
        schedule = WebDriverWait(browser, 5).until(
            EC.presence_of_all_elements_located((By.ID, "schedule")))
    except TimeoutException:
        print(str(url) + ' does not exist!')
        return
    # The table comes back as one block of text: split it into lines,
    # drop the header line, then split each line into tokens
    o_players = [schedule[i].text for i in range(0, len(schedule))]
    o_players = ''.join(o_players)
    o_players = o_players.splitlines()
    o_players = o_players[1:]
    o_players = [x.replace(',', '') for x in o_players]
    o_players = [x.split(' ') for x in o_players]
    # Bucket the rows by whether they contain an 'at' or a 'Game' token
    l0 = []
    l1 = []
    l2 = []
    for x in o_players:
        if "at" in x:
            l1.append(x[:x.index("at")])
        elif 'Game' in x:
            l0.append(x[:x.index("Game")])
        else:
            l2.append(x)
    l3 = l1 + l2 + l0
    # Drop tip-off times and am/pm markers
    for x in l3:
        for y in x:
            if y in hour_list:
                x.remove(y)
        for t in x:
            if t in ampm:
                x.remove(t)
    # Move any overtime marker to the front; pad missing fields with 'N/A'
    ot = ['OT', '2OT', '3OT', '4OT', '5OT']
    for x in l3:
        x.insert(0, 'N/A')
        if x[-1] != 'Score' and x[-1] not in ot:
            x.insert(1, x[-1])
        else:
            x.insert(1, 'N/A')
        for y in ot:
            if y in x:
                x.remove('N/A')
                x.remove(y)
                x.insert(0, y)
    l3 = [t for t in l3 if 'Playoffs' not in t]
    # Re-order the fields depending on how many tokens each row split into
    for x in l3:
        if len(x) == 17:
            x.insert(0, ' '.join(x[6:9]))
            x.insert(1, ' '.join(x[11:14]))
            x.insert(1, x[11])
            x.insert(3, x[16])
        if len(x) == 16 and x[-1] != 'Score':
            if x[8].isdigit():
                x.insert(0, ' '.join(x[6:8]))
                x.insert(1, ' '.join(x[10:13]))
                x.insert(1, x[10])
                x.insert(3, x[15])
            else:
                x.insert(0, ' '.join(x[6:9]))
                x.insert(1, ' '.join(x[11:13]))
                x.insert(1, x[11])
                x.insert(3, x[15])
        if len(x) == 16 and x[-1] == 'Score':
            x.insert(0, ' '.join(x[6:9]))
            x.insert(1, ' '.join(x[11:14]))
            x.insert(1, x[11])
            x.insert(3, x[16])
        if len(x) == 15 and x[-1] != 'Score':
            x.insert(0, ' '.join(x[6:8]))
            x.insert(1, ' '.join(x[10:12]))
            x.insert(1, x[10])
            x.insert(3, x[14])
        if len(x) == 15 and x[-1] == 'Score':
            if x[8].isdigit():
                x.insert(0, ' '.join(x[6:8]))
                x.insert(1, ' '.join(x[10:13]))
                x.insert(1, x[10])
                x.insert(3, x[15])
            else:
                x.insert(0, ' '.join(x[6:9]))
                x.insert(1, ' '.join(x[11:13]))
                x.insert(1, x[11])
                x.insert(3, x[15])
        if len(x) == 14:
            x.insert(0, ' '.join(x[6:8]))
            x.insert(1, ' '.join(x[10:12]))
            x.insert(1, x[10])
            x.insert(3, x[14])
    # Keep only the first 10 fields of every row
    l4 = []
    for x in l3:
        x = x[:10]
        l4.append(x)
    # Working with pandas to standardize the data
    df = pd.DataFrame(l4)
    df['Date'] = df[7] + ' ' + df[8] + ', ' + df[9]
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=['Date'])
    headers = ['Visitor', 'Visitor Points', 'Home', 'Home Points', 'OT',
               'Attendance', 'Weekday', 'Month', 'Day', 'Year', 'Date']
    headers_order = ['Date', 'Weekday', 'Day', 'Month', 'Year', 'Visitor',
                     'Visitor Points', 'Home', 'Home Points', 'OT', 'Attendance']
    df.columns = headers
    df = df[headers_order]
    # Append to one CSV, writing the header only on the first pass
    file_exists = os.path.isfile("NBA_Scrape.csv")
    if not file_exists:
        df.to_csv('NBA_Scrape.csv', mode='a', header=True, index=False)
    else:
        df.to_csv('NBA_Scrape.csv', mode='a', header=False, index=False)

for x in years:
    link0 = link + str(x) + '_games-'
    for y in months:
        final_links = link0 + str(y) + '.html'
        scrape(final_links)
I believe my code starts returning errors at 2001, and I'd like to fix that going forward. Please help me scrape this better. I imagine there's a more proficient way, for example iterating over each element in the "schedule" table and appending each one to a separate list, or to separate pandas columns, along the lines of the sketch below. Any help is appreciated.
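To show the kind of thing I mean by not treating the table as one mass of text, here's a minimal sketch of the direction I'm picturing (untested on my end; it assumes the schedule table is the first <table> on the page and that pandas has lxml or html5lib available to parse it):

import pandas as pd

# Let the HTML parser build the columns instead of me splitting one big string.
# Assumes the schedule table is the first <table> on the page.
url = 'https://www.basketball-reference.com/leagues/NBA_1989_games-october.html'
tables = pd.read_html(url)   # one DataFrame per <table> found on the page
schedule = tables[0]
print(schedule.head())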
Thanks, Joe
Answer (score: 1)
Your target is completely static, so there is no need to run Selenium at all. I'd suggest the Scrapy Python library: it's built for exactly this kind of web-scraping job and is an incredibly fast and flexible tool. With XPath you can extract every element from the page individually instead of treating the table as one huge block of text.
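For instance, here is a rough standalone sketch of what that could look like. It's an untested outline: the data-stat attribute names and the tbody/tr structure reflect my reading of basketball-reference's markup, so verify them against the live HTML, and the FEEDS setting requires Scrapy 2.1 or newer.

import scrapy
from scrapy.crawler import CrawlerProcess

MONTHS = ['october', 'november', 'december', 'january', 'february', 'march',
          'april', 'may', 'june', 'july', 'august', 'september']

class ScheduleSpider(scrapy.Spider):
    name = 'nba_schedule'
    # one page per year/month, the same scheme as your original loop
    start_urls = [
        'https://www.basketball-reference.com/leagues/NBA_%d_games-%s.html'
        % (year, month)
        for year in range(1989, 2018)
        for month in MONTHS
    ]

    def parse(self, response):
        # each <tr> in the schedule table is one game; header rows are skipped
        rows = response.xpath(
            '//table[@id="schedule"]/tbody/tr[not(contains(@class, "thead"))]')
        for row in rows:
            yield {
                'date': row.xpath('.//th[@data-stat="date_game"]//text()').get(),
                'visitor': row.xpath('.//td[@data-stat="visitor_team_name"]//text()').get(),
                'visitor_pts': row.xpath('.//td[@data-stat="visitor_pts"]/text()').get(),
                'home': row.xpath('.//td[@data-stat="home_team_name"]//text()').get(),
                'home_pts': row.xpath('.//td[@data-stat="home_pts"]/text()').get(),
                'overtimes': row.xpath('.//td[@data-stat="overtimes"]/text()').get(),
                'attendance': row.xpath('.//td[@data-stat="attendance"]/text()').get(),
            }

# collect everything into one CSV; pages that don't exist come back as 404s
# and are dropped by Scrapy's default HTTP-error handling
process = CrawlerProcess(settings={
    'FEEDS': {'NBA_Scrape.csv': {'format': 'csv'}},
})
process.crawl(ScheduleSpider)
process.start()

Scrapy also schedules the requests concurrently, so this runs far faster than driving a real browser one page at a time.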