Can anyone see what is wrong with this code?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import csv
def races(main_url):
    driver = webdriver.Chrome()
    driver.get(main_url)
    driver.implicitly_wait(2)
    races = driver.find_elements_by_class_name('time-location')
    races = [race.text[:5] for race in races]
    races = [race.replace(':', '') for race in races]
    driver.close()
    return races
def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        print()
    driver.close()
So at this point below I try to save the data to df, but when I open the file it is blank. Shouldn't df = open('jan1.csv', 'w+') store the scraped data in a csv file? I am obviously missing something, but I can't see what.
def main():
    df = open('jan1.csv', 'w+')
    df.close()
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        scrape(url)

if __name__ == '__main__':
    main()
Answer 0 (score: 0)
Your code appears to be broken in a couple of places, and even after fixing those I still ran into timeout errors.
Try the following steps:
Add pandas to make the data handling easier:
import pandas as pd
def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    # add empty list to save scraped data
    data = []
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        data.append([horseName, times])
        print()
    driver.close()
    # return your data!
    return data
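For reference, each call to scrape() now returns a list of [horseName, times] pairs, roughly shaped like this (the names and times below are invented placeholders, not real output):

# [
#     ['Horse A', ['13.2', '12.8', '12.5']],
#     ['Horse B', ['13.5', '13.0', '12.9']],
# ]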
Then change your main function:
def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    tmp = []
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        tmp.append(scrape(url))
    df = pd.DataFrame(tmp)
    df.to_csv("jan1.csv")
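Note that tmp ends up as a list of per-race lists, so pd.DataFrame(tmp) gives you one row per race with a whole [horseName, times] pair crammed into each cell. If you would rather have one row per horse, here is a minimal sketch of main() that flattens the data first (the 'race'/'horse'/'times' column names are my own choice, not anything taken from the page):

import pandas as pd

def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    rows = []
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        # flatten: one dict per horse, tagged with the race it came from
        for horse_name, times in scrape(url):
            rows.append({'race': race, 'horse': horse_name, 'times': ' '.join(times)})
    df = pd.DataFrame(rows, columns=['race', 'horse', 'times'])
    df.to_csv('jan1.csv', index=False)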
Or, if you want to stick with plain csv (without pandas):
with open("jan1.csv", "w+") as file:
    file.write(your_data_var_here)
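The your_data_var_here part is only a placeholder; you still have to build the rows yourself. Since the question already imports csv at the top, here is a minimal sketch using csv.writer with the same one-row-per-horse layout as above (again assuming scrape() returns [horseName, times] pairs):

import csv

def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    with open('jan1.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['race', 'horse', 'times'])  # header row
        for race in races(main_url):
            url = main_url + '/' + race
            for horse_name, times in scrape(url):
                writer.writerow([race, horse_name, ' '.join(times)])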