我正在从网上抓取一些数据,并将其写入大约 6 个数据帧(DataFrame)。然后,我想把每个数据帧写入同一个 Excel 文件中各自单独的工作表。我在网上查过资料,也尝试了两种不同的做法,但都得不到想要的结果。如果使用下面的代码,只有最后一个数据帧会被写入 Excel,其余内容都会被覆盖:
# Attempt 1: write the current frame through a pandas ExcelWriter.
# `pitches`, `x` and `combinedDF` come from the surrounding scraping loop.
book = "Sample.xlsx"
rb = openpyxl.load_workbook(book)
# The sheet is added to the in-memory openpyxl workbook only; `rb` is never
# saved in this snippet.
rb.create_sheet(pitches[x] + ' Data')
activeSheet = pitches[x] + ' Data'
# A brand-new writer is created for the same path every time this code runs,
# so each save() replaces the file produced by the previous pass -- which
# matches the reported symptom that only the last frame ends up in the file.
writer = pd.ExcelWriter(book, engine='xlsxwriter')
combinedDF.to_excel(writer, sheet_name=activeSheet, index=False)
writer.save()
如果我使用以下代码部分,它会创建每个单独的工作表,但不会将任何数据框数据写入excel文件:
# Attempt 2: let pandas write straight to the path, then save the openpyxl
# workbook. `pitches`, `x` and `combinedDF` come from the surrounding loop.
book = "Sample.xlsx"
rb = openpyxl.load_workbook(book)
# Adds an empty sheet to the in-memory openpyxl workbook.
rb.create_sheet(pitches[x] + ' Data')
activeSheet = pitches[x] + ' Data'
# The frame is written to the file path, not into `rb`.
combinedDF.to_excel(book, sheet_name=activeSheet, index=False)
# NOTE(review): presumably this save then overwrites what to_excel just
# wrote with `rb`, which only holds the empty sheets created above -- that
# would explain "sheets exist but contain no data"; confirm.
rb.save(book)
以下是完整代码:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import openpyxl
# Target workbook; load_workbook() requires the file to exist already.
book = "Baseball Savant Data.xlsx"
rb = openpyxl.load_workbook(book)

# Human-readable pitch names, in the same order as pitchCode below.
pitches = ['Fastball', '2 Seam Fastball', 'Cut Fastball',
           'Split-Finger Fastball', 'Sinker', 'Slider', 'Changeup',
           'Curveball']
beginningTime = time.time()

# Open the site and follow the 'Statcast Search' link.
browser = webdriver.Chrome()
browser.get('http://www.baseballsavant.com')
browser.maximize_window()
browser.find_element_by_link_text('Statcast Search').click()
time.sleep(2)  # fixed pause before interacting with the search form

# Pick the '500' option of #min_pitches and the '50' option of #min_results.
# click() returns nothing useful, so its result is not kept.
browser.find_element_by_xpath(
    """//*[@id="min_pitches"]/option[@value='500']""").click()
browser.find_element_by_xpath(
    """//*[@id="min_results"]/option[@value='50']""").click()

# Codes appended to 'chk_PT_' to form the id of each pitch-type checkbox.
pitchCode = ['FF', 'FT', 'FC', 'FS', 'SI', 'SL', 'CH', 'CU']
time.sleep(2)
x = 0
y = 0
# XPath of the form's search button and of an <option> in the sort selector.
SEARCH_BUTTON_XPATH = """//*[@id="pfx_form"]/div[2]/div/input[1]"""
SORT_OPTION_XPATH = """//*[@id="sort_col"]/option[@value='%s']"""


def click_search():
    """Click the search button of the form."""
    browser.find_element_by_xpath(SEARCH_BUTTON_XPATH).click()


def set_sort_column(value):
    """Select the sort-column option whose ``value`` attribute is *value*."""
    browser.find_element_by_xpath(SORT_OPTION_XPATH % value).click()


def select_pitch(index):
    """Tick the checkbox for ``pitchCode[index]`` and run the search.

    For every pitch after the first, the previous pitch's checkbox is
    clicked first (to untick it) and the sort column is set back to
    'pitches' before searching.
    """
    current = 'chk_PT_' + pitchCode[index]
    pulldown = browser.find_element_by_class_name("mock-pulldown-container")
    pulldown.click()

    if index == 0:
        browser.find_element_by_id(current).click()
        click_search()
        time.sleep(3)
        return

    time.sleep(5)
    previous = 'chk_PT_' + pitchCode[index - 1]
    browser.find_element_by_id(previous).click()
    time.sleep(1)
    pulldown.click()
    browser.find_element_by_id(current).click()
    time.sleep(1)
    set_sort_column('pitches')
    click_search()


def fetch_results_table(column_count):
    """Download the browser's current URL and parse the results table.

    Returns a DataFrame built from the first *column_count* header cells
    of the table with id 'search_results' and the first *column_count*
    cells of every row with class 'search_row'.
    """
    response = requests.get(browser.current_url)
    soup = BeautifulSoup(response.text, "html.parser")
    results_table = soup.find("table", {"id": "search_results"})
    headers = [th.text.strip()
               for th in results_table.findAll('th')[0:column_count]]
    rows = [[td.text.strip() for td in row.findAll('td')[0:column_count]]
            for row in soup.findAll("tr", {"class": "search_row"})]
    return pd.DataFrame(rows, index=None, columns=headers)


# One writer for the whole run: the workbook is opened once, every pitch type
# adds its own sheet, and the file is saved once when the block exits.
# Creating a writer per sheet would replace the file on every save.
with pd.ExcelWriter(book, engine='xlsxwriter') as writer:
    for x, pitch_name in enumerate(pitches):
        print('Scraping ' + pitch_name + ' (' + pitchCode[x] + ')')
        select_pitch(x)

        # First pass: results as currently sorted, first five columns.
        dfPitchCount = fetch_results_table(5)

        # Second pass: re-sort by wOBA, search again, first four columns.
        set_sort_column('woba')
        click_search()
        time.sleep(2)
        dfwOBA = fetch_results_table(4)

        # NOTE(review): indicator is the string "True", not the boolean --
        # presumably this names the merge-indicator column "True"; confirm
        # that is intended.
        combinedDF = pd.merge(dfPitchCount, dfwOBA, how='left',
                              on="Player", sort=False, indicator="True")
        combinedDF.to_excel(writer, sheet_name=pitch_name + ' Data',
                            index=False)

        # Restore the default sort before moving on to the next pitch type.
        set_sort_column('pitches')
答案 0 :(得分:1)
您似乎错过了最重要的来源:to_excel
的pandas文档:https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
所以,请把 writer = pd.ExcelWriter(book, engine='xlsxwriter')
和 writer.save()
都移到循环之外:前者放在 x 循环开始之前,
后者放在循环结束之后。Excel 文件只应该打开并保存一次,而不是每写一张工作表就打开并保存一次。