使用Selenium和BS抓取网站。我的代码根据州名打开一个网站,输入所需信息,单击一个按钮,然后将表格导出到excel。将每个州的数据导出到excel并关闭Firefox浏览器后,针对不同的州再次重复相同的代码。
导出3个或4个州的数据后,间歇性地出现上述错误。我重新启动了内核,还修改了Firefox的设置,使其每次退出时清除缓存,但问题依旧。不知道为什么会这样。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import pandas as pd
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
import time
# Scrape a state-specific report page with Selenium, parse the rendered
# HTML with BeautifulSoup, and export (product name, registration number)
# pairs to an Excel file.
productName = []   # span text from class "ppisreportspanprodname"
number = []        # anchor text for links whose href contains "xxx"
divList = []

binary = FirefoxBinary('C:/Users/GBVYR/AppData/Local/Mozilla Firefox/firefox.exe')
driver = webdriver.Firefox(firefox_binary=binary, executable_path=r'C:\Python\geckodriver.exe')

try:
    driver.get("Website_State=TX")

    # Select the "company" search option, enter the company name, submit.
    driver.find_element_by_id("ContentPlaceHolder1_company").click()
    driver.find_element_by_id("ContentPlaceHolder1_TextBoxInput1").send_keys("Company Name")
    driver.find_element_by_id("ContentPlaceHolder1_view1Continue").click()

    # Explicit wait for the report control instead of implicitly_wait +
    # a fixed 30 s sleep: intermittent failures after a few states are
    # typically timing-related, and an explicit condition both waits long
    # enough and returns as soon as the element is ready.
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.ID, "ctl00_ContentPlaceHolder1_176"))
    ).click()

    # Wait until at least one product-name span has rendered before
    # grabbing page_source, so BeautifulSoup sees the populated table.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ppisreportspanprodname"))
    )

    soup = BeautifulSoup(driver.page_source, 'lxml')

    # Product names. (Original had a syntax error: the opening quote of
    # the class-name string literal was missing.)
    for line in soup.findAll('span', attrs={'class': 'ppisreportspanprodname'}):
        productName.append(line.text)

    # Registration numbers: anchors whose href contains "xxx".
    # link.get("href") returns None for anchors without an href, which made
    # the original `"xxx" in link.get("href")` raise TypeError — guard it.
    for link in soup.find_all('a'):
        href = link.get("href")
        if href and "xxx" in href:
            number.append(link.text)
finally:
    # Always release the browser, even on error. Leaked Firefox/geckodriver
    # processes from earlier iterations are a likely cause of the
    # intermittent failures after exporting 3–4 states.
    driver.quit()

# NOTE(review): pd.DataFrame raises ValueError if the two lists end up with
# different lengths — confirm every product row has a matching "xxx" link.
df = pd.DataFrame({'Product': productName, 'Registration': number})
df.to_excel("C:\Python\Exportedfile.xls")