我正在尝试使python selenium脚本正常工作,该脚本应该执行以下操作:
获取文本文件BookTitle.txt,该文件是“书名”列表。
然后使用Python / Selenium在网站GoodReads.com上搜索该标题。
获取结果的URL,并创建一个新的.CSV文件,其第1列=书名,第2列=网站URL
我希望我们能使它正常工作,然后请逐步帮助我使其运行。
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from pyvirtualdisplay import Display
#from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common import keys
import csv
import time
import json
class Book:
def __init__(self, title, url):
self.title = title
self.url = url
def __iter__(self):
return iter([self.title, self.url])
url = 'https://www.goodreads.com/'
def create_csv_file():
header = ['Title', 'URL']
with open('/home/l/gDrive/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:
wr = csv.writer(csv_file, delimiter=',')
wr.writerow(header)
def read_from_txt_file():
lines = [line.rstrip('\n') for line in open('/home/l/gDrive/AudioBookReviews/WebScraping/BookTitles.txt', encoding='utf-8')]
return lines
def init_selenium():
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
options = Options()
options.add_argument('--headless')
global driver
driver = webdriver.Chrome("/home/l/gDrive/AudioBookReviews/WebScraping/chromedriver", chrome_options=chrome_options)
driver.get(url)
time.sleep(30)
driver.get('https://www.goodreads.com/search?q=')
def search_for_title(title):
search_field = driver.find_element_by_xpath('//*[@id="search_query_main"]')
search_field.clear()
search_field.send_keys(title)
search_button = driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[1]/div[1]/div[2]/form/div[1]/input[3]')
search_button.click()
def scrape_url():
try:
url = driver.find_element_by_css_selector('a.bookTitle').get_attribute('href')
except:
url = "N/A"
return url
def write_into_csv_file(vendor):
with open('/home/l/gDrive/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'a', encoding='utf-8') as csv_file:
wr = csv.writer(csv_file, delimiter=',')
wr.writerow(list(vendor))
create_csv_file()
titles = read_from_txt_file()
init_selenium()
for title in titles:
search_for_title(title)
url = scrape_url()
book = Book(title, url)
write_into_csv_file(book)
运行以上操作,出现以下错误:
回溯(最近通话最近):文件 “ /home/l/gDrive/AudioBookReviews/WebScraping/GoodreadsScraper.py”, 第68行,在 init_selenium()文件“ /home/l/gDrive/AudioBookReviews/WebScraping/GoodreadsScraper.py”, init_selenium中的第41行 driver = webdriver.Chrome(“ / home / l / gDrive / AudioBookReviews / WebScraping / chromedriver”, chrome_options = chrome_options)文件 “ /usr/local/lib/python3.6/dist-packages/selenium/webdriver/chrome/webdriver.py”, 第81行,初始化 required_capabilities = desired_capabilities)文件“ /usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py”, 第157行,在 init self.start_session(功能,浏览器配置文件)文件“ /usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py”, 第252行,位于start_session中 响应= self.execute(Command.NEW_SESSION,参数)文件“ /usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py”, 第321行,在执行中 self.error_handler.check_response(响应)文件“ /usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/errorhandler.py”, 第242行,在check_response中 引发exception_class(消息,屏幕,堆栈跟踪)selenium.common.exceptions.WebDriverException:消息:未知错误: Chrome无法启动:异常退出(未知错误: DevToolsActivePort文件不存在)(该过程始于 chrome位置/ usr / bin / google-chrome不再运行,因此 ChromeDriver假设Chrome已崩溃。)(驱动程序信息: chromedriver = 2.44.609551 (5d576e9a44fe4c5b6a07e568f1ebc753f1214634),平台= Linux 4.15.0-60-通用x86_64)
答案 0 :(得分:1)
我现在可以看到几个错误:
1)您必须取消注释chrome选项并注释Firefox,因为稍后在代码中传递chromedriver时会
# from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
顺便说一句,pyvirtualdisplay是 headless chrome的替代品,您不需要导入它。
2)您已经实例化了两次Option,并且仅使用第一个。将您的代码更改为:
def init_selenium():
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--headless')
我猜这两个只是开始,遇到下一个无法解决的问题时,请编辑您的问题。
答案 1 :(得分:1)
您正在使用chrome驱动程序,但在导入时将其注释掉。
from selenium.webdriver.chrome.options import Options
在搜索功能中,过程为: 获取页面->查找搜索框->输入值->输入键->获取结果。
类似的东西:
def search_for_title(title):
driver.get('https://www.goodreads.com/search?q=')
search_field = driver.find_element_by_name('q')
search_field.clear()
search_field.send_keys(title)
search_field.send_keys(keys.Keys.RETURN) # you missed this part
url = driver.find_element_by_xpath(
'/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
print(url.get_attribute('href'))