我正在试图抓取一个网站并保存信息,目前我有两个问题。
首先,当我使用selenium点击按钮(在这种情况下是一个加载更多结果按钮)时,它不会点击直到结束,我似乎无法找出原因。
另一个问题是它没有保存到parse_article函数中的csv文件。
这是我的代码:
import scrapy
from selenium import webdriver
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium.webdriver.common.by import By
import csv
class ProductSpider(scrapy.Spider):
name = "Southwestern"
allowed_domains = ['www.reuters.com/']
start_urls = [
'https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
def __init__(self):
self.driver = webdriver.Chrome()
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.driver.find_element_by_class_name(
"search-result-more-txt")
#next = self.driver.find_element_by_xpath('//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]')
# maybe do it with this
#button2 = driver.find_element_by_xpath("//*[contains(text(), 'Super')]")
try:
next.click()
# get the data and write it to scrapy items
except:
break
SET_SELECTOR = '.search-result-content'
for articles in self.driver.find_elements(By.CSS_SELECTOR, SET_SELECTOR):
item = {}
# get the date
item["date"] = articles.find_element_by_css_selector('h5').text
# title
item["title"] = articles.find_element_by_css_selector('h3 a').text
item["link"] = articles.find_element_by_css_selector(
'a').get_attribute('href')
print(item["link"])
yield scrapy.Request(url=item["link"], callback=self.parse_article, meta={'item': item})
self.driver.close()
def parse_article(self, response):
item = response.meta['item']
texts = response.xpath(
"//div[contains(@class, 'StandardArticleBody')]//text()").extract()
if "National Health Investors" in texts:
item = response.meta['item']
row = [item["date"], item["title"], item["link"]]
with open('Websites.csv', 'w') as outcsv:
writer = csv.writer(outcsv)
writer.writerow(row)
答案 0 :(得分:1)
尝试使用implicit_wait或explicit_wait:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# An implicit wait tells WebDriver to poll the DOM for a certain amount of time when trying to find any element
# (or elements) not immediately available.
driver.implicitly_wait(implicit_wait)
# An explicit wait is code you define to wait for a certain condition to occur before proceeding further
# in the code.
wait = WebDriverWait(self.driver, <time in seconds>)
wait.until(EC.presence_of_element_located((By.XPATH, button_xpath)))
答案 1 :(得分:-1)