import scrapy
from selenium import webdriver
from scrapy_selenium import SeleniumRequest


class testspider1(scrapy.Spider):
    # Class-level driver; __init__ is intentionally left untouched (see the question below)
    driver = webdriver.Firefox(executable_path=r"C:\Users\test\Desktop\geckodriver")
    name = 'test5'
    start_urls = ['http://httpbin.org/ip']

    def parse(self, response):
        print(response.body)
        url = "https://www.target.com/p/cesar-canine-cuisine-filet-mignon-flavor-wet-dog-food-3-5oz-tray/-/A-14903668"
        # This request should be rendered by the scrapy-selenium middleware
        yield SeleniumRequest(url=url, callback=self.parse_result)

    def parse_result(self, response):
        image = response.xpath('//*[@id="mainContainer"]/div/div/div[1]/div[1]/div[2]/div[1]/div/div/div/div/div/div/div/a/div/div/div/div/div/img/@src').extract_first()
        price = response.selector.xpath('//*[@id="mainContainer"]/div/div/div[1]/div[2]/div/div[1]/span/text()').extract_first()
        print(image)
        print("\n\n")
        print(price)
Settings file:
from shutil import which
BOT_NAME = 'seleniumtest'
SPIDER_MODULES = ['seleniumtest.spiders']
NEWSPIDER_MODULE = 'seleniumtest.spiders'
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_BROWSER_EXECUTABLE_PATH = which(r"C:\Users\test\Desktop\geckodriver")
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}
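For comparison, the scrapy-selenium README configures Firefox roughly as in the sketch below; note that SELENIUM_BROWSER_EXECUTABLE_PATH is meant to point at the browser binary itself (e.g. firefox.exe), not at geckodriver. The paths here are illustrative assumptions, not the poster's actual setup.

# Sketch based on the scrapy-selenium README (Firefox); paths are illustrative assumptions.
from shutil import which

SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')  # path to the geckodriver binary
SELENIUM_BROWSER_EXECUTABLE_PATH = r"C:\Program Files\Mozilla Firefox\firefox.exe"  # the browser itself (assumed install path)
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # optional: run Firefox headless

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}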
documentation on scrapy-selenium
I followed the instructions step by step, but the driver never follows any of the links. I believe both requests end up being handled by plain Scrapy. I don't want to change __init__, because I only want certain requests handled through scrapy-selenium while the rest go through Scrapy on its own.
I looked at passing-selenium-driver-to-scrapy, but it changes the whole __init__ and turns Selenium into self.driver.
I want SeleniumRequest to be handled by Selenium and the other requests to be handled by Scrapy's Request (a sketch of this split is shown after the note below).
Note: I also picked this site as the example because it uses JavaScript to display its results; if the page has not been rendered and the response is handled by Scrapy alone, the result is an empty list.
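The following is a minimal sketch (not the poster's code) of how plain Requests and SeleniumRequests can be mixed in one spider without touching __init__, assuming scrapy_selenium.SeleniumMiddleware is enabled in the settings as above; the spider name and the second URL are placeholders.

# Minimal sketch: mixing plain Scrapy Requests and SeleniumRequests in one spider.
# Assumes scrapy_selenium.SeleniumMiddleware is enabled in DOWNLOADER_MIDDLEWARES.
import scrapy
from scrapy_selenium import SeleniumRequest


class MixedSpider(scrapy.Spider):
    name = 'mixed_example'  # hypothetical spider name

    def start_requests(self):
        # Handled by Scrapy's default downloader (no browser involved).
        yield scrapy.Request('http://httpbin.org/ip', callback=self.parse_plain)

    def parse_plain(self, response):
        print(response.body)
        # Handed to the Selenium middleware, which renders the page in the browser.
        yield SeleniumRequest(
            url='https://example.com/js-heavy-page',  # placeholder URL
            callback=self.parse_rendered,
        )

    def parse_rendered(self, response):
        # scrapy-selenium also exposes the webdriver via response.request.meta['driver'].
        print(response.xpath('//title/text()').get())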
Answer 0 (score: 1)
I replaced Firefox with Chrome:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# webdriver_manager downloads a matching chromedriver and returns its path
driver = webdriver.Chrome(ChromeDriverManager().install())
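If the switch to Chrome is carried over into the scrapy-selenium settings, the configuration would look roughly like the sketch below; the chromedriver location and the headless argument are assumptions (the path would normally come from which('chromedriver') or from wherever webdriver_manager installs the driver).

# Sketch of matching scrapy-selenium settings for Chrome; paths/arguments are assumptions.
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')  # assumes chromedriver is on PATH
SELENIUM_DRIVER_ARGUMENTS = ['--headless']  # optional: run Chrome headless

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}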