I've spent a long time trying to figure this out, to no avail. I've read a lot about passing back an HtmlResponse and about using a Selenium middleware, but I've struggled to understand how to structure the code and implement it in my solution.
Here is my spider code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

count = 0

class ContractSpider(scrapy.Spider):
    name = "contracts"

    def start_requests(self):
        urls = [
            'https://www.contractsfinder.service.gov.uk/Search/Results',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.driver.get("https://www.contractsfinder.service.gov.uk/Search/Results")
        elem2 = self.driver.find_element_by_name("open")
        elem2.click()
        sleep(5)
        elem = self.driver.find_element_by_name("awarded")
        elem.click()
        sleep(5)
        elem3 = self.driver.find_element_by_id("awarded_date")
        elem3.click()
        sleep(5)
        elem4 = self.driver.find_element_by_name("awarded_from")
        elem4.send_keys("01/03/2018")
        elem4.send_keys(Keys.RETURN)
        sleep(5)
        elem5 = self.driver.find_element_by_name("awarded_to")
        elem5.send_keys("16/03/2018")
        elem5.send_keys(Keys.RETURN)
        sleep(5)
        elem6 = self.driver.find_element_by_name("adv_search")
        self.driver.execute_script("arguments[0].scrollIntoView(true);", elem6)
        elem6.send_keys(Keys.RETURN)

    def parse(self, response):
        global count
        count += 1
        strcount = str(count)
        page = self.driver.get(response.url)
        filename = strcount + 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        for a in response.css('a.standard-paginate-next'):
            yield response.follow(a, callback=self.parse)
The Selenium part works: it launches Firefox, the various JavaScript interactions take place, and the final page of results is loaded.
The Scrapy part also appears to work, in that it finds the next-page button in the Firefox window Selenium loaded and clicks it (I can see this happening by watching the Firefox webdriver itself). However, the actual scraping that happens (saving the HTML to my C:\ drive) is scraping the URL 'https://www.contractsfinder.service.gov.uk/Search/Results' on its own, without the JavaScript interactions Selenium triggered in the Firefox webdriver.
I think I understand some of the reasons why it doesn't work the way I want: for example, in start_requests I reference the original URL, which means the page Selenium loaded is never used by the spider. But every time I try to build a response from the webdriver using the various approaches I've read about on Stack Overflow, I get assorted errors because my understanding isn't good enough. I thought I'd post a version where both the Selenium and Scrapy parts are at least doing something, but could someone please explain and show me the best way to link the two together, i.e. once Selenium has finished, take the page loaded in the Firefox webdriver and pass it to Scrapy to do its thing? Any feedback is much appreciated.
Answer 0 (score: 2)
As you say, Scrapy opens your initial URL, not the page as modified by Selenium.
If you want to get the page from Selenium, you should use driver.page_source.encode('utf-8') (the encoding is not mandatory). You can also use it with a Scrapy Selector:

response = Selector(text=driver.page_source.encode('utf-8'))

After that, use the response as you normally would.
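
For instance, here is a minimal sketch of what "use it as normal" could look like (the CSS selector below is purely hypothetical, not taken from the actual Contracts Finder markup):

from scrapy.selector import Selector

sel = Selector(text=driver.page_source)
# Hypothetical selector -- adjust it to the real markup of the results page.
titles = sel.css('div.search-result a::text').extract()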
EDIT
I would try something like this (note that I have not tested this code):
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

count = 0

class ContractSpider(scrapy.Spider):
    name = "contracts"

    def start_requests(self):
        urls = [
            'https://www.contractsfinder.service.gov.uk/Search/Results',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def __init__(self):
        self.driver = webdriver.Firefox()
        # An implicit wait tells WebDriver to poll the DOM for a certain amount of time
        # when trying to find any element (or elements) not immediately available.
        self.driver.implicitly_wait(5)

    def get_selenium_response(self, url):
        self.driver.get(url)
        elem2 = self.driver.find_element_by_name("open")
        elem2.click()
        elem = self.driver.find_element_by_name("awarded")
        elem.click()
        elem3 = self.driver.find_element_by_id("awarded_date")
        elem3.click()
        elem4 = self.driver.find_element_by_name("awarded_from")
        elem4.send_keys("01/03/2018")
        elem4.send_keys(Keys.RETURN)
        elem5 = self.driver.find_element_by_name("awarded_to")
        elem5.send_keys("16/03/2018")
        elem5.send_keys(Keys.RETURN)
        elem6 = self.driver.find_element_by_name("adv_search")
        self.driver.execute_script("arguments[0].scrollIntoView(true);", elem6)
        elem6.send_keys(Keys.RETURN)
        return self.driver.page_source.encode('utf-8')

    def parse(self, response):
        global count
        count += 1
        strcount = str(count)
        # Here you have the page rendered by the webdriver;
        # you can use selectors to extract data from it.
        selenium_response = Selector(text=self.get_selenium_response(response.url))
        ...
Answer 1 (score: 1)
Combining @Alex K's solution and others, here is my tested code:
import scrapy
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
...

    def __init__(self, name=None, **kwargs):
        super(MySpider, self).__init__(name, **kwargs)
        self.driver = webdriver.Chrome()

    @staticmethod
    def get_selenium_response(driver, url):
        driver.get(url)
        # in case of an explicit amount of time:
        # time.sleep(5)
        # in case of waiting until an element has been found:
        try:
            def find(driver):
                table_el = driver.find_element_by_xpath('//*[@id="table_el"]')
                if table_el:
                    return table_el
                else:
                    return False

            element = WebDriverWait(driver, 5).until(find)
            return driver.page_source.encode('utf-8')
        except:
            driver.quit()
            raise

    def parse(self, response):
        response = scrapy.Selector(
            text=self.get_selenium_response(self.driver, response.url))
        # ...parse the response as normal
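
As a side note, here is a minimal sketch of the same wait using Selenium's built-in expected conditions instead of a hand-rolled find callable (the XPath is the same placeholder used above, so treat it as an assumption about your page):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def wait_and_get_page(driver, url, timeout=5):
    # Navigate, block until the placeholder element appears, then return the rendered HTML.
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="table_el"]')))
    return driver.page_source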