I'm learning Python and am trying to scrape this page for specific values from a dropdown menu. After that, I need to click each item in the resulting table to retrieve the specific information. I am able to select the items and retrieve the information in the webdriver, but I don't know how to pass that response URL to the crawlspider.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import TextResponse
import time

driver = webdriver.Firefox()
driver.get('http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp')

more_btn = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.ID, '_button_select'))
)
more_btn.click()

## select specific values from the dropdowns
driver.find_element_by_css_selector("select#tabJcwyxt_jiebie > option[value='teyaoxgrs']").click()
driver.find_element_by_css_selector("select#tabJcwyxt_jieci > option[value='d11jie']").click()
search2 = driver.find_element_by_class_name('input_a2')
search2.click()
time.sleep(5)
## convert html to "nice format"
text_html=driver.page_source.encode('utf-8')
html_str=str(text_html)
## this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
resp_for_scrapy=TextResponse('none',200,{},html_str,[],None)
So this is where I'm stuck. I am able to run the query with the code above, but how do I pass resp_for_scrapy to the crawlspider? I tried putting resp_for_scrapy in place of item, but that didn't work.
## spider
class ProfileSpider(CrawlSpider):
    name = 'pccprofile2'
    allowed_domains = ['cppcc.gov.cn']
    start_urls = ['http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp']

    def parse(self, resp_for_scrapy):
        hxs = HtmlXPathSelector(resp_for_scrapy)
        for post in resp_for_scrapy.xpath('//div[@class="table"]//ul//li'):
            items = []
            item = Ppcprofile2Item()
            item["name"] = hxs.select("//h1/text()").extract()
            item["title"] = hxs.select("//div[@id='contentbody']//tr//td//text()").extract()
            items.append(item)

        ## click next page
        while True:
            next = self.driver.findElement(By.linkText("下一页"))
            try:
                next.click()
            except:
                break

        return items
Any suggestions would be hugely appreciated!!!!
EDIT: I included a middleware class that makes the dropdown selections, placed before the spider class. But now there are no errors and no results.
class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        driver.get('http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp')

        # select from the dropdown
        more_btn = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.ID, '_button_select'))
        )
        more_btn.click()

        driver.find_element_by_css_selector("select#tabJcwyxt_jiebie > option[value='teyaoxgrs']").click()
        driver.find_element_by_css_selector("select#tabJcwyxt_jieci > option[value='d11jie']").click()
        search2 = driver.find_element_by_class_name('input_a2')
        search2.click()
        time.sleep(5)

        # get the response
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
class ProfileSpider(CrawlSpider):
    name = 'pccprofile2'
    rules = [Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=("//div[@class='table']")), callback='parse_item')]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = Ppcprofile2Item()
        item["name"] = hxs.select("//h1/text()").extract()
        item["title"] = hxs.select("//div[@id='contentbody']//tr//td//text()").extract()
        items.append(item)

        # click next page
        while True:
            next = response.findElement(By.linkText("下一页"))
            try:
                next.click()
            except:
                break

        return items
Answer 0 (score: 20)
Use a Downloader Middleware to intercept Selenium-requiring pages before Scrapy processes them normally:

The downloader middleware is a framework of hooks into Scrapy's request/response processing. It's a light, low-level system for globally altering Scrapy's requests and responses.

Here's a very basic example using PhantomJS:
from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        driver.get(request.url)

        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
Once you return that HtmlResponse (or a TextResponse, if that's what you really want), Scrapy will stop processing downloaders and drop into the spider's parse method:

If it returns a Response object, Scrapy won't bother calling any other process_request() or process_exception() methods, or the appropriate download function; it'll return that response. The process_response() methods of installed middleware are always called on every response.
In this case, you can continue to use your spider's parse method as you normally would with HTML, except that the JS on the page has already been executed.
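Note that, like any downloader middleware, this class does nothing until it is enabled in settings.py. A minimal sketch; the module path below is a placeholder for wherever you actually put the class:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.JSMiddleware': 543,  # hypothetical path, adjust to your project
}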
Tip: Since the Downloader Middleware's process_request method receives the spider as an argument, you can add a conditional in your spider to check whether you need to process JS at all, which lets you handle JS and non-JS pages with the exact same spider class. A sketch of that idea follows.
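A minimal sketch of that tip, assuming a hypothetical use_js attribute on the spider (the attribute name is an invented convention, not a Scrapy or Selenium API):

from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        # Spiders opt in by setting a (made-up) use_js attribute;
        # returning None lets Scrapy download the request normally.
        if not getattr(spider, 'use_js', False):
            return None

        driver = webdriver.PhantomJS()
        driver.get(request.url)
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)

A spider that needs JavaScript simply declares use_js = True as a class attribute; every other spider is downloaded the normal way.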
Answer 1 (score: 7)
Here is a middleware that combines Scrapy and Selenium:
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium import webdriver
from scrapy import signals


class SeleniumMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        request.meta['driver'] = self.driver  # to access driver from response
        self.driver.get(request.url)

        body = to_bytes(self.driver.page_source)  # body must be of type bytes
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

    def spider_opened(self, spider):
        self.driver = webdriver.Firefox()

    def spider_closed(self, spider):
        self.driver.close()
You also need to add this to settings.py:
DOWNLOADER_MIDDLEWARES = {
'youproject.middlewares.selenium.SeleniumMiddleware': 200
}
See the docs to decide whether 200 or some other value is the right priority.
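Because process_request stores the live driver in request.meta, a spider callback can drive the browser directly from the response. A small sketch under that assumption; the spider name and the clicked link text are hypothetical:

import scrapy


class ExampleSpider(scrapy.Spider):  # hypothetical spider
    name = 'example'
    start_urls = ['http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp']

    def parse(self, response):
        # response.meta is a shortcut to response.request.meta, so the
        # driver stored by the middleware is reachable here.
        driver = response.meta['driver']
        driver.find_element_by_link_text('下一页').click()  # e.g. click "next page"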
UPDATE: firefox headless mode with scrapy and selenium
If you want to run firefox in headless mode, install xvfb:
sudo apt-get install -y xvfb
sudo pip install pyvirtualdisplay
and use this middleware:
from shutil import which

from pyvirtualdisplay import Display
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

settings = get_project_settings()

HEADLESS = True


class SeleniumMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        request.meta['driver'] = self.driver
        body = str.encode(self.driver.page_source)
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

    def spider_opened(self, spider):
        if HEADLESS:
            self.display = Display(visible=0, size=(1280, 1024))
            self.display.start()
        binary = FirefoxBinary(settings.get('FIREFOX_EXE') or which('firefox'))
        self.driver = webdriver.Firefox(firefox_binary=binary)

    def spider_closed(self, spider):
        self.driver.close()
        if HEADLESS:
            self.display.stop()
with settings.py containing:
FIREFOX_EXE = '/path/to/firefox.exe'
The problem is that some versions of firefox don't work with selenium. To work around this, you can download firefox version 47.0.1 (this version works flawlessly) from here, extract it, and put it inside your selenium project. Then modify the firefox path to:
FIREFOX_EXE = '/path/to/your/scrapyproject/firefox/firefox.exe'
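As a side note, Firefox 55 and later can also run headless without xvfb, through Selenium's own Firefox options. A hedged sketch against the Selenium 3 API used above; verify it against the Selenium version you actually run:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('-headless')  # native headless mode, Firefox >= 55
driver = webdriver.Firefox(firefox_options=options)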