I am new to Scrapy and Python. The page loads its content dynamically through AJAX calls, so I am using Selenium inside my spider to crawl it. I am unable to crawl the page. The code I wrote is:
import scrapy
from demo.items import demoItem
from selenium import webdriver


class demoSpider(scrapy.Spider):
    name = "demodallurls"
    allowed_domains = ["demo.com"]
    start_urls = ['http://www.demo.com/businessfinder/company/All/All/A/']

    def __init__(self):
        # Connect to a Selenium server expected at localhost:4444
        self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                                       webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(10)
        item = demoItem()
        item['page'] = response.url
        finalurls = []
        while True:
            try:
                # Keep clicking the "view more" link until it disappears
                next = self.driver.find_element_by_xpath('.//span[@class="ver_11 viewLink"]/a')
                print "-------------next------------", next
                next.click()
                urls = self.driver.find_elements_by_xpath('.//h3[@class="fleft"]/a')
                print "===============urls============", urls
                for url in urls:
                    url = url.get_attribute("href")
                    print "...................url.......................", url
                    finalurls.append(url)
                item['urls'] = finalurls
            except:
                break
        self.driver.close()
        return item
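For context, the webdriver.Remote(...) call in __init__ only works if a Selenium server (or Grid hub) is already listening at http://127.0.0.1:4444/wd/hub. To isolate the problem from Scrapy, a minimal standalone script like the one below should be enough to test just that connection (the hub URL, capabilities and start URL are copied from the spider above; everything else is only for illustration):

from selenium import webdriver

# Same endpoint and capabilities as in the spider's __init__
driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                          webdriver.DesiredCapabilities.HTMLUNITWITHJS)
driver.get("http://www.demo.com/businessfinder/company/All/All/A/")
print driver.title  # if this prints, the Selenium server is reachable
driver.quit()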
My Items.py is:
import scrapy
from scrapy.item import Item, Field


class demoItem(scrapy.Item):
    page = Field()
    urls = Field()
When I try to crawl it, I get this error:
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 111] Connection refused>
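As far as I understand, the traceback comes from the Selenium Python client (which uses urllib2 under the hood) being refused when it tries to reach 127.0.0.1:4444, i.e. nothing seems to be listening on that port. A quick way to check this, probing the standalone server's status URL (this snippet is only a diagnostic, not part of the spider):

import urllib2

# "Connection refused" here would mean no Selenium server is
# accepting connections on port 4444.
try:
    print urllib2.urlopen("http://127.0.0.1:4444/wd/hub/status").read()
except urllib2.URLError as e:
    print "Selenium server not reachable:", e

Is the problem that I need to start the Selenium server separately before running the spider, or is something wrong in the spider code itself?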