我正在使用selenium-webdriver为scrapy抓取工具渲染javascript,但它看起来不像是对angularjs'ng-href'链接进行了抓取。 scrapy是否会抓取“ng-href”链接?如果没有,我怎样才能让它抓取'ng-href'链接?
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender
from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse
class HomeSpider(CrawlSpider):
name = "capseleniums"
allowed_domains = ["www.ecommerce.com", "learn.ecommerce.com", "health.ecommerce.com", "wm15.ecommerce.com", "wm13.ecommerce.com", "wm12.ecommerce.com" ]
handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]
def start_requests(self):
start_urls = reversed( [
'http://wm12.ecommerce.com/health-wellness-center/',
'http://wm13.ecommerce.com/Cook/',
'http://wm15.ecommerce.com/electronics-resource-center/',
'http://health.ecommerce.com/vitamins-wellness-center/',
'http://learn.ecommerce.com/Tips-Ideas/',
] )
return [ Request(url = start_url) for start_url in start_urls ]
def trim(link_text):
return link_text.strip(' \t\n\r')
rules = (
Rule(
LinkExtractor(
allow=(),
deny=(),
process_value=trim,
),
callback="parse_items",
follow=False,),
)
def __init__(self, category=None, *args, **kwargs):
self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
super(HomeSpider, self).__init__(*args, **kwargs)
def __del__(self):
self.driver.stop()
def parse_items(self, response):
hxs = self.driver
hxs.get(response.url)
time.sleep(1)
body = hxs.page_source
sel_response = TextResponse(url=response.url, body=body, encoding = 'utf-8')
hxs = Selector(sel_response)
sites = hxs.xpath('//html')
items = []
if response.status == 404:
for site in sites:
item = Website()
item['url'] = response.meta.get('redirect_urls', [response.url])[0]
item['referer'] = response.request.headers.get('Referer')
item['status'] = response.status
items.append(item)
return items
if hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
for site in sites:
item = Website()
item['url'] = response.meta.get('redirect_urls', [response.url])[0]
item['referer'] = response.request.headers.get('Referer')
item['status'] = response.status
items.append(item)
return items
elif hxs.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]'):
for site in sites:
item = Website()
item['url'] = response.meta.get('redirect_urls', [response.url])[0]
item['referer'] = response.request.headers.get('Referer')
item['status'] = response.status
items.append(item)
return items
else:
if hxs.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]'):
for site in sites:
item = Website()
item['url'] = response.meta.get('redirect_urls', [response.url])[0]
item['referer'] = response.request.headers.get('Referer')
item['status'] = response.status
items.append(item)
return items
答案 0 :(得分:2)
By default,它会在href
和a
标记的area
属性中查找链接。
您只需要另外配置attrs
参数并包含ng-href
属性:
LinkExtractor(attrs=['href', 'ng-href'], callback="parse_items", follow=False),