我有一组域名,正在使用 scrapy 抓取。我想检查这些站点上所有指向 'www.ecommerce.com' 的外发链接是否有效。我使用的是 CrawlSpider,但我不希望它继续跟进 'www.ecommerce.com' 页面上的链接,只需解析其 HTTP 状态。如何让 scrapy 解析外发链接本身,但不跟进这些链接页面上的其他链接?
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender
from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse
class HomeSpider(CrawlSpider):
    """Crawl a set of ecommerce subdomains and report outgoing links that
    are broken (HTTP errors) or that land on "invalid" pages.

    Outgoing pages are fetched and inspected but never crawled onward:
    the single Rule uses ``follow=False``, so only the link targets
    themselves are parsed.
    """

    name = "capseleniums"
    allowed_domains = [
        "www.ecommerce.com", "learn.ecommerce.com", "health.ecommerce.com",
        "wm15.ecommerce.com", "wm13.ecommerce.com", "wm12.ecommerce.com",
    ]
    # Let these error responses reach the callback instead of being
    # filtered out by Scrapy, so they can be recorded as broken links.
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        """Seed the crawl with the section landing pages (reversed so the
        last listed URL is scheduled first)."""
        start_urls = reversed([
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def trim(link_text):
        # ``process_value`` hook for the LinkExtractor below: strip
        # surrounding whitespace from extracted URLs.  Referenced at
        # class-definition time, so it is passed as a plain function
        # (deliberately no ``self``).
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=trim,
                # BUG FIX: the original passed two separate not(...) XPaths.
                # restrict_xpaths entries are UNIONED, so a footer node that
                # failed only one predicate still matched the other and
                # footer links were extracted anyway.  Both conditions must
                # live in a single expression.
                restrict_xpaths=(
                    "//*[not(@class='GlobalFooter') and not(@id='footer_links')]",
                ),
            ),
            callback="parse_items",
            # Parse each outgoing page's status, but never crawl onward
            # from it.
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        # Initialize the base spider first, then start the headless
        # browser used to render JS-driven pages before inspection.
        super(HomeSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])

    def __del__(self):
        # BUG FIX: selenium WebDriver objects expose quit(), not stop();
        # the original ``self.driver.stop()`` raised AttributeError and
        # leaked the PhantomJS process.
        self.driver.quit()

    def _broken_link_item(self, response):
        """Build a Website item describing one broken/invalid outgoing link."""
        item = Website()
        # Prefer the originally requested URL when redirects occurred.
        item['url'] = response.meta.get('redirect_urls', [response.url])[0]
        item['referer'] = response.request.headers.get('Referer')
        item['status'] = response.status
        return item

    def parse_items(self, response):
        """Render the outgoing page in PhantomJS and emit an item when it
        is an HTTP 404, an "invalid" page, or an empty result listing.

        Returns a list with one Website item for a bad link, or [] when
        the link looks healthy.
        """
        self.driver.get(response.url)
        time.sleep(1)  # give PhantomJS a moment to finish rendering
        body = self.driver.page_source
        sel_response = TextResponse(url=response.url, body=body,
                                    encoding='utf-8')
        hxs = Selector(sel_response)

        # HTTP errors reach us only because of handle_httpstatus_list.
        if response.status == 404:
            return [self._broken_link_item(response)]

        # Pages that render fine but are semantically dead.
        if hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
            return [self._broken_link_item(response)]
        if hxs.xpath('//head/link[@rel="canonical"]/@href'
                     '[contains(.,"invalid-category-id")]'):
            return [self._broken_link_item(response)]
        if hxs.xpath('//*[@class="result-summary-container"]/text()'
                     '[contains(.,"Showing 0 of")]'):
            return [self._broken_link_item(response)]
        return []