import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import ExampleItem
from scrapy.shell import inspect_response
import re
import time
import sys
class MySpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']

    # Set last_page to decide how many index pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1, last_page)]

    rules = (
        # Follow every <a name="listing_link"> link and call parse_item on each
        Rule(LinkExtractor(
            restrict_xpaths=('//a[@name="listing_link"]')),
            callback='parse_item'),
    )

    # Extract the relevant text from the page into an ExampleItem
    def parse_item(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()  #.re("(?<=Posted\s*).*")
        item['crawled_date'] = time.strftime("%c")
        # Not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # Uncomment for debugging
        #inspect_response(response, self)
        return item
# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         #'scrapy.contrib.pipeline.images.ImagesPipeline': 1
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5
#     }
# })
process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process1.crawl(MySpider)
process1.start()
My spider works fine when I run it from the command line with

scrapy crawl example

but I need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run the script, I get the error:

AttributeError: 'CrawlerProcess' object has no attribute 'crawl'

This is Scrapy version 0.24.6. All the items and pipelines are correct, since the spider works from the command line.
Answer 0 (score: 1)
There is (or was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 with Scrapyd 1.0.1. Here is the issue on GitHub: https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880
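For reference, the AttributeError itself is an API gap rather than a project problem: as far as I can tell, CrawlerProcess.crawl() only appeared with the new API in Scrapy 1.0, while the 0.24 documentation ran spiders from a script through Crawler objects and the Twisted reactor. Below is a minimal sketch of that 0.24-style approach for several spiders; setup_crawler, the running counter, and the one-element spider list are illustrative, not part of the original code.

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

running = 0

def spider_closed(spider):
    # Stop the reactor only after the last spider finishes
    global running
    running -= 1
    if running == 0:
        reactor.stop()

def setup_crawler(spider_cls):
    # Each spider gets its own Crawler; in 0.24, crawl() lives on
    # Crawler, not on CrawlerProcess
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider_cls())
    crawler.start()

for spider_cls in [MySpider]:  # add the other spider classes here
    running += 1
    setup_crawler(spider_cls)

log.start()
reactor.run()

Upgrading to Scrapy 1.0+ would make the original CrawlerProcess code work as written, at the cost of the Scrapyd compatibility issue linked above.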