我一直在尝试使用scthon和python来搜集craigslist帖子。 该守则最初来自Tsung Hung。
我的蜘蛛代码:
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.selector import Selector
from scrapy_craigslist.items import ScrapyCraigslistItem
class MySpider(CrawlSpider):
name = 'craigslist'
allowed_domains = ['sfbay.craigslist.org']
start_urls = ['https://sfbay.craigslist.org/search/apa?']
rules = (
Rule(LxmlLinkExtractor(
allow=(r'sfbay.craigslist.org/search/apa.*'),
deny = (r'.*format\=rss.*')
),
callback="parse_items_1",
follow= True,
),
)
def parse_items_1(self, response):
self.logger.info('You are now crawling: %s', response.url)
items = []
hxs = Selector(response)
contents = hxs.xpath("//div[@class='rows']/*")
for content in contents:
item = ScrapyCraigslistItem()
item ["title"] = content.xpath("//p/span/span/a/span/text()").extract()[0]
k = content.xpath("//p/a/@href").extract()[0]
item ['ad_url'] = 'https://sfbay.craigslist.org{}'.format(''.join(k))
item ["post_date"] = content.xpath("//p/span/span/time/text()").extract()[0]
item ["post_date_specific"] = content.xpath("//p/span/span/time/@datetime").extract()[0]
item ["price"] = content.xpath("//p/span/span[@class='l2']/span/text()").extract()[0]
item ["location"] = content.xpath("//p/span/span[@class='l2']/span[@class='pnr']/small/text()").extract()[0].strip()
return items
我的项目文件如下:
import scrapy
class ScrapyCraigslistItem(scrapy.Item):
title = scrapy.Field()
post_date = scrapy.Field()
post_date_specific = scrapy.Field()
price = scrapy.Field()
location = scrapy.Field()
ad_url = scrapy.Field()
当我尝试运行时,我在终端中没有收到任何错误消息,但是当我运行时我的csv文件" scrapy crawl craigslist -o output.csv"只是空着。