Question

我一直在尝试使用scthon和python来搜集craigslist帖子。该守则最初来自Tsung Hung。

我的蜘蛛代码：

from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.selector import Selector
from scrapy_craigslist.items import ScrapyCraigslistItem


class MySpider(CrawlSpider):
    name = 'craigslist'
    allowed_domains = ['sfbay.craigslist.org']
    start_urls = ['https://sfbay.craigslist.org/search/apa?']

rules = (
    Rule(LxmlLinkExtractor(
        allow=(r'sfbay.craigslist.org/search/apa.*'),
        deny = (r'.*format\=rss.*')
    ),
        callback="parse_items_1",
        follow= True,
         ),
    )


def parse_items_1(self, response):

    self.logger.info('You are now crawling: %s', response.url)
    items = []
    hxs = Selector(response)
    contents = hxs.xpath("//div[@class='rows']/*")
    for content in contents:
        item = ScrapyCraigslistItem()
        item ["title"] = content.xpath("//p/span/span/a/span/text()").extract()[0]
        k = content.xpath("//p/a/@href").extract()[0]
        item ['ad_url'] = 'https://sfbay.craigslist.org{}'.format(''.join(k))
        item ["post_date"] = content.xpath("//p/span/span/time/text()").extract()[0]
        item ["post_date_specific"] = content.xpath("//p/span/span/time/@datetime").extract()[0]
        item ["price"] = content.xpath("//p/span/span[@class='l2']/span/text()").extract()[0]
        item ["location"] = content.xpath("//p/span/span[@class='l2']/span[@class='pnr']/small/text()").extract()[0].strip()
    return items

我的项目文件如下：

import scrapy

class ScrapyCraigslistItem(scrapy.Item):
    title = scrapy.Field()
    post_date = scrapy.Field()
    post_date_specific = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()
    ad_url = scrapy.Field()

当我尝试运行时，我在终端中没有收到任何错误消息，但是当我运行时我的csv文件＆＃34; scrapy crawl craigslist -o output.csv＆＃34;只是空着。

scrapy输出空csv

0 个答案: