Scrapy not extracting data

Date: 2015-01-20 19:34:41

Tags: python xpath scrapy web-crawler selector

This is Scrapy code with which I want to scrape data from mouthshut.com; the text I am after is wrapped in strong tags. I am able to run it and the titles come through, but they are blank. Why is it not extracting any data?

import scrapy
from scrapy.selector import Selector

from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name ="shut"
    allowed_domains =["mouthshut.com"]
    start_urls =["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self,response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        items = []
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            items.append(item)
        return items
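
(The shut/items.py imported above is not shown in the question. For reference, a minimal version matching the fields this spider uses would presumably look like this:)

import scrapy

class ShutItem(scrapy.Item):
    # fields referenced by the spider above
    title = scrapy.Field()
    date = scrapy.Field()
    desc = scrapy.Field()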

2 answers:

Answer 0 (score: 2)

You should use an item pipeline to export the data from your spider! Here is an example pipeline that exports the data to JSON files:

pipelines.py

# -*- coding: utf-8 -*-

# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os

# project import
from items import tgju  # the answerer's own items module, scanned by get_items() below
from pymongo import MongoClient  # imported in the original answer but unused in this snippet


# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


def get_items(module):
    # yield the names of the item classes defined in the given items module
    md = module.__dict__
    return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))


class JsonPipeline(object):
    def __init__(self):
        self.files = dict()
        self.exporter = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        for key in get_items(tgju):
            path = os.path.join('temp', key)
            if not os.path.exists(path):
                os.makedirs(path)
            self.files[key] = open(os.path.join(path,
                                                '%s_%s_%s.json' % (spider.name,
                                                                   key.lower(),
                                                                   datetime.now().strftime('%Y%m%dT%H%M%S'))),
                                   'w+b')

            self.exporter[key] = JsonItemExporter(self.files[key])
            self.exporter[key].start_exporting()

    def spider_closed(self, spider):
        for key in get_items(tgju):
            self.exporter[key].finish_exporting()
            self.files.pop(key).close()

    def process_item(self, item, spider):

        try:
            log.msg('-----------------%s------------------' % item.__class__.__name__)
            self.exporter[item.__class__.__name__].export_item(item)
        except KeyError:
            pass
        return item
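
With this pipeline enabled, running scrapy crawl shut should write one timestamped JSON file per item class under temp/<ItemName>/ (assuming you swap the items import above for your own project's items module).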

Add this to your settings file (note that the dotted path must match where pipelines.py actually lives; in a standard Scrapy project layout it would be something like 'shut.pipelines.JsonPipeline'):

ITEM_PIPELINES = {
    'pipelines.JsonPipeline': 800,
}

And try to yield each item instead of returning a list.

Update: also change your spider to this...

import scrapy
from scrapy.selector import Selector

from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name ="shut"
    allowed_domains =["mouthshut.com"]
    start_urls =["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self,response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            yield item

Answer 1 (score: 0)

def parse(self, response):
    hxs = HtmlXPathSelector(response)  # requires: from scrapy.selector import HtmlXPathSelector
    sites = hxs.select('//div[@class="reviewtitle fl"]')
    for site in sites:
        item = ShutItem()
        item['title'] = site.select('//strong[@style="  font-size: 15px;font-weight: 700;"]/a/text()').extract()
        #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
        #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
        yield item

This works:

2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
    {'title': [u'Vodafone 3G - Useless in Bangalore',
               u'Worst Mobile Operator Ever',
               u'Worst 3g connectivity of vodafone in bangalore',
               u'Pathetic Network 3G',
               u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
               u'Bad customer service',
               u'Vodafone Kolkata \u2013 My worst ever experience.',
               u'Network connectivity - permanent nemesis',
               u'VODAFONE MOBILE OPERATOR',
               u'Beware of Vodafone billing plans',
               u'Vodafone changed my billing plan without my notice',
               u'Pathetic service.  They deduct balance unnecessari',
               u'Worst service from Vodafone',
               u'Forget Vodafone',
               u'Vodafone Data Services sucks',
               u'Outgoing calls has been barred',
               u'Vodafone Sucks',
               u'Worst Customer satisfaction I have ever Faced',
               u'Untrained Customer Care... Seems like headline de',
               u'3rd Party downloads - shameless way to make money!']}

A few things to note here:
1. yield is much better than building up a list in Scrapy.
2. The li node is not the parent of the strong node.
3. The value of the strong tag's style attribute contains some extra whitespace.
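
Building on points 2 and 3, matching the full inline style string is brittle. A minimal sketch of a more robust parse loop, using a relative XPath (.//) and contains() so the exact whitespace inside the style value no longer matters (the selector strings are assumptions based on the markup described in the answers, not verified against the live page):

def parse(self, response):
    # select each review block, then query relative to it with ".//"
    for site in response.xpath('//div[@class="reviewtitle fl"]'):
        item = ShutItem()
        # contains() tolerates the extra whitespace inside the style value
        item['title'] = site.xpath(
            './/strong[contains(@style, "font-weight: 700")]/a/text()').extract()
        yield item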