我的主要文件:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
class Product(scrapy.Item):
brand = scrapy.Field()
title = scrapy.Field()
link = scrapy.Field()
name = scrapy.Field()
title = scrapy.Field()
date = scrapy.Field()
heading = scrapy.Field()
data = scrapy.Field()
Model_name = scrapy.Field()
class aqaqspider(CrawlSpider):
name = "mouth_shut_new"
allowed_domains = ["mouthshut.com"]
start_urls = ["http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476"
]
rules = (
Rule(
SgmlLinkExtractor(allow=('.*\-page-.*',)),
callback="parse_start_url",
follow=True),
)
def parse_start_url(self, response):
products = response.xpath('//div[@id="allreviews"]/ul/li')
items = []
if not products:
raise CloseSpider("No more products!")
for product in products:
item = Product()
#item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
yield old_item
# yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme" ,
# headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
# callback=self.parse,
# dont_filter=True)
我的settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'mouth'
SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'
我的pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item
我跑了sc scrapy crawl mouth_shut_new
。但是我的数据没有存储在数据库中。在输出中,它应该显示数据存储在mongo和集合名称中。我缺少什么?
答案 0 :(得分:2)
process_item()
方法没有正确缩进,应该是:
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DATABASE']]
self.collection = db[settings['MONGODB_COLLECTION']]
def process_item(self, item, spider):
self.collection.insert(dict(item))
log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
settings['MONGODB_DATABASE'],
settings['MONGODB_COLLECTION'],
settings['MONGODB_HOST'],
settings['MONGODB_PORT']))
return item
答案 1 :(得分:0)
你没有在回调函数中产生该项:callback =" parse_start_url",你应该这样做:
def parse_start_ul(self, response):
...
for product in products:
item = Product()
....
yield item