Seeking to understand Scrapy callbacks

Date: 2015-01-27 03:04:42

Tags: python mysql callback scrapy web-crawler

I am trying to get a grasp of the concept of Scrapy callbacks. I have not been able to find an answer that fits my problem, because I need to yield twice, in two files, and still be able to use a callback.

This is my spider:

import scrapy
import csv

from scrapycrawler.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request


class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["snipplr.com"]

    def start_requests(self):
        #for i in xrange(1000):
        for i in range(1, 1000):
            yield self.make_requests_from_url("http://www.snipplr.com/all/page/%d" % i)

    def parse(self, response):
        for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            return Request(item['link'], meta={'item': item}, callback=self.parse2)
            yield item

    def parse2(self, response):
        for sel in response.xpath('//div[@class="description"]'):
            item = response.meta["item"]
            item['desc'] = sel.xpath('p/text()').extract()
            yield item

This is my pipeline:

import csv
from scrapy.exceptions import DropItem
from scrapy import log
import sys
import mysql.connector

class CsvWriterPipeline(object):

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', user='sq', passwd='rt', db='sq')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # skip items whose title is already in the database
        self.cursor.execute("SELECT title, url FROM items WHERE title = %s",
                            (item['title'][0],))
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            # three placeholders need three columns; "description" is assumed here
            self.cursor.execute(
                "INSERT INTO items (title, url, description) VALUES (%s, %s, %s)",
                (item['title'][0], item['link'][0], item['desc'][0]))
            self.connection.commit()

            log.msg("Item stored: %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
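
For completeness, the pipeline is enabled in the project settings; a minimal sketch, assuming the project package is called scrapycrawler as in the spider's imports above:

# settings.py -- register the pipeline (the module path is assumed from
# the "scrapycrawler" package referenced in the spider's imports)
ITEM_PIPELINES = {
    'scrapycrawler.pipelines.CsvWriterPipeline': 300,  # 300 = run-order priority
}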

I am basically trying to get the data both from the first page and from the page after it, once the first page has been crawled. I am using the Scrapy web crawler and MySQL.

1 Answer:

Answer 0 (score: 0)

You just need to yield a Request instead of returning it:

def parse(self, response):
    for sel in response.xpath('//ol[@class="snippets marg"]/li[1]/h3'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        # attach the partially filled item to the request; parse2 will
        # complete it and yield it, so it is not yielded again here
        yield Request(item['link'], meta={'item': item}, callback=self.parse2)
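
The callback on the other end then pulls the item back out of response.meta, fills in the description, and yields the finished item, which is what finally reaches the pipeline. Restating the question's parse2 with that retrieval made explicit:

def parse2(self, response):
    # retrieve the item that parse() attached to the request
    item = response.meta['item']
    for sel in response.xpath('//div[@class="description"]'):
        item['desc'] = sel.xpath('p/text()').extract()
        yield item

One caveat as an aside: extract() returns a list, so item['link'] would still need indexing (for example item['link'][0]) before it can be passed to Request as a URL.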