为什么我的scrapy不刮任何东西?

时间:2016-07-22 07:11:18

标签: python scrapy

我不知道问题出在哪里可能非常容易解决,因为我不熟悉scrapy。我希望找到一个解决方案。提前谢谢。

我正在使用utnutu 14.04,python 3.4

我的蜘蛛:

import scrapy
from scrapy.linkextractors import LinkExtractor
from name.items import Actress

class ActressSpider(scrapy.Spider):
name = "name_list"
allowed_domains = ["dmm.co.jp"]
start_urls = ["http://actress.dmm.co.jp/-/list/=/keyword=%s/" % c for c in ['a', 'i', 'u', 'e', 'o', 'ka', 'ki', 'ku', 'ke', 'ko', 'sa', 'si', 'su', 'se', 'so', 'ta', 'ti', 'tu', 'te', 'to', 'na', 'ni', 'nu', 'ne', 'no', 'ha', 'hi', 'hu', 'he', 'ho', 'ma', 'mi', 'mu', 'me', 'mo', 'ya', 'yu', 'yo', 'ra', 'ri', 'ru', 're', 'ro', 'wa']]

def parse(self, response):
    for sel in response.xpath('//*[@id="mu"]/table[2]/tr/td[2]/a/@href'):
        url = response.urljoin(sel.extract())
        yield scrapy.Request(url, callback = self.parse_actress_detail)

    next_page = response.xpath('//*[@id="mu"]/table[1]/tr[2]/td[2]/a/@href')
    for urlnext in next_page:
        if urlnext:
            pagination = response.urljoin(urlnext.extract())
        yield scrapy.Request(pagination, callback = self.parse)


def parse_actress_detail(self, response):
    for sel in response.xpath('//*[@id="mu"]/table[1]'):
        item = Actress()
        url = resposne.url
        name = sel.xpath('tr[3]/td/table/tr/td[1]/img/@alt').extract()
        item['name'] = name[0].encode('utf-8')
        item['name_en'] = sel.xpath('tr[3]/td/table/tr/td[1]/img/@src').extract()
        birth = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[1]/td[2]/text()').extract()
        item['birth'] = birth[0].encode('utf-8')
        starsign = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[2]/td[2]/text()').extract()
        item['starsign'] = starsign[0].encode('utf-8')
        bloodtype = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[3]/td[2]/text()').extract()
        item['bloodtype'] = bloodtype[0].encode('utf-8')
        boobs = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[4]/td[2]/text()').extract()
        item['boobs'] = boobs[0].encode('utf-8')
        home = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[5]/td[2]/text()').extract()
        item['home'] = home[0].encode('utf-8')
        hobby = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[6]/td[2]/text()').extract()
        item['hobby'] = hobby[0].encode('utf-8')
        item['image_urls'] = sel.xpath('tr[3]/td/table/tr/td[1]/img/@src').extract()
        request = scrapy.Request(url, callback=self.parse_actress_detail2, meta={'item':item})
        yield request

# another link section of parse's request url
def parse_actress_detail2(self, response):
    for sel in response.xpath('//*[@id="mu"]/table[4]/tr/td[1]/a/@href'):
        url = response.urljoin(sel.extract())
        request = scrapy.Request(url, callback = self.parse_movie_detail, meta={'item':item})
        yield request

    next_page = response.xpath('//*[@id="mu"]/table[5]/tr/td/a/@href')
    for urlnext in next_page:
        if urlnext:
            pagination = response.urljoin(urlnext.extract())
        yield scrapy.Request(pagination, callback = self.parse_actress_detail2)


def parse_movie_detail(self, response):
    for sel in response.xpath('//*[@id="content"]/tr[1]/td[1]'):
        item = response.meta['item']
        release_date = sel.xpath('table/tr[1]/td[2]/text()').extract()
        item['release_date'] = release_date[0].encode('utf-8')
        running_time = sel.xpath('table/tr[2]/td[2]/text()').extract()
        item['running_time'] = running_time[0].encode('utf-8')
        cast = sel.xpath('table/tr[3]/td[2]/a/text()').extract()
        castjoin = [n.encode('utf-8') for n in cast]
        item['cast'] = b', '.join(castjoin)
        series = sel.xpath('table/tr[4]/td[2]/text()').extract()
        item['series'] = series[0].encode('utf-8')
        manufacturer = sel.xpath('table/tr[5]/td[2]/text()').extract()
        item['manufacturer'] = manufacturer[0].encode('utf-8')
        label = sel.xpath('table/tr[6]/td[2]/text()').extract()
        item['label'] = label[0].encode('utf-8')
        number = sel.xpath('//*[@id="cid_block"]/text()').extract()
        item['number']  = number[0].encode('utf-8')
        yield item

日志:

'downloader/request_bytes': 4350197,
'downloader/request_count': 10107,
'downloader/request_method_count/GET': 10107,
'downloader/response_bytes': 169329414,
'downloader/response_count': 10107,
'downloader/response_status_count/200': 9905,
'downloader/response_status_count/301': 202,
'dupefilter/filtered': 3212,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 22, 5, 41, 0, 920779),
'log_count/DEBUG': 203,
'log_count/INFO': 13,
'request_depth_max': 5,
'response_received_count': 9905,
'scheduler/dequeued': 10107,
'scheduler/dequeued/memory': 10107,
'scheduler/enqueued': 10107,
'scheduler/enqueued/memory': 10107,
'spider_exceptions/NameError': 9659,
'start_time': datetime.datetime(2016, 7, 22, 5, 28, 25, 342801)

非常感谢任何帮助。

1 个答案:

答案 0 :(得分:1)

在您的统计数据中,'spider_exceptions/NameError': 9659,看起来很可疑。

我认为问题出在你的parse_actress_detail2回调中。在第一个循环中,未定义item

def parse_actress_detail2(self, response):
    for sel in response.xpath('//*[@id="mu"]/table[4]/tr/td[1]/a/@href'):
        url = response.urljoin(sel.extract())

        request = scrapy.Request(url,
                                 callback = self.parse_movie_detail,
                                 meta={'item':item})
        #                                      ^
        #                                      |
        #                                     here               
        yield request

您可能意味着meta={'item': response.meta['item']}就像在parse_movie_detail中那样。