I don't know where the problem is; it may be something very easy to fix, since I am not familiar with Scrapy. I hope someone can point me to a solution. Thanks in advance.
I am using Ubuntu 14.04 and Python 3.4.
My spider:
```python
class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["www.r18.com/", "r18.com/"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1", ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            item = En_Actress()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            name_link = sel.xpath('a/@href').extract()
            request = scrapy.Request(name_link, callback=self.parse_item, dont_filter=True)
            request.meta['item'] = item
            yield request

        next_page = response.css("#contents > div.main > section > div.cmn-sec-item01.pb00 > div > ol > li.next > a::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse, dont_filter=True)

    def parse_item(self, response):
        item = reponse.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()')
        item['name'] = name[0].encode('utf-8')
        yield item
```
LOG:
```
{'downloader/request_bytes': 988,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 48547,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 25, 6, 46, 36, 940936),
'log_count/DEBUG': 1,
'log_count/INFO': 1,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2016, 7, 25, 6, 46, 35, 908281)}
```
Any help is greatly appreciated.
Answer 0 (score: 0)
It seems there were a few syntax errors. I cleaned the spider up and it appears to work fine for me.
One other modification I made was removing the dont_filter argument from the Request objects, since you don't want to crawl duplicate pages. I also adjusted allowed_domains: entries there should be plain domain names without a scheme or trailing slash, and the original values were causing some requests to be filtered out.
In the future you should post the whole log.
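For reference, the spider_exceptions/TypeError in your log most likely comes from name_link: .extract() returns a list of strings, while scrapy.Request expects a single URL string, so passing the list raises a TypeError. A minimal sketch of the difference (the HTML here is made up purely for illustration):

```python
from scrapy import Request, Selector

# A fake snippet of HTML, just to demonstrate the two extraction methods.
sel = Selector(text='<a href="http://www.r18.com/videos/example/">x</a>')

links = sel.xpath('//a/@href').extract()        # a list of strings
link = sel.xpath('//a/@href').extract_first()   # a single string (or None)

# Request(links) would raise a TypeError: a list is not a valid URL.
req = Request(link)                             # works: url is a str
```

The full cleaned-up spider: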
```python
import scrapy


class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["r18.com"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1", ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            # Collect the image URLs, then follow the link to pick up the name.
            item = dict()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            # extract_first() returns a single string, which is what Request expects.
            name_link = sel.xpath('a/@href').extract_first()
            request = scrapy.Request(name_link, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        # Follow pagination until there is no "next" link.
        next_page = response.css(
            "#contents > div.main > section > div.cmn-sec-item01.pb00 > "
            "div > ol > li.next > a::attr('href')").extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, self.parse)

    def parse_item(self, response):
        item = response.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()').extract_first()
        item['name'] = name.encode('utf-8')
        yield item
```