So I'm having trouble getting logging to work in a function that is two requests down from the starting parse method. Here is the code:
from datetime import datetime
import scrapy
import requests
import re
import os


class ScrapyTest(scrapy.Spider):
    """
    Generic crawler
    """
    name = "test"
    start_urls = [
        'http://www.reddit.com',
    ]

    def __init__(self, *args, **kwargs):
        super(ScrapyTest, self).__init__(*args, **kwargs)

    def parse(self, response):
        """
        Entry point for the crawler
        """
        self.logger.debug('starting off in the parse function')
        yield scrapy.Request(self.start_urls[0], callback=self.parse_hw_post)

    def parse_hw_images(self, image_links):
        self.logger.debug("inside parse_hw_images about to scrapy request parse_hw_image")
        yield scrapy.Request(self.start_urls[0], callback=self.parse_hw_image)

    def parse_hw_image(self, response):
        self.logger.debug('inside ________internal________ parse hw image')
        yield 'test string to yield in to'

    def parse_hw_post(self, response):
        # Save the images to a tmp directory for now
        self.logger.debug('in parse_hw_post')
        self.parse_hw_images('whatever')
The only log output currently shown is starting off in the parse function, followed by inside parse_hw_images about to scrapy request parse_hw_image.
The expected behavior is:
parse
parse_hw_post
parse_hw_images
parse_hw_image
Can anyone see what I'm doing wrong?
Answer 0 (score: 1)
yield scrapy.Request(self.start_urls[0], callback=self.parse_hw_post)
means you are requesting the same URL that has already been crawled, so Scrapy filters the request out as a duplicate (the same applies to the request made in parse_hw_images). Set DUPEFILTER_DEBUG = True to see the duplicate requests in the log.
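As an illustration (this sketch is not from the original answer), the debug setting can be turned on from within the spider via custom_settings, and dont_filter=True makes Scrapy schedule the repeated request anyway:

import scrapy

class ScrapyTest(scrapy.Spider):
    name = "test"
    start_urls = ['http://www.reddit.com']

    # Log every request the duplicate filter drops
    custom_settings = {'DUPEFILTER_DEBUG': True}

    def parse(self, response):
        self.logger.debug('starting off in the parse function')
        # dont_filter=True schedules the request even though this URL
        # has already been crawled
        yield scrapy.Request(self.start_urls[0],
                             callback=self.parse_hw_post,
                             dont_filter=True)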
Answer 1 (score: 0)
parse_hw_images is a generator, so calling it by itself runs no code; the requests it yields have to be passed back out of parse_hw_post:

def parse_hw_post(self, response):
    # Save the images to a tmp directory for now
    self.logger.debug('in parse_hw_post')
    for req in self.parse_hw_images('whatever'):
        yield req
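On Python 3.3+, the same delegation can be written more compactly with yield from (a sketch, not part of the original answer):

def parse_hw_post(self, response):
    self.logger.debug('in parse_hw_post')
    # Delegate to the generator so its Request objects reach the scheduler
    yield from self.parse_hw_images('whatever')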