I set a delay between requests and expected to see its effect, but nothing seems to happen. I set:
DOWNLOAD_DELAY=5
CONCURRENT_REQUESTS=1
CONCURRENT_REQUESTS_PER_IP=1
RANDOM_DOWNLOAD_DELAY=False
I thought that if this worked I would see a 5-second gap between requests, but that is not what happens.
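For reference, a minimal sketch of how these throttling options would normally sit in the project's settings.py (note that the standard Scrapy name for the randomization switch is RANDOMIZE_DOWNLOAD_DELAY; the other values mirror those above):

# settings.py -- minimal sketch of project-wide throttling settings
DOWNLOAD_DELAY = 5                # seconds to wait between requests to the same slot
CONCURRENT_REQUESTS = 1           # at most one request in flight at a time
CONCURRENT_REQUESTS_PER_IP = 1    # per-IP limit; when non-zero, delays are enforced per IP
RANDOMIZE_DOWNLOAD_DELAY = False  # standard spelling; disables the 0.5x-1.5x jitter on the delay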
Here is the spider code:
class Useragent(BaseSpider):
    name = 'useragent'

    settings.overrides['DOWNLOAD_DELAY'] = 5
    settings.overrides['CONCURRENT_REQUESTS'] = 1
    settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = 1
    settings.overrides['RANDOM_DOWNLOAD_DELAY'] = False

    fn_useragents = "utils/useragents.txt"
    fp_useragents = open(fn_useragents, 'rb')
    total_lines = len(fp_useragents.readlines())
    fp_useragents.close()

    if not os.path.isdir("data"):
        os.mkdir("data")
    fn_log = "data/log.txt"
    fp_log = open(fn_log, "ab+")

    def start_requests(self):
        urls = [
            'http://www.dangdang.com',
            'http://www.360buy.com',
            'http://www.amazon.com.cn',
            'http://www.taobao.com'
        ]
        for url in urls:
            ua = linecache.getline(Useragent.fn_useragents, randint(1, Useragent.total_lines)).strip()
            url_headers = {'User-Agent': ua}
            yield Request(url, callback=self.parse_origin, headers=url_headers)

    def parse_origin(self, response):
        current_url = response.url
        headers = response.request.headers
        data_log = current_url
        for k, v in headers.items():
            header = "%s\t%s" % (k, v)
            data_log = "\n".join((data_log, header))
        Useragent.fp_log.write("%s\n" % data_log)
Update
I wrote another spider to check the effect of setting DOWNLOAD_DELAY; here is the code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
import sys, os, time

reload(sys)
sys.setdefaultencoding('utf-8')

class TestCrawl(CrawlSpider):
    name = 'crawldelay'
    start_urls = [
        'http://www.dangdang.com',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.+'),), callback="parse_origin"),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        if not os.path.isdir("data"):
            os.mkdir("data")
        self.fn_log = "data/log.txt"
        self.fp_log = open(self.fn_log, 'ab+')
        settings.overrides['DOWNLOAD_DELAY'] = 60
        settings.overrides['RANDOM_DOWNLOAD_DELAY'] = False
        settings.overrides['CONCURRENT_REQUESTS'] = 1
        settings.overrides['CONCURRENT_REQUESTS_PER_IP'] = 1

    def parse_origin(self, response):
        current_url = response.url
        data_log = "%s\n%s\n\n" % (current_url, time.asctime())
        self.fp_log.write(data_log)
Here is part of the log file I used to check the effect of DOWNLOAD_DELAY:
http://living.dangdang.com/furniture
Mon Aug 27 10:49:50 2012
http://static.dangdang.com/topic/744/200778.shtml
Mon Aug 27 10:49:50 2012
http://survey.dangdang.com/html/2389.html
Mon Aug 27 10:49:50 2012
http://fashion.dangdang.com/watch
Mon Aug 27 10:49:50 2012
https://login.dangdang.com/signin.aspx?returnurl=http://customer.dangdang.com/wishlist/
Mon Aug 27 10:49:50 2012
http://www.hd315.gov.cn/beian/view.asp?bianhao=010202001051000098
Mon Aug 27 10:49:51 2012
https://ss.cnnic.cn/verifyseal.dll?pa=2940051&sn=2010091900100002234
Mon Aug 27 10:49:51 2012
But DOWNLOAD_DELAY seems to have no noticeable effect.
Answer 0 (score: 1)
This is caused by the implementation of dnscache (the delay logic). CONCURRENT_REQUESTS_PER_IP only takes effect from the second request to the same domain onwards. You can override the get() method of LocalCache so that it returns a fixed value, which makes Scrapy treat every request as a request to the same IP.
To test your spider:
from scrapy.utils.datatypes import LocalCache
LocalCache.get = lambda *args:'fake-dummy-domain'
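Combining this patch with the TestCrawl spider from the update would look roughly like the sketch below; wiring the patch in at module import time is my assumption, not something the answer spells out:

# crawldelay spider module -- sketch only; combining the patch with TestCrawl is an assumption
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.datatypes import LocalCache

# every cache lookup now returns one fixed key, so (per the answer above)
# the per-IP bookkeeping sees a single IP and the delay applies to each request
LocalCache.get = lambda *args: 'fake-dummy-domain'

class TestCrawl(CrawlSpider):
    name = 'crawldelay'
    start_urls = ['http://www.dangdang.com']
    rules = (Rule(SgmlLinkExtractor(allow=('.+',)), callback="parse_origin"),)

    def parse_origin(self, response):
        self.log(response.url)  # timestamps in the Scrapy log show the spacing between requests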
Answer 1 (score: 0)
You can only put attribute assignments and method definitions directly in the class body. If you want code that initializes the object, you need to override __init__():
class UseragentSpider(BaseSpider):
    name = 'useragent'
    fn_log = "data/log.txt"
    fn_useragents = "utils/useragents.txt"
    DOWNLOAD_DELAY = 5

    def __init__(self, name=None, **kwargs):
        settings.overrides['CONCURRENT_REQUESTS'] = 1
        settings.overrides['CONCURRENT_REQUESTS_PER_DOMAIN'] = 1
        settings.overrides['RANDOM_DOWNLOAD_DELAY'] = False

        fp_useragents = open(self.fn_useragents, 'rb')
        self.total_lines = len(fp_useragents.readlines())
        fp_useragents.close()

        if not os.path.isdir("data"):
            os.mkdir("data")
        self.fp_log = open(self.fn_log, "ab+")

        # remember to call BaseSpider __init__() since we're overriding it
        super(UseragentSpider, self).__init__(name, **kwargs)

    def start_requests(self):
        urls = ['http://www.dangdang.com',
                'http://www.360buy.com',
                'http://www.amazon.com.cn',
                'http://www.taobao.com',
                ]
        for url in urls:
            ua = linecache.getline(self.fn_useragents, randint(1, self.total_lines)).strip()
            url_headers = {'User-Agent': ua}
            yield Request(url, callback=self.parse_origin, headers=url_headers)

    def parse_origin(self, response):
        headers = response.request.headers
        data_log = response.url
        for k, v in headers.items():
            header = "%s\t%s" % (k, v)
            data_log = "\n".join((data_log, header))
        self.fp_log.write("%s\n" % data_log)
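As a side note, the settings.overrides API used throughout this question belongs to old Scrapy versions. In Scrapy 1.0 and later the same per-spider overrides would normally go in the custom_settings class attribute; a minimal sketch, assuming such a newer version:

# sketch for newer Scrapy (1.0+), where per-spider overrides live in custom_settings
import scrapy

class UseragentSpider(scrapy.Spider):
    name = 'useragent'
    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RANDOMIZE_DOWNLOAD_DELAY': False,  # standard name of the randomization setting
    }

    def start_requests(self):
        for url in ['http://www.dangdang.com', 'http://www.taobao.com']:
            yield scrapy.Request(url, callback=self.parse_origin)

    def parse_origin(self, response):
        self.logger.info("%s %s", response.url, response.request.headers)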