I can't figure out why my pipeline isn't saving files. Here is the code:
import os

from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.http import Request

VIDEOS_DIR = '/home/dmitry/videos'

class VideoDownloadPipeline(MediaPipeline):

    def get_media_requests(self, item, info):
        # schedule the video download through Scrapy's own downloader
        return Request(item['file'], meta={'item': item})

    def media_downloaded(self, response, request, info):
        # called when the download succeeds; write the body to disk
        item = response.meta.get('item')
        video = response.body
        video_basename = item['file'].split('/')[-1]
        new_filename = os.path.join(VIDEOS_DIR, video_basename)
        with open(new_filename, 'wb') as f:
            f.write(video)

    def item_completed(self, results, item, info):
        # keep only the basename on the item once all downloads finish
        item['file'] = item['file'].split('/')[-1]
        return item
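One thing worth noting about item_completed: MediaPipeline hands it a list of (success, result) tuples, one per request returned by get_media_requests, and on failure the result is a Twisted Failure. As written above, the pipeline ignores that flag, so items whose download failed pass through silently. A minimal sketch (assuming the stock MediaPipeline contract; this is not part of the original code) that drops such items:

from scrapy.exceptions import DropItem

def item_completed(self, results, item, info):
    # drop-in replacement for the item_completed above: results is a
    # list of (success, result) tuples, one per request returned by
    # get_media_requests; on failure, result is a twisted Failure
    for success, result in results:
        if not success:
            raise DropItem("download of %s failed: %s" % (item['file'], result))
    item['file'] = item['file'].split('/')[-1]
    return item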
Before this I had different code, but it wasn't concurrent, so parsing had to wait for each video to finish downloading first:
import os
import urllib2

from scrapy.exceptions import DropItem

class VideoDownloadPipeline(object):

    def process_item(self, item, spider):
        video_basename = item['file'].split('/')[-1]
        new_filename = os.path.join(VIDEOS_DIR, video_basename)
        downloaded = False
        # retry the blocking download up to five times
        for i in range(5):
            try:
                video = urllib2.urlopen(item['file']).read()
                downloaded = True
                break
            except Exception:
                continue
        if not downloaded:
            raise DropItem("Couldn't download file from %s" % item)
        with open(new_filename, 'wb') as f:
            f.write(video)
        item['file'] = video_basename
        return item
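For reference, this blocking version could have been made concurrent without switching to MediaPipeline: Scrapy allows process_item to return a Deferred, so the urllib2 call can be pushed into Twisted's thread pool. A hedged sketch of the idea, not the original code (the _download helper name is made up for illustration):

import os
import urllib2

from twisted.internet.threads import deferToThread

class VideoDownloadPipeline(object):

    def process_item(self, item, spider):
        # returning a Deferred lets the reactor (and the rest of the
        # crawl) keep running while the blocking download happens in
        # Twisted's thread pool
        return deferToThread(self._download, item)

    def _download(self, item):
        # hypothetical helper; runs in a worker thread
        video_basename = item['file'].split('/')[-1]
        new_filename = os.path.join(VIDEOS_DIR, video_basename)
        with open(new_filename, 'wb') as f:
            f.write(urllib2.urlopen(item['file']).read())
        item['file'] = video_basename
        return item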
Here is my settings.py:
import os

PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))

BOT_NAME = 'videos_parser'

SPIDER_MODULES = ['videos_parser.spiders']
NEWSPIDER_MODULE = 'videos_parser.spiders'

ITEM_PIPELINES = {
    'videos_parser.pipelines.VideoFileSizePipeline': 300,
    'videos_parser.pipelines.VideoExistingInDBPipeline': 350,
    'videos_parser.pipelines.VideoModeratePipeline': 400,
    'videos_parser.pipelines.VideoDownloadPipeline': 500,
    'videos_parser.pipelines.JsonWriterPipeline': 800,
}

EXTENSIONS = {
    'scrapy.contrib.closespider.CloseSpider': 100,
}

CLOSESPIDER_ITEMCOUNT = 50
DOWNLOAD_TIMEOUT = 60
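Given the retry messages shown below, two settings here are worth a look: the retry middleware gives up after RETRY_TIMES attempts, and DOWNLOAD_TIMEOUT = 60 is tight for multi-megabyte video bodies. A hedged sketch of values to experiment with (the numbers are guesses, not measurements):

# settings.py additions -- hedged guesses, not measured values
RETRY_TIMES = 5          # RetryMiddleware default is 2 retries per request
DOWNLOAD_TIMEOUT = 600   # 60s can be too short for large video files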
UPDATE
I added some log.msg() statements to get_media_requests and media_downloaded, and I can see that get_media_requests is called but media_downloaded is not, because of:
2014-07-23 08:58:20+0400 [xhamster] DEBUG: Retrying <GET http://somesite/video.mp4> (failed 1 times): [<twisted.python.failure.Failure <class 'twisted.internet.error.ConnectionLost'>>]
But I can download this file with a browser.
Answer 0 (score 0):
It turned out I had simply missed the log lines saying that the file was dropped by the spider because of DOWNLOAD_TIMEOUT.
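As a follow-up, those drops can be surfaced instead of hunted for in the crawl log: MediaPipeline also exposes a media_failed(failure, request, info) hook. A minimal sketch (hook name per the 0.24-era MediaPipeline; treat it as illustrative, not the author's code) that logs every failed download inside VideoDownloadPipeline:

from scrapy import log

def media_failed(self, failure, request, info):
    # drop-in method for VideoDownloadPipeline: called instead of
    # media_downloaded when the download fails, so the drop is
    # visible in the log rather than buried among retry messages
    log.msg("video download failed: %s (%s)" % (request.url, failure.value),
            level=log.ERROR)
    return failure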