I want to use Scrapy to download files and navigate the folders at ftp://ftp.co.palm-beach.fl.us/Building%20Permits/.

Here is my spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW
            }
        )

    def parse(self, response):
        print response.body
When I run scrapy crawl latest_permits, I get this error:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
Why am I getting this error even though I supplied the correct username and password?
Answer (score: 1)
Take a look at Scrapy's source code here:
https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/ftp.py

The problem is not your username or password. The problem is that Scrapy's FTP support only covers downloading files; it has no support for listing directories, and the URL you are using points to a directory.
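For reference, the relevant piece of the stock handler (paraphrased from the file linked above, not a verbatim copy) issues an FTP RETR for the requested path, which only works for regular files:

# Paraphrased sketch of scrapy's FTPDownloadHandler.gotClient;
# see the linked source for the real implementation.
def gotClient(self, client, request, filepath):
    self.client = client
    protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
    # retrieveFile() sends RETR; asking for a directory this way fails.
    return client.retrieveFile(filepath, protocol).addCallbacks(
        callback=self._build_response, callbackArgs=(request, protocol),
        errback=self._failed, errbackArgs=(request,))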
You can borrow the directory-listing support from the ftptree package instead. Add a handlers.py to your project with the following code:
import json
from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


class FtpListingHandler(FTPDownloadHandler):
    def gotClient(self, client, request, filepath):
        self.client = client
        protocol = FTPFileListProtocol()
        # Issue LIST instead of RETR so that directory URLs work.
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        # Serialize the parsed listing so the spider can read it as JSON.
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
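For context, Twisted's FTPFileListProtocol parses each line of the LIST reply into a dict (with keys such as filetype, filename, and size), so json.loads(response.body) in the spider below yields a list shaped roughly like this (values here are illustrative, and real entries carry more keys):

[{"filetype": "d", "filename": "2017", "size": 0},
 {"filetype": "-", "filename": "report.csv", "size": 102400}]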
Then register the handler in settings.py (crawlername is a placeholder; use your own project package name, with handlers.py placed inside that package):

DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}
A sample spider:
import os
import json
from urlparse import urlparse

from scrapy import Spider
from scrapy.http.request import Request

from ftptree_crawler.items import FtpTreeLeaf


class AnonFtpRequest(Request):
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpTreeSpider(Spider):
    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                # Directory: recurse into it with another listing request.
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                # Regular file: emit an item describing this leaf.
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
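The spider imports FtpTreeLeaf from the project's items.py. A minimal definition consistent with the three fields set above would be (a sketch; check the linked repo for the original):

from scrapy.item import Item, Field


class FtpTreeLeaf(Item):
    # Fields assumed from how the spider populates the item above.
    filename = Field()
    path = Field()
    size = Field()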
Follow this link if you need more information:
https://github.com/laserson/ftptree/blob/master/ftptree_crawler/
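Finally, the spider takes the path to a JSON config file as a spider argument. Assuming a config.json shaped the way __init__ above expects (the id value here is illustrative):

{"id": 1, "host": "ftp.co.palm-beach.fl.us", "root_path": "Building%20Permits"}

you would run it with:

scrapy crawl ftptree -a config_file=config.json -o leaves.json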