I wrote a Python 2.7 script that saves the content of each requested page to a specific file in the directory where the script lives:
import scrapy
import os
import urllib2

class hTMLCODEaMAZONE(scrapy.Spider):
    name = 'HTMLCODE'

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    st_urls = [
        'https://www.amazon.co.uk/VSafety-12034AN-G-Proceed-Assembly-Portrait/dp/B01NBC6659/ref=sr_1_2096?s=diy&rps=1&ie=UTF8&qid=1491382440&sr=1-2096',
        'https://www.amazon.co.uk/VSafety-13014AF-R-Equipment-Plastic-Square/dp/B01NBC5ZXM/ref=sr_1_2097?s=diy&rps=1&ie=UTF8&qid=1491382440&sr=1-2097',
    ]

    for link in st_urls:
        response = opener.open(link)
        results(response)

    def results(response):
        FNAME = str(response.url)
        s = FNAME.split('/dp/')[1]
        FFNAME = s.split('/ref')[0] + '.html'
        f = open(FFNAME, 'w')
        f.write(str(response.read))
        f.close()
        file_info = os.stat(FFNAME)
        fsize = file_info.st_size
        # yield {
        #     'pagelink': response.url,
        #     'file size': fsize,
        # }
When I run this script in Windows PowerShell, I get the following error:
PS G:\AMAZON FILES> scrapy runspider .\HTMLCODE.py
2017-04-16 12:46:29 [scrapy.utils.log] INFO: Scrapy 1.3.3 started (bot: scrapybot)
2017-04-16 12:46:29 [scrapy.utils.log] INFO: Overridden settings: {'SPIDER_LOADER_WARN_ONLY': True}
Traceback (most recent call last):
  File "c:\python27\lib\runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "c:\python27\lib\runpy.py", line 72, in _run_code
    exec code in run_globals
  File "C:\Python27\Scripts\scrapy.exe\__main__.py", line 9, in <module>
  File "c:\python27\lib\site-packages\scrapy\cmdline.py", line 142, in execute
    _run_print_help(parser, _run_command, cmd, args, opts)
  File "c:\python27\lib\site-packages\scrapy\cmdline.py", line 88, in _run_print_help
    func(*a, **kw)
  File "c:\python27\lib\site-packages\scrapy\cmdline.py", line 149, in _run_command
    cmd.run(args, opts)
  File "c:\python27\lib\site-packages\scrapy\commands\runspider.py", line 80, in run
    module = _import_file(filename)
  File "c:\python27\lib\site-packages\scrapy\commands\runspider.py", line 21, in _import_file
    module = import_module(fname)
  File "c:\python27\lib\importlib\__init__.py", line 37, in import_module
    __import__(name)
  File "G:\AMAZON FILES\HTMLCODE.py", line 5, in <module>
    class hTMLCODEaMAZONE(scrapy.Spider):
  File "G:\AMAZON FILES\HTMLCODE.py", line 17, in hTMLCODEaMAZONE
    response = opener.open(link)
  File "c:\python27\lib\urllib2.py", line 429, in open
    response = self._open(req, data)
  File "c:\python27\lib\urllib2.py", line 447, in _open
    '_open', req)
  File "c:\python27\lib\urllib2.py", line 407, in _call_chain
    result = func(*args)
  File "c:\python27\lib\urllib2.py", line 1241, in https_open
    context=self._context)
  File "c:\python27\lib\urllib2.py", line 1198, in do_open
    raise URLError(err)
urllib2.URLError: <urlopen error EOF occurred in violation of protocol (_ssl.c:661)>
I need to store the values of the FFNAME and fsize variables in a .csv file for my records. I also tried urllib.request.urlopen(), but it still gives this same error.
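For reference, the kind of CSV output I am after would look roughly like the sketch below (the append_record helper and the pages.csv name are only placeholders, not part of my script); it just appends one (file name, file size) row per downloaded page using Python 2.7's csv module:

import csv
import os

def append_record(csv_path, ffname, fsize):
    # Append one (file name, file size) row; write a header row the first time.
    new_file = not os.path.exists(csv_path)
    with open(csv_path, 'ab') as f:  # binary mode for the csv module on Python 2.7
        writer = csv.writer(f)
        if new_file:
            writer.writerow(['file name', 'file size'])
        writer.writerow([ffname, fsize])

# e.g. append_record('pages.csv', FFNAME, fsize) after each page is saved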