我正在尝试使用this解决方案解析网页,如下所示:
from bs4 import BeautifulSoup as bs
import re
import time
import random
----------------------
import socks
import socket
# Can be socks4/5
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5,'127.0.0.1', 9050)
socket.socket = socks.socksocket
# Magic!
def getaddrinfo(*args):
return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]
socket.getaddrinfo = getaddrinfo
----------------------
import urllib2
# define urls
start_url = 'http://www.exmple.com'
# get web page
hdr = request_header()
req = urllib2.Request(start_url)
for key, value in hdr.items():
req.add_header(key, value)
page = urllib2.urlopen(req)
soup = bs(page.read(), 'lxml')
但是我收到了这个错误:
Traceback (most recent call last):
File "soupParse.py", line 159, in <module>
all_r = main()
File "soupParse.py", line 35, in main
page = urllib2.urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
这是标题函数:
# create random request header
def request_header():
# change default User-Agent of the request
user_agent = ['Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0',
'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',]
ua = random.choice(user_agent)
hdr = {'User-Agent': ua,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
return hdr
我对这个话题不太熟悉,所以我很难理解这个问题。请帮忙。谢谢。
更新
我能够确定,此错误仅发生在urllib2
。例如,如果我使用Requests
,则没有错误。我没有把它作为答案,因为我不知道为什么会出现这个问题。如果有人知道,我会很高兴听到它。
祝你好运,快乐的scrapy!
答案 0 :(得分:1)
我强烈建议启动Wireshark并确保您的请求按照您认为的方式进行代理。
BeautifulSoup可能是这里的罪魁祸首,因为它逻辑上会首先导入套接字模块,所以请尝试进行以下导入:
import socks # Import this first no matter what
import socket
import re
import time
import random
from bs4 import BeautifulSoup as bs