我的代码需要从特定的、类似 JSON 的 URL 获取数据(该网页返回的内容并不是纯 JSON,所以才需要这段代码来提取)。当我通过连接 A 请求时,它会返回以下错误:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
self.run()
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 20, in run
main_func(self.counter)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 166, in main_func
total=url_to_dict(url)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 79, in url_to_dict
data = urllib.request.urlopen(url).read().decode('utf-8')
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 510, in error
return self._call_chain(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
有趣的是,当我改用连接 B 获取数据时,它可以正常工作,但在 10000–20000 次迭代之后,我得到了以下错误:
Exception in thread Thread-9:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1106, in request
self._send_request(method, url, body, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1151, in _send_request
self.endheaders(body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1102, in endheaders
self._send_output(message_body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 934, in _send_output
self.send(msg)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 877, in send
self.connect()
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 711, in create_connection
raise err
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 702, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
我在网上花了几个小时搜索连接 B 的这个错误。连接 B 的错误主要是由连接问题或代理引起的。我用几个不同的代理尝试了下面这个解决方案,但它要么不起作用,要么在几千次迭代后给出相同的错误:
# Route plain-HTTP requests through the given proxy, then install the opener
# globally so that subsequent urllib.request.urlopen() calls use it.
proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
出问题的代码部分如下:
class myThread(threading.Thread):
    """Worker thread that runs ``main_func`` with a per-thread counter.

    Args:
        threadID: Identifier stored on the instance as ``threadID``.
        name: Thread name, assigned to the ``Thread.name`` attribute.
        counter: Value handed to ``main_func`` when the thread runs.
    """

    def __init__(self, threadID, name, counter):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Thread entry point: delegate the work to ``main_func``."""
        main_func(self.counter)
def url_to_dict(url, timeout=None):
    """Fetch ``url`` and decode the JSON object embedded in the response body.

    The response is not assumed to be pure JSON: the first ``{...}`` span
    found in the decoded text is extracted with a regular expression and
    only that span is parsed.

    Args:
        url: Address to request.
        timeout: Optional socket timeout in seconds. ``None`` (the default)
            leaves urllib's default timeout behaviour untouched.

    Returns:
        A ``(json_data, total_page)`` tuple, where ``total_page`` is
        ``json_data['data']['totalPage']``.

    Raises:
        urllib.error.URLError: The request failed (``HTTPError`` is a
            subclass, e.g. for a 404 response).
        IndexError: The body contains no ``{...}`` span.
        ValueError: The extracted span is not valid JSON.
        KeyError: The decoded object has no ``data``/``totalPage`` entry.
    """
    # Open the Request object, not the bare URL, so that the User-Agent
    # header is actually transmitted to the server.
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    open_kwargs = {} if timeout is None else {'timeout': timeout}

    # The context manager closes the response (and its socket) even when
    # reading or decoding fails; this matters over thousands of requests.
    with urllib.request.urlopen(request, **open_kwargs) as response:
        data = response.read().decode('utf-8')

    json_type_string = re.findall('({.*})', data)[0]
    json_data = json.loads(json_type_string)
    total_page = json_data['data']['totalPage']
    return json_data, total_page
def _with_page(url, page):
    """Return ``url`` with the value of its ``config[page]`` parameter set to ``page``."""
    page_key = 'config[page]='
    reply_key = '&config[reply'
    value_start = url.find(page_key)
    value_end = url.find(reply_key)
    # str.find returns -1 when a marker is missing; slicing with -1 would
    # silently build a garbled URL, so fail loudly instead.
    if value_start == -1 or value_end == -1:
        raise ValueError(
            "URL must contain %r followed by %r: %s" % (page_key, reply_key, url)
        )
    value_start += len(page_key)
    if value_end < value_start:
        raise ValueError(
            "URL must contain %r followed by %r: %s" % (page_key, reply_key, url)
        )
    return url[:value_start] + str(page) + url[value_end:]


def main_func(counter):
    """Crawl every URL in ``url_list`` and store the parsed pages in MongoDB.

    For each URL the total page count is fetched first; then the pages
    ``counter``, ``counter + 10``, ``counter + 20``, ... are requested,
    parsed with ``get_data`` and stored with ``add_to_mongo`` under the
    matching entry of ``company_list``.

    Args:
        counter: Page offset for this worker.
            NOTE(review): the stride of 10 suggests ten workers with
            counters 0-9 -- confirm against the code that starts the threads.

    Raises:
        ValueError: A URL lacks the ``config[page]=`` / ``&config[reply``
            markers needed to substitute the page number.
    """
    # install_opener replaces the process-wide default opener, so every
    # later urlopen() call (in any thread) goes through this proxy.
    proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    for url, company_name in zip(url_list, company_list):
        total_page = url_to_dict(url)[1]
        for step in range(int(total_page // 10)):
            page_url = _with_page(url, counter + step * 10)
            print(page_url)
            # get_data receives the full (json_data, total_page) tuple
            # returned by url_to_dict.
            data = url_to_dict(page_url)
            parsed_data = get_data(data)
            add_to_mongo(parsed_data, company_name)