以下是我编写的网络抓取程序,用于下载我们学院学生的身份证照片。所有学生图像的URL格式都相同,只需把URL中的ID号替换为记事本文件“ID.txt”中的各个ID号即可。以下是我写的代码:
import csv
import urllib.request

from selenium import webdriver

# Open the student page in a Chrome session driven by the local chromedriver.
# NOTE(review): the download loop below goes through urllib, not this driver,
# so the browser session does not affect the image requests -- confirm whether
# the driver is still needed.
driver = webdriver.Chrome(executable_path=r'C:\Users\user1712\Downloads\Chrome Downloads\chromedriver_win32\chromedriver.exe')
driver.get('https://swd.bits-goa.ac.in/student_pagetemp1?PHPSESSID=ecm2utnjvml8kpkpp8dh2dvnq0')

# ID.txt contains id card numbers of students. Each ID in a separate row
filename = 'ID.txt'
with open(filename) as f:
    data = f.readlines()

reader = csv.reader(data)
for row in reader:
    # csv.reader yields every row as a *list* of fields, so str(row) would
    # produce "['<id>']" rather than the ID itself. Take the first field and
    # skip blank lines, which come back as empty rows.
    if not row:
        continue
    student_id = row[0].strip()
    if not student_id:
        continue

    # url of each student is almost same. Only thing is that we have to change
    # the ID in the url to get the image address of a student
    url = "https://swd.bits-goa.ac.in/css/studentImg/" + student_id + ".jpg"
    fullname = student_id + ".jpg"

    # urlretrieve verifies the server certificate; a CERTIFICATE_VERIFY_FAILED
    # URLError is raised from this call when the chain cannot be validated.
    # Verification is deliberately left enabled here.
    urllib.request.urlretrieve(url, fullname)
以下是我得到的错误 -
Traceback (most recent call last):
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1400, in connect
server_hostname=server_hostname)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 814, in __init__
self.do_handshake()
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 1068, in do_handshake
self._sslobj.do_handshake()
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 689, in do_handshake
self._sslobj.do_handshake()
ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\KAUSTUBH\Downloads\Web scraping\swd trial.py", line 19, in <module>
urllib.request.urlretrieve(url, fullname)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 544, in _open
'_open', req)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\KAUSTUBH\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 1320, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)>
答案 0 :(得分:1)
为了跳过SSL错误,您需要在启动chromedriver时添加选项 --ignore-certificate-errors。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Command-line switch that tells Chrome to ignore certificate errors.
IGNORE_CERT_FLAG = "--ignore-certificate-errors"
# Student page to open once the browser is running.
STUDENT_PAGE_URL = 'https://swd.bits-goa.ac.in/student_pagetemp1?PHPSESSID=ecm2utnjvml8kpkpp8dh2dvnq0'

# Build the Chrome options first so the switch is applied at browser start-up.
chrome_options = Options()
chrome_options.add_argument(IGNORE_CERT_FLAG)

# Launch Chrome with those options and load the page.
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(STUDENT_PAGE_URL)