我写了一个网络爬取程序来从Transfermarkt.de中刮取数据
首先,我从过去10年的20个最大交易中获取数据
# Scrape the top u23 transfer records from transfermarkt.de for seasons
# 2009-2019 (4 result pages per season) into one DataFrame per season.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
df_consolidado = pd.DataFrame()
df = {}            # season -> DataFrame of that season's transfers
temporadas = list(range(2009, 2020))
for t in temporadas:
    print(t)
    # BUG FIX: the accumulator lists must be reset for every season.
    # In the original code they were created once before the loop, so each
    # df[t] contained all previous seasons' rows as well, all tagged with
    # the current season number.
    Jogadores = []
    Valores_Transf = []
    Href = []
    for p in range(1, 5):
        # Load result page p of season t
        page = ("https://www.transfermarkt.de/transfers/transferrekorde/statistik/top/saison_id/"
                + str(t)
                + "/land_id//ausrichtung//spielerposition_id//altersklasse/u23/leihe//w_s//plus/0/galerie/0/page/"
                + str(p))
        print(page)
        pageTree = requests.get(page, headers=headers)
        pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
        # Grab the player links and the transfer-fee cells of this page
        jogador = pageSoup.find_all("a", {"class": "spielprofil_tooltip"})
        valor_transf = pageSoup.find_all("td", {"class": "rechts hauptlink"})
        # ROBUSTNESS: cap at 25 rows as before, but never index past what the
        # page actually contains (the hard-coded range(0, 25) raised
        # IndexError on pages with fewer rows).
        n_linhas = min(25, len(jogador), len(valor_transf))
        for i in range(n_linhas):
            Jogadores.append(jogador[i].text)
            # Fee text looks like "40,00 Mio. €"; loan rows carry
            # "Leihgebühr:" / "Leih-Ende" markers that must be stripped.
            Valores_Transf.append(float(
                valor_transf[i].text.replace('Mio.', '').replace('€', '')
                .replace(',', '.').replace('Leihgebühr:', '')
                .replace('Leih-Ende', '0')))
            Href.append(jogador[i]['href'])
    df[t] = pd.DataFrame({"Temporada": int(t), "Jogador": Jogadores,
                          "Valor Transferência": Valores_Transf, "Ref": Href})
然后,我将所有这些dfs合并:
# Combine the per-season DataFrames into one frame.
# IDIOM: build the list from `temporadas` instead of enumerating all eleven
# years by hand — adding a season to `temporadas` now propagates here.
df = pd.concat([df[t] for t in temporadas])
但是在最后也是最重要的一步,我发现了一些麻烦。通过以下代码,我试图获得更多详细信息:
# For every transfer row, fetch the player's profile page and collect
# height, current age, age at transfer, and the peak market value + its year.
import time

headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
Altura = []
Idade_Atual = []
Idade_Transf = []
Maior_Valor = []
Data_Max_Valor = []
# BUG FIX: Pe_Dominante was appended to below but never initialised,
# raising NameError on the first profile that has a "Foot:" row.
Pe_Dominante = []
for index, row in df.iterrows():
    page = "https://www.transfermarkt.de" + row['Ref']
    # ROBUSTNESS: the server intermittently resets the TLS handshake
    # (WSAECONNRESET -> requests.exceptions.SSLError), which previously
    # aborted the whole crawl. Retry up to 3 times with a short back-off;
    # re-raise on the final failure so list lengths stay in sync with df.
    # Also replaced the effectively-infinite timeout=1000 with 30 s.
    for tentativa in range(3):
        try:
            pageTree = requests.get(page, headers=headers, timeout=30)
            break
        except requests.exceptions.RequestException as e:
            if tentativa == 2:
                raise
            print(f'request to {page} failed: {e} - retrying')
            time.sleep(2 * (tentativa + 1))
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    # Flatten the <th>/<td> pairs of the profile's "auflistung" tables
    dados_agrupados = pageSoup.find_all("table", {"class": "auflistung"})
    dados = []
    tabela = []
    print(page)
    for d in dados_agrupados:
        dados.extend(d.find_all("td"))
        tabela.extend(d.find_all("th"))
    # Walk the header cells and copy the matching data cells.
    # NOTE(review): the labels are English ("Height:", "Age:", "Foot:")
    # although the .de domain is scraped — confirm the profile pages are
    # actually served in English, otherwise none of these branches match.
    for t in range(len(tabela)):
        if tabela[t].text == "Height:":
            if dados[t].text != "N/A":
                # e.g. "1,85 m" -> 1.85
                Altura.append(float(dados[t].text.rstrip(' m').replace(',', '.')))
            else:
                Altura.append(0)
        if tabela[t].text == "Age:":
            Idade_Atual.append(int(dados[t].text))
            # Age the player had in the transfer season, assuming the
            # current age is as of 2020.
            Idade_Transf.append(int((row['Temporada'] - 2020) + int(dados[t].text)))
        if tabela[t].text == "Foot:":
            Pe_Dominante.append(dados[t].text)
    # Peak market value box: index [2] holds "<value> ... <date>".
    # NOTE(review): the magic slices ([-5:-1], [:-12], [:-13]) and the
    # 'Â'/'â'/'¬' replacements compensate for a fixed text layout and a
    # mojibake-encoded '€'; they break silently if the site layout changes.
    dados_agrupados_val = pageSoup.find_all("div", {"class": "right-td"})
    Data_Max_Valor.append(int(dados_agrupados_val[2].text.replace(' ', '')[-5:-1]))
    if "k" in str(dados_agrupados_val[2].text.replace('Mio.', '').replace('€', '').replace(',', '.').replace(' ', '')[:-12]):
        # Value given in thousands ("k") -> convert to millions
        Maior_Valor.append(float(dados_agrupados_val[2].text.replace('Mio.', '').replace('Â', '').replace('â', '').replace('¬', '').replace('k', '').replace('€', '').replace(',', '.').replace(' ', '')[:-12]) / 1000)
    else:
        Maior_Valor.append(float(dados_agrupados_val[2].text.replace('Mio.', '').replace('Â', '').replace('â', '').replace('¬', '').replace('€', '').replace(',', '.').replace(' ', '')[:-13]))
df["Altura"] = Altura
df["Idade_Atual"] = Idade_Atual
df["Idade_Transf"] = Idade_Transf
df["Max_Valor"] = Maior_Valor
df["Data_Max_Valor"] = Data_Max_Valor
# Age at which the peak market value was reached
df["Idade_Max_Valor"] = df["Data_Max_Valor"] - (df["Temporada"] - df["Idade_Transf"])
df
但是我最终遇到以下错误:
---------------------------------------------------------------------------
SysCallError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
455 try:
--> 456 cnx.do_handshake()
457 except OpenSSL.SSL.WantReadError:
~\Anaconda3\lib\site-packages\OpenSSL\SSL.py in do_handshake(self)
1914 result = _lib.SSL_do_handshake(self._ssl)
-> 1915 self._raise_ssl_error(self._ssl, result)
1916
~\Anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
1638 if errno != 0:
-> 1639 raise SysCallError(errno, errorcode.get(errno))
1640 raise SysCallError(-1, "Unexpected EOF")
SysCallError: (10054, 'WSAECONNRESET')
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
342 try:
--> 343 self._validate_conn(conn)
344 except (SocketTimeout, BaseSSLError) as e:
~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
838 if not getattr(conn, 'sock', None): # AppEngine might not have `.sock`
--> 839 conn.connect()
840
~\Anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
343 server_hostname=server_hostname,
--> 344 ssl_context=context)
345
~\Anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
346 if HAS_SNI and server_hostname is not None:
--> 347 return context.wrap_socket(sock, server_hostname=server_hostname)
348
~\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
461 except OpenSSL.SSL.Error as e:
--> 462 raise ssl.SSLError('bad handshake: %r' % e)
463 break
SSLError: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",)
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~\Anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
398 if new_retry.is_exhausted():
--> 399 raise MaxRetryError(_pool, url, error or ResponseError(cause))
400
MaxRetryError: HTTPSConnectionPool(host='www.transfermarkt.de', port=443): Max retries exceeded with url: /bojan-krkic/profil/spieler/44675 (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))
During handling of the above exception, another exception occurred:
SSLError Traceback (most recent call last)
<ipython-input-5-7f98723c208e> in <module>
11 for index, row in df.iterrows():
12 page = "https://www.transfermarkt.de" + row['Ref']
---> 13 pageTree = requests.get(page, headers=headers, timeout=1000)
14 pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
15
~\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
512 if isinstance(e.reason, _SSLError):
513 # This branch is for urllib3 v1.22 and later.
--> 514 raise SSLError(e, request=request)
515
516 raise ConnectionError(e, request=request)
SSLError: HTTPSConnectionPool(host='www.transfermarkt.de', port=443): Max retries exceeded with url: /bojan-krkic/profil/spieler/44675 (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))
有人知道发生了什么吗?
答案 0 :(得分:0)
这似乎是服务器端的问题,而不是您的代码的问题。您得到的 SysCallError 中的错误码是一个 Windows 套接字错误代码。引用 Microsoft Docs:
WSAECONNRESET 10054
连接被对等方重置。
远程主机强行关闭了现有连接。通常的原因是:远程主机上的对端应用程序突然停止、主机重新启动、主机或远程网络接口被禁用,或者远程主机对套接字使用了"硬关闭"(关于远程端 SO_LINGER 选项的更多信息请参阅 setsockopt)。如果由于保持活动(keep-alive)检测到故障而导致一个或多个操作进行中的连接被断开,也可能出现此错误。此时正在进行的操作会以 WSAENETRESET 失败,后续操作则以 WSAECONNRESET 失败。
这可能是暂时的问题。我能够运行您的脚本并发送请求很长时间没有问题。如果在向该服务器发送请求时仍然发生这种情况,请考虑使用以下方法捕获这些异常:
# Catch the transient TLS handshake failure instead of crashing the crawl.
try:
    pageTree = requests.get(page, headers=headers, timeout=1000)
except requests.exceptions.SSLError as e:
    print(f'request to {page} failed: {e}')
    # or retry the request until it succeeds
答案 1 :(得分:0)
问题可能是由您所使用的 pyOpenSSL 版本引起的,请尝试将其更新;如果更新后问题仍然存在,可以在此 open an issue。