Scraping data from the Transfermarkt website

Date: 2020-06-22 18:15:49

Tags: python web-scraping python-requests

I wrote a web scraper to pull data from Transfermarkt.de.

First, I collect the data on the biggest transfers of the past 10 seasons:

import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

df_consolidado = pd.DataFrame()
df = {}
temporadas = list(range(2009, 2020))

for t in temporadas:
    print(t)
    # Reset the lists for each season, so each season's frame
    # only holds that season's transfers
    Jogadores = []
    Valores_Transf = []
    Href = []
    for p in range(1,5):
        # Load page p of season t
        page = "https://www.transfermarkt.de/transfers/transferrekorde/statistik/top/saison_id/" + str(t) + "/land_id//ausrichtung//spielerposition_id//altersklasse/u23/leihe//w_s//plus/0/galerie/0/page/" + str(p)        
        print(page)
        pageTree = requests.get(page, headers=headers)
        pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

        # Grab the transfer data
        jogador = pageSoup.find_all("a", {"class": "spielprofil_tooltip"})        
        valor_transf = pageSoup.find_all("td", {"class": "rechts hauptlink"})

        # Append the information to the lists
        for i in range(0,25):
            Jogadores.append(jogador[i].text)
            Valores_Transf.append(float(valor_transf[i].text.replace('Mio.', '').replace('€', '').replace(',', '.').replace('Leihgebühr:', '').replace('Leih-Ende', '0')))
            Href.append(jogador[i]['href'])

    df[t] = pd.DataFrame({"Temporada":int(t),"Jogador":Jogadores,"Valor Transferência":Valores_Transf, "Ref":Href})

Then, I combine all of these dfs:

# Combine the generated per-season dfs
df = pd.concat([df[2009], df[2010],df[2011], df[2012], df[2013], df[2014],df[2015], df[2016], df[2017], df[2018], df[2019]])
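
The same concat can be written without listing each year by hand; a minimal sketch, assuming Python 3.7+ (where dicts keep insertion order):

# Equivalent: concatenate every per-season DataFrame in one call
df = pd.concat(df.values(), ignore_index=True)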

But in the last and most important step I ran into some trouble. With the following code, I try to get more details for each player:

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

Altura = []
Idade_Atual = []
Idade_Transf = []
Pe_Dominante = []
Maior_Valor = []
Data_Max_Valor = []


for index, row in df.iterrows():
    page = "https://www.transfermarkt.de" + row['Ref']
    pageTree = requests.get(page, headers=headers, timeout=1000)    
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    # Load the object containing the player data
    dados_agrupados = None
    dados_agrupados = pageSoup.find_all("table", {"class": "auflistung"})
    
    dados = []
    tabela = []
    print(page)
    for d in dados_agrupados:
        dados.extend(d.find_all("td"))
        tabela.extend(d.find_all("th"))

        # Check the table structure and copy the data
        for t in range(len(tabela)):            
            if tabela[t].text == "Height:":                
                if dados[t].text != "N/A":
                    Altura.append(float(dados[t].text.rstrip(' m').replace(',', '.')))
                else:
                    Altura.append(0)
                
            if tabela[t].text == "Age:":
                Idade_Atual.append(int(dados[t].text))
                Idade_Transf.append(int((row['Temporada']-2020) + int(dados[t].text)))
            
            if tabela[t].text == "Foot:":
                Pe_Dominante.append(dados[t].text)
                
    
    # Load the object containing the player's highest market value
    dados_agrupados_val = None
    dados_agrupados_val = pageSoup.find_all("div", {"class": "right-td"})    

    Data_Max_Valor.append(int(dados_agrupados_val[2].text.replace(' ', '')[-5:-1]))
    if "k" in str(dados_agrupados_val[2].text.replace('Mio.', '').replace('€', '').replace(',', '.').replace(' ', '')[:-12]):
        Maior_Valor.append(float(dados_agrupados_val[2].text.replace('Mio.', '').replace('Â', '').replace('â', '').replace('¬', '').replace('k', '').replace('€', '').replace(',', '.').replace(' ', '')[:-12])/1000)
    else:
        Maior_Valor.append(float(dados_agrupados_val[2].text.replace('Mio.', '').replace('Â', '').replace('â', '').replace('¬', '').replace('€', '').replace(',', '.').replace(' ', '')[:-13]))
    
df["Altura"] = Altura
df["Idade_Atual"] = Idade_Atual
df["Idade_Transf"] = Idade_Transf           
df["Max_Valor"] = Maior_Valor
df["Data_Max_Valor"] = Data_Max_Valor

# Age at which the maximum market value was reached
df["Idade_Max_Valor"] = df["Data_Max_Valor"] - (df["Temporada"]-df["Idade_Transf"])

df

But I end up with the following error:

---------------------------------------------------------------------------
SysCallError                              Traceback (most recent call last)
~\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    455             try:
--> 456                 cnx.do_handshake()
    457             except OpenSSL.SSL.WantReadError:

~\Anaconda3\lib\site-packages\OpenSSL\SSL.py in do_handshake(self)
   1914         result = _lib.SSL_do_handshake(self._ssl)
-> 1915         self._raise_ssl_error(self._ssl, result)
   1916 

~\Anaconda3\lib\site-packages\OpenSSL\SSL.py in _raise_ssl_error(self, ssl, result)
   1638                     if errno != 0:
-> 1639                         raise SysCallError(errno, errorcode.get(errno))
   1640                 raise SysCallError(-1, "Unexpected EOF")

SysCallError: (10054, 'WSAECONNRESET')

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    599                                                   body=body, headers=headers,
--> 600                                                   chunked=chunked)
    601 

~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    342         try:
--> 343             self._validate_conn(conn)
    344         except (SocketTimeout, BaseSSLError) as e:

~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
    838         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 839             conn.connect()
    840 

~\Anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    343             server_hostname=server_hostname,
--> 344             ssl_context=context)
    345 

~\Anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    346         if HAS_SNI and server_hostname is not None:
--> 347             return context.wrap_socket(sock, server_hostname=server_hostname)
    348 

~\Anaconda3\lib\site-packages\urllib3\contrib\pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    461             except OpenSSL.SSL.Error as e:
--> 462                 raise ssl.SSLError('bad handshake: %r' % e)
    463             break

SSLError: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",)

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
~\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    448                     retries=self.max_retries,
--> 449                     timeout=timeout
    450                 )

~\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    637             retries = retries.increment(method, url, error=e, _pool=self,
--> 638                                         _stacktrace=sys.exc_info()[2])
    639             retries.sleep()

~\Anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    398         if new_retry.is_exhausted():
--> 399             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    400 

MaxRetryError: HTTPSConnectionPool(host='www.transfermarkt.de', port=443): Max retries exceeded with url: /bojan-krkic/profil/spieler/44675 (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))

During handling of the above exception, another exception occurred:

SSLError                                  Traceback (most recent call last)
<ipython-input-5-7f98723c208e> in <module>
     11 for index, row in df.iterrows():
     12     page = "https://www.transfermarkt.de" + row['Ref']
---> 13     pageTree = requests.get(page, headers=headers, timeout=1000)
     14     pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
     15 

~\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     73 
     74     kwargs.setdefault('allow_redirects', True)
---> 75     return request('get', url, params=params, **kwargs)
     76 
     77 

~\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     58     # cases, and look like a memory leak in others.
     59     with sessions.Session() as session:
---> 60         return session.request(method=method, url=url, **kwargs)
     61 
     62 

~\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    531         }
    532         send_kwargs.update(settings)
--> 533         resp = self.send(prep, **send_kwargs)
    534 
    535         return resp

~\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    644 
    645         # Send the request
--> 646         r = adapter.send(request, **kwargs)
    647 
    648         # Total elapsed time of the request (approximately)

~\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    512             if isinstance(e.reason, _SSLError):
    513                 # This branch is for urllib3 v1.22 and later.
--> 514                 raise SSLError(e, request=request)
    515 
    516             raise ConnectionError(e, request=request)

SSLError: HTTPSConnectionPool(host='www.transfermarkt.de', port=443): Max retries exceeded with url: /bojan-krkic/profil/spieler/44675 (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))

Does anyone know what is going on?

2 answers:

Answer 0 (score: 0)

This looks like a problem on the server side, not in your code. The SysCallError code you are getting is a Windows socket error code. Quoting the Microsoft Docs:

WSAECONNRESET 10054

Connection reset by peer.
An existing connection was forcibly closed by the remote host. This normally results if the peer application on the remote host is suddenly stopped, the host is rebooted, the host or remote network interface is disabled, or the remote host uses a hard close (see setsockopt for more information on the SO_LINGER option on the remote socket). This error may also result if a connection was broken due to keep-alive activity detecting a failure while one or more operations are in progress. Operations that were in progress fail with WSAENETRESET. Subsequent operations fail with WSAECONNRESET.

This may be a temporary problem. I was able to run your script and send requests for a long time without any issues. If it keeps happening when you send requests to this server, consider catching those exceptions like this:

try:
    pageTree = requests.get(page, headers=headers, timeout=1000)
except requests.exceptions.SSLError as e:
    print(f'request to {page} failed: {e}')
    # or retry the request until it succeeds
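
If you would rather retry automatically instead of only logging the failure, requests can delegate retries to urllib3; a minimal sketch (the Retry parameters below assume urllib3 v1.x):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 5 times with exponential backoff; connection-level
# failures such as the handshake reset count against 'total'
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])

session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

pageTree = session.get(page, headers=headers, timeout=30)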

Answer 1 (score: 0)

The problem may be caused by the pyOpenSSL version you are using; try updating it with the help of this open issue.
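
To check which pyOpenSSL version is installed before upgrading it (for example with pip install --upgrade pyopenssl), a quick sketch:

# pyOpenSSL is imported as OpenSSL; print the installed version
import OpenSSL
print(OpenSSL.__version__)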