网页抓取 Challonge

时间:2021-02-27 15:27:59

标签: python beautifulsoup

我正在学习如何对一个随机的 Challonge 对阵表使用 BeautifulSoup(作为练习,因为我想开始抓取 Challonge 的对阵表)。

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

r=requests.get('https://smashchile.challonge.com/ss1')  # plain GET: no custom headers and no timeout are passed
webpage= bs(r.content)  # parses the raw response bytes; no parser name is given to BeautifulSoup

但是我遇到了一个错误(见底部)。 这是我第一次网络抓取,我想了解更多关于它的法律限制。

服务条款禁止以下行为:(Link) 未经我们明确书面许可,出于任何目的使用任何机器人、蜘蛛、爬虫或其他自动化方式访问本网站或服务;但是,该规定不适用于搜索引擎的索引或更新。

提前致谢;)


RemoteDisconnected                        Traceback (most recent call last)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    705                 headers=headers,
--> 706                 chunked=chunked,
    707             )

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    444                     # Otherwise it looks like a bug in the code.
--> 445                     six.raise_from(e, None)
    446         except (SocketTimeout, BaseSSLError, SocketError) as e:

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    439                 try:
--> 440                     httplib_response = conn.getresponse()
    441                 except BaseException as e:

/usr/lib/python3.7/http/client.py in getresponse(self)
   1335             try:
-> 1336                 response.begin()
   1337             except ConnectionError:

/usr/lib/python3.7/http/client.py in begin(self)
    305         while True:
--> 306             version, status, reason = self._read_status()
    307             if status != CONTINUE:

/usr/lib/python3.7/http/client.py in _read_status(self)
    274             # sending a valid response.
--> 275             raise RemoteDisconnected("Remote end closed connection without"
    276                                      " response")

RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

ProtocolError                             Traceback (most recent call last)
~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    448                     retries=self.max_retries,
--> 449                     timeout=timeout
    450                 )

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    755             retries = retries.increment(
--> 756                 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    757             )

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    530             if read is False or not self._is_method_retryable(method):
--> 531                 raise six.reraise(type(error), error, _stacktrace)
    532             elif read is not None:

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in reraise(tp, value, tb)
    733             if value.__traceback__ is not tb:
--> 734                 raise value.with_traceback(tb)
    735             raise value

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    705                 headers=headers,
--> 706                 chunked=chunked,
    707             )

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    444                     # Otherwise it looks like a bug in the code.
--> 445                     six.raise_from(e, None)
    446         except (SocketTimeout, BaseSSLError, SocketError) as e:

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/packages/six.py in raise_from(value, from_value)

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    439                 try:
--> 440                     httplib_response = conn.getresponse()
    441                 except BaseException as e:

/usr/lib/python3.7/http/client.py in getresponse(self)
   1335             try:
-> 1336                 response.begin()
   1337             except ConnectionError:

/usr/lib/python3.7/http/client.py in begin(self)
    305         while True:
--> 306             version, status, reason = self._read_status()
    307             if status != CONTINUE:

/usr/lib/python3.7/http/client.py in _read_status(self)
    274             # sending a valid response.
--> 275             raise RemoteDisconnected("Remote end closed connection without"
    276                                      " response")

ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-1-49ffef2d4435> in <module>
      3 import pandas as pd
      4 
----> 5 r=requests.get('https://smashchile.challonge.com/ss1')
      6 webpage= bs(r.content)

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
     74 
     75     kwargs.setdefault('allow_redirects', True)
---> 76     return request('get', url, params=params, **kwargs)
     77 
     78 

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
     59     # cases, and look like a memory leak in others.
     60     with sessions.Session() as session:
---> 61         return session.request(method=method, url=url, **kwargs)
     62 
     63 

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    540         }
    541         send_kwargs.update(settings)
--> 542         resp = self.send(prep, **send_kwargs)
    543 
    544         return resp

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
    653 
    654         # Send the request
--> 655         r = adapter.send(request, **kwargs)
    656 
    657         # Total elapsed time of the request (approximately)

~/Desktop/Programming/JupNbEnv/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    496 
    497         except (ProtocolError, socket.error) as err:
--> 498             raise ConnectionError(err, request=request)
    499 
    500         except MaxRetryError as e:

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

1 个答案:

答案 0 :(得分:0)

要抓取 Challonge,您需要的是请求头(headers)。否则,服务器(正确地)认为您是机器人并拒绝连接。

例如:

import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; per the answer text, the server refuses
# connections from clients it takes to be bots.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
}

# A timeout keeps the script from blocking forever if the server never answers.
response = requests.get(
    'https://smashchile.challonge.com/ss1',
    headers=headers,
    timeout=10,
)
# Raise on 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()

webpage = BeautifulSoup(response.content, "html.parser").select_one(
    ".user-profile-block>.details>*.-nopad"
)
# select_one returns None when the selector matches nothing (for example if
# the page layout changed), so guard before calling getText.
if webpage is None:
    print("No element matched the selector")
else:
    print(webpage.getText(strip=True))

输出:

KLG | DDC | Keen

附注:您正在抓取的网站大部分内容是动态生成的,因此您无法从返回的 HTML 中获得太多信息,因为 bs4 根本看不到这些内容。