''' 我想抓取一个需要登录的网站。我尝试了两种不同的代码写法,但仍然无法成功登录。 '''
import requests
from bs4 import BeautifulSoup
import http.cookiejar
import urllib.request
import urllib.parse
# First attempt: log in with urllib while keeping cookies across requests.
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# Install the opener globally so the urlopen() call below stores cookies in cj.
urllib.request.install_opener(opener)

# NOTE(review): this path uses "orangehrm4.3.1" while the other URLs in this
# file use "orangehrm-4.3.1" -- confirm which directory the server exposes.
authentication_url = (
    'http://127.0.0.1/orangehrm4.3.1/symfony/web/index.php/auth/login'
)

# NOTE(review): no "_csrf_token" field is sent here, unlike the requests-based
# attempt; presumably the login form requires one -- confirm against the form.
payload = {'txtUsername': '<username>', 'txtPassword': '<password>'}
data = urllib.parse.urlencode(payload).encode("utf-8")

# Supplying a data argument turns the request into a POST.
req = urllib.request.Request(authentication_url, data)
# Read the body inside a with-block so the connection is always closed.
with urllib.request.urlopen(req) as resp:
    contents = resp.read()
import requests
from lxml import html
# Placeholder credentials; substitute real values before running.
USERNAME = "<USERNAME>"
PASSWORD = "<PASSWORD>"

# Login form endpoint (fetched first, then posted to).
LOGIN_URL = (
    "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/auth/login"
)
# Page that is requested after the login POST.
URL = "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/dashboard"
def main():
    """Log in through the login form, then print the dashboard's branding div.

    Fetches LOGIN_URL, reads the hidden ``_csrf_token`` input from the form,
    posts the credentials together with that token, requests URL with the
    same session, and prints the ``<div id="branding">`` element found in the
    response (``None`` is printed when no such element exists).

    Raises:
        RuntimeError: if the login page contains no ``_csrf_token`` input.
        requests.HTTPError: if any of the three requests returns an HTTP
            error status.
    """
    # One session for all requests so cookies set by the login are reused.
    with requests.session() as session_requests:
        # Getting login csrf token
        result = session_requests.get(LOGIN_URL)
        result.raise_for_status()
        tree = html.fromstring(result.text)
        tokens = tree.xpath("//input[@name='_csrf_token']/@value")
        if not tokens:
            raise RuntimeError(
                "No _csrf_token input found on the login page: " + LOGIN_URL
            )
        csrf_token = tokens[0]

        payload = {
            "txtUsername": USERNAME,
            "txtPassword": PASSWORD,
            # Send the token that was just read from the form.
            "_csrf_token": csrf_token,
        }
        result = session_requests.post(
            LOGIN_URL,
            data=payload,
            headers={"referer": LOGIN_URL},
        )
        result.raise_for_status()

        # URL to scrape
        result = session_requests.get(URL)
        result.raise_for_status()

    soup = BeautifulSoup(result.text, 'html.parser')
    div = soup.find('div', id='branding')
    print(div)


if __name__ == '__main__':
    main()
''' 使用这两种方法后,我只能获取到登录页面的数据,因此我推测登录并没有成功;而我需要的是登录之后才能访问的页面数据。
如果有人能说明如何使用 Python 和 BeautifulSoup 在登录网站后进行网页抓取,将非常有帮助。 '''
答案 0 :(得分:0)
尝试使用 requests 提供的认证方式,而不是通过表单负载(payload)提交凭据。例如,下面使用 HTTPBasicAuth:
import requests
from requests.auth import HTTPBasicAuth
# Target endpoint and placeholder credentials for the request below.
LOGIN_URL = "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/auth/login"
USERNAME = "<USERNAME>"
PASSWORD = "<PASSWORD>"

# Credentials are passed through requests' auth hook rather than as form
# fields in a payload.
BASIC_AUTH = HTTPBasicAuth(username=USERNAME, password=PASSWORD)
response = requests.get(url=LOGIN_URL, headers={}, auth=BASIC_AUTH)