我正在尝试使用mechanize来抓取一个需要我登录的网站。这是我的代码的开头。
#!/usr/bin/python
#scrape the admissions part of SAFE
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser instance: carries cookies and headers across requests
br = mechanize.Browser()
# Cookie Jar: keeps session cookies so the login survives redirects
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
# Ignore robots.txt (otherwise mechanize may refuse to fetch pages)
br.set_handle_robots(False)
# Follow meta-refresh, but cap the wait at 1s so the script cannot hang
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Present a browser-like User-Agent; some sites reject mechanize's default
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://url')
# View available forms (Python 2 print statement)
for f in br.forms():
print f
这给了我
<POST https://userstuff application/x-www-form-urlencoded
<HiddenControl(lt=LT-227363-Ja4QpRvdxrbQF0nb7XcR2jQDydH43s) (readonly)>
<HiddenControl(execution=e1s1) (readonly)>
<HiddenControl(_eventId=submit) (readonly)>
<TextControl(username=)>
<PasswordControl(password=)>
<SubmitButtonControl(submit=) (readonly)>
<CheckboxControl(warn=[on])>>
我现在如何输入用户名和密码?
我试过
# Select the first (index zero) form on the page
br.select_form(nr=0)
# User credentials (placeholders -- substitute real values)
br.form['username'] = 'username'
br.form['password'] = 'password'
# Submit the selected form to log in
br.submit()
但这似乎不起作用。
答案 0（得分：3）
最后这对我有用
#!/usr/bin/python
#scraper: log in to the site via its CAS-style form, then fetch a protected page
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar: keeps the session cookie so later requests stay logged in
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow meta-refresh but cap the wait so the script cannot hang
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('url1')
# Pick the login form by its id ('fm1') instead of relying on its position
for f in br.forms():
    if f.attrs['id'] == 'fm1':
        br.form = f
        break
# User credentials
# BUG FIX: the original assigned 'password' to the username field and
# 'username' to the password field; the placeholders now match their fields.
br.form['username'] = 'username'
br.form['password'] = 'password'
# Login
br.submit()
#Now we need to confirm again
br.open('https://url2')
# Select the first (index zero) form
br.select_form(nr=0)
# Confirm
br.submit()
# BUG FIX: the original used 'https:url2' (missing '//'), which is not a
# valid absolute URL; it now matches the scheme used above.
print(br.open('https://url2').read())
答案 1（得分：0）
我会查看html表单,而不是机械化给你的。以下是我过去试图填写的表格示例。
<input type="text" name="user_key" value="">
<input type="password" name="user_password">
以下是我使用上述表格登录该网站的代码
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_refresh(False)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# The site we will navigate into, handling it's session
br.open('https://www.website.com/login')
#select the first form
br.select_form(nr=0)
#user credentials
br['user_key'] = 'myusername@gmail.com'
br['user_password'] = 'mypassword'
# Login
br.submit()
link = 'http://www.website.com/url_i_want_to_scrape'
br.open(link)
response = br.response().read()
print response
您的问题可能是您选择了错误的表单，并提供了错误的字段名称。