我想先登录 imdb.com,再访问 IMDb 上的另一个 URL 并抓取一些数据。但是我无法登录,也不知道原因。(我有真实的登录凭据,这里使用的是示例值。)
class QuotesLoginSpider(scrapy.Spider):
    """Log in through the IMDb sign-in page and print the account display name.

    The spider first downloads ``login_url``, submits the sign-in form found
    on that page, and then hands the post-login response to ``parse_quotes``.
    """

    name = 'q'
    login_url = (
        'https://secure.imdb.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.imdb.c'
        'om%2Fap-signin-handler&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&'
        'openid.assoc_handle=imdb_pro_us&openid.mode=checkid_setup&siteState=eyJvcGVuaWQuYXNzb2NfaGFuZGxlIjoiaW1'
        'kYl9wcm9fdXMiLCJyZWRpcmVjdFRvIjoiaHR0cHM6Ly9wcm8uaW1kYi5jb20vIn0&openid.claimed_id=http%3A%2F%2Fspecs.op'
        'enid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0'
    )
    start_urls = [login_url]

    def parse(self, response):
        """Submit the sign-in form contained in the login page.

        Args:
            response: The downloaded login page.

        Yields:
            A form request whose response is handled by ``parse_quotes``.
        """
        # Only the fields the user types are supplied here.
        credentials = {
            'email': 'myemail@gmail.com',
            'password': 'mypassword',
        }
        # from_response() pre-populates every <input> of the form (including
        # hidden ones such as appActionToken) and posts to the form's own
        # action URL. Building the request by hand sent only three fields to
        # the page URL, and passed None as a form value whenever the token
        # lookup found nothing.
        # NOTE(review): this relies on the sign-in form being the first <form>
        # on the page, and on the site not requiring JavaScript-generated
        # fields or a captcha -- confirm against the live page.
        yield scrapy.FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.parse_quotes,
        )

    def parse_quotes(self, response):
        """Open the post-login page in a browser and print the display name.

        Prints ``None`` when no ``<span class="display-name">`` text is found
        in the response.
        """
        open_in_browser(response)
        print(response.xpath("//span[@class='display-name']/text()").get())
我期望登录后能获取到我的用户名,但实际上什么都没有得到。
答案 0 :(得分:0)
您是否尝试过使用 Selenium 库?下面的代码应该可以让您登录,之后可以通过 find_element_by_xpath() 方法查找文本。
# Log in to IMDb by driving a real Chrome browser with Selenium.
# `import selenium` alone does not bind the name `webdriver`, so the
# submodule is imported explicitly.
from selenium import webdriver

# Sign-in URL, assembled from adjacent string literals so it is one valid
# string with no embedded line breaks or spaces.
url = (
    'https://secure.imdb.com/ap/signin?'
    'openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.imdb.com%2Fap-'
    'signin-handler&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select'
    '&openid.assoc_handle=imdb_pro_us&openid.mode=checkid_setup'
    '&siteState=eyJvcGVuaWQuYXNzb2NfaGFuZGxlIjoiaW1kYl9wcm9fdXMiLCJyZWRpcmVjdFRvIjoiaHR0cHM6Ly9wcm8uaW1kYi5jb20vIn0'
    '&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select'
    '&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0'
)

# 'location_of_driver' is a placeholder for the path to the chromedriver binary.
driver = webdriver.Chrome('location_of_driver')
driver.get(url)

# Fill in the credentials (placeholder values) and submit the form.
# NOTE(review): the find_element_by_* helpers belong to the older Selenium
# API; newer releases use driver.find_element(By.ID, ...) -- confirm which
# Selenium version is installed.
driver.find_element_by_id('ap_email').send_keys('username')
driver.find_element_by_id('ap_password').send_keys('password')
driver.find_element_by_id('signInSubmit').click()