I'm trying to use Scrapy to log in to a website in order to scrape data, but it never logs in and I can't figure out why. I used the example code posted at https://doc.scrapy.org/en/latest/topics/request-response.html#using-formrequest-from-response-to-simulate-a-user-login, so I don't know what exactly went wrong.
# Imports from other python files, Scrapy, and the csv module for the file
# containing all URLs/proxies/user agents
import csv

import scrapy
from scrapy.http import FormRequest
from scrapy_splash import SplashRequest

from ..items import NameItem
########################## SPLASHSPIDER.PY OVERVIEW #####################################################
# Processes the csv file so the url + ip address + useragent pairings stay as defined in the file.
# Returns a list of dictionaries, for example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
#    'ip': 'http://204.152.114.244:8050',
#    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
#   ...
# ]
# The spider then scrapes every URL, returning the needed info, and visits all pages
# associated with each URL by clicking the next button.

# Function to read the csv file that pairs URLs with proxies and user agents
def process_csv(csv_file):
    data = []
    reader = csv.reader(csv_file)
    next(reader)  # skip the header row
    # Walk every remaining row of the csv file
    for fields in reader:
        # Set the URL; skip the whole row if the url column is empty
        if fields[0] != "":
            url = fields[0]
        else:
            continue
        # Set the proxy and pair it with the URL; the http scheme and port are
        # added because Splash needs them. Skip the row if the column is empty,
        # otherwise a stale value would leak in from an earlier row.
        if fields[1] != "":
            ip = "http://" + fields[1] + ":8050"
        else:
            continue
        # Set the user agent and pair it with the URL; same skip rule as above
        if fields[2] != "":
            useragent = fields[2]
        else:
            continue
        # Put all three together
        data.append({"url": url, "ip": ip, "ua": useragent})
    # Return the URLs paired with their user agents and proxies
    return data
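# For reference, process_csv assumes the csv file is laid out roughly like this
# (a header row first, since next(reader) skips it); the values below are
# made-up examples, not real proxies:
#
#   url,proxy,useragent
#   http://www.starcitygames.com/buylist/,204.152.114.244,Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11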
class LoginSpider(scrapy.Spider):
    # Name of the spider
    name = 'LoginSpider'

    # Get all of the url + ip address + useragent pairs, then request them
    def start_requests(self):
        # Get the path of the csv file holding the pairs from settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)
            for req in requests:
                # Request each url with a render delay of 3 seconds
                yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                                    # Pair the request with the user agent from the csv file
                                    headers={"User-Agent": req["ua"]},
                                    # Point splash_url at the proxy paired with this URL
                                    # instead of the default Splash url
                                    splash_url=req["ip"],
                                    )
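    # PROXY_CSV_FILE is assumed to be defined in settings.py
    # (e.g. PROXY_CSV_FILE = "proxies.csv"; the filename here is just a placeholder)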
    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'username', 'password': 'password'},
            callback=self.after_login
        )
    def after_login(self, response):
        # Check that the login succeeded before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
        else:
            self.logger.info("Login succeeded!")
            item = NameItem()
            item["Name"] = response.css("div.column_data::text").get()
            item["Email"] = response.css("div#display_email.editable_display::text").get()
            yield item
Answer 0 (score: 0)
You need to change the names of the username and password fields to match what's in the site's HTML (i.e. ex_usr_email and ex_usr_pass), and you also need to tell the request which form to use by including the line formcss='#existing_users form'. Hope this helps.
import scrapy
from ..items import NameItem

class LoginSpider(scrapy.Spider):
    name = "LoginSpider"
    start_urls = ["http://www.starcitygames.com/buylist/"]

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formcss='#existing_users form',
            formdata={'ex_usr_email': 'email123@example.com', 'ex_usr_pass': 'password123'},
            callback=self.after_login
        )
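The snippet above still refers to an after_login callback that isn't shown. A minimal sketch of one, reusing the success check from the question's code (the b"authentication failed" marker is an assumption; match it to whatever the site actually returns on a bad login):

    def after_login(self, response):
        # The marker string below is an assumption; replace it with text the
        # site really shows on a failed login
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        self.logger.info("Login succeeded!")
        item = NameItem()
        item["Name"] = response.css("div.column_data::text").get()
        item["Email"] = response.css("div#display_email.editable_display::text").get()
        yield item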