Struggling to log in to a website using Scrapy

Date: 2019-06-19 20:59:07

Tags: python scrapy

I'm trying to use Scrapy to log in to a website so I can scrape data, but it never logs in and I can't figure out why. I used the example code posted at https://doc.scrapy.org/en/latest/topics/request-response.html#using-formrequest-from-response-to-simulate-a-user-login, so I don't know what exactly is going wrong.
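For reference, the login pattern from that docs page looks roughly like this (the URL, credentials, and failure check are the documentation's placeholders, not mine):

import scrapy

class ExampleLoginSpider(scrapy.Spider):
    name = "example.com"
    start_urls = ["http://www.example.com/users/login.php"]

    def parse(self, response):
        # from_response copies the form's hidden fields from the page,
        # so only the credential fields need to be supplied in formdata
        return scrapy.FormRequest.from_response(
            response,
            formdata={"username": "john", "password": "secret"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # check that the login succeeded before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return

Here is my full spider: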

# Imports from other python files and scrapy, plus the csv module for reading the file of URL/proxy/UA pairings
import csv
import scrapy
#from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.http import FormRequest
#from ..items import GameItem
#from scrapy.http import FormRequest
##########################          SPLASHSPIDER.PY OVERVIEW      #####################################################
# process the csv file so the url + ip address + useragent pairs are the same as defined in the file
# returns a list of dictionaries, example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
#    'ip': 'http://204.152.114.244:8050',
#    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
#    ...
# ]
# this python file also scrapes every URL, returning the needed info, and visits all pages associated with a URL by clicking the next button

# Function to read the csv file that pairs each URL with a proxy and a user agent
def process_csv(csv_file):
    # Initialize data
    data = []
    # Initialize reader
    reader = csv.reader(csv_file)
    # Skip the header row of the csv file
    next(reader)

    # Iterate over each remaining row of the csv file
    for fields in reader:

        # Set URL
        if fields[0] != "":
            url = fields[0]
        else:
            continue # skip the whole row if the url column is empty
        # Set proxy and pair with correct URL
        if fields[1] != "":
            ip = "http://" + fields[1] + ":8050" # adding http and port because this is the needed scheme
        # Set user agent and pair with correct URL
        if fields[2] != "":
            useragent = fields[2]
        # Put all three together
        data.append({"url": url, "ip": ip, "ua": useragent})
    # Return URLs paired with ua and proxy
    return data
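# For context, process_csv expects a three-column file shaped roughly like the
# sketch below (the header names and second row are illustrative; the header
# row is skipped by next(reader), and a blank proxy or user-agent cell reuses
# the value carried over from the previous row):
#
#   url,proxy,user_agent
#   http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan,204.152.114.244,Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11
#   http://www.starcitygames.com/catalog/category/Ixalan,,Mozilla/5.0 (Windows NT 10.0; Win64; x64)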



class LoginSpider(scrapy.Spider):
    # Name of Spider
    name = 'LoginSpider'
    # get all the url + ip address + useragent pairs, then request them

    def start_requests(self):
        # get the file path of the csv file that contains the pairs from the settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)
            for req in requests:
                # Yield the request; Splash waits 3 seconds for the page to render before responding
                yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                    # Pair with user agent specified in csv file
                    headers={"User-Agent": req["ua"]},
                    # Point splash_url at the proxy paired with this URL instead of the default Splash endpoint
                    splash_url=req["ip"],
                    )


    def parse(self, response):
        yield scrapy.FormRequest.from_response(
                response,
                formdata={'username': 'username', 'password': 'password'},
                callback=self.after_login
                )

    def after_login(self, response):
        # check login succeed before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")

        else:
            self.logger.error("Login succeeded!")
            item = NameItem()
            item["Name"] = response.css("div.column_data::text").get()
            item["Email"] = response.css("div#display_email.editable_display::text").get()
            yield item

1 Answer:

Answer 0 (score: 0)

You need to change the username and password field names to match what is in the HTML on the website (i.e. ex_usr_email and ex_usr_pass), and you also need to include the line formcss='#existing_users form'. Hope this helps.

import scrapy
from ..items import NameItem

class LoginSpider(scrapy.Spider):
    name = "LoginSpider"
    start_urls = ["http://www.starcitygames.com/buylist/"]

    def parse(self, response):
        # formcss targets the returning-customers form; without it,
        # from_response fills in the first form it finds on the page
        return scrapy.FormRequest.from_response(
            response,
            formcss='#existing_users form',
            formdata={'ex_usr_email': 'email123@example.com', 'ex_usr_pass': 'password123'},
            callback=self.after_login  # after_login as defined in the question
        )
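If the form selector ever needs double-checking, scrapy shell can confirm what formcss will match and which input names belong in formdata; a quick session like this (output will vary with the live page) is enough:

$ scrapy shell "http://www.starcitygames.com/buylist/"
>>> response.css('#existing_users form')                             # should match exactly one form
>>> response.css('#existing_users form input::attr(name)').getall()  # field names to use in formdata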