When I run this command in cmd:

scrapy crawl quotes -o item.csv -a u=test_user_name -a p=test_passporw_name -a urls=http://books.toscrape.com/

it shows:

raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from scrapy.utils.response import open_in_browser
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class QuotesSpider(InitSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    login_page = 'http://quotes.toscrape.com/login'
    start_urls = ['']
    username = ''
    password = ''

    def __init__(self, u, p, urls):
        self.username = u
        self.password = p
        self.start_urls = urls

    def init_request(self):
        # """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        csrf_token = response.xpath('//*[@name="csrf_token"]//@value').extract_first()
        return FormRequest.from_response(response,
                                         formdata={'csrf_token': csrf_token,
                                                   'username': self.username,
                                                   'password': self.password,
                                                   },
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        # open_in_browser(response)
        # """Check the response returned by a login request to see if we are successfully logged in."""
        if "Logout" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()  # ****THIS LINE FIXED THE LAST PROBLEM*****
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        open_in_browser(response)
Answer 0 (score: 0)
self.start_urls = urls

makes start_urls a string instead of a list, so every character in that string gets interpreted as a URL. Just make start_urls a list and your code will work:

self.start_urls = [urls]
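
For illustration, here's a minimal sketch of the constructor with that fix applied (the *args/**kwargs forwarding to the base class is my addition, following Scrapy's usual pattern for spider arguments, not part of the original code):

def __init__(self, u, p, urls, *args, **kwargs):
    # let the base class finish its own setup
    super(QuotesSpider, self).__init__(*args, **kwargs)
    self.username = u
    self.password = p
    # wrap in a list: a bare string would be iterated character by character
    self.start_urls = [urls]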
Also, you don't need to initialize the variables to dummy values, and you don't need to parse the csrf_token yourself (that happens automatically when you use FormRequest.from_response()).
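
As an illustration of that point, the login callback could drop the token handling entirely; a minimal sketch reusing the method names from the question's code:

def login(self, response):
    # from_response() pre-fills hidden form fields (csrf_token included)
    # from the page's login form, so only the credentials are needed
    return FormRequest.from_response(response,
                                     formdata={'username': self.username,
                                               'password': self.password},
                                     callback=self.check_login_response)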
On a side note, your code looks like it was written for an older version of scrapy; most of these imports have been moved, renamed, or deprecated. You should probably refresh it with a quick read through the docs.
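
For reference, a sketch of where those imports live in current scrapy (1.x module paths; double-check against the docs for your version):

from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor   # replaces SgmlLinkExtractor
from scrapy.spiders import Rule, Spider           # Spider replaces BaseSpider
from scrapy.utils.response import open_in_browser
from scrapy.selector import Selector              # replaces HtmlXPathSelector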
Answer 1 (score: -1)
You should use the loginform library. Try this code:
# -*- coding: utf-8 -*-
import scrapy
from loginform import fill_login_form


class QuotesSpiderSpider(scrapy.Spider):
    name = 'quotes_spider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']
    login_url = 'http://quotes.toscrape.com/login'
    login_user = 'your-username'
    login_password = 'secret-password-here'

    def start_requests(self):
        yield scrapy.Request(self.login_url, self.parse_login)

    def parse_login(self, response):
        data, url, method = fill_login_form(response.url, response.body, self.login_user, self.login_password)
        return scrapy.FormRequest(url, formdata=dict(data), method=method, callback=self.start_crawl)

    def start_crawl(self, response):
        for url in self.start_urls:
            yield scrapy.Request(url)

    def parse(self, response):
        quotes = response.xpath("//div[@class='quote']//span[@class='text']/text()").extract()
        yield {'quotes': quotes}
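
Assuming the spider sits inside a Scrapy project laid out like the tree below, you would run it with something like this (the output filename is just an example):

scrapy crawl quotes_spider -o quotes.json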
My tree structure may help you organize your files:
tree .
.
├── scrapy.cfg
└── scrapy_spider
├── __init__.py
├── items.py
├── middlewares.py
├── pipelines.py
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── settings.cpython-36.pyc
├── settings.py
└── spiders
├── example.py
├── __init__.py
├── __pycache__
│ ├── example.cpython-36.pyc
│ ├── __init__.cpython-36.pyc
│ └── quotes_spider.cpython-36.pyc
└── quotes_spider.py