我正在尝试抓取一个Twitter帐户,以获取其关注者(followers)和好友(following)帐户的数量。抓取是用Scrapy完成的。
我首先发出登录Twitter的请求,但登录失败,日志消息如下:
<404 https://twitter.com/sessions/change_locale>:HTTP状态代码未处理或不允许(HTTP status code is not handled or not allowed)。我在Scrapy上运行的爬虫(spider)如下:
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
class CrawlSpider(InitSpider):
    """Spider that logs in to Twitter before requesting the account pages.

    Flow: ``init_request`` fetches the login page, ``login`` submits the
    login form built from that page, and ``check_login_response`` hands the
    queued ``start_urls`` requests back to Scrapy once the login is confirmed.
    """

    name = 'twitter_friends'
    allowed_domains = ['twitter.com']
    login_page = 'https://www.twitter.com/login'
    # NOTE(review): both entries are the same URL -- confirm whether the
    # second one was meant to point at a different page.
    start_urls = ['https://twitter.com/account/following',
                  'https://twitter.com/account/following']
    # NOTE(review): ``rules`` are applied by scrapy.spiders.CrawlSpider; this
    # class derives from InitSpider, so they are presumably never used --
    # confirm the intended base class.
    rules = (
        Rule(LinkExtractor(allow=r'/+'),
             callback='parse_item', follow=True),
    )

    def init_request(self):
        """Return the request for the login page, handled by ``login``."""
        self.log("called init request...")
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Build the login form submission from the login page response.

        Returns a ``FormRequest`` whose response is passed to
        ``check_login_response``.
        """
        self.log("called login ...")
        # NOTE(review): from_response() submits the first <form> of the page
        # unless formname/formid/formnumber/formxpath selects another one. A
        # 404 on /sessions/change_locale would indicate that a form other than
        # the login form is being posted -- TODO confirm against the page
        # markup, select the login form explicitly and verify the field names.
        return FormRequest.from_response(
            response,
            formdata={'username': 'myUserName', 'password': 'myPassword'},
            callback=self.check_login_response,
        )

    def check_login_response(self, response):
        """Start the crawl when the response shows a logged-in page.

        Returns the result of ``self.initialized()`` on success so that Scrapy
        schedules the pending start requests; returns ``None`` when the login
        marker is missing, in which case nothing more is crawled.
        """
        # Bytes literal: response.body is bytes, so a text literal would raise
        # TypeError on Python 3 (on Python 2 both spellings are equivalent).
        if b"Hi sara" not in response.body:
            # Something went wrong, we couldn't log in, so nothing happens.
            self.log("Bad times :(")
            return None

        self.log("Successfully logged in. Let's start crawling!")
        # The return value must be passed back to Scrapy: initialized()
        # returns the requests that were held back until login completed.
        return self.initialized()

    def parse_item(self, response):
        """Callback named by ``rules``; URL parsing is not implemented yet."""
        # TODO: parse the urls here.
有人能帮忙看看问题出在哪里吗?