我有这段代码:
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from retrieve_image import retrieve_image
class LoginSpider(CrawlSpider):
    """Spider that opens the login page and saves its CAPTCHA image.

    The login page is requested with a fixed ``PHPSESSID`` cookie and the
    response is handed to ``retrieve_image`` to download the CAPTCHA.
    """

    name = 'loginspider'
    login_page = 'http://145.100.108.148/login5/login.php'
    start_urls = ['http://145.100.108.148/login5/index.php']
    # Test credentials; no method of this class submits them yet.
    username = 'test@hotmail.com'
    password = 'test'

    def init_request(self):
        """Return a request for the login page handled by ``login``."""
        # Scrapy calls a callback with the response as its argument, so the
        # callback must be ``login``; ``start_requests`` accepts no response
        # and would raise TypeError when invoked as a callback.
        return Request(url=self.login_page, callback=self.login)

    def start_requests(self):
        """Yield the initial login-page request carrying the session cookie."""
        print ("\n start_request is here \n")
        yield Request(
            url=self.login_page,
            callback=self.login,
            cookies={'PHPSESSID': 'something'},
            # The login page must be fetched even if it was already seen.
            dont_filter=True,
        )

    def login(self, response):
        """Download the CAPTCHA image referenced by the login page."""
        # The previous version read ``execute.split('\n')[0]`` here, but
        # ``execute`` is not defined anywhere, so this callback raised
        # NameError before the image was ever downloaded. The CAPTCHA text
        # can only be known after the image has been retrieved.
        print ("\n Login is here! \n")
        retrieve_image(response, self.login_page)
在此页面上有一个与会话相关联的验证码图像。
我正在尝试通过retrieve_image.py从该网址下载验证码(CAPTCHA)图片。我应该使用同一个会话来执行此操作,但在运行爬虫时,它下载到的是一张空图片,这意味着用的不是同一个会话。我原以为会话是相同的,因为PHPSESSID是相同的。
这是retrieve_image.py:
import bs4
import lxml
import requests
import io
import urllib2
def retrieve_image(page, server, cookies=None):
    """Download the CAPTCHA image referenced by *page* and save it as a PNG.

    The file is written to the current directory and named after the text
    of the ``<div class="test">`` element followed by ``_num0.png``.

    Args:
        page: Response-like object whose ``body`` holds the page HTML.
        server: URL of the page; the image ``src`` is resolved against it.
        cookies: Cookies sent with the image request. Defaults to the
            hard-coded ``PHPSESSID`` value used so far. Pass the cookies of
            the session that fetched *page* so the server treats the image
            request as part of that session.

    Raises:
        ValueError: If the page has no ``id="captcha"`` element with a
            ``src`` attribute.
    """
    # Local import keeps the function usable on both Python 2 and 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    if cookies is None:
        cookies = dict(PHPSESSID='something')

    soup = bs4.BeautifulSoup(page.body, "lxml")
    captcha_tag = soup.find(id="captcha", src=True)
    if captcha_tag is None:
        raise ValueError('no element with id="captcha" and a src attribute')

    # ``attrs`` must be a dict; the former ``{'class:', 'test'}`` was a set.
    label_tag = soup.find('div', {'class': 'test'})
    # Fall back to a fixed prefix instead of failing on ``None.text``.
    prefix = label_tag.text if label_tag is not None else 'captcha'

    # Resolve ``src`` against the page URL; plain concatenation yields a
    # wrong address whenever ``src`` is a relative path.
    image_url = urljoin(server, captcha_tag['src'])
    print(image_url)
    print('\n HERE IS I \n')

    # Fetch the image once: a second request may return a different image
    # than the one whose response was just printed.
    image_response = requests.get(image_url, cookies=cookies)
    print(image_response)
    print('\n')

    file_name = prefix + '_num0.png'
    print(file_name)
    with open(file_name, 'wb') as handler:
        handler.write(image_response.content)
在settings.py中:
COOKIES_ENABLED = True
COOKIES_DEBUG = True
答案 0 :(得分:1)
为什么要使用外部脚本下载验证码图像?
只需使用Scrapy自身的Request来下载图片即可,这样图片请求会与登录页面共用同一个会话(cookie):
def login(self, response):
    """Request the page's CAPTCHA image, if any, through Scrapy itself.

    Yields a single ``Request`` for the image, handled by
    ``saveCaptchaImage``; yields nothing when the page has no ``#captcha``
    element with a ``src`` attribute.
    """
    captcha_src = response.css("#captcha::attr(src)").extract_first()
    if captcha_src is None:
        return
    # ``src`` may be relative; Request needs an absolute URL with a scheme.
    captcha_url = response.urljoin(captcha_src)
    yield Request(captcha_url, callback=self.saveCaptchaImage)
def saveCaptchaImage(self, response):
    """Write the body of *response* to ``captchaImageFile.jpg``."""
    # ``with`` closes the file even when the write raises.
    with open("captchaImageFile.jpg", "wb") as output:
        output.write(response.body)