This is my spider, which I run from a script to parse the content of a local DokuWiki:
import hashlib
import re

import keyring
import scrapy
from scrapy.crawler import CrawlerProcess

DEBUG = True
if DEBUG:
    f_debug = open('debug.log', 'w')

md5s = []


class DokuWikiMd5Spider(scrapy.Spider):
    name = 'dokuwikispider'
    start_urls = ['https://dokuwiki.mjcc.lasil.ru/doku.php']
    visited = []
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

    @staticmethod
    def get_page_name(url):
        url = url.replace("https://dokuwiki.mjcc.lasil.ru/doku.php?", '')
        if 'id=start&do=search' in url:
            # because credentials are in the URL, cut out only the page name, e.g.
            # https://dokuwiki.mjcc.lasil.ru/doku.php?id=start&do=search&id=%D0%BF%D0%BE%D1%81%D1%82%D0%B0%D0%B2%D1%89%D0%B8%D0%BA%D0%B8_%D0%B8_%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D1%8B&q=&p=PASSWORD&u=admin
            m = re.findall('id=([^&]+)', url)
            return m[1]
        else:
            m = re.search('id=([^&]+)', url)
            return m.group(1)

    def parse(self, response):
        password = keyring.get_password('dokuwiki', 'admin')
        return scrapy.FormRequest.from_response(
            response,
            formdata={'u': 'admin', 'p': password},
            callback=self.after_login
        )

    def after_login(self, response):
        # check that login succeeded before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return

        # continue scraping with the authenticated session...
        if DEBUG:
            f_debug.write("parsing: {}\n".format(response.url))
        text = response.text
        # keep only the page content, so the comparison does not depend on wiki settings
        m = re.findall('.*(<!-- wikipage start -->.*<!-- wikipage stop -->).*', text, re.DOTALL)
        text = m[0]
        # with open(r'F:\TEMP\test.html', 'w') as f:
        #     f.write(text)
        md5 = hashlib.md5()
        md5.update(text.encode('utf-8'))
        md5s.append({'url': self.get_page_name(response.url), 'md5': md5.hexdigest()})
        yield {'url': self.get_page_name(response.url), 'md5': md5.hexdigest()}

        for next_page in response.xpath('//a/@href'):
            next_url = next_page.extract()
            if DEBUG:
                f_debug.write("\t?next page: {}\n".format(next_url))
            if 'doku.php?id=' in next_url:
                # process every page name only once
                next_page_name = self.get_page_name(next_url)
                if next_page_name not in self.visited:
                    if DEBUG:
                        f_debug.write("\t\t!\n")
                    self.visited.append(next_page_name)
                    yield response.follow(
                        "https://dokuwiki.mjcc.lasil.ru/{}&u=admin&p={}".format(
                            next_url, keyring.get_password('dokuwiki', 'admin')),
                        self.after_login)


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(DokuWikiMd5Spider)
process.start()  # the script will block here until the crawling is finished
So in the debug messages I can see that the spider crawled the page "wiki_backup":
2019-01-28 19:49:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://dokuwiki.mjcc.lasil.ru//doku.php?id=wiki_backup&u=admin&p=PASSWORD> (referer: https://dokuwiki.mjcc.lasil.ru//doku.php?id=%D1%81%D0%BE%D0%B7%D0%B4%D0%B0%D0%BD%D0%B8%D0%B5_%D0%B8_%D0%BF%D1%80%D0%BE%D0%B2%D0%B5%D1%80%D0%BA%D0%B0_%D0%B1%D1%8D%D0%BA%D0%B0%D0%BF%D0%BE%D0%B2&u=admin&p=PASSWORD)
But that page was never actually parsed, as you can see in "debug.log":
root@F91_Moin20:/home/ishayahu # cat debug.log | grep wiki_backup
?next page: /doku.php?id=wiki_backup
Answer (score: 1):
The problem was in how the spider checks whether authentication failed. It (as in the tutorial) searches the response for the words "authentication failed", but because the content of this particular page happens to contain those same words, the spider assumed the login had failed and stopped processing the page. A different way of checking whether authentication really failed is needed.
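
For example, instead of scanning the whole response body for the phrase "authentication failed", the check could look for something that only appears in an authenticated session. Below is a minimal sketch, under the assumption that the DokuWiki theme renders a logout link (do=logout) when the user is logged in; adjust the marker to whatever your wiki template actually emits:

def after_login(self, response):
    # assumption: an authenticated DokuWiki page contains a logout link;
    # if your template differs, pick another element only logged-in users see
    logged_in = bool(response.xpath('//a[contains(@href, "do=logout")]'))
    if not logged_in:
        self.logger.error("Login failed")
        return
    # ... continue scraping as in the original after_login ...

This keeps the login check independent of the page text, so a wiki page that merely mentions "authentication failed" no longer stops the crawl.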