我正在使用scrapy .24.4,我试图从威吓专家那里获取一些信息并且我几乎得到了它,我可以获取所有页面上的所有信息,除了第一页(或者start_url)。我已经尝试了parse_start_url并添加了规则,但却无法让它发挥作用。我确定它只是我忽略的东西,但是我整个周末都在看它,只是需要休息一下。如果有人有任何建议等,我会很感激哦。我确实让它在start_url中使用了一个范围,但它看起来有点不雅,而且我正在努力学习正确的方法。非常感谢提前!!
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(scrapy.Spider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
yield item
答案 0 :(得分:1)
很多,非常感谢它引导我完成工作的回应!只是使用了错误的类而不是类ThreatExpertSpider(scrapy.Spider),我使用了类ThreatExpertSpider(CrawlSpider):我仍然不完全确定它是如何工作的,但确实如此。我知道RTFM,哈哈,但我正在学习。如果其他人正在寻找这个,那么这对我有用。
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(CrawlSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]
rules = (
Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
)
def parse_start_url(self, response):
print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
return self.parse_items(response)
# urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
# for url in urls:
# url = urlparse.urljoin(response.url, url)
# self.log('Found follow url: %s' % url)
# yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# if item['callback']:
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# else:
# del item['callback']
yield item
答案 1 :(得分:0)
请参考以下代码,这对我有用。如果您有任何疑问,请通过命令更新。
from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
for url in urls:
url = urljoin(response.url, url)
print url
if url:
yield Request(url, callback=self.parse_links)
def parse_links(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
for url in urls:
url = urljoin(response.url, url)
if url:
yield Request(url, callback = self.parse_items)
def parse_items(self, response):
itm=[]
item = MallUk1Item()
hxs = HtmlXPathSelector(response)
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
itm.append(item)
return itm