我正在编写一个刮刀,如果元数据中包含任何给定的关键字,并且包含' htt'在URL中,按照它们并重复该过程两次,因此抓取的深度将为2.这是我的代码:
from scrapy.spider import Spider
from scrapy import Selector
from socialmedia.items import SocialMediaItem
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class MySpider(Spider):
name = 'smm'
allowed_domains = ['*']
start_urls = ['http://en.wikipedia.org/wiki/Social_media']
rules = (
Rule(SgmlLinkExtractor(allow=()), callback="parse_items", follow= True),
)
def parse_items(self, response):
items = []
#Define keywords present in metadata to scrap the webpage
keywords = ['social media','social business','social networking','social marketing','online marketing','social selling',
'social customer experience management','social cxm','social cem','social crm','google analytics','seo','sem',
'digital marketing','social media manager','community manager']
for link in response.xpath("//a"):
item = SocialMediaItem()
#Extract webpage keywords
metakeywords = link.xpath('//meta[@name="keywords"]').extract()
#Compare keywords and extract if one of the defined keyboards is present in the metadata
if (keywords in metaKW for metaKW in metakeywords):
item['SourceTitle'] = link.xpath('/html/head/title').extract()
item['TargetTitle'] = link.xpath('text()').extract()
item['link'] = link.xpath('@href').extract()
outbound = str(link.xpath('@href').extract())
if 'http' in outbound:
items.append(item)
return items
但是我收到了这个错误:
Traceback (most recent call last):
File "C:\Anaconda\lib\site-packages\twisted\internet\base.py", line 1201, in mainLoop
self.runUntilCurrent()
File "C:\Anaconda\lib\site-packages\twisted\internet\base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "C:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Anaconda\lib\site-packages\scrapy\spider.py", line 56, in parse
raise NotImplementedError
exceptions.NotImplementedError:
您能帮我跟踪其网址中包含http的链接吗? 谢谢!
达尼
答案 0 :(得分:1)
这里忽略了这条规则有两个主要原因:
CrawlSpider
,而不是常规Spider
parse_items()
不存在。将parse()
重命名为parse_items()
。答案 1 :(得分:1)
在您的代码中,将class MySpider(Spider):
更改为class Myspider(crawlSpider)
。