我想用 Scrapy 写一个简单的爬虫(spider),从 XML 文件中提取数据。下面是我目前写出的代码:
from scrapy.contrib.spiders import XMLFeedSpider
# Spider for the <note> feed listed in start_urls; parse_node is where the
# fields of each node are extracted.
# NOTE(review): indentation was lost when this snippet was pasted. The lines
# below are presumably the class body and method bodies -- confirm against the
# real file.
class MySpider(XMLFeedSpider):
name = 'testproject'
allowed_domains = ['www.w3schools.com']
start_urls = ['http://www.w3schools.com/xml/note.xml']
# Tag name used by the XPath-free node iteration; also interpolated into the
# log message in parse_node below.
itertag = 'note'
# First parse_node: reads the text of each child element into local names.
# None of these locals is used or returned afterwards.
def parse_node(self, response, node):
to = node.select('to/text()').extract()
# BUG: `from` is a reserved keyword in Python and cannot be an assignment
# target. This is the exact line the SyntaxError in the traceback points at.
# Rename the variable (e.g. `sender`).
from = node.select('from/text()').extract()
heading = node.select('heading/text()').extract()
body = node.select('body/text()').extract()
# Second definition with the same name. NOTE(review): if both are methods of
# MySpider, this one replaces the first, so the first is dead code -- confirm
# which version is intended and delete the other.
def parse_node(self, response, node):
# NOTE(review): `log` and `Item` are not imported in the visible snippet;
# presumably they are imported elsewhere -- verify.
log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
item = Item()
# Unlike the first definition, these XPaths select the elements themselves
# rather than their text() children.
item['to'] = node.select('to').extract()
# 'from' as a string key is fine; only the bare name `from` is a keyword.
item['from'] = node.select('from').extract()
item['heading'] = node.select('heading').extract()
item['body'] = node.select('body').extract()
return item
这是数据集: http://www.w3schools.com/xml/note.xml
遗憾的是,当我尝试运行它时,它无法工作。我猜问题出在我映射标签的方式上。下面是错误信息:
File "/usr/local/bin/scrapy", line 11, in <module>
sys.exit(execute())
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/local/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 57, in run
crawler = self.crawler_process.create_crawler()
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/var/www/spider/crawler/spiders/jobsite.py", line 11
from = node.select('from/text()').extract()
^
SyntaxError: invalid syntax
任何帮助都将不胜感激。
答案 0 :(得分:0)
将变量名 `from` 改成别的单词(例如 `sender`),因为 `from` 在 Python 中是保留关键字,不能用作变量名。