import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from Erowid.items import ErowidItem
import os
class ExperiencesSpider(CrawlSpider):
name = "experiences"
allowed_domains = ["www.erowid.org"]
start_urls = ['https://www.erowid.org/experiences/exp_list.shtml']
rules = [Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')),callback='parse_item', follow = True)
Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')), follow = True)
]
def parse_item(self, response):
filename = str(response.url)[44:-6]
selectors = response.css('table')
if not os.path.exists('drugs-%s' % (filename)): ##Make the file
os.makedirs('drugs-%s' % (filename))
list_of_experience = selectors.xpath('//table[@class="exp-cat-table"]/tr/td/a/@href').extract()
for item in list_of_experience:
request_url = str(item)
Request(url="http://www.erowid.org" + request_url, callback = 'request_experience')
def request_experience(self, response):
selectors = response.css('div')
for selector in selectors:
experience = ErowidItem()
experience['Author'] = selector.xpath('//div[@class="author"]/a/text()').extract()
experience['Title'] = selector.xpath('//div[@class="title"]/text()').extract()
experience['Substance'] = selector.xpath('//div[@class="substance"]/text()').extract()
experience['Text'] = selector.xpath("//div[@class = 'report-text-surround']/text()").extract()
title = str(experience['Substance']) + " "+ str(experience['Title'])
with open(os.path.join('drugs-%s' % (filename), title),"a") as fid:
fid.write(str(experience) + "\n")
我正在尝试使用scrapy从Erowid中抓取数据,我正在尝试格式化数据,以便对于每种物质我都有一个以“物质 - 经验标题”形式命名的文件。
我的规则让我的蜘蛛爬过包含https://www.erowid.org/experiences/subs/exp_Acacia_confusa.shtml的网站列表。然后,我获得了体验的所有链接,并通过第二次请求将其放入此次旨在从体验中收集数据。
我打算以上面提到的格式存储数据,即“实质 - 经验标题”。对于每种物质,我想制作一个充满该页面文件的目录。
但是,我的代码会生成目录,但不会写出我想要的文件。
导致此错误的原因是什么?
答案 0 :(得分:0)
根据documentation of scrapy.http.Request
-
回调(可调用) - 将使用此请求的响应(一旦下载)调用的函数作为其第一个参数。
回调应该是可调用函数,而不是它的字符串,在尝试将其作为Request
对象的回调函数发送之前,您还需要定义函数。
示例 -
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from Erowid.items import ErowidItem
import os
class ExperiencesSpider(CrawlSpider):
name = "experiences"
allowed_domains = ["www.erowid.org"]
start_urls = ['https://www.erowid.org/experiences/exp_list.shtml']
rules = [Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')),callback='parse_item', follow = True)
Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')), follow = True)
]
def request_experience(self, response):
selectors = response.css('div')
for selector in selectors:
experience = ErowidItem()
experience['Author'] = selector.xpath('//div[@class="author"]/a/text()').extract()
experience['Title'] = selector.xpath('//div[@class="title"]/text()').extract()
experience['Substance'] = selector.xpath('//div[@class="substance"]/text()').extract()
experience['Text'] = selector.xpath("//div[@class = 'report-text-surround']/text()").extract()
title = str(experience['Substance']) + " "+ str(experience['Title'])
with open(os.path.join('drugs-%s' % (self.filename), title),"a") as fid:
fid.write(str(experience) + "\n")
def parse_item(self, response):
self.filename = str(response.url)[44:-6]
selectors = response.css('table')
if not os.path.exists('drugs-%s' % (self.filename)): ##Make the file
os.makedirs('drugs-%s' % (self.filename))
list_of_experience = selectors.xpath('//table[@class="exp-cat-table"]/tr/td/a/@href').extract()
for item in list_of_experience:
request_url = str(item)
Request(url="http://www.erowid.org" + request_url, callback = self.request_experience)