I have a Scrapy project called crawler.
My spider looks like this:
import os
import re

import scrapy

# CrawlerUrls, CrawlerItem and remove_stopwords are defined elsewhere
# in the project (their imports are omitted here)

class WikiSpider(scrapy.Spider):
    name = "wiki-spider"

    def start_requests(self):
        for url in CrawlerUrls().wiki:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # first part: save the page's HTML to the bigdata directory
        page = response.url.split("/")[4]
        page_dirname = 'bigdata'
        filename = '%s.html' % page
        with open(os.path.join(page_dirname, filename), 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        # second part: extract text for the item for the document corpus
        item = CrawlerItem()
        item['url'] = response.url
        item['title'] = response.css('h1::text').extract_first()
        item['text'] = response.xpath('//div[@id="mw-content-text"]//text()')\
            .extract()
        tags_list = [response.url.split("/")[2],
                     response.url.split("/")[3]]
        more_tags = [x.lower() for x in
                     remove_stopwords(response.url.split("/")[4].split("_"))]
        for tag in more_tags:
            tag = re.sub('[^a-zA-Z]', '', tag)  # keep letters only
            tags_list.append(tag)
        item['tags'] = tags_list
        return item
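For context, CrawlerUrls and CrawlerItem live elsewhere in the project; they look roughly like this (a sketch, not the exact files; the URL shown is just an example of the kind of Wikipedia page I crawl):

import scrapy

class CrawlerUrls:
    def __init__(self):
        # lists of start URLs, one attribute per site
        self.wiki = [
            'https://en.wikipedia.org/wiki/Natural_language_processing',
            # ... more Wikipedia pages
        ]

class CrawlerItem(scrapy.Item):
    # the four fields the spider fills in
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    tags = scrapy.Field()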
When I run it, I get this error:
File "c:\users\oerha\anaconda3\envs\nlp\lib\site-packages\scrapy\spiderloader.py", line 71, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: spider'
I looked at another question and it seems like I've done everything right.
Here are my settings:
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
run-spider.py:

import scrapy  # object-oriented framework for crawling and scraping
import os      # operating system commands
page_dirnames = ['wiki']
for name in page_dirnames:
    if not os.path.exists(name):
        os.makedirs(name)
# function for walking and printing the directory structure
def list_all(current_directory):
    for root, dirs, files in os.walk(current_directory):
        level = root.replace(current_directory, '').count(os.sep)
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
# examine the directory structure
current_directory = os.getcwd()
list_all(current_directory)
# list the available spiders
print('\nScrapy spider names:\n')
os.system('scrapy list')
# for JSON lines we use this command
os.system('scrapy crawl spider -o items.jl')
print('\nJSON lines written to items.jl\n')
# run the scraper exporting results as a dictionary XML text file items.xml
# os.system('scrapy crawl quotes -o items.xml')
I run the file with python run-spider.py.
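If it helps, I'd also be open to running the spider in-process instead of shelling out with os.system; my understanding is that something like the sketch below would work (the import path for WikiSpider is a guess, since I'd have to import it from wherever it lives under crawler/spiders):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawler.spiders.wiki_spider import WikiSpider  # hypothetical module path

process = CrawlerProcess(get_project_settings())  # picks up settings.py via scrapy.cfg
process.crawl(WikiSpider)  # passing the class avoids the name lookup entirely
process.start()  # blocks until the crawl finishes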