I wrote a simple web crawler using Scrapy. My test code is below.
import scrapy

class ExtractUrls(scrapy.Spider):
    name = "extract"

    # Request function
    def start_requests(self):
        urls = ['https://www.iitdh.ac.in/events.php']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    # Parse function
    def parse(self, response):
        # Extra feature to get title
        title = response.css('title::text').extract_first()
        # Get anchor tags
        links = response.css('a::attr(href)').extract()
        for link in links:
            yield
            {
                'title': title,
                'links': link
            }
# items.py
import scrapy

class GfgItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    title = scrapy.Field()
# settings.py
BOT_NAME = 'gfg'

SPIDER_MODULES = ['gfg.spiders']
NEWSPIDER_MODULE = 'gfg.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'gfg (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
When I run $ scrapy crawl extract -o links.json -t json, I expect the output to be stored in the links.json file, but links.json remains empty. Why?
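For context, Scrapy's JSON feed exporter serializes all yielded items into a single JSON array, so after a successful crawl links.json would look roughly like this (titles and hrefs are illustrative, not taken from an actual run):

    [
        {"title": "Events | IIT Dharwad", "links": "index.php"},
        {"title": "Events | IIT Dharwad", "links": "contact.php"}
    ]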
Answer 0 (score: 0)
Try using GfgItem():
# import the item class from the project's items.py
from gfg.items import GfgItem

.....

for link in links:
    item = GfgItem()
    item["title"] = title
    item["name"] = link
    yield item
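The underlying problem in the original parse method is worth spelling out: yield sits on its own line, so it is a bare yield that produces None (which Scrapy silently discards), and the dict literal on the next line is a separate expression statement that is never yielded. An alternative fix, keeping the plain-dict approach without an Item class, is simply to make the dict part of the yield statement. A minimal sketch, assuming the same selectors as the question:

    # Corrected parse method: the dict must be part of the yield
    # statement itself so each item is actually emitted.
    def parse(self, response):
        title = response.css('title::text').extract_first()
        # Yield one item per anchor href found on the page
        for link in response.css('a::attr(href)').extract():
            yield {
                'title': title,
                'links': link,
            }

With either fix, scrapy crawl extract -o links.json should populate the file; the format is inferred from the .json extension, so -t json is redundant, and note that -o appends to an existing file. If links.json is still empty after that, check the crawl log: with ROBOTSTXT_OBEY = True, requests disallowed by the site's robots.txt are dropped, which also results in an empty output file.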