我从多个目录页面中收集了链接,然后把它们保存在变量 links 中,并将其传给 start_urls:
import scrapy
class SplashSpider(scrapy.Spider):
    """Spider that reads its start URL from ``text.txt`` at class-definition time.

    The complete text of the file is stored as the one and only element of
    ``start_urls``; the file content is not split into separate entries.
    """

    name = 'spide'

    # Read the whole file as a single string, then release the handle.
    f = open('text.txt')
    links = f.read()
    f.close()

    # A one-element list holding the entire file content.
    start_urls = [str(links)]

    def parse(self, response):
        """Yield one item with the stripped titles and description blocks.

        ``Title`` holds the text nodes selected by ``.title::text`` and
        ``Main Info`` holds the extracted ``div#desc`` selections, each with
        leading and trailing whitespace removed.
        """
        raw_titles = response.css('.title::text').extract()
        raw_descriptions = response.css("div#desc").extract()
        yield {
            'Title': [text.strip() for text in raw_titles],
            'Main Info': [block.strip() for block in raw_descriptions],
        }
但是我遇到了如下错误:`scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme`
我的text.txt文件:
'https:// url1.com','https:// url2.com', ... , 'https:// url300000.com', 'https:// url300001.com'
答案 0 :(得分:0)
import scrapy
def _parse_start_urls(raw):
    """Return the list of URLs contained in the text of the URL file.

    Two layouts are accepted, line by line:

    * quoted, comma-separated entries such as
      ``'https:// a.com','https:// b.com'`` -- every quoted entry is one URL;
    * a bare URL on its own line.

    Whitespace inside an entry (e.g. ``https:// a.com``) is removed, and
    entries that end up empty are skipped so that blank lines never become
    empty URLs.

    Args:
        raw: Full text content of the URL file.

    Returns:
        A list of cleaned URL strings, in file order.
    """
    import re

    urls = []
    for line in raw.splitlines():
        # Prefer quoted entries; text between them (commas, "...") is ignored.
        quoted = [match[1] for match in re.findall(r"""(['"])(.*?)\1""", line)]
        for candidate in quoted or [line]:
            # Drop every whitespace character, including ones inside the URL.
            url = ''.join(candidate.split())
            if url:
                urls.append(url)
    return urls


class SplashSpider(scrapy.Spider):
    """Spider whose start URLs are loaded from ``text.txt``.

    The file is read once, when the class body is executed, and each URL
    found in it becomes a separate entry of ``start_urls``.
    """

    name = 'spider'

    with open('text.txt') as f:
        links = _parse_start_urls(f.read())
    start_urls = links

    def parse(self, response):
        """Yield one item with the stripped titles and description blocks.

        ``Title`` holds the text nodes selected by ``.title::text`` and
        ``Main Info`` holds the extracted ``div#desc`` selections, each with
        leading and trailing whitespace removed.
        """
        title = response.css('.title::text').extract()
        description = response.css("div#desc").extract()
        yield {
            'Title': [text.strip() for text in title],
            'Main Info': [block.strip() for block in description],
        }