我试图从这个网站上抓取所有CSV:transparentnevada.com
当您导航到特定代理机构http://transparentnevada.com/salaries/2016/university-nevada-reno/,然后点击下载记录时,会出现指向多个CSV的链接。我想下载所有的CSV。
我的蜘蛛运行并且似乎抓取所有记录但是没有下载任何内容:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
class Spider2(CrawlSpider):
#name of the spider
name = 'nevada'
#list of allowed domains
allowed_domains = ['transparentnevada.com']
#starting url for scraping
start_urls = ['http://transparentnevada.com/salaries/all/']
rules = [
Rule(LinkExtractor(
allow=['/salaries/all/*']),
follow=True),
Rule(LinkExtractor(
allow=['/salaries/2016/*/']),
follow=True),
Rule(LinkExtractor(
allow=['/salaries/2016/*/#']),
callback='parse_article',
follow=True),
]
#setting the location of the output csv file
custom_settings = {
'FEED_FORMAT' : "csv",
'FEED_URI' : 'tmp/nevada2.csv'
}
def parse_article(self, response):
for href in response.css('div.view-downloads a[href$=".csv"]::attr(href)').extract():
yield Request(
url=response.urljoin(href),
callback=self.save_pdf
)
def save_pdf(self, response):
path = response.url.split('/')[-1]
self.logger.info('Saving CSV %s', path)
with open(path, 'wb') as f:
f.write(response.body)