I want to recursively scrape Google Scholar authors (starting from an initial author and then finding all of their co-authors), stopping once a maximum count is reached. The code below scrapes all the author names from a given starting URL. But how can I follow the links to the newly found authors, make the crawl recursive, and then stop after collecting about 1000 unique authors?
# scrapy.contrib and SgmlLinkExtractor are deprecated; use the current modules
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class GoogleScholarSpider(CrawlSpider):
    # name of the spider
    name = 'googlescholar'

    # allowed domains must be bare host names, not URLs
    allowed_domains = ['scholar.google.com']

    # starting URL for scraping
    start_urls = ['https://scholar.google.com/citations?user=IT-vb_kAAAAJ&hl=en&oi=sra']

    # location of the output CSV file
    custom_settings = {
        'FEED_URI': 'tmp/gscholar.csv'
    }
    # NOT WORKING as originally written: CrawlSpider uses parse() internally,
    # so the rule's callback must have a different name (renamed below)
    rules = (Rule(LinkExtractor(), callback='parse_author', follow=True),)
    def parse_author(self, response):
        # remove XML namespaces so the XPath below can match
        response.selector.remove_namespaces()
        # get the text inside each author-name tag
        scholar_names = response.xpath('//body/div/div/div/div/div/div/ul/li/div/span/a/text()').extract()
        for name in scholar_names:
            yield {'scholar_name': name}
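
One way to make the crawl recursive with a hard cap is to track the user IDs already visited in a set and raise CloseSpider once roughly 1000 unique authors have been collected. The sketch below is one possible approach, not a verified recipe: it assumes a plain scrapy.Spider (recursion via response.follow instead of CrawlSpider rules), and the MAX_AUTHORS constant, the seen_user_ids set, and both XPath expressions are illustrative assumptions about the current Google Scholar page layout.

from urllib.parse import parse_qs, urlparse

import scrapy
from scrapy.exceptions import CloseSpider


class GoogleScholarAuthorsSpider(scrapy.Spider):
    name = 'googlescholar_authors'
    allowed_domains = ['scholar.google.com']
    start_urls = ['https://scholar.google.com/citations?user=IT-vb_kAAAAJ&hl=en&oi=sra']
    custom_settings = {'FEED_URI': 'tmp/gscholar.csv'}

    MAX_AUTHORS = 1000     # stop after roughly this many unique authors
    seen_user_ids = set()  # user IDs already yielded, for de-duplication

    def parse(self, response):
        # identify the author by the "user" query parameter of the profile URL;
        # this is more reliable than URL-level deduplication, since the same
        # profile can appear with different extra parameters (hl, oi, ...)
        user_id = parse_qs(urlparse(response.url).query).get('user', [None])[0]
        if user_id is None or user_id in self.seen_user_ids:
            return
        if len(self.seen_user_ids) >= self.MAX_AUTHORS:
            # hard stop once the cap is reached
            raise CloseSpider('reached the maximum number of unique authors')
        self.seen_user_ids.add(user_id)

        yield {
            'user_id': user_id,
            # assumed XPath for the profile-name element on an author page
            'scholar_name': response.xpath('//div[@id="gsc_prf_in"]/text()').get(),
        }

        # follow every profile link on the page (co-authors etc.); this XPath
        # is an assumption about the markup and may need adjusting
        for href in response.xpath('//a[contains(@href, "citations?user=")]/@href').getall():
            yield response.follow(href, callback=self.parse)

If an approximate cut-off on scraped items is enough, Scrapy's built-in CLOSESPIDER_ITEMCOUNT setting (e.g. 'CLOSESPIDER_ITEMCOUNT': 1000 in custom_settings) closes the spider without any manual bookkeeping, though it does not deduplicate authors by itself.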