I have been trying to scrape information about the articles on https://academic.oup.com/ilarjournal with the following code:
import datetime
from urllib.parse import urljoin  # on Python 2 this would be: from urlparse import urljoin

import scrapy
from scrapy import Request

# PropertiesItem is the item class defined in the project's items.py (not shown here)


class BasicSpider(scrapy.Spider):
    name = 'ILAR'

    def start_requests(self):
        start_urls = ['https://academic.oup.com/ilarjournal/issue-archive']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse_item(self, response):
        item = PropertiesItem()
        item['authors'] = response.xpath("//*[contains(@class,'linked-name')]/text()").extract()
        self.log("authors %s" % item['authors'])
        articleTags = response.xpath("//*[@id='ContentTab']/div[1]/div/div//p/text()").extract()
        article = ''.join(articleTags)
        #self.log('ARTICLE TEXT IS: ' + article)
        textFileTitle = response.xpath('//*[@id="ContentColumn"]/div[2]/div[1]/div/div/h1/text()').extract()
        fileTitle = ''.join(textFileTitle)
        pureFileTitle = fileTitle.replace('\n', '').replace(' ', '').replace('\r', '')
        self.log("TEXT TITLE: " + pureFileTitle)
        item['title'] = pureFileTitle
        self.log("title %s" % item['title'])
        articleFile = str('D:/some path/' + pureFileTitle[:-2] + '.txt')
        with open(articleFile, 'wb') as newArticle:
            newArticle.write(article.encode('utf-8'))
        item['url'] = response.url
        item['project'] = self.settings.get('BOT_NAME')
        item['spider'] = self.name
        item['date'] = datetime.datetime.now()
        return item

    def parse(self, response):
        # Get the year and issue URLs and yield Requests
        year_selector = response.xpath('//*[contains(@class,"IssueYear")]//@href')
        for url in year_selector.extract():
            if not year_selector.select('//*[contains(@class,"society-logo-block")]'):
                yield Request(urljoin(response.url, url), dont_filter=True)
            else:
                yield Request(urljoin(response.url, url))
        issue_selector = response.xpath('//*[contains(@id,"item_Resource")]//@href')
        for url in issue_selector.extract():
            if not issue_selector.select('//*[contains(@class,"society-logo-block")]'):
                yield Request(urljoin(response.url, url), dont_filter=True)
            else:
                yield Request(urljoin(response.url, url))
        # Get the article URLs and yield Requests
        article_selector = response.xpath('//*[contains(@class,"viewArticleLink")]//@href')
        for url in article_selector.extract():
            if not article_selector.select('//*[contains(@class,"society-logo-block")]'):
                yield Request(urljoin(response.url, url), dont_filter=True)
            else:
                yield Request(urljoin(response.url, url), callback=self.parse_item)
The proxy settings are as follows:
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408, 302]
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
PROXY_LIST = 'C:/some path/proxies.csv'
PROXY_MODE = 0
However, when I try to run the code, it fetches all the URLs but does not seem to yield any items. The shell just keeps showing these errors:
2018-08-29 16:53:38 [scrapy.proxies] DEBUG: Using proxy <http://103.203.133.170:8080>, 8 proxies left
2018-08-29 16:53:38 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <https://academic.oup.com/ilarjournal/article-abstract/53/1/E99/656113> from <https://academic.oup.com/ilarjournal/article/53/1/E99/656113>
2018-08-29 16:53:38 [scrapy.proxies] DEBUG: Proxy user pass not found
Another thing that may be important: I also tried running the spider without proxies, and it still returns 302 for all the articles. Any ideas about what might be wrong, or pointers to threads where this has already been solved, would be greatly appreciated.
Answer 0 (score: 1)
30x codes are normal redirects, and you should allow them to happen.
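As a minimal sketch of that (assuming the settings posted in the question stay otherwise unchanged), you could stop treating 302 as a retryable error so that Scrapy's RedirectMiddleware simply follows the redirect to the article-abstract page:

RETRY_TIMES = 10
# 302 removed from the retry codes: redirects should be followed, not retried as failures
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]
REDIRECT_ENABLED = True  # this is the default; just make sure it has not been disabled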
It also seems that your parse_item method is returning the item instead of yielding it; try replacing return item with yield item.
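A minimal sketch of that change, keeping everything else in parse_item exactly as in the question:

    def parse_item(self, response):
        item = PropertiesItem()
        # ... fill in authors, title, url, project, spider and date as in the question ...
        yield item  # yield instead of return, so the engine passes the item on to the pipelines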