我是 Python 新手,最近开始使用 Scrapy,感觉非常棒。我想知道有没有办法减少代码量,避免大量的 if 语句,同时消除代码中的重复:
# Classify a response by its <title> tag(s) and append a record to the
# matching report bucket on self.data. Excerpt from a spider callback:
# `self` and `response` come from the enclosing method.
titleText = response.css('title::text').get()
if response.status == 200:
    # Extract the <title> elements once instead of once per branch.
    titleTags = response.css('title').extract()
    if len(titleTags) == 0:
        self.data['crawl']['meta']['missingTitle'].append({
            'url': response.url,
            'latency': response.meta.get('download_latency')
        })
    elif len(titleTags) == 1:
        # .get() returns None when the <title> has no text node; measure
        # that as an empty title instead of letting len(None) raise.
        titleLength = len(titleText or '')
        if titleLength > 70:
            self.data['crawl']['meta']['longTitle'].append({
                'url': response.url,
                'text': titleText,
                'latency': response.meta.get('download_latency')
            })
        elif titleLength < 50:
            self.data['crawl']['meta']['shortTitle'].append({
                'url': response.url,
                'text': titleText,
                'latency': response.meta.get('download_latency')
            })
        # Titles of 50-70 characters are acceptable: nothing is recorded.
    else:
        # More than one <title> element on the page.
        self.data['crawl']['meta']['multipleTitleTag'].append({
            'url': response.url,
            'text': titleText,
            'latency': response.meta.get('download_latency')
        })
elif response.status in self.handle_httpstatus_list:
    # Non-200 statuses the spider chose to handle are reported as blockers.
    self.data['crawl']['blockers']['url'].append({
        'url': response.url,
        'code': response.status,
        'text': titleText,
        'latency': response.meta.get('download_latency')
    })
谢谢
答案 0 :(得分:1)
你的代码里确实有很多重复,但既然要区分这么多种不同的情况,就很难把重复全部消除。
# Build the fields every record shares once, pick the target bucket, then
# append a single time. Excerpt from a spider callback: `self` and
# `response` come from the enclosing method.
titleText = response.css('title::text').get()
dataObj = {
    'url': response.url,
    'latency': response.meta.get('download_latency')
}
title = response.css('title').extract()
if response.status == 200:
    # None means "the title is fine, record nothing". Defaulting to a real
    # bucket name would misfile pages whose title is 50-70 characters long.
    tagName = None
    if not title:
        tagName = 'missingTitle'
    else:
        dataObj['text'] = titleText
        if len(title) > 1:
            tagName = 'multipleTitleTag'
        else:
            # .get() returns None when the <title> has no text node; treat
            # that as an empty title instead of letting len(None) raise.
            titleLength = len(titleText or '')
            if titleLength > 70:
                tagName = 'longTitle'
            elif titleLength < 50:
                tagName = 'shortTitle'
    if tagName is not None:
        self.data['crawl']['meta'][tagName].append(dataObj)
elif response.status in self.handle_httpstatus_list:
    # Handled error statuses are blockers, not title problems.
    dataObj['code'] = response.status
    dataObj['text'] = titleText
    self.data['crawl']['blockers']['url'].append(dataObj)
判断标题标签的逻辑可以移到一个辅助函数中,让代码更简洁。在辅助函数里,可以利用提前返回(early return)把标签名称的选择组织得很清晰。整体上大致如下(细节请根据实际情况调整):
def get_tag_name(self, title, titleText):
    """Return the report bucket name that matches a page's <title> state.

    Args:
        title: Sequence of extracted <title> elements; only its length is
            inspected.
        titleText: Text of the title, or None when the tag has no text.

    Returns:
        'missingTitle' when there is no <title>, 'multipleTitleTag' when
        there is more than one, 'longTitle' for more than 70 characters,
        'shortTitle' for fewer than 50, and the placeholder 'DEFAULT' when
        the length is within 50-70 characters.
    """
    if len(title) == 0:
        return 'missingTitle'
    if len(title) > 1:
        return 'multipleTitleTag'
    # A <title> without text yields None; count it as an empty title so it
    # is reported as short instead of raising TypeError.
    titleLength = len(titleText or '')
    if titleLength > 70:
        return 'longTitle'
    if titleLength < 50:
        return 'shortTitle'
    # Placeholder for "title length is acceptable"; it is not a real bucket.
    return 'DEFAULT'
def the_main_one(self, response):
    """Record a response in the crawl report according to its status and title.

    A 200 response is filed under ``self.data['crawl']['meta'][<bucket>]``,
    where the bucket name comes from ``self.get_tag_name``; nothing is
    recorded when that helper returns its 'DEFAULT' placeholder. A response
    whose status is in ``self.handle_httpstatus_list`` is filed under
    ``self.data['crawl']['blockers']['url']``. Any other status is ignored.

    Args:
        response: Object exposing ``css()``, ``status``, ``url`` and ``meta``
            (presumably a Scrapy response - confirm against the caller).
    """
    titleText = response.css('title::text').get()
    dataObj = {
        'url': response.url,
        'latency': response.meta.get('download_latency')
    }
    title = response.css('title').extract()
    if response.status == 200:
        tagName = self.get_tag_name(title, titleText)
        # 'DEFAULT' is the helper's "nothing wrong with the title" value,
        # not a bucket; skip it rather than index meta with it.
        if tagName != 'DEFAULT':
            # Only attach the text when a <title> actually exists.
            if title:
                dataObj['text'] = titleText
            self.data['crawl']['meta'][tagName].append(dataObj)
    elif response.status in self.handle_httpstatus_list:
        # Handled error statuses are blockers, not title problems.
        dataObj['code'] = response.status
        dataObj['text'] = titleText
        self.data['crawl']['blockers']['url'].append(dataObj)
代码可能不会短很多,但肯定能减少一些混乱,也便于日后维护。
如果同一层级上有很多 if,可以考虑换一种做法;不过就目前来看,这样写是合适的(至少在我看来)。
答案 1 :(得分:0)
# Build one record up front and append it to whichever bucket applies.
# Excerpt from a spider callback: `self` and `response` come from the
# enclosing method.
titleText = response.css('title::text').get()
metaResponse = {
    'text': titleText,
    'url': response.url,
    'httpCode': response.status,
    'latency': response.meta.get('download_latency'),
}
if response.status == 200:
    # Extract the <title> elements once instead of once per branch.
    titleCount = len(response.css('title').extract())
    if titleCount == 0:
        self.data['crawl']['meta']['missingTitle'].append(metaResponse)
    elif titleCount == 1:
        # .get() returns None when the <title> has no text node; treat that
        # as an empty title instead of letting len(None) raise TypeError.
        titleLength = len(titleText or '')
        if titleLength > 70:
            self.data['crawl']['meta']['longTitle'].append(metaResponse)
        # NOTE(review): the question uses 50 as the lower bound; confirm
        # that 40 is intended here.
        elif titleLength < 40:
            self.data['crawl']['meta']['shortTitle'].append(metaResponse)
    else:
        self.data['crawl']['meta']['multipleTitleTag'].append(metaResponse)
elif response.status in self.handle_httpstatus_list:
    self.data['crawl']['blockers']['url'].append(metaResponse)