I'm just getting started, and I want to scrape some user data from Weibo, a Twitter-like site from China. When I run "scrapy crawl weibo_spider -o data.csv", I keep getting this output:
2019-11-10 17:49:10 [scrapy.core.engine] DEBUG: Crawled (302) <GET https://weibo.cn/2803301701/info> (referer: None)
2019-11-10 17:49:14 [scrapy.core.engine] DEBUG: Crawled (302) <GET https://weibo.cn/u/2803301701> (referer: https://weibo.cn/2803301701/info)
2019-11-10 17:49:14 [scrapy.core.scraper] DEBUG: Scraped from <302 https://weibo.cn/u/2803301701>
{'_id': '2803301701', 'crawl_time': 1573379351}
2019-11-10 17:49:18 [scrapy.core.engine] DEBUG: Crawled (302) <GET https://weibo.cn/2803301701/profile?page=1> (referer: https://weibo.cn/u/2803301701)
2019-11-10 17:49:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://weibo.cn/2803301701/profile?page=1> (referer: https://weibo.cn/u/2803301701)
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 30, in process_spider_output
for x in result:
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/lveshengyichou/Documents/WeiboSpider-simple/sina/spiders/weibospikder.py", line 121, in parse_tweet
tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
AttributeError: 'NoneType' object has no attribute 'xpath'
The error is on the last line of the traceback. However, I think the real reason I end up with a 'NoneType' object is those "DEBUG: Crawled (302) ..." responses, and I don't know how to deal with them. Here is some of my code:
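If I understand the traceback correctly, the 302 response that gets handed to my callback has an (essentially) empty body, and lxml gives back None for an empty document, which is where the 'NoneType' comes from. A tiny check outside Scrapy that I believe shows the same behaviour:

from lxml import etree

# As far as I know, etree.HTML returns None (rather than raising) when the
# document is empty, so any later .xpath() call fails exactly like in the traceback.
tree_node = etree.HTML(b"")
print(tree_node)  # None -> tree_node.xpath(...) would raise the same AttributeError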
import re
import time

from lxml import etree
from scrapy import Spider, Request
from scrapy.selector import Selector

from sina.items import InformationItem  # project item class (module path assumed from the repo layout)


class WeiboSpider(Spider):
    name = "weibo_spider"
    base_url = "https://weibo.cn"
    handle_httpstatus_list = [302]

    def start_requests(self):
        start_uids = [
            '2803301701',
        ]
        for uid in start_uids:
            yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information)

    def parse_information(self, response):
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall('(\d+)/info', response.url)[0]
        # the profile page is mostly plain text, so each field is pulled out with a regex
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        briefIntroduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if briefIntroduction and briefIntroduction[0]:
            information_item["brief_introduction"] = briefIntroduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0]:
            # "同性恋" = homosexual, "异性恋" = heterosexual
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      meta=request_meta, dont_filter=True, priority=1)

    def parse_further_information(self, response):
        text = response.text
        information_item = response.meta['item']
        tweets_num = re.findall('微博\[(\d+)\]', text)
        if tweets_num:
            information_item['tweets_num'] = int(tweets_num[0])
        follows_num = re.findall('关注\[(\d+)\]', text)
        if follows_num:
            information_item['follows_num'] = int(follows_num[0])
        fans_num = re.findall('粉丝\[(\d+)\]', text)
        if fans_num:
            information_item['fans_num'] = int(fans_num[0])
        yield information_item
        # tweets
        yield Request(url=self.base_url + '/{}/profile?page=1'.format(information_item['_id']),
                      callback=self.parse_tweet,
                      priority=1)
        # followees
        yield Request(url=self.base_url + '/{}/follow?page=1'.format(information_item['_id']),
                      callback=self.parse_follow,
                      dont_filter=True)
        # followers
        yield Request(url=self.base_url + '/{}/fans?page=1'.format(information_item['_id']),
                      callback=self.parse_fans,
                      dont_filter=True)

    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # the first page shows the total page count; queue requests for the remaining pages
            all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        # parse the tweets on the current page
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        ...
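For completeness, I assume I could paper over the crash with a None check like the one below, but that would presumably just skip the page instead of fixing whatever causes the 302:

        tree_node = etree.HTML(response.body)
        if tree_node is None:
            # empty / unparseable body (e.g. the bare 302 response) -- skip instead of crashing
            self.logger.warning('No parseable HTML in %s, skipping', response.url)
            return
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')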
The crash itself happens on the tweet_nodes = tree_node.xpath(...) line in parse_tweet. I found this code on GitHub; the only thing I added myself is the fourth line of the class, "handle_httpstatus_list = [302]".
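As far as I can tell, listing 302 in handle_httpstatus_list means Scrapy's RedirectMiddleware no longer follows the redirect, so my callbacks receive the raw 302 response. A small debugging sketch of how I could at least log where weibo.cn is trying to send me (just for inspection, not a fix):

    def parse_tweet(self, response):
        if response.status == 302:
            # RedirectMiddleware leaves 302s alone because of handle_httpstatus_list,
            # so the Location header shows the redirect target
            # (presumably a login / verification page).
            self.logger.info('302 from %s -> %s',
                             response.url, response.headers.get('Location'))
            return
        ...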
Any help would be greatly appreciated! Thanks!