def scrapeFacebookPageFeedStatus(page_id, access_token):
    # -*- coding: utf-8 -*-
    # NOTE: the declaration above only describes the encoding of this source
    # file; it has no effect on how scraped text is encoded when written out.
    """Scrape all statuses of a Facebook Page into a CSV file.

    Writes ``<page_id>_facebook_statuses.csv`` in the current directory: one
    header row, then one row per status as returned by
    ``processFacebookPageFeedStatus``. Progress is printed every 1000
    statuses, and a summary with the elapsed time is printed at the end.

    This code targets Python 2 (binary-mode CSV file, ``unicode`` type).

    Args:
        page_id: Identifier of the page; also used as the file-name prefix.
        access_token: Token passed through to ``getFacebookPageFeedData``.
    """
    header = ["status_id", "status_message", "link_name", "status_type",
              "status_link", "status_published", "num_likes", "num_comments",
              "num_shares"]

    num_processed = 0  # keep a count on how many we've processed
    scrape_starttime = datetime.datetime.now()
    print("Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime))

    with open('%s_facebook_statuses.csv' % page_id, 'wb') as csv_file:
        w = csv.writer(csv_file)
        w.writerow(header)

        statuses = getFacebookPageFeedData(page_id, access_token, 100)
        while True:
            for status in statuses['data']:
                # Encode text cells explicitly: handing unicode objects to
                # the Python 2 csv writer triggers an implicit ASCII encode,
                # which fails on non-ASCII text (e.g. Arabic posts).
                row = processFacebookPageFeedStatus(status)
                w.writerow(_utf8_encode_cells(row))

                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print("%s Statuses Processed: %s"
                          % (num_processed, datetime.datetime.now()))

            # A response may carry a 'paging' object without a 'next' link
            # (the last page), so look the link up defensively instead of
            # indexing it, which would raise KeyError.
            next_url = statuses.get('paging', {}).get('next')
            if not next_url:
                break  # no next page: we're done
            statuses = json.loads(request_until_succeed(next_url))

    print("\nDone!\n%s Statuses Processed in %s"
          % (num_processed, datetime.datetime.now() - scrape_starttime))


def _utf8_encode_cells(row):
    """Return ``row`` as a list with every unicode cell encoded to UTF-8 bytes.

    Non-unicode cells (byte strings, numbers, None) are passed through
    unchanged.
    """
    return [cell.encode('utf-8') if isinstance(cell, unicode) else cell
            for cell in row]
# Run the scrape; page_id and access_token are expected to be defined
# earlier in the script (not shown here).
scrapeFacebookPageFeedStatus(page_id=page_id, access_token=access_token)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 40-43: ordinal not in range(128)
我正在编写代码来抓取Facebook页面,以便把所有帖子收集到一个CSV文件中。当页面只有英语内容时,代码可以正常工作;但当我尝试抓取以阿拉伯语发布的页面时,就会出现上述错误。我知道解决方案是使用utf-8,但我不知道如何在代码中实现它。
答案 0 :(得分:0)
您的问题可能不在这段代码中,我怀疑它出在您的processFacebookPageFeedStatus函数里。不过,在格式化字段时,您需要确保任何可能包含unicode字符的字段都以utf-8解码(或根据需要进行编码)。
import codecs
field_a = "some unicode text in here"
field_a.decode('utf-8') -----> \u1234\u........
field_a.encode('utf-8') -----> Back to original unicode
您的CSV可能不支持unicode,因此您需要解码源数据中的每个字段。
调试unicode很痛苦,但是有很多关于编码/解码unicode的不同问题的帖子
答案 1 :(得分:0)
import sys
reload(sys).setdefaultencoding("utf-8")
我添加了这段代码,当我在pandas中打开这个文件时它工作正常。
目前没有再出现其他错误或问题。