我收集运行以下python代码的波斯语推文:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import tweepy
import json
import os
consumer_key ="xxxx"
consumer_secret ="xxxx"
access_key = "xxxx"
access_secret = "xxxx"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
save_file = open("Out.json", 'a')
t1 = u""
class CustomStreamListener(tweepy.StreamListener):
def __init__(self, api):
self.api = api
super(tweepy.StreamListener, self).__init__()
# self.list_of_tweets = []
def on_data(self, tweet):
print tweet
save_file.write(str(tweet))
def on_error(self, status_code):
print >> sys.stderr, 'Encountered error with status code:', status_code
return True # Don't kill the stream
print "Stream restarted"
def on_timeout(self):
print >> sys.stderr, 'Timeout...'
return True # Don't kill the stream
print "Stream restarted"
def start_stream():
while True:
try:
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
sapi.filter(track=[t1])
except:
continue
start_stream()
它以原始ASCII文本返回推文文本,其中UTF-8编码字符由反斜杠转义表示。我想以一种直接以UTF-8编码格式将检索到的推文保存在“Out.json”中的方式更改代码。
{
"created_at": "Tue Feb 07 08:04:17 +0000 2017",
"id": 828877025049972737,
"id_str": "828877025049972737",
"text": "\u0644\u0637\u0641\u0627 \u0628\u0647 \u062d\u06cc\u0648\u0627\u0646\u0627\u062a \u063a\u06cc\u0631\u062e\u0627\u0646\u06af\u06cc \u063a\u0630\u0627\u00a0\u0646\u062f\u0647\u06cc\u062f https:\/\/t.co\/gFi5XCVQww https:\/\/t.co\/pQWPqbvJVF",
"display_text_range": [0, 58],
"source": "\u003ca href=\"http:\/\/publicize.wp.com\/\" rel=\"nofollow\"\u003eWordPress.com\u003c\/a\u003e",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
...
"lang": "fa",
"timestamp_ms": "1486454657219"
}
答案 0 :(得分:0)
删除self.value()
来电:
str
如果没有帮助,请使用codecs模块打开文件:
# ...
def on_data(self, tweet):
print tweet
save_file.write(tweet)
# ...
答案 1 :(得分:0)
StreamListener.on_data()
方法传递从Twitter收到的原始JSON数据。这个数据包含有效的JSON转义序列。
如果您想直接保存UTF-8数据,那么将\uhhhh
转义序列替换为实际的Unicode代码点,您就必须重新编码推文。之后使用Saving utf-8 texts in json.dumps as UTF8, not as \u escape sequence保存数据。
请注意,将多个JSON对象写入一个文件会使该文件本身无效JSON。您可以通过注入换行符生成JSON Lines output(标准json.dumps()
输出不会在生成的JSON文档中生成换行符),然后使用this answer逐个读取这些条目。
因此,代码的重要部分应如下所示:
import json
save_file = open("Out.json", 'a')
class CustomStreamListener(tweepy.StreamListener):
# ...
def on_data(self, tweet):
tweet = json.loads(tweet)
json_doc = json.dumps(tweet, ensure_ascii=False)
save_file.write(json_doc.encode('utf8') + '\n')