我有一个包含json格式推文的txt文件(示例见下文),每条推文的正文位于 "text": 和 "is_quote_status": 之间,日期位于每条记录的末尾。如何将推文正文和日期提取到csv文件中?
我终于用两段代码解决了这个问题。感谢@mark,虽然花了一些时间才弄明白,但我做到了。
清理json文件:
import re

# Matches a MongoDB shell wrapper such as ObjectId("..."), NumberLong(123) or
# ISODate("...") and captures the wrapped literal so it can be kept on its own.
MONGO_WRAPPER = re.compile(
    r'(?:ObjectId|NumberLong|ISODate)\(\s*("[^"]*"|-?\d+)\s*\)'
)


def strip_mongo_wrappers(line):
    """Return *line* with MongoDB shell type wrappers replaced by their literal.

    Only the wrapper name and its own pair of parentheses are removed, so a
    ")" that appears elsewhere in the line (for example inside a tweet's
    text) is left untouched.
    """
    return MONGO_WRAPPER.sub(r"\1", line)


if __name__ == "__main__":
    # Context managers close both files even if reading or writing fails.
    # NOTE(review): assumes the exported dump is UTF-8 encoded - confirm.
    with open("sim.txt", encoding="utf-8") as fin, \
            open("output.txt", "w", encoding="utf-8") as fout:
        for line in fin:
            fout.write(strip_mongo_wrappers(line))
提取推文和日期,并将其存储为.csv文件:
import json
import csv


def extract_rows(tweets):
    """Return a ``[text, created_at_date]`` pair for each tweet dict in *tweets*.

    Raises KeyError if a tweet lacks either of the two keys.
    """
    return [[tweet["text"], tweet["created_at_date"]] for tweet in tweets]


if __name__ == "__main__":
    # json.load requires the file to hold one valid JSON document; iterating
    # it below expects that document to be an array of tweet objects.
    with open("output1.txt", "r", encoding="utf-8") as infile:
        tweets = json.load(infile)

    # newline="" lets the csv module control line endings itself, which
    # avoids blank rows between records on Windows. Text is written as str;
    # encoding to UTF-8 happens once, at the file boundary.
    with open("output4.csv", "w", encoding="utf-8", newline="") as outfile:
        csv.writer(outfile).writerows(extract_rows(tweets))

    print("DONE")
我的示例json字符串如下:
{
"_id": "582f4fbd44b65941a0a81213",
"contributors": null,
"truncated": false,
"text": "Tonight at 10 PM ET, 7 PM PT, on @FoxNews, a one hour special on me and my life by @HarveyLevinTMZ. Enjoy!",
"is_quote_status": false,
"in_reply_to_status_id": null,
"id": "799660246788612100",
"favorite_count": 15765,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"retweeted": false,
"coordinates": null,
"entities": {
"symbols": [],
"user_mentions": [{
"id": 1367531,
"indices": [33, 41],
"id_str": "1367531",
"screen_name": "FoxNews",
"name": "Fox News"
}, {
"id": 36098990,
"indices": [83, 98],
"id_str": "36098990",
"screen_name": "HarveyLevinTMZ",
"name": "Harvey Levin"
}],
"hashtags": [],
"urls": []
},
"in_reply_to_screen_name": null,
"in_reply_to_user_id": null,
"retweet_count": 5251,
"id_str": "799660246788612100",
"favorited": false,
"user": {
"id": 25073877,
"id_str": "25073877"
},
"geo": null,
"in_reply_to_user_id_str": null,
"lang": "en",
"created_at": "Fri Nov 18 17:07:14 +0000 2016",
"in_reply_to_status_id_str": null,
"place": null,
"created_at_date": "2016-11-18T17:07:14Z"
}
答案 0 :(得分:0)
请注意json路径,并且文本文件中必须包含有效的json。
<strong>/path/to/json/file.json</strong>
[{
"_id": "dummyid1",
"contributors": null,
"truncated": false,
"text": "Dummy tweet 1",
"is_quote_status": false,
"in_reply_to_status_id": null,
"id": "799660246788612100",
"favorite_count": 15765,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"retweeted": false,
"coordinates": null,
"entities": {
"symbols": [],
"user_mentions": [{
"id": 1367531,
"indices": [33, 41],
"id_str": "1367531",
"screen_name": "FoxNews",
"name": "Fox News"
}, {
"id": 36098990,
"indices": [83, 98],
"id_str": "36098990",
"screen_name": "HarveyLevinTMZ",
"name": "Harvey Levin"
}],
"hashtags": [],
"urls": []
},
"in_reply_to_screen_name": null,
"in_reply_to_user_id": null,
"retweet_count": 5251,
"id_str": "799660246788612100",
"favorited": false,
"user": {
"id": 25073877,
"id_str": "25073877"
},
"geo": null,
"in_reply_to_user_id_str": null,
"lang": "en",
"created_at": "Fri Nov 18 17:07:14 +0000 2016",
"in_reply_to_status_id_str": null,
"place": null,
"created_at_date": "2016-11-18T17:07:14Z"
},
{
"_id": "dummyid2",
"contributors": null,
"truncated": false,
"text": "Dummy tweet 2",
"is_quote_status": false,
"in_reply_to_status_id": null,
"id": "799660246788612100",
"favorite_count": 15765,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"retweeted": false,
"coordinates": null,
"entities": {
"symbols": [],
"user_mentions": [{
"id": 1367531,
"indices": [33, 41],
"id_str": "1367531",
"screen_name": "FoxNews",
"name": "Fox News"
}, {
"id": 36098990,
"indices": [83, 98],
"id_str": "36098990",
"screen_name": "HarveyLevinTMZ",
"name": "Harvey Levin"
}],
"hashtags": [],
"urls": []
},
"in_reply_to_screen_name": null,
"in_reply_to_user_id": null,
"retweet_count": 5251,
"id_str": "799660246788612100",
"favorited": false,
"user": {
"id": 25073877,
"id_str": "25073877"
},
"geo": null,
"in_reply_to_user_id_str": null,
"lang": "en",
"created_at": "Fri Nov 18 17:07:14 +0000 2016",
"in_reply_to_status_id_str": null,
"place": null,
"created_at_date": "2016-11-18T17:07:14Z"
}
]
<strong>script.py</strong>
import json

# Parse the file straight from the handle; the `with` block closes it
# as soon as parsing is done.
with open('/path/to/json/file.json', 'r') as f:
    datas_from_json = json.load(f)  # the JSON array is now an iterable list

# Print the text of every tweet object in the array.
for data in datas_from_json:
    print(data['text'])
# outputs
# Dummy tweet 1
# Dummy tweet 2
答案 1 :(得分:0)
使用Pandas可以简化此过程。
假设/path/to/input.json
或/path/to/input.txt
是一个包含有效json的文件;只要文件内容是有效的json,文件扩展名就无关紧要。
import pandas as pd

# Load the JSON file into a DataFrame.
tweets = pd.read_json("path/to/input.txt")

# Keep only the tweet text and its date, then write them out without the
# DataFrame index column.
wanted_columns = ["text", "created_at_date"]
selected = tweets[wanted_columns]
selected.to_csv("output.csv", index=False)