我不知道为什么,我的代码现在返回一个空的 DataFrame。它之前正常工作过一段时间,但现在不行了,请帮忙。
我的代码的目的是用 pandas 读取 JSON 文件,并且只提取我需要的数据。
import json
import os
import time
from builtins import print

import numpy as np
import pandas as pd

# Fields copied from every JSON record into the DataFrame, in column order.
TWEET_COLUMNS = ['created_at', 'text', 'lang']


def find_json_files(directory):
    """Return the names of the ``.json`` files located directly in *directory*.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A list of file names (not full paths), in ``os.listdir`` order.

    Raises:
        OSError: If *directory* does not exist or cannot be listed.
    """
    return [name for name in os.listdir(directory) if name.endswith('.json')]


def load_tweets(directory, file_names):
    """Read line-delimited JSON files and collect the fields in TWEET_COLUMNS.

    Each non-blank line of each file is parsed as one JSON object.  A record
    is kept only when all of ``created_at``, ``text`` and ``lang`` are present
    and truthy; every other record is skipped.

    Args:
        directory: Folder containing the files.
        file_names: Names of the files inside *directory* to read.

    Returns:
        A DataFrame with the columns in TWEET_COLUMNS and a 0..n-1 index.
        It is empty (but still has the columns) when no record qualifies.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    for file_name in file_names:
        file_path = os.path.join(directory, file_name)
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, encoding='utf-8') as json_file:
            for line in json_file:
                line = line.strip()
                if not line:
                    # Blank separator lines would make json.loads raise.
                    continue
                json_object = json.loads(line)
                values = [json_object.get(column) for column in TWEET_COLUMNS]
                if all(values):
                    rows.append(values)
    # Build the frame once: appending row by row copies the whole frame each
    # time, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=TWEET_COLUMNS)


def keep_english_sorted(tweets):
    """Keep the rows whose ``lang`` is ``'en'``, drop ``lang``, sort by date.

    Args:
        tweets: DataFrame with at least the ``lang`` and ``created_at`` columns.

    Returns:
        A new DataFrame without the ``lang`` column, ordered by ``created_at``.
    """
    # We only keep the tweets in English
    english = tweets.loc[tweets['lang'] == 'en']
    # We don't need the language of the tweets anymore so we drop it
    english = english.drop(columns='lang')
    # NOTE(review): created_at is sorted exactly as stored in the JSON; if it
    # holds text timestamps the order is lexicographic, not chronological.
    return english.sort_values(by='created_at')


if __name__ == '__main__':
    start_time = time.perf_counter()
    # Set output size
    pd.set_option('display.width', 320)
    # this finds our json files
    path_to_json = 'C:/Users/Alex/Desktop/1'
    json_files = find_json_files(path_to_json)
    print(json_files)
    jsons_data = load_tweets(path_to_json, json_files)
    # Now that we have the pertinent json data in our DataFrame let's look at it
    print(jsons_data.head())
    print(jsons_data[0:10])
    jsons_data = keep_english_sorted(jsons_data)
    print(jsons_data)
    jsons_data.to_json("C:/Users/Alex/Desktop/TEST2.json")
    print(time.perf_counter() - start_time, "seconds")