Question

我不知道为什么，我的代码现在返回一个空的 DataFrame。它之前有一段时间可以正常运行，但现在不行了，请帮忙。

我的代码用 pandas 读取 JSON 文件，并且只提取我需要的数据：

import json
import pandas as pd
import os
import numpy as np
import time
# Script: collect the 'created_at', 'text' and 'lang' fields from every
# line-delimited JSON file in a directory, keep the English entries, sort them
# by 'created_at' and write the result to a JSON file.

# Timer for the whole run. time.clock() was removed in Python 3.8;
# time.perf_counter() is the supported way to measure elapsed time.
start_time = time.perf_counter()

# Kept from the original script; on Python 3 this only rebinds the built-in.
from builtins import print

# Set output size
pd.set_option('display.width', 320)

# this finds our json files
path_to_json = 'C:/Users/Alex/Desktop/1'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)

columns = ['created_at', 'text', 'lang']

# Collect the rows in a plain list and build the DataFrame once at the end.
# DataFrame.append() no longer exists in pandas 2.0, and calling it per row
# copied the whole frame on every iteration.
rows = []
for js in json_files:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (e.g. cp1252 on Windows).
    with open(os.path.join(path_to_json, js), encoding='utf-8') as json_file:
        # Each line of the file is expected to hold one JSON object.
        for line in json_file:
            line = line.strip()
            if not line:
                # Blank lines would make json.loads() raise; skip them.
                continue
            json_object = json.loads(line)
            # Keep only records where all three fields are present and non-empty.
            if json_object.get('created_at') and json_object.get('text') and json_object.get('lang'):
                rows.append({
                    'created_at': json_object['created_at'],
                    'text': json_object['text'],
                    'lang': json_object['lang'],
                })

# Passing `columns` keeps the three expected columns even when no row matched,
# so the filtering below still works on an empty frame.
jsons_data = pd.DataFrame(rows, columns=columns)

# Now that we have the pertinent json data in our DataFrame let's look at it
print(jsons_data.head())
print(jsons_data[0:10])

# We only keep the tweet in English (.ix was removed in pandas 1.0; use .loc)
jsons_data = jsons_data.loc[jsons_data['lang'] == 'en']

# We don't need the language of the tweets anymore so we drop it
jsons_data = jsons_data.drop('lang', axis=1)

# We sort the tweet by date of creation
# NOTE(review): 'created_at' is kept as the raw string from the file, so this
# is a lexicographic sort. If the timestamps are not in a sortable text format
# (e.g. they start with a weekday name), parse them first - TODO confirm.
jsons_data = jsons_data.sort_values(by='created_at')

print(jsons_data)

jsons_data.to_json("C:/Users/Alex/Desktop/TEST2.json")

print(time.perf_counter() - start_time, "seconds")

Python 使用 pandas 时返回一个空的 DataFrame

0 个答案: