Question

我是 Python 新手，一直在网上搜索这个问题的解决方案，但始终没有找到。我有一个由 pandas 数据框组成的字典，其中键是年份（“Year”），值是该年的各个 pandas 数据框。这是示例数据：

def target_non_target(input_frames_folder,model_file,output):
    """Copy every frame classified as "Target" into a mirrored output tree.

    Each path matched by the glob pattern ``input_frames_folder`` is treated
    as a folder of ``.jpg`` frames. Every frame is resized to 28x28, scaled
    to [0, 1] and passed to the model; when the second score returned by the
    model is greater than the first, the original (unresized) frame is
    written to ``<output>/<folder name>/<frame name>.jpg``.

    Args:
        input_frames_folder: Glob pattern whose matches are the frame folders.
        model_file: Path handed to ``load_model``.
        output: Root directory for the copied frames; created if missing.

    Returns:
        None. The result is the set of image files written under ``output``.
    """
    os.makedirs(output, exist_ok=True)

    # Load the network once: it does not change between frames, and reloading
    # it for every image dominated the running time.
    print("[INFO] loading network...")
    model = load_model(model_file)

    for folder in glob(input_frames_folder):
        folder_name = os.path.basename(folder)
        # os.path.join avoids doubled or mixed separators when ``output``
        # already ends with a path separator.
        out_path = os.path.join(output, folder_name)
        print(out_path)
        os.makedirs(out_path, exist_ok=True)

        for img_path in glob(os.path.join(folder, "*.jpg")):
            img_name = os.path.splitext(os.path.basename(img_path))[0]

            orig = cv2.imread(img_path)
            if orig is None:
                # cv2.imread signals an unreadable/corrupt file by returning
                # None instead of raising; skip it rather than crash later.
                print("[WARN] could not read image: {}".format(img_path))
                continue

            # Pre-process a resized copy; ``orig`` stays untouched because
            # cv2.resize returns a new array.
            # NOTE(review): 28x28 is presumably the model's training input
            # size - confirm against the model definition.
            image = cv2.resize(orig, (28, 28))
            image = image.astype("float") / 255.0
            image = img_to_array(image)
            image = np.expand_dims(image, axis=0)

            # NOTE(review): assumes the model emits exactly two scores per
            # sample, ordered (non_target, target) - confirm.
            (non_target, target) = model.predict(image)[0]

            if target > non_target:
                cv2.imwrite(os.path.join(out_path, "{}.jpg".format(img_name)), orig)


# Glob pattern; every match is processed as a folder of .jpg frames.
frames_folder = "C:\\Python36\\videos\\videos_new\\*"
# Model file passed through to target_non_target.
model = "C:\\Python35\\target_non_target\\target_non_target.model"
# Root directory that receives the frames classified as targets.
output_folder = "C:\\Python35\\target_non_target\\Target_images_new\\"

# The call is made for its side effect (files written under output_folder);
# target_non_target has no return statement, so target_check is None.
target_check = target_non_target(
    frames_folder,
    model,
    output_folder,
)

我选择从所有数据框的列表开始，因为这是我真正的问题中数据的导入方式。有了数据框列表后，我就创建了这些数据框的字典。

import pandas as pd
import numpy as np
from collections import defaultdict

## Sample data: every row is [ID, Year, score]
data1_2018 = [[1, 2018, 80], [2, 2018, 70]]
data2_2018 = [[1, 2018, 77], [3, 2018, 62]]
data3_2018 = [[1, 2018, 82], [2, 2018, 88], [4, 2018, 66]]

data1_2017 = [[1, 2017, 80], [5, 2017, 70]]
data2_2017 = [[1, 2017, 77], [3, 2017, 62]]
data3_2017 = [[1, 2017, 50], [2, 2017, 52], [4, 2017, 51]]


def _score_frame(rows, score_column):
    """Return a DataFrame of rows with columns ID, Year and score_column."""
    return pd.DataFrame(rows, columns=['ID', 'Year', score_column])


## One frame per (score column, year) pair
df1_2018 = _score_frame(data1_2018, 'Score_1')
df2_2018 = _score_frame(data2_2018, 'Score_2')
df3_2018 = _score_frame(data3_2018, 'Score_3')

df1_2017 = _score_frame(data1_2017, 'Score_1')
df2_2017 = _score_frame(data2_2017, 'Score_2')
df3_2017 = _score_frame(data3_2017, 'Score_3')

### Flat list of every frame, 2018 frames first, then 2017
all_df_list = [
    df1_2018, df2_2018, df3_2018,
    df1_2017, df2_2017, df3_2017,
]

现在，我的问题是：能否遍历每个组（即每一年）中的数据框，并按“ID”对它们进行外连接（outer merge）合并？所需的输出是一个列表或字典，其中每年只对应一个数据框。这是每年所需的结果：

# Maps each year to the list of per-year sub-frames taken from all_df_list,
# in the order the source frames appear in that list.
yearly_dfs = defaultdict(list)

# Flatten every frame's groupby('Year') result into one stream of
# (year, sub-frame) pairs, then bucket the pairs by year.
year_groups = (
    pair
    for frame in all_df_list
    for pair in frame.groupby('Year')
)
for year, year_frame in year_groups:
    yearly_dfs[year].append(year_frame)

任何帮助将不胜感激！

谢谢！

Answer 1

先用 pandas.concat 把所有数据框拼接起来，再用 DataFrame.groupby 按 'ID' 和 'Year' 分组，并使用聚合函数 first 取每列的第一个非空值；然后再按 'Year' 进行 groupby，用字典推导式生成以年份为键的字典：

# Stack all frames vertically; columns a frame lacks are filled with NaN.
stacked = pd.concat(all_df_list, sort=False)

# Collapse to one row per (ID, Year): first() keeps, for each column, the
# first non-null value found within the group.
by_id_and_year = stacked.groupby(['ID', 'Year'])
df_all = by_id_and_year.first().reset_index()

# One merged frame per year, keyed by the year value.
df_years = {
    year: year_frame
    for year, year_frame in df_all.groupby('Year')
}

访问方式：

df_years[2017]

   ID  Year  Score_1  Score_2  Score_3
0   1  2017     80.0     77.0     50.0
2   2  2017      NaN      NaN     52.0
4   3  2017      NaN     62.0      NaN
6   4  2017      NaN      NaN     51.0
8   5  2017     70.0      NaN      NaN

df_years[2018]

   ID  Year  Score_1  Score_2  Score_3
1   1  2018     80.0     77.0     82.0
3   2  2018     70.0      NaN     88.0
5   3  2018      NaN     62.0      NaN
7   4  2018      NaN      NaN     66.0

对字典各键下的数据框进行外连接（outer merge）合并

1 个答案: