将 pandas 数据框(DataFrame)转换为其他格式

时间:2020-03-17 21:59:04

标签: python pandas

我有一个这样的数据框

class FirebaseData : ObservableObject {
    @Published var posts = [Post]()

    let postsCollection = Firestore.firestore().collection("Posts")

    /// Creates the object and immediately calls `fetchPosts()` to start loading posts.
    init() {
        fetchPosts()
    }

    //MARK: Fetch Data
    /// Attaches a snapshot listener to `postsCollection` and mirrors its changes into `posts`.
    ///
    /// For every document change delivered by the listener:
    /// - `.added`: a `Post` built from the document is appended to `posts`;
    /// - `.modified`: every entry whose `id` equals the document ID is rebuilt from the document;
    /// - `.removed`: every entry whose `id` equals the document ID is dropped.
    ///
    /// Errors are printed and the snapshot is ignored.
    ///
    /// NOTE(review): the value returned by `addSnapshotListener` is discarded here, so this
    /// object has no way to detach the listener later — confirm that is intended.
    private func fetchPosts() {
        postsCollection.addSnapshotListener { snapshot, error in
            if let error = error {
                print("Error fetching posts: \(error.localizedDescription)")
                return
            }
            // No error but also no snapshot: nothing to apply. The original force-unwrapped
            // the snapshot here and would have crashed instead.
            guard let snapshot = snapshot else { return }

            snapshot.documentChanges.forEach { diff in
                let documentID = diff.document.documentID

                if diff.type == .added {
                    let post = self.createPostFromDocument(document: diff.document)
                    self.posts.append(post)
                } else if diff.type == .modified {
                    self.posts = self.posts.map { (post) -> Post in
                        post.id == documentID
                            ? self.createPostFromDocument(document: diff.document)
                            : post
                    }
                } else if diff.type == .removed {
                    // The original removed elements while looping over a pre-captured
                    // `indices` range; after the first removal the remaining indices are
                    // shifted/out of bounds, which can trap. `removeAll(where:)` removes
                    // every match safely in one pass.
                    self.posts.removeAll { $0.id == documentID }
                }
            }
        }
    }

分组求和后的数据框如下所示(可以用下文的代码生成):

             col1  col2  col3  col4
id Category                        
a  blue         4     1     3     0
   red          1     0     0     4
b  red          0     1     8     5

该数据框可以使用以下代码制作:

# Sample data: one row per (id, Category) pair with four numeric value columns.
rows = {
    'id': ['a', 'a', 'b'],
    'Category': ['red', 'blue', 'red'],
    'col1': [1, 4, 0],
    'col2': [0, 1, 1],
    'col3': [0, 3, 8],
    'col4': [4, 0, 5],
}
df = pd.DataFrame(rows)

# Sum every value column within each (id, Category) group; the result is
# indexed by a two-level (id, Category) MultiIndex.
value_columns = ['col1', 'col2', 'col3', 'col4']
sum_df = df.groupby(['id', 'Category']).agg({name: 'sum' for name in value_columns})

我希望每个 id 在一行中都是唯一的,并将值汇总到相应的列中:

  id  red_col1  red_col2  red_col3  red_col4  blue_col1  blue_col2  blue_col3  blue_col4
0  a         1         0         0         4        4.0        1.0        3.0        0.0
1  b         0         1         8         5        NaN        NaN        NaN        NaN

数据集中有1000个ID,类别也有1000个。如果某列没有与之关联的值,则该列应为空白,即 id 为 b 的 blue_* 列(如上表所示为 NaN)。

3 个答案:

答案 0 :(得分:2)

使用DataFrame.unstack,然后重命名列:

# NOTE: `df` here is presumably the grouped frame indexed by (id, Category)
# (called `sum_df` in the question) — the raw frame has no 'Category' index
# level to unstack.
# Move the 'Category' index level into the columns, giving (col, Category) column tuples.
new_df = df.unstack('Category')
# Flatten each (col, Category) tuple into a single "<Category>_<col>" label.
new_df.columns = [f'{color}_{col}' for col, color in new_df.columns]
# Sort the flattened columns by name and turn `id` back into a regular column.
new_df=new_df.sort_index(axis=1).reset_index()
print(new_df)

  id  blue_col1  blue_col2  blue_col3  blue_col4  red_col1  red_col2  \
0  a        4.0        1.0        3.0        0.0       1.0       0.0   
1  b        NaN        NaN        NaN        NaN       0.0       1.0   

   red_col3  red_col4  
0       0.0       4.0  
1       8.0       5.0  

答案 1 :(得分:1)

IIUC

# Assumes `df` is indexed by (id, Category) — TODO confirm. unstack() moves the
# innermost index level into the columns; the columns are then sorted by that
# level (level=1 of the column MultiIndex).
s=df.unstack().sort_index(level=1,axis=1)
# Each column is a (col, Category) tuple; format it as "<Category>_<col>".
s.columns=s.columns.map('{0[1]}_{0[0]}'.format) 
s
Out[136]: 
    blue_col1  blue_col2  blue_col3  ...  red_col2  red_col3  red_col4
id                                   ...                              
a         4.0        1.0        3.0  ...       0.0       0.0       4.0
b         NaN        NaN        NaN  ...       1.0       8.0       5.0
[2 rows x 8 columns]

答案 2 :(得分:0)

使用 unstack(取消堆叠):

# Pivot the innermost index level into the columns.
# Assumes `df` is the frame grouped by id and Category — TODO confirm.
df = df.unstack()

然后您可以添加前缀

# After unstack() the columns are (column, category) tuples. Flatten them into
# "<category>_<column>" labels. The original `df.add_prefix(category'_')` was
# not valid Python, and add_prefix does not merge MultiIndex levels.
df.columns = [f'{category}_{column}' for column, category in df.columns]

这样就可以解决这个问题。