编辑:
在 pandas 1.0.5 中会出现下面的错误,升级到 1.1.1 后错误消失。
我有一个看起来像这样的 pandas 数据框(DataFrame):
Name Date Price Label Y Z
foo1 1/1/20 100 1 _ _
foo1 1/1/20 200 2 _ _
. . . . . .
. . . . . .
foo1 1/8/20 240 1 _ _
foo2 1/2/20 500 1 _ _
. . . . . .
. . . . . .
foo2 1/7/20 423 4 _ _
. . . . . .
. . . . . .
Name 列有80个唯一值,即foo1-foo80。
Date 列有20个唯一值,Label 列有4个唯一值。
我想创建一个表,它有80行(每个 Name 一行)和 20 * 4 + 1 列(每个 Date-Label 组合一列,共 20x4 列,外加 1 列 Name)。
最终数据框应如下所示:
**Name 1/1/20(Label1) 1/1/20(Label2) 1/1/20(Label3) 1/1/20(Label4) 1/2/20(Label1) ... 4/7/20(Label4)**
foo1 100 200 300 -1 -1 -1
foo2 -1 -1 -1 -1 500 -1
...............
...............
-1 表示原始数据中不存在对应的 Name-Date-Label 组合。
我基本上是 pandas 新手,我当然可以手动迭代构建数据框(if..else 解决方案),但我相信有一个更快、更易懂的解决方案。
> df.columns
Index(['A', 'B', 'Date', 'C',
'D', 'Price', 'Label', 'E',
'Name', 'F', 'G', 'H', 'I',
'J'],
dtype='object')
> df.head(10).to_dict('list')
{'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
'B': ['8/27/2015 0:00',
'10/15/2015 0:00',
'10/15/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'9/29/2015 0:00',
'9/9/2015 0:00',
'11/9/2015 0:00'],
'Date': ['8/28/2015 0:00',
'10/16/2015 0:00',
'10/16/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'9/30/2015 0:00',
'9/10/2015 0:00',
'11/10/2015 0:00'],
'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1',
'foo2',
'foo2',
'foo3',
'foo3',
'foo3',
'foo3',
'foo4',
'foo4',
'foo3'],
'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'H': ['Friday',
'Friday',
'Friday',
'Thursday',
'Thursday',
'Thursday',
'Thursday',
'Wednesday',
'Thursday',
'Tuesday'],
'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
'J': [16.758457907159716,
7.671232876712329,
11.506849315068493,
17.869415807560138,
13.745704467353955,
12.096219931271474,
10.996563573883162,
9.397303727200637,
7.220216606498194,
5.6657223796034]}
使用以下代码:
import pandas_datareader.data as web
import pandas as pd
import numpy as np
from datetime import datetime
# Minimal reproduction: only the four columns involved in the pivot are kept;
# the remaining columns of the original frame are left commented out.
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
# Reshape to one row per Name and one column per (Date, Label) pair.
# NOTE: with a list passed to `columns`, this call raises
# KeyError: 'Level Date not found' on pandas 1.0.5; it runs without error
# after upgrading to pandas 1.1.1.
df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
我得到了以下错误:
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1294 try:
-> 1295 level = self.names.index(level)
1296 except ValueError:
ValueError: 'Date' is not in list
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-542e5c02777d> in <module>
----> 1 df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5921 from pandas.core.reshape.pivot import pivot
5922
-> 5923 return pivot(self, index=index, columns=columns, values=values)
5924
5925 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
448 else:
449 indexed = data._constructor_sliced(data[values].values, index=index)
--> 450 return indexed.unstack(columns)
451
452
~\anaconda3\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3548 from pandas.core.reshape.reshape import unstack
3549
-> 3550 return unstack(self, level, fill_value)
3551
3552 # ----------------------------------------------------------------------
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
396 # _unstack_multiple only handles MultiIndexes,
397 # and isn't needed for a single level
--> 398 return _unstack_multiple(obj, level, fill_value=fill_value)
399 else:
400 level = level[0]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_multiple(data, clocs, fill_value)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in <listcomp>(.0)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1296 except ValueError:
1297 if not is_integer(level):
-> 1298 raise KeyError(f"Level {level} not found")
1299 elif level < 0:
1300 level += self.nlevels
KeyError: 'Level Date not found'
谢谢!
答案 0 :(得分:2)
您要找的是 df.pivot:
df = df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
警告:如果任何 Name-Date-Label 组合重复(即出现在多行中),则会引发错误。这种情况下请使用 pivot_table,或者更好的 groupby + unstack。
如果 Name、Date 和 Label 已经在索引中,请使用 unstack 代替 pivot。
更新:使用示例数据
# Build the sample frame (only the four columns the reshape needs), parse the
# date strings, pivot to one row per Name with a (Date, Label) column
# MultiIndex, and mark missing combinations with -1.
df = (
    pd.DataFrame(
        {
            "Date": [
                "8/28/2015 0:00",
                "10/16/2015 0:00",
                "10/16/2015 0:00",
                "10/29/2015 0:00",
                "10/29/2015 0:00",
                "10/29/2015 0:00",
                "10/29/2015 0:00",
                "9/30/2015 0:00",
                "9/10/2015 0:00",
                "11/10/2015 0:00",
            ],
            "Price": [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
            "Label": [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
            "Name": [
                "foo1",
                "foo2",
                "foo2",
                "foo3",
                "foo3",
                "foo3",
                "foo3",
                "foo4",
                "foo4",
                "foo3",
            ],
        }
    )
    # Convert the Date strings to datetime values before they become column labels.
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .pivot(index="Name", columns=["Date", "Label"], values="Price")
    # Name/Date/Label combinations absent from the input become -1.
    .fillna(-1)
)
print(df)
输出
Date 2015-08-28 2015-10-16 2015-10-29 ... 2015-09-30 2015-09-10 2015-11-10
Label 3 3 2 1 ... 2 2 1 4
Name ...
foo1 1058.0 NaN NaN NaN ... NaN NaN NaN NaN
foo2 NaN 1685.0 1615.0 NaN ... NaN NaN NaN NaN
foo3 NaN NaN NaN 1195.0 ... 1295.0 NaN NaN 1665.0
foo4 NaN NaN NaN NaN ... NaN 2285.0 1285.0 NaN
[4 rows x 10 columns]