假设我有以下 pandas DataFrame:
# Build the example frame: three columns of ten integers each, indexed by
# the floats produced by arange(0.1, 1.1, 0.1).
from pandas import DataFrame
from numpy import arange
lst = [ range(10), range(11,21), range(21,31) ]
# Each range becomes a row first; .T turns the three rows into three columns.
df = DataFrame(lst).T.set_index(arange(0.1, 1.1, 0.1))
0 1 2
0.1 0 11 21
0.2 1 12 22
0.3 2 13 23
0.4 3 14 24
0.5 4 15 25
0.6 5 16 26
0.7 6 17 27
0.8 7 18 28
0.9 8 19 29
1.0 9 20 30
我想使用不同的索引范围选择每一列。
例如:列 0 取索引范围 0.6 到 0.9,列 1 取 0.2 到 0.3,列 2 取 0.1 到 0.3。
所以我的DataFrame应该是这样的:
0 1 2
0.1 NaN NaN 21.0
0.2 NaN 12.0 22.0
0.6 5.0 NaN NaN
0.7 6.0 NaN NaN
0.8 7.0 NaN NaN
0.9 8.0 NaN NaN
我当前的解决方案是:
idx = array([ [0.6, 0.9], [0.2, 0.3], [0.1, 0.3] ])
df2 = DataFrame((df[col][i[0]:i[1]] for i, col in zip(idx, df))).T
也许还有更好的解决方案?
感谢大家的回答。
比较:我编写了一个脚本来对每个答案进行基准测试。基准分为两部分:
STD:仅适用于标准索引(0,1,2,3,...)的答案;
GEN:适用于一般索引的答案。
from numpy import arange, array, linspace
from numpy.random import rand, randint
from pandas import DataFrame
from timeit import Timer
# yellowhat
def yellowhat(df, idx):
    """Slice every column of ``df`` with its own ``[start:stop]`` pair.

    Parameters
    ----------
    df : DataFrame
        Source frame; its columns are visited in iteration order.
    idx : iterable of pairs
        One ``(start, stop)`` pair per column, matched to the columns by
        position via ``zip``.

    Returns
    -------
    DataFrame
        The per-column slices combined into one frame; cells that were not
        selected for a column are NaN.
    """
    # Each slice is a Series named after its column. Building a frame from
    # them puts one Series per row, so the result is transposed to get the
    # columns back on the column axis.
    pieces = (df[col][pair[0]:pair[1]] for pair, col in zip(idx, df))
    return DataFrame(pieces).T
# user3483203
def user3483203(df, idx):
    """Mask ``df`` so each column keeps only rows at positions ``[start, stop)``.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : array-like of shape (n_columns, 2)
        Row *positions* ``(start, stop)`` for each column; ``start`` is
        inclusive and ``stop`` is exclusive. Nested lists are accepted.

    Returns
    -------
    DataFrame
        ``df`` with unselected cells set to NaN and all-NaN rows dropped.
    """
    from numpy import arange, asarray

    # Coerce so that plain nested lists support the column slicing below.
    bounds = asarray(idx)
    # Column vector of row positions; it broadcasts against the per-column
    # bounds to give a (n_rows, n_columns) boolean mask.
    rows = arange(df.shape[0])[:, None]
    keep = (bounds[:, 0] <= rows) & (bounds[:, 1] > rows)
    return df.mask(~keep).dropna(how='all')
def user3483203_2(df, idx):
    """Mask ``df`` per column using a boolean array filled in a Python loop.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : iterable of pairs
        Row *positions* ``(start, stop)`` for each column, with ``stop``
        exclusive (ordinary slice semantics).

    Returns
    -------
    DataFrame
        ``df`` with unselected cells set to NaN and all-NaN rows dropped.
    """
    from numpy import zeros

    def foo(a, idx):
        # ``a`` is the (n_rows, n_columns) shape of the frame.
        # Builtin ``bool`` replaces ``numpy.bool8``, which NumPy 2.0 removed.
        out = zeros(a, dtype=bool)
        # zip stops at the shorter input, so surplus pairs are ignored.
        for (i, j), k in zip(idx, range(a[1])):
            out[i:j, k] = True
        return out

    return df.mask(~foo(df.shape, idx)).dropna(how='all')
def user3483203_mod(df, idx):
    """Mask ``df`` so each column keeps rows whose *label* is in ``[lo, hi]``.

    Unlike the positional variants, this compares against the index values
    themselves and treats both bounds as inclusive.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : array-like of shape (n_columns, 2)
        ``(lo, hi)`` index-label bounds for each column. Nested lists are
        accepted.

    Returns
    -------
    DataFrame
        ``df`` with unselected cells set to NaN and all-NaN rows dropped.
    """
    from numpy import asarray

    # Coerce so that plain nested lists support the column slicing below.
    bounds = asarray(idx)
    # Column vector of index labels; broadcasts to (n_rows, n_columns).
    labels = df.index.values[:, None]
    keep = (labels >= bounds[:, 0]) & (labels <= bounds[:, 1])
    return df.mask(~keep).dropna(how='all')
#
def GeorgeLPerkins(df, idx):
    """Copy a ``[start:stop]`` slice of each column into an empty frame.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : iterable of pairs
        One ``(start, stop)`` pair per column, matched to ``df.columns`` by
        position.

    Returns
    -------
    DataFrame
        A frame with the same index and columns as ``df``; cells outside a
        column's slice are NaN. Rows that end up entirely NaN are kept.
    """
    from pandas import DataFrame

    df2 = DataFrame(columns=df.columns, index=df.index)
    # Pair the bounds with the actual column labels. Using the positional
    # counter as the label only works when the columns are named 0..n-1.
    for col, bounds in zip(df.columns, idx):
        # Assignment aligns on the index, so rows outside the slice are NaN.
        df2[col] = df[col][bounds[0]:bounds[1]]
    return df2
#
def piRSquared(df, idx):
    """Select cells by stacking ``df`` and looking up ``(row, column)`` keys.

    Parameters
    ----------
    df : DataFrame
        Source frame. The lookup keys are integers produced by ``range``
        and ``enumerate``, so this presumably expects the default 0..n-1
        row and column labels -- confirm before using it on other frames.
    idx : iterable of int pairs
        ``(start, stop)`` for each column; the pair is expanded with
        ``range(*pair)``, so ``stop`` is exclusive.

    Returns
    -------
    DataFrame
        The selected cells unstacked back into a frame; other cells are NaN.
    """
    # Sort the keys so the lookup result comes back ordered by row, then
    # by column.
    tups = sorted(
        (i, j) for j, args in enumerate(idx) for i in range(*args)
    )
    return df.stack().loc[tups].unstack()
#
def sacul(df, idx):
    """Take a positional row range from each column and concatenate them.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : sequence of int pairs
        ``idx[i]`` is unpacked into ``range`` for the i-th column, so the
        second value is exclusive.

    Returns
    -------
    DataFrame
        The per-column pieces joined side by side; cells not selected for a
        column are NaN.
    """
    from pandas import concat

    pieces = [
        df[col].iloc[range(*idx[i])]
        for i, col in enumerate(df.columns)
    ]
    # axis=1 puts each piece in its own column, aligned on the row labels.
    return concat(pieces, axis=1)
def sacul_2(df, idx):
    """Take a positional row range from each column via ``DataFrame.apply``.

    Parameters
    ----------
    df : DataFrame
        Source frame.
    idx : sequence of int pairs
        Looked up by each column's position in ``df.columns`` and unpacked
        into ``range``, so the second value is exclusive.

    Returns
    -------
    DataFrame
        The result of applying the per-column selection to ``df``.
    """
    def pick(column):
        # apply passes each column as a Series named after its label; turn
        # that label back into a position to find the matching bounds.
        position = df.columns.get_loc(column.name)
        return column.iloc[range(*idx[position])]

    return df.apply(pick)
# Benchmark Index STD
# Time every candidate on a frame with the default integer index.
nRow, nCol = 1000, 500
df = DataFrame(rand(nRow, nCol))
# Draw one random pair of row numbers per column. The lookup goes through
# the underlying ndarray because pandas Index objects reject 2-D indexers
# (an error since pandas 2.0).
idx = df.index.values[randint(nRow, size=(nCol, 2))]
# Order each pair so that start <= stop.
idx.sort(axis=1)
print('STD')
for func in [yellowhat, GeorgeLPerkins, user3483203, user3483203_2, user3483203_mod, piRSquared, sacul, sacul_2]:
    nmFunc = func.__name__
    print(nmFunc)
    # The setup string imports from __main__, so this only works when the
    # file is executed as a script.
    t = Timer("%s(df, idx)"%nmFunc, "from __main__ import df, idx, %s"%nmFunc).timeit(10)
    print(' %8.2f sec'%t)
# NOTE(review): indentation was lost in the source; the blank separator
# line is assumed to be printed once after the loop -- confirm.
print('')
# Benchmark Index GEN
# Time the label-based candidates on a frame with a float index.
idx = linspace(0, 1, nRow)
df = DataFrame(rand(nRow, nCol)).set_index(idx)
# Reuse the index labels: draw one random pair of labels per column.
idx = idx[randint(nRow, size=(nCol, 2))]
# Order each pair so that lower bound <= upper bound.
idx.sort(axis=1)
print('GEN')
for func in [yellowhat, GeorgeLPerkins, user3483203_mod]:
    nmFunc = func.__name__
    print(nmFunc)
    # The setup string imports from __main__, so this only works when the
    # file is executed as a script.
    t = Timer("%s(df, idx)"%nmFunc, "from __main__ import df, idx, %s"%nmFunc).timeit(10)
    print(' %8.2f sec'%t)
# NOTE(review): indentation was lost in the source; the blank separator
# line is assumed to be printed once after the loop -- confirm.
print('')
这些是我机器上的结果:
STD
yellowhat
4.56 sec
GeorgeLPerkins
26.10 sec
user3483203
0.56 sec
user3483203_2
0.57 sec
user3483203_mod
0.63 sec
piRSquared
31.84 sec
sacul
6.50 sec
sacul_2
7.15 sec
GEN
yellowhat
5.13 sec
GeorgeLPerkins
27.07 sec
user3483203_mod
0.52 sec
感谢大家的回答。
答案 0 :(得分:2)
我不确定这是否真的比您现有的方案更好,但是您可以遍历各列,用 * 将 index 中对应的起止值解包传给 range,再用 concat 把得到的各列拼接起来:
pd.concat([df[col].iloc[range(*index[i])] for i,col in enumerate(df.columns)],axis=1)
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
或者换一种方式,使用 apply:通过 df.columns.get_loc(x.name) 取得每一列的位置编号,再用它到索引列表 index 中取出对应的范围:
df.apply(lambda x: x.iloc[range(*index[df.columns.get_loc(x.name)])])
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
答案 1 :(得分:2)
先用 stack 堆叠,然后用 loc 选择:
tups = sorted([(i, j) for j, args in enumerate(index) for i in range(*args)])
df.stack().loc[tups].unstack()
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
pd.Series({
(i, j): df.at[i, j] for j, args in enumerate(index) for i in range(*args)
}).unstack()
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
答案 2 :(得分:2)
更新:我另外提了一个问题(another question),询问如何将这个问题向量化;@Divakar 给出了一个很好的回答(excellent answer),同样可以应用在这里:
r = np.arange(df.shape[0])[:, None]
m = (idx[:,0] <= r) & (idx[:,1] > r)
df.mask(~m).dropna(how='all')
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
旧的非矢量化方法:
此方法基于底层的 numpy 数组来创建掩码:
def foo(a, idx):
    """Build a boolean mask of shape ``a`` with one True run per column.

    Parameters
    ----------
    a : tuple of int
        ``(n_rows, n_columns)`` shape of the mask.
    idx : iterable of int pairs
        ``(start, stop)`` row positions for each column; ``stop`` is
        exclusive.

    Returns
    -------
    numpy.ndarray
        Boolean array that is True at ``[start:stop, k]`` for the k-th pair.
    """
    # Builtin ``bool`` replaces ``np.bool8``, which NumPy 2.0 removed.
    out = np.zeros(a, dtype=bool)
    # zip stops at the shorter input, so surplus pairs are ignored.
    for (i, j), k in zip(idx, np.arange(a[1])):
        out[i:j, k] = True
    return out
df.mask(~foo(df.shape, idx)).dropna(how='all')
输出:
0 1 2
1 NaN NaN 22.0
2 NaN 13.0 23.0
6 6.0 NaN NaN
7 7.0 NaN NaN
8 8.0 NaN NaN
答案 3 :(得分:0)
您可以用字典来代替存放“索引”的列表:
import pandas as pd

# Example frame: three columns of ten integers with the default index.
lst = [ range(10), range(11,21), range(21,31) ]
df = pd.DataFrame(lst).T
# Column label -> [first_row, last_row] positions, both inclusive.
# Named ``ranges`` so the builtin ``dict`` is not shadowed.
ranges = {0:[6,9], 1:[2,3], 2:[1,3]}
# Start from an all-NaN frame of the same shape and fill in the slices.
df2 = pd.DataFrame(columns = df.columns, index=df.index)
for k, (first, last) in ranges.items():
    # +1 because the upper bound is meant to be inclusive; assignment
    # aligns on the index, so rows outside the slice stay NaN.
    df2[k] = df[k].iloc[first:last+1]
print(df2)