我想通过操作将一个组应用于Pandas DataFrame而不执行任何聚合。相反,我只想将层次结构反映在MultiIndex中。
import pandas as pd
def multi_index_group_by(df, columns):
# TODO: How to write this? (Hard-coded to give the desired result for the example.)
if columns == ["b"]:
df.index = pd.MultiIndex(levels=[[0,1],[0,1,2]], labels=[[0,1,0,1,0],[0,0,1,1,2]])
return df
if columns == ["c"]:
df.index = pd.MultiIndex(levels=[[0,1],[0,1],[0,1]], labels=[[0,1,0,1,0],[0,0,0,1,1],[0,0,1,0,0]])
return df
if __name__ == '__main__':
df = pd.DataFrame({
"a": [0,1,2,3,4],
"b": ["b0","b1","b0","b1","b0"],
"c": ["c0","c0","c0","c1","c1"],
})
print(df.index.values) # [0,1,2,3,4]
# Add level of grouping
df = multi_index_group_by(df, ["b"])
print(df.index.values) # [(0, 0) (1, 0) (0, 1) (1, 1) (0, 2)]
# Examples
print(df.loc[0]) # Group 0
print(df.loc[1,1]) # Group 1, Item 1
# Add level of grouping
df = multi_index_group_by(df, ["c"])
print(df.index.values) # [(0, 0, 0) (1, 0, 0) (0, 0, 1) (1, 1, 0) (0, 1, 0)]
# Examples
print(df.loc[0]) # Group 0
print(df.loc[0,0]) # Group 0, Sub-Group 0
print(df.loc[0,0,1]) # Group 0, Sub-Group 0, Item 1
实施multi_index_group_by
的最佳方法是什么?以下几乎可以工作,但结果索引不是数字:
index_columns = []
# Add level of grouping
index_columns += ["b"]
print(df.set_index(index_columns, drop=False))
# Add level of grouping
index_columns += ["c"]
print(df.set_index(index_columns, drop=False))
编辑:为了澄清,在示例中,最终索引应该相当于:
[
[ #b0
[ #c0
{"a": 0, "b": "b0", "c": "c0"},
{"a": 2, "b": "b0", "c": "c0"},
],
[ #c1
{"a": 4, "b": "b0", "c": "c1"},
]
],
[ #b1
[ #c0
{"a": 1, "b": "b1", "c": "c0"},
],
[ #c1
{"a": 3, "b": "b1", "c": "c1"},
]
]
]
编辑:这是我到目前为止所做的最好的事情:
def autoincrement(value=0):
def _autoincrement(*args, **kwargs):
nonlocal value
result = value
value += 1
return result
return _autoincrement
def swap_levels(df, i, j):
order = list(range(len(df.index.levels)))
order[i], order[j] = order[j], order[i]
return df.reorder_levels(order)
def multi_index_group_by(df, columns):
new_index = df.groupby(columns)[columns[0]].aggregate(autoincrement())
result = df.join(new_index.rename("_new_index"), on=columns)
result.set_index('_new_index', append=True, drop=True, inplace=True)
result.index.name = None
result = swap_levels(result, -2, -1)
return result
它给出了正确的结果,除了最后一级,它没有变化。仍感觉还有很大的改进空间。
答案 0 :(得分:2)
如果您愿意使用sklearn套餐,可以使用LabelEncoder:
var path = 'd:\\folder1\\folder2\\assets\\images\\62f9a0f4-98b9-4dd0-8047-ed1a3cc306cf.png';
var parts = path.split('\\');
var file = parts.pop();
var fileAndPar = parts.pop() + '\\' + file;
console.log(fileAndPar);
它对每列的标签进行编码,其值介于0和n_classes-1
之间调用
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
def multi_index_group_by(df, columns):
df.index = pd.MultiIndex.from_tuples( zip( *[ le.fit_transform( df[col] ) for col in columns ] ) )
return df
给你
multi_index_group_by( ['b','c'] )
答案 1 :(得分:1)
此代码可以满足您的需求:
index_columns = []
replace_values = {}
index_columns += ["b"]
replace_values.update({'b0':0, 'b1':1})
df[['idx_{}'.format(i) for i in index_columns]] = df[index_columns].replace(replace_values)
print(df.set_index(['idx_{}'.format(i) for i in index_columns], drop=True))
index_columns += ["c"]
replace_values.update({'c0':0, 'c1':1})
df[['idx_{}'.format(i) for i in index_columns]] = df[index_columns].replace(replace_values)
print(df.set_index(['idx_{}'.format(i) for i in index_columns], drop=True))
# If you want the 3rd ('c') level MultiIndex:
df['d'] = [0,0,1,0,0]
print(df.set_index(['idx_{}'.format(i) for i in index_columns] + ['d'], drop=True))