Question

我正在尝试做pandas int or float column to percentage distribution。它适用于某些数据集，而对于其他数据集则会出错。

我收到重复条目错误。当我使用unstack时，此代码在 df1 中有问题。我正在为数据框架做准备。

import pandas as pd
import numpy as np
data = [{'Petal_width': 0.2, 'Petal_length': 1.4, 'Sepal_width': 3.5, 'Sepal_length': 5.1, 'Colour': 'a', 'Species_name': ' Setosa'}, {'Petal_width': 0.3, 'Petal_length': 1.4, 'Sepal_width': 3.0, 'Sepal_length': 4.6, 'Colour': 'b', 'Species_name': ' Setosa'}, {'Petal_width': 0.2, 'Petal_length': 1.3, 'Sepal_width': 3.6, 'Sepal_length': 4.7, 'Colour': 'a', 'Species_name': ' Setosa'}, {'Petal_width': 0.2, 'Petal_length': 1.5, 'Sepal_width': 3.1, 'Sepal_length': 4.6, 'Colour': 'c', 'Species_name': ' Setosa'}, {'Petal_width': 0.2, 'Petal_length': 1.4, 'Sepal_width': 3.6, 'Sepal_length': 5.0, 'Colour': 'b', 'Species_name': ' Setosa'}, {'Petal_width': 0.4, 'Petal_length': 1.7, 'Sepal_width': 3.9, 'Sepal_length': 5.4, 'Colour': 'b', 'Species_name': ' Setosa'}, {'Petal_width': 0.3, 'Petal_length': 1.4, 'Sepal_width': 3.4, 'Sepal_length': 4.6, 'Colour': 'b', 'Species_name': ' Setosa'}]
df = pd.DataFrame(data)
df = df.select_dtypes(exclude=['object']) #numerical
print(df)

values = [[0.1, 0.2, 1.3, 1.8, 2.5], [1.0, 1.4, 4.5, 5.1, 6.9], [2.0, 3.0, 3.4, 4.4], [4.3, 5.0, 5.5, 6.3, 7.9]]


for i in range(len(df.columns)):
    s=pd.cut(df.iloc[:,i],values[i]).dropna()
    x=s.map(lambda x : x.left).astype(int).to_frame('V')
    y=s.map(lambda x : x.right).astype(int).to_frame('V')
    y['r']=abs((x.V-df.iloc[:,i])/(y.V-x.V))
    x['r']=abs(1-y['r'])

    df1=pd.concat([x,y]).set_index('V',append=True).\
           r.unstack(fill_value=0).reset_index(drop=True).\
            reindex(columns=values[i],index=df.index,fill_value=0)
    print(df1)

错误：熊猫数据框中的条目重复

0 个答案: