Pandas / Python:将两列转换为矩阵

时间:2017-08-16 05:01:06

标签: python pandas

我可以使用以下命令将两列成功转换为矩阵。

dfb = datab.parse("a")

dfb

    Name       Product
0   Mike       Apple,pear
1   John       Orange,Banana
2   Bob        Banana
3   Connie      Pear


pd.get_dummies(dfb.Product).groupby(dfb.Name).apply(max)


    Apple,pear  Banana  Orange,Banana   Pear
Name                
Bob         0   1   0   0
Connie      0   0   0   1
John        0   0   1   0
Mike        1   0   0   0

但是,我想要的矩阵如下。

      Apple     Banana  Orange  Pear
Name                
Bob        0    1   0   0
Connie     0    0   0   1
John       0    1   1   0
Mike       1    0   0   1

2 个答案:

答案 0 :(得分:5)

**1**

需要将 set_index 与 get_dummies 结合使用:

df = dfb.set_index('Name').Product.str.get_dummies(',')
print (df)
        Apple  Banana  Orange  Pear
Name                               
Mike        1       0       0     1
John        0       1       1     0
Bob         0       1       0     0
Connie      0       0       0     1

**2**

使用 pandas.get_dummies 配合 split 生成新的 DataFrame,最后按列进行 groupby(因此使用 axis=1 和 level=0),并用 max 聚合:

dfb = dfb.set_index('Name')
df = pd.get_dummies(dfb.Product.str.split(',', expand=True), prefix='', prefix_sep='').groupby(axis=1, level=0).max()
print (df)
        Apple  Banana  Orange  Pear
Name                               
Mike        1       0       0     1
John        0       1       1     0
Bob         0       1       0     0
Connie      0       0       0     1

**3**

使用 split 和 MultiLabelBinarizer 的解决方案:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = pd.DataFrame(mlb.fit_transform(dfb.Product.str.split(',')),
                  columns=mlb.classes_,
                  index=dfb.Name)
print (df)
        Apple  Banana  Orange  Pear
Name                               
Mike        1       0       0     1
John        0       1       1     0
Bob         0       1       0     0
Connie      0       0       0     1

如果列 Name 中有重复项:

(此处的代码片段在提取过程中丢失;可按 Name 分组后取 max,例如 df.groupby('Name').max()。)

答案 1 :(得分:2)

计时结果见下文

选项1

def pir0(dfb):
    """Return a Name x Product 0/1 indicator matrix built with a matrix product.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        One row per distinct name, one column per distinct title-cased
        product; a cell is 1 if that name is associated with that product in
        at least one input row and 0 otherwise.
    """
    # Row-wise one-hot encoding of the names. Cast to int so the product below
    # is ordinary integer arithmetic even when get_dummies yields booleans.
    name_dummies = pd.get_dummies(dfb.Name).astype(int)
    # Title-case first so that e.g. "pear" and "Pear" land in the same column.
    product_dummies = dfb.Product.str.title().str.get_dummies(',')
    # (names x rows) . (rows x products) counts co-occurrences per pair.
    counts = name_dummies.T.dot(product_dummies)
    # Repeated (Name, Product) rows would give counts above 1; collapse the
    # counts to a 0/1 indicator.
    return counts.astype(bool).astype(int)
pir0(dfb)

        Apple  Banana  Orange  Pear
Bob         0       1       0     0
Connie      0       0       0     1
John        0       1       1     0
Mike        1       0       0     1

选项2

from cytoolz import concat

def pir1(dfb):
    """Return a Name x Product 0/1 indicator matrix using ``np.bincount``.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names (strings).

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance. A cell is 1 if the name
        is associated with the product in at least one input row, else 0.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    p = [x.title().split(',') for x in dfb.Product.values.tolist()]
    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which ndarray.repeat does not accept as repeat counts.
    lengths = np.array([len(y) for y in p], dtype=np.intp)
    # Hand factorize an ndarray; passing a plain list is deprecated in newer
    # pandas releases.
    flat = np.array([item for items in p for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    # Encode each (name, product) pair as one flat cell id, then count them.
    pair_ids = f0.repeat(lengths) * m + f1
    counts = np.bincount(pair_ids, minlength=n * m).reshape(n, m)
    # Repeated pairs produce counts above 1; reduce to a 0/1 indicator.
    return pd.DataFrame(counts.astype(bool).astype(int), u0, u1)

pir1(dfb)

        Apple  Pear  Orange  Banana
Mike        1     1       0       0
John        0     0       1       1
Bob         0     0       0       1
Connie      0     1       0       0

选项3

def pir2(dfb):
    """Return a Name x Product 0/1 indicator matrix by direct array assignment.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names (strings).

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance. A cell is 1 if the name
        is associated with the product in at least one input row, else 0.
        An input without rows yields an empty (0 x 0) frame.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    p = [x.title().split(',') for x in dfb.Product.values.tolist()]
    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which ndarray.repeat does not accept as repeat counts.
    lengths = np.array([len(y) for y in p], dtype=np.intp)
    # Hand factorize an ndarray; passing a plain list is deprecated in newer
    # pandas releases.
    flat = np.array([item for items in p for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    a = np.zeros((n, m), dtype=int)
    # Each product occurrence marks the (name code, product code) cell;
    # assigning 1 makes repeated pairs harmless.
    a[f0.repeat(lengths), f1] = 1

    return pd.DataFrame(a, u0, u1)

pir2(dfb)

        Apple  Pear  Orange  Banana
Mike        1     1       0       0
John        0     0       1       1
Bob         0     0       0       1
Connie      0     1       0       0

**计时**
以下为计时所用的代码

# Benchmark table: one row per replication factor, one column per candidate.
results = pd.DataFrame(
    index=pd.Index([10, 30, 100, 300, 1000, 3000, 10000, 30000]),
    columns='pir0 pir1 pir2 jez0 jez1 jez2'.split()
)

for i in results.index:
    # Build the test input by stacking `dfb` i times. `d` has to be a
    # module-level name because the timeit setup string imports it from
    # __main__.
    d = pd.concat([dfb] * i, ignore_index=True)
    for j in results.columns:
        stmt = '{}(d)'.format(j)
        setp = 'from __main__ import d, {}'.format(j)
        # DataFrame.set_value is no longer available in current pandas;
        # .at is the label-based scalar setter.
        results.at[i, j] = timeit(stmt, setp, number=20)

# Log-log plot of total time (20 runs) against the replication factor.
ax = results.plot(loglog=True)
ax.legend(ncol=2)

enter image description here

def pir0(dfb):
    """Name x Product indicator matrix computed as a dummy-matrix product.

    ``dfb`` needs a ``Name`` column and a ``Product`` column of
    comma-separated product names. The result has one row per distinct name
    and one column per distinct title-cased product, holding 1 where the name
    occurs with the product and 0 elsewhere.
    """
    name_dummies = pd.get_dummies(dfb.Name)
    product_dummies = dfb.Product.str.title().str.get_dummies(',')
    # Co-occurrence counts per (name, product) pair ...
    pair_counts = name_dummies.T.dot(product_dummies)
    # ... collapsed to 0/1.
    return pair_counts.astype(bool).astype(int)

from cytoolz import concat

def pir1(dfb):
    """Return a Name x Product 0/1 indicator matrix using ``np.bincount``.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names (strings).

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    p = [x.title().split(',') for x in dfb.Product.values.tolist()]
    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which ndarray.repeat does not accept as repeat counts.
    lengths = np.array([len(y) for y in p], dtype=np.intp)
    # factorize is given an ndarray because passing a plain list is
    # deprecated in newer pandas releases.
    flat = np.array([item for items in p for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    # Flat cell id of every (name, product) occurrence, counted per cell.
    pair_ids = f0.repeat(lengths) * m + f1
    counts = np.bincount(pair_ids, minlength=n * m).reshape(n, m)
    # Counts above 1 (repeated pairs) are reduced to 1.
    return pd.DataFrame(counts.astype(bool).astype(int), u0, u1)

def pir2(dfb):
    """Return a Name x Product 0/1 indicator matrix by direct array assignment.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names (strings).

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct names and columns the distinct title-cased
        products, both in order of first appearance; cells are 0 or 1.
        An input without rows yields an empty (0 x 0) frame.
    """
    f0, u0 = pd.factorize(dfb.Name.values)
    p = [x.title().split(',') for x in dfb.Product.values.tolist()]
    # Repeat counts need an integer dtype; a bare empty list would be
    # converted to a float array and rejected by ndarray.repeat.
    lengths = np.array([len(y) for y in p], dtype=np.intp)
    # factorize is given an ndarray because passing a plain list is
    # deprecated in newer pandas releases.
    flat = np.array([item for items in p for item in items], dtype=object)
    f1, u1 = pd.factorize(flat)
    n, m = u0.size, u1.size

    a = np.zeros((n, m), dtype=int)
    # Mark every (name code, product code) pair that occurs; duplicates just
    # write the same 1 again.
    a[f0.repeat(lengths), f1] = 1

    return pd.DataFrame(a, u0, u1)

def jez0(dfb):
    """Indicator matrix via ``Series.str.get_dummies``.

    Indexes ``dfb`` by its ``Name`` column and splits each ``Product`` string
    on commas, returning one 0/1 column per distinct product.
    """
    products_by_name = dfb.set_index('Name').Product
    return products_by_name.str.get_dummies(',')

def jez1(dfb):
    """Return a Name x Product 0/1 indicator matrix via split + get_dummies.

    Parameters
    ----------
    dfb : pandas.DataFrame
        Must have a ``Name`` column and a ``Product`` column whose entries are
        comma-separated product names.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Name`` (input row order), one integer 0/1 column per
        distinct product.
    """
    # One column per comma position; shorter rows are padded with missing
    # values, which get_dummies ignores.
    parts = dfb.set_index('Name').Product.str.split(',', expand=True)
    # Empty prefix/separator so dummy columns are named after the product
    # alone; the same product can therefore appear under several positions.
    dummies = pd.get_dummies(parts, prefix='', prefix_sep='')
    # Merge same-named columns. groupby(axis=1) is deprecated, so group the
    # rows of the transposed frame instead. max (not sum) keeps the result a
    # 0/1 indicator even if a product is listed twice in one row.
    merged = dummies.T.groupby(level=0).max().T
    return merged.astype(int)

def jez2(dfb):
    """Indicator matrix via ``MultiLabelBinarizer``.

    Splits each ``Product`` string of ``dfb`` on commas, fits a
    ``MultiLabelBinarizer`` on the resulting lists and wraps the transformed
    array in a DataFrame indexed by ``dfb.Name`` with the binarizer's classes
    as columns.
    """
    binarizer = MultiLabelBinarizer()
    product_lists = dfb.Product.str.split(',')
    indicator = binarizer.fit_transform(product_lists)
    return pd.DataFrame(indicator, index=dfb.Name, columns=binarizer.classes_)