
时间:2017-09-09 20:26:26

标签: python pandas numpy dataframe



# Sample 3-D array of shape (2, 3, 4) filled with uniform random values.
A = np.random.rand(2,3,4)
array([[[ 0.43793885,  0.40078139,  0.48078691,  0.05334248],
    [ 0.76331509,  0.82514441,  0.86169078,  0.86496111],
    [ 0.75572665,  0.80860943,  0.79995337,  0.63123724]],

   [[ 0.20648946,  0.57042315,  0.71777265,  0.34155005],
    [ 0.30843717,  0.39381407,  0.12623462,  0.93481552],
    [ 0.3267771 ,  0.64097038,  0.30405215,  0.57726629]]])

# Flatten A into one dimension and wrap it as a single-column DataFrame,
# one row per array element.
df = pd.DataFrame(A.flatten())

我试图像这样生成x / y / z列:

           A  z  y  x
0   0.437939  0  0  0
1   0.400781  0  0  1
2   0.480787  0  0  2
3   0.053342  0  0  3
4   0.763315  0  1  0
5   0.825144  0  1  1
6   0.861691  0  1  2
7   0.864961  0  1  3
21  0.640970  1  2  1
22  0.304052  1  2  2
23  0.577266  1  2  3


# Attempt to label every flattened element of A with its (z, y, x) position.
dimnames = ['z', 'y', 'x']
ranges   = [ np.arange(x) for x in A.shape ]
# NOTE(review): np.meshgrid defaults to indexing='xy', which swaps the first
# two axes of its outputs, so these flattened coordinates do not follow the
# element order of A.flatten() (visible in the z/y pattern of the output
# printed after this snippet).
ix       = [ x.flatten()  for x in np.meshgrid(*ranges) ]
for name, col in zip(dimnames, ix):
    df[name] = col
# Move the coordinate columns into a MultiIndex and collapse the single
# remaining value column into a Series.
df = df.set_index(dimnames).squeeze()


z  y  x
0  0  0    0.437939
      1    0.400781
      2    0.480787
      3    0.053342
1  0  0    0.763315
      1    0.825144
      2    0.861691
      3    0.864961
0  1  0    0.755727
      1    0.808609
      2    0.799953
      3    0.631237
1  1  0    0.206489
      1    0.570423
      2    0.717773
      3    0.341550
0  2  0    0.308437
      1    0.393814
      2    0.126235
      3    0.934816
1  2  0    0.326777
      1    0.640970
      2    0.304052
      3    0.577266

# Look up the same position in the array and in the indexed Series so the two
# printed values can be compared.
print(A[0,1,0])

print(df.loc[0,1,0])


6 个答案:

答案 0 :(得分:5)


    Z  Y  X         A
0   0  0  0  0.437939
1   0  0  1  0.400781
2   0  0  2  0.480787
3   0  0  3  0.053342
21  1  2  1  0.640970
22  1  2  2  0.304052
23  1  2  3  0.577266


In [321]: %timeit  using_cartesian_product(A, columns)
100 loops, best of 3: 13.8 ms per loop

In [318]: %timeit using_multiindex(A, columns)
10 loops, best of 3: 35.6 ms per loop

In [320]: %timeit indices_merged_arr_generic(A, columns)
10 loops, best of 3: 29.1 ms per loop

In [319]: %timeit using_product(A)
1 loop, best of 3: 461 ms per loop

如果性能是首要考虑,请考虑使用 senderle 的 cartesian_product(见下面的代码)。


import numpy as np
import pandas as pd
import functools
import itertools as IT
import string
product = IT.product

def cartesian_product_broadcasted(*arrays):
    """Return the Cartesian product of several 1-D sequences as a 2-D array.

    Adapted from http://stackoverflow.com/a/11146645/190597 (senderle).

    Parameters
    ----------
    *arrays : 1-D array-likes
        One sequence per output column.

    Returns
    -------
    numpy.ndarray
        Array with one row per combination and one column per input. Rows
        are ordered so that the last input varies fastest. The dtype is
        ``np.result_type(*arrays)``.
    """
    # np.ix_ reshapes each input so that together they broadcast to the full
    # N-dimensional grid of combinations.
    broadcastable = np.ix_(*arrays)
    broadcasted = np.broadcast_arrays(*broadcastable)
    dtype = np.result_type(*arrays)
    # Every broadcast array has the same shape; its size is the row count.
    rows, cols = broadcasted[0].size, len(broadcasted)
    # Fill one flat buffer column by column, then view it as (cols, rows)
    # and transpose, instead of stacking many temporaries.
    out = np.empty(rows * cols, dtype=dtype)
    start, end = 0, rows
    for a in broadcasted:
        out[start:end] = a.reshape(-1)
        start, end = end, end + rows
    return out.reshape(cols, rows).T

def using_cartesian_product(A, columns):
    """Build a DataFrame of element coordinates plus the values of ``A``.

    Parameters
    ----------
    A : numpy.ndarray
        Array whose elements become the rows of the result.
    columns : list of str
        One label per axis of ``A``, used for the coordinate columns.

    Returns
    -------
    pandas.DataFrame
        The coordinate columns followed by a column ``'A'`` holding the
        flattened values.
    """
    # One integer range per axis; their Cartesian product enumerates every
    # index tuple of A.
    axis_ranges = [np.arange(extent, dtype='int') for extent in A.shape]
    coords = cartesian_product_broadcasted(*axis_ranges)
    frame = pd.DataFrame(coords, columns=columns)
    frame['A'] = A.flatten()
    return frame

def using_multiindex(A, columns):
    """Build a DataFrame of element coordinates plus values via a MultiIndex.

    Parameters
    ----------
    A : numpy.ndarray
        Array whose elements become the rows of the result.
    columns : list of str
        One label per axis of ``A``, used as the index level names.

    Returns
    -------
    pandas.DataFrame
        The coordinate columns (from the reset index) followed by a column
        ``'A'`` holding the flattened values.
    """
    # from_product enumerates index tuples with the last level varying
    # fastest, the same order in which flatten() emits the values.
    levels = [range(extent) for extent in A.shape]
    index = pd.MultiIndex.from_product(levels, names=columns)
    indexed = pd.DataFrame({'A': A.flatten()}, index=index)
    # Turn the index levels back into ordinary columns.
    return indexed.reset_index()

def indices_merged_arr_generic(arr, columns):
    """Build a DataFrame of element coordinates plus the values of ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        Array whose elements become the rows of the result.
    columns : sequence of str
        One label per axis of ``arr``, used for the coordinate columns.

    Returns
    -------
    pandas.DataFrame
        The coordinate columns followed by a column ``'A'`` holding the
        values. All columns share the dtype of ``arr``, because coordinates
        and values are stored in a single array.
    """
    n = arr.ndim
    # Open grid: one broadcastable index array per axis.
    grid = np.ogrid[tuple(map(slice, arr.shape))]
    out = np.empty(arr.shape + (n+1,), dtype=arr.dtype)
    for i in range(n):
        # Broadcasting fills slot i with the index along axis i.
        out[..., i] = grid[i]
    out[..., -1] = arr
    out = out.reshape(-1, n + 1)
    # The values live in the LAST slot, so the 'A' label must come last too.
    return pd.DataFrame(out, columns=list(columns) + ['A'])

def using_product(A):
    """Build a DataFrame of the values of a 3-D array with x/y/z coordinates.

    Parameters
    ----------
    A : numpy.ndarray
        Three-dimensional array; its shape is unpacked into exactly three
        extents.

    Returns
    -------
    pandas.DataFrame
        Column ``0`` holds the flattened values; columns ``x``, ``y`` and
        ``z`` hold the index along the first, second and third axis.
    """
    n_first, n_second, n_third = A.shape
    # product() yields index triples with the last axis varying fastest;
    # zip(*...) transposes them into one tuple per axis.
    triples = product(range(n_first), range(n_second), range(n_third))
    x_, y_, z_ = zip(*triples)
    values = pd.DataFrame(A.flatten())
    return values.assign(x=x_, y=y_, z=z_)

# Benchmark input: a 100 x 100 x 100 array of random floats.
A = np.random.random((100, 100, 100))
shape = A.shape
# One uppercase letter per axis, taken from the end of the alphabet and
# reversed, which gives ['Z', 'Y', 'X'] for a 3-D array.
columns = list(reversed(string.ascii_uppercase[-len(shape):]))


Last login: Sat Sep  9 12:40:05 on console
You have new mail.
-bash: /Users/me/google-cloud-sdk/path.bash.inc: No such file or directory
-bash: /Users/me/google-cloud-sdk/completion.bash.inc: No such file or directory
ME:~ myshell$ 

答案 1 :(得分:3)

from itertools import product

# Random 2 x 3 x 4 example array.
A = np.random.rand(2, 3, 4)
x, y, z = A.shape
# product() enumerates index triples with the last axis varying fastest;
# zip(*...) transposes the triples into one tuple of indices per axis.
x_, y_, z_ = zip(*product(*map(range, (x, y, z))))
df = pd.DataFrame(A.flatten())
df = df.assign(x=x_, y=y_, z=z_)
>>> df

           0  x  y  z
0   0.548814  0  0  0
1   0.715189  0  0  1
2   0.602763  0  0  2
3   0.544883  0  0  3
4   0.423655  0  1  0
5   0.645894  0  1  1
6   0.437587  0  1  2
7   0.891773  0  1  3
8   0.963663  0  2  0
9   0.383442  0  2  1
10  0.791725  0  2  2
11  0.528895  0  2  3
12  0.568045  1  0  0
13  0.925597  1  0  1
14  0.071036  1  0  2
15  0.087129  1  0  3
16  0.020218  1  1  0
17  0.832620  1  1  1
18  0.778157  1  1  2
19  0.870012  1  1  3
20  0.978618  1  2  0
21  0.799159  1  2  1
22  0.461479  1  2  2
23  0.780529  1  2  3

答案 2 :(得分:2)

# My solution is based on Divakar's answer (the hyperlink was lost when this
# page was extracted). The function works for arrays of any number of
# dimensions.
def indices_merged_arr(arr):
    """Return a 2-D array pairing every element of ``arr`` with its indices.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array of any number of dimensions.

    Returns
    -------
    numpy.ndarray
        One row per element of ``arr``. Column 0 holds the value and
        columns 1..ndim hold the index along each axis. All columns share
        the dtype of ``arr``.
    """
    n = arr.ndim
    # Open grid: one broadcastable index array per axis.
    grid = np.ogrid[tuple(map(slice, arr.shape))]
    out = np.empty(arr.shape + (n+1,), dtype=arr.dtype)
    for i in range(n):
        # Slot 0 is reserved for the value, so axis i goes into slot i + 1.
        out[..., i+1] = grid[i]
    out[..., 0] = arr
    return out.reshape(-1, n + 1)


A = np.array([[[ 0.43793885,  0.40078139,  0.48078691,  0.05334248],
               [ 0.76331509,  0.82514441,  0.86169078,  0.86496111],
               [ 0.75572665,  0.80860943,  0.79995337,  0.63123724]],

              [[ 0.20648946,  0.57042315,  0.71777265,  0.34155005],
               [ 0.30843717,  0.39381407,  0.12623462,  0.93481552],
               [ 0.3267771 ,  0.64097038,  0.30405215,  0.57726629]]])

df = pd.DataFrame(indices_merged_arr(A), columns=list('Axyz'))

# Resulting ``df``:
#
#            A    x    y    z
# 0   0.437939  0.0  0.0  0.0
# 1   0.400781  0.0  0.0  1.0
# 2   0.480787  0.0  0.0  2.0
# 3   0.053342  0.0  0.0  3.0
# 4   0.763315  0.0  1.0  0.0
# 5   0.825144  0.0  1.0  1.0
# 6   0.861691  0.0  1.0  2.0
# 7   0.864961  0.0  1.0  3.0
# 8   0.755727  0.0  2.0  0.0
# 9   0.808609  0.0  2.0  1.0
# 10  0.799953  0.0  2.0  2.0
# 11  0.631237  0.0  2.0  3.0
# 12  0.206489  1.0  0.0  0.0
# 13  0.570423  1.0  0.0  1.0
# 14  0.717773  1.0  0.0  2.0
# 15  0.341550  1.0  0.0  3.0
# 16  0.308437  1.0  1.0  0.0
# 17  0.393814  1.0  1.0  1.0
# 18  0.126235  1.0  1.0  2.0
# 19  0.934816  1.0  1.0  3.0
# 20  0.326777  1.0  2.0  0.0
# 21  0.640970  1.0  2.0  1.0
# 22  0.304052  1.0  2.0  2.0
# 23  0.577266  1.0  2.0  3.0


答案 3 :(得分:1)


A = np.random.rand(2,3,4)
# Start from a fresh single-column frame of the flattened values; the loop
# below adds one coordinate column per axis to it.
df = pd.DataFrame(A.flatten())
dimnames = ['z', 'y', 'x']
ranges   = [ np.arange(x) for x in A.shape ]
# indexing='ij' keeps the meshgrid outputs in the same axis order as A, so the
# flattened coordinates line up with the element order of A.flatten().
ix       = [ x.flatten()  for x in np.meshgrid(*ranges, indexing='ij') ]
for name, col in zip(dimnames, ix):
    df[name] = col
# Move the coordinate columns into a MultiIndex and collapse the single
# remaining value column into a Series.
df = df.set_index(dimnames).squeeze()

# Compare the results: every value stored under an index tuple should equal
# the element of A at that same position.
# Series.items() replaces iteritems(), which was removed in pandas 2.0; the
# formatted single-argument print behaves the same on Python 2 and Python 3.
for ix, val in df.items():
    print('{} {}'.format(ix, val == A[ix]))
(0, 0, 0) True
(0, 0, 1) True
(0, 0, 2) True
(0, 0, 3) True
(0, 1, 0) True
(0, 1, 1) True
(0, 1, 2) True
(0, 1, 3) True
(0, 2, 0) True
(0, 2, 1) True
(0, 2, 2) True
(0, 2, 3) True
(1, 0, 0) True
(1, 0, 1) True
(1, 0, 2) True
(1, 0, 3) True
(1, 1, 0) True
(1, 1, 1) True
(1, 1, 2) True
(1, 1, 3) True
(1, 2, 0) True
(1, 2, 1) True
(1, 2, 2) True
(1, 2, 3) True

答案 4 :(得分:0)


# np.indices returns one coordinate array per axis of A, each shaped like A.
x,y,z = np.indices(A.shape)

# Flatten the coordinates and the values, stack them as rows, then transpose
# so that each row of the frame is (x, y, z, value). Stacking everything into
# one array gives all four columns a common dtype.
df = pd.DataFrame(np.array([p.flatten() for p in [x,y,z,A]]).T,
                  columns=['x', 'y', 'z', 'A'])

答案 5 :(得分:-1)

def ndarray_to_indexed_2d(data):
    """Flatten all but the last axis of ``data`` and prepend index columns.

    Parameters
    ----------
    data : numpy.ndarray
        Input array. Intended for arrays with at least two dimensions: the
        leading axes become index columns and the last axis is kept as the
        data columns.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per combination of leading indices. The first
        ``data.ndim - 1`` columns hold those indices and the remaining
        ``data.shape[-1]`` columns hold the matching slice of ``data``.
    """
    lead_shape = data.shape[:-1]
    # np.prod replaces the np.product alias, which was removed in NumPy 2.0.
    n_rows = int(np.prod(lead_shape))
    # Convert each flat row number back into its multi-dimensional index.
    idx = np.column_stack(np.unravel_index(np.arange(n_rows), lead_shape))
    data2d = np.hstack((idx, data.reshape(n_rows, data.shape[-1])))
    return data2d