我有DataFrame
:
import pandas as pd
df = pd.DataFrame({'Board': ['A', 'B'], 'Off': ['C', 'D'], 'Stops': ['Q/W/E', 'Z'], 'Pax': [10, 100]})
看起来像:
Board Off Pax Stops
0 A C 10 Q/W/E
1 B D 100 Z
我希望按DataFrame
列进行Stops
拆分,并将Board
和Off
重新排列为Pax
值重复的行,如下所示;
Board Off Pax
0 A Q 10
1 Q W 10
2 W E 10
3 E C 10
4 B Z 100
5 Z D 100
对此有任何帮助将不胜感激。
答案 0 :(得分:4)
我打破了步骤
df['New']=df[['Board','Stops','Off']].apply(lambda x : '/'.join(x),1)
df['New2']=df['New'].str.split('/').apply(lambda x : list(zip(x[:-1],x[1:])))
namedict = {0 : 'Board',1:'Off'}
df[['Pax','New2']].set_index('Pax').New2.apply(pd.Series).\
stack().apply(pd.Series).reset_index().\
drop('level_1',1).rename(columns=namedict)
Out[1260]:
Pax Board Off
0 10 A Q
1 10 Q W
2 10 W E
3 10 E C
4 100 B Z
5 100 Z D
答案 1 :(得分:3)
以下是使用拆分和pd.concat
的一种方法,使用时会将元组转换为sereis pd.DataFrame
#Split the by `/`
df['New'] = (df['Board']+'/'+df['Stops']+'/'+df['Off']).str.split('/')
#0 [A, Q, W, E, C]
#1 [B, Z, D]
#Name: New, dtype: object
temp = df['New'].apply(lambda x : list(zip(x,x[1:])))
# Zip will lead to output given (1,2,3) -> [(1,2),(2,3)]
#0 [(A, Q), (Q, W), (W, E), (E, C)]
#1 [(B, Z), (Z, D)]
#Name: New, dtype: object
di = {0 : 'Board',1:'Off','index':'Pax'}
# Concat each row by converting it to DataFrame with key as Pax and rename the columns.
ndf = pd.concat([pd.DataFrame(i,index=np.repeat(j,len(i))) for (i,j) in zip(temp,df['Pax'].values)])\
.reset_index().rename(columns=di)
输出:
Pax Board Off 0 10 A Q 1 10 Q W 2 10 W E 3 10 E C 0 100 B Z 1 100 Z D
答案 2 :(得分:3)
from itertools import islice
#https://stackoverflow.com/a/6822773/2901002
#added a for return Pax
def window(a, seq, n=2):
"Returns a sliding window (of width n) over data from the iterable"
" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
it = iter(seq)
result = tuple(islice(it, n))
if len(result) == n:
yield (*result, a)
for elem in it:
result = result[1:] + (elem,)
yield (*result, a)
#join all columns
a = df['Board'] + '/' + df['Stops'] + '/' + df['Off']
#split to lists
a = a.str.split('/')
#apply window function to each value and flatten
L = [(j,k,l) for x, y in zip(a, df['Pax']) for j,k,l in list(window(y, x, 2)) ]
print (L)
[('A', 'Q', 10), ('Q', 'W', 10), ('W', 'E', 10),
('E', 'C', 10), ('B', 'Z', 100), ('Z', 'D', 100)]
#DataFrame constructor
df = pd.DataFrame(L, columns=['Board','Off','Pax'])
print (df)
Board Off Pax
0 A Q 10
1 Q W 10
2 W E 10
3 E C 10
4 B Z 100
5 Z D 100
<强>计时强>:
import pandas as pd
N = 1000
L1 = list('ABCDEFGHIJKLMNOP')
L = ['Q/W/E','Q1/W1/E1','Z','A/B/C/D']
df = pd.DataFrame({'Board': np.random.choice(L1, N),
'Off': np.random.choice(L1, N),
'Stops': np.random.choice(L, N),
'Pax': np.random.randint(100, size=N)})
print (df)
def bharath(df):
df['New'] = (df['Board']+'/'+df['Stops']+'/'+df['Off']).str.split('/')
temp = df['New'].apply(lambda x : list(zip(x,x[1:])))
di = {0 : 'Board',1:'Off'}
return pd.concat([pd.DataFrame(i,index=np.repeat(j,len(i))) for (i,j) in zip(temp,df['Pax'].values)]).reset_index().rename(columns=di)
def jez(df):
from itertools import islice
#https://stackoverflow.com/a/6822773/2901002
#added a for return Pax
def window(a, seq, n=2):
"Returns a sliding window (of width n) over data from the iterable"
" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
it = iter(seq)
result = tuple(islice(it, n))
if len(result) == n:
yield (*result, a)
for elem in it:
result = result[1:] + (elem,)
yield (*result, a)
a = df['Board'] + '/' + df['Stops'] + '/' + df['Off']
a = a.str.split('/')
L = [(j,k,l) for x, y in zip(a, df['Pax']) for j,k,l in list(window(y, x, 2)) ]
return pd.DataFrame(L, columns=['Board','Off','Pax'])
def wen(df):
df['New']=df[['Board','Stops','Off']].apply(lambda x : '/'.join(x),1)
df['New2']= df['New'].str.split('/').apply(lambda x : list(zip(x[:-1],x[1:])))
namedict = {0 : 'Board',1:'Off'}
return df[['Pax','New2']].set_index('Pax').New2.apply(pd.Series).stack().apply(pd.Series).reset_index().drop('level_1',1).rename(columns=namedict)
print (jez(df))
#print (bharath(df))
print (wen(df))
In [433]: %timeit (jez(df))
100 loops, best of 3: 6.6 ms per loop
In [434]: %timeit (wen(df))
1 loop, best of 3: 747 ms per loop
In [450]: %timeit (bharath(df))
1 loop, best of 3: 406 ms per loop
答案 3 :(得分:1)
相对更具可读性和效率:
def expand_rows(series):
"""
expand a row by splitting on "/"
"""
return series.str.split("/", expand=True).stack()
d = {'Board': lambda dff: dff['Board'] + '/' + dff['Stops'],
'Off': lambda dff: dff['Stops'] + '/' + dff['Off']
}
df.set_index('Pax').assign(**d)[['Board', 'Off']].apply(expand_rows).reset_index(level=0)
使用来自@ jezrael帖子的数据集计时,它产生以下结果:
每回路15.8 ms±146μs(平均值±标准偏差,7次运行,每次100次循环)