对于数据框的每一行,我需要:
我以包含随机字符列表的系列为例
为了更新“Last”列,我尝试使用一个包含 while 循环的函数,但不知道该如何完成它。实现此目的的最佳实践是什么?
In[5]:
import pandas as pd
import numpy as np
# Example data: one comma-separated string of random characters per row,
# stored in a single column named 'List'.
df = pd.Series(
    [
        '6,f,e,w,m,i,n',
        '7,m,2,n,3,k,i',
        'h,e,a,l,5,v,8',
        'c,t,i,v,t,n,1',
        'o,q,k,2,p',
        '6,b,p,n,7,1,k',
        '3,u,v,q,e,1,z,w',
        'm,h,o,b,8,6,n',
    ],
    name='List',
).to_frame()
In[6]:
df
Out[6]:
List
0 6,f,e,w,m,i,n
1 7,m,2,n,3,k,i
2 h,e,a,l,5,v,8
3 c,t,i,v,t,n,1
4 o,q,k,2,p
5 6,b,p,n,7,1,k
6 3,u,v,q,e,1,z,w
7 m,h,o,b,8,6,n
In[14]:
# Split each comma-separated string once and reuse the resulting lists.
split_items = df['List'].str.split(',')
# Last item of every list.
df['Last'] = split_items.str[-1]
# Number of items in every list.
df['List-length'] = split_items.str.len()
# How many rows share the same 'Last' value (1 means the value is unique).
df['frequency'] = df.groupby('Last')['Last'].transform('count')
df
Out[14]:
List Last List-length frequency
0 6,f,e,w,m,i,n n 7 2
1 7,m,2,n,3,k,i i 7 1
2 h,e,a,l,5,v,8 8 7 1
3 c,t,i,v,t,n,1 1 7 1
4 o,q,k,2,p p 5 1
5 6,b,p,n,7,1,k k 7 1
6 3,u,v,q,e,1,z,w w 8 1
7 m,h,o,b,8,6,n n 7 2
In[1]:
def avoid_singles(d):
    """Step rows with a unique 'Last' value backwards through their list.

    Starting at the second-to-last item, every row whose current 'Last'
    value occurs only once in the column (``frequency == 1``) and whose list
    is long enough is moved one item further back. 'frequency' is recomputed
    after every step.

    The loop stops as soon as no unique value is left, or when the position
    has passed the start of the longest list. The second condition guarantees
    termination: without it a row that has run out of items keeps
    ``frequency == 1`` forever.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain the columns 'List' (comma-separated strings), 'Last',
        'List-length' and 'frequency'. 'Last' and 'frequency' are overwritten
        in place.

    Returns
    -------
    None
    """
    items = d['List'].str.split(',')
    longest = d['List-length'].max()
    position = -2  # -1 is the value 'Last' already holds
    is_single = d['frequency'] == 1

    while is_single.any() and abs(position) <= longest:
        # Only rows that still have an item at this position may move.
        can_step = is_single & (d['List-length'] >= abs(position))
        d['Last'] = np.where(can_step, items.str[position], d['Last'])
        # Use the frame that was passed in, not a module-level one.
        d['frequency'] = d.groupby('Last')['Last'].transform('count')
        is_single = d['frequency'] == 1
        position -= 1
# Run the fallback on the example frame. The function assigns to the frame's
# columns and has no return statement, so inspect `df` afterwards.
avoid_singles(df)
以下是预期的 Last 列:
Last
0 n
1 k
2 h
3 n
4 k
5 k
6 3
7 n
答案 0 :(得分:2)
您可以使用 DataFrame.apply 遍历各个样本,并用 np.equal.outer 将每个样本的各个字符与所有样本的最后一个字符逐一比较;np.argwhere 让我们从中选出第一个符合条件的字符:
import numpy as np
import pandas as pd
df = pd.DataFrame({'List': ['6,f,e,w,m,i,n', '7,m,2,n,3,k,i', 'h,e,a,l,5,v,8', 'c,t,i,v,t,n,1', 'o,q,k,2,p', '6,b,p,n,7,1,k', '3,u,v,q,e,1,z,w', 'm,h,o,b,8,6,n']})


def get_char(row):
    """Pick the character to use as 'Last' for one row of ``tmp``.

    Scans the row's items from the end towards the start and returns the
    first one that is the last item of some *other* row. If no item
    qualifies, the first item of the row's list is returned.

    Parameters
    ----------
    row : pandas.Series
        A row of ``tmp`` with 'l' (list of items) and 'i' (row position).

    Returns
    -------
    str
        The selected item.
    """
    reversed_items = row['l'][::-1]
    # Rows: this row's items (last first); columns: last item of every row.
    # Both operands are plain object arrays because ufunc.outer does not
    # accept pandas objects.
    mask = np.equal.outer(np.asarray(reversed_items, dtype=object), last_chars)
    mask[:, row['i']] = False  # Do not match against the row itself.
    mask[-1, 0] = True  # Guarantee a hit in the final row: fall back to the first item.
    # argwhere is row-major, so the first hit is the one closest to the end.
    return reversed_items[np.argwhere(mask)[0, 0]]


tmp = pd.DataFrame.from_dict(dict(
    l=df.List.str.split(','),
    i=np.arange(len(df))
))
# Computed once instead of once per row inside get_char.
last_chars = tmp['l'].str[-1].to_numpy(dtype=object)
df['Last'] = tmp.apply(get_char, axis=1)
输出以下内容:
0 6,f,e,w,m,i,n n
1 7,m,2,n,3,k,i k
2 h,e,a,l,5,v,8 h
3 c,t,i,v,t,n,1 n
4 o,q,k,2,p k
5 6,b,p,n,7,1,k 1
6 3,u,v,q,e,1,z,w 1
7 m,h,o,b,8,6,n n
请注意,样本 5 和 6 的输出分别是 1 和 1(与您提供的示例不同),但这符合您指定的规则:k 不是其他任何行的最后一个字符,而 1 是(样本 3)。
答案 1 :(得分:1)
结果与 @a_guest 的相同,但没有用到 numpy。在我看来,他们的方案更优雅,运行速度也更快。如果要重用数据,将值保存在 DataFrame 中而不是列表中可能会为以后省去一些工作。
In [0]: %timeit mine()
9.7 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [1]: %timeit theirs()
5.97 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
import pandas as pd
stringlist = [
    '6,f,e,w,m,i,n',
    '7,m,2,n,3,k,i',
    'h,e,a,l,5,v,8',
    'c,t,i,v,t,n,1',
    'o,q,k,2,p',
    '6,b,p,n,7,1,k',
    '3,u,v,q,e,1,z,w',
    'm,h,o,b,8,6,n',
]

# Tokenise every string and flip it, so column 0 of the frame holds the last
# item of each string; shorter rows are padded with missing values.
nested = [list(reversed(text.split(','))) for text in stringlist]
df = pd.DataFrame(nested)

# Fallback value per row: the first item of the original string, which is
# the final item of the reversed list.
first_strings = pd.Series([tokens[-1] for tokens in nested])
def next_valid(x):
    """Return the first non-missing value of a Series, or NaN if there is none.

    Parameters
    ----------
    x : pandas.Series
        Values to scan in their existing order.

    Returns
    -------
    object
        The first value left after dropping missing entries, or
        ``float('nan')`` when nothing is left.
    """
    remaining = x.dropna()
    if remaining.empty:
        # Plain float NaN: the pd.np alias is gone from current pandas.
        return float('nan')
    return remaining.iat[0]
# Column 0 holds each row's last string: blank it out unless at least one
# other row ends with the same string.
last_strings = df[0].mask(~df[0].duplicated(keep=False))

# In the remaining columns, keep only the values that are the last string
# of some row; everything else becomes missing.
not_last_strings = df.loc[:, 1:].mask(
    ~df.loc[:, 1:].isin(df[0].unique())
)

# Per row, take the first surviving value in column order
# (NaN when no value survived).
nextbest = not_last_strings.apply(next_valid, axis=1)

# Rows without a surviving value fall back to their first string.
substitutes = nextbest.mask(nextbest.isnull(), first_strings)

# Shared last strings are kept; unique ones are replaced by the substitute.
result = last_strings.mask(last_strings.isnull(), substitutes)
In [2]: pd.DataFrame([last_strings, nextbest, first_strings, substitutes, result],
index=['last_strings', 'nextbest', 'first_strings', 'substitutes', 'result']).T
last_strings nextbest first_strings substitutes result
0 n i 6 i n
1 NaN k 7 k k
2 NaN NaN h h h
3 NaN n c n n
4 NaN k o k k
5 NaN 1 6 1 1
6 NaN 1 3 1 1
7 n 8 m 8 n