如果列中的条目包含列表中的单词,我想在我的python pandas数据帧中添加一个标志
对于任意单独的一行,我们可以使用:
# True if at least one entry of `words` is found in the value labelled 0 of
# column 'a' (a substring test, assuming that cell holds a string).
any(word in train['a'][0] for word in words)
我试图制作一个模式
import pandas as pd
import numpy as np
# Keywords to search for; note that ' pics ' carries surrounding spaces.
words = ['photos', 'pictures', ' pics ', 'pix', 'image']
# Alternation pattern: the keywords joined with '|'.
pattern = '|'.join(words)
# Sample frame whose single column 'a' holds the keywords themselves.
train = pd.DataFrame({'a': words})
我尝试过使用 str.contains,但它没有按这个模式进行匹配:
def emb_col_1(tr, te, col, pat, suf):
    """Add a 0/1 flag column to ``tr`` marking rows of ``col`` that match ``pat``.

    The new column is named ``"0_" + col + suf`` and is written into ``tr``
    in place; nothing is returned.

    Args:
        tr: DataFrame to modify.
        te: Unused; kept so existing call sites keep working.
        col: Name of the string column to search.
        pat: Regular-expression pattern given as a single string
            (e.g. ``"photos|pix"``), as expected by ``Series.str.contains``.
        suf: Suffix appended to the name of the new flag column.
    """
    flag = "0_" + col + suf
    tr[flag] = 0
    # na=False makes missing values count as "no match"; without it the mask
    # can contain NaN, which boolean indexing with .loc rejects.
    tr.loc[tr[col].str.contains(pat, na=False), flag] = 1
    # Alternatives that were tried, kept for reference:
    #tr.loc[tr[col].str.count(pat)>0, "0_"+col+suf] =1
    #tr.loc[(word in tr[col].str for word in pat), "0_"+col+suf] =1
    #tr["0_"+col+suf] = np.where(tr[col].str.contains(pat, case=False, na=False), 1, 0)
    #tr["0_"+col+suf] = np.where(any(word in train[col] for word in pat), 1, 0)
# First attempt: passes the raw list `words` as the pattern argument.
# NOTE(review): Series.str.contains expects a string pattern, so passing a
# list is presumably why this call fails — confirm.
emb_col_1(train, test, 'a', words, '_p')
# Second attempt: passes the '|'-joined string `pattern` instead.
# NOTE(review): `test` is not defined anywhere in the visible snippet.
emb_col_1(train, test, 'a', pattern, '_p')
提前谢谢
答案 0 :(得分:0)
我相信你需要:
words = ['photos', 'pictures', ' pics ', 'pix', 'image']
# Strip leading/trailing whitespace from every keyword, then join the
# results into a single alternation pattern.
pattern = '|'.join(map(str.strip, words))
# Sample data: the keywords plus two extra rows for testing.
train = pd.DataFrame({'a': words + ['a', 'pics sss']})
print(train)
# The unused ``te`` argument has been dropped from the signature.
def emb_col_1(tr: pd.DataFrame, col: str, pat: str, suf: str) -> pd.DataFrame:
    """Add a 0/1 flag column marking rows of ``col`` that match ``pat``.

    The flag column is named ``"0_" + col + suf``. ``tr`` is modified in
    place and the same object is returned.

    Args:
        tr: DataFrame to modify.
        col: Name of the string column to search.
        pat: Regular-expression pattern, e.g. ``"photos|pix"``.
        suf: Suffix appended to the name of the new flag column.

    Returns:
        ``tr`` itself, with the flag column added.
    """
    # case=False ignores letter case; na=False treats missing values as
    # "no match"; astype(int) converts True/False into 1/0.
    matches = tr[col].str.contains(pat, case=False, na=False)
    tr["0_" + col + suf] = matches.astype(int)
    return tr
# emb_col_1 hands back the frame it was given, now carrying the flag column.
df1 = emb_col_1(train, col='a', pat=pattern, suf='_p')
print(df1)
a 0_a_p
0 photos 1
1 pictures 1
2 pics 1
3 pix 1
4 image 1
5 a 0
6 pics sss 1
编辑:
words = ['photos', ' pics ', 'pix', 'image']
# Strip leading/trailing whitespace from every keyword and wrap it in \b
# word boundaries, so that only whole-word occurrences match.
pattern = '|'.join(r'\b' + word.strip() + r'\b' for word in words)
# Sample data: the keywords plus three extra rows for testing.
train = pd.DataFrame({'a': words + ['a', 'pics sss', 'pictures']})
print(train)
a
0 photos
1 pics
2 pix
3 image
4 a
5 pics sss
6 pictures
# The unused ``te`` argument has been dropped from the signature.
def emb_col_1(tr: pd.DataFrame, col: str, pat: str, suf: str) -> pd.DataFrame:
    """Add a 0/1 flag column marking rows of ``col`` that match ``pat``.

    The flag column is named ``"0_" + col + suf``. ``tr`` is modified in
    place and the same object is returned.

    Args:
        tr: DataFrame to modify.
        col: Name of the string column to search.
        pat: Regular-expression pattern, e.g. ``r"\\bphotos\\b|\\bpix\\b"``.
        suf: Suffix appended to the name of the new flag column.

    Returns:
        ``tr`` itself, with the flag column added.
    """
    # case=False ignores letter case; na=False treats missing values as
    # "no match"; astype(int) converts True/False into 1/0.
    matches = tr[col].str.contains(pat, case=False, na=False)
    tr["0_" + col + suf] = matches.astype(int)
    return tr
# emb_col_1 hands back the frame it was given, now carrying the flag column.
df1 = emb_col_1(train, col='a', pat=pattern, suf='_p')
print(df1)
a 0_a_p
0 photos 1
1 pics 1
2 pix 1
3 image 1
4 a 0
5 pics sss 1
6 pictures 0