我正在尝试比较熊猫数据框中的两列。我要从正文消息中删除用户名
raw_data = {
'user_name': ['name1 name2', 'nameX nameY '],
'text_body': ['this is the text were i should delete name1 and name2',
'this is the text were i should delete nameX and nameY']}
df_a = pd.DataFrame(raw_data, columns = ['user_name', 'text_body'])
df_a
我将两个pd列分成多个标记,以遍历应该删除单词的seconde列。
def sent_to_words(sentences):
# function to transforms a sentence into tokens
tokens = [nltk.word_tokenize(sent) for sent in sentences]
return tokens
def remover_user_name(text_body,user_name):
#sent_to_words is a function that transforms the raw data into small tokens
user_name_token = sent_to_words(user_name)
for row in dataset.itertuples(index=True, name='Pandas'):
for user in user_name_token:
dataset['user_clean'] = data.apply(lambda x: data.str.strip(user) for user in user_name_token)
return dataset['user_clean'].tolist()
data = dataset['Textemsg'].apply(lambda x: clean_data(x))
user_name = to_lower(dataset['user_name'])
dataaa = remover_user_name(data,user_name)
print(dataaa)
这就是我得到的错误:
TypeError Traceback (most recent call last)
<ipython-input-104-9b39af043e09> in <module>()
1 data = dataset['Textemsg'].apply(lambda x: clean_data(x))
2 user_name = to_lower(dataset['user_name'])
----> 3 dataaa = remover_user_name(data,user_name)
4 print(dataaa)
<ipython-input-103-0a5a8bce7b52> in remover_user_name(data, user_name)
3 for row in dataset.itertuples(index=True, name='Pandas'):
4 for user in user_name_token:
----> 5 dataset['user_clean'] = data.apply(lambda x: data.str.strip(user) for user in user_name_token)
6 return dataset['user_clean'].tolist()
/opt/conda/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
3192 else:
3193 values = self.astype(object).values
-> 3194 mapped = lib.map_infer(values, f, convert=convert_dtype)
3195
3196 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
TypeError: 'generator' object is not callable
答案 0 :(得分:1)
在这里,我删除了text_body
中所有单词,user_name
。
def remove_words_from_text_body(row):
# Seperate the words to remove by the space between them
words_to_remove = row['user_name'].split(" ")
# Get the text_body as a starting template
text_body = row['text_body']
# For each word that we want to remove, replace it with "" (blank)
for word in words_to_remove:
text_body = text_body.replace(word, "")
return text_body
因此,当您运行时:
df_a['cleaned_text'] = df_a.apply(remove_words_from_text_body, axis=1)
您将得到:
user_name text_body cleaned_text
0 name1 name2 this is the text were i should delete name1 an... this is the text were i should delete and
1 nameX nameY this is the text were i should delete nameX an... this is the text were i should delete and