这是我想在 Python 中对 Pandas 数据帧一次性完成的所有预处理操作:
我正在使用Dask并行化我的Python数据框函数。
这是一个示例函数:
df2 = df.map_partitions(lambda d: d.replace(r'\t|\r|\n', '', regex=True))
但是,上述每个预处理步骤都各自需要一个类似这样的函数。有没有办法把所有正则表达式合并起来?我考虑过用正则表达式的或(`|`)管道符,但不确定这是否是最佳方案。
答案 0 :(得分:2)
您可以创建一个 sklearn 转换器(transformer),为您完成所有工作。
例如
import re
from sklearn.base import TransformerMixin
class TextCleaner(TransformerMixin):
    """Scikit-learn compatible transformer that cleans a pandas Series of text.

    All cleaning steps are applied in a single ``transform`` pass, controlled
    by the constructor flags:

    Parameters
    ----------
    words_only : bool
        If True, strip every non-word character (emoticons are dropped too).
    emoji_normalize : bool
        If True, re-derive the text from the raw input, normalizing emoticons
        (``;`` -> ``:``, ``-`` removed).  Note: this branch overrides the
        ``words_only`` result, mirroring the original behavior.
    remove_digits : bool
        If True, delete the characters 0-9.
    lower_case : bool
        If True, lower-case the text.
    stop_words : iterable of str or None
        Words to drop from the cleaned text; None disables the step.
    token : bool
        If True, return a Series of token lists instead of strings.
    """

    def __init__(self,
                 words_only=False,
                 emoji_normalize=False,
                 remove_digits=True,
                 lower_case=False,
                 stop_words=None,
                 token=False):
        self.words_only = words_only
        # BUG FIX: the original assigned ``self.word_normalize = word_normalize``
        # but ``word_normalize`` was never a parameter, raising NameError on
        # every construction. The broken line is removed.
        self.emoji_normalize = emoji_normalize
        self.remove_digits = remove_digits
        self.lower_case = lower_case
        self.stop_words = stop_words
        self.token = token

    def fit(self, X, y=None):
        """No-op fit; returns self per the sklearn transformer contract."""
        return self

    def transform(self, X):
        """Clean a pandas Series of strings and return the cleaned Series.

        Returns a Series of strings, or a Series of token lists when
        ``self.token`` is True.
        """
        self.X = X  # kept for backward compatibility with the original code

        # eyes [nose] mouth | mouth [nose] eyes pattern
        emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
        emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)
        # Compile once and reuse (also: raw string avoids the invalid-escape
        # warning the original non-raw '[\W]+' literal produces).
        non_word_re = re.compile(r'[\W]+')

        # Keep words only. Digits are kept; emoticons are dropped.
        if self.words_only:
            clean_text = X.apply(lambda x: non_word_re.sub(' ', x))
        else:
            # Keep words, then re-append any emoticons found in the raw text.
            clean_text = X.apply(
                lambda x: '{}{}'.format(non_word_re.sub(' ', x),
                                        ''.join(emoticon_re.findall(x))))

        # Normalize emoji? NOTE: mirrors the original logic — this branch
        # rebuilds from the raw input, overriding the result above.
        if self.emoji_normalize:
            clean_text = X.apply(
                lambda x: non_word_re.sub(' ', x)
                + ' '.join(emoticon_re.findall(x)).replace(';', ':').replace('-', ''))

        if self.remove_digits:
            digit_table = str.maketrans('', '', '0123456789')
            clean_text = clean_text.apply(lambda x: x.translate(digit_table))

        if self.lower_case:
            clean_text = clean_text.str.lower()

        # Drop stop words. BUG FIX: the original referenced the bare name
        # ``stop_words`` (NameError) and called ``.split()`` on the Series
        # itself (AttributeError); the filtering must happen per string.
        if self.stop_words is not None:
            stops = set(self.stop_words)
            clean_text = clean_text.apply(
                lambda x: ' '.join(w for w in x.split() if w not in stops))

        if self.token:
            return clean_text.str.split()
        else:
            return clean_text
您可以将此转换器与其他转换器和分类器串联成一个管道(Pipeline)。例如
# Example only — NOT runnable as-is. It assumes additional imports not shown:
#   from sklearn.pipeline import Pipeline
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   from sklearn.feature_selection import SelectKBest, chi2
# and user-supplied placeholders: `token` (a tokenizer callable),
# `your_stop_words` (an iterable of stop words), and training data `X`, `y`.
from sklearn.linear_model import LogisticRegression
# Keyword arguments forwarded to CountVectorizer below.
word_vec_para = dict(
ngram_range=(1, 2),
tokenizer=token,
lowercase=True,
min_df=1)
# Chain text cleaning -> bag-of-words -> tf-idf -> feature selection -> classifier.
pipe = Pipeline(steps=[
('text_clean', TextCleaner(
remove_digits=True, lower_case=True, stop_words=your_stop_words)),
('word_vec', CountVectorizer(**word_vec_para)),
('word_tdf', TfidfTransformer(sublinear_tf=True)), ('fe_se', SelectKBest(chi2, k='all')),
('log_reg', LogisticRegression(verbose=1))])
pipe.fit(X,y)