如何将我的代码重写为可以重复调用的函数？
我的代码：
# Stop-word vocabulary: NLTK's English list plus address-specific terms.
stopwords = nltk.corpus.stopwords.words('english')
# NOTE(review): cleaned addresses are filtered token by token, so the
# two-word entries 'kwun tong' and 'kwai chung' can never match a token.
# 'kwun' and 'tong' are also listed on their own, but 'kwai' and 'chung'
# are not -- confirm whether they should be added.
user_defined_stop_words = ['st', 'rd', 'kwun tong', 'kwai chung', 'kwun', 'tong']
new_stop_words = stopwords + user_defined_stop_words


def clean_address(text, stop_words=None):
    """Return *text* lower-cased, without digits, punctuation or stop words.

    Args:
        text: Raw address string.
        stop_words: Iterable of lower-case tokens to drop. Defaults to the
            module-level ``new_stop_words``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = new_stop_words
    # Reuse an existing set; otherwise build one for O(1) membership tests.
    if isinstance(stop_words, (set, frozenset)):
        blocked = stop_words
    else:
        blocked = set(stop_words)
    lowered = ' '.join(token.lower() for token in text.split())
    # Digits and punctuation are removed character by character, so a token
    # such as "12a" becomes "a" and "a,b" becomes "ab".
    kept_chars = ''.join(
        ch for ch in lowered
        if not ch.isdigit() and ch not in string.punctuation
    )
    return ' '.join(token for token in kept_chars.split() if token not in blocked)


def add_address_features(data, source_col='Adj_Addr', stop_words=None,
                         max_features=200, ngram_range=(1, 3)):
    """Add bag-of-n-grams count columns built from an address column.

    The address column is cleaned with ``clean_address``, vectorised with
    ``CountVectorizer`` and one sparse count column per vocabulary entry is
    written into ``data`` in place.

    Args:
        data: DataFrame holding ``source_col``; modified in place.
        source_col: Name of the column with the raw address strings.
        stop_words: Iterable of tokens to drop. Defaults to the module-level
            ``new_stop_words``.
        max_features: Passed to ``CountVectorizer``.
        ngram_range: Passed to ``CountVectorizer``.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``CountVectorizer`` and
        the sparse document-term matrix it produced.
    """
    blocked = frozenset(new_stop_words if stop_words is None else stop_words)
    data['Clean_addr'] = data[source_col].apply(
        lambda addr: clean_address(addr, blocked)
    )
    vectorizer = CountVectorizer(
        max_features=max_features, analyzer='word', ngram_range=ngram_range
    )
    # pop() removes the temporary column again once it has been vectorised.
    matrix = vectorizer.fit_transform(data.pop('Clean_addr'))
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    for i, col in enumerate(get_names()):
        # pd.SparseSeries was removed in pandas 1.0; a SparseArray gives the
        # same sparse storage and is assigned positionally.
        data[col] = pd.arrays.SparseArray(
            matrix[:, i].toarray().ravel(), fill_value=0
        )
    return vectorizer, matrix


cv, cv_addr = add_address_features(data)
非常感谢任何帮助。
答案 0 :(得分:-1)
以下是我的代码，供参考：
import nltk
import string
# Shared WordNet lemmatizer instance used by the pre-processing function.
wnlemma = nltk.WordNetLemmatizer()
# Extra stop words (including single digits) appended to NLTK's English list.
addstopwords = [
    'the', 'is', 'it', 'may', 'was',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
    'employee', 'employer', 'approximately',
]
# The stop-word corpus file id is lower-case "english"; the capitalised
# spelling is not found on case-sensitive file systems. The corpus is
# reached through the nltk package so no separate import is required.
newstopwords = nltk.corpus.stopwords.words("english") + addstopwords
# pre-process and join into string function
def pre_process_str(text):
    """Tokenize, normalise and filter *text*, returning one cleaned string.

    Steps: tokenize, lower-case, drop stop words (``newstopwords``),
    lemmatize with ``wnlemma``, drop punctuation-only tokens, drop tokens
    shorter than three characters, then join with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stop_words = set(newstopwords)
    punctuation = set(string.punctuation)
    # tokenize and lower-case; lower-casing must happen before the stop-word
    # test, otherwise capitalised stop words such as "The" are kept
    tokens = [word.lower() for word in nltk.word_tokenize(text)]
    # remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # wordnet lemmatization
    tokens = [wnlemma.lemmatize(word) for word in tokens]
    # remove punctuation: drop tokens made up solely of punctuation
    # characters (a plain substring test would let "..." through)
    tokens = [
        word for word in tokens
        if not all(ch in punctuation for ch in word)
    ]
    # remove words less than 3 letters
    tokens = [word for word in tokens if len(word) >= 3]
    # join as string
    return " ".join(tokens)