将NLTK代码重写为可在Python中多次使用的函数

时间:2018-01-01 09:27:11

标签: python function pandas nltk

如何将我的代码重写为一个可以重复调用的函数?

我的代码

# Stop words: NLTK's English list plus the user-defined additions.
# NOTE(review): the multi-word entries ('kwun tong', 'kwai chung') can never
# match the per-token filter below, because tokens come from a whitespace
# split -- confirm whether phrase-level removal was intended.
stopwords = nltk.corpus.stopwords.words('english')
user_defined_stop_words = ['st', 'rd', 'kwun tong', 'kwai chung', 'kwun', 'tong']
new_stop_words = stopwords + user_defined_stop_words
# Hashed copy used only for membership tests, so each token lookup is O(1)
# instead of a linear scan of the list.
_stop_word_lookup = frozenset(new_stop_words)


def _clean_address(raw):
    """Return *raw* lower-cased with digits, punctuation and stop words removed.

    The steps mirror the original four ``apply`` passes, merged into one:
    1. split on whitespace, lower-case each token, re-join with single spaces;
    2. drop every digit character and every ``string.punctuation`` character;
    3. split again and drop tokens found in ``new_stop_words``.

    Args:
        raw: One address string. Non-string values (e.g. NaN) raise
            ``AttributeError``, exactly as the original lambdas did.

    Returns:
        The cleaned address, tokens separated by single spaces.
    """
    lowered = ' '.join(token.lower() for token in raw.split())
    stripped = ''.join(
        ch for ch in lowered
        if not ch.isdigit() and ch not in string.punctuation
    )
    return ' '.join(
        token for token in stripped.split()
        if token not in _stop_word_lookup
    )


# Temporary column; it is popped again right below when fed to the vectorizer.
data['Clean_addr'] = data['Adj_Addr'].apply(_clean_address)

cv = CountVectorizer(max_features=200, analyzer='word', ngram_range=(1, 3))
cv_addr = cv.fit_transform(data.pop('Clean_addr'))
# Add one sparse count column per n-gram feature found by the vectorizer.
# NOTE(review): `CountVectorizer.get_feature_names` and `pd.SparseSeries` were
# removed in newer scikit-learn / pandas releases -- confirm the installed
# versions before upgrading (replacements: `get_feature_names_out`,
# `pd.arrays.SparseArray`).
for i, col in enumerate(cv.get_feature_names()):
    data[col] = pd.SparseSeries(cv_addr[:, i].toarray().ravel(), fill_value=0)

非常感谢任何帮助。

1 个答案:

答案 0 :(得分:-1)

以下是我的代码,供参考:

import nltk
import string
# Shared WordNet lemmatizer, created once and reused by pre_process_str.
wnlemma = nltk.WordNetLemmatizer()
# Extra tokens to drop on top of NLTK's English stop-word list.
addstopwords = ['the', 'is', 'it', 'may', 'was', '1', '2', '3', '4', '5', '6',
                '7', '8', '9', '0', 'employee', 'employer', 'approximately']
# Go through `nltk.corpus` because only `nltk` is imported in this snippet
# (a bare `stopwords` name is never imported). The corpus file id is the
# lowercase "english"; "English" fails on case-sensitive file systems.
newstopwords = nltk.corpus.stopwords.words("english") + addstopwords

# pre-process and join into string function
def pre_process_str(text: str) -> str:
    """Normalize *text* into a space-joined string of lemmatized tokens.

    Pipeline: tokenize with NLTK, lower-case, drop stop words listed in the
    module-level ``newstopwords``, lemmatize with the module-level
    ``wnlemma``, drop punctuation-only tokens, then drop tokens shorter than
    three characters.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces; empty string if none
        survive.
    """
    # Build the lookup once per call so each token test is O(1) rather than
    # a linear scan of the stop-word list.
    stop_words = set(newstopwords)
    punctuation = set(string.punctuation)

    # `nltk.word_tokenize` is reachable through the existing `import nltk`;
    # the bare name `word_tokenize` is never imported in this snippet.
    # Lower-case BEFORE the stop-word test: the stop-word list is lowercase,
    # so testing the original casing let "The", "Is", ... slip through.
    tokens = [word.lower() for word in nltk.word_tokenize(text)]
    tokens = [word for word in tokens if word not in stop_words]

    # WordNet lemmatization (default part of speech).
    tokens = [wnlemma.lemmatize(word) for word in tokens]

    # Remove punctuation: drop tokens made up entirely of punctuation
    # characters. A substring test against string.punctuation would miss
    # multi-character tokens such as "...".
    tokens = [
        word for word in tokens
        if not all(ch in punctuation for ch in word)
    ]

    # Remove words shorter than 3 characters.
    tokens = [word for word in tokens if len(word) >= 3]

    return " ".join(tokens)