I am trying to clean a corpus using CountVectorizer in Python. I wrote the code below, but I think the tokenize and stem_tokens functions are not working, because I am not getting the features I need and some special characters are showing up among the features.
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
import string
import nltk
from nltk.stem.porter import PorterStemmer
data = pd.read_csv("Data.csv",encoding='cp1252')
description = data[['Description','Group']]
#splitting data sets into train and test using Sklearn
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
X_train, X_test, y_train, y_test = train_test_split(description.Description, description.Group, random_state=1)
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems
##vect = CountVectorizer(tokenizer=tokenize, stop_words='english', lowercase=True, ngram_range=(1, 2))
vect = CountVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 2))
train_dtm = vect.fit_transform(X_train.astype('U'))
test_dtm = vect.transform(X_test.astype('U'))
Is it because I am using ngram_range=(1,2)? Could anyone help me understand how to remove numbers and punctuation, and apply stemming, when using CountVectorizer in Python?
Please advise.
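
For context: in the code above the active CountVectorizer call does not pass tokenizer=tokenize (that line is commented out), so tokenize and stem_tokens are never invoked. Also, the check `i not in string.punctuation` only drops single-character tokens, since string.punctuation is a string of individual characters; multi-character tokens such as "..." pass through, and digits are never filtered. Below is a minimal sketch of one way to filter digits and punctuation inside the tokenizer before stemming; the tokenize_clean name and the letters-only regex are illustrative choices, not the only way to do this:

import re

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

stemmer = PorterStemmer()

def tokenize_clean(text):
    # Keep only tokens made entirely of letters: this drops pure numbers,
    # punctuation runs such as "..." and mixed tokens such as "12,5".
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if re.fullmatch(r'[a-zA-Z]+', t)]
    # Stem the surviving tokens.
    return [stemmer.stem(t) for t in tokens]

vect = CountVectorizer(tokenizer=tokenize_clean, stop_words='english',
                       lowercase=True, ngram_range=(1, 2))

One caveat: because tokens are stemmed before stop-word removal, some stemmed forms may no longer match entries in the built-in 'english' stop word list, and newer scikit-learn versions warn about this mismatch; stemming the stop word list itself, or filtering stop words inside the tokenizer, avoids it. If stemming is not required, CountVectorizer's token_pattern parameter (for example r'(?u)\b[a-zA-Z]{2,}\b') can drop digits and punctuation without a custom tokenizer.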