What additional functions could I add to this code to clean the text and improve accuracy? Is it better to use nltk.sent_tokenize first and then nltk.word_tokenize, or can we use nltk.word_tokenize directly? We could also use nltk.sent_tokenize inside CountVectorizer().
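For reference, the two tokenization orders I am asking about look roughly like this (the sample string is only for illustration, and the punkt tokenizer data is assumed to be downloaded):

import nltk
sample = "This movie was great. I would watch it again."
# option 1: split into sentences first, then word-tokenize each sentence
tokens_by_sentence = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(sample)]
# option 2: word-tokenize the whole review directly
tokens_direct = nltk.word_tokenize(sample)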
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt') #needed by nltk.word_tokenize / nltk.sent_tokenize
from nltk.tokenize import RegexpTokenizer
sa_stop_words = nltk.corpus.stopwords.words("english")
#words that might invert a sentence's meaning
white_list = [
'what', 'but', 'if', 'because', 'as', 'until', 'against',
'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
'most', 'other', 'some', 'such', 'no', 'not', 'only', 'own',
'same', 'so', 'than', 'too', 'can', 'will', 'just', 'dont', 'should']
#take these out of the standard nltk stop word list
sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]
corpus = []
tokenizer = RegexpTokenizer(r'\w+')
for i in range(0, 50000):
    # strip HTML tags first, then keep letters only
    review = re.sub('<[^<]+?>', ' ', dataset['review'][i])
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = tokenizer.tokenize(review)
    review = ' '.join(review)
    corpus.append(review)
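One extra cleaning step I am considering is lemmatization; a minimal sketch of how it could slot into the loop above (assuming the WordNet data has been downloaded with nltk.download('wordnet')):

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# inside the loop, after tokenizing and before joining:
review = [lemmatizer.lemmatize(word) for word in review]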
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
#vectorize means we turn non-numeric data into an array of numbers
cv = CountVectorizer(
    max_features = 300,
    lowercase = True,                #True is the default; shown here for clarity
    tokenizer = nltk.word_tokenize,  #use the nltk word tokenizer
    stop_words = sa_stop_words,      #remove stop words
    min_df = 5,                      #minimum document frequency: a term must appear in at least 5 reviews
    ngram_range = (1, 2)             #unigrams and bigrams
)
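After building the vectorizer I fit it on the cleaned corpus like this (X is just the name I use for the resulting document-term matrix):

# turn the cleaned corpus into a numeric feature matrix
X = cv.fit_transform(corpus).toarray()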
This is the code I am currently using to clean the text.