import nltk
from nltk.corpus import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
#file directory, contains 1000 files
doc_dirname_politics = "E:/s10/SENTIMENT ANALISYS/SESI3/NLP-Data-sets/Dats sets/mini_newsgroups/mini_newsgroups/talk.politics.misc"
doc_dirname_comps = "E:/s10/SENTIMENT ANALISYS/SESI3/NLP-Data-sets/Dats sets/mini_newsgroups/mini_newsgroups/comp.os.ms-windows.misc"
politics_news_corpus = PlaintextCorpusReader(doc_dirname_politics,'.*')
comp_news_corpus = PlaintextCorpusReader(doc_dirname_comps, '.*')
import re
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def custom_preprocessor(text):
    text = re.sub(r'\W+|\d+|_', ' ', text)  # remove punctuation, digits, and underscores
    text = nltk.word_tokenize(text)  # tokenize
    text = [word for word in text if word not in stop_words]  # remove English stopwords
    text = [lemmatizer.lemmatize(word) for word in text]  # lemmatize
    return text
This is the source code. What code should I use to run TF-IDF on the files in doc_dirname_politics, i.e. to fit the vectorizer on that corpus? I have searched for tutorials, but none of them solve my problem.
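Here is roughly what I imagine the missing step looks like, but I am not sure it is right. The names politics_docs, tfidf_vectorizer, tfidf_matrix and tfidf_df are just placeholders I made up; everything else comes from the code above.

# read the raw text of every file in the politics corpus
politics_docs = [politics_news_corpus.raw(fileid)
                 for fileid in politics_news_corpus.fileids()]

# plug the custom tokenizer into TfidfVectorizer
# (the default lowercasing still runs before custom_preprocessor is called)
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_preprocessor)
tfidf_matrix = tfidf_vectorizer.fit_transform(politics_docs)

# optional: view the scores as a DataFrame, one row per file
# (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                        index=politics_news_corpus.fileids(),
                        columns=tfidf_vectorizer.get_feature_names_out())

Is passing custom_preprocessor through the tokenizer argument the right way to do this, or should it go through analyzer instead?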