运行以下代码时,我得到以下结果:
引发NotFittedError(msg%{'name':type(estimator)。 name }) sklearn.exceptions.NotFittedError:CountVectorizer - 词汇表没有 装。
我多次尝试解决它,所以, 问题是什么 ???? 导入numpy为np 来自sklearn.feature_extraction.text导入CountVectorizer 来自nltk.corpus import brown
class bagOfWords:
def chunker(self,input_data, N):
input_words = input_data.split(' ')
output = []
cur_chunk = []
count = 0
for word in input_words:
cur_chunk.append(word)
count += 1
if count == N:
output.append(' '.join(cur_chunk))
count, cur_chunk = 0, []
output.append(' '.join(cur_chunk))
return output
def generate_text_chunks(self,input_data,chunk_size):
text_chunks = self.chunker(input_data,chunk_size)
return text_chunks
def generate_chunks(self,chunks,input_data,chunk_size):
chunks = []
text_chunks=self.generate_text_chunks(input_data,chunk_size)
for count, chunk in enumerate(text_chunks):
d = {'index': count, 'text': chunk}
chunks.append(d)
return chunks
#mindf&maxdf is integer
def generate_DocTerm_Matrix (self,input_data,chunk_size,mindf,maxdf):
# Extract the document term matrix
chunks=[]
chunks=self.generate_chunks(chunks,input_data,chunk_size)
count_vectorizer = CountVectorizer(min_df=mindf, max_df=maxdf)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])
return document_term_matrix
def genertae_vocabulary(self,mindf, maxdf):
count_vectorizer = CountVectorizer(mindf, maxdf)
# Extract the vocabulary and display it
vocabulary = np.array(count_vectorizer.get_feature_names())
print("\nVocabulary:\n", vocabulary)
return vocabulary
# Generate names for chunks
def generate_chunk_names(self,text_chunks):
chunk_names = []
for i in range(len(text_chunks)):
chunk_names.append('Chunk-' + str(i+1))
return chunk_names
def print_Bags(self,input_data,chunk_size,mindf, maxdf):
text_chunks=self.chunker(input_data, chunk_size)
chunk_names=self.generate_chunk_names(text_chunks)
vocabulary = self.genertae_vocabulary(mindf, maxdf)
document_term_matrix = self.generate_DocTerm_Matrix(input_data, chunk_size, 7, 20)
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')
print('\n', formatted_text.format('Word', *chunk_names), '\n')
for word, item in zip('vocabulary', document_term_matrix.T):
# 'item' is a 'csr_matrix' data structure
output = [word] + [str(freq) for freq in item.data]
print(formatted_text.format(*output))
if __name__=='__main__':
input_data = ' '.join(brown.words()[:12000])
chunk_size = 800
mindf=7
maxdf=20
bagOfWords1=bagOfWords()
bagOfWords1.print_Bags(input_data,chunk_size,mindf, maxdf)
"""
Print the document term matrix
print("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')
for word, item in zip(vocabulary, document_term_matrix.T):
# 'item' is a 'csr_matrix' data structure
output = [word] + [str(freq) for freq in item.data]
print(formatted_text.format(*output))
"""