sklearn.exceptions.NotFittedError: CountVectorizer - Vocabulary wasn't fitted

Date: 2018-06-05 10:52:06

Tags: python-3.x nltk

When I run the code below, I get the following error:

  

        raise NotFittedError(msg % {'name': type(estimator).__name__})
    sklearn.exceptions.NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

I have tried to fix it several times, so what is the problem?

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.corpus import brown

    class bagOfWords:

        def chunker(self,input_data, N):
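            # Split input_data into chunks of N space-separated words; the
            # remainder after the last full chunk is appended as a final chunk.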
            input_words = input_data.split(' ')
            output = []

            cur_chunk = []
            count = 0
            for word in input_words:
                cur_chunk.append(word)
                count += 1
                if count == N:
                    output.append(' '.join(cur_chunk))
                    count, cur_chunk = 0, []

            output.append(' '.join(cur_chunk))
            return output


        def generate_text_chunks(self,input_data,chunk_size):
            text_chunks = self.chunker(input_data,chunk_size)
            return text_chunks


        def generate_chunks(self,chunks,input_data,chunk_size):
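            # Wrap each text chunk in a dict carrying its index and text; the
            # chunks argument passed in is immediately replaced by a new list.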
            chunks = []
            text_chunks=self.generate_text_chunks(input_data,chunk_size)
            for count, chunk in enumerate(text_chunks):
                d = {'index': count, 'text': chunk}
                chunks.append(d)
            return chunks


        # mindf & maxdf are integers
        def generate_DocTerm_Matrix (self,input_data,chunk_size,mindf,maxdf):
            # Extract the document term matrix
            chunks=[]
            chunks=self.generate_chunks(chunks,input_data,chunk_size)
            count_vectorizer = CountVectorizer(min_df=mindf, max_df=maxdf)
            document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])
            return document_term_matrix



        def genertae_vocabulary(self,mindf, maxdf):
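            # NOTE: this CountVectorizer is created here but never fitted on any
            # text (no fit/fit_transform call), so get_feature_names() below
            # raises NotFittedError; min_df/max_df would also have to be passed
            # as keyword arguments to end up in the right parameters.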
            count_vectorizer = CountVectorizer(mindf, maxdf)
            # Extract the vocabulary and display it
            vocabulary = np.array(count_vectorizer.get_feature_names())
            print("\nVocabulary:\n", vocabulary)
            return vocabulary


        # Generate names for chunks
        def generate_chunk_names(self,text_chunks):
            chunk_names = []
            for i in range(len(text_chunks)):
                chunk_names.append('Chunk-' + str(i+1))
            return chunk_names

        def print_Bags(self,input_data,chunk_size,mindf, maxdf):
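            # Build the text chunks, chunk names, vocabulary and document-term
            # matrix, then print a table of per-chunk frequencies.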

            text_chunks=self.chunker(input_data, chunk_size)
            chunk_names=self.generate_chunk_names(text_chunks)
            vocabulary = self.genertae_vocabulary(mindf, maxdf)
            document_term_matrix = self.generate_DocTerm_Matrix(input_data, chunk_size, 7, 20)

            formatted_text = '{:>12}' * (len(chunk_names) + 1)
            print('\n', formatted_text.format('Word', *chunk_names), '\n')
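            # NOTE: zip('vocabulary', ...) iterates over the characters of the
            # string literal 'vocabulary', not over the vocabulary array above.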
            for word, item in zip('vocabulary', document_term_matrix.T):
                # 'item' is a 'csr_matrix' data structure
                output = [word] + [str(freq) for freq in item.data]
                print(formatted_text.format(*output))



    if __name__=='__main__':
        input_data = ' '.join(brown.words()[:12000])
        chunk_size = 800
        mindf=7
        maxdf=20
        bagOfWords1=bagOfWords()
        bagOfWords1.print_Bags(input_data,chunk_size,mindf, maxdf)

    """
        Print the document term matrix
         print("\nDocument term matrix:")
        formatted_text = '{:>12}' * (len(chunk_names) + 1)
        print('\n', formatted_text.format('Word', *chunk_names), '\n')
        for word, item in zip(vocabulary, document_term_matrix.T):
            # 'item' is a 'csr_matrix' data structure
            output = [word] + [str(freq) for freq in item.data]
            print(formatted_text.format(*output))

    """

0 Answers:

No answers yet