UnpicklingError not handled by user code: invalid load key, '%'

Date: 2016-11-10 07:21:58

Tags: python dictionary anaconda gensim corpus

I have been following the gensim tutorial at https://radimrehurek.com/gensim/tut2.html. I am getting the error "UnpicklingError not handled by user code: invalid load key, '%'". How can I clear this error? I have already looked at other questions about it and included the klepto package, but the error is still there. I am using Anaconda 2. Here is the code:
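
For reference, the bare error message (independent of my script) can be reproduced by pointing pickle.load() at a file that was never written by pickle, for example one whose first character is '%', such as a Matrix Market header. This is only an illustrative sketch of the error itself, using a made-up path /tmp/not_a_pickle.txt, not my actual code:

import pickle

# Write a plain-text file whose first character is '%' (a Matrix Market
# header starts this way); it is not a pickle file.
with open('/tmp/not_a_pickle.txt', 'w') as f:
    f.write('%%MatrixMarket matrix coordinate real general\n')

# Unpickling it fails with the same message as in the traceback.
with open('/tmp/not_a_pickle.txt', 'rb') as f:
    try:
        pickle.load(f)
    except pickle.UnpicklingError as e:
        print(e)  # invalid load key, '%'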

import logging
import xml.etree.cElementTree
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
import os
import klepto
from gensim import corpora

documents = ["Human machine interface for lab abc computer applications",
         "A survey of user opinion of computer system response time",
         "The EPS user interface management system",
         "System and human system engineering testing of EPS",              
         "Relation of user perceived response time to error measurement",
         "The generation of random binary unordered trees",
         "The intersection graph of paths in trees",
         "Graph minors IV Widths of trees and well quasi ordering",
         "Graph minors A survey"]
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
     for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save_as_text('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

print(dictionary.token2id)

new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.dict', corpus)  # store to disk, for later use
for c in corpus:
    print(c)

class MyCorpus(object):
    def __iter__(self):
        for line in open('/datasets/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
print(corpus_memory_friendly)

for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

from six import iteritems

# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('/datasets/mycorpus.txt'))

# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist 
        if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)

# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)

# create a toy corpus of 2 documents, as a plain Python list
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)

corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

corpus = corpora.MmCorpus('/tmp/corpus.mm')

print(corpus)

# one way of printing a corpus: load it entirely into memory
print(list(corpus))  # calling list() will convert any sequence to a plain Python list


# another way of doing it: print one document at a time, making use of the streaming interface
for doc in corpus:
    print(doc)

corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)

import gensim
import numpy as np
numpy_matrix = np.random.randint(10, size=[5,2])
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=10)

import scipy.sparse
scipy_sparse_matrix = scipy.sparse.random(5,2)
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)

from gensim import corpora, models, similarities
if (os.path.exists("/tmp/deerwester.dict")):
    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = corpora.MmCorpus('/tmp/deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

lsi.print_topics(2)

for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')

model = models.TfidfModel(corpus, normalize=True)

model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)

model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model

model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]

model = models.RpModel(tfidf_corpus, num_topics=500)

model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)

model = models.HdpModel(corpus, id2word=dictionary)

0 Answers:

No answers yet