我正在尝试根据https://joshuanewlan.com/spacy-and-markovify上的教程,使用markovify,textacy和spacy从单词列表中生成句子。我真的很讨厌他,所以请考虑到这一点。 我正在使用的训练片段只是一些小写形式的简单片段的列表,保存在.txt文件中。
代码
import textacy
from textacy import preprocessing
import re
from unidecode import unidecode
import spacy
# Loads portuguese dictionary from spacy
lang = spacy.load('pt_core_news_sm')
# Loads the training keywords and sets up language
frases = ('~/git/SentGen/frasestreino/teste.txt')
corpus = textacy.Corpus.load (lang,frases)
class TaggedText(markovify.Text):
def sentence_split(self, text):
# Splits full-text string into a list of sentences.
sentence_list = []
for doc in corpus:
sentence_list += list(doc.sents)
return sentence_list
def word_split(self, sentence):
#Splits a sentence into a list of words.
return ["::".join((word.orth_,word.pos_)) for word in sentence]
def word_join(self, words):
sentence = " ".join(word.split("::")[0] for word in words)
return sentence
def test_sentence_input(self, sentence):
# rejects sentences that contain the type of punctuation
# that would look strange on its own
# in a randomly-generated sentence.
sentence = sentence.text
reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
# Decode unicode, mainly to normalize fancy quotation marks
if sentence.__class__.__name__ == "str":
decoded = sentence
else:
decoded = unidecode(sentence)
# Sentence shouldn't contain problematic characters
if re.search(reject_pat, decoded): return False
return True
def generate_corpus(self, text):
#Given a text string, returns a list of lists; that is, a list of
#"sentences," each of which is a list of words.
#Before splitting into words, the sentences are filtered through
# pip in`self.test_sentence_input`
sentences = self.sentence_split(text)
passing = filter(self.test_sentence_input, sentences)
runs = map(self.word_split, sentences)
print(runs[0])
return runs
# Generated the model
model = TaggedText(corpus)
# A sentence based on the model
print(model.make_sentence())
the errors I'm getting:
```Traceback (most recent call last):
File "/home/mdinis/git/SentGen/Markowo.py", line 12, in <module>
corpus = textacy.Corpus.load (lang,frases)
File "/home/mdinis/.local/lib/python2.7/site-packages/textacy/corpus.py", line 604, in load
msg = srsly.msgpack_loads(f.read())
File "/home/mdinis/.local/lib/python2.7/site-packages/srsly/_msgpack_api.py", line 29, in msgpack_loads
msg = msgpack.loads(data, raw=False, use_list=use_list)
File "/home/mdinis/.local/lib/python2.7/site-packages/srsly/msgpack/__init__.py", line 60, in unpackb
return _unpackb(packed, **kwargs)
File "_unpacker.pyx", line 199, in srsly.msgpack._unpacker.unpackb
srsly.msgpack.exceptions.ExtraData: unpack(b) received extra data.```