请检查我用于获取词性(POS)向量的代码。我得到的不是词性标签的向量,而是词性标签中单个字母的向量。例如,我得到的不是 CC、DT、PRP 等词性标签的向量,而是 C、D、P 等字母的向量。
# Get the POS tag of every token in a document
def get_pos_tagger(self, document):
    """Return the part-of-speech tag of every token in ``document``.

    The text is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are kept, in token order, and the
    words themselves are discarded.

    Args:
        document: The text to tag, as a single string.

    Returns:
        A list with one tag per token.
    """
    tagged_tokens = nltk.pos_tag(tokens=nltk.word_tokenize(document))
    # Each entry is a (word, tag) pair; the words are not needed here.
    return [pos for _word, pos in tagged_tokens]
def get_tag_and_training_data(self):
    """Load the labels and documents from the CSV file at ``self.filename``.

    Each row supplies an integer label in column 0; columns 1 and 2 are
    lower-cased and joined with a single space to form the document text.
    Blank lines are skipped.

    Returns:
        A ``(tags, documents)`` tuple of two parallel lists: the integer
        labels and the lower-cased document strings.

    Raises:
        ValueError: If column 0 of a row cannot be converted to ``int``.
        IndexError: If a non-blank row has fewer than three columns.
    """
    tags = []
    documents = []
    # newline="" lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(self.filename, newline="") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields [] for a blank line; nothing to load.
                continue
            tags.append(int(row[0]))
            documents.append(row[1].lower() + " " + row[2].lower())
    return tags, documents
# build pos model
def buildPosModel(self):
    """Train and save a Word2Vec model whose vocabulary is POS tags.

    Every training document is converted to its sequence of POS tags and
    that sequence is passed to Word2Vec as one sentence, so the model
    learns one vector per tag (``CC``, ``DT``, ``PRP``, ...) rather than
    per character. The vectors are written to ``word2vecposmodel.bin``
    with ``binary=False``.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    _labels, documents = self.get_tag_and_training_data()
    # Word2Vec expects an iterable of sentences, each a list of tokens.
    # Keep one tag list per document: extending a flat list with ``+=``
    # would leave a list of tag strings, and Word2Vec would then iterate
    # each string character by character and learn vectors for "C", "D",
    # "P" instead of "CC", "DT", "PRP".
    sentences = [self.get_pos_tagger(document) for document in documents]
    print(sentences)
    # NOTE(review): ``size`` is the gensim 3.x keyword; gensim 4 renamed it
    # to ``vector_size`` -- confirm against the installed version.
    model_pos = gensim.models.Word2Vec(sentences=sentences, size=100, min_count=1, window=5, workers=cores)
    model_pos.wv.save_word2vec_format('word2vecposmodel.bin', binary=False)
    return model_pos