似乎文件的格式是:每一行都是形如 'word number number ...' 的字符串(一个单词后面跟着若干个数字),所以应该很容易拆分。但是当我用下面的脚本拆分它们时:
import numpy as np
def loadGloveModel(gloveFile):
    """Load GloVe vectors from a whitespace-separated text file.

    Each non-blank line is split on whitespace; the first token is taken as
    the word and every remaining token is converted to ``float``.

    NOTE: this assumes the word itself contains no whitespace. A line whose
    word part contains spaces (for example ``contact name@domain.com ...`` or
    ``. . . . ...``) leaves non-numeric tokens in the vector part, so the
    ``float()`` conversion raises ``ValueError``.

    Args:
        gloveFile: Path to the GloVe text file.

    Returns:
        dict mapping each word to its embedding (a ``numpy.ndarray``).

    Raises:
        ValueError: If a token after the first one is not a valid float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even when a line fails to parse.
    with open(gloveFile, 'r', encoding="utf-8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: there is no word to index, skip it.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.", len(model), " words loaded!")
    return model
我加载了 glove.840B.300d.txt,但是出现了错误。我把 splitLine 打印出来,得到:
['contact', 'name@domain.com', '0.016426', '0.13728', '0.18781', '0.75784', '0.44012', '0.096794' ... ]
或
['.', '.', '.', '.', '0.033459', '-0.085658', '0.27155', ...]
请注意,此脚本在 glove.6B.* 文件上工作正常。
答案 0 :(得分:0)
我认为以下内容可能会有所帮助:
def process_glove_line(line, dim):
    """Split one GloVe text line into its word and its embedding vector.

    The last ``dim`` whitespace-separated tokens are parsed as floats; every
    token before them is joined back with single spaces to form the word, so
    words that themselves contain spaces (e.g. ``. . . .``) are handled.

    Args:
        line: One raw line of the GloVe file.
        dim: Number of vector components expected at the end of the line.

    Returns:
        ``(word, embedding)`` where ``embedding`` is a ``numpy.ndarray``, or
        ``(None, None)`` when the line is blank, has fewer than ``dim``
        tokens, or has a non-numeric token among its last ``dim`` tokens.
    """
    tokens = line.split()
    if not tokens:
        # Blank line: nothing to report, nothing to store.
        return None, None
    if len(tokens) < dim:
        # Too short to hold a full vector; slicing would silently yield a
        # wrong-sized embedding, so reject it and show the offending line.
        print(line)
        return None, None

    split_at = len(tokens) - dim
    try:
        embedding = np.array([float(val) for val in tokens[split_at:]])
    except ValueError:
        # A vector component is not a number: show the line and skip it.
        print(line)
        return None, None

    word = " ".join(tokens[:split_at])
    return word, embedding
def load_glove_model(glove_filepath, dim):
    """Load a GloVe text file into a ``{word: embedding}`` dictionary.

    Args:
        glove_filepath: Path to the UTF-8 encoded GloVe text file.
        dim: Number of vector components per line (e.g. 300).

    Returns:
        dict mapping each word to its embedding as returned by
        ``process_glove_line``. Lines for which no embedding could be parsed,
        or whose embedding does not have exactly ``dim`` components, are
        skipped.
    """
    model = {}
    with open(glove_filepath, encoding="utf8") as f:
        # Iterate the file object directly so lines are read one at a time;
        # readlines() would keep the entire file in memory before parsing.
        for line in f:
            word, embedding = process_glove_line(line, dim)
            if embedding is None or len(embedding) != dim:
                # Unparseable or truncated line: do not store a bad vector.
                continue
            model[word] = embedding
    return model
# Load the 300-dimensional GloVe 840B vectors into a {word: vector} dict.
model = load_glove_model(glove_filepath="glove.840B.300d.txt", dim=300)