I have defined a list X to hold the contents of a file and feed it to the preprocessor of the skip-thoughts model in Theano, but the error says X is not defined. The encode function is from:
https://github.com/ryankiros/skip-thoughts/blob/master/skipthoughts.py
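For context, the call that triggers the error looks roughly like this (a sketch; skipthoughts.load_model is defined in the same file, and the expectation was that preprocess would supply X):

import skipthoughts

model = skipthoughts.load_model()

# X is expected to come from preprocess(), but no top-level X exists
# at this point, so this line raises NameError: X is not defined
vectors = skipthoughts.encode(model, X)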
from collections import defaultdict

import numpy
import nltk
from nltk.tokenize import word_tokenize


def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False):
    """
    Encode sentences in the list X. Each entry will return a vector
    """
    # first, do preprocessing
    X = preprocess(X)

    # word dictionary and init
    d = defaultdict(lambda: 0)
    for w in model['utable'].keys():
        d[w] = 1
    ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32')
    bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32')

    # length dictionary
    ds = defaultdict(list)
    captions = [s.split() for s in X]
    for i, s in enumerate(captions):
        ds[len(s)].append(i)
    # ... rest of encode omitted; see skipthoughts.py at the link above
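To illustrate, the length dictionary groups sentence indices by token count (standalone sketch with made-up sentences):

from collections import defaultdict

# Same bucketing as in encode(): indices of sentences, grouped by length
X = ['the cat sat', 'hello there', 'a b c d e']
ds = defaultdict(list)
captions = [s.split() for s in X]
for i, s in enumerate(captions):
    ds[len(s)].append(i)
print(dict(ds))  # {3: [0], 2: [1], 5: [2]}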
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    # For _chat.txt to list _______________________________________
    with open("/var/www/html/_chat.txt", "r") as f:
        file_list = f.readlines()
    first_three = file_list[1:3]   # note: [1:3] yields lines 2-3, two lines
    X = [x.replace('\t', ' ') for x in first_three]
    X = [x.replace('\x00', '') for x in X]
    X = [x.replace(' \r\n', '') for x in X]
    # ______________________________________________________________
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)   # tokenized input sentences go after the file lines
    return X               # X is local here; callers only see the returned list
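Calling preprocess directly shows that the X it builds is local and only reaches the caller through the return value (quick check; assumes NLTK's punkt data is installed):

import nltk

nltk.download('punkt')  # one-time: sentence tokenizer data used above

out = preprocess(["Hello world. How are you?"])
print(out)
# -> the two cleaned lines from _chat.txt, then ' Hello world . How are you ?'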