I am trying to run a program from GitHub using the syntax given in its README.md file. After running it, I get the following error:
>>> from deepsphinx.api import Predict
>>> ds = Predict(Predict.default_flags(),'batch-21937.data-00000-of-00001')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/aims-whm/deepsphinx-master/deepsphinx/api.py", line 24, in __init__
1.0)
File "/home/aims-whm/deepsphinx-master/deepsphinx/seq2seq_model.py", line 241, in seq2seq_model
keep_prob)
File "/home/aims-whm/deepsphinx-master/deepsphinx/seq2seq_model.py", line 51, in encoding_layer
dtype=tf.float32)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/rnn.py", line 396, in bidirectional_dynamic_rnn
seq_dim=time_dim, batch_dim=batch_dim)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/rnn.py", line 389, in _reverse
seq_dim=seq_dim, batch_dim=batch_dim)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 2355, in reverse_sequence
name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 2633, in reverse_sequence
batch_dim=batch_dim, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 589, in apply_op
param_name=input_name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 60, in _SatisfiesTypeConstraint
", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
TypeError: Value passed to parameter 'seq_lengths' has DataType float64 not in list of allowed values: int32, int64
So I tried to find out what the problem is and looked at the seq2seq_model.py file. Line 51 passes dtype=tf.float32, which looks correct to me, so I could not find the fault there.
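While digging further, I did notice that input_lengths gets divided later in encoding_layer (I have marked the line in the listing below). To check whether plain division can silently change a tensor's dtype, I ran this small standalone test (the variable name is mine, not from the repo):

import tensorflow as tf

lengths = tf.placeholder(tf.int32, shape=[1])
print(((lengths + 1) / 2).dtype)   # <dtype: 'float64'> - true division upcasts int32
print(((lengths + 1) // 2).dtype)  # <dtype: 'int32'>   - floor division keeps the dtype

If I read this right, by the second encoder layer input_lengths would already be float64, and as far as I understand the dtype=tf.float32 on line 51 only sets the dtype of the RNN outputs and state, not of the sequence lengths.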
Here are the two files I am working with:
seq2seq_model.py
"""Tensorflow model for speech recognition"""
import tensorflow as tf
from deepsphinx.vocab import VOCAB_SIZE, VOCAB_TO_INT
from deepsphinx.utils import FLAGS
from deepsphinx.lm import LMCellWrapper
from deepsphinx.attention import BahdanauAttentionCutoff
def encoding_layer(
        input_lengths,
        rnn_inputs,
        keep_prob):
    """ Encoding layer for the model.
    Args:
        input_lengths (Tensor): A tensor of input lengths of instances in
            batches
        rnn_inputs (Tensor): Inputs
    Returns:
        Encoding output, LSTM state, output length
    """
    for layer in range(FLAGS.num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(
                FLAGS.rnn_size,
                initializer=tf.random_uniform_initializer(
                    -0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(
                cell_fw,
                output_keep_prob=keep_prob,
                variational_recurrent=True,
                dtype=tf.float32,
                input_size=rnn_inputs.get_shape()[2])
            cell_bw = tf.contrib.rnn.LSTMCell(
                FLAGS.rnn_size,
                initializer=tf.random_uniform_initializer(
                    -0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(
                cell_bw,
                output_keep_prob=keep_prob,
                variational_recurrent=True,
                dtype=tf.float32,
                input_size=rnn_inputs.get_shape()[2])
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                rnn_inputs,
                input_lengths,
                dtype=tf.float32)
            if layer != FLAGS.num_layers - 1:
                rnn_inputs = tf.concat(enc_output, 2)
                # Keep only every second element in the sequence
                rnn_inputs = rnn_inputs[:, ::2, :]
                input_lengths = (input_lengths + 1) / 2
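                # (my note) this is the division I tested above; it seems to
                # turn input_lengths from int32 into float64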
    # Join outputs since we are using a bidirectional RNN
    enc_output = tf.concat(enc_output, 2)
    return enc_output, enc_state, input_lengths
def get_dec_cell(
        enc_output,
        enc_output_lengths,
        use_lm,
        fst,
        tile_size,
        keep_prob):
    """Decoding cell for attention based model
    Return:
        `RNNCell` Instance
    """
    lstm = tf.contrib.rnn.LSTMCell(
        FLAGS.rnn_size,
        initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    dec_cell_inp = tf.contrib.rnn.DropoutWrapper(
        lstm,
        output_keep_prob=keep_prob,
        variational_recurrent=True,
        dtype=tf.float32)
    lstm = tf.contrib.rnn.LSTMCell(
        FLAGS.rnn_size,
        initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    dec_cell = tf.contrib.rnn.DropoutWrapper(
        lstm,
        output_keep_prob=keep_prob,
        variational_recurrent=True,
        dtype=tf.float32)
    dec_cell_out = tf.contrib.rnn.LSTMCell(
        FLAGS.rnn_size,
        num_proj=VOCAB_SIZE,
        initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    dec_cell = tf.contrib.rnn.MultiRNNCell(
        [dec_cell_inp] +
        [dec_cell] * (FLAGS.num_decoding_layers - 2) +
        [dec_cell_out])
    enc_output = tf.contrib.seq2seq.tile_batch(
        enc_output,
        tile_size)
    enc_output_lengths = tf.contrib.seq2seq.tile_batch(
        enc_output_lengths,
        tile_size)
    attn_mech = BahdanauAttentionCutoff(
        FLAGS.rnn_size,
        enc_output,
        enc_output_lengths,
        normalize=True,
        name='BahdanauAttention')
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(
        dec_cell,
        attn_mech,
        VOCAB_SIZE,
        output_attention=True)
    if use_lm:
        dec_cell = LMCellWrapper(dec_cell, fst, 5)
    return dec_cell
#pylint: disable-msg=too-many-arguments
def training_decoding_layer(
        target_data,
        target_lengths,
        enc_output,
        enc_output_lengths,
        fst,
        keep_prob):
    """ Training decoding layer for the model.
    Returns:
        Training logits
    """
    target_data = tf.concat(
        [tf.fill([FLAGS.batch_size, 1], VOCAB_TO_INT['<s>']),
         target_data[:, :-1]], 1)
    dec_cell = get_dec_cell(
        enc_output,
        enc_output_lengths,
        FLAGS.use_train_lm,
        fst,
        1,
        keep_prob)
    initial_state = dec_cell.zero_state(
        dtype=tf.float32,
        batch_size=FLAGS.batch_size)
    target_data = tf.nn.embedding_lookup(
        tf.eye(VOCAB_SIZE),
        target_data)
    training_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=target_data,
        sequence_length=target_lengths,
        time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell,
        training_helper,
        initial_state)
    training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(
        training_decoder,
        output_time_major=False,
        impute_finished=True)
    return training_logits
def inference_decoding_layer(
        enc_output,
        enc_output_lengths,
        fst,
        keep_prob):
    """ Inference decoding layer for the model.
    Returns:
        Predictions
    """
    dec_cell = get_dec_cell(
        enc_output,
        enc_output_lengths,
        FLAGS.use_inference_lm,
        fst,
        FLAGS.beam_width,
        keep_prob)
    initial_state = dec_cell.zero_state(
        dtype=tf.float32,
        batch_size=FLAGS.batch_size * FLAGS.beam_width)
    start_tokens = tf.fill(
        [FLAGS.batch_size],
        VOCAB_TO_INT['<s>'],
        name='start_tokens')
    inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
        dec_cell,
        tf.eye(VOCAB_SIZE),
        start_tokens,
        VOCAB_TO_INT['</s>'],
        initial_state,
        FLAGS.beam_width)
    predictions, _, _ = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder,
        output_time_major=False,
        maximum_iterations=FLAGS.max_output_len)
    return predictions
def seq2seq_model(
        input_data,
        target_data,
        input_lengths,
        target_lengths,
        fst,
        keep_prob):
    """ Attention based model
    Returns:
        Logits, Predictions, Training operation, Cost, Step, Scores of beam
        search
    """
    enc_output, _, enc_lengths = encoding_layer(
        input_lengths,
        input_data,
        keep_prob)
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(
            target_data,
            target_lengths,
            enc_output,
            enc_lengths,
            fst,
            keep_prob)
    with tf.variable_scope("decode", reuse=True):
        predictions = inference_decoding_layer(
            enc_output,
            enc_lengths,
            fst,
            keep_prob)
    # Create tensors for the training logits and predictions
    training_logits = tf.identity(
        training_logits.rnn_output,
        name='logits')
    scores = tf.identity(
        predictions.beam_search_decoder_output.scores,
        name='scores')
    predictions = tf.identity(
        predictions.predicted_ids,
        name='predictions')
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(
        target_lengths,
        tf.reduce_max(target_lengths),
        dtype=tf.float32,
        name='masks')
    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            target_data,
            masks)
        tf.summary.scalar('cost', cost)
        step = tf.contrib.framework.get_or_create_global_step()
        # Optimizer
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [
            (tf.clip_by_value(grad, -5., 5.), var)
            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients, step)
    return training_logits, predictions, train_op, cost, step, scores
I think the other file that might be involved is
api.py
""" A minimal API for the prediction for an audio file """
from deepsphinx.seq2seq_model import seq2seq_model
from deepsphinx.utils import FLAGS
from deepsphinx.vocab import VOCAB, VOCAB_SIZE
from deepsphinx.data import get_features
import tensorflow as tf
class Predict(object):
    """ Set flags and restore from checkpoint """
    def __init__(self, flags, checkpoint_path, lm_fst=None):
        self.graph = tf.Graph()
        with self.graph.as_default():
            flags['batch_size'] = 1
            # TODO: Use higher level API
            FLAGS.__dict__['__flags'] = flags
            self.input_length = tf.placeholder(tf.int32, shape=[1])
            self.input = tf.placeholder(tf.float32, shape=[1, None, FLAGS.nfilt * 3 + 1])
            _, self.predictions, _, _, _, _ = seq2seq_model(
                self.input,
                tf.placeholder(tf.float32, shape=[1, VOCAB_SIZE]),
                self.input_length,
                tf.placeholder(tf.int32, shape=[1]),
                lm_fst,
                1.0)
            self.sess = tf.Session(graph=self.graph)
            tf.train.Saver().restore(self.sess, checkpoint_path)

    @staticmethod
    def default_flags():
        return {'nfilt': 40,
                'max_output_len': 250,
                'rnn_size': 256,
                'num_layers': 3,
                'num_decoding_layers': 3,
                'batch_size': 1,
                'beam_width': 16,
                'cutoff_range': 200,
                'use_train_lm': False,
                'use_inference_lm': False,
                'learning_rate': 0.0
               }

    def predict(self, audio_file):
        """ Predict and return string output by beam search """
        feat = get_features(audio_file)
        pred = self.sess.run(self.predictions, feed_dict={
            self.input: [feat], self.input_length: [feat.shape[0]]})
        return ''.join([VOCAB[l] for l in pred[0, :, 0]])
Please help me with this, as I have not been able to solve it myself. Any suggestion that would help me fix it is welcome.
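In case it helps, this is the one-line change in encoding_layer that I was planning to try (untested; the names are from the file above):

input_lengths = (input_lengths + 1) // 2

Would floor division be the right fix here, or should the result be cast back explicitly with tf.cast(..., tf.int32)?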