class NB(object):
    """Negative-binomial loss with a learnable dispersion parameter.

    The raw parameter (``theta_variable``) is passed through a softplus so
    that the dispersion stored in ``self.theta`` is never negative.  ``loss``
    then works with the inverse of that dispersion.
    """

    def __init__(self, theta=None, theta_init=None,
                 scale_factor=1.0, scope='nbinom_loss/',
                 debug=False, **theta_kwargs):
        """Create (or adopt) the dispersion variable.

        Args:
            theta: Optional existing variable/tensor to use as the raw
                dispersion parameter. When None a new ``tf.Variable`` is
                created inside ``scope``.
            theta_init: Initial value for the variable created here; ignored
                when ``theta`` is supplied. Defaults to ``[0.0]``.
            scale_factor: Multiplier applied to ``y_pred`` inside ``loss``.
            scope: Name scope used for every op created by this object.
            debug: When True, ``loss`` emits histogram summaries of its terms.
            **theta_kwargs: Extra keyword arguments forwarded to
                ``tf.Variable`` when the variable is created here.
        """
        # A None sentinel avoids sharing one mutable list between instances.
        if theta_init is None:
            theta_init = [0.0]

        self.eps = 1e-10
        self.scale_factor = scale_factor
        self.debug = debug
        self.scope = scope

        with tf.name_scope(self.scope):
            # a variable may be given by user or it can be created here
            if theta is None:
                theta = tf.Variable(theta_init, dtype=tf.float32,
                                    name='theta', **theta_kwargs)

            # keep a reference to the variable itself
            self.theta_variable = theta

            # to keep dispersion always non-negative
            self.theta = tf.nn.softplus(theta)

    def loss(self, y_true, y_pred, reduce=True):
        """Return the negative-binomial negative log-likelihood.

        ``y_pred`` (after multiplication by ``scale_factor``) is used as the
        mean, and ``1 / (self.theta + eps)`` as the inverse dispersion.  The
        six terms summed below are the expanded form of the NB negative
        log-likelihood; ``eps`` guards the logarithms and gamma functions.

        Args:
            y_true: Observed values; cast to float32.
            y_pred: Predicted means; cast to float32 and scaled.
            reduce: When True the element-wise loss is averaged to a scalar,
                otherwise the element-wise tensor is returned.

        Returns:
            A scalar tensor when ``reduce`` is true, else the element-wise
            loss tensor.
        """
        scale_factor = self.scale_factor
        eps = self.eps

        with tf.name_scope(self.scope):
            y_true = tf.cast(y_true, tf.float32)
            y_pred = tf.cast(y_pred, tf.float32) * scale_factor

            # Inverse of the (softplus-transformed) dispersion.
            theta = 1.0 / (self.theta + eps)

            t1 = -tf.lgamma(y_true + theta + eps)
            t2 = tf.lgamma(theta + eps)
            t3 = tf.lgamma(y_true + 1.0)
            t4 = -(theta * (tf.log(theta + eps)))
            t5 = -(y_true * (tf.log(y_pred + eps)))
            t6 = (theta + y_true) * tf.log(theta + y_pred + eps)

            if self.debug:
                for tag, term in (('t1', t1), ('t2', t2), ('t3', t3),
                                  ('t4', t4), ('t5', t5), ('t6', t6)):
                    tf.summary.histogram(tag, term)

            final = t1 + t2 + t3 + t4 + t5 + t6

            if reduce:
                final = tf.reduce_mean(final)

        return final

def build_graph(feed_previous=False):
    """Build the seq2seq graph with a negative-binomial loss.

    Relies on module-level hyperparameters that are defined outside this
    function: ``hidden_dim``, ``input_dim``, ``output_dim``, ``input_seq_len``,
    ``output_seq_len``, ``num_stacked_layers``, ``lambda_l2_reg`` and
    ``learning_rate``.

    NOTE(review): several lines of this function were missing from the source
    (closing brackets, parameter lists, ``else`` branches, call arguments).
    They have been restored from the surrounding code and docstrings; every
    restored piece that could not be derived unambiguously is flagged with a
    ``NOTE(review)`` comment below and should be confirmed.

    Args:
        feed_previous: If True, only the first decoder input (the "GO" token)
            is used and every later decoder input is generated from the
            previous decoder output. If False, the decoder is fed the shifted
            target sequence.

    Returns:
        dict with the placeholders (``enc_inp``, ``target_seq``,
        ``output_seq_extremes_bool``), the training op (``train_op``), the
        loss tensors (``loss``, ``output_loss``, ``reg_loss``), the projected
        decoder outputs (``reshaped_outputs``) and the ``tf.train.Saver``
        class (``saver``; the caller instantiates it).
    """
    import copy  # local import: only needed to clone the cell for the encoder

    # tf.Variable requires an initial value; the step counter must not be
    # trainable, otherwise the optimizer would try to differentiate it.
    global_step = tf.Variable(
        initial_value=0,
        name='global_step',
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    weights = {
        # decoder output [hidden_dim] -> prediction [output_dim]
        'out': tf.get_variable(
            'Weights_out',
            shape=[hidden_dim, output_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer()),
        # [event flag, value] (output_dim + 1) -> decoder input [output_dim]
        'out_dec_inp': tf.get_variable(
            'Weights_out_dec',
            shape=[output_dim + 1, output_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer()),
    }
    biases = {
        'out': tf.get_variable(
            'Biases_out',
            shape=[output_dim],
            dtype=tf.float32,
            initializer=tf.zeros_initializer()),
        'out_dec_inp': tf.get_variable(
            'Biases_out_dec',
            shape=[output_dim],
            dtype=tf.float32,
            initializer=tf.zeros_initializer()),
    }

    with tf.variable_scope('Seq2seq'):
        # Encoder: inputs
        enc_inp = [
            tf.placeholder(tf.float32, shape=(None, input_dim),
                           name="inp_{}".format(t))
            for t in range(input_seq_len)
        ]

        # Decoder: target outputs
        target_seq = [
            tf.placeholder(tf.float32, shape=(None, output_dim),
                           name="y_{}".format(t))
            for t in range(output_seq_len)
        ]

        # Extreme-event flags for the output sequence, one column per step.
        # (Encoder-side event flags are currently not fed into the model.)
        output_seq_extremes_bool = [
            tf.placeholder(tf.float32, shape=(None, 1),
                           name="event_bool_{}".format(t))
            for t in range(output_seq_len)
        ]

        # Give a "GO" token to the decoder.
        # If dec_inp are fed into the decoder as inputs, this is 'guided'
        # training; otherwise only the first element is fed as decoder input,
        # which is then 'un-guided'.
        dec_inp = ([tf.zeros_like(target_seq[0], dtype=tf.float32, name="GO")]
                   + target_seq[:-1])
        # Prepend the event flag to every decoder input.
        dec_inp = [tf.concat([flag, step_inp], 1)
                   for flag, step_inp in zip(output_seq_extremes_bool, dec_inp)]

        with tf.variable_scope('LSTMCell'):
            cells = []
            for i in range(num_stacked_layers):
                with tf.variable_scope('RNN_{}'.format(i)):
                    # NOTE(review): this line was missing. LSTMCell(hidden_dim)
                    # is inferred from the scope name and from
                    # weights['out'] being [hidden_dim, output_dim] -- confirm.
                    cells.append(tf.contrib.rnn.LSTMCell(hidden_dim))
            cell = tf.contrib.rnn.MultiRNNCell(cells)

        def _loop_function(prev, i):
            """Turn the previous decoder output into the next decoder input.

            Projects ``prev`` from [batch_size x hidden_dim] to
            [batch_size x output_dim], prepends the event flag of step ``i``
            and applies the decoder-input projection.
            """
            temp_out = tf.matmul(prev, weights['out']) + biases['out']
            temp_concat = tf.concat([output_seq_extremes_bool[i], temp_out], 1)
            return (tf.matmul(temp_concat, weights['out_dec_inp'])
                    + biases['out_dec_inp'])

        def _rnn_decoder(decoder_inputs,
                         initial_state,
                         cell,
                         loop_function=None,
                         scope=None):
            """RNN decoder for the sequence-to-sequence model.

            Args:
                decoder_inputs: A list of 2D Tensors [batch_size x input_size].
                initial_state: 2D Tensor with shape
                    [batch_size x cell.state_size].
                cell: rnn_cell.RNNCell defining the cell function and size.
                loop_function: If not None, this function will be applied to
                    the i-th output in order to generate the i+1-st input, and
                    decoder_inputs will be ignored, except for the first
                    element ("GO" symbol). This can be used for decoding, but
                    also for training to emulate
                    http://arxiv.org/abs/1506.03099.
                    Signature -- loop_function(prev, i) = next
                      * prev is a 2D Tensor of shape [batch_size x output_size],
                      * i is an integer, the step number,
                      * next is a 2D Tensor of shape [batch_size x input_size].
                scope: VariableScope for the created subgraph; defaults to
                    "rnn_decoder".

            Returns:
                A tuple of the form (outputs, state), where:
                  outputs: A list of the same length as decoder_inputs of 2D
                    Tensors with shape [batch_size x output_size] containing
                    generated outputs.
                  state: The state of each cell at the final time-step.
            """
            with tf.variable_scope(scope or "rnn_decoder"):
                state = initial_state
                outputs = []
                prev = None
                for i, inp in enumerate(decoder_inputs):
                    if loop_function is not None and prev is not None:
                        with tf.variable_scope("loop_function", reuse=True):
                            inp = loop_function(prev, i)
                    else:
                        # Raw decoder inputs carry the event flag
                        # (output_dim + 1 columns) and still need the
                        # projection that loop_function applies itself.
                        inp = (tf.matmul(inp, weights['out_dec_inp'])
                               + biases['out_dec_inp'])
                    if i > 0:
                        # Share the cell's variables across time steps.
                        tf.get_variable_scope().reuse_variables()
                    output, state = cell(inp, state)
                    outputs.append(output)
                    if loop_function is not None:
                        prev = output
            return outputs, state

        def _basic_rnn_seq2seq(encoder_inputs,
                               decoder_inputs,
                               cell,
                               feed_previous,
                               dtype=tf.float32,
                               scope=None):
            """Basic RNN sequence-to-sequence model.

            This model first runs an RNN to encode encoder_inputs into a state
            vector, then runs the decoder, initialized with the last encoder
            state, on decoder_inputs. Encoder and decoder use the same RNN
            cell type, but don't share parameters.

            Args:
                encoder_inputs: A list of 2D Tensors [batch_size x input_size].
                decoder_inputs: A list of 2D Tensors [batch_size x input_size].
                cell: RNN cell used for the decoder; a deep copy of it is used
                    for the encoder.
                feed_previous: Boolean; if True, only the first of
                    decoder_inputs will be used (the "GO" symbol), all other
                    inputs will be generated by the previous decoder output
                    using _loop_function. If False, decoder_inputs are used as
                    given (the standard decoder case).
                dtype: The dtype of the initial state of the RNN cell
                    (default: tf.float32).
                scope: VariableScope for the created subgraph; default:
                    "basic_rnn_seq2seq".

            Returns:
                A tuple of the form (outputs, state), where:
                  outputs: A list of the same length as decoder_inputs of 2D
                    Tensors with shape [batch_size x output_size] containing
                    the generated outputs.
                  state: The state of each decoder cell in the final time-step.
            """
            with tf.variable_scope(scope or "basic_rnn_seq2seq"):
                enc_cell = copy.deepcopy(cell)
                _, enc_state = tf.contrib.rnn.static_rnn(
                    enc_cell, encoder_inputs, dtype=dtype)
                if feed_previous:
                    return _rnn_decoder(decoder_inputs, enc_state, cell,
                                        _loop_function)
                else:
                    return _rnn_decoder(decoder_inputs, enc_state, cell)

        dec_outputs, dec_memory = _basic_rnn_seq2seq(
            enc_inp,
            dec_inp,
            cell,
            feed_previous=feed_previous)

        # Project every decoder output to the prediction dimension.
        reshaped_outputs = [tf.matmul(step_out, weights['out']) + biases['out']
                            for step_out in dec_outputs]

    # Training loss and optimizer
    with tf.variable_scope('Loss'):
        nb = NB()
        # NOTE(review): reshaped_outputs is a plain linear projection, so it
        # can be <= 0 while NB.loss takes log(y_pred + eps) -- confirm that
        # the predictions are guaranteed positive.
        output_loss = 0
        for _y, _Y in zip(reshaped_outputs, target_seq):
            output_loss += nb.loss(_Y, _y)

        # L2 regularization for weights and biases
        reg_loss = 0
        for tf_var in tf.trainable_variables():
            if 'Biases_' in tf_var.name or 'Weights_' in tf_var.name:
                reg_loss += tf.reduce_mean(tf.nn.l2_loss(tf_var))

        loss = output_loss + lambda_l2_reg * reg_loss

    with tf.variable_scope('Optimizer'):
        # NOTE(review): the arguments of this call were missing. `loss` and
        # `global_step` follow from the code above; the module-level
        # `learning_rate` and the 'Adam' optimizer are assumptions -- confirm,
        # and re-add gradient clipping if the original used it.
        optimizer = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=global_step,
            learning_rate=learning_rate,
            optimizer='Adam')

    # The Saver *class* is returned on purpose: callers instantiate it once
    # the graph is complete.
    saver = tf.train.Saver

    return dict(
        enc_inp=enc_inp,
        target_seq=target_seq,
        train_op=optimizer,
        loss=loss,  # total loss; read by the training loop
        saver=saver,
        output_loss=output_loss,  # was mistakenly bound to reg_loss
        reg_loss=reg_loss,
        reshaped_outputs=reshaped_outputs,
        output_seq_extremes_bool=output_seq_extremes_bool,
    )

# ---- Training script: build the guided (teacher-forced) graph and train it.
total_iteractions = 5000
batch_size = 10
train_losses = []
val_losses = []

rnn_model = build_graph(feed_previous=False)

saver = tf.train.Saver()
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Variables must be initialized before the first training step.
    sess.run(init)

    print("Training losses: ")
    for i in range(total_iteractions):
        (batch_input, batch_output,
         batch_in_event_bool, batch_out_event_bool) = \
            generate_train_supervised_samples(i=i, batch_size=batch_size)

        # One placeholder per time step: slice the batch along axis 1.
        feed_dict = {rnn_model['enc_inp'][t]: batch_input[:, t]
                     for t in range(input_seq_len)}
        feed_dict.update({rnn_model['target_seq'][t]: batch_output[:, t]
                          for t in range(output_seq_len)})
        # Event flags are fed as a single column per step.
        feed_dict.update({
            rnn_model['output_seq_extremes_bool'][t]:
                batch_out_event_bool[:, t].reshape(-1, 1)
            for t in range(output_seq_len)})

        _, loss_t = sess.run([rnn_model['train_op'], rnn_model['loss']],
                             feed_dict)
        # Keep and report the per-iteration loss announced by the header above.
        train_losses.append(loss_t)
        print(loss_t)

    # build_graph returns the Saver class; instantiate it to write a checkpoint.
    temp_saver = rnn_model['saver']()
    save_path = temp_saver.save(sess, os.path.join('./', 'multivariate_ts_trial_case'))

print("Checkpoint saved at: ", save_path)

