我一直在尝试对变分自编码器(VAE)模型在 PyTorch 和 TensorFlow 上的表现进行比较,并在两个框架上搭建了完全相同的模型。然而 PyTorch 模型始终优于 TensorFlow:TensorFlow 的损失大约是 PyTorch 的 4 倍,而 PyTorch 似乎能学到更低的损失,并在大多数 epoch 中持续下降。
我真的不明白为什么差异会这么大。我已经检查过两个框架上的所有超参数都是相同的:Adam 优化器、BatchNormalization 和 ReLU 激活函数。我甚至把 TensorFlow 中 Dense 层的初始化方式改成了与 PyTorch 的 Linear 层一致。
class Encoder(tf.keras.layers.Layer):
    """Gaussian encoder q(z|x): maps inputs to (mu, var, sampled z)."""

    def __init__(self, input_dim, intermediate_dim, latent_dim):
        super(Encoder, self).__init__()
        # Shared hidden trunk: Dense -> BatchNorm -> ReLU.
        self.hidden_layer = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(input_dim,)),
            tf.keras.layers.Dense(units=intermediate_dim, use_bias=True),
            tf.keras.layers.BatchNormalization(momentum=0.01, epsilon=0.001),
            tf.keras.layers.ReLU()
        ])
        self.q_mu = tf.keras.layers.Dense(units=latent_dim, activation=None, use_bias=True)
        # Emits a log-variance; exponentiated in call() to get a positive variance.
        self.q_var = tf.keras.layers.Dense(units=latent_dim, activation=None, use_bias=True)

    @tf.function
    def reparameterize(self, mean, var):
        # BUG FIX (naming): the caller passes a *variance* (exp already applied),
        # so sqrt(var) is the correct std for Normal. The old parameter name
        # `logvar` was misleading; the math was already right.
        return tfp.distributions.Normal(mean, tf.math.sqrt(var)).sample()

    @tf.function
    def call(self, input_features):
        hidden = self.hidden_layer(input_features)
        mu = self.q_mu(hidden)
        # exp() ensures positivity; +1e-4 keeps the variance away from zero.
        var = tf.math.exp(self.q_var(hidden)) + 1e-4
        z = self.reparameterize(mu, var)
        return mu, var, z
class Decoder(tf.keras.layers.Layer):
    """Decoder for the NB likelihood: z -> (px_rate, px_r, px_dropout)."""

    def __init__(self, output_dim, intermediate_dim, latent_dim):
        super(Decoder, self).__init__()
        # Shared hidden trunk: Dense -> BatchNorm -> ReLU.
        self.px_decoder = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
            tf.keras.layers.Dense(units=intermediate_dim, use_bias=True),
            tf.keras.layers.BatchNormalization(momentum=0.01, epsilon=0.001),
            tf.keras.layers.ReLU()
        ])
        # Softmax head: per-feature proportions of the NB mean.
        self.scale_decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(units=output_dim, activation=None, use_bias=True),
            tf.keras.layers.Activation('softmax')
        ])
        # Raw (log) dispersion head; exponentiated by the caller.
        self.px_r_decoder = tf.keras.layers.Dense(units=output_dim, use_bias=True, activation=None)
        # Dropout-logit head (unused by the current loss).
        self.px_dropout_decoder = tf.keras.layers.Dense(units=output_dim, use_bias=True, activation=None)

    @tf.function
    def call(self, z):
        px = self.px_decoder(z)
        px_scale = self.scale_decoder(px)
        px_dropout = self.px_dropout_decoder(px)
        # Library-size scaling is disabled, so the rate is the softmax scale alone.
        px_rate = px_scale  # * tf.math.exp(library)
        px_r = self.px_r_decoder(px)
        return px_rate, px_r, px_dropout
class VAE(tf.keras.Model):
    """scVI-style negative-binomial VAE (TensorFlow mirror of the PyTorch model)."""

    def __init__(self, latent_dim, intermediate_dim, output_dim):
        super(VAE, self).__init__()
        self.q_encoder = Encoder(output_dim, intermediate_dim, latent_dim)
        self.decoder = Decoder(output_dim, intermediate_dim, latent_dim)

    @tf.function
    def log_nb_positive(self, x, mu, theta, eps=1e-8):
        """Log-likelihood of `x` under NB(mean=mu, dispersion=theta).

        `eps` keeps every log/lgamma argument strictly positive.
        """
        log_theta_mu_eps = tf.math.log(theta + mu + eps)
        log_likelihood = (theta * (tf.math.log(theta + eps) - log_theta_mu_eps)
                          + x * (tf.math.log(mu + eps) - log_theta_mu_eps)
                          + tf.math.lgamma(x + theta)
                          - tf.math.lgamma(theta)
                          - tf.math.lgamma(x + 1))
        return log_likelihood

    @tf.function
    def loss_f(self, outputs, batch_features):
        """Return per-sample (reconstruction loss, KL[q(z|x) || N(0, I)])."""
        qz_m = outputs['qz_m']
        qz_v = outputs['qz_v']
        px_rate = outputs['px_rate']
        px_r = outputs['px_r']
        # Negative NB log-likelihood, summed over features.
        RCL = tf.math.reduce_sum(-self.log_nb_positive(batch_features, px_rate, px_r), axis=1)
        # Standard-normal prior (was `ones_like*0` / `ones_like*1`).
        prior_mean = tf.zeros_like(qz_m)
        prior_scale = tf.ones_like(qz_v)
        kld = tfp.distributions.kl_divergence(
            tfp.distributions.Normal(qz_m, tf.math.sqrt(qz_v)),
            tfp.distributions.Normal(prior_mean, prior_scale))
        # BUG FIX: removed the dead reads of outputs['ql_m']/['ql_v'] and the
        # unused local_l_mean/local_l_var statistics (leftovers from scVI's
        # library-size model, which this version does not use).
        return RCL, tf.math.reduce_sum(kld, axis=1)

    @tf.function
    def inference(self, input_features):
        # BUG FIX: Encoder.call returns exactly (mu, var, z); unpacking six
        # values raised a ValueError at trace time. No library posterior here.
        qz_m, qz_v, z = self.q_encoder(input_features)
        px_rate, px_r, _ = self.decoder(z)
        # The dispersion head outputs log-theta; exponentiate to get theta > 0.
        px_r = tf.math.exp(px_r)
        return {'px_rate': px_rate, 'px_r': px_r, 'qz_m': qz_m, 'qz_v': qz_v}

    @tf.function
    def call(self, input_features):
        outputs = self.inference(input_features)
        return self.loss_f(outputs, input_features)
#### Loss is obtained using gradienttape something like this:
class Trainer:
    """Minimal custom training loop with TensorBoard scalar logging."""

    def __init__(self, model, batch_features, lr=1e-3, eps=0.01):
        import datetime
        self.model = model
        self.batch_features = batch_features
        # NOTE(review): epsilon=0.01 intentionally matches the PyTorch run,
        # but it is far larger than Adam's default (1e-7) and materially
        # changes step sizes — keep it identical on both frameworks.
        self.opt = tf.optimizers.Adam(learning_rate=lr, epsilon=eps, amsgrad=True, decay=0)
        self.logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
        self.writer = tf.summary.create_file_writer(self.logdir)
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')

    @tf.function
    def single_pass(self, curr_epoch, n_epochs, bf):
        """One optimization step on batch `bf` with annealed KL weight."""
        # BUG FIX: the weight was hard-coded to 1 here while train() reported
        # the loss with an annealed min(1, e/n_epochs) weight — and the
        # PyTorch loop *optimizes* the annealed objective. The two runs were
        # therefore optimizing different losses; use the same schedule here.
        kl_weight = tf.minimum(
            1.0, tf.cast(curr_epoch, tf.float32) / tf.cast(n_epochs, tf.float32))
        with tf.GradientTape() as tape:  # persistent=True was unnecessary
            rcl, kld = self.model(bf)
            step_loss = tf.reduce_mean(rcl + kld * kl_weight)
        gradients = tape.gradient(step_loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(gradients, self.model.trainable_variables))

    def train(self, n_epochs=100):
        self.n_epochs = n_epochs
        with self.writer.as_default():
            with tf.summary.record_if(True):
                # BUG FIX: `for e in n_epochs` iterated over an int (TypeError).
                for e in range(n_epochs):
                    # BUG FIX: accumulators were read before ever being assigned.
                    running = 0.0
                    likelihood = 0.0
                    # NOTE(review): `training_dataset` is a module-level global
                    # — TODO confirm; consider using self.batch_features instead.
                    for step, bf in enumerate(training_dataset):
                        batch_size = bf.shape[0]
                        self.single_pass(e, self.n_epochs, bf)
                        kl_weight = min(1, e / n_epochs)
                        rcl, kld = self.model(bf)
                        lossTrain = tf.reduce_mean(rcl + kld * kl_weight)
                        running += lossTrain / batch_size
                        likelihood += tf.reduce_mean(rcl) / batch_size
                        tf.summary.scalar('Running Loss', lossTrain, step=e)
def reparameterize_gaussian(mu, var):
    """Draw a reparameterized sample z ~ N(mu, var).

    `var` is a variance, so `.sqrt()` converts it to the std that
    torch.distributions.Normal expects; rsample() keeps gradients flowing.
    """
    return Normal(mu, var.sqrt()).rsample()


# Fixed seed so PyTorch runs are reproducible across restarts.
torch.manual_seed(1)
################################################################ Encoder Network ##########################################################################################
class Encoder(nn.Module):
    """Gaussian encoder q(z|x) — PyTorch mirror of the TF Encoder."""

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        # Shared hidden trunk: Linear -> BatchNorm -> ReLU (gene features).
        self.q_linear = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=True),
            nn.BatchNorm1d(hidden_dim, momentum=0.01, eps=0.001),
            nn.ReLU())
        self.q_mu = nn.Sequential(
            nn.Linear(hidden_dim, latent_dim))
        # Emits a log-variance; exponentiated in forward() to get variance.
        self.q_var = nn.Sequential(
            nn.Linear(hidden_dim, latent_dim))

    def forward(self, x):
        hidden = self.q_linear(x)
        mean = self.q_mu(hidden)
        # BUG FIX (naming): this value is a *variance* — exp() has already been
        # applied — so the old name `log_var` was misleading. +1e-4 keeps it
        # bounded away from zero, matching the TF encoder.
        var = torch.exp(self.q_var(hidden)) + 1e-4
        z = reparameterize_gaussian(mean, var)
        return mean, var, z
################################################################ Decoder Network ##########################################################################################
class Decoder(nn.Module):
    """Decoder for the NB likelihood: z -> (px_rate, px_r, px_dropout)."""

    def __init__(self, latent_dim, hidden_dim, output_dim):
        super().__init__()
        # Shared hidden trunk: Linear -> BatchNorm -> ReLU.
        self.px_decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim, bias=True),
            nn.BatchNorm1d(hidden_dim, momentum=0.01, eps=0.001),
            nn.ReLU()
        )
        # Softmax head: per-feature proportions of the NB mean.
        self.scale_decoder = nn.Sequential(
            nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=-1))
        # Raw (log) dispersion head; exponentiated by the caller.
        self.px_r_decoder = nn.Sequential(nn.Linear(hidden_dim, output_dim))
        # Dropout-logit head (unused by the current loss).
        self.px_dropout_decoder = nn.Sequential(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        px = self.px_decoder(x)
        px_scale = self.scale_decoder(px)
        px_dropout = self.px_dropout_decoder(px)
        # Library-size scaling is disabled (mirrors the TF version), so the
        # rate is the softmax scale alone.
        px_rate = px_scale
        px_r = self.px_r_decoder(px)
        return px_rate, px_r, px_dropout
################################################################ Variational Autoencoder scVI Negative Binomial Model ##########################################################################################
class VAE(nn.Module):
    """scVI-style negative-binomial VAE (PyTorch)."""

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.q_encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, input_dim)

    def sample_from_posterior(self, x, give_mean=False):
        """Return z ~ q(z|x), or the posterior mean when give_mean=True."""
        # BUG FIX: Encoder.forward returns exactly (mean, var, z); unpacking
        # six values raised a ValueError.
        qz_m, qz_v, z = self.q_encoder(x)
        if give_mean:
            z = qz_m
        return z

    def forward(self, x):
        qz_m, qz_v, z = self.q_encoder(x)
        # BUG FIX: Decoder.forward takes a single argument — there is no
        # `library` tensor in this model, so `self.decoder(z, library)` was a
        # TypeError. Return 4 values to match the training loop's unpacking
        # `px_rate, px_r, qz_mu, qz_var = model(x)`.
        px_rate, px_r, _ = self.decoder(z)
        # Dispersion head outputs log-theta; exponentiate to get theta > 0.
        px_r = torch.exp(px_r)
        return px_rate, px_r, qz_m, qz_v
def log_nb_positive(x, mu, theta, eps=1e-8):
    """Elementwise log-likelihood of `x` under NB(mean=mu, dispersion=theta).

    Mirrors VAE.log_nb_positive on the TF side. `eps` keeps every log
    argument strictly positive; lgamma terms implement the NB normalizer.
    """
    log_theta_mu_eps = torch.log(theta + mu + eps)
    res = (theta * (torch.log(theta + eps) - log_theta_mu_eps)
           + x * (torch.log(mu + eps) - log_theta_mu_eps)
           + torch.lgamma(x + theta)
           - torch.lgamma(theta)
           - torch.lgamma(x + 1))
    return res
def get_reconstruction_loss(x, px_rate, px_r, **kwargs):
    """Per-sample reconstruction loss: negative NB log-likelihood summed
    over the feature axis. Extra keyword arguments are accepted (and
    ignored) for call-site compatibility."""
    reconst_loss = -log_nb_positive(x, px_rate, px_r).sum(dim=-1)
    return reconst_loss
def calculate_lossNEW(x, p_mean, p_log_var, qz_mu, qz_var):
    """Return per-sample (reconstruction loss, KL[q(z|x) || N(0, I)]).

    NOTE(review): despite their names, `p_mean` is the NB rate (px_rate) and
    `p_log_var` is the already-exponentiated NB dispersion theta (px_r) — the
    names are kept only for call-site compatibility; consider renaming.
    """
    RCL = get_reconstruction_loss(x, p_mean, p_log_var)
    # Standard-normal prior on z; sqrt converts variance to the std Normal expects.
    prior_mean = torch.zeros_like(qz_mu)
    prior_scale = torch.ones_like(qz_var)
    kld = kl(Normal(qz_mu, torch.sqrt(qz_var)), Normal(prior_mean, prior_scale)).sum(dim=1)
    return RCL, kld
# Training script. NOTE(review): `model` and `mini_batches` must be defined
# earlier in the file/session — confirm against the full source.
optimizer = optim.Adam(model.parameters(), lr=1e-3, eps=0.01)
num_epochs = 100
losses = []
rlosses1 = []
# BUG FIX: `for e in num_epochs` iterated over an int (TypeError).
for e in range(num_epochs):
    running_loss1 = 0
    # Renamed the index from `z` to `step`: it is the batch counter from
    # enumerate, not a latent sample.
    for step, x in enumerate(mini_batches):
        # BUG FIX: batch_size was read below but never defined.
        batch_size = x.shape[0]
        # ---- forward pass ----
        px_rate, px_r, qz_mu, qz_var = model(x)
        loss = calculate_lossNEW(x, px_rate, px_r, qz_mu, qz_var)
        # KL annealing: weight grows linearly to 1 over the first 100 epochs.
        # (The original `if e is not None` guard was always true for an int.)
        kl_weight = min(1, e / 100)
        like = loss[0].mean()
        klditem = loss[1].mean()
        loss = (loss[0] + loss[1] * kl_weight).mean()
        # ---- backward pass ----
        optimizer.zero_grad()  # zero the gradients before each update
        loss.backward()        # backpropagate the loss through the model
        optimizer.step()       # apply the parameter update
        losses.append(loss.item())
        running_loss1 += loss.item() / batch_size
    # NOTE(review): indentation was lost in the paste — recording the running
    # loss once per epoch (it is reset per epoch above); confirm intent.
    rlosses1.append(running_loss1)
很抱歉代码比较长,但两边的损失函数是完全相同的;理论上,PyTorch 和 TensorFlow 计算出的 KL 散度也应该非常接近。