我正在根据本文https://arxiv.org/pdf/1612.04530.pdf以及Josef Ondrej找到here之前的工作,为Keras开发一个Permutational Equivariant Layer。
图层本身是由多个层组成的Keras模型:
from keras import backend as K
from keras import losses
from keras.layers import Average, Add, Concatenate, Maximum, Input, Dense, Lambda
from keras.models import Model
from keras.engine.topology import Layer
def PermutationEquivariant(input_shape, layer_size, tuple_dim = 2, reduce_fun = "sum", dense_params = {}):
"""
Implements a permutation equivariant layer.
Each batch in our data consists of `input_shape[0]` observations
each with `input_shape[1]` features.
Args:
input_shape -- A pair of `int` - (number of observations in one batch x
number of features of each observation). The batch dimension is not included.
layer_size -- `int`. Size of dense layer applied to each tuple of observations.
tuple_dim -- A `int`, how many observations to put in one tuple.
reduce_fun -- A `string`, type of function to "average" over all tuples starting with the same index.
Returns:
g -- A keras Model - the permutation equivariant layer.
It consists of one tuple layer that creates all possible `tuple_dim`-tuples
of observations, sorted on an axis along which the first index is constant.
The same dense layer is applied on every tuple and then some symmetric pooling function is applied
across all tuples with the same first index (for example mean or maximum).
"""
inputs = Input(shape=input_shape)## input_shape: batch_size x row x col
## SeperatedTuple layer
x = SeperatedTuples(tuple_dim, input_shape = input_shape)(inputs)## out_shape: batch_size x row x row ** (tuple_dim-1) x tuple_dim*col
## Dense layer -- implemented with a conv layer
# Use the same dense layer for each tuple
dense_input_shape = (tuple_dim*input_shape[1], ) # batch_size x tuple_dim*col
dense_layer = Dense(input_shape = dense_input_shape, units=layer_size, **dense_params)
# iterate through rows
x_i_list = []
for i in range(input_shape[0]):
xi_j_list = []
# applying the dense layer to each tuple where first index equals i
# here we could also use a 1x1 convolution. Instead of reusing
# the dense layer for each tuple, we would be reusing the kernels
for j in range(input_shape[0] ** (tuple_dim-1)):
input_ij = Lambda(lambda x : x[:,i,j,:], output_shape=(tuple_dim*input_shape[-1],))(x) ##out_shape: batch_size x tuple_dim * col
xi_j_list += [dense_layer(input_ij)] ## xi_j_list-shape: row x batch_size x layer_size
## Pooling layer
# Pooling the list of the dense outputs of all the tuples where first index equals i to out_shape: batch_size x layer_size
# note that axis=0 because in previous step row-axis comes before batch_size-axis
# Use Lambda Wrapper to preserve the output being a Keras Tensor
if reduce_fun == "mean":
pooling_layer = Average(axis=1)
#pooling_layer = Lambda(lambda x : K.mean(x, axis = 0))
elif reduce_fun == "max":
pooling_layer = Maximum()
#pooling_layer = Lambda(lambda x : K.max(x, axis = 0))
elif reduce_fun == "sum":
pooling_layer = Add()
#pooling_layer = Lambda(lambda x : K.sum(x, axis = 0))
else:
raise ValueError("Invalid value for argument `reduce_fun` provided. ")
xi = pooling_layer(xi_j_list) ## xi-shape: batch_size x layer_size
x_i_list += [xi]
# x_i_list-shape:
# Concatenate the results of each row
x = Lambda(lambda x : K.stack(x, axis=1), output_shape = (input_shape[0], layer_size))(x_i_list) ## out_shape: batch_size x row x layer_size
model = Model(inputs=inputs, outputs=x)
return model
class SeperatedTuples(Layer):
"""
Creates all possible tuples of rows of 2D tensor, with an additional axis
along which the first elements are constant.
In the case of tuple_dim = 2, from one input batch:
x_1,
x_2,
...
x_n,
where x_i are rows of the tensor, it creates 3D output tensor:
[[x_1 | x_1, x_1 | x_2 ... x_1 | x_n],
[x_2 | x_1, x_2 | x_2 ... x_2 | x_n],
...
... x_n | x_n]]
Args:
tuple_dim -- A `int`. Dimension of one tuple (i.e. how many rows from the input
tensor to combine to create a row in output tensor)
input_shape -- A `tuple` of `int`. In the most frequent case where our data
has shape (batch_size x num_rows x num_cols) this should be (num_rows x num_cols).
"""
def __init__(self, tuple_dim = 2, **kwargs):
self.tuple_dim = tuple_dim
super(SeperatedTuples, self).__init__(**kwargs)
def create_indices(self, n, k = 2):
"""
Creates all integer valued coordinate k-tuples in k dimensional hypercube with edge size n.
for example n = 4, k = 2
returns [[0, 0], [0, 1], [0, 2], [0, 3],
[1, 0], [1, 1], [1, 2], [1, 3],
...
[3, 0], [3, 1], [3, 2], [3, 3]]
Args:
n -- A `int`, edge size of the hypercube.
k -- A `int`, dimension of the hypercube.
Returns:
indices_n_k -- A `list` of `list` of `int`. Each inner list represents coordinates of one integer point
in the hypercube.
"""
if k == 0:
indices_n_k = [[]]
else:
indices_n_k_minus_1 = self.create_indices(n, k-1)
indices_n_k = [[i] + indices_n_k_minus_1[c] for i in range(n) for c in range(n**(k-1))]
return indices_n_k
def create_seperated_indices(self, n, k = 2):
"""
Same as create_indices, just that there is an additional axis along which the first value of the tuples is constant
for example n = 4, k = 2
returns [[[0, 0], [0, 1], [0, 2], [0, 3]],
[[1, 0], [1, 1], [1, 2], [1, 3]],
...
[[3, 0], [3, 1], [3, 2], [3, 3]]]
shape: row x row x k
"""
indices = self.create_indices(n,k)
seperated_indices = [indices[i:i + n] for i in range(0, len(indices), n)]
return seperated_indices
def build(self, input_shape):
# Create indexing tuple
self.gathering_indices = self.create_seperated_indices(input_shape[-2], self.tuple_dim)
super(SeperatedTuples, self).build(input_shape) # Be sure to call this somewhere!
def call(self, x):
"""
input_dim : batch_size x rows x cols
output_dim : batch_size x rows x rows ** (tuple_dim-1) x cols * tuple_dim
"""
stacks_of_tuples = K.map_fn(
fn = lambda z : ## z shape: row x col
K.stack(
[K.concatenate(
[K.reshape(
K.gather(z, i), ## shape: tuple_dim x col
shape = (1,-1)
) ## shape: 1 x tuple_dim*col
for i in indices # i-dim: tuple_dim, indices-shape: row x tuple_dim
], ## shape: row x 1 x tuple_dim*col
axis = 0
) ## shape: row x tuple_dim*col
for indices in self.gathering_indices # gathering_indices-shape: row x row x tuple_dim
],
axis=0), ## shape: row x row x tuple_dim*col
elems = x ## shape: batch_size x row x col
) ## shape: batch_size x row x row x tuple_dim*col
return stacks_of_tuples
def compute_output_shape(self, input_shape):
"""
input_shape: batch_size x rows x cols
output_shape: batch_size x rows x rows ** (tuple_dim-1) x cols * tuple_dim
"""
output_shape = list(input_shape)
output_shape[-1] = output_shape[-1] * self.tuple_dim
output_shape[-2] = output_shape[-2] ** self.tuple_dim
return tuple(output_shape)
单独测试PermutationEquivariant
层时,一切似乎都正常(运行1)。但是,当我尝试将其合并到一个更大的模型中时,输出只会重复(运行2)。
from keras.models import Model
from keras.layers import Input, Lambda
import numpy as np
# parameters for Permutational Equivariant layer
input_shape = (2,5)
dense_params = {'kernel_initializer': 'glorot_normal', 'bias_initializer': 'glorot_normal', 'activation': 'tanh'}
sample = np.random.random((1,) + input_shape)
# run 1: Using only the PermutationEquivariant layer as a model by itself seems to work
model_1 = PermutationEquivariant(input_shape=input_shape, layer_size=10, tuple_dim=2, reduce_fun="sum", dense_params = dense_params)
model_1.compile(optimizer='sgd', loss='categorical_crossentropy')
print("model_1: \n", model_1.predict(sample))
#model_1:
#[[[-1.0494264 -1.6808903 1.2861781 -0.90004706 1.6178854
# 1.6686234 -1.5724193 1.2454509 0.3730019 -1.4580158 ]
# [-1.3904197 -1.467866 1.0848606 -1.2094728 1.6304723
# 1.6369174 -1.4074551 0.58116794 0.292305 -1.7162979 ]]]
# run 2: Incorporating the PermutationEquivariant layer inside another model makes the output constant along the first axis
inputs = Input(shape=input_shape)
x = PermutationEquivariant(input_shape=input_shape, layer_size=10, tuple_dim=2, reduce_fun="sum", dense_params = dense_params)(inputs)
model_2 = Model(inputs=inputs,outputs = x)
model_2.compile(optimizer='sgd', loss='categorical_crossentropy')
print("model_2: \n", model_2.predict(sample))
enter code here
#model_2:
# [[[ 0.72823656 1.2213255 -0.28404936 1.4711846 -0.49544945
# 1.7930243 -0.7502286 1.892496 -1.675402 -0.2252224 ]
# [ 0.72823656 1.2213255 -0.28404936 1.4711846 -0.49544945
# 1.7930243 -0.7502286 1.892496 -1.675402 -0.2252224 ]]]
我尝试过theano和tensorflow作为后端,两者都有相同的结果。有没有人知道为什么它在另一个模型里面时表现不同/我错过了什么?我感谢任何帮助!