MarkusThill / bioma-tcn-ae

Minimal Working Example of a (baseline) Temporal Convolutional Autoencoder (TCN-AE) for Anomaly Detection in Time Series

Question about where "Feature map reduction" should be added #3


ZZY18 commented 9 months ago

Regarding the feature map reduction mentioned in the article [1]: I could not find it in your code or in keras_tcn, so I am unsure where this layer should be added.

Specifically, should the feature map reduction be applied before or after the residual connection in the TCN block, i.e.:

out = self.net(x)  # net is the TCN block
res = x if self.downsample is None else self.downsample(x)
return self.relu(out + res), out

Should I apply the reduction to 'out' or to 'self.relu(out + res)'? Looking forward to your response!

[1] Thill, M., Konen, W., Wang, H., et al.: Temporal convolutional autoencoder for unsupervised anomaly detection in time series. Applied Soft Computing, 2021, 112: 107751.

MarkusThill commented 9 months ago

The model in this repo is a baseline model, which does not have the Feature Map Reduction layers that you mentioned. Your model should look more like the following (the code below should run in Google Colab and give you an idea, but it is far from being polished, and a few more detailed aspects from the paper are missing):

# %%
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

def build_model(shape_X, args, verbose=1):
    sampling_factor = args['latent_sample_rate']
    i = tf.keras.Input(batch_shape=(None, shape_X[1], shape_X[2]))  

    e = i
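    # Encoder: a stack of dilated Conv1D layers; each layer additionally feeds
    # a 1x1 convolution (e1) whose output is kept as a skip connection.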
    dil_layers = list()
    for k in args["dilations"]:
        e = tf.keras.layers.Conv1D(filters=args['nb_filters'], 
                                   kernel_size=args['kernel_size'], 
                                   activation="relu",
                                   padding=args['padding'], 
                                   dilation_rate=k,
                                   kernel_initializer=args['conv_kernel_init'],
                                   bias_initializer=args['conv_bias_init'])(e)

        e1 = tf.keras.layers.Conv1D(filters=args['nb_skip_filters'], 
                                    kernel_size=1,
                                    activation=args['activation_conv1d'], 
                                    padding=args['padding'],
                                    kernel_initializer=args['conv_kernel_init'],
                                    bias_initializer=args['conv_bias_init'])(e)
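        # e1 (1x1 convolution) is the skip output of this dilated layer; it is
        # collected in dil_layers and concatenated after the loop.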

        # Feature map reduction: optionally reduce the number of feature maps
        # (channels) passed on to the next dilated layer
        if args['nb_skip_filters'] == args["nb_filters_map_reduce"]:
            e = e1
        elif args["nb_filters_map_reduce"] is not None:
            e = tf.keras.layers.Conv1D(filters=args['nb_filters_map_reduce'], 
                                       kernel_size=1, 
                                       activation="relu",
                                       padding=args['padding'],
                                       kernel_initializer=args['conv_kernel_init'],
                                       bias_initializer=args['conv_bias_init'])(e)
        dil_layers.append(e1)

    e = tf.keras.layers.Concatenate(axis=-1)(dil_layers)
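    # e now holds the concatenated 1x1 skip outputs of all dilated layers;
    # this is the representation that gets downsampled into the bottleneck.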

    if args["stepwise_updownsample"]: # Stepwise downsampling
        # Here, we downsample the time series in multiple steps by a factor of 2
        layers = int(np.log2(args["latent_sample_rate"]))  
        lost_one = []
        length = shape_X[1]
        for j in range(layers):
            lost_one.append(length % 2 > 0)
            e = tf.keras.layers.Conv1D(filters=args['nb_filters_ae'], 
                                       kernel_size=args['kernel_size_ae'],
                                       activation='relu', 
                                       padding=args['padding'],
                                       kernel_initializer=args['conv_kernel_init'],
                                       bias_initializer=args['conv_bias_init'])(e)

            e = args['pooler'](pool_size=2, 
                               strides=None, 
                               padding='valid', 
                               data_format='channels_last')(e)
            length //= 2

        enc_flat = tf.keras.layers.Conv1D(filters=args['filters_bneck'], 
                                          kernel_size=1,
                                          activation=args['activation_conv1d'], 
                                          padding=args['padding'],
                                          kernel_initializer=args['conv_kernel_init'],
                                          bias_initializer=args['conv_bias_init'])(e)
        d = enc_flat
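        # Decoder-side upsampling: mirror each downsampling step with a Conv1D
        # followed by a strided Conv1DTranspose; ZeroPadding1D restores odd
        # sequence lengths lost during pooling.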
        for j in range(layers):
            d = tf.keras.layers.Conv1D(filters=args['nb_filters_ae'], 
                                       kernel_size=args['kernel_size_ae'],
                                       activation='relu', 
                                       padding=args['padding'],
                                       kernel_initializer=args['conv_kernel_init'],
                                       bias_initializer=args['conv_bias_init'])(d)

            # One could also use UpSampling1D()
            d = tf.keras.layers.Conv1DTranspose(filters=args['nb_filters_ae'], 
                                                kernel_size=args['kernel_size_ae'], 
                                                padding="same", 
                                                strides=2, 
                                                activation="relu")(d)
            if lost_one[-(j + 1)]:
                d = tf.keras.layers.ZeroPadding1D(padding=(0, 1))(d)
    else: # Hard downsampling
        enc_flat = tf.keras.layers.Conv1D(filters=args['filters_bneck'], 
                                          kernel_size=1,
                                          activation=args['activation_conv1d'], 
                                          padding=args['padding'])(e)

        # Do some average (max) pooling to get a compressed representation 
        # of the time series (e.g. a sequence of length 8)
        enc_pooled = args['pooler'](pool_size=sampling_factor, 
                                    strides=None, 
                                    padding='valid',
                                    data_format='channels_last')(enc_flat)

        # Maybe put the pooled values through a non-linear layer first?
        # Currently, just stick with identity function...
        enc_out = tf.keras.layers.Activation("linear")(enc_pooled)
        # ...
        # Now we should have a short sequence, which we will upsample 
        # again and then attempt to reconstruct the original series

        # One could also use UpSampling1D()
        d = tf.keras.layers.Conv1DTranspose(filters=16, # hard-coded ...
                                            kernel_size=4, # also hard-coded ...
                                            padding="same", 
                                            strides=sampling_factor,
                                            activation=args['activation_conv1d'])(enc_out)
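
    # Decoder: run the (reversed) dilated convolution stack again, collecting
    # 1x1 skip outputs just like in the encoder.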

    dil_layers = list()
    for k in reversed(args["dilations"]):
        d = tf.keras.layers.Conv1D(filters=args['nb_filters'], 
                                   kernel_size=args['kernel_size'], 
                                   activation="relu",
                                   padding=args['padding'], 
                                   dilation_rate=k,
                                   kernel_initializer=args['conv_kernel_init'],
                                   bias_initializer=args['conv_bias_init'])(d)
        d1 = tf.keras.layers.Conv1D(filters=args['nb_skip_filters'], 
                                    kernel_size=1,
                                    activation=args['activation_conv1d'], 
                                    padding=args['padding'],
                                    kernel_initializer=args['conv_kernel_init'],
                                    bias_initializer=args['conv_bias_init'])(d)

        # Feature map reduction: optionally reduce the number of feature maps
        # (channels) passed on to the next dilated layer
        if args['nb_skip_filters'] == args["nb_filters_map_reduce"]:
            d = d1
        elif args["nb_filters_map_reduce"] is not None:
            d = tf.keras.layers.Conv1D(filters=args['nb_filters_map_reduce'], 
                                       kernel_size=1, 
                                       activation="relu",
                                       padding=args['padding'],
                                       kernel_initializer=args['conv_kernel_init'],
                                       bias_initializer=args['conv_bias_init'])(d)

        dil_layers.append(d1)

    d = tf.keras.layers.Concatenate(axis=-1)(dil_layers)

    # Finally, put the concatenated outputs through a dense layer, 
    # to get the reconstructed signal
    o = tf.keras.layers.Dense(shape_X[2], activation='linear')(d)
    model = tf.keras.Model(inputs=[i], outputs=[o])

    if verbose > 1:
        model.summary()
        tf.keras.utils.plot_model(
            model,
            to_file="model.png",
        )

    return model

# %%
# The parameters are far from being optimal and were just guessed for
# illustrative purposes. In general, it takes some time to get a feeling for
# the parameters and how to choose them. Also, the current model setup is
# quite rigid (e.g., there is no particular reason why certain layers should
# have the same number of filters, etc.).
args = {
    'dilations': (1, 2, 4, 8, 16, 32, 64),
    'pooler': tf.keras.layers.AveragePooling1D,
    'padding': 'same',  # 'same', 'causal'
    'activation_conv1d': 'relu',
    'conv_bias_init': 'zeros',
    'conv_kernel_init': 'glorot_normal',
    'kernel_size': 8,
    'latent_sample_rate': 32, # This specifies the bottleneck...
    'filters_bneck': 4,  # ... as well as this...
    'nb_filters': 64,
    'kernel_size_ae': 4,
    'nb_filters_ae': 16,
    'nb_filters_map_reduce': 16,
    'nb_skip_filters': 8,
    'stepwise_updownsample': True, # False: Do a hard downsampling with 1 layer
  }

# %%
# Create some arbitrary training data (batch x time x channels)
# Use some time series length which is not a power of 2 for demonstration purposes
data_shape = 200, (1<<14) + 13, 3

train_X = np.sin(0.001 * np.arange(np.prod(data_shape))).reshape(data_shape)
train_X += 0.05 * np.random.randn(*data_shape)
print("train_X.shape:", train_X.shape)

# %%
model = build_model(train_X.shape, args, verbose=2)

# Compile model
adam = tf.keras.optimizers.Adam(learning_rate=0.001, 
                                beta_1=0.9, 
                                beta_2=0.999, 
                                epsilon=1e-08, 
                                amsgrad=True)

model.compile(loss='logcosh', optimizer=adam, metrics=["mae", "mse", "logcosh"])

# %%
history = model.fit(train_X, train_X,
                    batch_size=16,
                    epochs=30,
                    validation_split=.1,
                    shuffle=True,
                    callbacks=None,
                    verbose=1)

# %%
ex = train_X[[33]]  # pick one time series (index 33); double brackets keep the batch dimension
pred = model(ex)

plt.figure()
plt.plot(ex[0,:,0], label="original") # plot 0-th channel
plt.plot(pred[0,:,0], label="reconstructed")  # plot 0-th channel
plt.legend()
plt.show()
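If you then want a simple anomaly indicator from the trained model, a minimal sketch is to look at the pointwise reconstruction error (the paper derives the actual anomaly score from the reconstruction errors in a more elaborate way, so the following is only illustrative):

# %%
# Pointwise squared reconstruction error, averaged over the channels.
# This is only a rough indicator; the paper's scoring procedure is more involved.
err = np.square(train_X - model.predict(train_X, batch_size=16))
score = err.mean(axis=-1)  # shape: (batch, time)
print("score.shape:", score.shape)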

I hope this helps a bit. Otherwise, please let me know...