How to add attention layer to a Bi-LSTM

Solution 1:

One possible solution is a custom layer that computes attention over the positional/temporal dimension:

from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

class Attention(Layer):
    """Attention over the time axis of a 3D tensor.

    The layer expects inputs of shape (batch, timesteps, features). It
    computes one score per timestep, normalises the scores with a softmax
    across timesteps, and uses them to re-weight the input.

    Args:
        return_sequences: if True, return the re-weighted sequence with the
            same 3D shape as the input; if False, sum the re-weighted
            sequence over the time axis and return a 2D tensor
            (batch, features).
        **kwargs: forwarded to the base ``Layer`` (e.g. ``name``).
    """

    def __init__(self, return_sequences=True, **kwargs):
        # The base Layer has to be initialised before weights can be
        # created and tracked by Keras.
        super().__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the projection vector and the per-timestep bias."""
        # One weight per feature: projects every timestep to a scalar score.
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1], 1),
                                 initializer="random_normal",
                                 trainable=True)
        # One bias per timestep, so the sequence length (input_shape[1])
        # must be known when the layer is built.
        self.b = self.add_weight(name="att_bias",
                                 shape=(input_shape[1], 1),
                                 initializer="zeros",
                                 trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Apply attention to ``x`` of shape (batch, timesteps, features)."""
        # (batch, timesteps, 1): unnormalised score of every timestep.
        e = K.tanh(K.dot(x, self.W) + self.b)
        # Normalise over the time axis so the weights of a sample sum to 1.
        a = K.softmax(e, axis=1)
        # Broadcast the weights over the feature axis.
        output = x * a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def get_config(self):
        """Return the layer config so a saved model can be reloaded."""
        config = super().get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

It is built to receive 3D tensors and to output either 3D tensors (return_sequences=True) or 2D tensors (return_sequences=False). Below is a dummy example.

# Dummy data: random integer token ids paired with random binary labels.

n_sample = 5      # number of sequences in the toy dataset

max_len = 100     # length of every sequence
max_words = 333   # exclusive upper bound for the random token ids
emb_dim = 126     # embedding size used by the models below

X = np.random.randint(low=0, high=max_words, size=(n_sample, max_len))
Y = np.random.randint(low=0, high=2, size=n_sample)

with return_sequences=True

model = Sequential()
model.add(Embedding(max_words, emb_dim, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Attention(return_sequences=True)) # receive 3D and output 3D
# The attention output is still a sequence; collapse it to one vector per
# sample so the Dense head yields one prediction per label in Y.
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compiling and fitting are two separate calls.
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X, Y, epochs=3)

with return_sequences=False

model = Sequential()
model.add(Embedding(max_words, emb_dim, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Attention(return_sequences=False)) # receive 3D and output 2D
model.add(Dense(1, activation='sigmoid'))

# Compiling and fitting are two separate calls.
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X, Y, epochs=3)

You can integrate it into your networks easily

Here is the running notebook.

Solution 2:

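In case someone is using only TensorFlow (tf.keras) and not the standalone Keras package, this is the way to do it. Note that in this version call returns a tuple: the attention weights together with the attended output.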

import tensorflow as tf

class Attention(tf.keras.layers.Layer):
    """Attention over the time axis that also exposes its weights.

    The layer expects inputs of shape (batch, timesteps, features) and
    returns a tuple ``(a, output)`` where ``a`` holds the softmax-normalised
    attention weights of shape (batch, timesteps, 1).

    Args:
        return_sequences: if True, ``output`` is the re-weighted sequence
            with the same 3D shape as the input; if False, it is the sum of
            the re-weighted sequence over the time axis (batch, features).
        name: optional layer name, forwarded to the base layer.
        **kwargs: any further base-layer keyword arguments.
    """

    def __init__(self, return_sequences=True, name=None, **kwargs):
        # Initialise the base layer exactly once; a second __init__ call
        # would reset the layer state and discard the name set by the first.
        super().__init__(name=name, **kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the projection vector and the per-timestep bias."""
        # One weight per feature: projects every timestep to a scalar score.
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1], 1),
                                 initializer="glorot_uniform",
                                 trainable=True)
        # One bias per timestep, so the sequence length (input_shape[1])
        # must be known when the layer is built.
        self.b = self.add_weight(name="att_bias",
                                 shape=(input_shape[1], 1),
                                 initializer="glorot_uniform",
                                 trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return ``(attention_weights, attended_output)`` for ``x``."""
        # (batch, timesteps, 1): unnormalised score of every timestep.
        e = tf.keras.activations.tanh(
            tf.keras.backend.dot(x, self.W) + self.b)
        # Normalise over the time axis so the weights of a sample sum to 1.
        a = tf.keras.activations.softmax(e, axis=1)
        # Broadcast the weights over the feature axis.
        output = x * a
        if self.return_sequences:
            return a, output
        return a, tf.keras.backend.sum(output, axis=1)

    def get_config(self):
        """Return the layer config so a saved model can be reloaded."""
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config