Recurrent Neural Networks (RNNs)#

For now, RLtools supports only the GRU (Gated Recurrent Unit), a widely used and time-tested RNN architecture.

In this example, we show the supervised training of a simple sequence model that learns to output the maximum of the inputs it has seen so far, i.e. the running maximum output_t = max(input_1, ..., input_t). In plain C++, independent of RLtools, the targets for a single sequence could be computed like this (a minimal sketch of the task itself):
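#include <vector>
#include <algorithm>

// Running maximum: targets[t] = max(inputs[0], ..., inputs[t]).
// E.g. the inputs (0.2, -0.5, 0.9, 0.1) yield the targets (0.2, 0.2, 0.9, 0.9).
// Assumes a non-empty input sequence.
std::vector<float> running_max(const std::vector<float>& inputs){
    std::vector<float> targets(inputs.size());
    float current = inputs[0];
    for(std::size_t step_i = 0; step_i < inputs.size(); step_i++){
        current = std::max(current, inputs[step_i]);
        targets[step_i] = current;
    }
    return targets;
}

As in the previous examples, we import the required data structures and models first: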

[1]:
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <iomanip> // std::setprecision/std::setw are used when printing predictions below

#define RL_TOOLS_BACKEND_ENABLE_OPENBLAS
#include <rl_tools/operations/cpu_mux.h>
#include <rl_tools/nn/optimizers/adam/instance/operations_generic.h>
#include <rl_tools/nn/operations_cpu_mux.h>
#include <rl_tools/nn/layers/gru/operations_generic.h>
#include <rl_tools/nn_models/sequential/operations_generic.h>
#include <rl_tools/nn/optimizers/adam/operations_generic.h>
#include <rl_tools/nn/loss_functions/mse/operations_generic.h>
namespace rlt = rl_tools;
#pragma cling load("openblas")

Then we set up the device, the number types, and the random number generator:

[2]:
using T = float;
using DEVICE = rlt::devices::DEVICE_FACTORY<rlt::devices::DefaultCPUSpecification>;
using RNG = DEVICE::SPEC::RANDOM::ENGINE<>;
using TI = typename DEVICE::index_t;
constexpr bool DYNAMIC_ALLOCATION = true;

Now we can configure the sequence model. Here we use a GRU that directly takes the input and transforms it into its latent space. This latent space is then decoded by the OUTPUT_LAYER to predict the outputs:

[3]:
constexpr TI SEQUENCE_LENGTH = 10;
constexpr TI BATCH_SIZE = 10;
constexpr TI INPUT_DIM = 1;
constexpr TI HIDDEN_DIM = 8;
constexpr TI OUTPUT_DIM = 1;
using INPUT_SHAPE = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, BATCH_SIZE, INPUT_DIM>;
using GRU_CONFIG = rlt::nn::layers::gru::Configuration<T, TI, HIDDEN_DIM, rlt::nn::parameters::groups::Normal, DYNAMIC_ALLOCATION>;
using GRU = rlt::nn::layers::gru::BindConfiguration<GRU_CONFIG>;
using OUTPUT_LAYER_CONFIG = rlt::nn::layers::dense::Configuration<T, TI, OUTPUT_DIM, rlt::nn::activation_functions::IDENTITY>;
using OUTPUT_LAYER = rlt::nn::layers::dense::BindConfiguration<OUTPUT_LAYER_CONFIG>;

As usual, we assemble these layers into an nn_models::sequential module, which chains the layers and implements compile-time autodiff:

[4]:
template <typename T_CONTENT, typename T_NEXT_MODULE = rlt::nn_models::sequential::OutputModule>
using Module = typename rlt::nn_models::sequential::Module<T_CONTENT, T_NEXT_MODULE>;

using MODULE_CHAIN = Module<GRU, Module<OUTPUT_LAYER>>;
using CAPABILITY = rlt::nn::capability::Gradient<rlt::nn::parameters::Adam>;
using MODEL = rlt::nn_models::sequential::Build<CAPABILITY, MODULE_CHAIN, INPUT_SHAPE>;
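Since shapes are compile-time entities in RLtools, we can sanity-check the assembled model right away; a quick sketch, using the same SHAPE::FIRST and GET<I> accessors that the dataset helper further below relies on:

static_assert(MODEL::OUTPUT_SHAPE::FIRST == SEQUENCE_LENGTH); // the sequence dimension is preserved
static_assert(MODEL::OUTPUT_SHAPE::GET<1> == BATCH_SIZE);     // the batch dimension is preserved
static_assert(MODEL::OUTPUT_SHAPE::GET<2> == OUTPUT_DIM);     // the GRU latent space is decoded to OUTPUT_DIM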

We need an optimizer as well, of course:

[5]:
struct ADAM_PARAMS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
    static constexpr T ALPHA = 0.003;
};
using ADAM_SPEC = rlt::nn::optimizers::adam::Specification<T, TI, ADAM_PARAMS>;
using OPTIMIZER = rlt::nn::optimizers::Adam<ADAM_SPEC>;
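ALPHA is the learning rate; the remaining coefficients are inherited from the TensorFlow-style defaults. Overriding them follows the same pattern (a sketch; BETA_1 and BETA_2 are assumed to follow the naming convention of ALPHA in DEFAULT_PARAMETERS_TENSORFLOW):

struct CUSTOM_ADAM_PARAMS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
    static constexpr T ALPHA = 0.003;  // learning rate
    static constexpr T BETA_1 = 0.9;   // first-moment decay rate (assumed member name)
    static constexpr T BETA_2 = 0.999; // second-moment decay rate (assumed member name)
};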

Now we can instantiate, allocate and initialize the data structures:

[6]:
constexpr TI DATASET_SIZE = 1000;
constexpr TI TESTSET_SIZE = 100;

DEVICE device;
RNG rng;
MODEL model;
MODEL::Buffer<> buffer;
using TEST_MODEL_TMP = MODEL::template CHANGE_BATCH_SIZE<TI, TESTSET_SIZE>; // inference-only model with the test set size as its batch size
using TEST_MODEL = TEST_MODEL_TMP::template CHANGE_CAPABILITY<rlt::nn::capability::Forward<>>;
TEST_MODEL test_model;
TEST_MODEL::Buffer<> test_buffer;
MODEL::State<> state;
OPTIMIZER optimizer;
rlt::Tensor<rlt::tensor::Specification<T, TI, MODEL::INPUT_SHAPE>> input;
rlt::Tensor<rlt::tensor::Specification<T, TI, MODEL::OUTPUT_SHAPE>> output_target, d_output;

using DATASET_SHAPE = rlt::tensor::Shape<TI, DATASET_SIZE, SEQUENCE_LENGTH, INPUT_DIM>;
using DATASET_TARGET_SHAPE = rlt::tensor::Shape<TI, DATASET_SIZE, SEQUENCE_LENGTH, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, DATASET_SHAPE>> dataset_X;
rlt::Tensor<rlt::tensor::Specification<T, TI, DATASET_TARGET_SHAPE>> dataset_y;

using TESTSET_SHAPE = rlt::tensor::Shape<TI, TESTSET_SIZE, SEQUENCE_LENGTH, INPUT_DIM>;
using TESTSET_TARGET_SHAPE = rlt::tensor::Shape<TI, TESTSET_SIZE, SEQUENCE_LENGTH, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_SHAPE>> testset_X;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE>> testset_y;
using TESTSET_SHAPE_PERMUTED = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, TESTSET_SIZE, INPUT_DIM>;
using TESTSET_TARGET_SHAPE_PERMUTED = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, TESTSET_SIZE, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_SHAPE_PERMUTED>> testset_X_permuted;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE_PERMUTED>> testset_y_permuted;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE_PERMUTED>> testset_output_permuted;

rlt::init(device);
rlt::malloc(device, rng);
constexpr TI SEED = 0;
rlt::init(device, rng, SEED);
rlt::malloc(device, model);
rlt::malloc(device, test_model);
rlt::malloc(device, buffer);
rlt::malloc(device, test_buffer);
rlt::malloc(device, state);
rlt::malloc(device, optimizer);
rlt::malloc(device, input);
rlt::malloc(device, output_target);
rlt::malloc(device, d_output);
rlt::malloc(device, dataset_X);
rlt::malloc(device, dataset_y);
rlt::malloc(device, testset_X);
rlt::malloc(device, testset_y);
rlt::malloc(device, testset_X_permuted);
rlt::malloc(device, testset_y_permuted);
rlt::malloc(device, testset_output_permuted);
rlt::init_weights(device, model, rng);
rlt::reset_optimizer_state(device, optimizer, model);
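In a long-running program, these structures would eventually be released again; a minimal sketch, assuming the usual rlt::free counterpart to rlt::malloc:

// at shutdown (sketch):
rlt::free(device, model);
rlt::free(device, test_model);
rlt::free(device, buffer);
rlt::free(device, test_buffer);
// ...and likewise for the remaining tensors allocated above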

The toy task is the running maximum described above, hence we sample random inputs from a Gaussian and compute the running maximum for the target values:

[7]:
template <typename DATASET_X, typename DATASET_Y>
void max_dataset(DATASET_X& dataset_X, DATASET_Y& dataset_y){
    static_assert(DATASET_X::SHAPE::FIRST == DATASET_Y::SHAPE::FIRST);
    static_assert(DATASET_X::SHAPE::template GET<1> == DATASET_Y::SHAPE::template GET<1>);
    rlt::randn(device, dataset_X, rng);
    for(TI sample_i = 0; sample_i < DATASET_X::SHAPE::FIRST; sample_i++){
        T max;
        bool max_set = false;
        for(TI step_i = 0; step_i < DATASET_X::SHAPE::template GET<1>; step_i++){
            T el = rlt::get(device, dataset_X, sample_i, step_i, 0);
            if(!max_set || el > max){
                max_set = true;
                max = el;
            }
            rlt::set(device, dataset_y, max, sample_i, step_i, 0);
        }
    }
}

We want to generate a training set and a test set. We created a test_model that natively operates on BATCH_SIZE = TESTSET_SIZE, so we can feed the whole test set into it directly, without an additional, batched loop. The standard input format in RLtools is (SEQUENCE_LENGTH x BATCH_SIZE x FEATURE_DIM), hence we permute the first two dimensions of the datasets generated with the previous function:

[8]:
max_dataset(dataset_X, dataset_y);
max_dataset(testset_X, testset_y);
// models operate on (SEQUENCE_LENGTH x BATCH_SIZE x FEATURE_DIM) for performance reasons:
auto permuted_X = rlt::permute(device, testset_X, rlt::tensor::PermutationSpec<0, 1>{});
auto permuted_y = rlt::permute(device, testset_y, rlt::tensor::PermutationSpec<0, 1>{});
rlt::copy(device, device, permuted_X, testset_X_permuted);
rlt::copy(device, device, permuted_y, testset_y_permuted);
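To convince ourselves that the permutation swapped the sample and step dimensions, we can spot-check a single element (using the same rlt::get accessor as in the dataset helper):

// sample 1, step 2, feature 0 in both layouts; the two values should match
std::cout << "original: " << rlt::get(device, testset_X, 1, 2, 0);
std::cout << " permuted: " << rlt::get(device, testset_X_permuted, 2, 1, 0) << std::endl;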

Here is an example of an input sequence and the expected output:

[9]:
std::cout << "Input: \n";
rlt::print(device, rlt::view(device, dataset_X, 0));
std::cout << "Expected output: " << std::endl;
rlt::print(device, rlt::view(device, dataset_y, 0));
Input:
  -1.507621e+00
   1.071986e+00
   8.269271e-01
   1.601774e+00
  -1.074195e+00
  -5.420533e-01
  -6.830205e-01
   1.492320e+00
   6.583855e-02
   9.513746e-01

Expected output:
  -1.507621e+00
   1.071986e+00
   1.071986e+00
   1.601774e+00
   1.601774e+00
   1.601774e+00
   1.601774e+00
   1.601774e+00
   1.601774e+00
   1.601774e+00

Now we have everything in place to train the model:

[10]:
std::vector<TI> indices(DATASET_SIZE);
std::iota(indices.begin(), indices.end(), 0); // fill with the range 0..DATASET_SIZE-1
constexpr TI N_EPOCH = 100;
for(TI epoch_i = 0; epoch_i < N_EPOCH; epoch_i++){
    T epoch_loss = 0;
    std::shuffle(indices.begin(), indices.end(), rng.engine);
    for(TI batch_i = 0; batch_i < DATASET_SIZE / BATCH_SIZE; batch_i++){
        for(TI sequence_i = 0; sequence_i < BATCH_SIZE; sequence_i++){
            TI index = BATCH_SIZE * batch_i + sequence_i;
            auto input_sample = rlt::view(device, input, sequence_i, rlt::tensor::ViewSpec<1>{});
            auto output_sample = rlt::view(device, output_target, sequence_i, rlt::tensor::ViewSpec<1>{});
            auto dataset_input_sample = rlt::view(device, dataset_X, indices[index], rlt::tensor::ViewSpec<0>{});
            auto dataset_output_sample = rlt::view(device, dataset_y, indices[index], rlt::tensor::ViewSpec<0>{});
            rlt::copy(device, device, dataset_input_sample, input_sample);
            rlt::copy(device, device, dataset_output_sample, output_sample);
        }
        rlt::forward(device, model, input, buffer, rng);
        auto output = rlt::output(device, model);
        auto output_matrix_view = rlt::matrix_view(device, output);
        auto output_target_matrix_view = rlt::matrix_view(device, output_target);
        auto d_output_matrix_view = rlt::matrix_view(device, d_output);
        rlt::nn::loss_functions::mse::gradient(device, output_matrix_view, output_target_matrix_view, d_output_matrix_view);
        T batch_loss = rlt::nn::loss_functions::mse::evaluate(device, output_matrix_view, output_target_matrix_view);
        epoch_loss += batch_loss;
        rlt::zero_gradient(device, model);
        rlt::backward(device, model, input, d_output, buffer);
        rlt::step(device, optimizer, model);
    }
    epoch_loss /= DATASET_SIZE / BATCH_SIZE;
    if((epoch_i+1) % 5 == 0){
        rlt::copy(device, device, model, test_model);
        rlt::evaluate(device, test_model, testset_X_permuted, testset_output_permuted, test_buffer, rng);
        T test_loss = rlt::nn::loss_functions::mse::evaluate(device, testset_output_permuted, testset_y_permuted);
        std::cout << "Epoch " << (epoch_i+1) << " train loss: " << epoch_loss << " test loss: " << test_loss << std::endl;
    }
}
Epoch 5 train loss: 4.026309e-02 test loss: 3.559775e-02
Epoch 10 train loss: 1.722529e-02 test loss: 1.540120e-02
Epoch 15 train loss: 1.022129e-02 test loss: 7.987553e-03
Epoch 20 train loss: 6.953578e-03 test loss: 4.966619e-03
Epoch 25 train loss: 5.002954e-03 test loss: 3.588097e-03
Epoch 30 train loss: 3.616025e-03 test loss: 2.619849e-03
Epoch 35 train loss: 2.852915e-03 test loss: 2.276205e-03
Epoch 40 train loss: 2.285772e-03 test loss: 1.623374e-03
Epoch 45 train loss: 1.853573e-03 test loss: 1.247458e-03
Epoch 50 train loss: 1.495403e-03 test loss: 1.218547e-03
Epoch 55 train loss: 1.328354e-03 test loss: 1.133017e-03
Epoch 60 train loss: 1.153364e-03 test loss: 1.316995e-03
Epoch 65 train loss: 1.080623e-03 test loss: 8.134897e-04
Epoch 70 train loss: 9.213265e-04 test loss: 9.398991e-04
Epoch 75 train loss: 8.744255e-04 test loss: 1.394223e-03
Epoch 80 train loss: 8.876204e-04 test loss: 7.558167e-04
Epoch 85 train loss: 8.151167e-04 test loss: 6.435094e-04
Epoch 90 train loss: 7.562749e-04 test loss: 7.290478e-04
Epoch 95 train loss: 6.210597e-04 test loss: 6.309046e-04
Epoch 100 train loss: 5.976068e-04 test loss: 7.673404e-04

Now we check whether the predictions of the model are plausible:

[11]:
std::cout << std::fixed << std::setprecision(2); // fixed-point printing
for(TI sequence_i = 0; sequence_i < 5; sequence_i++){
    std::cout << "Test sequence " << sequence_i << std::endl;
    std::cout << "Input => Target ~ Predicted" << std::endl;
    for(TI step_i = 0; step_i < SEQUENCE_LENGTH; step_i++){
        std::cout << "  " << std::setw(5) << rlt::get(device, testset_X_permuted, step_i, sequence_i, 0);
        std::cout << " => " << std::setw(5) << rlt::get(device, testset_y_permuted, step_i, sequence_i, 0);
        std::cout << " ~ " << std::setw(5) << rlt::get(device, testset_output_permuted, step_i, sequence_i, 0);
        std::cout << std::endl;
    }
}
Test sequence 0
Input => Target ~ Predicted
   0.65 =>  0.65 ~  0.66
   0.89 =>  0.89 ~  0.88
   1.38 =>  1.38 ~  1.37
   0.74 =>  1.38 ~  1.37
  -1.00 =>  1.38 ~  1.37
   0.06 =>  1.38 ~  1.39
   0.07 =>  1.38 ~  1.40
  -0.98 =>  1.38 ~  1.39
   1.20 =>  1.38 ~  1.39
   0.12 =>  1.38 ~  1.40
Test sequence 1
Input => Target ~ Predicted
  -0.08 => -0.08 ~ -0.09
   0.21 =>  0.21 ~  0.21
  -0.19 =>  0.21 ~  0.21
  -1.47 =>  0.21 ~  0.21
  -0.16 =>  0.21 ~  0.20
  -1.06 =>  0.21 ~  0.21
  -0.36 =>  0.21 ~  0.21
   0.62 =>  0.62 ~  0.60
   0.31 =>  0.62 ~  0.61
  -0.30 =>  0.62 ~  0.63
Test sequence 2
Input => Target ~ Predicted
   1.29 =>  1.29 ~  1.32
   0.08 =>  1.29 ~  1.30
   0.76 =>  1.29 ~  1.30
   0.14 =>  1.29 ~  1.31
   0.43 =>  1.29 ~  1.32
  -1.59 =>  1.29 ~  1.32
  -0.13 =>  1.29 ~  1.32
  -0.30 =>  1.29 ~  1.33
  -0.64 =>  1.29 ~  1.32
   1.25 =>  1.29 ~  1.34
Test sequence 3
Input => Target ~ Predicted
  -0.44 => -0.44 ~ -0.44
   0.62 =>  0.62 ~  0.63
  -0.06 =>  0.62 ~  0.64
   1.33 =>  1.33 ~  1.34
   1.23 =>  1.33 ~  1.36
   0.40 =>  1.33 ~  1.37
   0.32 =>  1.33 ~  1.38
  -0.11 =>  1.33 ~  1.38
  -0.75 =>  1.33 ~  1.38
   0.25 =>  1.33 ~  1.38
Test sequence 4
Input => Target ~ Predicted
  -0.10 => -0.10 ~ -0.11
   0.17 =>  0.17 ~  0.17
   0.01 =>  0.17 ~  0.19
   0.66 =>  0.66 ~  0.65
   0.79 =>  0.79 ~  0.80
  -0.81 =>  0.79 ~  0.81
  -0.85 =>  0.79 ~  0.82
  -0.19 =>  0.79 ~  0.83
   0.05 =>  0.79 ~  0.83
   0.55 =>  0.79 ~  0.82
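The predictions track the running maximum closely. As a closing note, the MODEL::State we allocated earlier is intended for step-by-step inference, where the GRU hidden state is carried across calls instead of processing whole sequences at once. The following is only a sketch of that usage: rlt::reset, rlt::evaluate_step, and their argument order are assumptions here, not verified API:

// step-by-step inference with an explicit hidden state (sketch)
rlt::Tensor<rlt::tensor::Specification<T, TI, rlt::tensor::Shape<TI, BATCH_SIZE, OUTPUT_DIM>>> output_step;
rlt::malloc(device, output_step);
rlt::reset(device, model, state, rng); // initialize the GRU hidden state (assumed API)
for(TI step_i = 0; step_i < SEQUENCE_LENGTH; step_i++){
    auto input_step = rlt::view(device, input, step_i, rlt::tensor::ViewSpec<0>{}); // single step: (BATCH_SIZE x INPUT_DIM)
    rlt::evaluate_step(device, model, input_step, state, output_step, buffer, rng); // advances the state by one step (assumed API)
}
rlt::free(device, output_step);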