Recurrent Neural Networks (RNNs)
For now, RLtools only supports the GRU (Gated Recurrent Unit), a widely used and time-tested RNN architecture.
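For reference, a GRU maintains a hidden state h_t that is updated at every step from the input x_t through a reset gate r_t and an update gate z_t. The following is the standard formulation in the PyTorch convention; the exact gate ordering and parameterization of the RLtools implementation is not spelled out here, so take this as a reference for the architecture rather than a specification of the library internals:

$$
\begin{aligned}
r_t &= \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr})\\
z_t &= \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz})\\
n_t &= \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn}))\\
h_t &= (1 - z_t) \odot n_t + z_t \odot h_{t-1}
\end{aligned}
$$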
In this example, we show the supervised training of a simple sequence model that learns the operation output = max(inputs): at each step, the target is the maximum of all inputs seen so far (the model is causal, so it cannot look ahead). As in the previous examples, we import the required data structures and models first:
[1]:
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <iomanip> // std::fixed/std::setprecision/std::setw are used when printing the predictions below
#define RL_TOOLS_BACKEND_ENABLE_OPENBLAS
#include <rl_tools/operations/cpu_mux.h>
#include <rl_tools/nn/optimizers/adam/instance/operations_generic.h>
#include <rl_tools/nn/operations_cpu_mux.h>
#include <rl_tools/nn/layers/gru/operations_generic.h>
#include <rl_tools/nn_models/sequential/operations_generic.h>
#include <rl_tools/nn/optimizers/adam/operations_generic.h>
#include <rl_tools/nn/loss_functions/mse/operations_generic.h>
namespace rlt = rl_tools;
#pragma cling load("openblas")
Then we set up the environment:
[2]:
using T = float;
using DEVICE = rlt::devices::DEVICE_FACTORY<rlt::devices::DefaultCPUSpecification>;
using RNG = DEVICE::SPEC::RANDOM::ENGINE<>;
using TI = typename DEVICE::index_t;
constexpr bool DYNAMIC_ALLOCATION = true;
Now we can configure the sequence model. Here we use a GRU that takes the input directly and transforms it into its latent space. This latent space is then decoded by the OUTPUT_LAYER to predict the outputs:
[3]:
constexpr TI SEQUENCE_LENGTH = 10;
constexpr TI BATCH_SIZE = 10;
constexpr TI INPUT_DIM = 1;
constexpr TI HIDDEN_DIM = 8;
constexpr TI OUTPUT_DIM = 1;
using INPUT_SHAPE = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, BATCH_SIZE, INPUT_DIM>;
using GRU_CONFIG = rlt::nn::layers::gru::Configuration<T, TI, HIDDEN_DIM, rlt::nn::parameters::groups::Normal, DYNAMIC_ALLOCATION>;
using GRU = rlt::nn::layers::gru::BindConfiguration<GRU_CONFIG>;
using OUTPUT_LAYER_CONFIG = rlt::nn::layers::dense::Configuration<T, TI, OUTPUT_DIM, rlt::nn::activation_functions::IDENTITY>;
using OUTPUT_LAYER = rlt::nn::layers::dense::BindConfiguration<OUTPUT_LAYER_CONFIG>;
As usual, we assemble these layers into a nn_models::sequential, which is a sequence of layers and implements compile-time autodiff:
[4]:
template <typename T_CONTENT, typename T_NEXT_MODULE = rlt::nn_models::sequential::OutputModule>
using Module = typename rlt::nn_models::sequential::Module<T_CONTENT, T_NEXT_MODULE>;
using MODULE_CHAIN = Module<GRU, Module<OUTPUT_LAYER>>;
using CAPABILITY = rlt::nn::capability::Gradient<rlt::nn::parameters::Adam>;
using MODEL = rlt::nn_models::sequential::Build<CAPABILITY, MODULE_CHAIN, INPUT_SHAPE>;
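Since the sequential module chain infers tensor shapes at compile time from INPUT_SHAPE, we can sanity-check the assembled model with static_asserts. This is a minimal sketch that assumes MODEL::OUTPUT_SHAPE exposes the same FIRST and GET<> accessors as the Shape types used in the dataset helper further below:

// compile-time checks of the time-major (sequence x batch x feature) output layout
static_assert(MODEL::OUTPUT_SHAPE::FIRST == SEQUENCE_LENGTH);
static_assert(MODEL::OUTPUT_SHAPE::template GET<1> == BATCH_SIZE);
static_assert(MODEL::OUTPUT_SHAPE::template GET<2> == OUTPUT_DIM);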
We need an optimizer as well, of course:
[5]:
struct ADAM_PARAMS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
    static constexpr T ALPHA = 0.003; // learning rate; the remaining coefficients are inherited from the TensorFlow-style defaults
};
using ADAM_SPEC = rlt::nn::optimizers::adam::Specification<T, TI, ADAM_PARAMS>;
using OPTIMIZER = rlt::nn::optimizers::Adam<ADAM_SPEC>;
Now we can instantiate, allocate and initialize the data structures:
[6]:
constexpr TI DATASET_SIZE = 1000;
constexpr TI TESTSET_SIZE = 100;
DEVICE device;
RNG rng;
MODEL model;
MODEL::Buffer<> buffer;
using TEST_MODEL_TMP = MODEL::template CHANGE_BATCH_SIZE<TI, TESTSET_SIZE>; // inference-only model with the test set size as its batch size
using TEST_MODEL = TEST_MODEL_TMP::template CHANGE_CAPABILITY<rlt::nn::capability::Forward<>>;
TEST_MODEL test_model;
TEST_MODEL::Buffer<> test_buffer;
MODEL::State<> state; // model state (e.g. the GRU hidden state); allocated here but not used in this batch-wise example
OPTIMIZER optimizer;
rlt::Tensor<rlt::tensor::Specification<T, TI, MODEL::INPUT_SHAPE>> input;
rlt::Tensor<rlt::tensor::Specification<T, TI, MODEL::OUTPUT_SHAPE>> output_target, d_output;
using DATASET_SHAPE = rlt::tensor::Shape<TI, DATASET_SIZE, SEQUENCE_LENGTH, INPUT_DIM>;
using DATASET_TARGET_SHAPE = rlt::tensor::Shape<TI, DATASET_SIZE, SEQUENCE_LENGTH, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, DATASET_SHAPE>> dataset_X;
rlt::Tensor<rlt::tensor::Specification<T, TI, DATASET_TARGET_SHAPE>> dataset_y;
using TESTSET_SHAPE = rlt::tensor::Shape<TI, TESTSET_SIZE, SEQUENCE_LENGTH, INPUT_DIM>;
using TESTSET_TARGET_SHAPE = rlt::tensor::Shape<TI, TESTSET_SIZE, SEQUENCE_LENGTH, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_SHAPE>> testset_X;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE>> testset_y;
using TESTSET_SHAPE_PERMUTED = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, TESTSET_SIZE, INPUT_DIM>;
using TESTSET_TARGET_SHAPE_PERMUTED = rlt::tensor::Shape<TI, SEQUENCE_LENGTH, TESTSET_SIZE, OUTPUT_DIM>;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_SHAPE_PERMUTED>> testset_X_permuted;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE_PERMUTED>> testset_y_permuted;
rlt::Tensor<rlt::tensor::Specification<T, TI, TESTSET_TARGET_SHAPE_PERMUTED>> testset_output_permuted;
rlt::init(device);
rlt::malloc(device, rng);
constexpr TI SEED = 0;
rlt::init(device, rng, SEED);
rlt::malloc(device, model);
rlt::malloc(device, test_model);
rlt::malloc(device, buffer);
rlt::malloc(device, test_buffer);
rlt::malloc(device, state);
rlt::malloc(device, optimizer);
rlt::malloc(device, input);
rlt::malloc(device, output_target);
rlt::malloc(device, d_output);
rlt::malloc(device, dataset_X);
rlt::malloc(device, dataset_y);
rlt::malloc(device, testset_X);
rlt::malloc(device, testset_y);
rlt::malloc(device, testset_X_permuted);
rlt::malloc(device, testset_y_permuted);
rlt::malloc(device, testset_output_permuted);
rlt::init_weights(device, model, rng);
rlt::reset_optimizer_state(device, optimizer, model);
The toy task we are facing here is output = max(inputs), hence we sample random numbers from a Gaussian and set each target to the running maximum of the inputs up to that step. For example, the input sequence [0.2, -1.0, 0.9] yields the targets [0.2, 0.2, 0.9]:
[7]:
template <typename DATASET_X, typename DATASET_Y>
void max_dataset(DATASET_X& dataset_X, DATASET_Y& dataset_y){
    static_assert(DATASET_X::SHAPE::FIRST == DATASET_Y::SHAPE::FIRST);
    static_assert(DATASET_X::SHAPE::template GET<1> == DATASET_Y::SHAPE::template GET<1>);
    rlt::randn(device, dataset_X, rng);
    for(TI sample_i = 0; sample_i < DATASET_X::SHAPE::FIRST; sample_i++){
        T max;
        bool max_set = false;
        for(TI step_i = 0; step_i < DATASET_X::SHAPE::template GET<1>; step_i++){
            T el = rlt::get(device, dataset_X, sample_i, step_i, 0);
            if(!max_set || el > max){
                max_set = true;
                max = el;
            }
            rlt::set(device, dataset_y, max, sample_i, step_i, 0); // target = running max up to this step
        }
    }
}
We want to generate a training set and a test set. We created a test_model that natively operates on BATCH_SIZE = TESTSET_SIZE, so we can feed the test set into it directly, without an additional batched loop. The standard input format in RLtools is (SEQUENCE_STEPS x BATCH_SAMPLES x FEATURES), hence we permute the dataset generated with the previous function:
[8]:
max_dataset(dataset_X, dataset_y);
max_dataset(testset_X, testset_y);
// models operate on (SEQUENCE_STEP x BATCH_SIZE x FEATURE_DIM) for performance reasons:
auto permuted_X = rlt::permute(device, testset_X, rlt::tensor::PermutationSpec<0, 1>{});
auto permuted_y = rlt::permute(device, testset_y, rlt::tensor::PermutationSpec<0, 1>{});
rlt::copy(device, device, permuted_X, testset_X_permuted);
rlt::copy(device, device, permuted_y, testset_y_permuted);
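As a quick spot check that the permutation behaved as expected (a sketch using only the rlt::get accessor that also appears further below), element (sample, step, feature) of the original layout should match element (step, sample, feature) of the permuted one:

T original_value = rlt::get(device, testset_X, 0, 3, 0); // sample 0, step 3
T permuted_value = rlt::get(device, testset_X_permuted, 3, 0, 0); // step 3, sample 0
std::cout << "original: " << original_value << " permuted: " << permuted_value << std::endl;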
Here is an example of an input sequence and the expected output:
[9]:
std::cout << "Input: \n";
rlt::print(device, rlt::view(device, dataset_X, 0));
std::cout << "Expected output: " << std::endl;
rlt::print(device, rlt::view(device, dataset_y, 0));
Input:
-1.507621e+00
1.071986e+00
8.269271e-01
1.601774e+00
-1.074195e+00
-5.420533e-01
-6.830205e-01
1.492320e+00
6.583855e-02
9.513746e-01
Expected output:
-1.507621e+00
1.071986e+00
1.071986e+00
1.601774e+00
1.601774e+00
1.601774e+00
1.601774e+00
1.601774e+00
1.601774e+00
1.601774e+00
Now we have everything in place to train the model:
[10]:
std::vector<TI> indices(DATASET_SIZE);
std::iota(indices.begin(), indices.end(), 0); // fill with the range 0..DATASET_SIZE-1
constexpr TI N_EPOCH = 100;
for(TI epoch_i = 0; epoch_i < N_EPOCH; epoch_i++){
    T epoch_loss = 0;
    std::shuffle(indices.begin(), indices.end(), rng.engine);
    for(TI batch_i = 0; batch_i < DATASET_SIZE / BATCH_SIZE; batch_i++){
        // assemble the time-major batch from randomly shuffled dataset samples
        for(TI sequence_i = 0; sequence_i < BATCH_SIZE; sequence_i++){
            TI index = BATCH_SIZE * batch_i + sequence_i;
            auto input_sample = rlt::view(device, input, sequence_i, rlt::tensor::ViewSpec<1>{});
            auto output_sample = rlt::view(device, output_target, sequence_i, rlt::tensor::ViewSpec<1>{});
            auto dataset_input_sample = rlt::view(device, dataset_X, indices[index], rlt::tensor::ViewSpec<0>{});
            auto dataset_output_sample = rlt::view(device, dataset_y, indices[index], rlt::tensor::ViewSpec<0>{});
            rlt::copy(device, device, dataset_input_sample, input_sample);
            rlt::copy(device, device, dataset_output_sample, output_sample);
        }
        rlt::forward(device, model, input, buffer, rng);
        auto output = rlt::output(device, model);
        auto output_matrix_view = rlt::matrix_view(device, output);
        auto output_target_matrix_view = rlt::matrix_view(device, output_target);
        auto d_output_matrix_view = rlt::matrix_view(device, d_output);
        rlt::nn::loss_functions::mse::gradient(device, output_matrix_view, output_target_matrix_view, d_output_matrix_view);
        T batch_loss = rlt::nn::loss_functions::mse::evaluate(device, output_matrix_view, output_target_matrix_view);
        epoch_loss += batch_loss;
        rlt::zero_gradient(device, model);
        rlt::backward(device, model, input, d_output, buffer);
        rlt::step(device, optimizer, model);
    }
    epoch_loss /= DATASET_SIZE / BATCH_SIZE;
    if((epoch_i+1) % 5 == 0){
        rlt::copy(device, device, model, test_model); // copy the trained weights into the inference-only test model
        rlt::evaluate(device, test_model, testset_X_permuted, testset_output_permuted, test_buffer, rng);
        T test_loss = rlt::nn::loss_functions::mse::evaluate(device, testset_output_permuted, testset_y_permuted);
        std::cout << "Epoch " << (epoch_i+1) << " train loss: " << epoch_loss << " test loss: " << test_loss << std::endl;
    }
}
Epoch 5 train loss: 4.026309e-02 test loss: 3.559775e-02
Epoch 10 train loss: 1.722529e-02 test loss: 1.540120e-02
Epoch 15 train loss: 1.022129e-02 test loss: 7.987553e-03
Epoch 20 train loss: 6.953578e-03 test loss: 4.966619e-03
Epoch 25 train loss: 5.002954e-03 test loss: 3.588097e-03
Epoch 30 train loss: 3.616025e-03 test loss: 2.619849e-03
Epoch 35 train loss: 2.852915e-03 test loss: 2.276205e-03
Epoch 40 train loss: 2.285772e-03 test loss: 1.623374e-03
Epoch 45 train loss: 1.853573e-03 test loss: 1.247458e-03
Epoch 50 train loss: 1.495403e-03 test loss: 1.218547e-03
Epoch 55 train loss: 1.328354e-03 test loss: 1.133017e-03
Epoch 60 train loss: 1.153364e-03 test loss: 1.316995e-03
Epoch 65 train loss: 1.080623e-03 test loss: 8.134897e-04
Epoch 70 train loss: 9.213265e-04 test loss: 9.398991e-04
Epoch 75 train loss: 8.744255e-04 test loss: 1.394223e-03
Epoch 80 train loss: 8.876204e-04 test loss: 7.558167e-04
Epoch 85 train loss: 8.151167e-04 test loss: 6.435094e-04
Epoch 90 train loss: 7.562749e-04 test loss: 7.290478e-04
Epoch 95 train loss: 6.210597e-04 test loss: 6.309046e-04
Epoch 100 train loss: 5.976068e-04 test loss: 7.673404e-04
Now we check if the predictions of the model are plausible:
[11]:
std::cout << std::fixed << std::setprecision(2); // fixed-point printing
for(TI sequence_i = 0; sequence_i < 5; sequence_i++){
    std::cout << "Test sequence " << sequence_i << std::endl;
    std::cout << "Input => Target ~ Predicted" << std::endl;
    for(TI step_i = 0; step_i < SEQUENCE_LENGTH; step_i++){
        std::cout << " " << std::setw(5) << rlt::get(device, testset_X_permuted, step_i, sequence_i, 0);
        std::cout << " => " << std::setw(5) << rlt::get(device, testset_y_permuted, step_i, sequence_i, 0);
        std::cout << " ~ " << std::setw(5) << rlt::get(device, testset_output_permuted, step_i, sequence_i, 0);
        std::cout << std::endl;
    }
}
Test sequence 0
Input => Target ~ Predicted
0.65 => 0.65 ~ 0.66
0.89 => 0.89 ~ 0.88
1.38 => 1.38 ~ 1.37
0.74 => 1.38 ~ 1.37
-1.00 => 1.38 ~ 1.37
0.06 => 1.38 ~ 1.39
0.07 => 1.38 ~ 1.40
-0.98 => 1.38 ~ 1.39
1.20 => 1.38 ~ 1.39
0.12 => 1.38 ~ 1.40
Test sequence 1
Input => Target ~ Predicted
-0.08 => -0.08 ~ -0.09
0.21 => 0.21 ~ 0.21
-0.19 => 0.21 ~ 0.21
-1.47 => 0.21 ~ 0.21
-0.16 => 0.21 ~ 0.20
-1.06 => 0.21 ~ 0.21
-0.36 => 0.21 ~ 0.21
0.62 => 0.62 ~ 0.60
0.31 => 0.62 ~ 0.61
-0.30 => 0.62 ~ 0.63
Test sequence 2
Input => Target ~ Predicted
1.29 => 1.29 ~ 1.32
0.08 => 1.29 ~ 1.30
0.76 => 1.29 ~ 1.30
0.14 => 1.29 ~ 1.31
0.43 => 1.29 ~ 1.32
-1.59 => 1.29 ~ 1.32
-0.13 => 1.29 ~ 1.32
-0.30 => 1.29 ~ 1.33
-0.64 => 1.29 ~ 1.32
1.25 => 1.29 ~ 1.34
Test sequence 3
Input => Target ~ Predicted
-0.44 => -0.44 ~ -0.44
0.62 => 0.62 ~ 0.63
-0.06 => 0.62 ~ 0.64
1.33 => 1.33 ~ 1.34
1.23 => 1.33 ~ 1.36
0.40 => 1.33 ~ 1.37
0.32 => 1.33 ~ 1.38
-0.11 => 1.33 ~ 1.38
-0.75 => 1.33 ~ 1.38
0.25 => 1.33 ~ 1.38
Test sequence 4
Input => Target ~ Predicted
-0.10 => -0.10 ~ -0.11
0.17 => 0.17 ~ 0.17
0.01 => 0.17 ~ 0.19
0.66 => 0.66 ~ 0.65
0.79 => 0.79 ~ 0.80
-0.81 => 0.79 ~ 0.81
-0.85 => 0.79 ~ 0.82
-0.19 => 0.79 ~ 0.83
0.05 => 0.79 ~ 0.83
0.55 => 0.79 ~ 0.82