Custom Environment#
You can find a repo with a corresponding example/template at https://github.com/rl-tools/example
As always we first include the elementary operations
[1]:
#define RL_TOOLS_BACKEND_ENABLE_OPENBLAS
#include <rl_tools/operations/cpu_mux.h>
#include <rl_tools/nn/operations_cpu_mux.h>
#include <rl_tools/nn_models/operations_cpu.h>
namespace rlt = rl_tools;
#pragma cling load("openblas")
Next we define the data structures for our new environment. The main data structures are for the state of the environment (MyPendulumState
) and for the environment itself (MyPendulum
). As usual in RLtools, we assemble all template parameters of the environment into a specification (MyPendulumSpecification
) so that we do not need to repeat them in every function template. Furthermore we separate out the parameters. With RLtools environments we distinguish between three levels of
“state”: - Environment: Compile-time, should not change at runtime (though this is not enforced, to allow for hackability) - Parameters: Constant throughout an episode. This allows for e.g. domain randomization. It can also carry cues for the visualization (the constness during an episode is also not enforced but considered a best practice) - State: Sampled from the initial distribution at the beginning of an episode and can then change on every step
To work with the RLtools API, the environment data structure (MyPendulum
) needs to have the fields: - T
: Floating point type - TI
: Index/unsigned integer type - Parameters
: Parameters datastructure (can also purely contain compile-time constexpr
), should be a Plain Old Data (POD) structure so that it works well on GPUs and microcontrollers - State
: State datastructure, should be a Plain Old Data
(POD) for the same reasons - OBSERVATION_DIM
: Dimension of the observations - ACTION_DIM
: Dimension of the actions
[2]:
// Physical constants and initial-state sampling bounds of the pendulum.
// The values mirror the defaults of the classic Pendulum-v1 task.
template <typename T>
struct MyPendulumParameters {
constexpr static T G = 10; // gravitational acceleration [m/s^2]
constexpr static T MAX_SPEED = 8; // clip bound for the angular velocity [rad/s]
constexpr static T MAX_TORQUE = 2; // torque range the normalized action is scaled to
constexpr static T DT = 0.05; // integration time step [s]
constexpr static T M = 1; // pendulum mass
constexpr static T L = 1; // pendulum length
constexpr static T INITIAL_STATE_MIN_ANGLE = -rlt::math::PI<T>; // lower bound of the sampled initial angle [rad]
constexpr static T INITIAL_STATE_MAX_ANGLE = rlt::math::PI<T>; // upper bound of the sampled initial angle [rad]
constexpr static T INITIAL_STATE_MIN_SPEED = -1; // lower bound of the sampled initial angular velocity [rad/s]
constexpr static T INITIAL_STATE_MAX_SPEED = 1; // upper bound of the sampled initial angular velocity [rad/s]
};
// Bundles all template parameters of the environment so they do not have to be
// repeated in every function template.
template <typename T_T, typename T_TI, typename T_PARAMETERS = MyPendulumParameters<T_T>>
struct MyPendulumSpecification{
using T = T_T; // floating point type
using TI = T_TI; // index/unsigned integer type
using PARAMETERS = T_PARAMETERS; // compile-time parameters (see MyPendulumParameters)
};
// Dynamic state of the pendulum. Kept as plain-old-data so it works well on
// GPUs and microcontrollers; sampled at episode start, updated on every step.
template <typename T, typename TI>
struct MyPendulumState{
static constexpr TI DIM = 2; // number of state variables
T theta; // angle [rad]
T theta_dot; // angular velocity [rad/s]
};
// Tag type describing the observation layout: the angle is encoded as
// cos/sin, which avoids the discontinuity at +/- pi.
template <typename TI>
struct MyPendulumFourierObservation{
static constexpr TI DIM = 3; // cos(theta), sin(theta), theta_dot
};
// The environment type. Exposes the member types/constants the RLtools API
// expects (T, TI, Parameters, State, Observation, ACTION_DIM, ...).
template <typename T_SPEC>
struct MyPendulum{
using SPEC = T_SPEC;
using T = typename SPEC::T; // floating point type
using TI = typename SPEC::TI; // index/unsigned integer type
using Parameters = typename SPEC::PARAMETERS; // per-episode parameters (constant during an episode by convention)
using State = MyPendulumState<T, TI>; // POD state structure
using ObservationPrivileged = Observation; // fully observed: privileged observation equals the regular one
using Observation = MyPendulumFourierObservation<TI>; // observation produced by observe()
static constexpr TI ACTION_DIM = 1; // single action: normalized torque in [-1, 1]
static constexpr TI N_AGENTS = 1; // single-agent environment
};
Next we can start defining operations on these datastructures. Note that they should be in the rl_tools
namespace so that the RLtools algorithms (such as the on-/off-policy runner) can find and dispatch to them. If you want to use functions outside the rl_tools
namespace you can just implement proxy functions that call your arbitrary functions. In our case we do not need dynamic memory allocation or initialization, hence just implement them as a
NOP. The sample_initial_state
function samples random initial states and the initial_state
provides a deterministic initial state (for deterministic evaluations). In the case of the pendulum a reasonable choice for the latter could be e.g. the state where it is hanging downwards with zero velocity.
[3]:
namespace rl_tools{
// The pendulum needs no dynamic memory and no stateful initialization, so the
// following operations are NOPs. They still need to exist so the RLtools
// algorithms can dispatch to them.
template<typename DEVICE, typename SPEC>
static void malloc(DEVICE& device, const MyPendulum<SPEC>& env){}
template<typename DEVICE, typename SPEC>
static void free(DEVICE& device, const MyPendulum<SPEC>& env){}
template<typename DEVICE, typename SPEC>
static void init(DEVICE& device, const MyPendulum<SPEC>& env){}
// Parameters are compile-time constants here, so (re)sampling them is a NOP as
// well — this is the hook where domain randomization would go.
template<typename DEVICE, typename SPEC, typename RNG>
static void sample_initial_parameters(DEVICE& device, const MyPendulum<SPEC>& env, typename MyPendulum<SPEC>::Parameters& parameters, RNG& rng){ }
template<typename DEVICE, typename SPEC>
static void initial_parameters(DEVICE& device, const MyPendulum<SPEC>& env, typename MyPendulum<SPEC>::Parameters& parameters){ }
// Draws a random initial state: angle and angular velocity are sampled
// uniformly from the bounds configured in the parameters struct.
template<typename DEVICE, typename SPEC, typename RNG>
static void sample_initial_state(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, typename MyPendulum<SPEC>::State& state, RNG& rng){
    using PARAMS = typename SPEC::PARAMETERS;
    state.theta = random::uniform_real_distribution(typename DEVICE::SPEC::RANDOM(), PARAMS::INITIAL_STATE_MIN_ANGLE, PARAMS::INITIAL_STATE_MAX_ANGLE, rng);
    state.theta_dot = random::uniform_real_distribution(typename DEVICE::SPEC::RANDOM(), PARAMS::INITIAL_STATE_MIN_SPEED, PARAMS::INITIAL_STATE_MAX_SPEED, rng);
}
template<typename DEVICE, typename SPEC>
static void initial_state(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, typename MyPendulum<SPEC>::State& state){
state.theta = -rlt::math::PI<typename SPEC::T>;
state.theta_dot = 0;
}
}
In the following we define some helper functions. Note: the usage of rlt::math::xxx
for math functions seems tedious over e.g. std::xxx
but it allows to dispatch to the right math implementations on GPUs and microcontrollers and hence running the same code on any device.
[4]:
// Clamps x into the closed interval [min, max].
// Kept dependency-free (no <algorithm>) so it can run on any device.
template <typename T>
T clip(T x, T min, T max){
    if(x < min){
        return min;
    }
    if(x > max){
        return max;
    }
    return x;
}
// Floored modulo with Python's `%` semantics (result carries the sign of b).
// Uses rlt::math::floor so it dispatches to the proper device implementation.
template <typename DEVICE, typename T>
T f_mod_python(const DEVICE& dev, T a, T b){
    const T quotient = rlt::math::floor(dev, a / b);
    return a - b * quotient;
}
// Wraps an arbitrary angle into the interval [-pi, pi).
template <typename DEVICE, typename T>
T angle_normalize(const DEVICE& dev, T x){
    const T shifted = x + rlt::math::PI<T>;
    const T period = 2 * rlt::math::PI<T>;
    return f_mod_python(dev, shifted, period) - rlt::math::PI<T>;
}
Next we implement the most important operations (which resemble the OpenAI gym interface): - step
: Takes a state
, executes an action
and sets the next_state
- reward
: Returns the reward based on the state
, action
, and next_state
- observe
: Observes the state
. For fully observed environments this should basically just flatten the ::State
data structure and possibly apply some observation noise. For partially observable environments the observation can
also just contain parts of the information in the ::State
- terminated
: Returns a boolean flag signalling if the state
is a terminal state
[5]:
namespace rl_tools{
// Simulates one explicit-Euler step of the pendulum dynamics (same equations
// as the classic Pendulum-v1 task). The action is the normalized torque in
// [-1, 1]; it is scaled by MAX_TORQUE and clipped to the torque range.
// Writes the successor into `next_state` and returns the elapsed simulation
// time (the integration step DT).
template<typename DEVICE, typename SPEC, typename ACTION_SPEC, typename RNG>
static typename SPEC::T step(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, const typename MyPendulum<SPEC>::State& state, const Matrix<ACTION_SPEC>& action, typename MyPendulum<SPEC>::State& next_state, RNG& rng) {
    static_assert(ACTION_SPEC::ROWS == 1);
    static_assert(ACTION_SPEC::COLS == 1);
    using T = typename SPEC::T;
    using PARAMS = typename SPEC::PARAMETERS;
    T u_normalised = get(action, 0, 0);
    T u = PARAMS::MAX_TORQUE * u_normalised;
    T g = PARAMS::G;
    T m = PARAMS::M;
    T l = PARAMS::L;
    T dt = PARAMS::DT;
    u = clip(u, -PARAMS::MAX_TORQUE, PARAMS::MAX_TORQUE);
    // Angular acceleration = gravity term + torque term; T(3) keeps the whole
    // expression in T (the literal 3.0 would silently promote float to double).
    T newthdot = state.theta_dot + (3 * g / (2 * l) * rlt::math::sin(device.math, state.theta) + T(3) / (m * l * l) * u) * dt;
    newthdot = clip(newthdot, -PARAMS::MAX_SPEED, PARAMS::MAX_SPEED);
    T newth = state.theta + newthdot * dt;
    next_state.theta = newth;
    next_state.theta_dot = newthdot;
    return dt; // time advanced by this step
}
// Reward following the classic pendulum cost: penalizes deviation of the
// (normalized) angle from upright, angular velocity, and applied torque.
template<typename DEVICE, typename SPEC, typename ACTION_SPEC, typename RNG>
static typename SPEC::T reward(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, const typename MyPendulum<SPEC>::State& state, const Matrix<ACTION_SPEC>& action, const typename MyPendulum<SPEC>::State& next_state, RNG& rng){
    using T = typename SPEC::T;
    const T angle = angle_normalize(device.math, state.theta);
    const T torque = SPEC::PARAMETERS::MAX_TORQUE * get(action, 0, 0);
    const T cost = angle * angle + 0.1 * state.theta_dot * state.theta_dot + 0.001 * (torque * torque);
    return -cost;
}
template<typename DEVICE, typename SPEC, typename OBS_TYPE_SPEC, typename OBS_SPEC, typename RNG>
static void observe(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, const typename MyPendulum<SPEC>::State& state, const MyPendulumFourierObservation<OBS_TYPE_SPEC>&, Matrix<OBS_SPEC>& observation, RNG& rng){
static_assert(OBS_SPEC::ROWS == 1);
static_assert(OBS_SPEC::COLS == 3);
typedef typename SPEC::T T;
set(observation, 0, 0, rlt::math::cos(device.math, state.theta));
set(observation, 0, 1, rlt::math::sin(device.math, state.theta));
set(observation, 0, 2, state.theta_dot);
}
// The pendulum task has no terminal condition: episodes only end through the
// episode step limit. State is taken by const reference (it was passed by
// value before — a needless copy, inconsistent with every other operation).
template<typename DEVICE, typename SPEC, typename RNG>
RL_TOOLS_FUNCTION_PLACEMENT static bool terminated(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, const typename MyPendulum<SPEC>::State& state, RNG& rng){
    return false;
}
}
Since the training functions for the RL algorithms need to execute these operations they need to be defined before the RL data-collection and training operations. Hence in the following we include the RL (Loop Interface) operations. Note: when setting up your project you might want to assemble the previous data-structure definitions and operations into a header file so that all the #include
directives are at the beginning of your code (still remember
to include the header files for your environment before the RL operations). A recommended structure (that RLtools follows internally as well) is to put the environment (for this example) into a my_pendulum
directory. Then the datastructures are in my_pendulum/my_pendulum.h
and the operations are in my_pendulum/operations_generic.h
. operations_generic.h
means that these are pure C++, dependency-free operations that can run on any device. If you need to use external libraries
(e.g. std::xxx
or nlohmann::json
) you should separate out these operations into a device specific header, e.g. my_pendulum/operations_cpu.h
.
[6]:
#include <rl_tools/rl/algorithms/ppo/loop/core/config.h>
#include <rl_tools/rl/algorithms/ppo/loop/core/operations_generic.h>
#include <rl_tools/rl/loop/steps/evaluation/config.h>
#include <rl_tools/rl/loop/steps/evaluation/operations_generic.h>
Finally, we can use our new environment and train it using the Loop Interface (same as in the previous chapter):
[7]:
using DEVICE = rlt::devices::DEVICE_FACTORY<>; // device selected by cpu_mux based on the enabled backends (OpenBLAS here)
using RNG = decltype(rlt::random::default_engine(typename DEVICE::SPEC::RANDOM{})); // the device's default random number generator type
using T = float; // floating point type used for training
using TI = typename DEVICE::index_t; // index/unsigned integer type of the device
[8]:
using PENDULUM_SPEC = MyPendulumSpecification<T, TI, MyPendulumParameters<T>>; // assemble the environment's template parameters
using ENVIRONMENT = MyPendulum<PENDULUM_SPEC>; // the environment type used for training
[9]:
// Overrides of the default PPO loop parameters; unchanged values (e.g.
// ON_POLICY_RUNNER_STEPS_PER_ENV, N_ENVIRONMENTS) are inherited from the base.
struct LOOP_CORE_PARAMETERS: rlt::rl::algorithms::ppo::loop::core::DefaultParameters<T, TI, ENVIRONMENT>{
static constexpr TI EPISODE_STEP_LIMIT = 200; // maximum episode length
static constexpr TI TOTAL_STEP_LIMIT = 300000; // total number of environment interactions
static constexpr TI STEP_LIMIT = TOTAL_STEP_LIMIT/(ON_POLICY_RUNNER_STEPS_PER_ENV * N_ENVIRONMENTS) + 1; // number of PPO steps
};
using LOOP_CORE_CONFIG = rlt::rl::algorithms::ppo::loop::core::Config<T, TI, RNG, ENVIRONMENT, LOOP_CORE_PARAMETERS>; // assemble the core PPO loop configuration
[10]:
// Parameters of the evaluation step that is layered on top of the core loop.
template <typename NEXT>
struct LOOP_EVAL_PARAMETERS: rlt::rl::loop::steps::evaluation::Parameters<T, TI, NEXT>{
static constexpr TI EVALUATION_INTERVAL = 4; // evaluate every 4 PPO steps
static constexpr TI NUM_EVALUATION_EPISODES = 10; // episodes averaged per evaluation
static constexpr TI N_EVALUATIONS = NEXT::CORE_PARAMETERS::STEP_LIMIT / EVALUATION_INTERVAL; // total number of evaluations over the run
};
using LOOP_CONFIG = rlt::rl::loop::steps::evaluation::Config<LOOP_CORE_CONFIG, LOOP_EVAL_PARAMETERS<LOOP_CORE_CONFIG>>; // wrap the core config with the evaluation step
using LOOP_STATE = typename LOOP_CONFIG::template State<LOOP_CONFIG>; // state of the full training loop
[11]:
DEVICE device;
TI seed = 1; // fixed seed for reproducibility
LOOP_STATE ls; // holds all state of the training loop (models, buffers, optimizers, ...)
rlt::malloc(device, ls); // allocate the loop state
rlt::init(device, ls, seed); // initialize it with the given seed
ls.actor_optimizer.parameters.alpha = 1e-3; // increasing the learning rate leads to faster training of the Pendulum-v1 environment
ls.critic_optimizer.parameters.alpha = 1e-3;
[12]:
// Run PPO steps until rlt::step signals completion (returns true when done).
while(!rlt::step(device, ls)){
}
Step: 0/74 Mean return: -1300.94 Mean episode length: 200
Step: 4/74 Mean return: -1230.49 Mean episode length: 200
Step: 8/74 Mean return: -1117.24 Mean episode length: 200
Step: 12/74 Mean return: -702.198 Mean episode length: 200
Step: 16/74 Mean return: -545.253 Mean episode length: 200
Step: 20/74 Mean return: -417.902 Mean episode length: 200
Step: 24/74 Mean return: -212.429 Mean episode length: 200
Step: 28/74 Mean return: -160.032 Mean episode length: 200
Step: 32/74 Mean return: -156.78 Mean episode length: 200
Step: 36/74 Mean return: -171.889 Mean episode length: 200
Step: 40/74 Mean return: -146.42 Mean episode length: 200
Step: 44/74 Mean return: -111.336 Mean episode length: 200
Step: 48/74 Mean return: -196.376 Mean episode length: 200
Step: 52/74 Mean return: -182.804 Mean episode length: 200
Step: 56/74 Mean return: -160.231 Mean episode length: 200
Step: 60/74 Mean return: -183.741 Mean episode length: 200
Step: 64/74 Mean return: -413.776 Mean episode length: 200
Step: 68/74 Mean return: -318.449 Mean episode length: 200