Reinforcement Learning

AiDotNet provides 80+ reinforcement learning agents for training decision-making policies in interactive environments.

Overview

Reinforcement learning (RL) trains agents to make decisions by interacting with environments and learning from rewards.

Agent → Action → Environment → State, Reward → Agent (learns)

Quick Start

using AiDotNet.ReinforcementLearning;

// Create environment
var env = new CartPoleEnvironment<float>();

// Create agent
var agent = new DQNAgent<float>(new DQNConfig<float>
{
    StateSize = env.ObservationSpace.Shape[0],
    ActionSize = env.ActionSpace.N,
    HiddenLayers = new[] { 128, 128 },
    LearningRate = 1e-3f,
    Gamma = 0.99f,
    EpsilonStart = 1.0f,
    EpsilonEnd = 0.01f,
    EpsilonDecay = 0.995f,
    ReplayBufferSize = 100000,
    BatchSize = 64
});

// Training loop
for (int episode = 0; episode < 500; episode++)
{
    var state = env.Reset();
    float totalReward = 0;

    while (!env.Done)
    {
        var action = agent.SelectAction(state);
        var (nextState, reward, done, info) = env.Step(action);

        agent.Remember(state, action, reward, nextState, done);
        agent.Learn();

        state = nextState;
        totalReward += reward;
    }

    Console.WriteLine($"Episode {episode}: Reward = {totalReward}");
}
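
After training, it is common to evaluate the learned policy by running episodes without calling Remember or Learn. Below is a minimal sketch that reuses only the calls shown above; whether the agent exposes a dedicated greedy/evaluation mode depends on the implementation.

// Evaluation: the agent only selects actions; no experience is stored and no updates are made
for (int episode = 0; episode < 10; episode++)
{
    var state = env.Reset();
    float totalReward = 0;

    while (!env.Done)
    {
        var action = agent.SelectAction(state);
        var (nextState, reward, done, info) = env.Step(action);
        state = nextState;
        totalReward += reward;
    }

    Console.WriteLine($"Eval episode {episode}: Reward = {totalReward}");
}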

Value-Based Methods

Deep Q-Network (DQN)

var config = new DQNConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    HiddenLayers = new[] { 128, 128 },

    // Learning parameters
    LearningRate = 1e-3f,
    Gamma = 0.99f,  // Discount factor

    // Exploration
    EpsilonStart = 1.0f,
    EpsilonEnd = 0.01f,
    EpsilonDecay = 0.995f,

    // Experience replay
    ReplayBufferSize = 100000,
    BatchSize = 64,

    // Target network
    TargetUpdateFrequency = 100
};

var agent = new DQNAgent<float>(config);

Double DQN

var agent = new DoubleDQNAgent<float>(new DoubleDQNConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    // ... same as DQN
});

Dueling DQN

var agent = new DuelingDQNAgent<float>(new DuelingDQNConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    ValueStreamLayers = new[] { 128 },
    AdvantageStreamLayers = new[] { 128 },
    // ... other config
});

Rainbow DQN

Rainbow combines the major DQN improvements (double Q-learning, dueling networks, prioritized experience replay, noisy networks, distributional value estimates, and n-step returns):

var agent = new RainbowAgent<float>(new RainbowConfig<float>
{
    StateSize = 4,
    ActionSize = 2,

    // Distributional RL
    NumAtoms = 51,
    VMin = -10,
    VMax = 10,

    // Prioritized Experience Replay
    PriorityAlpha = 0.6f,
    PriorityBetaStart = 0.4f,

    // Noisy Networks (replaces epsilon-greedy)
    NoisyNetSigma = 0.5f,

    // N-step returns
    NSteps = 3
});

Policy Gradient Methods

REINFORCE

var agent = new REINFORCEAgent<float>(new REINFORCEConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    HiddenLayers = new[] { 128 },
    LearningRate = 1e-3f,
    Gamma = 0.99f,
    EntropyCoefficient = 0.01f
});

Proximal Policy Optimization (PPO)

Most popular policy gradient method:

var agent = new PPOAgent<float>(new PPOConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    HiddenLayers = new[] { 256, 256 },

    // PPO-specific
    ClipRatio = 0.2f,
    ValueCoefficient = 0.5f,
    EntropyCoefficient = 0.01f,

    // GAE
    Gamma = 0.99f,
    Lambda = 0.95f,

    // Training
    LearningRate = 3e-4f,
    NumEpochs = 10,
    MiniBatchSize = 64,
    NumEnvs = 8  // Parallel environments
});
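
PPO is on-policy and collects rollouts from NumEnvs parallel environments. The sketch below combines the VectorizedEnvironment and TrainAsync calls shown later on this page; whether TrainAsync accepts a vectorized environment directly is an assumption, so check the actual signature.

// Hypothetical wiring of PPO to 8 parallel environments
var envs = new VectorizedEnvironment<float>(
    makeEnv: () => new CartPoleEnvironment<float>(),
    numEnvs: 8);

await agent.TrainAsync(envs, numEpisodes: 1000);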

A2C / A3C

// Advantage Actor-Critic
var agent = new A2CAgent<float>(new A2CConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    HiddenLayers = new[] { 256, 256 },
    LearningRate = 7e-4f,
    ValueCoefficient = 0.5f,
    EntropyCoefficient = 0.01f,
    MaxGradNorm = 0.5f,
    NumEnvs = 8
});

// Asynchronous (multi-threaded)
var a3cAgent = new A3CAgent<float>(new A3CConfig<float>
{
    // ... same config as A2C
    NumWorkers = 8
});

Trust Region Policy Optimization (TRPO)

var agent = new TRPOAgent<float>(new TRPOConfig<float>
{
    StateSize = 4,
    ActionSize = 2,
    MaxKL = 0.01f,  // KL divergence constraint
    Damping = 0.1f,
    LineSearchSteps = 10
});

Actor-Critic Methods

Soft Actor-Critic (SAC)

State-of-the-art for continuous control:

var agent = new SACAgent<float>(new SACConfig<float>
{
    StateSize = 8,
    ActionSize = 4,  // Continuous actions
    ActionBounds = (-1f, 1f),
    HiddenLayers = new[] { 256, 256 },

    // SAC-specific
    Alpha = 0.2f,  // Temperature (entropy coefficient)
    AutoTuneAlpha = true,  // Automatic temperature tuning
    TargetEntropy = -4f,  // -dim(A)

    // Learning
    LearningRate = 3e-4f,
    Gamma = 0.99f,
    Tau = 0.005f,  // Soft target update
    BatchSize = 256,
    ReplayBufferSize = 1000000
});
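
For continuous-control agents the interaction loop looks the same as in the Quick Start, except that SelectAction returns an action vector within ActionBounds rather than a discrete index. A minimal sketch, assuming the PendulumEnvironment listed under Built-in Environments exposes the same Reset/Step/Done interface (adjust StateSize and ActionSize to match the environment you actually use):

// Continuous-control loop: action is a vector clipped to ActionBounds
var env = new PendulumEnvironment<float>();
var state = env.Reset();

while (!env.Done)
{
    var action = agent.SelectAction(state);                  // continuous action vector
    var (nextState, reward, done, info) = env.Step(action);
    agent.Remember(state, action, reward, nextState, done);
    agent.Learn();
    state = nextState;
}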

Twin Delayed DDPG (TD3)

var agent = new TD3Agent<float>(new TD3Config<float>
{
    StateSize = 8,
    ActionSize = 4,
    ActionBounds = (-1f, 1f),
    HiddenLayers = new[] { 256, 256 },

    // TD3-specific
    PolicyDelay = 2,  // Update policy every 2 critic updates
    TargetNoise = 0.2f,
    NoiseClip = 0.5f,
    ExplorationNoise = 0.1f,

    LearningRate = 3e-4f,
    Gamma = 0.99f,
    Tau = 0.005f
});

Deep Deterministic Policy Gradient (DDPG)

var agent = new DDPGAgent<float>(new DDPGConfig<float>
{
    StateSize = 8,
    ActionSize = 4,
    ActionBounds = (-1f, 1f),
    ActorLearningRate = 1e-4f,
    CriticLearningRate = 1e-3f,
    OUNoiseTheta = 0.15f,
    OUNoiseSigma = 0.2f
});

Model-Based RL

World Models

var worldModel = new WorldModel<float>(new WorldModelConfig<float>
{
    StateSize = 8,
    ActionSize = 4,
    LatentSize = 32,
    HiddenSize = 256,
    SequenceLength = 16
});

// Train world model
await worldModel.TrainAsync(trajectories);

// Plan using learned model
var agent = new ModelPredictiveControlAgent<float>(
    worldModel: worldModel,
    planningHorizon: 15,
    numSimulations: 1000);
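
The planning agent can then be dropped into a standard interaction loop. The sketch below assumes it exposes the same SelectAction interface used by the other agents on this page, with each call internally simulating the configured number of rollouts over the learned model.

// Act with the model-predictive-control agent (each SelectAction call plans
// over the learned world model before committing to an action)
var state = env.Reset();
while (!env.Done)
{
    var action = agent.SelectAction(state);
    var (nextState, reward, done, info) = env.Step(action);
    state = nextState;
}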

Dreamer

var agent = new DreamerAgent<float>(new DreamerConfig<float>
{
    StateSize = 64,
    ActionSize = 4,
    ImageObservation = true,

    // World model
    LatentSize = 30,
    RecurrentStateSize = 200,

    // Imagination
    ImaginationHorizon = 15,
    NumImaginations = 100,

    // Training
    ModelLearningRate = 6e-4f,
    ActorLearningRate = 8e-5f,
    CriticLearningRate = 8e-5f
});

Multi-Agent RL

MADDPG

var agents = new MADDPGAgents<float>(new MADDPGConfig<float>
{
    NumAgents = 3,
    StateSize = 8,
    ActionSize = 2,
    SharedCritic = true,
    LearningRate = 1e-3f
});

// Training with multiple agents (assumes a multi-agent environment
// returning per-agent states, rewards, and done flags)
var states = env.Reset();
while (!env.Done)
{
    var actions = agents.SelectActions(states);
    var (nextStates, rewards, dones, infos) = env.Step(actions);
    agents.Remember(states, actions, rewards, nextStates, dones);
    agents.Learn();
    states = nextStates;
}

QMIX

var agent = new QMIXAgent<float>(new QMIXConfig<float>
{
    NumAgents = 5,
    StateSize = 10,
    ActionSize = 5,
    MixingEmbedSize = 32,
    HypernetEmbedSize = 64
});

Environments

Built-in Environments

// Classic control
var cartPole = new CartPoleEnvironment<float>();
var mountainCar = new MountainCarEnvironment<float>();
var pendulum = new PendulumEnvironment<float>();
var acrobot = new AcrobotEnvironment<float>();

// Continuous control
var halfCheetah = new HalfCheetahEnvironment<float>();
var humanoid = new HumanoidEnvironment<float>();
var ant = new AntEnvironment<float>();

// Atari (requires ROM)
var breakout = new AtariEnvironment<float>("Breakout-v4");
var pong = new AtariEnvironment<float>("Pong-v4");

Custom Environment

public class MyEnvironment : Environment<float>
{
    public override Space ObservationSpace => new BoxSpace(low: -10, high: 10, shape: new[] { 4 });
    public override Space ActionSpace => new DiscreteSpace(3);

    public override float[] Reset()
    {
        // Reset environment and return initial state
        return new float[] { 0, 0, 0, 0 };
    }

    public override (float[] state, float reward, bool done, Dictionary<string, object> info) Step(int action)
    {
        // Execute the action and return the next state, reward, done flag, and info
        // (ComputeNextState, ComputeReward, and IsTerminal are placeholders for your own logic)
        var nextState = ComputeNextState(action);
        var reward = ComputeReward();
        var done = IsTerminal();

        return (nextState, reward, done, new Dictionary<string, object>());
    }
}
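
Once defined, the custom environment can be used with any discrete-action agent, exactly as in the Quick Start. A minimal sketch, assuming the base Environment<float> class provides the same Done flag used elsewhere on this page; the StateSize and ActionSize values simply mirror the BoxSpace and DiscreteSpace declared above.

// Train a DQN agent on the custom environment
var env = new MyEnvironment();
var agent = new DQNAgent<float>(new DQNConfig<float>
{
    StateSize = 4,   // matches the BoxSpace shape above
    ActionSize = 3,  // matches DiscreteSpace(3) above
    HiddenLayers = new[] { 128, 128 },
    LearningRate = 1e-3f,
    Gamma = 0.99f
});

for (int episode = 0; episode < 200; episode++)
{
    var state = env.Reset();
    while (!env.Done)
    {
        var action = agent.SelectAction(state);
        var (nextState, reward, done, info) = env.Step(action);
        agent.Remember(state, action, reward, nextState, done);
        agent.Learn();
        state = nextState;
    }
}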

Vectorized Environments

// Run multiple environments in parallel
var envs = new VectorizedEnvironment<float>(
    makeEnv: () => new CartPoleEnvironment<float>(),
    numEnvs: 8);

var states = envs.Reset();  // Shape: [8, 4]
var (nextStates, rewards, dones, infos) = envs.Step(actions);  // actions shape: [8]
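
A batched rollout steps all environments in lock-step. In the sketch below, agent stands for any of the agents configured above; the SelectActions call is an assumption modeled on the batched method used in the MADDPG example earlier on this page, so substitute the agent's actual API for producing one action per environment.

// Batched rollout sketch (finished environments are typically auto-reset by the wrapper)
var states = envs.Reset();
for (int step = 0; step < 1000; step++)
{
    var actions = agent.SelectActions(states);                     // one action per environment
    var (nextStates, rewards, dones, infos) = envs.Step(actions);
    states = nextStates;
}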

Training Utilities

Callbacks

var callbacks = new CallbackList<float>
{
    new EpisodeLoggerCallback(logEvery: 10),
    new TensorBoardCallback(logDir: "./logs"),
    new CheckpointCallback(saveEvery: 100, path: "./checkpoints"),
    new EarlyStoppingCallback(patience: 50, minDelta: 1.0f)
};

await agent.TrainAsync(env, numEpisodes: 1000, callbacks: callbacks);

Reward Shaping

var shapedEnv = new RewardShapingWrapper<float>(
    env: baseEnv,
    shapingFunction: (state, action, nextState, reward) =>
    {
        // Potential-based shaping: F(s, s') = gamma * Phi(s') - Phi(s)
        const float gamma = 0.99f;
        var shaping = gamma * Potential(nextState) - Potential(state);
        return reward + shaping;
    });
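
Potential is a user-supplied function Phi(s) estimating how promising a state is. Below is a minimal, hypothetical sketch for a CartPole-style task where index 2 of the state is the pole angle; both the function and the state layout are assumptions for illustration only.

// Hypothetical potential function: highest (zero) when the pole is upright,
// increasingly negative as the pole tilts
static float Potential(float[] state)
{
    float poleAngle = state[2];
    return -Math.Abs(poleAngle);
}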

Curriculum Learning

var curriculum = new CurriculumLearning<float>(
    initialDifficulty: 0.1f,
    maxDifficulty: 1.0f,
    successThreshold: 0.8f,
    windowSize: 100);

// Automatically increases difficulty as agent improves
var env = curriculum.WrapEnvironment(baseEnv);

Saving and Loading

// Save agent
await agent.SaveAsync("./agent_checkpoint");

// Load agent
var loadedAgent = await DQNAgent<float>.LoadAsync("./agent_checkpoint");

// Save just the policy network
await agent.SavePolicyAsync("./policy.aidotnet");

Algorithm Comparison

Algorithm   Type           Action Space   Sample Efficiency   Stability
DQN         Value          Discrete       Medium              High
Rainbow     Value          Discrete       High                High
PPO         Policy         Both           Low                 Very High
SAC         Actor-Critic   Continuous     High                High
TD3         Actor-Critic   Continuous     High                High
DDPG        Actor-Critic   Continuous     Medium              Medium
A2C         Policy         Both           Low                 Medium
