
Optimizing with PGA-AURORA in JAX

This notebook shows how to use QDax to find diverse and high-performing controllers in MDPs with PGA-AURORA. It can be run locally or on Google Colab. We recommend using a GPU. This notebook will show:

  • how to define the problem
  • how to create an emitter
  • how to create an AURORA instance and mix it with the right emitter to define PGA-AURORA
  • which functions must be defined before training
  • how to launch a certain number of training steps
  • how to visualise the optimization process
  • how to save/load a repertoire
#@title Installs and Imports
!pip install ipympl |tail -n 1
# %matplotlib widget
# from google.colab import output
# output.enable_custom_widget_manager()

import os

from IPython.display import clear_output
import functools
from typing import Dict, Any

import jax
import jax.numpy as jnp

try:
    import brax
except ImportError:
    !pip install git+https://github.com/google/brax.git@v0.9.2 |tail -n 1
    import brax

try:
    import flax
except ImportError:
    !pip install --no-deps git+https://github.com/google/flax.git@v0.7.4 |tail -n 1
    import flax

try:
    import chex
except ImportError:
    !pip install --no-deps git+https://github.com/deepmind/chex.git@v0.1.83 |tail -n 1
    import chex

try:
    import jumanji
except ImportError:
    !pip install "jumanji==0.3.1" |tail -n 1
    import jumanji

try:
    import qdax
except ImportError:
    !pip install --no-deps git+https://github.com/adaptive-intelligent-robotics/QDax@main |tail -n 1
    import qdax


from qdax.core.aurora import AURORA
from qdax.core.containers.unstructured_repertoire import UnstructuredRepertoire
from qdax import environments
from qdax.tasks.brax_envs import (
    create_default_brax_task_components,
    get_aurora_scoring_fn,
)
from qdax.environments.bd_extractors import (
    AuroraExtraInfoNormalization,
    get_aurora_encoding,
)
from qdax.core.neuroevolution.buffers.buffer import QDTransition
from qdax.core.neuroevolution.networks.networks import MLP
from qdax.core.emitters.mutation_operators import isoline_variation
from qdax.core.emitters.pga_me_emitter import PGAMEConfig, PGAMEEmitter

from qdax.types import Observation
from qdax.utils import train_seq2seq


if "COLAB_TPU_ADDR" in os.environ:
  from jax.tools import colab_tpu
  colab_tpu.setup_tpu()


clear_output()
#@title QD Training Definitions Fields
#@markdown ---
env_batch_size = 100 #@param {type:"number"}
env_name = 'walker2d_uni'#@param['ant_uni', 'hopper_uni', 'walker2d_uni', 'halfcheetah_uni', 'humanoid_uni', 'ant_omni', 'humanoid_omni']
episode_length = 250 #@param {type:"integer"}
max_iterations = 50 #@param {type:"integer"}
seed = 42 #@param {type:"integer"}
policy_hidden_layer_sizes = (64, 64) #@param {type:"raw"}
iso_sigma = 0.005 #@param {type:"number"}
line_sigma = 0.05 #@param {type:"number"}
num_init_cvt_samples = 50000 #@param {type:"integer"}
num_centroids = 1024 #@param {type:"integer"}
min_bd = 0. #@param {type:"number"}
max_bd = 1.0 #@param {type:"number"}

lstm_batch_size = 128 #@param {type:"integer"}

observation_option = "no_sd" #@param['no_sd', 'only_sd', 'full']
hidden_size = 5 #@param {type:"integer"}
l_value_init = 0.2 #@param {type:"number"}

traj_sampling_freq = 10 #@param {type:"integer"}
max_observation_size = 25 #@param {type:"integer"}
prior_descriptor_dim = 2 #@param {type:"integer"}

proportion_mutation_ga = 0.5 #@param {type:"number"}

# TD3 params
replay_buffer_size = 1000000 #@param {type:"number"}
critic_hidden_layer_size = (256, 256) #@param {type:"raw"}
critic_learning_rate = 3e-4 #@param {type:"number"}
greedy_learning_rate = 3e-4 #@param {type:"number"}
policy_learning_rate = 1e-3 #@param {type:"number"}
noise_clip = 0.5 #@param {type:"number"}
policy_noise = 0.2 #@param {type:"number"}
discount = 0.99 #@param {type:"number"}
reward_scaling = 1.0 #@param {type:"number"}
transitions_batch_size = 256 #@param {type:"number"}
soft_tau_update = 0.005 #@param {type:"number"}
num_critic_training_steps = 300 #@param {type:"number"}
num_pg_training_steps = 100 #@param {type:"number"}
policy_delay = 2 #@param {type:"number"}

log_freq = 5 #@param {type:"integer"}
#@markdown ---

Init environment, policy, population params, init states of the env

Define the environment in which the policies will be trained. In this notebook, we focus on controllers learning to move a robot in a physical simulation. We also define the policy architecture shared by every individual in the population: once this architecture is fixed, each individual is fully described by its parameters, which correspond to its genotype.

# Init environment
env = environments.create(env_name, episode_length=episode_length)

# Init a random key
random_key = jax.random.PRNGKey(seed)

# Init policy network
policy_layer_sizes = policy_hidden_layer_sizes + (env.action_size,)
policy_network = MLP(
    layer_sizes=policy_layer_sizes,
    kernel_init=jax.nn.initializers.lecun_uniform(),
    final_activation=jnp.tanh,
)

# Init population of controllers
random_key, subkey = jax.random.split(random_key)
keys = jax.random.split(subkey, num=env_batch_size)
fake_batch = jnp.zeros(shape=(env_batch_size, env.observation_size))
init_variables = jax.vmap(policy_network.init)(keys, fake_batch)


# Create the initial environment states
random_key, subkey = jax.random.split(random_key)
keys = jnp.repeat(jnp.expand_dims(subkey, axis=0), repeats=env_batch_size, axis=0)
reset_fn = jax.jit(jax.vmap(env.reset))
init_states = reset_fn(keys)

Define the way the policy interacts with the env

Now that the environment and policy have been defined, we need a function that describes how the policy interacts with the environment and how transition data is stored.

# Define the function to play a step with the policy in the environment
def play_step_fn(
  env_state,
  policy_params,
  random_key,
):
    """
    Play an environment step and return the updated state and the transition.
    """

    actions = policy_network.apply(policy_params, env_state.obs)

    state_desc = env_state.info["state_descriptor"]
    next_state = env.step(env_state, actions)

    transition = QDTransition(
        obs=env_state.obs,
        next_obs=next_state.obs,
        rewards=next_state.reward,
        dones=next_state.done,
        actions=actions,
        truncations=next_state.info["truncation"],
        state_desc=state_desc,
        next_state_desc=next_state.info["state_descriptor"],
    )

    return next_state, policy_params, random_key, transition

Define the scoring function and the way metrics are computed

The scoring function is used in the evaluation step to determine the fitness of each individual and to collect the observations from which AURORA's encoder later computes the behaviour descriptors.

# Prepare the scoring function
env, policy_network, scoring_fn, random_key = create_default_brax_task_components(
    env_name=env_name,
    random_key=random_key,
)

def observation_extractor_fn(
    data: QDTransition,
) -> Observation:
    """Extract observation from the state."""
    state_obs = data.obs[:, ::traj_sampling_freq, :max_observation_size]

    # add the x/y position - (batch_size, traj_length, 2)
    state_desc = data.state_desc[:, ::traj_sampling_freq]

    if observation_option == "full":
        observations = jnp.concatenate([state_desc, state_obs], axis=-1)
    elif observation_option == "no_sd":
        observations = state_obs
    elif observation_option == "only_sd":
        observations = state_desc
    else:
        raise ValueError("Unknown observation option.")

    return observations

# Prepare the scoring function
aurora_scoring_fn = get_aurora_scoring_fn(
    scoring_fn=scoring_fn,
    observation_extractor_fn=observation_extractor_fn,
)

# Get minimum reward value to make sure qd_score are positive
reward_offset = environments.reward_offset[env_name]

# Define a metrics function
def metrics_fn(repertoire: UnstructuredRepertoire) -> Dict:

    # Get metrics
    grid_empty = repertoire.fitnesses == -jnp.inf
    qd_score = jnp.sum(repertoire.fitnesses, where=~grid_empty)
    # Add offset for positive qd_score
    qd_score += reward_offset * episode_length * jnp.sum(1.0 - grid_empty)
    coverage = 100 * jnp.mean(1.0 - grid_empty)
    max_fitness = jnp.max(repertoire.fitnesses)

    return {"qd_score": qd_score, "max_fitness": max_fitness, "coverage": coverage}

Define the emitter

The emitter is used to evolve the population at each iteration. Here we use the PGA-ME emitter, which mixes GA variations (isoline variation) with TD3-based policy-gradient mutations in a proportion set by proportion_mutation_ga; combining it with AURORA's learned descriptors yields PGA-AURORA.

# Define the PG-emitter config
pga_emitter_config = PGAMEConfig(
    env_batch_size=env_batch_size,
    batch_size=transitions_batch_size,
    proportion_mutation_ga=proportion_mutation_ga,
    critic_hidden_layer_size=critic_hidden_layer_size,
    critic_learning_rate=critic_learning_rate,
    greedy_learning_rate=greedy_learning_rate,
    policy_learning_rate=policy_learning_rate,
    noise_clip=noise_clip,
    policy_noise=policy_noise,
    discount=discount,
    reward_scaling=reward_scaling,
    replay_buffer_size=replay_buffer_size,
    soft_tau_update=soft_tau_update,
    num_critic_training_steps=num_critic_training_steps,
    num_pg_training_steps=num_pg_training_steps,
    policy_delay=policy_delay,
)
# Get the emitter
variation_fn = functools.partial(
    isoline_variation, iso_sigma=iso_sigma, line_sigma=line_sigma
)

pg_emitter = PGAMEEmitter(
    config=pga_emitter_config,
    policy_network=policy_network,
    env=env,
    variation_fn=variation_fn,
)

Instantiate and initialise the AURORA algorithm

aurora_dims = hidden_size
centroids = jnp.zeros(shape=(num_centroids, aurora_dims))

@jax.jit
def update_scan_fn(carry: Any, unused: Any) -> Any:
    """Scan the udpate function."""
    (
        repertoire,
        emitter_state,
        random_key,
        aurora_extra_info
    ) = carry

    # update
    (repertoire, emitter_state, metrics, random_key,) = aurora.update(
        repertoire,
        emitter_state,
        random_key,
        aurora_extra_info=aurora_extra_info,
    )

    return (
        (repertoire, emitter_state, random_key, aurora_extra_info),
        metrics,
    )

# Init algorithm
# AutoEncoder Params and INIT
obs_dim = jnp.minimum(env.observation_size, max_observation_size)
if observation_option == "full":
    observations_dims = (
        episode_length // traj_sampling_freq,
        obs_dim + prior_descriptor_dim,
    )
elif observation_option == "no_sd":
    observations_dims = (
        episode_length // traj_sampling_freq,
        obs_dim,
    )
elif observation_option == "only_sd":
    observations_dims = (episode_length // traj_sampling_freq, prior_descriptor_dim)
else:
    raise ValueError("The chosen option is not correct.")
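
As a quick, illustrative check of the shapes fed to the autoencoder (the feature dimension depends on the chosen Brax environment and on observation_option, so the printout below is the reliable reference):

# Illustrative check: with episode_length = 250 and traj_sampling_freq = 10,
# the autoencoder receives sequences of 250 // 10 = 25 time steps.
print("observation dims (sequence length, features):", observations_dims)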

# Define the seq2seq model
model = train_seq2seq.get_model(
    observations_dims[-1], True, hidden_size=hidden_size
)

# Init the model params
random_key, subkey = jax.random.split(random_key)
model_params = train_seq2seq.get_initial_params(
    model, subkey, (1, *observations_dims)
)

print(jax.tree_util.tree_map(lambda x: x.shape, model_params))

# Define the encoder function
encoder_fn = jax.jit(
    functools.partial(
        get_aurora_encoding,
        model=model,
    )
)

# Define the training function
train_fn = functools.partial(
    train_seq2seq.lstm_ae_train,
    model=model,
    batch_size=lstm_batch_size,
)

# Instantiate AURORA
aurora = AURORA(
    scoring_function=aurora_scoring_fn,
    emitter=pg_emitter,
    metrics_function=metrics_fn,
    encoder_function=encoder_fn,
    training_function=train_fn,
)

# define arbitrary initial mean/std for observation normalisation
mean_observations = jnp.zeros(observations_dims[-1])
std_observations = jnp.ones(observations_dims[-1])

# init all the information needed by AURORA to compute encodings
aurora_extra_info = AuroraExtraInfoNormalization.create(
    model_params,
    mean_observations,
    std_observations,
)

# init step of the aurora algorithm
repertoire, emitter_state, aurora_extra_info, random_key = aurora.init(
    init_variables,
    aurora_extra_info,
    jnp.asarray(l_value_init),
    max_observation_size,
    random_key,
)

# first training of AURORA: initialises the observation means/stds and the encoder
random_key, subkey = jax.random.split(random_key)
repertoire, aurora_extra_info = aurora.train(
    repertoire, model_params, iteration=0, random_key=subkey
)

# design aurora's schedule
default_update_base = 10
update_base = int(jnp.ceil(default_update_base / log_freq))
schedules = jnp.cumsum(jnp.arange(update_base, 1000, update_base))
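
As a quick illustration of the resulting schedule: with the default log_freq = 5, update_base = ceil(10 / 5) = 2 and the schedule starts at 2, 6, 12, 20, 30, ... The autoencoder is retrained whenever iteration + 1 reaches one of these values, so retraining becomes progressively less frequent over the run.

# Illustrative sanity check of the retraining schedule.
print("update_base:", update_base)
print("first retraining iterations:", schedules[:5])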

Launch AURORA iterations

current_step_estimation = 0
num_iterations = 0

# Main loop
n_target = 1024

previous_error = jnp.sum(repertoire.fitnesses != -jnp.inf) - n_target

iteration = 0
while iteration < max_iterations:

    (
        (repertoire, emitter_state, random_key, aurora_extra_info),
        metrics,
    ) = jax.lax.scan(
        update_scan_fn,
        (repertoire, emitter_state, random_key, aurora_extra_info),
        (),
        length=log_freq,
    )

    num_iterations = (iteration + 1) * log_freq

    # update nb steps estimation
    current_step_estimation += env_batch_size * episode_length * log_freq

    # autoencoder training steps and container size control (CSC)
    if (iteration + 1) in schedules:
        # train the autoencoder
        random_key, subkey = jax.random.split(random_key)
        repertoire, aurora_extra_info = aurora.train(
            repertoire, model_params, iteration, subkey
        )

    elif iteration % 2 == 0:
        repertoire, previous_error = aurora.container_size_control(
            repertoire,
            target_size=n_target,
            previous_error=previous_error,
        )


    iteration += 1

for k, v in metrics.items():
    print(k, " - ", v[-1])
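
Finally, to visualise the optimization process and to save/load the resulting repertoire (both promised in the introduction), here is a hedged sketch. Note that metrics only holds the values returned by the last jax.lax.scan window; to plot the whole run, accumulate each window's metrics in a list (a hypothetical all_metrics) inside the main loop and concatenate them. The save/load part assumes UnstructuredRepertoire exposes the same save/load interface as QDax's MapElitesRepertoire (flat arrays on disk plus a reconstruction function for the genotype PyTree); check the API of the QDax version you installed.

import matplotlib.pyplot as plt
from jax.flatten_util import ravel_pytree

# Plot the metrics collected during the last logging window. To cover the
# whole run, append each metrics dict inside the main loop and concatenate.
env_steps = jnp.arange(1, log_freq + 1) * env_batch_size * episode_length

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for ax, key in zip(axes, ("qd_score", "max_fitness", "coverage")):
    ax.plot(env_steps, metrics[key])
    ax.set_xlabel("environment steps (last window)")
    ax.set_ylabel(key)
fig.tight_layout()

# Save the repertoire to disk (assumption: UnstructuredRepertoire exposes the
# same save/load interface as MapElitesRepertoire).
repertoire_path = "./last_repertoire/"
os.makedirs(repertoire_path, exist_ok=True)
repertoire.save(path=repertoire_path)

# Build a reconstruction function mapping flat genotypes back to the
# policy-network PyTree, then reload the repertoire.
random_key, subkey = jax.random.split(random_key)
fake_params = policy_network.init(subkey, jnp.zeros(shape=(env.observation_size,)))
_, reconstruction_fn = ravel_pytree(fake_params)

loaded_repertoire = UnstructuredRepertoire.load(
    reconstruction_fn=reconstruction_fn, path=repertoire_path
)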