Different reproduced performance of Square

pengzhi1998 commented 3 months ago

Dear Authors,

Thank you so much again for providing such a great and helpful repo!!

I have been following this page to reproduce the results of Square (Nut Assembly Task). Here is the generated config file I'm using:

{
    "algo_name": "bc",
    "experiment": {
        "name": "core_bc_rnn_square_ph_image",
        "validate": true,
        "logging": {
            "terminal_output_to_txt": true,
            "log_tb": true,
            "log_wandb": false,
            "wandb_proj_name": "debug"
        },
        "save": {
            "enabled": true,
            "every_n_seconds": null,
            "every_n_epochs": 20,
            "epochs": [],
            "on_best_validation": false,
            "on_best_rollout_return": false,
            "on_best_rollout_success_rate": true
        },
        "epoch_every_n_steps": 500,
        "validation_epoch_every_n_steps": 50,
        "env": null,
        "additional_envs": null,
        "render": false,
        "render_video": true,
        "keep_all_videos": false,
        "video_skip": 5,
        "rollout": {
            "enabled": true,
            "n": 50,
            "horizon": 400,
            "rate": 20,
            "warmstart": 0,
            "terminate_on_success": true
        }
    },
    "train": {
        "data": "../../tests/assets/image.hdf5",
        "output_dir": "../../tests/assets/core/bc_rnn/square/ph/image/trained_models",
        "num_data_workers": 2,
        "hdf5_cache_mode": "low_dim",
        "hdf5_use_swmr": true,
        "hdf5_load_next_obs": false,
        "hdf5_normalize_obs": false,
        "hdf5_filter_key": "train",
        "hdf5_validation_filter_key": "valid",
        "seq_length": 10,
        "pad_seq_length": true,
        "frame_stack": 1,
        "pad_frame_stack": true,
        "dataset_keys": [
            "actions",
            "rewards",
            "dones"
        ],
        "goal_mode": null,
        "cuda": true,
        "batch_size": 16,
        "num_epochs": 600,
        "seed": 1
    },
    "algo": {
        "optim_params": {
            "policy": {
                "optimizer_type": "adam",
                "learning_rate": {
                    "initial": 0.0001,
                    "decay_factor": 0.1,
                    "epoch_schedule": [],
                    "scheduler_type": "multistep"
                },
                "regularization": {
                    "L2": 0.0
                }
            }
        },
        "loss": {
            "l2_weight": 1.0,
            "l1_weight": 0.0,
            "cos_weight": 0.0
        },
        "actor_layer_dims": [],
        "gaussian": {
            "enabled": false,
            "fixed_std": false,
            "init_std": 0.1,
            "min_std": 0.01,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "gmm": {
            "enabled": true,
            "num_modes": 5,
            "min_std": 0.0001,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "vae": {
            "enabled": false,
            "latent_dim": 14,
            "latent_clip": null,
            "kl_weight": 1.0,
            "decoder": {
                "is_conditioned": true,
                "reconstruction_sum_across_elements": false
            },
            "prior": {
                "learn": false,
                "is_conditioned": false,
                "use_gmm": false,
                "gmm_num_modes": 10,
                "gmm_learn_weights": false,
                "use_categorical": false,
                "categorical_dim": 10,
                "categorical_gumbel_softmax_hard": false,
                "categorical_init_temp": 1.0,
                "categorical_temp_anneal_step": 0.001,
                "categorical_min_temp": 0.3
            },
            "encoder_layer_dims": [
                300,
                400
            ],
            "decoder_layer_dims": [
                300,
                400
            ],
            "prior_layer_dims": [
                300,
                400
            ]
        },
        "rnn": {
            "enabled": true,
            "horizon": 10,
            "hidden_dim": 1000,
            "rnn_type": "LSTM",
            "num_layers": 2,
            "open_loop": false,
            "kwargs": {
                "bidirectional": false
            }
        },
        "transformer": {
            "enabled": false,
            "context_length": 10,
            "embed_dim": 512,
            "num_layers": 6,
            "num_heads": 8,
            "emb_dropout": 0.1,
            "attn_dropout": 0.1,
            "block_output_dropout": 0.1,
            "sinusoidal_embedding": false,
            "activation": "gelu",
            "supervise_all_steps": false,
            "nn_parameter_for_timesteps": true
        }
    },
    "observation": {
        "modalities": {
            "obs": {
                "low_dim": [
                    "robot0_eef_pos",
                    "robot0_eef_quat",
                    "robot0_gripper_qpos"
                ],
                "rgb": [
                    "agentview_image",
                    "robot0_eye_in_hand_image"
                ],
                "depth": [],
                "scan": []
            },
            "goal": {
                "low_dim": [],
                "rgb": [],
                "depth": [],
                "scan": []
            }
        },
        "encoder": {
            "low_dim": {
                "core_class": null,
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "rgb": {
                "core_class": "VisualCore",
                "core_kwargs": {
                    "feature_dimension": 64,
                    "backbone_class": "ResNet18Conv",
                    "backbone_kwargs": {
                        "pretrained": false,
                        "input_coord_conv": false
                    },
                    "pool_class": "SpatialSoftmax",
                    "pool_kwargs": {
                        "num_kp": 32,
                        "learnable_temperature": false,
                        "temperature": 1.0,
                        "noise_std": 0.0
                    }
                },
                "obs_randomizer_class": "CropRandomizer",
                "obs_randomizer_kwargs": {
                    "crop_height": 76,
                    "crop_width": 76,
                    "num_crops": 1,
                    "pos_enc": false
                }
            },
            "depth": {
                "core_class": "VisualCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "scan": {
                "core_class": "ScanCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            }
        }
    },
    "meta": {
        "hp_base_config_file": null,
        "hp_keys": [],
        "hp_values": []
    }
}

However, despite multiple evaluations during training and testing, the highest success rates I achieved were between 0.4 and 0.64. I haven't seen a result of 0.82 as reported in the paper. Could you advise on how to achieve this success rate? Did I make a mistake?

Thank you so much for your help and your precious time! I look forward to your reply!

Best regards, Pengzhi

amandlek commented 3 months ago

Your config looks accurate to me. What version of robosuite are you using, and which branch? It might be worth trying the offline_study branch of robosuite, along with v0.2 of robomimic (you would need to re-download the data though, since the image observations will look different). See this note.

pengzhi1998 commented 3 months ago

Thank you so much for your response and help! After training with the image_v141.hdf5 dataset, the performance has improved significantly.

(1) I noticed that the model I previously trained on the image-based demonstrations downloaded directly from robomimic v0.2 performed poorly. I'm wondering what differences between the two datasets (the image.hdf5 file from robomimic v0.2 and the image_v141.hdf5 extracted from demo_v141.hdf5 in robomimic v0.3) might be causing this discrepancy. Do they use different observations?
(2) Additionally, I found your tutorial on using pretrained models. However, I could only locate the link for the lift task (http://downloads.cs.stanford.edu/downloads/rt_benchmark/model_zoo/lift/bc_rnn/lift_ph_low_dim_epoch_1000_succ_100.pth). Where can I find pretrained models for other tasks with image inputs?

Thank you so much again for your guidance!

amandlek commented 3 months ago

Re (1): some textures in the environment have changed between robosuite v1.2 and v1.4 - this would explain why the model performance degraded (since you were training with image observations from v1.2 and evaluating with image observations from v1.4).

Re (2): The links should be here - but these were trained on robosuite v1.2.

pengzhi1998 commented 3 months ago

Thank you so much for your help!

ARISE-Initiative / robomimic

Different reproduced performance of Square #157