vwxyzjn / lm-human-preference-details

RLHF implementation details of OAI's 2019 codebase
MIT License

Various refactor #24

Closed · vwxyzjn closed this 1 year ago

vwxyzjn commented 1 year ago

This branch was actually what was used in the blog post 😅 I think it's ready to merge now.

vwxyzjn commented 1 year ago

Everything checks out. Plotting commands used for the comparisons:

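# Main comparison: this repo's `train_policy_accelerate` runs (tf-style Adam, GPT-2)
# vs OAI's original lm-human-preferences runs, on the sentiment and descriptiveness tasks.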
python -m openrlbenchmark.rlops_multi_metrics \
    --filters '?we=openrlbenchmark&wpn=lm-human-preferences&xaxis=elapsed/episodes&ceik=task_id&cen=task.value.policy.initial_model&metrics=objective/score&metrics=objective/kl&metrics=objective/entropy&metrics=objective/score_total&metrics=objective/kl_coef&metrics=ppo/loss/total&metrics=ppo/loss/value&metrics=ppo/loss/policy&metrics=ppo/policy/clipfrac&metrics=ppo/policy/entropy&metrics=ppo/returns/mean&metrics=ppo/policy/approxkl&metrics=ppo/val/clipfrac&metrics=ppo/val/error&metrics=ppo/val/mean&metrics=ppo/returns/var&metrics=ppo/val/vpred' \
        '124M?tag=refactor&tag=policy' \
    --filters '?we=openrlbenchmark&wpn=lm_human_preference_details&xaxis=ppo/episode&ceik=rewards.value.label_dataset&cen=exp_name&metrics=objective/scores&metrics=objective/kl&metrics=objective/entropy&metrics=objective/score_total&metrics=objective/kl_coef&metrics=ppo/loss/total&metrics=ppo/loss/value&metrics=ppo/loss/policy_avg&metrics=ppo/policy/clipfrac_avg&metrics=ppo/policy/entropy_avg&metrics=ppo/returns/mean&metrics=ppo/policy/approxkl_avg&metrics=ppo/val/clipfrac_avg&metrics=ppo/val/error&metrics=ppo/val/mean&metrics=ppo/returns/var&metrics=ppo/val/vpred' \
        'train_policy_accelerate_tf_adam_gpt2?tag=1.0.0b1&cl=tf_adam,gpt2' \
    --env-ids sentiment descriptiveness \
    --env-ids sentiment/offline_5k.json descriptiveness/offline_5k.json \
    --no-check-empty-runs \
    --pc.ncols 6 \
    --pc.ncols-legend 1 \
    --pc.xlabel 'Episodes' \
    --output-filename static/lm-details/main \
    --scan-history

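# Gradient-accumulation check: the GPT-2 run with gradient accumulation vs the run
# without it, on the descriptiveness task.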
python -m openrlbenchmark.rlops_multi_metrics \
    --filters '?we=openrlbenchmark&wpn=lm_human_preference_details&xaxis=ppo/episode&ceik=rewards.value.label_dataset&cen=exp_name&metrics=objective/scores&metrics=objective/kl&metrics=objective/entropy&metrics=objective/score_total&metrics=objective/kl_coef&metrics=ppo/loss/total&metrics=ppo/loss/value&metrics=ppo/loss/policy_avg&metrics=ppo/policy/clipfrac_avg&metrics=ppo/policy/entropy_avg&metrics=ppo/returns/mean&metrics=ppo/policy/approxkl_avg&metrics=ppo/val/clipfrac_avg&metrics=ppo/val/error&metrics=ppo/val/mean&metrics=ppo/returns/var&metrics=ppo/val/vpred' \
        'train_policy_accelerate_tf_adam_gpt2?tag=1.0.0b1&cl=tf_adam,gpt2' \
        'train_policy_accelerate_tf_adam_gpt2_grad_accu?tag=1.0.0b1&cl=tf_adam,gpt2,gradient_accumulation' \
    --env-ids descriptiveness/offline_5k.json \
    --no-check-empty-runs \
    --pc.ncols 6 \
    --pc.ncols-legend 1 \
    --pc.xlabel 'Episodes' \
    --output-filename static/lm-details/test_grad_accu \
    --scan-history

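# tf-style Adam vs PyTorch Adam with GPT-2, on sentiment and descriptiveness.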
python -m openrlbenchmark.rlops_multi_metrics \
    --filters '?we=openrlbenchmark&wpn=lm_human_preference_details&xaxis=ppo/episode&ceik=rewards.value.label_dataset&cen=exp_name&metrics=objective/scores&metrics=objective/kl&metrics=ppo/policy/clipfrac_avg&metrics=ppo/policy/approxkl_avg' \
        'train_policy_accelerate_tf_adam_gpt2?tag=1.0.0b1&cl=tf_adam,gpt2' \
        'train_policy_accelerate_pt_adam_gpt2?tag=1.0.0b1&cl=pt_adam,gpt2' \
    --env-ids sentiment/offline_5k.json descriptiveness/offline_5k.json \
    --no-check-empty-runs \
    --pc.ncols 4 \
    --pc.ncols-legend 1 \
    --pc.xlabel 'Episodes' \
    --output-filename static/lm-details/adam_gpt2 \
    --scan-history
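
For context on the tf_adam vs pt_adam runs: TensorFlow 1.x's Adam folds the bias correction into the step size and adds epsilon to the uncorrected second moment, whereas PyTorch's Adam bias-corrects both moments and then adds epsilon. A minimal single-tensor sketch of the difference (illustrative only, not code from this PR; the function and defaults are made up for the example):

import torch

def adam_step(param, grad, m, v, step, lr=1e-4, b1=0.9, b2=0.999, eps=1e-5, style="pt"):
    """One Adam update on a single tensor; `style` selects the epsilon placement."""
    m.mul_(b1).add_(grad, alpha=1 - b1)            # first-moment EMA
    v.mul_(b2).addcmul_(grad, grad, value=1 - b2)  # second-moment EMA
    if style == "pt":
        # PyTorch-style: bias-correct both moments, add eps to the corrected denominator.
        m_hat = m / (1 - b1 ** step)
        v_hat = v / (1 - b2 ** step)
        param.sub_(lr * m_hat / (v_hat.sqrt() + eps))
    else:
        # TF1-style: fold bias correction into the step size; eps is added to the
        # uncorrected sqrt(v), so it is effectively scaled by 1 / sqrt(1 - b2 ** step).
        lr_t = lr * (1 - b2 ** step) ** 0.5 / (1 - b1 ** step)
        param.sub_(lr_t * m / (v.sqrt() + eps))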

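# tf-style Adam vs PyTorch Adam with GPT-2-XL (both with gradient accumulation),
# on sentiment and descriptiveness.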
python -m openrlbenchmark.rlops_multi_metrics \
    --filters '?we=openrlbenchmark&wpn=lm_human_preference_details&xaxis=ppo/episode&ceik=rewards.value.label_dataset&cen=exp_name&metrics=objective/scores&metrics=objective/kl&metrics=ppo/policy/clipfrac_avg&metrics=ppo/policy/approxkl_avg' \
        'train_policy_accelerate_tf_adam_gpt2_xl_grad_accu?tag=1.0.0b1&cl=tf_adam,gpt2-xl' \
        'train_policy_accelerate_pt_adam_gpt2_xl_grad_accu?tag=1.0.0b1&cl=pt_adam,gpt2-xl' \
    --env-ids sentiment/offline_5k.json descriptiveness/offline_5k.json \
    --no-check-empty-runs \
    --pc.ncols 4 \
    --pc.ncols-legend 1 \
    --pc.xlabel 'Episodes' \
    --output-filename static/lm-details/adam_gpt2_xl \
    --scan-history

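# Different base models (GPT-2, Cerebras-GPT 111M, Pythia-160M) on the descriptiveness task.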
python -m openrlbenchmark.rlops_multi_metrics \
    --filters '?we=openrlbenchmark&wpn=lm_human_preference_details&xaxis=ppo/episode&ceik=rewards.value.label_dataset&cen=exp_name&metrics=objective/scores&metrics=objective/kl&metrics=objective/entropy&metrics=objective/score_total&metrics=objective/kl_coef&metrics=ppo/loss/total&metrics=ppo/loss/value&metrics=ppo/loss/policy_avg&metrics=ppo/policy/clipfrac_avg&metrics=ppo/policy/entropy_avg&metrics=ppo/returns/mean&metrics=ppo/policy/approxkl_avg&metrics=ppo/val/clipfrac_avg&metrics=ppo/val/error&metrics=ppo/val/mean&metrics=ppo/returns/var&metrics=ppo/val/vpred' \
        'train_policy_accelerate_tf_adam_gpt2?tag=1.0.0b1&cl=tf_adam,gpt2' \
        'train_policy_accelerate_tf_adam_cerebras_gpt_111M?tag=1.0.0b1&cl=pt_adam,cerebras_gpt_111M' \
        'train_policy_accelerate_tf_adam_pythia-160m?tag=1.0.0b1&cl=pt_adam,pythia-160m' \
    --env-ids descriptiveness/offline_5k.json \
    --no-check-empty-runs \
    --pc.ncols 6 \
    --pc.ncols-legend 1 \
    --pc.xlabel 'Episodes' \
    --output-filename static/lm-details/different_models \
    --scan-history

liutianlin0121 commented 1 year ago

LGTM, thanks!