NVIDIA / warp

A Python framework for high performance GPU simulation and graphics
https://nvidia.github.io/warp/
Other
4.15k stars 232 forks source link

Is this performance normal? #108

Closed zoezhou1999 closed 1 year ago

zoezhou1999 commented 1 year ago

Hi, I am trying to optimize the initial velocity by supervising on a video trajectory. I found that the optimization does not perform as well as in the provided examples. Is this normal? Thank you!


import sys
import os
import math
from collections import defaultdict
from typing import Union
import numpy as np
import warp as wp
import warp.sim
from pxr import UsdGeom, Usd
import torch

# Initialize Warp once at import time, before any of the Warp calls below run.
wp.init()

class ForwardRenderingV3(torch.autograd.Function):
    """Torch autograd wrapper around a Warp rigid-body rollout.

    ``forward`` records the full rollout on a ``wp.Tape`` and returns the body
    transforms of the simulated states; ``backward`` seeds the recorded states
    with the incoming gradients and replays the tape to obtain the gradient
    with respect to the initial body velocity.
    """

    @staticmethod
    def forward(ctx, velocity, example):
        """Roll the simulation out for ``example.sim_steps`` steps.

        Args:
            ctx: Autograd context; the tape, model, wrapped velocity, step
                count and per-step states are stored on it for ``backward``.
            velocity: Torch tensor wrapped as a Warp ``spatial_vector`` array
                and installed as ``body_qd`` of the first state.
            example: Object providing ``model``, ``sim_steps``, ``integrator``
                and ``sim_dt``.

        Returns:
            A Torch tensor stacking ``body_q`` of states ``0 .. sim_steps - 1``
            (the final state is not included).
        """
        wp.synchronize_device()

        model = example.model
        num_steps = example.sim_steps

        ctx.tape = wp.Tape()
        ctx.model = model
        ctx.velocity = wp.from_torch(velocity, dtype=wp.spatial_vector)
        ctx.sim_steps = num_steps

        # One state per step plus the initial one, freshly allocated per call.
        ctx.states = [model.state(requires_grad=True) for _ in range(num_steps + 1)]

        with ctx.tape:
            initial = ctx.states[0]
            wp.sim.eval_fk(model, model.joint_q, model.joint_qd, None, initial)

            # The optimized velocity replaces the velocity array of the first state.
            initial.body_qd = ctx.velocity

            for current, following in zip(ctx.states[:-1], ctx.states[1:]):
                current.clear_forces()
                wp.sim.collide(model, current)
                example.integrator.simulate(model, current, following, example.sim_dt)

        wp.synchronize_device()

        transforms = [wp.to_torch(state.body_q) for state in ctx.states[:num_steps]]
        return torch.stack(transforms)

    @staticmethod
    def backward(ctx, grad_output):
        """Back-propagate ``grad_output`` through the recorded tape.

        Args:
            ctx: Autograd context populated by ``forward``.
            grad_output: Gradient of the loss with respect to the stacked
                transforms returned by ``forward``; row ``i`` belongs to
                state ``i``.

        Returns:
            A tuple ``(velocity_grad, None)``; ``example`` receives no gradient.
        """
        # Make sure pending Torch work is finished before Warp runs.
        wp.synchronize_device()

        for step, state in enumerate(ctx.states[: ctx.sim_steps]):
            state.body_q.grad = wp.from_torch(grad_output[step], dtype=wp.transform)

        ctx.tape.backward()

        # Make sure Warp work is finished before handing data back to Torch.
        wp.synchronize_device()

        velocity_grad = ctx.tape.gradients[ctx.states[0].body_qd]
        return (wp.to_torch(velocity_grad), None)

class Example:
    """Recover an initial rigid-body velocity from a reference trajectory.

    A ground-truth rollout of a single bunny mesh is simulated once with a
    randomly drawn initial velocity. ``train`` then optimizes a Torch velocity
    tensor so that the differentiable rollout produced by
    ``ForwardRenderingV3`` matches the ground-truth body transforms.
    """

    frame_dt = 1.0 / 60.0

    sim_duration = 3  # seconds
    frame_steps = int(sim_duration / frame_dt)
    sim_substeps = 10
    sim_dt = frame_dt / sim_substeps
    sim_steps = frame_steps * sim_substeps

    sim_time = 0.0
    train_iters = 250

    def __init__(self, args):
        """Build both models, allocate ground-truth states and simulate them.

        Args:
            args: Parsed command-line arguments; stored on ``self.args``.
        """
        self.num_bodies = 8
        self.scale = 0.8
        self.ke = 1.0e5
        self.kd = 250.0
        self.kf = 500.0

        self.args = args

        bunny = self.load_mesh(os.path.join(os.path.dirname(__file__), "assets/bunny.usd"), "/bunny/bunny")

        # The ground-truth model and the optimized model are built identically.
        self.gt_model = self._build_model(bunny)
        self.model = self._build_model(bunny)

        self.integrator = wp.sim.SemiImplicitIntegrator()

        self.torch_device = wp.device_to_torch(self.model.device)

        # Ground-truth states need no gradients; the differentiable states are
        # allocated per rollout inside ForwardRenderingV3.forward.
        self.gt_states = [self.gt_model.state(requires_grad=False) for _ in range(self.sim_steps + 1)]

        self.get_gt_frames()

    def _build_model(self, bunny):
        """Create and finalize a model holding one bunny body above the ground.

        Args:
            bunny: Mesh returned by ``load_mesh``.

        Returns:
            The finalized model with the ground enabled and joint attachment
            gains set.
        """
        builder = wp.sim.ModelBuilder()
        index = 10  # placement index used to derive the initial pose
        body = builder.add_body(
            origin=wp.transform(
                (index * 0.5 * self.scale, 1.0 + index * 1.7 * self.scale, 4.0 + index * 0.5 * self.scale),
                wp.quat_from_axis_angle((0.0, 1.0, 0.0), math.pi * 0.1 * index),
            )
        )
        builder.add_shape_mesh(
            body=body,
            mesh=bunny,
            pos=(0.0, 0.0, 0.0),
            scale=(self.scale, self.scale, self.scale),
            ke=self.ke,
            kd=self.kd,
            kf=self.kf,
            density=1e3,
        )

        model = builder.finalize()
        model.ground = True
        model.joint_attach_ke = 1600.0
        model.joint_attach_kd = 20.0
        return model

    def load_mesh(self, filename, path):
        """Load a mesh prim from a USD file.

        Args:
            filename: Path of the USD stage to open.
            path: Prim path of the mesh inside the stage.

        Returns:
            A ``wp.sim.Mesh`` built from the prim's points and flattened
            face-vertex indices.
        """
        asset_stage = Usd.Stage.Open(filename)
        mesh_geom = UsdGeom.Mesh(asset_stage.GetPrimAtPath(path))

        points = np.array(mesh_geom.GetPointsAttr().Get())
        indices = np.array(mesh_geom.GetFaceVertexIndicesAttr().Get()).flatten()

        return wp.sim.Mesh(points, indices)

    @torch.no_grad()
    def get_gt_frames(self):
        """Simulate the ground-truth trajectory and cache its body transforms.

        Fills ``self.gt_body_q`` with one detached Torch tensor per simulated
        step (states ``0 .. sim_steps - 1``).
        """
        wp.sim.eval_fk(self.gt_model, self.gt_model.joint_q, self.gt_model.joint_qd, None, self.gt_states[0])

        def symmetric_random(magnitude):
            # Uniform sample in [-magnitude, magnitude) drawn from Torch's RNG.
            return (torch.rand(1).item() * 2 - 1) * magnitude

        # Random ground-truth velocity; components 2 and 5 stay zero.
        velocity = wp.array(
            [
                symmetric_random(10),
                symmetric_random(2),
                0,
                symmetric_random(10),
                symmetric_random(2),
                0,
            ],
            dtype=wp.spatial_vector,
        )

        self.gt_states[0].body_qd = velocity

        self.gt_body_q = []
        for step in range(self.sim_steps):
            current = self.gt_states[step]
            current.clear_forces()
            wp.sim.collide(self.gt_model, current)
            self.integrator.simulate(self.gt_model, current, self.gt_states[step + 1], self.sim_dt)

            self.gt_body_q.append(wp.to_torch(current.body_q).detach())

    def train(self):
        """Optimize the initial velocity against the ground-truth trajectory.

        Runs Adam (lr=0.1) on a zero-initialized ``[1, 6]`` velocity tensor and
        prints the loss, the velocity gradient and the current estimate at
        every iteration.
        """
        velocity = torch.zeros([1, 6], requires_grad=True, device=self.torch_device)
        mse = torch.nn.MSELoss()
        # NOTE: overrides the class-level default of 250 iterations.
        self.train_iters = 800
        opt = torch.optim.Adam([velocity], lr=0.1)

        # The ground truth never changes, so stack it once outside the loop.
        gt_trajectory = torch.stack(self.gt_body_q)

        for _ in range(self.train_iters):
            body_list = ForwardRenderingV3.apply(velocity, self)

            # Every step has the same shape, so a single MSE over the stacked
            # trajectory equals the average of the per-step MSE values while
            # avoiding one tiny op (and autograd node) per simulation step.
            loss = mse(body_list, gt_trajectory)
            print("loss", loss)

            loss.backward()
            print("velocity.grad", velocity.grad)
            opt.step()
            opt.zero_grad()

            print("self.gt_states[0].body_qd", self.gt_states[0].body_qd, "velocity", velocity)

import argparse
import random

# Command-line interface: --output_dir selects where outputs are written.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--output_dir",
    type=str,
    default="",
    help="output_dir",
)
# Parse before touching the filesystem so that --help or a bad argument does
# not create the output directory as a side effect.
args = parser.parse_args()

# Honor --output_dir when given; otherwise fall back to the default location.
output_dir = args.output_dir or "./outputs/example_sim_rigid_contact_grad_check/"
os.makedirs(output_dir, exist_ok=True)
args.output_dir = output_dir

# Seed every RNG in use so the random ground-truth velocity is reproducible.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

robot = Example(args)
robot.train()
zoezhou1999 commented 1 year ago

I have tried allocating the states inside the torch.autograd.Function. This seems to make the training more stable, but a NaN loss still appears sometimes, and the optimization does not converge as well as in the provided examples (I supervise on the whole video trajectory and use the Adam optimizer with lr=0.1). Is this performance normal, or did I do something wrong in the code? Thank you!

zoezhou1999 commented 1 year ago

When I set train_iters to 100000, the following error is thrown at some point:

Warp CUDA error 2: out of memory (/buildAgent/work/3db450722a274445/warp/native/warp.cu:215)
Traceback (most recent call last):
  File "example_sim_rigid_contact_grad_check_sample.py", line 253, in <module>
    robot.train()
  File "example_sim_rigid_contact_grad_check_sample.py", line 203, in train
    body_list = ForwardRenderingV3.apply(velocity, self)
  File "example_sim_rigid_contact_grad_check_sample.py", line 43, in forward
    ctx.states.append(ctx.model.state(requires_grad=True))
  File "/mnt/colab_public/datasets/zyhz/conda_env/zyhz/lib/python3.7/site-packages/warp/sim/model.py", line 606, in state
    self.body_count, dtype=wp.spatial_vector, device=s.body_q.device, requires_grad=requires_grad
  File "/mnt/colab_public/datasets/zyhz/conda_env/zyhz/lib/python3.7/site-packages/warp/context.py", line 2617, in zeros
    raise RuntimeError("Memory allocation failed on device: {} for {} bytes".format(device, num_bytes))
RuntimeError: Memory allocation failed on device: cuda:0 for 24 bytes
zoezhou1999 commented 1 year ago

This memory growth issue also occurs in the example_sim_fk_grad_torch.py example.

mmacklin commented 1 year ago

Hi @zoezhou1999, we recently fixed a memory leak with gradients when propagating back to Torch, please see this commit from @eric-heiden for details: 1de0d850ceeddf191b1718797f1f7dc120ec3e51.