NVIDIA / modulus

Open-source deep-learning framework for building, training, and fine-tuning deep learning models using state-of-the-art Physics-ML methods
https://developer.nvidia.com/modulus
Apache License 2.0

🐛[BUG]: IndexError: list index out of range when training BiStride MeshGraphNet #695

Open · AndreaPi opened this issue 3 days ago

AndreaPi commented 3 days ago

Version

0.8.0

On which installation method(s) does this occur?

Docker

Describe the issue

I'm trying to train a BiStride MeshGraphNet on my dataset (very similar to DrivAerNet), but I keep getting errors. It looks like the model expects the graph data to have a very specific structure, unlike the regular MeshGraphNet, which is more robustly written (and trains on my data without problems). The error I'm getting is:

Traceback (most recent call last):
  File "/workspace/.../test_bsms_mgn.py", line 292, in <module>
    batch_loss = trainer.train(graph['graph'])
  File "/workspace/..../test_bsms_mgn.py", line 245, in train
    loss = self.forward(graph)
  File "/workspace/.../test_bsms_mgn.py", line 251, in forward
    pred = self.model(graph.ndata["x"], graph.edata["x"], graph)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1714, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1725, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/modulus/models/meshgraphnet/bsms_mgn.py", line 165, in forward
    x = self.bistride_processor(x, ms_ids, ms_edges, node_pos)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1714, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1725, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/modulus/models/gnn_layers/bsms.py", line 291, in forward
    h = self.down_gmps[i](h, m_gs[i], pos)
IndexError: list index out of range

Can you help? It would be useful if you provided an example for testing BiStride MeshGraphNet, but the only example mentioned in the documentation concerns the Ahmed body dataset, which is not included in the examples folder: https://docs.nvidia.com/deeplearning/modulus/modulus-core/examples/cfd/aero_graph_net/readme.html#bsms-mgn-training

Minimum reproducible example

This is the dataset class:

import json
from pathlib import Path
from typing import Iterable

import dgl
import pandas as pd
import pyvista as pv
import torch
import vtk
from dgl.data import DGLDataset
from modulus.datapipes.datapipe import Datapipe

# NOTE: MetaData is assumed to be defined elsewhere as a DatapipeMetaData
# subclass, as in the Modulus Ahmed body datapipe.

class MyDataset(DGLDataset, Datapipe):
    def __init__(
        self,
        dir_list_file: str | Path,
        num_samples: int = None,
        invar_keys: Iterable[str] = ("pos", "X1", "X2"),
        outvar_keys: Iterable[str] = ("Y",),
        normalize_keys: Iterable[str] = None,
        cache_dir: str | Path = None, # "./cache/",
        force_reload: bool = False,
        name: str = "dataset",
        verbose: bool = False,
        triangulate: bool = True, 
        downsampling_rate: int = 1,
        **kwargs,
    ) -> None:
        DGLDataset.__init__(self, name=name, force_reload=force_reload, verbose=verbose)
        Datapipe.__init__(self, meta=MetaData())

        with open(dir_list_file, 'r') as file:
            lines = [line.rstrip() for line in file]
        self.dir_list = [Path(f) for f in lines]
        for folder in self.dir_list:
            if not folder.is_dir():
                raise ValueError(
                    f"Path {folder} does not exist or is not a folder."
                )
        self.surface_filename = "surface.vtp"
        self.op_cond_json = "opcond.json"

        self.downsampling_rate = downsampling_rate
        self.triangulate = triangulate
        self.num_samples = num_samples
        self.input_keys = list(invar_keys)
        self.output_keys = list(outvar_keys)
        print(f"Input keys: {self.input_keys}")
        print(f"Output keys: {self.output_keys}")

        if normalize_keys:
            self.normalize_keys = list(normalize_keys)

        cache_dir_parent = self.dir_list[0].parent
        self.cache_dir = (
            self._get_cache_dir(cache_dir_parent, Path(cache_dir))
            if cache_dir is not None
            else None
        )

        list_op_cond = []
        for folder in self.dir_list:
            with open(folder / self.op_cond_json, "r") as fin:
                opc = json.load(fin)
            opc["folder"] = folder
            list_op_cond.append(opc)
        self.op_cond = pd.DataFrame(list_op_cond)
        self.op_cond.sort_values(by="folder", inplace=True)
        # Reset the index so the positional idx used in __getitem__ matches
        # the label-based .at/.loc lookups below.
        self.op_cond.reset_index(drop=True, inplace=True)

        if self.num_samples:
            if self.num_samples > len(self.op_cond):
                raise ValueError(
                    f"Number of available dataset entries "
                    f"({len(self.op_cond)}) is less than the number of samples "
                    f"({self.num_samples})"
                )
            self.op_cond = self.op_cond.iloc[: self.num_samples]

        numerical_df = self.op_cond.select_dtypes(include='number')
        normalized_df = (numerical_df - numerical_df.min()) / (numerical_df.max() - numerical_df.min())
        self.op_cond[numerical_df.columns] = normalized_df

    def __len__(self) -> int:
        return len(self.op_cond)

    def __getitem__(self, idx: int) -> dgl.DGLGraph:
        if not 0 <= idx < len(self):
            raise IndexError(f"Invalid {idx = }, must be in [0, {len(self)})")

        folder_path = self.op_cond.at[idx, "folder"]

        if self.cache_dir is None:
            graph = self._create_dgl_graph(folder_path, idx)
        else:
            cached_graph_filename = self.cache_dir / (folder_path.name + ".bin")
            if not self._force_reload and cached_graph_filename.is_file():
                gs, _ = dgl.load_graphs(str(cached_graph_filename))
                if len(gs) != 1:
                    raise ValueError(f"Expected to load 1 graph but got {len(gs)}.")
                graph = gs[0]
            else:
                graph = self._create_dgl_graph(folder_path, idx)
                dgl.save_graphs(str(cached_graph_filename), [graph])

        graph.ndata["x"] = torch.cat([graph.ndata[k] for k in self.input_keys], dim=-1)
        graph.ndata["y"] = torch.cat([graph.ndata[k] for k in self.output_keys], dim=-1)

        return {
            "name": folder_path.name,
            "graph": graph,
            "X1": torch.tensor(self.op_cond.at[idx, "X1"], dtype=torch.float32),
            "X2": torch.tensor(self.op_cond.at[idx, "X2"], dtype=torch.float32),
        }

    @staticmethod
    def _get_cache_dir(data_dir, cache_dir):
        if not cache_dir.is_absolute():
            cache_dir = data_dir / cache_dir
        return cache_dir.resolve()

    def _create_dgl_graph(
        self,
        name: str | Path,
        idx: int,
        to_bidirected: bool = True,
        dtype: torch.dtype | str = torch.int32,
    ) -> dgl.DGLGraph:

        def extract_edges(mesh: pv.PolyData) -> list[tuple[int, int]]:
            polys = mesh.GetPolys()
            if polys is None:
                raise ValueError("Failed to get polygons from the mesh.")

            polys.InitTraversal()
            edge_list = []
            for _ in range(polys.GetNumberOfCells()):
                id_list = vtk.vtkIdList()
                polys.GetNextCell(id_list)
                num_ids = id_list.GetNumberOfIds()
                for j in range(num_ids - 1):
                    edge_list.append(  # noqa: PERF401
                        (id_list.GetId(j), id_list.GetId(j + 1))
                    )
                # Add the final edge between the last and the first vertices.
                edge_list.append((id_list.GetId(num_ids - 1), id_list.GetId(0)))

            return edge_list

        surface_vtp_path = Path(name) / self.surface_filename

        surface_mesh = pv.read(surface_vtp_path)
        if self.triangulate:
            # Optionally downsample the surface points and re-triangulate them.
            tmp_decimated_points = surface_mesh.points[:: self.downsampling_rate, :]
            tmp_decimated_field = {}
            for target in self.output_keys:
                tmp_decimated_field[target] = surface_mesh[target][
                    :: self.downsampling_rate
                ].reshape(-1, 1)
            cloud = pv.PolyData(tmp_decimated_points)

            surface_mesh = cloud.delaunay_2d()
            for target in self.output_keys:
                surface_mesh[target] = tmp_decimated_field[target]

        edge_list = extract_edges(surface_mesh)

        graph = dgl.graph(edge_list, idtype=dtype)
        if to_bidirected:
            # The to_bidirected flag was previously accepted but never used;
            # apply it before attaching node data so MeshGraphNet-style models
            # see edges in both directions.
            graph = dgl.to_bidirected(graph)
        graph.ndata["pos"] = torch.tensor(surface_mesh.points, dtype=torch.float32)
        scalar_inputs = [k for k in self.input_keys if k != "pos"]
        for k in scalar_inputs:
            # Broadcast each scalar operating condition to a per-node feature.
            graph.ndata[k] = (
                torch.ones(surface_mesh.n_points, 1, dtype=torch.float32)
                * self.op_cond.loc[idx, k]
            )

        for k in self.output_keys:
            graph.ndata[k] = torch.tensor(surface_mesh.point_data[k].reshape(-1, 1), dtype=torch.float32)

        # Edge features: relative node displacement and its Euclidean norm.
        u, v = graph.edges()
        pos = graph.ndata["pos"]
        disp = pos[u] - pos[v]
        disp_norm = torch.linalg.norm(disp, dim=-1, keepdim=True)
        graph.edata["x"] = torch.cat((disp, disp_norm), dim=-1)
        return graph

And this is the __init__ method of my trainer class:

import torch
from dgl.dataloading import GraphDataLoader
from modulus.models.meshgraphnet.bsms_mgn import BiStrideMeshGraphNet
from omegaconf import DictConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BSMGNTrainer:
    def __init__(self, cfg: DictConfig):
        self.dataset = MyDataset(
            "/.../training_folders.txt",
            num_samples=cfg.num_samples,
            triangulate=cfg.triangulate,
            downsampling_rate=cfg.downsampling_rate,
            outvar_keys=cfg.target,
        )
        self.dataloader = GraphDataLoader(
            self.dataset,
            shuffle=cfg.shuffle,
            batch_size=1,
            num_workers=cfg.num_workers,
            pin_memory=True,
            drop_last=True,
        )
        self.model = BiStrideMeshGraphNet(
            input_dim_nodes=len(self.dataset.input_keys) + 2,
            output_dim=len(self.dataset.output_keys),
            input_dim_edges=4,
            mlp_activation_fn="relu",
            aggregation="sum",
            hidden_dim_processor=cfg.neurons,
            hidden_dim_node_encoder=cfg.neurons,
            hidden_dim_edge_encoder=cfg.neurons,
            hidden_dim_node_decoder=cfg.neurons,
        )
        self.model = self.model.to(device)     
        self.model.train()
        self.loss = torch.nn.L1Loss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=cfg.lr)
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=self.optimizer, gamma=0.99985)

Relevant log output

No response

Environment details

No response

mnabian commented 2 days ago

@Alexey-Kamenev could you please take a look?

Alexey-Kamenev commented 1 day ago

You are correct, BSMS MGN expects the data in a certain format. To enable this format, you need to wrap your dataset class in BistrideMultiLayerGraphDataset like it's done in the Ahmed body example. You can do this either in the code or by using Hydra config - check out the BSMS Ahmed body experiment and corresponding dataset config.
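
For reference, here is a minimal sketch of the code route. The import path and the wrapper's constructor arguments are assumptions based on the Ahmed body example (in particular, the num_layers argument name is not verified), so check modulus/datapipes/gnn/bsms.py for the exact signature:

from dgl.dataloading import GraphDataLoader
# Assumed import path for the wrapper; verify against your Modulus version.
from modulus.datapipes.gnn.bsms import BistrideMultiLayerGraphDataset

# Build the base dataset exactly as before.
base_dataset = MyDataset(
    "/.../training_folders.txt",
    num_samples=cfg.num_samples,
    triangulate=cfg.triangulate,
    downsampling_rate=cfg.downsampling_rate,
    outvar_keys=cfg.target,
)

# Wrap it so each sample carries the precomputed multi-level (bi-stride)
# node ids and edges that BiStrideMeshGraphNet's processor iterates over.
# Without them, m_gs[i] in gnn_layers/bsms.py has fewer entries than the
# number of down-sampling levels, which is the likely cause of the
# IndexError above. num_layers (assumed name) should match the number of
# levels the model is configured with.
dataset = BistrideMultiLayerGraphDataset(base_dataset, num_layers=cfg.num_layers)

dataloader = GraphDataLoader(
    dataset,
    shuffle=cfg.shuffle,
    batch_size=1,
    num_workers=cfg.num_workers,
    pin_memory=True,
    drop_last=True,
)

Note that the wrapper may expect the base dataset's __getitem__ to return a DGLGraph directly rather than a dict, so MyDataset may need a small adjustment; the Hydra config of the BSMS Ahmed body experiment shows the intended wiring.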