jiawei-ren / dreamgaussian4d

[arXiv 2023] DreamGaussian4D: Generative 4D Gaussian Splatting
https://jiawei-ren.github.io/projects/dreamgaussian4d/
MIT License
527 stars 32 forks source link

Segmentation fault #19

Open Picrew opened 1 month ago

Picrew commented 1 month ago

Segmentation fault

I'm also encountering a segmentation fault error when running `python dg.py --config configs/dg.yaml input=data/CONSISTENT4D_DATA/in-the-wild/blooming_rose/0.png`.

Ref

Env

I used Python 3.10.15 and torch 2.3.1+cu118 on a 4090 GPU.

Debug

I also want to debug this error, so I added instrumentation (pdb-style diagnostics) to the `if __name__ == "__main__":` block:

if __name__ == "__main__":
    import argparse
    from omegaconf import OmegaConf
    import traceback
    import sys
    import torch
    import psutil
    import cProfile
    import pstats
    import faulthandler
    import gc

    # 启用Python的故障处理程序
    faulthandler.enable()

    def print_memory_status():
        # 系统内存
        mem = psutil.virtual_memory()
        print(f"\nSystem Memory:")
        print(f"Total: {mem.total / 1024**3:.2f}GB")
        print(f"Available: {mem.available / 1024**3:.2f}GB")
        print(f"Used: {mem.used / 1024**3:.2f}GB")
        print(f"Percentage: {mem.percent}%")

        # GPU内存
        if torch.cuda.is_available():
            print(f"\nGPU Memory:")
            print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
            print(f"Cached: {torch.cuda.memory_reserved()/1024**2:.2f}MB")

        # 垃圾回收信息
        print(f"\nGarbage Collector:")
        print(f"Garbage objects: {len(gc.get_objects())}")

    try:
        print("\n=== Initial Memory State ===")
        print_memory_status()

        parser = argparse.ArgumentParser()
        parser.add_argument("--config", required=True, help="path to the yaml config file")
        args, extras = parser.parse_known_args()

        # CUDA信息
        print("\n=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            print(f"Current CUDA device: {torch.cuda.current_device()}")
            print(f"Device name: {torch.cuda.get_device_name()}")
            print(f"CUDA version: {torch.version.cuda}")
            print(f"cuDNN version: {torch.backends.cudnn.version()}")

        # 加载配置
        print("\n=== Loading Configuration ===")
        opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
        print("Configuration loaded:", opt)

        if 'CONSISTENT4D' in opt.input:
            opt.save_path = opt.input.split('/')[-2] if opt.save_path == '' else opt.save_path
        else:
            opt.save_path = os.path.splitext(os.path.basename(opt.input))[0] if opt.save_path == '' else opt.save_path

        print(f"Save path set to: {opt.save_path}")

        print("\n=== Memory State Before GUI Init ===")
        print_memory_status()

        print("\nInitializing GUI...")
        gui = GUI(opt)
        print("GUI initialized successfully")

        print("\n=== Memory State After GUI Init ===")
        print_memory_status()

        # 使用性能分析器
        print("\n=== Starting Training with Performance Profiling ===")
        profiler = cProfile.Profile()
        profiler.enable()

        try:
            print(f"\nStarting training with {opt.iters} iterations...")
            gui.train(opt.iters)
        except Exception as train_error:
            print("\n=== Training Error ===")
            print(f"Error during training: {str(train_error)}")
            raise train_error
        finally:
            profiler.disable()

            print("\n=== Memory State After Training ===")
            print_memory_status()

            # 打印性能分析结果
            print("\n=== Performance Profile ===")
            stats = pstats.Stats(profiler).sort_stats('cumulative')
            stats.print_stats(30)

        print("\nTraining completed successfully")

    except Exception as e:
        print("\n=== Error Details ===")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")

        print("\n=== Stack Trace ===")
        traceback.print_exc()

        print("\n=== System Information ===")
        print(f"Python version: {sys.version}")
        print(f"PyTorch version: {torch.__version__}")

        print("\n=== Final Memory State ===")
        print_memory_status()

        # 如果是CUDA相关错误
        if "cuda" in str(e).lower():
            print("\n=== CUDA Error Information ===")
            try:
                print(f"CUDA version: {torch.version.cuda}")
                print(f"cuDNN version: {torch.backends.cudnn.version()}")
                # 尝试重置CUDA
                torch.cuda.empty_cache()
                print("CUDA cache cleared")
            except:
                print("Unable to fetch CUDA information")

        sys.exit(1)

I don't think this error is caused by OOM (out of memory), so I'd like to ask how to solve this error.