Open Picrew opened 1 month ago
I'm also hitting a segmentation fault when running `python dg.py --config configs/dg.yaml input=data/CONSISTENT4D_DATA/in-the-wild/blooming_rose/0.png`:
python dg.py --config configs/dg.yaml input=data/CONSISTENT4D_DATA/in-the-wild/blooming_rose/0.png
I am using Python 3.10.15 and torch 2.3.1+cu118 on a 4090 GPU.
I also wanted to debug this error (e.g. with pdb), so I modified the `if __name__ == "__main__":` block as follows:
if __name__ == "__main__":
    # Debug-instrumented entry point: loads the YAML config (plus CLI
    # overrides), builds the GUI, runs training under cProfile, and prints
    # memory / CUDA diagnostics before and after each stage.
    import argparse
    import cProfile
    import faulthandler
    import gc
    import os  # used for os.path below; imported here so the block is self-contained
    import pstats
    import sys
    import traceback

    import psutil
    import torch
    from omegaconf import OmegaConf

    # Enable Python's fault handler so that a Python-level traceback is dumped
    # on fatal signals such as SIGSEGV (a segfault never reaches `except`).
    faulthandler.enable()

    def print_memory_status():
        """Print system RAM usage, CUDA memory usage, and the GC object count."""
        # System memory
        mem = psutil.virtual_memory()
        print("\nSystem Memory:")
        print(f"Total: {mem.total / 1024**3:.2f}GB")
        print(f"Available: {mem.available / 1024**3:.2f}GB")
        print(f"Used: {mem.used / 1024**3:.2f}GB")
        print(f"Percentage: {mem.percent}%")

        # GPU memory
        if torch.cuda.is_available():
            print("\nGPU Memory:")
            print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
            print(f"Cached: {torch.cuda.memory_reserved()/1024**2:.2f}MB")

        # Garbage-collector information.
        # NOTE: gc.get_objects() lists every object tracked by the collector,
        # not only unreachable garbage, despite the label printed below.
        print("\nGarbage Collector:")
        print(f"Garbage objects: {len(gc.get_objects())}")

    try:
        print("\n=== Initial Memory State ===")
        print_memory_status()

        parser = argparse.ArgumentParser()
        parser.add_argument("--config", required=True, help="path to the yaml config file")
        # Unknown arguments are kept and later parsed as OmegaConf overrides.
        args, extras = parser.parse_known_args()

        # CUDA information
        print("\n=== CUDA Information ===")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device count: {torch.cuda.device_count()}")
            print(f"Current CUDA device: {torch.cuda.current_device()}")
            print(f"Device name: {torch.cuda.get_device_name()}")
            print(f"CUDA version: {torch.version.cuda}")
            print(f"cuDNN version: {torch.backends.cudnn.version()}")

        # Load the configuration; CLI overrides take precedence over the file.
        print("\n=== Loading Configuration ===")
        opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
        print("Configuration loaded:", opt)

        # Derive a default save path only when none was configured:
        # the parent directory name for CONSISTENT4D inputs, otherwise the
        # input file's base name without its extension.
        if 'CONSISTENT4D' in opt.input:
            opt.save_path = opt.input.split('/')[-2] if opt.save_path == '' else opt.save_path
        else:
            opt.save_path = os.path.splitext(os.path.basename(opt.input))[0] if opt.save_path == '' else opt.save_path
        print(f"Save path set to: {opt.save_path}")

        print("\n=== Memory State Before GUI Init ===")
        print_memory_status()

        print("\nInitializing GUI...")
        gui = GUI(opt)
        print("GUI initialized successfully")

        print("\n=== Memory State After GUI Init ===")
        print_memory_status()

        # Run training under the profiler.
        print("\n=== Starting Training with Performance Profiling ===")
        profiler = cProfile.Profile()
        profiler.enable()

        try:
            print(f"\nStarting training with {opt.iters} iterations...")
            gui.train(opt.iters)
        except Exception as train_error:
            print("\n=== Training Error ===")
            print(f"Error during training: {str(train_error)}")
            # Bare raise re-raises the active exception with its traceback
            # untouched; the outer handler prints the full details.
            raise
        finally:
            profiler.disable()

        print("\n=== Memory State After Training ===")
        print_memory_status()

        # Print the profiling results (top 30 entries by cumulative time).
        print("\n=== Performance Profile ===")
        stats = pstats.Stats(profiler).sort_stats('cumulative')
        stats.print_stats(30)

        print("\nTraining completed successfully")

    except Exception as e:
        print("\n=== Error Details ===")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        print("\n=== Stack Trace ===")
        traceback.print_exc()

        print("\n=== System Information ===")
        print(f"Python version: {sys.version}")
        print(f"PyTorch version: {torch.__version__}")

        print("\n=== Final Memory State ===")
        print_memory_status()

        # Extra diagnostics when the message looks CUDA-related.
        if "cuda" in str(e).lower():
            print("\n=== CUDA Error Information ===")
            try:
                print(f"CUDA version: {torch.version.cuda}")
                print(f"cuDNN version: {torch.backends.cudnn.version()}")
                # Try to release cached CUDA memory.
                torch.cuda.empty_cache()
                print("CUDA cache cleared")
            except Exception:
                # Best-effort only; do not mask the original error, but let
                # KeyboardInterrupt / SystemExit propagate.
                print("Unable to fetch CUDA information")

        sys.exit(1)
I don't think this error is related to mmo, so I would like to ask: how can I solve this error?
Segmentation fault
I'm also hitting a segmentation fault when running:
python dg.py --config configs/dg.yaml input=data/CONSISTENT4D_DATA/in-the-wild/blooming_rose/0.png
Ref
Env
I am using Python 3.10.15 and torch 2.3.1+cu118 on a 4090 GPU.
Debug
I also wanted to debug this error (e.g. with pdb), so I modified the `if __name__ == "__main__":` block.
I don't think this error is related to mmo, so I would like to ask: how can I solve this error?