Closed leescpeter closed 8 months ago
```
Error executing job with overrides: ['name=test', 'arch=hf-bert-base', 'train=bert-base', 'data=sanity-check-2', 'dryrun=True', 'impl.microbatch_size=2']
Traceback (most recent call last):
  File "/root/cramming/cramming/data/pretraining_preparation.py", line 47, in load_pretraining_corpus
    tokenized_dataset = datasets.load_from_disk(data_path)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 1898, in load_from_disk
    raise FileNotFoundError(f"Directory {dataset_path} not found")
FileNotFoundError: Directory /root/cramming/outputs/data/sanity-check-2_BPEx32768_324a8001208359684a2025ba5bd5f119 not found

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/cramming/pretrain.py", line 155, in launch
    cramming.utils.main_launcher(cfg, main_training_process, job_name="pretraining")
  File "/root/cramming/cramming/utils.py", line 54, in main_launcher
    metrics = main_fn(cfg, setup)
              ^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/pretrain.py", line 21, in main_training_process
    dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/data/pretraining_preparation.py", line 63, in load_pretraining_corpus
    preprocessed_dataset, new_tokenizer = preprocess_dataset(
                                          ^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/data/pretraining_preparation.py", line 169, in preprocess_dataset
    tokenized_dataset = _huggingface_preprocessing(raw_data, tokenizer, cfg_data, num_threads=num_threads)  # Tokenize, group, sort...
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/data/pretraining_preparation.py", line 238, in _huggingface_preprocessing
    tokenized_dataset = raw_dataset.map(
                        ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_dataset.py", line 580, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_dataset.py", line 545, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_dataset.py", line 3170, in map
    with Pool(len(kwargs_per_job)) as pool:
         ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/context.py", line 119, in Pool
    return Pool(processes, initializer, initargs, maxtasksperchild,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/pool.py", line 191, in __init__
    self._setup_queues()
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/pool.py", line 346, in _setup_queues
    self._inqueue = self._ctx.SimpleQueue()
                    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/context.py", line 113, in SimpleQueue
    return SimpleQueue(ctx=self.get_context())
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/queues.py", line 344, in __init__
    self._rlock = ctx.Lock()
                  ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/context.py", line 68, in Lock
    return Lock(ctx=self.get_context())
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/synchronize.py", line 168, in __init__
    SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/synchronize.py", line 86, in __init__
    register(self._semlock.name, "semaphore")
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/resource_tracker.py", line 158, in register
    self._send('REGISTER', name, rtype)
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/resource_tracker.py", line 165, in _send
    self.ensure_running()
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/resource_tracker.py", line 132, in ensure_running
    pid = util.spawnv_passfds(exe, args, fds_to_pass)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/multiprocess/util.py", line 452, in spawnv_passfds
    return _posixsubprocess.fork_exec(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: fork_exec() takes exactly 23 arguments (21 given)
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
```
Hi, this looks like an installation problem related to Python multiprocessing that breaks the huggingface `datasets` `map` call. It is not a problem directly related to this project. You might be able to bypass it by setting `impl.threads=0`, but it is probably better to fix the installation.
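To see what that setting sidesteps: huggingface `datasets` only spins up a `multiprocess` pool when `map` is called with `num_proc` set, so keeping tokenization in the main process avoids the broken fork entirely. A minimal sketch outside of this repo (assuming `impl.threads` ends up as `map`'s `num_proc`; `ag_news` is just a stand-in corpus here):

```python
from datasets import load_dataset

dataset = load_dataset("ag_news", split="train[:1000]")

def tokenize_fn(examples):
    # stand-in for the real tokenizer call
    return {"n_chars": [len(t) for t in examples["text"]]}

# With num_proc set, map forks workers via multiprocess and would hit the
# fork_exec TypeError above on a broken installation:
# dataset = dataset.map(tokenize_fn, batched=True, num_proc=4)

# With num_proc=None everything runs in the main process (the threads=0 path):
dataset = dataset.map(tokenize_fn, batched=True, num_proc=None)
```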
thx, I set `impl.threads=1` and that passed, but got another error:

```
Error executing job with overrides: ['name=test', 'arch=hf-bert-base', 'train=bert-base', 'data=sanity-check-2', 'dryrun=True', 'impl.microbatch_size=2', 'impl.threads=1']
Traceback (most recent call last):
  File "/root/cramming/pretrain.py", line 155, in launch
    cramming.utils.main_launcher(cfg, main_training_process, job_name="pretraining")
  File "/root/cramming/cramming/utils.py", line 54, in main_launcher
    metrics = main_fn(cfg, setup)
              ^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/pretrain.py", line 23, in main_training_process
    model_engine, _, _, dataloader = cramming.load_backend(
                                     ^^^^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/backend/prepare_backend.py", line 16, in load_backend
    return initialize_torch(model, dataset, tokenizer, cfg_train, cfg_impl, setup=setup)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/backend/torch_default.py", line 53, in initialize_torch
    model_engine = TorchEngineMinimal(model, cfg_train, cfg_impl, setup=setup, seq_length=tokenizer.model_max_length)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/cramming/cramming/backend/torch_default.py", line 91, in __init__
    model = torch.compile(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/__init__.py", line 1440, in compile
    backend = _TorchCompileInductorWrapper(mode, options, dynamic)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/__init__.py", line 1334, in __init__
    self.apply_options(options)
  File "/usr/local/lib/python3.11/dist-packages/torch/__init__.py", line 1376, in apply_options
    raise RuntimeError(
RuntimeError: Unexpected optimization option max_autotune_gemm, known options are ['debug', 'disable_progress', 'verbose_progress', 'cpp_wrapper', 'dce', 'static_weight_shapes', 'size_asserts', 'pick_loop_orders', 'inplace_buffers', 'benchmark_harness', 'epilogue_fusion', 'epilogue_fusion_first', 'pattern_matcher', 'reordering', 'max_autotune', 'realize_reads_threshold', 'realize_bytes_threshold', 'realize_acc_reads_threshold', 'fallback_random', 'implicit_fallbacks', 'tune_layout', 'aggressive_fusion', 'max_fusion_size', 'unroll_reductions_threshold', 'comment_origin', 'developer_warnings', 'compile_threads', 'kernel_name_max_ops', 'shape_padding', 'permute_fusion', 'profiler_mark_wrapper_call', '_raise_error_for_testing', 'cpp.threads', 'cpp.dynamic_threads', 'cpp.simdlen', 'cpp.min_chunk_size', 'cpp.cxx', 'cpp.enable_kernel_profile', 'cpp.weight_prepack', 'triton.cudagraphs', 'triton.debug_sync_graph', 'triton.debug_sync_kernel', 'triton.dense_indexing', 'triton.max_tiles', 'triton.autotune_pointwise', 'triton.tiling_prevents_pointwise_fusion', 'triton.tiling_prevents_reduction_fusion', 'triton.ordered_kernel_names', 'triton.descriptive_kernel_names', 'triton.persistent_reductions', 'trace.enabled', 'trace.debug_log', 'trace.info_log', 'trace.fx_graph', 'trace.fx_graph_transformed', 'trace.ir_pre_fusion', 'trace.ir_post_fusion', 'trace.output_code', 'trace.graph_diagram', 'trace.compile_profile', 'trace.upload_tar']
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
```
Which inductor settings are available depends on the installed PyTorch version. In any case, you can remove the modifications to the inductor with `impl._inductor_vars=null`.
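If you would rather keep some of the tuning flags than null them all out, one option is to filter them against the inductor config that is actually installed. A rough sketch, not cramming's actual code (`max_autotune_gemm` is the knob rejected above; it exists in newer PyTorch releases but not 2.0.1, and only top-level keys are checked here):

```python
import torch
import torch._inductor.config as inductor_config

desired = {"max_autotune": True, "max_autotune_gemm": True}
# Keep only the knobs this PyTorch build knows about; dotted keys such as
# "triton.cudagraphs" would need to be resolved on their submodule instead.
supported = {k: v for k, v in desired.items() if hasattr(inductor_config, k)}

model = torch.nn.Linear(8, 8)  # stand-in model
compiled = torch.compile(model, options=supported)
```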
```
Name: torch
Version: 2.0.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, jinja2, networkx, nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-runtime-cu11, nvidia-cudnn-cu11, nvidia-cufft-cu11, nvidia-curand-cu11, nvidia-cusolver-cu11, nvidia-cusparse-cu11, nvidia-nccl-cu11, nvidia-nvtx-cu11, sympy, triton, typing-extensions
Required-by: cramming, triton
```
this is the version, thx
yeah, you need `impl._inductor_vars=null`. I've simplified the inductor settings on main to make this smoother.
ok thx, fixed
Hi, I got the same `FileNotFoundError`, but adding `impl.threads=0` or `=1` does not help. Any idea what might be the reason? Thanks!
Hi! Which `FileNotFoundError` exactly?
Error Message:

```
[2024-01-08 14:52:14,323] Model with config {'architectures': ['BertForMaskedLM'], 'attention_probs_dropout_prob': 0.1, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'layer_norm_eps': 1e-12, 'max_position_embeddings': 512, 'num_attention_heads': 12, 'num_hidden_layers': 12, 'pad_token_id': 0, 'position_embedding_type': 'absolute', 'type_vocab_size': 2, 'use_cache': True} loaded with 111,241,472 parameters.
[2024-01-08 14:52:14,324] Now preparing source ag_news...
[2024-01-08 14:52:15,757] Found cached dataset ag_news (file:///localdisk/home/user/Work/Repositories/cramming/outputs/data/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Error executing job with overrides: ['name=test', 'arch=hf-bert-base', 'train=bert-base', 'data=sanity-check-2', 'dryrun=True', 'impl.microbatch_size=2', 'impl.threads=0']
Traceback (most recent call last):
  File "/localdisk/home/user/Work/Repositories/cramming/cramming/data/pretraining_preparation.py", line 47, in load_pretraining_corpus
    tokenized_dataset = datasets.load_from_disk(data_path)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/miniconda3/lib/python3.11/site-packages/datasets/load.py", line 1886, in load_from_disk
    raise FileNotFoundError(f"Directory {dataset_path} not found")
FileNotFoundError: Directory /localdisk/home/user/Work/Repositories/cramming/outputs/data/sanity-check-2_BPEx32768_324a8001208359684a2025ba5bd5f119 not found

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/localdisk/home/user/Work/Repositories/cramming/pretrain.py", line 196, in launch
    cramming.utils.main_launcher(cfg, main_training_process, job_name="pretraining")
  File "/localdisk/home/user/Work/Repositories/cramming/cramming/utils.py", line 54, in main_launcher
    metrics = main_fn(cfg, setup)
              ^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/Work/Repositories/cramming/pretrain.py", line 21, in main_training_process
    dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/Work/Repositories/cramming/cramming/data/pretraining_preparation.py", line 63, in load_pretraining_corpus
    preprocessed_dataset, new_tokenizer = preprocess_dataset(
                                          ^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/Work/Repositories/cramming/cramming/data/pretraining_preparation.py", line 120, in preprocess_dataset
    raw_dataset = datasets.load_dataset(
                  ^^^^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/miniconda3/lib/python3.11/site-packages/datasets/load.py", line 1810, in load_dataset
    ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/localdisk/home/user/miniconda3/lib/python3.11/site-packages/datasets/builder.py", line 1107, in as_dataset
    raise NotImplementedError(f"Loading a dataset cached in a {type(self._fs).__name__} is not supported.")
NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
```
Ok, so it's a problem with `LocalFileSystem`. Are you able to load huggingface datasets in normal settings (without this repo)?
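A quick way to check that, independent of this repo (`ag_news` matches the source in the log above; whether the cached reload fails depends on the installed `datasets` version):

```python
from datasets import load_dataset

ds = load_dataset("ag_news", split="train")  # downloads and caches
ds = load_dataset("ag_news", split="train")  # served from the local cache;
# roughly where the LocalFileSystem NotImplementedError would surface
print(ds)
```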
Thanks! The reply pointed me in the right direction. The problem turned out to be caused by the version of `datasets`; running `pip install -U datasets` solves the problem (2.12.0 -> 2.16.1).
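For reference, a quick sanity check after upgrading (the `LocalFileSystem` error is commonly a symptom of an older `datasets` paired with a newer `fsspec`; that pairing is an assumption here, not something verified in this thread):

```python
import datasets
import fsspec

print("datasets", datasets.__version__)  # 2.16.1 after the upgrade above
print("fsspec", fsspec.__version__)      # worth checking alongside datasets
```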
ok, I'm glad you were able to fix it!