Open zhuliyi0 opened 1 year ago
It's the LoRA Dreambooth script. It does not happen very often.
Probably yes. It's a Colab backend-related issue: the drive gets disconnected, so it can't save TensorBoard logs or any other file to Drive. See https://github.com/googlecolab/colabtools/issues/3525 https://github.com/googlecolab/colabtools/issues/3441 https://github.com/googlecolab/colabtools/issues/3436 https://github.com/googlecolab/colabtools/issues/3562
I'm also facing this issue. Have we found any workarounds?
I just learned to live with it :P I refresh the runtime after every training run.
And does the training resume from where it stopped? Otherwise, what's the point?
I had this error randomly occur several times, all during training. Seems like a connection issue?
Full error msg:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /content/kohya-trainer/train_network.py:752 in <module> │
│ │
│ 749 │ args = parser.parse_args() │
│ 750 │ args = train_util.read_config_from_file(args, parser) │
│ 751 │ │
│ ❱ 752 │ train(args) │
│ 753 │
│ │
│ /content/kohya-trainer/train_network.py:633 in train │
│ │
│ 630 │ │ │ │
│ 631 │ │ │ if args.logging_dir is not None: │
│ 632 │ │ │ │ logs = generate_step_logs(args, current_loss, avr_loss │
│ ❱ 633 │ │ │ │ accelerator.log(logs, step=global_step) │
│ 634 │ │ │ │
│ 635 │ │ │ if global_step >= args.max_train_steps: │
│ 636 │ │ │ │ break │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/accelerator.py:466 in │
│ wrapper │
│ │
│ 463 │ │ @wraps(func) │
│ 464 │ │ def wrapper(self, *args, **kwargs): │
│ 465 │ │ │ if self.is_main_process or not self.use_distributed: │
│ ❱ 466 │ │ │ │ return func(self, *args, **kwargs) │
│ 467 │ │ │
│ 468 │ │ return wrapper │
│ 469 │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/accelerator.py:1575 in log │
│ │
│ 1572 │ │ │ │ ``` │
│ 1573 │ │ """ │
│ 1574 │ │ for tracker in self.trackers: │
│ ❱ 1575 │ │ │ tracker.log(values, step=step, **log_kwargs.get(tracker.n │
│ 1576 │ │
│ 1577 │ @on_main_process │
│ 1578 │ def end_training(self): │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/tracking.py:207 in log │
│ │
│ 204 │ │ │ │ self.writer.add_text(k, v, global_step=step, **kwargs) │
│ 205 │ │ │ elif isinstance(v, dict): │
│ 206 │ │ │ │ self.writer.add_scalars(k, v, global_step=step, **kwar │
│ ❱ 207 │ │ self.writer.flush() │
│ 208 │ │ logger.debug("Successfully logged to TensorBoard") │
│ 209 │ │
│ 210 │ def finish(self): │
│ │
│ /usr/local/lib/python3.9/dist-packages/torch/utils/tensorboard/writer.py:120 │
│ 0 in flush │
│ │
│ 1197 │ │ if self.all_writers is None: │
│ 1198 │ │ │ return │
│ 1199 │ │ for writer in self.all_writers.values(): │
│ ❱ 1200 │ │ │ writer.flush() │
│ 1201 │ │
│ 1202 │ def close(self): │
│ 1203 │ │ if self.all_writers is None: │
│ │
│ /usr/local/lib/python3.9/dist-packages/torch/utils/tensorboard/writer.py:150 │
│ in flush │
│ │
│ 147 │ │ Call this method to make sure that all pending events have be │
│ 148 │ │ disk. │
│ 149 │ │ """ │
│ ❱ 150 │ │ self.event_writer.flush() │
│ 151 │ │
│ 152 │ def close(self): │
│ 153 │ │ """Flushes the event file to disk and close the file. │
│ │
│ /usr/local/lib/python3.9/dist-packages/tensorboard/summary/writer/event_file │
│ _writer.py:121 in flush │
│ │
│ 118 │ │ Call this method to make sure that all pending events have bee │
│ 119 │ │ written to disk. │
│ 120 │ │ """ │
│ ❱ 121 │ │ self._async_writer.flush() │
│ 122 │ │
│ 123 │ def close(self): │
│ 124 │ │ """Performs a final flush of the event file to disk, stops the │
│ │
│ /usr/local/lib/python3.9/dist-packages/tensorboard/summary/writer/event_file │
│ _writer.py:177 in flush │
│ │
│ 174 │ │ │ if self._closed: │
│ 175 │ │ │ │ raise IOError("Writer is closed") │
│ 176 │ │ │ self._byte_queue.join() │
│ ❱ 177 │ │ │ self._writer.flush() │
│ 178 │ │
│ 179 │ def close(self): │
│ 180 │ │ """Closes the underlying writer, flushing any pending writes f │
│ │
│ /usr/local/lib/python3.9/dist-packages/tensorboard/summary/writer/record_wri │
│ ter.py:43 in flush │
│ │
│ 40 │ │ self._writer.write(header + header_crc + data + footer_crc) │
│ 41 │ │
│ 42 │ def flush(self): │
│ ❱ 43 │ │ self._writer.flush() │
│ 44 │ │
│ 45 │ def close(self): │
│ 46 │ │ self._writer.close() │
│ │
│ /usr/local/lib/python3.9/dist-packages/tensorflow/python/lib/io/file_io.py:2 │
│ 19 in flush │
│ │
│ 216 │ data would survive an application crash but not necessarily an OS │
│ 217 │ """ │
│ 218 │ if self._writable_file: │
│ ❱ 219 │ self._writable_file.flush() │
│ 220 │
│ 221 def close(self): │
│ 222 │ r"""Closes the file. │
╰──────────────────────────────────────────────────────────────────────────────╯
FailedPreconditionError:
/content/drive/MyDrive/LoRA/logs/lora_lvchengguiyuxi_try10_token1style_10r_96d_l
r1cos1020230411055107/network_train/events.out.tfevents.1681192377.f7d356c2b99e.
82264.0; Transport endpoint is not connected
steps: 46% 10516/23000 [4:30:22<5:20:58, 1.54s/it, loss=0.0889]
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /usr/local/bin/accelerate:8 in <module> │
│ │
│ 5 from accelerate.commands.accelerate_cli import main │
│ 6 if __name__ == '__main__': │
│ 7 │ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) │
│ ❱ 8 │ sys.exit(main()) │
│ 9 │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/commands/accelerate_cli.py │
│ :45 in main │
│ │
│ 42 │ │ exit(1) │
│ 43 │ │
│ 44 │ # Run │
│ ❱ 45 │ args.func(args) │
│ 46 │
│ 47 │
│ 48 if __name__ == "__main__": │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/commands/launch.py:1104 in │
│ launch_command │
│ │
│ 1101 │ elif defaults is not None and defaults.compute_environment == Com │
│ 1102 │ │ sagemaker_launcher(defaults, args) │
│ 1103 │ else: │
│ ❱ 1104 │ │ simple_launcher(args) │
│ 1105 │
│ 1106 │
│ 1107 def main(): │
│ │
│ /usr/local/lib/python3.9/dist-packages/accelerate/commands/launch.py:567 in │
│ simple_launcher │
│ │
│ 564 │ process = subprocess.Popen(cmd, env=current_env) │
│ 565 │ process.wait() │
│ 566 │ if process.returncode != 0: │
│ ❱ 567 │ │ raise subprocess.CalledProcessError(returncode=process.return │
│ 568 │
│ 569 │
│ 570 def multi_gpu_launcher(args): │
╰──────────────────────────────────────────────────────────────────────────────╯
CalledProcessError: Command '['/usr/bin/python3', 'train_network.py',
'--sample_prompts=/content/LoRA/config/sample_prompt.txt',
'--dataset_config=/content/LoRA/config/dataset_config.toml',
'--config_file=/content/LoRA/config/config_file.toml']' returned non-zero exit
status 1.