Open Paper-Dragon opened 3 months ago
Need more details. How can I reproduce this?
Thanks for your reply. I have added some system information below; I hope it helps.
OS version:
root@x570-wifi:/home/user# cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.2 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.2 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
rocm-smi output:
root@x570-wifi:/home/user# rocm-smi
======================================= ROCm System Management Interface =======================================
================================================= Concise Info =================================================
Device [Model : Revision] Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
Name (20 chars) (Edge) (Avg) (Mem, Compute)
================================================================================================================
0 [0xe445 : 0xc5] 66.0°C 166.0W N/A, N/A 2575Mhz 96Mhz 54.9% auto 186.0W 22% 99%
Navi 22 [Radeon RX 6
================================================================================================================
============================================= End of ROCm SMI Log ==============================================
Installed ROCm package versions:
root@x570-wifi:/home/user# apt list | grep rocm
WARNING: apt does not have a stable CLI interface. Use with caution in scripts.
libfcgi-procmanager-maxrequests-perl/jammy,jammy 0.2-1.1 all
libfcgi-procmanager-perl/jammy,jammy 0.28-1.1 all
procmail-lib/jammy,jammy 1:2009.1202-4.1 all
procmail/jammy 3.22-26build2 amd64
procmeter3/jammy 3.6-3 amd64
rocm-bandwidth-test6.0.2/jammy 1.4.0.60002-115~22.04 amd64
rocm-bandwidth-test/jammy 1.4.0.60002-115~22.04 amd64
rocm-clang-ocl6.0.2/jammy 0.5.0.60002-115~22.04 amd64
rocm-clang-ocl/jammy,now 0.5.0.60002-115~22.04 amd64 [installed,automatic]
rocm-cmake6.0.2/jammy 0.11.0.60002-115~22.04 amd64
rocm-cmake/jammy,now 0.11.0.60002-115~22.04 amd64 [installed,automatic]
rocm-core6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-core/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-dbgapi6.0.2/jammy 0.71.0.60002-115~22.04 amd64
rocm-dbgapi/jammy,now 0.71.0.60002-115~22.04 amd64 [installed,automatic]
rocm-debug-agent6.0.2/jammy 2.0.3.60002-115~22.04 amd64
rocm-debug-agent/jammy,now 2.0.3.60002-115~22.04 amd64 [installed,automatic]
rocm-dev6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-dev/jammy 6.0.2.60002-115~22.04 amd64
rocm-developer-tools6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-developer-tools/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-device-libs6.0.2/jammy 1.0.0.60002-115~22.04 amd64
rocm-device-libs/jammy,now 1.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-dkms/jammy 6.0.2.60002-115~22.04 amd64
rocm-gdb6.0.2/jammy 13.2.60002-115~22.04 amd64
rocm-gdb/jammy,now 13.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-libraries6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-libraries/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-runtime-dev6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-runtime-dev/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocm-hip-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-khronos-cts/jammy 60002-115~22.04 amd64
rocm-language-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-language-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-libs6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-libs/jammy 6.0.2.60002-115~22.04 amd64
rocm-llvm6.0.2/jammy 17.0.0.24012.60002-115~22.04 amd64
rocm-llvm/jammy,now 17.0.0.24012.60002-115~22.04 amd64 [installed,automatic]
rocm-ml-libraries6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-ml-libraries/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-ml-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-ml-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-ocl-icd6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-ocl-icd/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-ocltst6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-ocltst/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl-dev6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl-dev/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-opencl-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-opencl-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocm-opencl-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-opencl-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-opencl6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-openmp-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-openmp-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-smi-lib6.0.2/jammy 6.0.0.60002-115~22.04 amd64
rocm-smi-lib/jammy,now 6.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-utils6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-utils/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-validation-suite6.0.2/jammy 1.0.60002.60002-115~22.04 amd64
rocm-validation-suite/jammy 1.0.60002.60002-115~22.04 amd64
rocm6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocminfo6.0.2/jammy 1.0.0.60002-115~22.04 amd64
rocminfo/jammy,now 1.0.0.60002-115~22.04 amd64 [installed,automatic]
torch and other Python package versions:
requests==2.31.0
torch==2.2.1
accelerate==0.27.0
transformers==4.38.1
datasets==2.17.1
numpy==1.26.4
gitpython==3.1.42
Relevant training code:
# Fine-tuning snippet: load a tokenizer and sequence-classification model,
# tokenize a dataset, then train and save with the Trainer API.
# NOTE(review): print_in_color, task_args and compute_metrics are not defined
# in this excerpt; they must come from the surrounding script.
print_in_color("Starting training...", "\033[34m")  # Blue for start

tokenizer = AutoTokenizer.from_pretrained(task_args["model_name"])


def tokenize_function(examples):
    """Tokenize the "text" field of a batch.

    Pads every sequence to "max_length" and truncates longer inputs.
    Uses the module-level ``tokenizer`` created above.
    """
    return tokenizer(
        examples["text"], padding="max_length", truncation=True
    )


model = AutoModelForSequenceClassification.from_pretrained(
    task_args["model_name"], num_labels=task_args["num_labels"]
)

# Use the first CUDA device when torch reports one, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"[debug] cuda status: {torch.cuda.is_available()}")
model.to(device)
print("[debug] load model done")

dataset = load_dataset(task_args["dataset_name"])
print("[debug] load dataset")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Take the first num_rows examples of the seeded shuffle of the "train" split.
small_train_dataset = (
    tokenized_datasets["train"].shuffle(seed=task_args["seed"]).select(range(task_args["num_rows"]))
)
# NOTE(review): the evaluation subset is built from the same "train" split with
# the same seed and row count, so it presumably contains the same rows as the
# training subset — confirm this is intended.
small_eval_dataset = (
    tokenized_datasets["train"].shuffle(seed=task_args["seed"]).select(range(task_args["num_rows"]))
)

# Evaluate and save a checkpoint once per epoch; outputs go under "my_model".
training_args = TrainingArguments(
    output_dir="my_model", evaluation_strategy="epoch", save_strategy='epoch',
)
print("[debug] start train")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model("my_model")
The error occurs on this call:
trainer.train()