lshqqytiger / ZLUDA

CUDA on AMD GPUs
Apache License 2.0
182 stars 4 forks source link

found **Cannot get amd_mem_obj** error #11

Open Paper-Dragon opened 3 months ago

Paper-Dragon commented 3 months ago

:3:hip_memory.cpp           :3375: 729649642184 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649642292 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bc90040 )
:1:hip_memory.cpp           :3374: 729649642401 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bc90040

:3:hip_memory.cpp           :3375: 729649642555 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649642661 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bc90040 )
:1:hip_memory.cpp           :3374: 729649642773 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bc90040

:3:hip_memory.cpp           :3375: 729649642928 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649643035 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bc90040 )
:1:hip_memory.cpp           :3374: 729649643145 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bc90040

:3:hip_memory.cpp           :3375: 729649643404 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649643535 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649643654 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340

:3:hip_memory.cpp           :3375: 729649643808 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649643917 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649644033 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340

:3:hip_memory.cpp           :3375: 729649644189 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649644296 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649644406 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340

:3:hip_memory.cpp           :3375: 729649644559 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649644666 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649644774 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340

:3:hip_memory.cpp           :3375: 729649644928 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649645034 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649645142 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340

:3:hip_memory.cpp           :3375: 729649645295 us: [pid:3676907 tid:0x753391c17000] hipPointerGetAttributes: Returned hipErrorInvalidValue :
:3:hip_memory.cpp           :3327: 729649645402 us: [pid:3676907 tid:0x753391c17000]  hipPointerGetAttributes ( 0x7ffc0d014fc0, 0x5bfd1bbaa340 )
:1:hip_memory.cpp           :3374: 729649645510 us: [pid:3676907 tid:0x753391c17000] Cannot get amd_mem_obj for ptr: 0x5bfd1bbaa340
lshqqytiger commented 3 months ago

I need more details. How can I reproduce this?

Paper-Dragon commented 3 months ago

Thanks for your reply. I have added some system information; I hope it helps.

OS version:

root@x570-wifi:/home/user# cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.2 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.2 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy

rocm-smi

root@x570-wifi:/home/user# rocm-smi

======================================= ROCm System Management Interface =======================================
================================================= Concise Info =================================================
Device  [Model : Revision]    Temp    Power   Partitions      SCLK     MCLK   Fan    Perf  PwrCap  VRAM%  GPU%
        Name (20 chars)       (Edge)  (Avg)   (Mem, Compute)
================================================================================================================
0       [0xe445 : 0xc5]       66.0°C  166.0W  N/A, N/A        2575Mhz  96Mhz  54.9%  auto  186.0W   22%   99%
        Navi 22 [Radeon RX 6
================================================================================================================
============================================= End of ROCm SMI Log ==============================================

Package versions:

root@x570-wifi:/home/user# apt list | grep rocm

WARNING: apt does not have a stable CLI interface. Use with caution in scripts.

libfcgi-procmanager-maxrequests-perl/jammy,jammy 0.2-1.1 all
libfcgi-procmanager-perl/jammy,jammy 0.28-1.1 all
procmail-lib/jammy,jammy 1:2009.1202-4.1 all
procmail/jammy 3.22-26build2 amd64
procmeter3/jammy 3.6-3 amd64
rocm-bandwidth-test6.0.2/jammy 1.4.0.60002-115~22.04 amd64
rocm-bandwidth-test/jammy 1.4.0.60002-115~22.04 amd64
rocm-clang-ocl6.0.2/jammy 0.5.0.60002-115~22.04 amd64
rocm-clang-ocl/jammy,now 0.5.0.60002-115~22.04 amd64 [installed,automatic]
rocm-cmake6.0.2/jammy 0.11.0.60002-115~22.04 amd64
rocm-cmake/jammy,now 0.11.0.60002-115~22.04 amd64 [installed,automatic]
rocm-core6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-core/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-dbgapi6.0.2/jammy 0.71.0.60002-115~22.04 amd64
rocm-dbgapi/jammy,now 0.71.0.60002-115~22.04 amd64 [installed,automatic]
rocm-debug-agent6.0.2/jammy 2.0.3.60002-115~22.04 amd64
rocm-debug-agent/jammy,now 2.0.3.60002-115~22.04 amd64 [installed,automatic]
rocm-dev6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-dev/jammy 6.0.2.60002-115~22.04 amd64
rocm-developer-tools6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-developer-tools/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-device-libs6.0.2/jammy 1.0.0.60002-115~22.04 amd64
rocm-device-libs/jammy,now 1.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-dkms/jammy 6.0.2.60002-115~22.04 amd64
rocm-gdb6.0.2/jammy 13.2.60002-115~22.04 amd64
rocm-gdb/jammy,now 13.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-libraries6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-libraries/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-runtime-dev6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-runtime-dev/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-hip-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocm-hip-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-hip-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-khronos-cts/jammy 60002-115~22.04 amd64
rocm-language-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-language-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-libs6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-libs/jammy 6.0.2.60002-115~22.04 amd64
rocm-llvm6.0.2/jammy 17.0.0.24012.60002-115~22.04 amd64
rocm-llvm/jammy,now 17.0.0.24012.60002-115~22.04 amd64 [installed,automatic]
rocm-ml-libraries6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-ml-libraries/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-ml-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-ml-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-ocl-icd6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-ocl-icd/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-ocltst6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-ocltst/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl-dev6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl-dev/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-opencl-runtime6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-opencl-runtime/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocm-opencl-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-opencl-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-opencl6.0.2/jammy 2.0.0.60002-115~22.04 amd64
rocm-opencl/jammy,now 2.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-openmp-sdk6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-openmp-sdk/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-smi-lib6.0.2/jammy 6.0.0.60002-115~22.04 amd64
rocm-smi-lib/jammy,now 6.0.0.60002-115~22.04 amd64 [installed,automatic]
rocm-utils6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm-utils/jammy,now 6.0.2.60002-115~22.04 amd64 [installed,automatic]
rocm-validation-suite6.0.2/jammy 1.0.60002.60002-115~22.04 amd64
rocm-validation-suite/jammy 1.0.60002.60002-115~22.04 amd64
rocm6.0.2/jammy 6.0.2.60002-115~22.04 amd64
rocm/jammy,now 6.0.2.60002-115~22.04 amd64 [installed]
rocminfo6.0.2/jammy 1.0.0.60002-115~22.04 amd64
rocminfo/jammy,now 1.0.0.60002-115~22.04 amd64 [installed,automatic]

Versions of torch and other Python packages:

requests==2.31.0
torch==2.2.1
accelerate==0.27.0
transformers==4.38.1
datasets==2.17.1
numpy==1.26.4
gitpython==3.1.42

Part of the training code:

    print_in_color("Starting training...", "\033[34m")  # Blue for start

    tokenizer = AutoTokenizer.from_pretrained(task_args["model_name"])

    def tokenize_function(examples):
        return tokenizer(
            examples["text"], padding="max_length", truncation=True
        )

    model = AutoModelForSequenceClassification.from_pretrained(
        task_args["model_name"], num_labels=task_args["num_labels"]
    )
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"[debug] cuda status: {torch.cuda.is_available()}")
    model.to(device)
    print("[debug] load model done")
    dataset = load_dataset(task_args["dataset_name"])
    print("[debug] load dataset")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    small_train_dataset = (
        tokenized_datasets["train"].shuffle(seed=task_args["seed"]).select(range(task_args["num_rows"]))
    )
    small_eval_dataset = (
        tokenized_datasets["train"].shuffle(seed=task_args["seed"]).select(range(task_args["num_rows"]))
    )
    training_args = TrainingArguments(
        output_dir="my_model", evaluation_strategy="epoch", save_strategy='epoch',
    )
    print("[debug] start train")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model("my_model")

The error occurs on this line:

trainer.train()