Open rst0git opened 3 months ago
@fxkamd could you take a look at this?
@fdavid-amd is investigating
Could you attach your HelloWorld.cpp?
Could you attach your HelloWorld.cpp?
There is a dropdown in https://github.com/checkpoint-restore/criu/issues/2450#issue-2421617263 with the code of `HelloWorld.cpp`. This code is based on the example from the HIP-Examples repository.
A friendly reminder that this issue had no activity for 30 days.
Running `criu dump` for a simple ROCm application (`HelloWorld.cpp`) on Ubuntu 22.04 (`6.5.0-44-generic` kernel) causes a kernel crash. This problem occurs with CRIU installed from both the master and criu-dev branches.

HelloWorld.cpp:
```C++
#include <hip/hip_runtime.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <unistd.h>
// Version label of the sample; the value suggests it was carried over from the
// HIP-Examples application this program is based on.
#define SAMPLE_VERSION "HIP-Examples-Application-v1.0"
// Process exit statuses for main().
#define SUCCESS 0
#define FAILURE 1
// Lets the code below name cout/endl without the std:: prefix.
using namespace std;
/**
 * Device kernel: each thread reads one character from `in`, adds one to it and
 * stores the result at the same position in `out`.
 *
 * The element index is the thread's global x index. There is no bounds check,
 * so the launch must not create more threads than both buffers have elements.
 *
 * @param in   device buffer holding the input characters
 * @param out  device buffer receiving the shifted characters
 */
__global__ void helloworld(char* in, char* out)
{
    const int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    out[idx] = static_cast<char>(in[idx] + 1);
}
int main(int argc, char* argv[])
{
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
while (1) {
/* Initial input,output for the host and create memory objects for the kernel*/
const char* input = "GdkknVnqkc";
size_t strlength = strlen(input);
cout << "input string:" << endl;
cout << input << endl;
char *output = (char*) malloc(strlength + 1);
char* inputBuffer;
char* outputBuffer;
hipMalloc((void**)&inputBuffer, (strlength + 1) * sizeof(char));
hipMalloc((void**)&outputBuffer, (strlength + 1) * sizeof(char));
hipMemcpy(inputBuffer, input, (strlength + 1) * sizeof(char), hipMemcpyHostToDevice);
hipLaunchKernelGGL(
helloworld,
dim3(1),
dim3(strlength),
0, 0,
inputBuffer, outputBuffer
);
hipMemcpy(output, outputBuffer,(strlength + 1) * sizeof(char), hipMemcpyDeviceToHost);
hipFree(inputBuffer);
hipFree(outputBuffer);
//Add the terminal character to the end of output.
output[strlength] = '\0';
cout << "\noutput string:" << endl;
cout << output << endl;
free(output);
sleep(1);
}
std::cout<<"Passed!\n";
return SUCCESS;
}
```
rocminfo
journalctl --system
dmesg
A similar problem also occurs with kernel version `6.8.0-38-generic`:

dmesg
A similar problem occurs on RHEL 9.4 (`5.14.0-427.26.1.el9_4.x86_64`) when `criu dump` exits with an error (e.g., when the `--shell-job` option is not specified). This causes an immediate system reboot.