BVLC / caffe

Caffe: a fast open framework for deep learning.
http://caffe.berkeleyvision.org/
Other
34.1k stars 18.7k forks source link

Segfault ScaleLayerTest/3.TestBackwardBroadcastMiddleInPlace #5610

Open maxlem opened 7 years ago

maxlem commented 7 years ago

Issue summary

Kernel compilation error during ScaleLayerTest/3.TestBackwardBroadcastMiddleInPlace

Not sure if this belongs to ROCm project or caffe..

Steps to reproduce

clone and build opencl branch 1.0-1015-gc60c950 (see attached makefile) make runtest

Makefile.zip

Your system configuration

Operating system: Ubuntu 16.10 with ROCm 1.5 kernel Compiler: g++ (Ubuntu 6.2.0-5ubuntu12) 6.2.0 20161005 CUDA version (if applicable): CUDNN version (if applicable): BLAS: open Python or MATLAB version (for pycaffe and matcaffe respectively): 2.7

[ RUN      ] ScaleLayerTest/3.TestBackwardBroadcastMiddleInPlace
clang version 4.0 
Target: amdgcn-amd-amdhsa-opencl
Thread model: posix
InstalledDir: /opt/rocm/opencl/bin/x86_64

========================================================

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = gfx801
error = -11
memory pattern = Cached global memory based block gemv, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 1, dims[0].y = 32, dims[0].x = 1, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 1, dims[1].y = 4, dims[1].x = 1, dims[1].bwidth = 4; ; 
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 540810656

Source:

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif

typedef union GPtr {
    __global float *f;
    __global double *d;
    __global float2 *f2v;
    __global double2 *d2v;
    __global float4 *f4v;
    __global double4 *d4v;
    __global float8 *f8v;
    __global double8 *d8v;
    __global float16 *f16v;
    __global double16 *d16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local double *d;
    __local float2 *f2v;
    __local double2 *d2v;
    __local float4 *f4v;
    __local double4 *d4v;
    __local float8 *f8v;
    __local double8 *d8v;
    __local float16 *f16v;
    __local double16 *d16v;
} LPtr;

typedef union PPtr {
    float *f;
    double *d;
    float2 *f2v;
    double2 *d2v;
    float4 *f4v;
    double4 *d4v;
    float8 *f8v;
    double8 *d8v;
    float16 *f16v;
    double16 *d16v;
} PPtr;

__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
dgemv(
    uint M,
    uint N,
    const double alpha,
    const __global double *restrict A,
    const __global double *restrict X,
    __global double *Y,
    uint lda)
{
    // M always denotes length of Y and N denotes length of X in the kernel
    double4 a0;
    double4 x0;
    double4 y0;
    y0 = 0;

    __local double4 localRes[64][1];
    uint coordA = (get_group_id(0) * 8 + get_local_id(0) % 8) * 4;
    uint k0 = (get_local_id(0) / 8) * 4;

    if (coordA < M && k0 < N) {
        const GPtr Ag = {(__global double*)A};
        const GPtr Xg = {(__global double*)X};

        uint Ntail = N % 4;
        N -= Ntail;

        uint k = k0;
        for (; k < N; k += 32) {
            const uint xk = k / 1;
            x0.s0 = Xg.d[xk + 0];
            x0.s1 = Xg.d[xk + 1];
            x0.s2 = Xg.d[xk + 2];
            x0.s3 = Xg.d[xk + 3];
            /* -- Tiles multiplier -- */
            const uint4 ay = {mad24(coordA % M, lda, 0u), mad24((coordA + 1) % M, lda, 0u), mad24((coordA + 2) % M, lda, 0u),
                        mad24((coordA + 3) % M, lda, 0u)};
            const uint4 ak = (uint4)(0, 1, 2, 3) + k;

            a0.s0 = Ag.d[ay.s0 + ak.s0];
            a0.s1 = Ag.d[ay.s0 + ak.s1];
            a0.s2 = Ag.d[ay.s0 + ak.s2];
            a0.s3 = Ag.d[ay.s0 + ak.s3];

            y0.s0 = mad(a0.s0, x0.s0, y0.s0);
            y0.s0 = mad(a0.s1, x0.s1, y0.s0);
            y0.s0 = mad(a0.s2, x0.s2, y0.s0);
            y0.s0 = mad(a0.s3, x0.s3, y0.s0);

            a0.s0 = Ag.d[ay.s1 + ak.s0];
            a0.s1 = Ag.d[ay.s1 + ak.s1];
            a0.s2 = Ag.d[ay.s1 + ak.s2];
            a0.s3 = Ag.d[ay.s1 + ak.s3];

            y0.s1 = mad(a0.s0, x0.s0, y0.s1);
            y0.s1 = mad(a0.s1, x0.s1, y0.s1);
            y0.s1 = mad(a0.s2, x0.s2, y0.s1);
            y0.s1 = mad(a0.s3, x0.s3, y0.s1);

            a0.s0 = Ag.d[ay.s2 + ak.s0];
            a0.s1 = Ag.d[ay.s2 + ak.s1];
            a0.s2 = Ag.d[ay.s2 + ak.s2];
            a0.s3 = Ag.d[ay.s2 + ak.s3];

            y0.s2 = mad(a0.s0, x0.s0, y0.s2);
            y0.s2 = mad(a0.s1, x0.s1, y0.s2);
            y0.s2 = mad(a0.s2, x0.s2, y0.s2);
            y0.s2 = mad(a0.s3, x0.s3, y0.s2);

            a0.s0 = Ag.d[ay.s3 + ak.s0];
            a0.s1 = Ag.d[ay.s3 + ak.s1];
            a0.s2 = Ag.d[ay.s3 + ak.s2];
            a0.s3 = Ag.d[ay.s3 + ak.s3];

            y0.s3 = mad(a0.s0, x0.s0, y0.s3);
            y0.s3 = mad(a0.s1, x0.s1, y0.s3);
            y0.s3 = mad(a0.s2, x0.s2, y0.s3);
            y0.s3 = mad(a0.s3, x0.s3, y0.s3);
            /* ---------------------- */
        }
        N += Ntail;
        if (k < N) {
            x0.s0 = X[k + 0 < N ? k : 0];
            x0.s1 = X[k + 1 < N ? k + 1 : 0];
            x0.s2 = X[k + 2 < N ? k + 2 : 0];
            x0.s3 = X[k + 3 < N ? k + 3 : 0];
            x0.s0 = k + 0 < N ? x0.s0 : 0;
            x0.s1 = k + 1 < N ? x0.s1 : 0;
            x0.s2 = k + 2 < N ? x0.s2 : 0;
            x0.s3 = k + 3 < N ? x0.s3 : 0;
            /* -- Tiles multiplier -- */
            const uint4 ay = {mad24(coordA % M, lda, 0u), mad24((coordA + 1) % M, lda, 0u), mad24((coordA + 2) % M, lda, 0u),
                        mad24((coordA + 3) % M, lda, 0u)};
            const uint4 ak = ((uint4)(0, 1, 2, 3) + k) % N;

            a0.s0 = Ag.d[ay.s0 + ak.s0];
            a0.s1 = Ag.d[ay.s0 + ak.s1];
            a0.s2 = Ag.d[ay.s0 + ak.s2];
            a0.s3 = Ag.d[ay.s0 + ak.s3];

            a0.s0 = (k < N) ? a0.s0 : 0;
            a0.s1 = (k + 1 < N) ? a0.s1 : 0;
            a0.s2 = (k + 2 < N) ? a0.s2 : 0;
            a0.s3 = (k + 3 < N) ? a0.s3 : 0;

            y0.s0 = mad(a0.s0, x0.s0, y0.s0);
            y0.s0 = mad(a0.s1, x0.s1, y0.s0);
            y0.s0 = mad(a0.s2, x0.s2, y0.s0);
            y0.s0 = mad(a0.s3, x0.s3, y0.s0);

            a0.s0 = Ag.d[ay.s1 + ak.s0];
            a0.s1 = Ag.d[ay.s1 + ak.s1];
            a0.s2 = Ag.d[ay.s1 + ak.s2];
            a0.s3 = Ag.d[ay.s1 + ak.s3];

            a0.s0 = (k < N) ? a0.s0 : 0;
            a0.s1 = (k + 1 < N) ? a0.s1 : 0;
            a0.s2 = (k + 2 < N) ? a0.s2 : 0;
            a0.s3 = (k + 3 < N) ? a0.s3 : 0;

            y0.s1 = mad(a0.s0, x0.s0, y0.s1);
            y0.s1 = mad(a0.s1, x0.s1, y0.s1);
            y0.s1 = mad(a0.s2, x0.s2, y0.s1);
            y0.s1 = mad(a0.s3, x0.s3, y0.s1);

            a0.s0 = Ag.d[ay.s2 + ak.s0];
            a0.s1 = Ag.d[ay.s2 + ak.s1];
            a0.s2 = Ag.d[ay.s2 + ak.s2];
            a0.s3 = Ag.d[ay.s2 + ak.s3];

            a0.s0 = (k < N) ? a0.s0 : 0;
            a0.s1 = (k + 1 < N) ? a0.s1 : 0;
            a0.s2 = (k + 2 < N) ? a0.s2 : 0;
            a0.s3 = (k + 3 < N) ? a0.s3 : 0;

            y0.s2 = mad(a0.s0, x0.s0, y0.s2);
            y0.s2 = mad(a0.s1, x0.s1, y0.s2);
            y0.s2 = mad(a0.s2, x0.s2, y0.s2);
            y0.s2 = mad(a0.s3, x0.s3, y0.s2);

            a0.s0 = Ag.d[ay.s3 + ak.s0];
            a0.s1 = Ag.d[ay.s3 + ak.s1];
            a0.s2 = Ag.d[ay.s3 + ak.s2];
            a0.s3 = Ag.d[ay.s3 + ak.s3];

            a0.s0 = (k < N) ? a0.s0 : 0;
            a0.s1 = (k + 1 < N) ? a0.s1 : 0;
            a0.s2 = (k + 2 < N) ? a0.s2 : 0;
            a0.s3 = (k + 3 < N) ? a0.s3 : 0;

            y0.s3 = mad(a0.s0, x0.s0, y0.s3);
            y0.s3 = mad(a0.s1, x0.s1, y0.s3);
            y0.s3 = mad(a0.s2, x0.s2, y0.s3);
            y0.s3 = mad(a0.s3, x0.s3, y0.s3);
            /* ---------------------- */
        }
    }
    localRes[get_local_id(0)][0] = y0;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(0) < 8 && coordA < M && k0 < N) {
        for (uint i = 1; i < 8; i++) {
            y0 += localRes[get_local_id(0) + i*8][0];
        }
        Y += coordA;
        double4 r0;
        GPtr uC;
        uC.f = Y;
        r0.s0 = Y[coordA + 0 >= M ? 0 : 0];
        r0.s1 = Y[coordA + 1 >= M ? 0 : 1];
        r0.s2 = Y[coordA + 2 >= M ? 0 : 2];
        r0.s3 = Y[coordA + 3 >= M ? 0 : 3];
        r0 = alpha * y0;
        Y[coordA + 3 >= M ? 0 : 3] = r0.s3;
        Y[coordA + 2 >= M ? 0 : 2] = r0.s2;
        Y[coordA + 1 >= M ? 0 : 1] = r0.s1;
        Y[coordA + 0 >= M ? 0 : 0] = r0.s0;
    }
}

Build log:

/tmp/AMD_2643_51/t_2643_53.cl:68:26: warning: incompatible pointer types initializing '__global float *' with an expression of
      type '__global double *' [-Wincompatible-pointer-types]
        const GPtr Ag = {(__global double*)A};
                         ^~~~~~~~~~~~~~~~~~~
/tmp/AMD_2643_51/t_2643_53.cl:69:26: warning: incompatible pointer types initializing '__global float *' with an expression of
      type '__global double *' [-Wincompatible-pointer-types]
        const GPtr Xg = {(__global double*)X};
                         ^~~~~~~~~~~~~~~~~~~
/tmp/AMD_2643_51/t_2643_53.cl:214:14: warning: incompatible pointer types assigning to '__global float *' from '__global double *'
      [-Wincompatible-pointer-types]
        uC.f = Y;
             ^ ~
3 warnings generated.
error: unable to execute command: Segmentation fault (core dumped)
error: clang frontend command failed due to signal (use -v to see invocation)
note: diagnostic msg: PLEASE submit a bug report to http://llvm.org/bugs/ and include the crash backtrace, preprocessed source, and associated run script.
note: diagnostic msg: Error generating preprocessed source(s) - no preprocessable inputs.
/opt/rocm/opencl/bin/x86_64/clang[0x217c00a]
/opt/rocm/opencl/bin/x86_64/clang[0x217a3be]
/opt/rocm/opencl/bin/x86_64/clang[0x217a510]
/lib/x86_64-linux-gnu/libpthread.so.0(+0x11630)[0x7f3af0b3a630]
/opt/rocm/opencl/bin/x86_64/clang[0x16fc8e1]
/opt/rocm/opencl/bin/x86_64/clang[0x1796683]
/opt/rocm/opencl/bin/x86_64/clang[0x1796ead]
/opt/rocm/opencl/bin/x86_64/clang[0x1392625]
/opt/rocm/opencl/bin/x86_64/clang[0x17650b8]
/opt/rocm/opencl/bin/x86_64/clang[0x176c339]
/opt/rocm/opencl/bin/x86_64/clang[0x1743007]
/opt/rocm/opencl/bin/x86_64/clang[0x20cbcea]
/opt/rocm/opencl/bin/x86_64/clang[0x20cbd83]
/opt/rocm/opencl/bin/x86_64/clang[0x20cc77f]
/opt/rocm/opencl/bin/x86_64/clang[0x598456]
/opt/rocm/opencl/bin/x86_64/clang[0x59a8a3]
/opt/rocm/opencl/bin/x86_64/clang[0x576ceb]
/opt/rocm/opencl/bin/x86_64/clang[0x8ed2ae]
/opt/rocm/opencl/bin/x86_64/clang[0x8c2ad5]
/opt/rocm/opencl/bin/x86_64/clang[0x5720bd]
/opt/rocm/opencl/bin/x86_64/clang[0x56f208]
/opt/rocm/opencl/bin/x86_64/clang[0x52326a]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf1)[0x7f3af07813f1]
/opt/rocm/opencl/bin/x86_64/clang[0x5694c1]
Stack dump:
0.      Program arguments: /opt/rocm/opencl/bin/x86_64/clang -cc1 -triple amdgcn-amd-amdhsa-opencl -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name t_2643_72.bc -mrelocation-model static -mthread-model posix -mdisable-fp-elim -fmath-errno -masm-verbose -mconstructor-aliases -target-cpu carrizo -dwarf-column-info -debug-info-kind=limited -dwarf-version=2 -debugger-tuning=gdb -resource-dir /opt/rocm/opencl/bin/lib/clang/4.0 -O3 -fdebug-compilation-dir /home/maxime/workspace/caffe/caffe -ferror-limit 19 -fmessage-length 130 -cl-kernel-arg-info -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all -o /tmp/t_2643_72-2d0238.o -x ir /tmp/AMD_2643_57/t_2643_72.bc 
1.      Code generation
2.      Running pass 'Function Pass Manager' on module '/tmp/AMD_2643_57/t_2643_72.bc'.
3.      Running pass 'Machine Instruction Scheduler' on function '@dgemv'
Error: Creating the executable failed: Compiling LLVM IRs to executable

========================================================

*** Aborted at 1494456158 (unix time) try "date -d @1494456158" if you are using GNU date ***
PC: @     0x7fbece4ba160 makeKernelCached
*** SIGSEGV (@0x0) received by PID 2643 (TID 0x7fbed6e00800) from PID 0; stack trace: ***
    @     0x7fbed1cda630 (unknown)
    @     0x7fbece4ba160 makeKernelCached
    @     0x7fbece4be88e makeSolutionSeq
    @     0x7fbece48a594 (unknown)
    @     0x7fbece48a797 clblasDgemv
    @     0x7fbed26739d2 caffe::greentea_gpu_gemv<>()
    @     0x7fbed281001d caffe::ScaleLayer<>::Backward_gpu()
    @     0x55bd2791a64c caffe::Layer<>::Backward()
    @     0x55bd27937492 caffe::ScaleLayerTest_TestBackwardBroadcastMiddleInPlace_Test<>::TestBody_Impl()
    @     0x55bd27d6d564 testing::internal::HandleExceptionsInMethodIfSupported<>()
    @     0x55bd27d66f9a testing::Test::Run()
    @     0x55bd27d670e8 testing::TestInfo::Run()
    @     0x55bd27d671c5 testing::TestCase::Run()
    @     0x55bd27d67487 testing::internal::UnitTestImpl::RunAllTests()
    @     0x55bd27d677a3 testing::UnitTest::Run()
    @     0x55bd278e2a29 main
    @     0x7fbed19213f1 __libc_start_main
    @     0x55bd278ed39a _start
    @                0x0 (unknown)
Makefile:672 : la recette pour la cible « runtest » a échouée
make: *** [runtest] Erreur de segmentation (core dump créé)
gstoner commented 7 years ago

ROCm has guard pages enabled so if you have out of bounds memory references the app will fault. In the dmeseg log there more info on the event.

maxlem commented 7 years ago

Will provide that asap

On Thu, May 11, 2017 at 1:21 AM, Gregory Stoner notifications@github.com wrote:

ROCm has guard pages enabled so if you have out of bounds memory references the app will fault. In the dmeseg log there more info on the event.

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/BVLC/caffe/issues/5610#issuecomment-300685156, or mute the thread https://github.com/notifications/unsubscribe-auth/AJ-Jqdq3lMYMbYh78VtqNWhWVXpNMek5ks5r4prtgaJpZM4NXU-f .

maxlem commented 7 years ago

All I can see in dmesg is:

[ 119.943603] clang[2141]: segfault at 1028 ip 00000000016fc8e1 sp 00007fffc99bda08 error 4 in clang[400000+250e000