hughperkins / coriander

Build NVIDIA® CUDA™ code for OpenCL™ 1.2 devices
Apache License 2.0
839 stars 88 forks source link

Unittests test_clone and test_create_cl_kernel fail #15

Closed OursDesCavernes closed 7 years ago

OursDesCavernes commented 7 years ago

Hello, I ran the unit test and 2 of them fail:

[==========] 79 tests from 13 test cases ran. (570 ms total)
[  PASSED  ] 77 tests.
[  FAILED  ] 2 tests, listed below:
[  FAILED  ] test_struct_cloner.test_clone
[  FAILED  ] test_hostside_opencl_funcs.test_create_cl_kernel

test_struct_cloner.test_clone

[ RUN      ] test_struct_cloner.test_clone
structDefinitions struct mystruct {
    int f0;
    float f1;
    global float* f2;
    int f3;
    global float* f4;
};

structDefinition foo {
    int f0;
    float f1;
    global float* f2;
    int f3;
    global float* f4;
};

structNoPtrCl foo {
    int f0;
    float f1;
    int f2;
};

clCopyCode dest.f0 = src.f0;
dest.f1 = src.f1;
dest.f2 = 0;
dest.f3 = src.f2;
dest.f4 = 0;

testIR [; ModuleID = 'hostsideM'
source_filename = "hostsideM"

%"struct mystruct" = type { i32, float, float*, i32, float* }
%"struct mystruct_nopointers" = type { i32, float, i32 }

define void @testfunc() {
entry:
  %0 = alloca %"struct mystruct"
  %1 = alloca %"struct mystruct_nopointers"
  %2 = getelementptr inbounds %"struct mystruct", %"struct mystruct"* %0, i32 0, i32 0
  %3 = getelementptr inbounds %"struct mystruct_nopointers", %"struct mystruct_nopointers"* %1, i32 0, i32 0
  %loadint = load i32, i32* %2
  store volatile i32 %loadint, i32* %3
  %4 = getelementptr inbounds %"struct mystruct", %"struct mystruct"* %0, i32 0, i32 1
  %5 = getelementptr inbounds %"struct mystruct_nopointers", %"struct mystruct_nopointers"* %1, i32 0, i32 1
  %loadint1 = load float, float* %4
  store volatile float %loadint1, float* %5
  %6 = getelementptr inbounds %"struct mystruct", %"struct mystruct"* %0, i32 0, i32 3
  %7 = getelementptr inbounds %"struct mystruct_nopointers", %"struct mystruct_nopointers"* %1, i32 0, i32 2
  %loadint2 = load i32, i32* %6
  store volatile i32 %loadint2, i32* %7
  ret void
}
]
/home/thomas/src/cuda-on-cl/test/gtest/test_struct_cloner.cpp:159: Failure
Value of: testIR
  Actual: "; ModuleID = 'hostsideM'\nsource_filename = \"hostsideM\"\n\n%\"struct mystruct\" = type { i32, float, float*, i32, float* }\n%\"struct mystruct_nopointers\" = type { i32, float, i32 }\n\ndefine void @testfunc() {\nentry:\n  %0 = alloca %\"struct mystruct\"\n  %1 = alloca %\"struct mystruct_nopointers\"\n  %2 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 0\n  %3 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 0\n  %loadint = load i32, i32* %2\n  store volatile i32 %loadint, i32* %3\n  %4 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 1\n  %5 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 1\n  %loadint1 = load float, float* %4\n  store volatile float %loadint1, float* %5\n  %6 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 3\n  %7 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 2\n  %loadint2 = load i32, i32* %6\n  store volatile i32 %loadint2, i32* %7\n  ret void\n}\n"
Expected: expectedIR
Which is: "; ModuleID = 'hostsideM'\n\n%\"struct mystruct\" = type { i32, float, float*, i32, float* }\n%\"struct mystruct_nopointers\" = type { i32, float, i32 }\n\ndefine void @testfunc() {\nentry:\n  %0 = alloca %\"struct mystruct\"\n  %1 = alloca %\"struct mystruct_nopointers\"\n  %2 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 0\n  %3 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 0\n  %loadint = load i32, i32* %2\n  store volatile i32 %loadint, i32* %3\n  %4 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 1\n  %5 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 1\n  %loadint1 = load float, float* %4\n  store volatile float %loadint1, float* %5\n  %6 = getelementptr inbounds %\"struct mystruct\", %\"struct mystruct\"* %0, i32 0, i32 3\n  %7 = getelementptr inbounds %\"struct mystruct_nopointers\", %\"struct mystruct_nopointers\"* %1, i32 0, i32 2\n  %loadint2 = load i32, i32* %6\n  store volatile i32 %loadint2, i32* %7\n  ret void\n}\n"
[  FAILED  ] test_struct_cloner.test_clone (1 ms)

test_hostside_opencl_funcs.test_create_cl_kernel

[ RUN      ] test_hostside_opencl_funcs.test_create_cl_kernel
Using Advanced Micro Devices, Inc. , OpenCL platform: AMD Accelerated Parallel Processing
Using OpenCL device: Ellesmere
building kernel myKernel
 ... built
hostdata[0] 0
/home/thomas/src/cuda-on-cl/test/gtest/test_hostside_opencl_funcs.cpp:55: Failure
Value of: hostdata[0]
  Actual: 0
Expected: 123.0f
Which is: 123
[  FAILED  ] test_hostside_opencl_funcs.test_create_cl_kernel (85 ms)

System info:

kernel: 4.10.10-gentoo; opencl: amdgpu-pro-opencl-16.60.1.379184 on top of the amdgpu open source stack; llvm: 3.9.1; hardware: AMD RX480

clinfo output:

Platform #0
  Name:                                  AMD Accelerated Parallel Processing
  Version:                               OpenCL 2.0 AMD-APP (2264.10)

  Device #0
    Name:                                Ellesmere
    Type:                                GPU
    Version:                             OpenCL 1.2 AMD-APP (2264.10)
    Global memory size:                  7 GB 149 MB 396 kB 
    Local memory size:                   32 kB 
    Max work group size:                 256
    Max work group size:                 (256, 256, 256)

  Device #1
    Name:                                Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz
    Type:                                CPU
    Version:                             OpenCL 1.2 AMD-APP (2264.10)
    Global memory size:                  7 GB 703 MB 352 kB 
    Local memory size:                   32 kB 
    Max work group size:                 1024
    Max work item sizes:                 (1024, 1024, 1024)

How would you read these results ?

Thanks, Thomas.

hughperkins commented 7 years ago

For the first one, it doesn't look very serious, just an extra source_filename line. I'm not sure where that comes from. I grepped the repository for source_filename, but didn't find it. I'm wondering if you're using a slightly different version of llvm somehow? Do you have some way of confirming which version of llvm you are using?

hughperkins commented 7 years ago

The second one is trickier. Unfortunately I don't have access to a Radeon; you will basically need to poke around to find out how far the 123.0f is getting. It should be set by the kernel (https://github.com/hughperkins/cuda-on-cl/blob/master/test/gtest/test_hostside_opencl_funcs.cpp#L34). This should be a relatively straightforward test, nothing too weird happening. The test is pretty much written in OpenCL, no CUDA involved, so it should be fairly easy to poke around. clEnqueueReadBuffer is documented here: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueReadBuffer.html

Oh, it might be that you need to pass the queue into the run_1d, like run_1d(v->currentContext->default_stream.get()->clqueue->queue, 32, 32) (maybe with a * or a & in front of that queue expression). I don't know if that will fix it, but it might.

OursDesCavernes commented 7 years ago

exact llvm version: sys-devel/llvm 3.9.1-r1 Options:

# equery u sys-devel/llvm
[ Legend : U - final flag setting for installation]
[        : I - package is installed with flag     ]
[ Colors : set, unset                             ]
 * Found these USE flags for sys-devel/llvm-3.9.1-r1:
 U I
 - - abi_x86_32               : 32-bit (x86) libraries
 + + clang                    : Build the clang C/C++ compiler
 - - debug                    : Enable extra debug codepaths, like asserts and extra output. If you want to get meaningful backtraces see
                                https://wiki.gentoo.org/wiki/Project:Quality_Assurance/Backtraces
 - - default-compiler-rt      : Use compiler-rt instead of libgcc as the default rtlib for clang
 - - default-libcxx           : Use libc++ instead of libstdc++ as the default stdlib for clang
 - - doc                      : Build and install the HTML documentation and regenerate the man pages
 - - gold                     : Build the gold linker plugin
 - - libedit                  : Use the libedit library (replacement for readline)
 + + libffi                   : Enable support for Foreign Function Interface library
 - - llvm_targets_AArch64     : AArch64 CPU target (arm64 in Gentoo)
 + + llvm_targets_AMDGPU      : AMDGPU target (supports R600 and GCN GPUs)
 - - llvm_targets_ARM         : ARM CPU target
 - - llvm_targets_BPF         : Berkeley Packet Filter target
 - - llvm_targets_Hexagon     : Qualcomm Hexagon DSP target
 - - llvm_targets_MSP430      : MSP430 CPU target (experimental)
 - - llvm_targets_Mips        : MIPS CPU target (includes MIPS64)
 - - llvm_targets_NVPTX       : NVIDIA PTX (GPU) target (32-bit and 64-bit)
 - - llvm_targets_PowerPC     : PowerPC CPU target (PPC32 and PPC64)
 - - llvm_targets_Sparc       : Sparc CPU target
 - - llvm_targets_SystemZ     : SystemZ (s390x) CPU target
 - - llvm_targets_XCore       : XCore CPU target
 - - multitarget              : Build all host targets (default: host only)
 + + ncurses                  : Support querying terminal properties using ncurses' terminfo
 - - ocaml                    : Add support/bindings for the Ocaml language
 + + python                   : Add optional support/bindings for the Python language
 + + python_targets_python2_7 : Build with Python 2.7
 + + sanitize                 : Build compiler-rt's sanitizers
 + + static-analyzer          : Install the Clang static analyzer (requires USE=clang)
 - - test                     : Workaround to pull in packages needed to run with FEATURES=test. Portage-2.1.2 handles this internally, so don't set it in
                                make.conf/package.use anymore
 + + xml                      : Add support for XML files

(+ + means the option is enabled and currently installed; - - is the opposite.) Does it help you in any way?

I'm trying to get the same tests running on beignet, but I can't get OpenCL working with it; that has nothing to do with cuda-on-cl.

hughperkins commented 7 years ago

Please use llvm 3.8

hughperkins commented 7 years ago

test_hostside_opencl_funcs fixed, in c9d684c. Piccie on radeon and hd:

screen shot 2017-04-29 at 7 23 49 pm
hughperkins commented 7 years ago

(and for the other one, it's a trivial issue you can ignore; but if you really want to fix it, the current supported fix is: install llvm-3.8 :-) )