ProjectPhysX / OpenCL-Benchmark

A small OpenCL benchmark program to measure peak GPU/CPU performance.
Other
161 stars 19 forks source link

Apple M3 Max no FP64 problem #15

Open sumseq opened 3 months ago

sumseq commented 3 months ago

Hi,

I am trying to run the benchmark on an Apple M3 Max Laptop.

The benchmark runs, but it reports FP64 compute as not supported.

Is there a way to override this and force the benchmark to compile and run with FP64?

The results are shown below:

|----------------.------------------------------------------------------------|
| Device ID    0 | Apple M3 Max                                               |
|----------------'------------------------------------------------------------|
|----------------.------------------------------------------------------------|
| Device ID      | 0                                                          |
| Device Name    | Apple M3 Max                                               |
| Device Vendor  | Apple                                                      |
| Device Driver  | 1.2 1.0 (macOS)                                            |
| OpenCL Version | OpenCL C 1.2                                               |
| Compute Units  | 30 at 1000 MHz (3840 cores, 7.680 TFLOPs/s)                |
| Memory, Cache  | 27648 MB, 0 KB global / 32 KB local                        |
| Buffer Limits  | 5184 MB global, 1048576 KB constant                        |
|----------------'------------------------------------------------------------|
| Info: OpenCL C code successfully compiled.                                  |
| FP64  compute                                          not supported        |
| FP32  compute                                         3.594 TFLOPs/s (1/2 ) |
| FP16  compute                                          not supported        |
| INT64 compute                                         0.691  TIOPs/s (1/12) |
| INT32 compute                                         2.600  TIOPs/s (1/3 ) |
| INT16 compute                                         2.558  TIOPs/s (1/3 ) |
| INT8  compute                                         2.561  TIOPs/s (1/3 ) |
| Memory Bandwidth ( coalesced read      )                        279.51 GB/s |
| Memory Bandwidth ( coalesced      write)                        254.68 GB/s |
| Memory Bandwidth (misaligned read      )                        282.53 GB/s |
| Memory Bandwidth (misaligned      write)                        254.50 GB/s |
| PCIe   Bandwidth (send                 )                         36.11 GB/s |
| PCIe   Bandwidth (   receive           )                         35.76 GB/s |
| PCIe   Bandwidth (        bidirectional)            (Gen4 x16)   41.25 GB/s |
|-----------------------------------------------------------------------------|
ProjectPhysX commented 3 months ago

Hi @sumseq,

Yes, you can override the detection with some hacks:

diff --git a/src/kernel.cpp b/src/kernel.cpp
@@ -3,7 +3,7 @@ string opencl_c_container() { return R( // ########################## begin of O

-)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
+//)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
 kernel void kernel_double(global float* data) {
        double x = (double)get_global_id(0);
        double y = (double)get_local_id(0);
@@ -13,7 +13,7 @@ kernel void kernel_double(global float* data) {
        }
        data[get_global_id(0)] = (float)y;
 }
-)+"#endif"+R( // cl_khr_fp64
+//)+"#endif"+R( // cl_khr_fp64

diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -114,7 +114,7 @@ struct Device_Info {
                max_constant_buffer = (uint)(cl_device.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>()/1024ull); // maximum constant buffer size in KB
                compute_units = (uint)cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
                clock_frequency = (uint)cl_device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
-               is_fp64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp64");
+               is_fp64_capable = 1u; // (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>()* (uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp64");
                is_fp32_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT>();
                is_fp16_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp16");
                is_int64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG>();
@@ -242,9 +242,9 @@ private:
        bool exists = false;
        inline string enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available
                "\n     #define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
-               "\n     #ifdef cl_khr_fp64"
+//             "\n     #ifdef cl_khr_fp64"
                "\n     #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
-               "\n     #endif"
+//             "\n     #endif"
                "\n     #ifdef cl_khr_fp16"
                "\n     #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
                "\n     #endif"

However, compiling the OpenCL C code will then likely fail with error -1, which would confirm that the M3 GPU does not support FP64. Let me know the result!

Kind regards, Moritz