Open sumseq opened 3 months ago
Hi @sumseq,
Yes, you can override the detection with some hacks:
diff --git a/src/kernel.cpp b/src/kernel.cpp
@@ -3,7 +3,7 @@ string opencl_c_container() { return R( // ########################## begin of O
-)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
+//)+"#ifdef cl_khr_fp64"+R( // OpenCL C defines don't work in R() stringification macro
kernel void kernel_double(global float* data) {
double x = (double)get_global_id(0);
double y = (double)get_local_id(0);
@@ -13,7 +13,7 @@ kernel void kernel_double(global float* data) {
}
data[get_global_id(0)] = (float)y;
}
-)+"#endif"+R( // cl_khr_fp64
+//)+"#endif"+R( // cl_khr_fp64
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -114,7 +114,7 @@ struct Device_Info {
max_constant_buffer = (uint)(cl_device.getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>()/1024ull); // maximum constant buffer size in KB
compute_units = (uint)cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
clock_frequency = (uint)cl_device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
- is_fp64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp64");
+ is_fp64_capable = 1u; // (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>()* (uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp64");
is_fp32_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT>();
is_fp16_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF>()*(uint)contains(cl_device.getInfo<CL_DEVICE_EXTENSIONS>(), "cl_khr_fp16");
is_int64_capable = (uint)cl_device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG>();
@@ -242,9 +242,9 @@ private:
bool exists = false;
inline string enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available
"\n #define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
- "\n #ifdef cl_khr_fp64"
+// "\n #ifdef cl_khr_fp64"
"\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
- "\n #endif"
+// "\n #endif"
"\n #ifdef cl_khr_fp16"
"\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
"\n #endif"
But OpenCL C compilation will then likely fail with error -1, which would confirm that the M3 GPU does not support FP64.
Let me know the result!
Kind regards, Moritz
Hi,
I am trying to run the benchmark on an Apple M3 Max Laptop.
The benchmark runs but it thinks there is no FP64 available.
Is there a way to override this and force the benchmark to compile and run on FP64?
The results are shown below: