Open tomwenseleers opened 1 year ago
So for completeness here also an implementation where a pure R Mandelbrot function is compiled to an OpenCL GPU version using the gpuMagic
R package - with my NVIDIA RTX A2000 Laptop GPU graphics card, that results in a 1333x speed increase (if I use float, 169x speed increase if I use double) compared to the pure R function (run single threaded, non-parallelized on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz) :
# Install gpuMagic from Bioconductor (bootstrapping BiocManager if needed)
if (!require("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("gpuMagic")
library(gpuMagic)
library(microbenchmark)
library(compiler)
enableJIT(3)

# List available OpenCL devices and pick one
getDeviceList()
setDevice(1)

# Region of the complex plane to render, output resolution, iteration cap
fromX <- -0.74877
toX <- -0.74872
fromY <- 0.065053
toY <- 0.065103
width <- 1080L
height <- 1080L
maxiter <- 10000L

# Colour palette: an RdYlBu ramp out and back, with black for in-set points
library(RColorBrewer)
cols <- c(
  colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdYlBu")))(100),
  colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(100),
  "black"
)
# define pure R Mandelbrot function to compile to OpenCL GPU version using gpuMagic
# Escape-time iteration count for pixel i (linear index 1..width*height) of a
# width x height view of the rectangle [fromX, toX] x [fromY, toY].
# Returns the 0-based iteration at which |z|^2 first exceeds 4, or maxiter - 1
# if the point never escapes. Written with floor() arithmetic (no %%) and an
# explicit return() so gpuMagic can compile it to an OpenCL kernel.
mandelbrotfun <- function(i, fromX, toX, fromY, toY, width, height, maxiter) {
  pixel_y <- 1 + floor((i - 1) / width)                      # Y pixel, 1..height
  pixel_x <- 1 + ((i - 1) - width * floor((i - 1) / width))  # X pixel, 1..width (= 1 + (i-1) %% width)
  iteration <- -1
  # Map pixel coordinates to a point c = re0 + im0*i in the complex plane
  re0 <- fromX + pixel_x * (toX - fromX) / width
  im0 <- fromY + pixel_y * (toY - fromY) / height
  zre <- 0
  zim <- 0
  # Iterate z <- z^2 + c until escape or the iteration budget is spent
  for (iteration in 0:(maxiter - 1)) {
    zre_next <- zre * zre - zim * zim + re0
    zim <- 2 * zre * zim + im0
    zre <- zre_next
    if (zre * zre + zim * zim > 4.0) {
      break
    }
  }
  return(iteration)
}
Timings of the CPU pure R version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :
# Time the pure-R CPU version: one mandelbrotfun call per pixel
system.time(
  res <- sapply(seq_len(width * height), mandelbrotfun,
                fromX, toX, fromY, toY, width, height, maxiter)
)
# ~120s single-threaded
m <- matrix(res, nrow = width)
par(mar = c(0, 0, 0, 0))  # no margins, image fills the device
system.time(
  image(m^(1/3), col = cols, asp = (toY - fromY) / (toX - fromX),
        axes = FALSE, useRaster = TRUE)
)
Timings of the GPU OpenCL version (on NVIDIA RTX A2000 Laptop GPU) : 169x faster than the CPU pure R version
gpuMagic.setOptions(default.float = "float") # change "float" to "double" for double precision; float is ca 6x faster at this problem size, but one can only zoom a factor of ~10^5 (vs ~10^12 with double) before hitting the limit of numerical accuracy
# warmup: the first call is always a little slower because of the R-to-OpenCL compilation
# doing the warmup with a cheap low-res (10x10) render
res <- gpuSapply(1:(10L*10L),
mandelbrotfun,
fromX, toX, fromY, toY, 10L, 10L, maxiter)
# benchmark the full-resolution GPU render
microbenchmark(res <- gpuSapply(1:(width*height),
mandelbrotfun,
fromX, toX, fromY, toY, width, height, maxiter))
# ~0.09s; for options see gpuSapply.getOption()
m <- matrix(res, nrow=width) # reshape linear pixel index back to a width x height image
par(mar=c(0, 0, 0, 0)) # no margins, image fills the device
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T))
# ~0.40s to display
# PS a much faster option to display the raster is using a native raster, see
# https://github.com/coolbutuseless/nara
Timings of two Rcpp versions I made :
# Install the mandelExplorer package (Rcpp Mandelbrot implementations) from GitHub
library(remotes)
remotes::install_github('tomwenseleers/mandelExplorer', update='none')
library(mandelExplorer)
library(microbenchmark)
Rcpp OpenMP double version, https://github.com/tomwenseleers/mandelExplorer/blob/main/src/mandelRcpp.cpp : 4x slower than the GPU float version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :
# Benchmark the Rcpp + OpenMP double-precision version
microbenchmark(
  m <- mandelRcpp(fromX, toX, fromY, toY, width, height, maxiter),
  unit = "s"
) # ~0.35s
par(mar = c(0, 0, 0, 0))  # no margins, image fills the device
system.time(
  image(m^(1/3), col = cols, asp = (toY - fromY) / (toX - fromX),
        axes = FALSE, useRaster = TRUE)
)
Rcpp OpenMP double version using vectorized SIMD intrinsics, https://github.com/tomwenseleers/mandelExplorer/blob/main/src/mandelRcpp.cpp (for larger width & height the GPU version can become faster though) : 1.2x faster than the GPU float version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :
# Benchmark the Rcpp + OpenMP + SIMD-intrinsics double-precision version
microbenchmark(
  m <- matrix(
    mandelRcpp2(fromX, toX, fromY, toY, width, height, maxiter),
    nrow = width
  ),
  unit = "s"
) # ~0.077s
# recode zero counts to maxiter (presumably the in-set pixels — confirm against mandelRcpp2)
m[m == 0] <- maxiter
m <- matrix(m, nrow = width)
par(mar = c(0, 0, 0, 0))  # no margins, image fills the device
system.time(
  image(m^(1/3), col = cols, asp = (toY - fromY) / (toX - fromX),
        axes = FALSE, useRaster = TRUE)
)
EDIT: It's also possible to pass manually written OpenCL code, which is a bit faster still, with the float OpenCL code then beating the speed of my Rcpp OpenMP+SIMD code, but using float instead of double :
# OpenCL Mandelbrot code using float ####
# ~3x faster than the Rcpp OpenMP+SIMD double version, ~4444x faster than the
# pure R version, but uses float instead of double
# NOTE(review): maxiter is hard-coded as 10000 in the kernel loop below, and
# x0/y0 are computed in double then narrowed to float — confirm both are intended
code_float='
kernel void mandelbrotOpenCL(global int* res,
global double* width, global double* height,
global double* fromX, global double* toX,
global double* fromY, global double* toY) {
int id = get_global_id(0);
int px = id % ((int) width[0]);
int py = id / ((int) width[0]);
int iteration;
float x0 = fromX[0] + ((double)px) * (toX[0] - fromX[0]) / width[0];
float y0 = fromY[0] + ((double)py) * (toY[0] - fromY[0]) / height[0];
float x = 0;
float y = 0;
for (iteration = 0; iteration < 10000; iteration++) {
float xn = x * x - y * y + x0;
y = 2 * x * y + y0;
x = xn;
if (x * x + y * y > 4.0) {
break;
}
}
res[(uint)(width[0] *py + px)] = iteration;
}
'
res_dev = gpuEmptMatrix(height, width, type='int') # device-side int buffer for iteration counts
microbenchmark(
{
.kernel(src = code_float,
kernel='mandelbrotOpenCL',
parms=list(res_dev, width, height, fromX, toX, fromY, toY),
.globalThreadNum = width*height)
res_dev <- download(res_dev) # copy result from GPU back to host
m <- matrix(res_dev, nrow=width)
}, unit="s")
# ~0.027s on NVIDIA RTX A2000 Laptop GPU
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T))
# OpenCL Mandelbrot code using double ####
# similar speed to the Rcpp OpenMP non-SIMD version, ~400x faster than the pure R version
# NOTE(review): maxiter is hard-coded as 10000 in the kernel loop below — confirm intended
code_double='
kernel void mandelbrotOpenCL(global int* res,
global double* width, global double* height,
global double* fromX, global double* toX,
global double* fromY, global double* toY) {
int id = get_global_id(0);
int px = id % ((int) width[0]);
int py = id / ((int) width[0]);
int iteration;
double x0 = fromX[0] + ((double)px) * (toX[0] - fromX[0]) / width[0];
double y0 = fromY[0] + ((double)py) * (toY[0] - fromY[0]) / height[0];
double x = 0;
double y = 0;
for (iteration = 0; iteration < 10000; iteration++) {
double xn = x * x - y * y + x0;
y = 2 * x * y + y0;
x = xn;
if (x * x + y * y > 4.0) {
break;
}
}
res[(uint)(width[0] *py + px)] = iteration;
}
'
res_dev = gpuEmptMatrix(height, width, type='int') # device-side int buffer for iteration counts
microbenchmark(
{
.kernel(src = code_double,
kernel='mandelbrotOpenCL',
parms=list(res_dev, width, height, fromX, toX, fromY, toY),
.globalThreadNum = width*height)
res_dev <- download(res_dev) # copy result from GPU back to host
m <- matrix(res_dev, nrow=width)
}, unit="s")
# ~0.30s on NVIDIA RTX A2000 Laptop GPU
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T))
Was just testing some timings of your vectorized Mandelbrot version against a naive non-vectorized version that I had lying around, and my non-vectorized version was somehow faster. Example:
My non-vectorized pure R version:
Your vectorized version:
The fastest vectorized Mandelbrot pure R code I found was that available at https://rosettacode.org/wiki/Mandelbrot_set :
Haven't dug in detail where these differences in performance come from.
For the record: none of these are particularly impressive timings - the fastest Rcpp OpenMP version I had runs on my Intel i9 16-core laptop at the timing reported at https://github.com/tomwenseleers/mandelExplorer.
So this last version runs about 1000x faster than the fastest pure R version I found. Nice thing is that that allows one to show real-time zooms at about 15 fps in R (using fast graphics from the nara package).
Supposedly, an OpenCL GPU version would run another 2 to 3 orders of magnitude faster still (e.g. as in https://github.com/benhiller/opencl-mandelbrot/blob/master/mandelbrot.cl) - was still planning to try to get that to run from R as well via
gpuR or gpuMagic, but didn't manage yet, just to see what performance benefit could be had...