mylesmharrison / mandelbrot

The Mandelbrot Set in R
7 stars 5 forks source link

More efficient non-vectorized version #1

Open tomwenseleers opened 1 year ago

tomwenseleers commented 1 year ago

Was just testing some timings of your vectorized Mandelbrot version against a naive non-vectorized version that I had lying around, and my non-vectorized version was somehow faster. Example:

My non-vectorized pure R version:

mandelR <- function(x_min, x_max, y_min, y_max, res_x, res_y, nb_iter) {
  ret <- matrix(NA, nrow = res_x, ncol = res_y)
  x_step <- (x_max - x_min) / res_x
  y_step <- (y_max - y_min) / res_y
  for (r in 1:res_y) {
    for (c in 1:res_x) {
      zx <- 0.0
      zy <- 0.0
      new_zx <- 0.0
      cx <- x_min + c*x_step
      cy <- y_min + r*y_step
      n <- 0
      n0 <- 0
      q <- (cx-0.25)*(cx-0.25) + cy*cy
      # if((q*(q+(cx-0.25)))<(0.25*cy*cy)) { n0 <- nb_iter } # cardioid test, left this out for now
      for (n in n0:nb_iter) {
        if (zx*zx + zy*zy >= 4.0) { break }
        new_zx <- zx*zx - zy*zy + cx
        zy <- 2.0*zx*zy + cy
        zx <- new_zx
      }
      ret[c,r] <- n
    }
  }
  return(ret)
}

system.time(m <- mandelR(-0.766032578179731, -0.766032578179529, 
                         0.10086220543088, 0.10086220543102, 
                         640, 640, 10000)) # 39s
image(m^0.1)

Your vectorized version:

mandelbrot_vectorized <- function(x_min=-2, x_max=2, 
                                  y_min=-1.5, y_max=1.5, 
                                  res_x=500, res_y=500,
                                  nb_iter=10000 # showplot=TRUE,
                                  # cols=colorRampPalette(c("blue","yellow","red","black"))(11)
                                  ) 
{

  # variables
  x <- seq(x_min, x_max, length.out=res_x)
  y <- seq(y_min, y_max, length.out=res_y)
  c <- outer(x,y*1i,FUN="+")
  z <- matrix(0.0, nrow=length(x), ncol=length(y))
  ret <- matrix(0.0, nrow=length(x), ncol=length(y))

  for (rep in 1:nb_iter) { 
    index <- which(Mod(z) < 2)
    z[index] <- z[index]^2 + c[index]
    ret[index] <- ret[index] + 1
  }

  # if (showplot==TRUE) { image(x,y,k,col=cols, xlab="Re(c)", ylab="Im(c)")}

  return(ret)

}

system.time(m <- mandelbrot_vectorized(-0.766032578179731, -0.766032578179529, 
                                       0.10086220543088, 0.10086220543102, 
                                       640, 640, 10000)) # 53s
image(m^0.1)

The fastest vectorized Mandelbrot pure R code I found was that available at https://rosettacode.org/wiki/Mandelbrot_set :

iterate.until.escape <- function(z, c, trans, cond, max=500, response=dwell) {
  #we iterate all active points in the same array operation,
  #and keeping track of which points are still iterating.
  active <- seq_along(z)
  dwell <- z
  dwell[] <- 0
  for (i in 1:max) {
    z[active] <- trans(z[active], c[active]);
    survived <- cond(z[active])
    dwell[active[!survived]] <- i
    active <- active[survived]
    if (length(active) == 0) break
  }
  eval(substitute(response))
}

mandelR = function(x_min, x_max, y_min, y_max, res_x, res_y, nb_iter) {
  re = seq(x_min, x_max, len=res_x)
  im = seq(y_min, y_max, len=res_y)
  c <- outer(re, im, function(x,y) complex(real=x, imaginary=y))
  ret <- iterate.until.escape(array(0, dim(c)), c,
                          function(z,c)z^2+c, function(z)abs(z) <= 2,
                          max=nb_iter)
  return(ret)
}

system.time(m <- mandelR(-0.766032578179731, -0.766032578179529, 
                         0.10086220543088, 0.10086220543102, 
                         640, 640, 10000)) # 19.64s

image(m^0.1)

Haven't dug in detail where these differences in performance come from.

For the record: none of these are particularly impressive timings - the fastest Rcpp OpenMP version I had on my Intel i9 16 core laptop runs at (see https://github.com/tomwenseleers/mandelExplorer)

# Rcpp versions
install.packages("remotes")
library(remotes)
remotes::install_github("tomwenseleers/mandelExplorer", upgrade="never")
library(mandelExplorer)
library(microbenchmark)

# OpenMP version
microbenchmark(m <- mandelRcpp(as.double(-0.766032578179731), as.double(-0.766032578179529), 
                        as.double(0.10086220543088), as.double(0.10086220543102), 
                        as.integer(640), 
                        as.integer(640), 10000L), unit="s") # 0.11s 

# faster OpenMP version using vectorized SIMD intrinsics
microbenchmark(m <- matrix(mandelRcpp2(as.double(-0.766032578179731), as.double(-0.766032578179529), 
                                       as.double(0.10086220543088), as.double(0.10086220543102), 
                        as.integer(640), 
                        as.integer(640), 10000L), nrow=640),
               unit="s") # 0.02s

So this last version runs about 1000x faster than the fastest pure R version I found. Nice thing is that that allows one to show real-time zooms at about 15 fps in R (using fast graphics from the nara package).

Supposedly, an OpenCL GPU version would run another 2 to 3 orders of magnitude faster still (e.g. as in https://github.com/benhiller/opencl-mandelbrot/blob/master/mandelbrot.cl) - was still planning to try to get that to run from R as well via gpuRor gpuMagic, but didn't manage yet, just to see what performance benefit could be had...

tomwenseleers commented 1 year ago

So for completeness here also an implementation where a pure R Mandelbrot function is compiled to an OpenCL GPU version using the gpuMagicR package - with my NVIDIA RTX A2000 Laptop GPU graphics card, that results in a 1333x speed increase (if I use float, 169x speed increase if I use double) compared to the pure R function (run single threaded, non-parallelized on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz) :

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("gpuMagic")
library(gpuMagic)
library(microbenchmark)
library(compiler)
enableJIT(3)

getDeviceList()
setDevice(1)

fromX = -0.74877 
toX = -0.74872 
fromY = 0.065053 
toY = 0.065103 
width = 1080L
height = 1080L
maxiter = 10000L;
# colour palette
library(RColorBrewer)
cols =  c(colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdYlBu")))(100), 
   colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(100), 
          "black") 

# define pure R Mandelbrot function to compile to OpenCL GPU version using gpuMagic
mandelbrotfun <- function(i, fromX, toX, fromY, toY, width, height, maxiter){
  py = 1+floor((i-1)/width) # Y pixel, 1...height
  px = 1+((i-1) - width * floor((i-1) / width)) # = 1+(i-1) %% width = X pixel, 1...width

  iteration = -1
  x0 = fromX + px * (toX - fromX) / width
  y0 = fromY + py * (toY - fromY) / height
  x = 0
  y = 0
  for (iteration in 0:(maxiter-1)) {
        xn = x * x - y * y + x0;
        y = 2 * x * y + y0;
        x = xn;
        if (x * x + y * y > 4.0) {
          break;
        }
  }
  return(iteration)
}

Timings of the CPU pure R version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :

system.time(res <- sapply(1:(width*height), mandelbrotfun, 
                          fromX, toX, fromY, toY, width, height, maxiter))

# 120s
m <- matrix(res, nrow=width)
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T)) 

mandelbrot

Timings of the GPU OpenCL version (on NVIDIA RTX A2000 Laptop GPU) : 169x faster than the CPU pure R version

gpuMagic.setOptions(default.float = "float") # unmark if you would like to use double (float is ca 6x faster with given problem size, but of course one can only zoom a factor of 10^5 vs 10^12 with double before hitting the limit of numerical accuracy
# warmup, first time it is called timing is always a little longer due to R to OpenCl compilation time
# doing warmup with a low res render
res <- gpuSapply(1:(10L*10L), 
                 mandelbrotfun, 
                 fromX, toX, fromY, toY, 10L, 10L, maxiter) 
microbenchmark(res <- gpuSapply(1:(width*height), 
                             mandelbrotfun, 
                             fromX, toX, fromY, toY, width, height, maxiter))
# 0.09s, for options see gpuSapply.getOption()
m <- matrix(res, nrow=width)
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T)) 
# 0.40s
# PS much faster option to display raster is using a native raster, 
# https://github.com/coolbutuseless/nara

Timings of two Rcpp versions I made :

library(remotes)
remotes::install_github('tomwenseleers/mandelExplorer', update='none')
library(mandelExplorer)
library(microbenchmark)   

Rcpp OpenMP double version, https://github.com/tomwenseleers/mandelExplorer/blob/main/src/mandelRcpp.cpp : 4x slower than the GPU float version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :

microbenchmark(m <- mandelRcpp(fromX, toX, 
                               fromY, toY, 
                               width, 
                               height, maxiter), unit="s") # 0.35s
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T)) 

Rcpp OpenMP double version using vectorized SIMD intrinsics, https://github.com/tomwenseleers/mandelExplorer/blob/main/src/mandelRcpp.cpp (for larger width & height the GPU version can become faster though) : 1.2x faster than the GPU float version on my 16 core 11th Gen Intel(R) Core(TM) i9-11950H @ 2.60GHz :

microbenchmark(m <- matrix(mandelRcpp2(fromX, toX, 
                                       fromY, toY, 
                                       width, 
                                       height, maxiter), 
                           nrow=width),
               unit="s") # 0.077s
m[m==0] <- maxiter
m <- matrix(m, nrow=width)
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T)) 

EDIT: It's also possible to pass manually written OpenCL code, which is a bit faster still, with the float OpenCL code then beating the speed of my Rcpp OpenMP+SIMD code, but using float instead of double :

# OpenCL Mandelbrot code using float #### 
# 3x faster than Rcpp OpenMP+SIMD double version, 4444x faster than pure R version, but float instead of double
code_float='
kernel void mandelbrotOpenCL(global int* res, 
          global double* width, global double* height, 
          global double* fromX, global double* toX, 
          global double* fromY, global double* toY) {
    int id = get_global_id(0);
    int px = id % ((int) width[0]);
    int py = id / ((int) width[0]);
    int iteration;
    float x0 = fromX[0] + ((double)px) * (toX[0] - fromX[0]) / width[0];
    float y0 = fromY[0] + ((double)py) * (toY[0] - fromY[0]) / height[0];
    float x = 0;
    float y = 0;
    for (iteration = 0; iteration < 10000; iteration++) {
      float xn = x * x - y * y + x0;
      y = 2 * x * y + y0;
      x = xn;
      if (x * x + y * y > 4.0) {
        break;
      }
    }
    res[(uint)(width[0] *py  + px)] = iteration;
}
'

res_dev = gpuEmptMatrix(height, width, type='int')

microbenchmark(
  {
    .kernel(src = code_float, 
            kernel='mandelbrotOpenCL',
            parms=list(res_dev, width, height, fromX, toX, fromY, toY),
            .globalThreadNum = width*height)
    res_dev <- download(res_dev)
    m <- matrix(res_dev, nrow=width)
  }, unit="s")
# 0.027s on NVIDIA RTX A2000 Laptop GPU
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T)) 

# OpenCL Mandelbrot code using double #### 
# similar speed than Rcpp OpenMP non-SIMD version, 400x faster than pure R version
code_double='
kernel void mandelbrotOpenCL(global int* res, 
          global double* width, global double* height, 
          global double* fromX, global double* toX, 
          global double* fromY, global double* toY) {
    int id = get_global_id(0);
    int px = id % ((int) width[0]);
    int py = id / ((int) width[0]);
    int iteration;
    double x0 = fromX[0] + ((double)px) * (toX[0] - fromX[0]) / width[0];
    double y0 = fromY[0] + ((double)py) * (toY[0] - fromY[0]) / height[0];
    double x = 0;
    double y = 0;
    for (iteration = 0; iteration < 10000; iteration++) {
      double xn = x * x - y * y + x0;
      y = 2 * x * y + y0;
      x = xn;
      if (x * x + y * y > 4.0) {
        break;
      }
    }
    res[(uint)(width[0] *py  + px)] = iteration;
}
'

res_dev = gpuEmptMatrix(height, width, type='int')

microbenchmark(
  {
    .kernel(src = code_double, 
            kernel='mandelbrotOpenCL',
            parms=list(res_dev, width, height, fromX, toX, fromY, toY),
            .globalThreadNum = width*height)
    res_dev <- download(res_dev)
    m <- matrix(res_dev, nrow=width)
  }, unit="s")
# 0.30s on NVIDIA RTX A2000 Laptop GPU
system.time(image(m^(1/3), col=cols, asp=(toY-fromY)/(toX-fromX), axes=F, useRaster=T))