IntelLabs / ParallelAccelerator.jl

The ParallelAccelerator package, part of the High Performance Scripting project at Intel Labs
BSD 2-Clause "Simplified" License
294 stars 32 forks source link

Possible performance regression in blackscholes #91

Closed ranjanan closed 8 years ago

ranjanan commented 8 years ago

Hello, I'm trying to compare the performance of the ParallelAccelerator (`@acc`) version of Black-Scholes against the plain serial version. Here's my code:

using ParallelAccelerator

@acc begin

# Element-wise cumulative normal distribution of `in`, evaluated as
# 0.5 + 0.5 * erf(x * 0.707106781), where 0.707106781 approximates 1/sqrt(2).
# This copy lives inside the surrounding `@acc` block; the arithmetic matches
# the serial `cndf2` defined later in the file.
@inline function cndf2_acc(in::AbstractArray{Float32})
    scaled = 0.707106781 .* in
    return 0.5 .+ 0.5 .* erf(scaled)
end

# Black-Scholes kernel (version placed inside the `@acc` block).
#
# All five arguments are Float32 arrays that are combined element-wise:
#   sptprice   - spot price
#   strike     - strike price
#   rate       - interest rate
#   volatility - volatility
#   time       - time to expiry
#
# Returns the element-wise put value, computed from the call value as
# `call - futureValue + sptprice`.
function blackscholes_acc(sptprice::AbstractArray{Float32},
                           strike::AbstractArray{Float32},
                           rate::AbstractArray{Float32},
                           volatility::AbstractArray{Float32},
                           time::AbstractArray{Float32})
    # NOTE(review): the textbook formula uses the natural log here; log10 is
    # kept unchanged so results stay identical to the serial twin - confirm intent.
    log_ratio = log10(sptprice ./ strike)
    half_variance = .5 .* volatility .* volatility
    vol_sqrt_t = volatility .* sqrt(time)

    d1 = (((rate .+ half_variance) .* time) .+ log_ratio) ./ vol_sqrt_t
    d2 = d1 .- vol_sqrt_t

    cdf_d1 = cndf2_acc(d1)
    cdf_d2 = cndf2_acc(d2)

    # Strike scaled by exp(-rate * time).
    discounted_strike = strike .* exp(- rate .* time)
    discounted_term = discounted_strike .* cdf_d2
    call_value = sptprice .* cdf_d1 .- discounted_term

    return call_value .- discounted_strike .+ sptprice
end

end
# Serial cumulative normal distribution, applied element-wise to `in`:
# 0.5 + 0.5 * erf(x * 0.707106781), with 0.707106781 approximating 1/sqrt(2).
@inline function cndf2(in::AbstractArray{Float32})
    return 0.5 .+ 0.5 .* erf(0.707106781 .* in)
end

# Serial Black-Scholes kernel (defined outside the `@acc` block).
#
# All five arguments are Float32 arrays that are combined element-wise:
#   sptprice   - spot price
#   strike     - strike price
#   rate       - interest rate
#   volatility - volatility
#   time       - time to expiry
#
# Returns the element-wise put value, computed from the call value as
# `call - futureValue + sptprice`.
function blackscholes(sptprice::AbstractArray{Float32},
                           strike::AbstractArray{Float32},
                           rate::AbstractArray{Float32},
                           volatility::AbstractArray{Float32},
                           time::AbstractArray{Float32})
    # NOTE(review): the textbook formula uses the natural log here; log10 is
    # kept unchanged so results stay identical to the @acc twin - confirm intent.
    log_ratio = log10(sptprice ./ strike)
    half_variance = .5 .* volatility .* volatility
    vol_sqrt_t = volatility .* sqrt(time)

    d1 = (((rate .+ half_variance) .* time) .+ log_ratio) ./ vol_sqrt_t
    d2 = d1 .- vol_sqrt_t

    cdf_d1 = cndf2(d1)
    cdf_d2 = cndf2(d2)

    # Strike scaled by exp(-rate * time).
    discounted_strike = strike .* exp(- rate .* time)
    discounted_term = discounted_strike .* cdf_d2
    call_value = sptprice .* cdf_d1 .- discounted_term

    return call_value .- discounted_strike .+ sptprice
end

# Build `iterations`-element Float32 inputs, time a single call of
# `blackscholes_acc` and a single call of `blackscholes` on them, print the
# sum of each result as a checksum, and return the two elapsed times in
# seconds as `(acc_time, serial_time)`.
function run(iterations)
    # Constant inputs, except the strike which ramps from just above 40 to 41.
    sptprice   = Float32[ 42.0 for k in 1:iterations ]
    initStrike = Float32[ 40.0 + (k / iterations) for k in 1:iterations ]
    rate       = Float32[ 0.5 for k in 1:iterations ]
    volatility = Float32[ 0.2 for k in 1:iterations ]
    time       = Float32[ 0.5 for k in 1:iterations ]

    # Timed @acc run.
    tic()
    put_acc = blackscholes_acc(sptprice, initStrike, rate, volatility, time)
    t_acc = toq()

    # Timed serial run on the same inputs.
    tic()
    put_serial = blackscholes(sptprice, initStrike, rate, volatility, time)
    t_serial = toq()

    println("checksum (acc): ", sum(put_acc))
    println("checksum (serial) : ", sum(put_serial))
    return t_acc, t_serial
end

# Benchmark driver: warm up both kernels, run the timed comparison on 10^7
# options, and print elapsed times and throughput for each version.
function main()
    # Seeds the global RNG; no random numbers are drawn in the code shown here.
    srand(0)

    # Call both kernels once on empty Float32 arrays and report how long that
    # took - presumably so first-call (compilation) cost stays out of the
    # timed runs below.
    tic()
    blackscholes_acc(Float32[], Float32[], Float32[], Float32[], Float32[])
    blackscholes(Float32[], Float32[], Float32[], Float32[], Float32[])
    warmup_seconds = toq()
    println("SELFPRIMED ", warmup_seconds)

    iterations = 10^7

    t1, t2 = run(iterations)
    println("Time acc = $t1")
    println("Time serial = $t2")

    # Throughput in options priced per second.
    rate_acc = iterations / t1
    rate_serial = iterations / t2
    println("rate acc = ", rate_acc, " opts/sec")
    println("rate serial = ", rate_serial, " opts/sec")
end

# Script entry point: run the benchmark when this file is executed.
main()

Here's the output I get:

SELFPRIMED 14.366788608
Number of threads = 4
checksum (acc): 2.0954822e8
checksum (serial) : 2.0954822e8
Time acc = 2.052863587
Time serial = 2.177722479
rate acc = 4.871244277177585e6 opts/sec
rate serial = 4.591953334931802e6 opts/sec

As you can see, even with 4 threads there is barely any speedup over the serial version. Here's my `versioninfo()`:

Julia Version 0.5.0-dev+3834
Commit b3be873* (2016-05-02 05:33 UTC)
Platform Info:
  System: Darwin (x86_64-apple-darwin15.4.0)
  CPU: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.7.1 (ORCJIT, broadwell)

lkuper commented 8 years ago

Unfortunately, ParallelAccelerator doesn't support Julia 0.5 yet, but we're working on it!

ViralBShah commented 8 years ago

Perhaps leave the issue open and close when 0.5 is supported? It is likely more people will report the same issue.

lkuper commented 8 years ago

We have #53 open already for 0.5 compatibility.