xnd-project / libgumath

Subsumed into xnd
https://xnd.io/
BSD 3-Clause "New" or "Revised" License

Allow numba to call gumath functions #5

Closed saulshanabrook closed 6 years ago

saulshanabrook commented 6 years ago

We should add support for numba to be able to call gumath kernels. The numba code should be able to call the C functions for the kernels directly.

from numba import jit
import gumath as gm
import numpy as np

@jit(nopython=True)
def test_sin(x):
    return gm.sin(gm.sin(x))

test_sin([float(i) for i in range(2000)])
test_sin(np.array([float(i) for i in range(2000)]))

Also, we should be able to have vectorize compile to a gumath kernel instead of a numpy one, so that the function can be executed from C without a Python layer, and so it can be jitted without Python involved:

@vectorize(gumath=True)
def f(x, y):
    return x * y

@jit(nopython=True)
def test_vectorized(x):
    return f(x, x)

x = np.array([float(i) for i in range(2000)])

test_vectorized(x)
saulshanabrook commented 6 years ago

@sklam does this cover the use case you wanted?

saulshanabrook commented 6 years ago

So I think we can wrap gumath functions with numba's intrinsic mechanism to provide LLVM code that calls the gumath C code.
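
A minimal sketch of that idea, assuming the kernel is exposed under a plain C symbol (the symbol name gm_scalar_sin and the wiring here are hypothetical, and the kernel's address would still have to be registered with the JIT, e.g. via llvmlite.binding.add_symbol):

    from numba import njit, types
    from numba.extending import intrinsic
    from llvmlite import ir

    @intrinsic
    def gumath_sin(typingctx, x):
        if not isinstance(x, types.Float):
            return None
        sig = types.float64(types.float64)

        def codegen(context, builder, signature, args):
            fnty = ir.FunctionType(ir.DoubleType(), [ir.DoubleType()])
            # Declare the external symbol once per module, then call it directly.
            fn = builder.module.globals.get("gm_scalar_sin")
            if fn is None:
                fn = ir.Function(builder.module, fnty, name="gm_scalar_sin")
            return builder.call(fn, args)

        return sig, codegen

    @njit
    def test(x):
        return gumath_sin(x)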

saulshanabrook commented 6 years ago

I am looking into writing an xnd-compatible version of vectorize. My understanding is that vectorize first compiles the function, then creates some looping mechanism around it so that it can be called as a ufunc.

Hopefully we can reuse numba's existing jit to create an LLVM function, and then write some other looping code that traverses xnd data structures.

I figured out the LLVM signature for the function it creates in nopython mode. I added a print statement here (https://github.com/saulshanabrook/numba/blob/65f079d1762bbe33ddfb4e35c00aa355933d5be5/numba/npyufunc/wrappers.py#L155) to see the types of some calls:

vectorize(nopython=True)(lambda a: a * a)(np.arange(10))
i32 (i64*, {i8*, i32}**, i8*, i64)
Out[13]: array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])
vectorize(nopython=True)(lambda a: a * a)(np.arange(10).astype(float))
i32 (double*, {i8*, i32}**, i8*, double)
Out[14]: array([ 0.,  1.,  4.,  9., 16., 25., 36., 49., 64., 81.])
vectorize(nopython=True)(lambda a: a * a)(np.arange(10).astype(bool))
i32 (i64*, {i8*, i32}**, i8*, i8)
Out[15]: array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
vectorize(nopython=True)(lambda a, b: a * b)(np.arange(10), np.arange(10))
i32 (i64*, {i8*, i32}**, i8*, i64, i64)
Out[16]: array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

So the return value is likely success or failure; then it takes in a pointer to where it should save the actual result, a couple of structures, and then the list of arguments. We can see this in a different form in the code where it creates the arguments for LLVM (src: https://github.com/saulshanabrook/numba/blob/5a9c586ec02680ed30f75e6965bb703be576198b/numba/targets/callconv.py#L463-L466):

        realargs = [retvaltmp, excinfoptr, env] + args
        code = builder.call(callee, realargs)
        status = self._get_return_status(builder, code,
                                         builder.load(excinfoptr))
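
My reading of the float64 signature above, lined up with that realargs list:

    # i32            -- status code returned to the caller
    # double*        -- retvaltmp: where the actual result is stored
    # {i8*, i32}**   -- excinfoptr: out-pointer for exception info
    # i8*            -- env: environment pointer
    # double, ...    -- the scalar arguments themselves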

So I think the next step is to write a basic xnd version of vectorize that takes in xnd values and applies the jitted function to the values inside.

The design issue here that I am unsure about is how we do the broadcasting/looping through xnd data structures. Is there code in gumath to do that already? Or if there isn't code, what are the rules about that? Obviously if we have two types with the same shape, it's intuitive how that should work.

I guess follow the numpy rules? https://docs.scipy.org/doc/numpy-1.14.0/user/basics.broadcasting.html#general-broadcasting-rules

https://arxiv.org/abs/1102.1523
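
For reference, that rule boils down to something like this (a plain-Python sketch, not gumath code):

    from itertools import zip_longest

    def broadcast_shape(a, b):
        # Align shapes from the right; each dimension pair must be equal,
        # or one of them must be 1 (which stretches to match the other).
        out = []
        for x, y in zip_longest(reversed(a), reversed(b), fillvalue=1):
            if x != 1 and y != 1 and x != y:
                raise ValueError("shapes %r and %r are not broadcastable" % (a, b))
            out.append(max(x, y))
        return tuple(reversed(out))

    assert broadcast_shape((8, 1, 6, 1), (7, 1, 5)) == (8, 7, 6, 5)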

saulshanabrook commented 6 years ago

I made some progress on creating a kernel in LLVM and then calling it with gumath. This fails with a SIGSEGV however, so I must be doing something wrong. I am trying to mirror the gm_##func##_strided_##srctype##_##desttype function, which takes the NumPy-ufunc-style arguments (args, dimensions, steps, data).

from __future__ import print_function

import llvmlite.binding as llvm
from llvmlite import ir
from llvmlite.ir import PointerType as ptr

import gumath as gm
from xnd import xnd

# All these initializations are required for code generation!
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()  # yes, even this one

def create_execution_engine():
    """
    Create an ExecutionEngine suitable for JIT code generation on
    the host CPU.  The engine is reusable for an arbitrary number of
    modules.
    """
    # Create a target machine representing the host
    target = llvm.Target.from_default_triple()
    target_machine = target.create_target_machine()
    # And an execution engine with an empty backing module
    backing_mod = llvm.parse_assembly("")
    engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
    return engine

def compile_ir(engine, llvm_ir):
    """
    Compile the LLVM IR string with the given engine.
    The compiled module object is returned.
    """
    # Create a LLVM module object from the IR
    mod = llvm.parse_assembly(llvm_ir)
    mod.verify()
    # Now add the module and make sure it is ready for execution
    engine.add_module(mod)
    engine.finalize_object()
    engine.run_static_constructors()
    return mod

module = ir.Module(name='NA')  # __file__

def fill_module():

    intptr_t = ir.IntType(64)
    int_ = ir.IntType(4)
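    # NOTE: IntType takes a width in bits, not bytes, so this should be 64;
    # see the discussion below (it is corrected in a later comment).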
    char_ = ir.IntType(8)
    index = lambda i: ir.Constant(ir.IntType(32), i)

    srctype = ir.DoubleType()
    desttype = ir.DoubleType()

    numba_gm_sin2 = ir.Function(module, ir.FunctionType(
        int_,
        (
            ptr(ptr(char_)),
            ptr(intptr_t),
            ptr(intptr_t),
            intptr_t,
        )
    ), name="numba_gm_sin2")
    args, dimensions, steps, data = numba_gm_sin2.args

    func = module.declare_intrinsic('llvm.sin', [ir.DoubleType()])

    block = numba_gm_sin2.append_basic_block(name="entry")
    builder = ir.IRBuilder(block)

    src = builder.alloca(intptr_t, name='src')
    builder.store(
        builder.ptrtoint(
            builder.load(builder.gep(args, [index(0)])),
            intptr_t
        ),
        src
    )

    dest = builder.alloca(intptr_t, name='dest')
    builder.store(
        builder.ptrtoint(
            builder.load(builder.gep(args, [index(1)])),
            intptr_t
        ),
        dest
    )

    src_step = builder.alloca(intptr_t, name='src_step')
    builder.store(builder.load(builder.gep(steps, [index(0)])), src_step)

    dest_step = builder.alloca(intptr_t, name='dest_step')
    builder.store(builder.load(builder.gep(steps, [index(1)])), dest_step)

    n = builder.alloca(intptr_t, name='n')
    builder.store(builder.load(builder.gep(dimensions, [index(0)])), n)

    i = builder.alloca(intptr_t, name='i')
    builder.store(ir.Constant(intptr_t, 0), i)

    loop_check_block = numba_gm_sin2.append_basic_block("loop_check")

    loop_body_block = numba_gm_sin2.append_basic_block("loop_body")
    loop_body = ir.IRBuilder(loop_body_block)

    v = loop_body.load(
        loop_body.inttoptr(
            loop_body.load(src),
            ptr(srctype)
        ),
        name='v'
    )

    loop_body.store(
        loop_body.call(func, [v]),
        loop_body.inttoptr(loop_body.load(dest), ptr(desttype))
    )
    loop_body.store(
        loop_body.add(
            loop_body.load(src),
            loop_body.load(src_step)
        ),
        src
    )
    loop_body.store(
        loop_body.add(
            loop_body.load(dest),
            loop_body.load(dest_step)
        ),
        dest
    )
    loop_body.branch(loop_check_block)

    loop_end_block = numba_gm_sin2.append_basic_block("loop_end")
    loop_end = ir.IRBuilder(loop_end_block)
    loop_end.ret(ir.Constant(int_, 0))

    loop_check = ir.IRBuilder(loop_check_block)
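    # NOTE: as written, `i` is never incremented in the loop body, and
    # icmp_signed is handed the alloca pointers `i` and `n` rather than
    # their loaded values -- plausible contributors to the SIGSEGV above.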
    loop_check.cbranch(
        loop_check.icmp_signed('<', i, n),
        loop_body_block,
        loop_end_block
    )
    builder.branch(loop_check_block)

fill_module()

print(module)

engine = create_execution_engine()
mod = compile_ir(engine, str(module))

func_ptr = engine.get_function_address("numba_gm_sin2")

gm.sin2 = gm.unsafe_add_numpy_kernel(name="sin2", sig="... * float64 -> ... * float64", ptr=func_ptr)

x = xnd([-1.0, -2e122, 3.0, 0.3])
y = gm.sin2(x)
print(x, y)
teoliphant commented 6 years ago
intptr_t = ir.IntType(64)
int_ = ir.IntType(4)
char_ = ir.IntType(8)

Is the number inside IntType bits or bytes?

I think it's bits, so the second one should be 64 as well.

Of course, this should come from the platform, so that it is 32-bit or 64-bit as appropriate.
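
For example, one way to derive the width from the host platform instead of hard-coding it (a sketch):

    import ctypes
    from llvmlite import ir

    # ctypes.sizeof gives bytes; IntType wants bits.
    intptr_t = ir.IntType(8 * ctypes.sizeof(ctypes.c_void_p))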

teoliphant commented 6 years ago

I think it would be best if xnd allowed for a single-element or 0-d kernel and then you would simply create a call-wrapper around your jitted function.

Otherwise, you would create a strided loop around the jitted function.


saulshanabrook commented 6 years ago

Is the number inside IntType bits or bytes?

Bits, yeah, you are right. I changed it to 64.

I think it would be best if xnd allowed for a single-element or 0-d kernel and then you would simply create a call-wrapper around your jitted function.

I agree. My intent here is to manually wrap sin, and then once that is working swap in a jitted function there.
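
For the swap-in step, numba can already hand back the LLVM IR of a jitted scalar function, which could then be spliced into a hand-written kernel module like the one above (a sketch; inspect_llvm returns the IR text per compiled signature):

    import math
    from numba import njit, float64

    @njit(float64(float64))
    def scalar_sin(x):
        return math.sin(x)

    # inspect_llvm() maps each compiled signature to its LLVM IR text.
    for sig, ir_text in scalar_sin.inspect_llvm().items():
        print(sig)
        print(ir_text[:200])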

saulshanabrook commented 6 years ago

I successfully generated a custom 0-d kernel with LLVM, thanks to the example @skrah added.

import llvmlite.binding as llvm
from llvmlite import ir
from llvmlite.ir import PointerType as ptr, LiteralStructType as struct
from toolz.functoolz import thread_first as tf

import gumath as gm
from xnd import xnd

# All these initializations are required for code generation!
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()  # yes, even this one

def create_execution_engine():
    """
    Create an ExecutionEngine suitable for JIT code generation on
    the host CPU.  The engine is reusable for an arbitrary number of
    modules.
    """
    # Create a target machine representing the host
    target = llvm.Target.from_default_triple()
    target_machine = target.create_target_machine()
    # And an execution engine with an empty backing module
    backing_mod = llvm.parse_assembly("")
    engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
    return engine

def add_module(engine, llvm_ir):
    mod = llvm.parse_assembly(llvm_ir)
    mod.verify()
    # Now add the module and make sure it is ready for execution
    engine.add_module(mod)

def create_module():
    module = ir.Module(context=ir.context.Context())
    i8, i16, i32, i64 = map(ir.IntType, [8, 16, 32, 64])
    index = lambda i: ir.Constant(i32, i)

    struct = ir.LiteralStructType
    func = module.declare_intrinsic('llvm.sin', [ir.DoubleType()])
    context = module.context

    ndt_slice_t = context.get_identified_type("ndt_slice_t")
    ndt_slice_t.set_body(i64, i64, i64)

    ndt_t = context.get_identified_type("_ndt")
    ndt_t.set_body(
        i32, i32, i32, i32, i64, i16,
        struct([struct([i64, i64, i64, ptr(ptr(ndt_t))])]),
        struct([struct([struct([i32, i64, i32, ptr(i32), i32, ptr(ndt_slice_t)])])]),
        ir.ArrayType(i8, 16)
    )

    xnd_bitmap_t = context.get_identified_type("xnd_bitmap")
    xnd_bitmap_t.set_body(
        ptr(i8),
        i64,
        ptr(xnd_bitmap_t)
    )

    xnd_t = context.get_identified_type("xnd")
    xnd_t.set_body(
        xnd_bitmap_t,
        i64,
        ptr(ndt_t),
        ptr(i8)
    )

    ndt_context_t = context.get_identified_type("_ndt_context_t")
    ndt_context_t.set_body(
        i32, i32, i32,
        struct([ptr(i8)])
    )

    srctype = ir.DoubleType()
    desttype = ir.DoubleType()

    numba_gm_sin = ir.Function(module, ir.FunctionType(
        i32,
        (
            ptr(xnd_t),
            ptr(ndt_context_t),
        )
    ), name="numba_gm_sin")
    stack, context = numba_gm_sin.args

    block = numba_gm_sin.append_basic_block(name="entry")
    builder = ir.IRBuilder(block)

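    # toolz's thread_first pipes its first argument through each step
    # (tuples supply extra positional arguments), so `in_` below is:
    #   builder.load(builder.load(builder.bitcast(
    #       builder.gep(stack, [index(0), index(3)], True),
    #       ptr(ptr(srctype)))), 'in')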
    in_ = tf(
        stack,
        (builder.gep, [index(0), index(3)], True),
        (builder.bitcast, ptr(ptr(srctype))),
        builder.load,
        (builder.load, 'in')
    )

    res = builder.call(func, [in_], name='res')

    out = tf(
        stack,
        (builder.gep, [index(1), index(3)], True),
        (builder.bitcast, ptr(ptr(desttype))),
        builder.load
    )
    builder.store(res, out)
    builder.ret(ir.Constant(i32, 0))

    return module

module = create_module()

print(module)

engine = create_execution_engine()
add_module(engine, str(module))

engine.finalize_object()
engine.run_static_constructors()

func_ptr = engine.get_function_address("numba_gm_sin")

gm.numba_sin = gm.unsafe_add_xnd_kernel(name="double", sig="... * float64 -> ... * float64", ptr=func_ptr)

x = xnd([-1.0, -2e122, 3.0, 0.3])
print(x, gm.sin(x), gm.numba_sin(x), gm.xnd_sin0d(x))

This outputs:

; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""
%"ndt_slice_t" = type {i64, i64, i64}
%"_ndt" = type {i32, i32, i32, i32, i64, i16, {{i64, i64, i64, %"_ndt"**}}, {{{i32, i64, i32, i32*, i32, %"ndt_slice_t"*}}}, [16 x i8]}
%"xnd_bitmap" = type {i8*, i64, %"xnd_bitmap"*}
%"xnd" = type {%"xnd_bitmap", i64, %"_ndt"*, i8*}
%"_ndt_context_t" = type {i32, i32, i32, {i8*}}
declare double @"llvm.sin.f64"(double %".1") 
define i32 @"numba_gm_sin"(%"xnd"* %".1", %"_ndt_context_t"* %".2") 
{
entry:
  %".4" = getelementptr inbounds %"xnd", %"xnd"* %".1", i32 0, i32 3
  %".5" = bitcast i8** %".4" to double**
  %".6" = load double*, double** %".5"
  %"in" = load double, double* %".6"
  %"res" = call double @"llvm.sin.f64"(double %"in")
  %".7" = getelementptr inbounds %"xnd", %"xnd"* %".1", i32 1, i32 3
  %".8" = bitcast i8** %".7" to double**
  %".9" = load double*, double** %".8"
  store double %"res", double* %".9"
  ret i32 0
}
xnd([-1.0, -2e+122, 3.0, 0.3], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64')

I based it on the optimized LLVM IR that clang produces when it compiles the C code:


    %union.anon = type { %struct.anon.0 }
    %struct.anon.0 = type { i64, i64, i64, %struct._ndt** }
    %struct.anon.18 = type { %union.anon.19 }
    %union.anon.19 = type { %struct.anon.21 }
    %struct.anon.21 = type { i32, i64, i32, i32*, i32, %struct.ndt_slice_t* }
    %struct.ndt_slice_t = type { i64, i64, i64 }
    %struct.ndt_constraint_t = type { i32 (i64*, i8*, %struct._ndt_context*)*, i32, i32, [16 x i8*] }
    %struct._ndt_context = type { i32, i32, i32, %union.anon.10 }
    %union.anon.10 = type { i8* }
    %struct.xnd = type { %struct.xnd_bitmap, i64, %struct._ndt*, i8* }
    %struct.xnd_bitmap = type { i8*, i64, %struct.xnd_bitmap* }
    %struct.gm_kernel_init_t = type { i8*, i8*, %struct.ndt_constraint_t*, i8, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (i8**, i64*, i64*, i8*)* }
    %struct._gm_tbl = type opaque
    %struct.ndt_methods_t = type { i1 (i8*, i8*, %struct._ndt_context*)*, i1 (i8*, %struct._ndt_context*)*, i8* (i8*, %struct._ndt_context*)* }

    define i32 @gm_0D_sin_d_d(%struct.xnd* nocapture readonly, %struct._ndt_context* nocapture readnone) #0 {
    %3 = getelementptr inbounds %struct.xnd, %struct.xnd* %0, i64 0, i32 3
    %4 = bitcast i8** %3 to double**
    %5 = load double*, double** %4, align 8, !tbaa !3
    %6 = load double, double* %5, align 8, !tbaa !10
    %7 = tail call double @llvm.sin.f64(double %6)
    %8 = getelementptr inbounds %struct.xnd, %struct.xnd* %0, i64 1, i32 3
    %9 = bitcast i8** %8 to double**
    %10 = load double*, double** %9, align 8, !tbaa !3
    store double %7, double* %10, align 8, !tbaa !10
    ret i32 0
    }
saulshanabrook commented 6 years ago

Now I want to create that on the fly from the result of a jitted function, which can then be used for a vectorize decorator.
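
Roughly, I imagine a decorator tying together the pieces above like this (only a sketch; build_kernel_ir and compile_and_get_pointer are hypothetical helpers standing in for the IR generation and MCJIT steps shown earlier):

    from numba import njit
    import gumath as gm

    def xnd_vectorize(sig):
        def decorator(fn):
            jitted = njit(fn)                       # compile the scalar function
            llvm_ir = build_kernel_ir(jitted)       # hypothetical: wrap as a 0-d xnd kernel
            ptr = compile_and_get_pointer(llvm_ir)  # hypothetical: MCJIT compile, get address
            return gm.unsafe_add_xnd_kernel(name=fn.__name__, sig=sig, ptr=ptr)
        return decorator

    # Hypothetical usage:
    # @xnd_vectorize("... * float64 -> ... * float64")
    # def f(x):
    #     return x * x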

saulshanabrook commented 6 years ago

I also had to add an unsafe_add_xnd_kernel Python method to gumath, which is the same as unsafe_add_numpy_kernel but sets the function pointer on Xnd instead of Vectorize and sets vectorize to false:

static PyObject *
unsafe_add_xnd_kernel(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwds)
{
    NDT_STATIC_CONTEXT(ctx);
    static char *kwlist[] = {"name", "sig", "ptr", NULL};
    gm_kernel_init_t k = {NULL};
    gm_func_t *f;
    char *name;
    char *sig;
    PyObject *ptr;
    void *p;

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "ssO", kwlist, &name, &sig,
        &ptr)) {
        return NULL;
    }

    p = PyLong_AsVoidPtr(ptr);
    if (p == NULL) {
        return NULL;
    }

    k.name = name;
    k.sig = sig;
    k.vectorize = false;
    k.Xnd = p;

    if (gm_add_kernel(table, &k, &ctx) < 0) {
        return seterr(&ctx);
    }

    f = gm_tbl_find(table, name, &ctx);
    if (f == NULL) {
        return seterr(&ctx);
    }

    return gufunc_new(table, f->name);
}

static PyMethodDef gumath_methods [] =
{
  /* Methods */
  { "unsafe_add_numpy_kernel", (PyCFunction)unsafe_add_numpy_kernel, METH_VARARGS|METH_KEYWORDS, NULL },
  { "unsafe_add_xnd_kernel", (PyCFunction)unsafe_add_xnd_kernel, METH_VARARGS|METH_KEYWORDS, NULL },
  { NULL, NULL, 1 }
};
saulshanabrook commented 6 years ago

I successfully implemented this with manually specified type signatures: #12

saulshanabrook commented 6 years ago

Closing this to move over to numba.