Closed saulshanabrook closed 6 years ago
@sklam does this cover the use case you wanted?
So I think we can wrap gumath functions with intrinsic
to provide LLVM code that calls the gumath c code.
I am looking into writing a xnd compatible version of vectorize. My understanding is that vectorize first compiles the function, then creates some looping mechanisms around it so that it can be called as a ufunc.
Hopefully we can re-use numbas existing jit to create a llvm function, and then write some other looping code that traverses xnd data structures.
I figured out the LLVM signature for the function it creates in noptyhon mode. I added a print statement here to see the types of some calls:
vectorize(nopython=True)(lambda a: a * a)(np.arange(10))
i32 (i64*, {i8*, i32}**, i8*, i64)
Out[13]: array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81])
vectorize(nopython=True)(lambda a: a * a)(np.arange(10).astype(float))
i32 (double*, {i8*, i32}**, i8*, double)
Out[14]: array([ 0., 1., 4., 9., 16., 25., 36., 49., 64., 81.])
vectorize(nopython=True)(lambda a: a * a)(np.arange(10).astype(bool))
i32 (i64*, {i8*, i32}**, i8*, i8)
Out[15]: array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
vectorize(nopython=True)(lambda a, b: a * b)(np.arange(10), np.arange(10))
i32 (i64*, {i8*, i32}**, i8*, i64, i64)
Out[16]: array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81])
So the return value is likely success or failure, than it takes in a pointer to where it should save the actual result, a couple of structures, then the list of arguments. We can see this in a different form in the code where it creates the arguments for llvm (src):
realargs = [retvaltmp, excinfoptr, env] + args
code = builder.call(callee, realargs)
status = self._get_return_status(builder, code,
builder.load(excinfoptr))
So I think the next step is to write a basic xnd version of vectorize that takes in xnd types and performs the jitted function on them values inside.
The design issue here that I am unsure about is how we do the broadcasting/looping through xnd data structures. Is there code in gumath to do that already? Or if there isn't code, what are the rules about that? Obviously if we have two types with the same shape, it's intuitive how that should work.
I guess follow the numpy rules? https://docs.scipy.org/doc/numpy-1.14.0/user/basics.broadcasting.html#general-broadcasting-rules
I made some progress on creating a kernel in LLVM then calling it with gumath. This fails with a SIGSEGV
however, so I must be doing something wrong. I am trying to mirror the gm_##func##_strided_##srctype##_##desttype
function.
from __future__ import print_function
import llvmlite.binding as llvm
from llvmlite import ir
from llvmlite.ir import PointerType as ptr
import gumath as gm
from xnd import xnd
# All these initializations are required for code generation!
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter() # yes, even this one
def create_execution_engine():
"""
Create an ExecutionEngine suitable for JIT code generation on
the host CPU. The engine is reusable for an arbitrary number of
modules.
"""
# Create a target machine representing the host
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
# And an execution engine with an empty backing module
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
return engine
def compile_ir(engine, llvm_ir):
"""
Compile the LLVM IR string with the given engine.
The compiled module object is returned.
"""
# Create a LLVM module object from the IR
mod = llvm.parse_assembly(llvm_ir)
mod.verify()
# Now add the module and make sure it is ready for execution
engine.add_module(mod)
engine.finalize_object()
engine.run_static_constructors()
return mod
module = ir.Module(name='NA') #__file__)
def fill_module():
intptr_t = ir.IntType(64)
int_ = ir.IntType(4)
char_ = ir.IntType(8)
index = lambda i: ir.Constant(ir.IntType(32), i)
srctype = ir.DoubleType()
desttype = ir.DoubleType()
numba_gm_sin2 = ir.Function(module, ir.FunctionType(
int_,
(
ptr(ptr(char_)),
ptr(intptr_t),
ptr(intptr_t),
intptr_t,
)
), name="numba_gm_sin2")
args, dimensions, steps, data = numba_gm_sin2.args
func = module.declare_intrinsic('llvm.sin', [ir.DoubleType()])
block = numba_gm_sin2.append_basic_block(name="entry")
builder = ir.IRBuilder(block)
src = builder.alloca(intptr_t, name='src')
builder.store(
builder.ptrtoint(
builder.load(builder.gep(args, [index(0)])),
intptr_t
),
src
)
dest = builder.alloca(intptr_t, name='dest')
builder.store(
builder.ptrtoint(
builder.load(builder.gep(args, [index(1)])),
intptr_t
),
dest
)
src_step = builder.alloca(intptr_t, name='src_step')
builder.store(builder.load(builder.gep(steps, [index(0)])), src_step)
dest_step = builder.alloca(intptr_t, name='dest_step')
builder.store(builder.load(builder.gep(steps, [index(1)])), dest_step)
n = builder.alloca(intptr_t, name='n')
builder.store(builder.load(builder.gep(dimensions, [index(0)])), n)
i = builder.alloca(intptr_t, name='i')
builder.store(ir.Constant(intptr_t, 0), i)
loop_check_block = numba_gm_sin2.append_basic_block("loop_check")
loop_body_block = numba_gm_sin2.append_basic_block("loop_body")
loop_body = ir.IRBuilder(loop_body_block)
v = loop_body.load(
loop_body.inttoptr(
loop_body.load(src),
ptr(srctype)
),
name='v'
)
loop_body.store(
loop_body.call(func, [v]),
loop_body.inttoptr(loop_body.load(dest), ptr(desttype))
)
loop_body.store(
loop_body.add(
loop_body.load(src),
loop_body.load(src_step)
),
src
)
loop_body.store(
loop_body.add(
loop_body.load(dest),
loop_body.load(dest_step)
),
dest
)
loop_body.branch(loop_check_block)
loop_end_block = numba_gm_sin2.append_basic_block("loop_end")
loop_end = ir.IRBuilder(loop_end_block)
loop_end.ret(ir.Constant(int_, 0))
loop_check = ir.IRBuilder(loop_check_block)
loop_check.cbranch(
loop_check.icmp_signed('<', i, n),
loop_body_block,
loop_end_block
)
builder.branch(loop_check_block)
fill_module()
print(module)
engine = create_execution_engine()
mod = compile_ir(engine, str(module))
func_ptr = engine.get_function_address("numba_gm_sin2")
gm.sin2 = gm.unsafe_add_numpy_kernel(name="sin2", sig="... * float64 -> ... * float64", ptr=func_ptr)
x = xnd([-1.0, -2e122, 3.0, 0.3])
y = gm.sin2(x)
print(x, y)
intptr_t = ir.IntType(64)
int_ = ir.IntType(4)
char_ = ir.IntType(8)
Is the number inside IntType bits or bytes.
I think it's bits so the second one should be 64 as well.
Of course this should come from the platform so that you are 32-bit or 64-bit depending.
I think it would be best if xnd allowed for a single-element or 0-d kernel and then you would simply create a call-wrapper around your jitted function.
Otherwise, you would create a strided loop around the jitted function.
Travis Oliphant Quansight
On Wed, Apr 25, 2018 at 3:15 PM, Saul Shanabrook notifications@github.com wrote:
I am looking into writing a xnd compatible version of vectorize. My understanding is that vectorize first compiles the function, then creates some looping mechanisms around it so that it can be called as a ufunc.
Hopefully we can re-use numbas existing jit to create a llvm function, and then write some other looping code that traverses xnd data structures.
I figured out the LLVM signature for the function it creates in noptyhon mode. I added a print statement here https://github.com/saulshanabrook/numba/blob/65f079d1762bbe33ddfb4e35c00aa355933d5be5/numba/npyufunc/wrappers.py#L155 to see the types of some calls:
vectorize(nopython=True)(lambda a: a a)(np.arange(10)) i32 (i64, {i8*, i32}, i8, i64) Out[13]: array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) vectorize(nopython=True)(lambda a: a a)(np.arange(10).astype(float)) i32 (double, {i8, i32}, i8, double) Out[14]: array([ 0., 1., 4., 9., 16., 25., 36., 49., 64., 81.]) vectorize(nopython=True)(lambda a: a a)(np.arange(10).astype(bool)) i32 (i64, {i8, i32}, i8, i8) Out[15]: array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) vectorize(nopython=True)(lambda a, b: a b)(np.arange(10), np.arange(10)) i32 (i64, {i8, i32}, i8*, i64, i64) Out[16]: array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81])
So the return value is likely success or failure, than it takes in a pointer to where it should save the actual result, a couple of structures, then the list of arguments. We can see this in a different form in the code where it creates the arguments for llvm (src https://github.com/saulshanabrook/numba/blob/5a9c586ec02680ed30f75e6965bb703be576198b/numba/targets/callconv.py#L463-L466 ):
realargs = [retvaltmp, excinfoptr, env] + args code = builder.call(callee, realargs) status = self._get_return_status(builder, code, builder.load(excinfoptr))
So I think the next step is to write a basic xnd version of vectorize that takes in xnd types and performs the jitted function on them values inside.
The design issue here that I am unsure about is how we do the broadcasting/looping through xnd data structures. Is there code in gumath to do that already? Or if there isn't code, what are the rules about that? Obviously if we have two types with the same shape, it's intuitive how that should work.
I guess follow the numpy rules? https://docs.scipy.org/doc/ numpy-1.14.0/user/basics.broadcasting.html#general-broadcasting-rules
— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/plures/gumath/issues/5#issuecomment-384419649, or mute the thread https://github.com/notifications/unsubscribe-auth/AAPjoFVreqpHnVFYEF4SOukexqcL_HiLks5tsNlzgaJpZM4TKSLi .
Is the number inside IntType bits or bytes.
bits, yeah you are right I changed it to 64.
I think it would be best if xnd allowed for a single-element or 0-d kernel and then you would simply create a call-wrapper around your jitted function.
I agree. My intent here is to manually wrap sin
, and then once that is working swap in a jitted function there.
I successfully generated a custom 0D kernel with LLVM, thanks to the example @skrah's added.
import llvmlite.binding as llvm
from llvmlite import ir
from llvmlite.ir import PointerType as ptr, LiteralStructType as struct
from toolz.functoolz import thread_first as tf
import gumath as gm
from xnd import xnd
# All these initializations are required for code generation!
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter() # yes, even this one
def create_execution_engine():
"""
Create an ExecutionEngine suitable for JIT code generation on
the host CPU. The engine is reusable for an arbitrary number of
modules.
"""
# Create a target machine representing the host
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
# And an execution engine with an empty backing module
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
return engine
def add_module(engine, llvm_ir):
mod = llvm.parse_assembly(llvm_ir)
mod.verify()
# Now add the module and make sure it is ready for execution
engine.add_module(mod)
def create_module():
module = ir.Module(context=ir.context.Context())
i8, i16, i32, i64 = map(ir.IntType, [8, 16, 32, 64])
index = lambda i: ir.Constant(i32, i)
struct = ir.LiteralStructType
func = module.declare_intrinsic('llvm.sin', [ir.DoubleType()])
context = module.context
ndt_slice_t = context.get_identified_type("ndt_slice_t")
ndt_slice_t.set_body(i64, i64, i64)
ndt_t = context.get_identified_type("_ndt")
ndt_t.set_body(
i32, i32, i32, i32, i64, i16,
struct([struct([i64, i64, i64, ptr(ptr(ndt_t))])]),
struct([struct([struct([i32, i64, i32, ptr(i32), i32, ptr(ndt_slice_t)])])]),
ir.ArrayType(i8, 16)
)
xnd_bitmap_t = context.get_identified_type("xnd_bitmap")
xnd_bitmap_t.set_body(
ptr(i8),
i64,
ptr(xnd_bitmap_t)
)
xnd_t = context.get_identified_type("xnd")
xnd_t.set_body(
xnd_bitmap_t,
i64,
ptr(ndt_t),
ptr(i8)
)
ndt_context_t = context.get_identified_type("_ndt_context_t")
ndt_context_t.set_body(
i32, i32, i32,
struct([ptr(i8)])
)
srctype = ir.DoubleType()
desttype = ir.DoubleType()
numba_gm_sin = ir.Function(module, ir.FunctionType(
i32,
(
ptr(xnd_t),
ptr(ndt_context_t),
)
), name="numba_gm_sin")
stack, context = numba_gm_sin.args
block = numba_gm_sin.append_basic_block(name="entry")
builder = ir.IRBuilder(block)
in_ = tf(
stack,
(builder.gep, [index(0), index(3)], True),
(builder.bitcast, ptr(ptr(srctype))),
builder.load,
(builder.load, 'in')
)
res = builder.call(func, [in_], name='res')
out = tf(
stack,
(builder.gep, [index(1), index(3)], True),
(builder.bitcast, ptr(ptr(desttype))),
builder.load
)
builder.store(res, out)
builder.ret(ir.Constant(i32, 0))
return module
module = create_module()
print(module)
engine = create_execution_engine()
add_module(engine, str(module))
engine.finalize_object()
engine.run_static_constructors()
func_ptr = engine.get_function_address("numba_gm_sin")
gm.numba_sin = gm.unsafe_add_xnd_kernel(name="double", sig="... * float64 -> ... * float64", ptr=func_ptr)
x = xnd([-1.0, -2e122, 3.0, 0.3])
print(x, gm.sin(x), gm.numba_sin(x), gm.xnd_sin0d(x))
This outputs:
; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""
%"ndt_slice_t" = type {i64, i64, i64}
%"_ndt" = type {i32, i32, i32, i32, i64, i16, {{i64, i64, i64, %"_ndt"**}}, {{{i32, i64, i32, i32*, i32, %"ndt_slice_t"*}}}, [16 x i8]}
%"xnd_bitmap" = type {i8*, i64, %"xnd_bitmap"*}
%"xnd" = type {%"xnd_bitmap", i64, %"_ndt"*, i8*}
%"_ndt_context_t" = type {i32, i32, i32, {i8*}}
declare double @"llvm.sin.f64"(double %".1")
define i32 @"numba_gm_sin"(%"xnd"* %".1", %"_ndt_context_t"* %".2")
{
entry:
%".4" = getelementptr inbounds %"xnd", %"xnd"* %".1", i32 0, i32 3
%".5" = bitcast i8** %".4" to double**
%".6" = load double*, double** %".5"
%"in" = load double, double* %".6"
%"res" = call double @"llvm.sin.f64"(double %"in")
%".7" = getelementptr inbounds %"xnd", %"xnd"* %".1", i32 1, i32 3
%".8" = bitcast i8** %".7" to double**
%".9" = load double*, double** %".8"
store double %"res", double* %".9"
ret i32 0
}
xnd([-1.0, -2e+122, 3.0, 0.3], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64') xnd([-0.8414709848078965, 0.44599498177837843, 0.1411200080598672, 0.29552020666133955], type='4 * float64')
I based it off of the optimized LLVM that clang produces when it compiles the c code:
%union.anon = type { %struct.anon.0 }
%struct.anon.0 = type { i64, i64, i64, %struct._ndt** }
%struct.anon.18 = type { %union.anon.19 }
%union.anon.19 = type { %struct.anon.21 }
%struct.anon.21 = type { i32, i64, i32, i32*, i32, %struct.ndt_slice_t* }
%struct.ndt_slice_t = type { i64, i64, i64 }
%struct.ndt_constraint_t = type { i32 (i64*, i8*, %struct._ndt_context*)*, i32, i32, [16 x i8*] }
%struct._ndt_context = type { i32, i32, i32, %union.anon.10 }
%union.anon.10 = type { i8* }
%struct.xnd = type { %struct.xnd_bitmap, i64, %struct._ndt*, i8* }
%struct.xnd_bitmap = type { i8*, i64, %struct.xnd_bitmap* }
%struct.gm_kernel_init_t = type { i8*, i8*, %struct.ndt_constraint_t*, i8, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (%struct.xnd*, %struct._ndt_context*)*, i32 (i8**, i64*, i64*, i8*)* }
%struct._gm_tbl = type opaque
%struct.ndt_methods_t = type { i1 (i8*, i8*, %struct._ndt_context*)*, i1 (i8*, %struct._ndt_context*)*, i8* (i8*, %struct._ndt_context*)* }
define i32 @gm_0D_sin_d_d(%struct.xnd* nocapture readonly, %struct._ndt_context* nocapture readnone) #0 {
%3 = getelementptr inbounds %struct.xnd, %struct.xnd* %0, i64 0, i32 3
%4 = bitcast i8** %3 to double**
%5 = load double*, double** %4, align 8, !tbaa !3
%6 = load double, double* %5, align 8, !tbaa !10
%7 = tail call double @llvm.sin.f64(double %6)
%8 = getelementptr inbounds %struct.xnd, %struct.xnd* %0, i64 1, i32 3
%9 = bitcast i8** %8 to double**
%10 = load double*, double** %9, align 8, !tbaa !3
store double %7, double* %10, align 8, !tbaa !10
ret i32 0
}
Now I want to be create that on the fly using the result of a jitted function, which can then be used for a vectorize decorator.
I also had to an unsafe_add_xnd_kernel
python method to gumath, which is the same as unsafe_add_numpy_kernel
but sets the function to Xnd
instead of Vectorize
and sets vectorize
to false
:
static PyObject *
unsafe_add_xnd_kernel(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwds)
{
NDT_STATIC_CONTEXT(ctx);
static char *kwlist[] = {"name", "sig", "ptr", NULL};
gm_kernel_init_t k = {NULL};
gm_func_t *f;
char *name;
char *sig;
PyObject *ptr;
void *p;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "ssO", kwlist, &name, &sig,
&ptr)) {
return NULL;
}
p = PyLong_AsVoidPtr(ptr);
if (p == NULL) {
return NULL;
}
k.name = name;
k.sig = sig;
k.vectorize = false;
k.Xnd = p;
if (gm_add_kernel(table, &k, &ctx) < 0) {
return seterr(&ctx);
}
f = gm_tbl_find(table, name, &ctx);
if (f == NULL) {
return seterr(&ctx);
}
return gufunc_new(table, f->name);
}
static PyMethodDef gumath_methods [] =
{
/* Methods */
{ "unsafe_add_numpy_kernel", (PyCFunction)unsafe_add_numpy_kernel, METH_VARARGS|METH_KEYWORDS, NULL },
{ "unsafe_add_xnd_kernel", (PyCFunction)unsafe_add_xnd_kernel, METH_VARARGS|METH_KEYWORDS, NULL },
{ NULL, NULL, 1 }
};
I successfully implemented this with manually specified type signatures: #12
Closing this to move over to numba
We should add support for numba to be able to call gumath kernels. The numba code should be able to call the c functions for the kernels directly.
Also, we should be able to have vectorize compile to a gumath kernel, instead of a numpy one, so that it can the function can be executed from C, without a python layer, and so it can be jitted without python involved