``complex`` + ``int`` cast fails

Here is a minimal working example (MWE):

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> np.zeros(10, dtype="complex") + np.zeros(10, dtype="int") # Passes
>>> gpuarray.zeros(10, dtype="complex") + gpuarray.zeros(10, dtype="int")  # Fails

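Error trace (nvcc fails to compile the generated ``axpbyz`` kernel because no ``operator*`` exists for ``long * pycuda::complex<double>``):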

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/pycuda/pycuda/tools.py:470, in context_dependent_memoize.<locals>.wrapper(*args, **kwargs)
    469 try:
--> 470     return ctx_dict[cur_ctx][cache_key]
    471 except KeyError:

KeyError: <pycuda._driver.Context object at 0x7faa41c31bd0>

During handling of the above exception, another exception occurred:

CompileError                              Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 gpuarray.zeros(10, dtype="complex") + gpuarray.zeros(10, dtype="int")

File ~/pycuda/pycuda/gpuarray.py:588, in GPUArray.__add__(self, other)
    585 if isinstance(other, GPUArray):
    586     # add another vector
    587     result = _get_broadcasted_binary_op_result(self, other)
--> 588     return self._axpbyz(1, other, 1, result)
    590 elif np.isscalar(other):
    591     # add a scalar
    592     if other == 0:

File ~/pycuda/pycuda/gpuarray.py:427, in GPUArray._axpbyz(self, selffac, other, otherfac, out, add_timer, stream)
    421     raise RuntimeError(
    422         "only contiguous arrays may " "be used as arguments to this operation"
    423     )
    424 assert ((self.shape == other.shape == out.shape)
    425     or ((self.shape == ()) and other.shape == out.shape)
    426     or ((other.shape == ()) and self.shape == out.shape))
--> 427 func = elementwise.get_axpbyz_kernel(
    428     self.dtype, other.dtype, out.dtype,
    429     x_is_scalar=(self.shape == ()),
    430     y_is_scalar=(other.shape == ()))
    431 if add_timer is not None:
    432     add_timer(
    433         3 * self.size,
    434         func.prepared_timed_call(
   (...)
    442         ),
    443     )

File ~/pycuda/pycuda/tools.py:474, in context_dependent_memoize.<locals>.wrapper(*args, **kwargs)
    472 context_dependent_memoized_functions.append(func)
    473 arg_dict = ctx_dict.setdefault(cur_ctx, {})
--> 474 result = func(*args, **kwargs)
    475 arg_dict[cache_key] = result
    476 return result

File ~/pycuda/pycuda/elementwise.py:480, in get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, x_is_scalar, y_is_scalar)
    478 by = f"b*(({out_t}) {y})"
    479 result = f"{ax} + {by}"
--> 480 return get_elwise_kernel(
    481     "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
    482     % {
    483         "tp_x": dtype_to_ctype(dtype_x),
    484         "tp_y": dtype_to_ctype(dtype_y),
    485         "tp_z": dtype_to_ctype(dtype_z),
    486     },
    487     f"z[i] = {result}",
    488     "axpbyz",
    489 )

File ~/pycuda/pycuda/elementwise.py:191, in get_elwise_kernel(arguments, operation, name, keep, options, **kwargs)
    185 def get_elwise_kernel(
    186     arguments, operation, name="kernel", keep=False, options=None, **kwargs
    187 ):
    188     """Return a L{pycuda.driver.Function} that performs the same scalar operation
    189     on one or several vectors.
    190     """
--> 191     mod, func, arguments = get_elwise_kernel_and_types(
    192         arguments, operation, name, keep, options, **kwargs
    193     )
    195     return func

File ~/pycuda/pycuda/elementwise.py:177, in get_elwise_kernel_and_types(arguments, operation, name, keep, options, use_range, **kwargs)
    174 else:
    175     module_builder = get_elwise_module
--> 177 mod = module_builder(arguments, operation, name, keep, options, **kwargs)
    179 func = mod.get_function(name)
    180 func.prepare("".join(arg.struct_char for arg in arguments))

File ~/pycuda/pycuda/elementwise.py:46, in get_elwise_module(arguments, operation, name, keep, options, preamble, loop_prep, after_loop)
     35 def get_elwise_module(
     36     arguments,
     37     operation,
   (...)
     43     after_loop="",
     44 ):
     45     from pycuda.compiler import SourceModule
---> 46     return SourceModule(
     47         """
     48         #include <pycuda-complex.hpp>
     49
     50         %(preamble)s
     51
     52         extern "C"
     53         __global__ void %(name)s(%(arguments)s)
     54         {
     55
     56           unsigned tid = threadIdx.x;
     57           unsigned total_threads = gridDim.x*blockDim.x;
     58           unsigned cta_start = blockDim.x*blockIdx.x;
     59           unsigned i;
     60
     61           %(loop_prep)s;
     62
     63           for (i = cta_start + tid; i < n; i += total_threads)
     64           {
     65             %(operation)s;
     66           }
     67
     68           %(after_loop)s;
     69         }
     70         """
     71         % {
     72             "arguments": ", ".join(arg.declarator() for arg in arguments),
     73             "operation": operation,
     74             "name": name,
     75             "preamble": preamble,
     76             "loop_prep": loop_prep,
     77             "after_loop": after_loop,
     78         },
     79         options=options,
     80         keep=keep,
     81         no_extern_c=True,
     82     )

File ~/pycuda/pycuda/compiler.py:355, in SourceModule.__init__(self, source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs)
    341 def __init__(
    342     self,
    343     source,
   (...)
    351     include_dirs=[],
    352 ):
    353     self._check_arch(arch)
--> 355     cubin = compile(
    356         source,
    357         nvcc,
    358         options,
    359         keep,
    360         no_extern_c,
    361         arch,
    362         code,
    363         cache_dir,
    364         include_dirs,
    365     )
    367     from pycuda.driver import module_from_buffer
    369     self.module = module_from_buffer(cubin)

File ~/pycuda/pycuda/compiler.py:304, in compile(source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs, target)
    301 for i in include_dirs:
    302     options.append("-I" + i)
--> 304 return compile_plain(source, options, keep, nvcc, cache_dir, target)

File ~/pycuda/pycuda/compiler.py:154, in compile_plain(source, options, keep, nvcc, cache_dir, target)
    148         warn(
    149             "PyCUDA: nvcc exited with status 0, but appears to have "
    150             "encountered an error"
    151         )
    152     from pycuda.driver import CompileError
--> 154     raise CompileError(
    155         "nvcc compilation of %s failed" % cu_file_path,
    156         cmdline,
    157         stdout=stdout.decode("utf-8", "replace"),
    158         stderr=stderr.decode("utf-8", "replace"),
    159     )
    161 if stdout or stderr:
    162     lcase_err_text = (stdout + stderr).decode("utf-8", "replace").lower()

CompileError: nvcc compilation of /tmp/tmpgvce8ei3/kernel.cu failed
[command: nvcc --cubin -arch sm_70 -I/home/mitak2/pycuda/pycuda/cuda kernel.cu]
[stderr:
kernel.cu(19): error: no operator "*" matches these operands
            operand types are: long * pycuda::complex<double>

1 error detected in the compilation of "kernel.cu".
]

inducer / pycuda

``complex`` + ``int`` cast fails #372