pocl / pocl

pocl - Portable Computing Language
http://portablecl.org
MIT License
910 stars 251 forks source link

Crashes in Pocl's workgroup analysis (?) with atomics #1509

Open inducer opened 2 months ago

inducer commented 2 months ago

Consider the following reproducer:

#!/usr/bin/env python

import numpy as np
import pyopencl as cl

n = 10

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(10, 1, 1))) loopy_kernel(__global volatile int *__restrict__ out)
{
  __local volatile int temp;

  int loopy_old_val;
  int loopy_new_val;
  do
  {
    loopy_old_val = temp;
    loopy_new_val = (volatile int) (0);
  }
  while (atomic_cmpxchg(&(temp), loopy_old_val, loopy_new_val) != loopy_old_val);
  barrier(CLK_LOCAL_MEM_FENCE) /* Barrier inserted due to lb1 */;
  int loopy_old_val_0;
  int loopy_new_val_0;
  do
  {
    loopy_old_val_0 = temp;
    loopy_new_val_0 = loopy_old_val_0 + 1;
  }
  while (atomic_cmpxchg(&(temp), loopy_old_val_0, loopy_new_val_0) != loopy_old_val_0);
  barrier(CLK_LOCAL_MEM_FENCE) /* Barrier inserted due to lb2 */;
  int loopy_old_val_1;
  int loopy_new_val_1;
  do
  {
    loopy_old_val_1 = out[lid(0)];
    loopy_new_val_1 = temp;
  }
  while (atomic_cmpxchg(&(out[lid(0)]), loopy_old_val_1, loopy_new_val_1) != loopy_old_val_1);
}
""").build()

res_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, n*4)
knl = prg.loopy_kernel
knl(queue, (n,), (n,), res_g)

I have not yet tried to make a smaller reproducer, but I can do so if requested.

Both pocl 5.0 and 6.0 exhibit Valgrind failures along the following lines. In pocl 6.0, these lead to crashes.

$ PYOPENCL_TEST=port valgrind  --exit-on-first-error=yes --error-exitcode=1 --suppressions=python-simple.supp  python miscompiled-atomic.py
[SNIP]
==278390== Invalid read of size 8
==278390==    at 0x901C3B4: std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::size() const (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.33)
==278390==    by 0x28E0E100: bool std::operator==<char, std::char_traits<char>, std::allocator<char> >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) (basic_string.h:3714)
==278390==    by 0x28E71374: bool __gnu_cxx::__ops::_Iter_equals_val<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const>::operator()<__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > >(__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >) (predefined_ops.h:270)
==278390==    by 0x28E706F4: __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > std::__find_if<__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__ops::_Iter_equals_val<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const> >(__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__ops::_Iter_equals_val<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const>, std::random_access_iterator_tag) (stl_algobase.h:2072)
==278390==    by 0x28E6FF36: __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > std::__find_if<__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__ops::_Iter_equals_val<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const> >(__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__ops::_Iter_equals_val<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const>) (stl_algobase.h:2117)
==278390==    by 0x28E6FA24: __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > std::find<__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >(__gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, __gnu_cxx::__normal_iterator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) (stl_algo.h:3897)
==278390==    by 0x28E6F343: pocl::optimizeWorkItemFuncCalls(llvm::Function&) (OptimizeWorkItemFuncCalls.cc:89)
==278390==    by 0x28E6F8A7: pocl::OptimizeWorkItemFuncCalls::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (OptimizeWorkItemFuncCalls.cc:185)
==278390==    by 0x28E722D1: llvm::detail::PassModel<llvm::Function, pocl::OptimizeWorkItemFuncCalls, llvm::PreservedAnalyses, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (PassManagerInternal.h:89)
==278390==    by 0x2DED9C93: llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (in /usr/lib/llvm-18/lib/libLLVM-18.so.1)
==278390==    by 0x2FE262F0: ??? (in /usr/lib/llvm-18/lib/libLLVM-18.so.1)
==278390==    by 0x2DEDD3A0: llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (in /usr/lib/llvm-18/lib/libLLVM-18.so.1)
==278390==  Address 0x28a97468 is 8 bytes inside a block of size 352 free'd
==278390==    at 0x4843ADF: operator delete(void*, unsigned long) (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==278390==    by 0x28E69271: std::__new_allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::deallocate(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, unsigned long) (new_allocator.h:172)
==278390==    by 0x28E673FD: UnknownInlinedFun (alloc_traits.h:517)
==278390==    by 0x28E673FD: std::_Vector_base<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::_M_deallocate(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, unsigned long) (stl_vector.h:390)
==278390==    by 0x28E67D87: std::_Vector_base<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::~_Vector_base() [clone .lto_priv.0] (stl_vector.h:369)
==278390==    by 0x28E24276: std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::~vector() (stl_vector.h:738)
==278390==    by 0x49F7A15: __run_exit_handlers (exit.c:111)
==278390==    by 0x49F7B49: exit (exit.c:141)
==278390==    by 0x49DFC90: (below main) (libc_start_call_main.h:74)
==278390==  Block was alloc'd at
==278390==    at 0x4840F83: operator new(unsigned long) (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==278390==    by 0x28E69DC7: std::__new_allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >::allocate(unsigned long, void const*) (new_allocator.h:151)
==278390==    by 0x28E68164: UnknownInlinedFun (alloc_traits.h:482)
==278390==    by 0x28E68164: std::_Vector_base<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::_M_allocate(unsigned long) (stl_vector.h:381)
==278390==    by 0x28E67E0B: void std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::_M_range_initialize<char const**>(char const**, char const**, std::forward_iterator_tag) (stl_vector.h:1692)
==278390==    by 0x28E23E6C: std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::vector<char const**, void>(char const**, char const**, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > const&) (stl_vector.h:711)
==278390==    by 0x28E6AF38: __static_initialization_and_destruction_0() [clone .lto_priv.0] (LLVMUtils.cc:572)
==278390==    by 0x28E6AF74: _GLOBAL__sub_I_LLVMUtils.cc (LLVMUtils.cc:611)
==278390==    by 0x28F1D8B2: _sub_I_65535_0.0 (tuple:125)
==278390==    by 0x4004E7D: call_init (dl-init.c:74)
==278390==    by 0x4004E7D: call_init (dl-init.c:26)
==278390==    by 0x4004F63: _dl_init (dl-init.c:121)
==278390==    by 0x4001515: _dl_catch_exception (dl-catch.c:211)
==278390==    by 0x400B76D: dl_open_worker (dl-open.c:810)

For completeness, python-simple.supp is here. It silences a few spurious warnings about Python's memory allocator.

Versions:

pjaaskel commented 2 months ago

Can you send a C++-only minimal repro, ideally as a PR with an XFAIL test in the tests/regressions suite? Thanks!