ViralBShah commented 9 years ago

One of the first things we need to do is make the runtime thread safe. This work is on the threads branch, and this tracker predates the new GC. I thought it is worth capturing a tracker that @StefanKarpinski prepared earlier in an issue to ease the thread safety work.

This list is organized as Variable; Approach

builtins.c

[x] extern size_t jl_page_size; constant
[x] extern int jl_in_inference; lock
[x] extern int jl_boot_file_loaded; constant
[x] int injl = 0; thread-local
ccall.cpp
[ ] static std::mapstd::stringstd::string sonameMap; lock
[ ] static bool got_sonames = false; lock, write-once
[ ] static std::mapstd::stringuv_lib_t* libMap; lock
[ ] static std::mapstd::stringGlobalVariable* libMapGV; lock
[ ] static std::mapstd::stringGlobalVariable* symMapGV; lock
[ ] ~~static char *temp_arg_area; thread-local (will be deleted very soon)~~
[x] ~~static const uint32_t arg_area_sz = 4196; constant (will be deleted very soon)~~
[ ] ~~static uint32_t arg_area_loc; thread-local (will be deleted very soon)~~
[ ] ~~static void *temp_arg_blocks[N_TEMP_ARG_BLOCKS]; thread-local (will be deleted very soon)~~
[ ] ~~static uint32_t arg_block_n = 0; thread-local (will be deleted very soon)~~
[x] ~~static Function *save_arg_area_loc_func; constant (will be deleted very soon)~~
[x] ~~static Function *restore_arg_area_loc_func; constant (will be deleted very soon)~~
cgutils.cpp
[x] static std::map<const std::stringGlobalVariable*> stringConstants; lock
[x] static std::map<void*jl_value_llvm> jl_value_to_llvm; lock
[x] static std::map<Value void> llvm_to_jl_value; lock
[x] static std::vector<Constant*> jl_sysimg_gvars; lock
[x] static std::map<intjl_value_t*> typeIdToType; lock
[x] jl_array_t *typeToTypeId; lock
[x] static int cur_type_id = 1; lock
codegen.cpp
[x] void *__stack_chk_guard = NULL; thread-local (jwn: why is this on the list? it's a constant and not thread local)
debuginfo.cpp
[ ] extern "C" volatile int jl_in_stackwalk;
[ ] JuliaJITEventListener *jl_jit_events;
[ ] static obfiletype objfilemap;
[x] extern char *jl_sysimage_name; constant
[ ] static logdata_t coverageData;
[ ] static logdata_t mallocData;
dump.c
[ ] static jl_array_t *tree_literal_values=NULL; thread-local
[x] static jl_value_t *jl_idtable_type=NULL; constant
[x] static jl_array_t *datatype_list=NULL; thread 0 only
[x] jl_value_t ***sysimg_gvars = NULL; thread 0 only
[x] extern int globalUnique; thread 0 only
[x] static size_t delayed_fptrs_n = 0; thread 0 only
[x] static size_t delayed_fptrs_max = 0; thread 0 only
gc.c
[ ] static volatile size_t allocd_bytes = 0; thread-local
[ ] static volatile int64_t total_allocd_bytes = 0; thread-local
[ ] static int64_t last_gc_total_bytes = 0; thread-local
[x] static size_t freed_bytes = 0; barrier
[x] static uint64_t total_gc_time=0; barrier
[x] int jl_in_gc=0; * referenced from switchto task.c barrier
[x] static htable_t obj_counts; barrier
[x] static size_t total_freed_bytes=0; barrier
[x] static arraylist_t to_finalize; barrier
[x] static jl_value_t **mark_stack = NULL; barrier
[x] static size_t mark_stack_size = 0; barrier
[x] static size_t mark_sp = 0; barrier
[x] extern jl_module_t *jl_old_base_module; constant
[x] extern jl_array_t *typeToTypeId; barrier
[x] extern jl_array_t *jl_module_init_order; barrier
[x] static int is_gc_enabled = 1; atomic
[x] static double process_t0; constant
init.c
[x] char *jl_stack_lo; thread-local
[x] char *jl_stack_hi; thread-local
[ ] volatile sig_atomic_t jl_signal_pending = 0; thread-local
[ ] volatile sig_atomic_t jl_defer_signal = 0; thread-local
[ ] uv_loop_t *jl_io_loop; I/O thread ?
[ ] static void *signal_stack; thread-local (see https://github.com/JuliaLang/julia/issues/9763#issuecomment-69864396)
[x] static mach_port_t segv_port = 0; constant
[ ] extern void * __stack_chk_guard; thread-local (duplicate of above)
jltypes.c
[ ] int inside_typedef = 0; thread-local
[ ] static int match_intersection_mode = 0; thread-local
[ ] static int has_ntuple_intersect_tuple = 0; thread-local
[x] static int t_uid_ctr = 1; lock
llvm-simdloop.cpp
[x] static unsigned simd_loop_mdkind = 0; constant
[x] static MDNode* simd_loop_md = NULL; constant
[x] char LowerSIMDLoop::ID = 0; lock
module.c
[x] jl_module_t *jl_main_module=NULL; constant
[x] jl_module_t *jl_core_module=NULL; constant
[x] jl_module_t *jl_base_module=NULL; constant
[ ] jl_module_t *jl_current_module=NULL; thread-local
[ ] jl_array_t *jl_module_init_order = NULL; lock (this code is bady broken anyways: https://github.com/JuliaLang/julia/issues/9799)
profile.c
[ ] static volatile ptrint_t* bt_data_prof = NULL;
[ ] static volatile size_t bt_size_max = 0;
[ ] static volatile size_t bt_size_cur = 0;
[ ] static volatile u_int64_t nsecprof = 0;
[ ] static volatile int running = 0;
[ ] volatile HANDLE hBtThread = 0;
[ ] static pthread_t profiler_thread;
[ ] static mach_port_t main_thread;
[ ] clock_serv_t clk;
[ ] static int profile_started = 0;
[ ] static mach_port_t profile_port = 0;
[ ] volatile static int forceDwarf = -2;
[ ] volatile mach_port_t mach_profiler_thread = 0;
[ ] static unw_context_t profiler_uc;
[ ] mach_timespec_t timerprof;
[ ] struct itimerval timerprof;
[ ] static timer_t timerprof;
[ ] static struct itimerspec itsprof;
sys.c
[x] JL_STREAM *JL_STDIN=0; constant
[x] JL_STREAM *JL_STDOUT=0; constant
[x] JL_STREAM *JL_STDERR=0; constant
task.c
[ ] volatile int jl_in_stackwalk = 0; thread-local
[x] static size_t _frame_offset; constant
[x] DLLEXPORT jl_task_t * volatile jl_current_task; thread-local
[x] jl_task_t *jl_root_task; constant
[x] jl_value_t * volatile jl_task_arg_in_transit; thread-local
[x] jl_value_t *jl_exception_in_transit; thread-local
[x] __JL_THREAD jl_gcframe_t *jl_pgcstack = NULL; thread-local
[x] jl_jmp_buf * volatile jl_jmp_target; thread-local
[x] extern int jl_in_gc; barrier
[x] static jl_function_t *task__hook_func=NULL; constant
[x] ptrint_t bt_data[MAX_BT_SIZE+1]; thread-local
[x] size_t bt_size = 0; thread-local
[ ] int needsSymRefreshModuleList; lock
[x] jl_function_t *jl_unprotect_stack_func; constant
toplevel.c
[ ] int jl_lineno = 0; thread-local
[x] jl_module_t *jl_old_base_module = NULL; constant
[x] jl_module_t *jl_internal_main_module = NULL; constant
[x] extern int jl_in_inference; lock

StefanKarpinski commented 9 years ago

Don't we have a spreadsheet of this somewhere? Maybe make it public?

ViralBShah commented 9 years ago

https://docs.google.com/a/mayin.org/spreadsheets/d/1FLrB90u0ORvBDmJZtxUfXhSt8HVdKMM0rqYjoOeMvGo/edit#gid=0

Don't know if it is public - but I thought it would be easier to track the work here in the issue.

StefanKarpinski commented 9 years ago

I fancied up the spreadsheet a bit so that it's easier to organize and sort.

vtjnash commented 9 years ago

i've tried to help knock a few off the list based off existing issues and upcoming changes

JeffBezanson commented 9 years ago

That's true, #9986 helps with this.

ViralBShah commented 9 years ago

It would be nice to get basic printing to work:

julia> using Base.Threading
julia> @threads all for i=1:100; println(i) ; end

signal (11): Segmentation fault
Segmentation fault (core dumped)

ArchRobison commented 9 years ago

Are these changes being made in the master, or a branch? I see items checked off for llvm-simdloop.cpp, but don't see them in the master branch.

StefanKarpinski commented 9 years ago

I think this is for the threading branch, right, @JeffBezanson?

tknopp commented 9 years ago

would also be interesting which is the current threading branch, "threading" or "threads"

StefanKarpinski commented 9 years ago

"threading" is newer than "threads"; not sure why we changed branches.

tknopp commented 9 years ago

that why I asked :-) I have seen changes being made to both branches at a time and thus was not sure about it.

Would be also interesting to have an issue what the blockers are for merging into master. IIUC at some point llvm svn was required but this might be solved if we switch to llvm3.6? Further there is still the pthreads dependency.

ArchRobison commented 9 years ago

Moving to C11/C++11 threads would remove the pthreads dependency. Using C11/C++11 would also give us portable atomic ops with good memory consistency controls. LLVM is moving aggressively to depend on C++11 anyway.

tknopp commented 9 years ago

I am not 100% sure why the pthread dependency was added but it seems to be required to set thread affinity. Is this possible with C++11? For atomic ops we have written some macros that use the compiler dependent intrinsics.

ViralBShah commented 9 years ago

@kpamnany had suggested that I use the threads branch and not threading as one of the macros was not quite working. If that is now fixed, I'd rather delete the threads branch and just have everything on threading.

ArchRobison commented 9 years ago

As far as I know, C++11 has no notion of thread affinity. So we still have to write platform-specific code for that. There is a hook in C++11 for getting at the platform-specific thread handle, so I think it's possible to write most of the threading in C++11/C11 and interface to platform-specific stuff for affinity.

tknopp commented 9 years ago

Not sure what exactly we gain when using C++11 threads. The Julia code is currently largely C based and only uses C++ where necessary (i.e. when interfacing with LLVM). This is a design decision. One could simplify quite some code when using C++ and could e.g. use STL containers instead of the self-written ones in libsupport. But the C vs C++ topic can be quite subjective. Its up to Jeff to give a direction here.

ArchRobison commented 9 years ago

C11 threading would work too. It's supposed to interoperate with C++ threads, i.e. in principle it's just difference spellings of the key operations. Though we'd need to check on Windows. Microsoft has been good about implementing the latest versions of C++, but has shown indifference to implementing even C99.

ArchRobison commented 9 years ago

Another route is to figure out the threading model first, and then write platform specific implementations of the core operations. That's the way the Intel Cilk Plus run-time is written, since it requires stack-switching capabilities beyond PThreads.

eschnett commented 9 years ago

To handle thread affinity I recommend using the hwloc library, already nicely wrapped in https://github.com/JuliaParallel/hwloc.jl. hwloc provides a platform-independent API to query number of cores, sockets, caches, etc., and to define the threads' affinity to them.

I find that C++11 threading implementations are based on pthreads, and thus slow. It would e.g. not be efficient to run each Julia task on a new C++11 thread. This makes C++11 threading facilities (the thread class, async method, future objects) rather useless if one is interested in more fine-grained threading. Most languages that are serious about threading define their own thread abstraction and do not rely on pthreads.

@ArchRobison: What stack-switching capabilities does Cilk Plus have? I had the impression that there are certain limitations. I'm currently using Qthreads https://github.com/Qthreads/qthreads, since there each thread has its own stack, and threads can switch arbitrarily.

ArchRobison commented 9 years ago

Yes, direct use of C11/C++11 threading for tasks would be rather useless, but it at least provides portable access to OS-level threads. I would imagine the Julia run-time would have several layers for threading:

Platform specific services. E.g. create a thread.
Unsafe abstractions implemented directly on top of (1), and hence an implementation per platform.
Portable unsafe abstractions built on top of (2)
Safe services for mere mortals, built on top of (2) and (3).

Cilk implementations (including Cilk Plus) implement a cactus stack. This paper gives a good overview of different ways to implement cactus stacks. The limitation to cactus stacks and the "busy leaves" property (each leaf has a thread running on it) is essential to Cilk's strong space and time guarantees, which may or may not be worth the limitations depending on perspective.

For level 2 above, the internals of the Intel implementation use an abstraction similar to Windows fibers, essentially arbitrary stack switching. The file runtime/cilk_fiber-unix.cpp has the Linux version -- about 144 lines of code.

@eschnett How does QThreads deal with stack overflow? I recall that Rust gave up on segmented stacks. Go copies the entire stack, which requires some care with code generation and stack maps, and takes on a little overhead in function prologs. Though overall I like its design since users don't have to worry about stack overflow and are not tied to cactus stacks.

eschnett commented 9 years ago

@ArchRobison QThread stack overflows are handled via good, old signal 11, if you enable stack overflow detection. Otherwise they remain unhandled.

eschnett commented 9 years ago

@ArchRobison: Regarding Cilk Plus: I have the impression -- please correct me if I am wrong -- that Cilk offers only a limited kind of parallelism: A routine can spawn children, but all children need to exit before the routine itself exits. This is less powerful than e.g. C++11, where the spawned thread's state is typically captured in a future that can be returned, stored, and passed around. Thread execution in C++11 thus doesn't form a tree structure, and cactus stacks (as described in the paper you mention) are not relevant. Is this what you meant by "busy leaves property"?

eschnett commented 9 years ago

@ArchRobison: You said "I recall that Rust gave up on segmented stacks." Do you have a pointer to what Rust tried, or how they failed?

kpamnany commented 9 years ago

To clarify on the threads vs. threading branches: there are no macro issues. The latter branch is newer and we should throw away the former, but #10527.

kpamnany commented 9 years ago

The threading code uses pthreads to start threads, set thread affinities, and for a mutex and condition variable pair to let threads sleep rather than spin when they aren't working. The other platform dependency is atomics, which would also be used in the runtime for spin locks and other synchronization constructs.

I've begun looking at C11 for threads and atomics, but if platform-dependent code is required anyways (for thread affinitization), and moving to C11 doesn't give us Windows, then I'm not sure it's worth the effort. hwloc looks cool but very elaborate for just thread affinity control.

To maximize parallel performance, at least on HSW-EP and KNL (i.e. 72 to >250 threads), plenty of platform-specific code will be needed anyway (e.g. the "best" barrier algorithm itself is different).

vtjnash commented 9 years ago

libuv also provides threading primitives: https://github.com/joyent/node/blob/a995a6a776f1b2d01946702190c6ff837c338577/deps/uv/include/uv.h#L1331-L1378

tknopp commented 9 years ago

@vtjnash this is what is used but libuv misses the thread affinity code. I would say extending libuv would be a Good appoach

ArchRobison commented 9 years ago

@eschnett: Here is a link to Rust giving up on segmented stacks.

"Busy leaves" property says that all leaves in the cactus stack have a thread actively running. E.g., on a P-thread system there will be no more than P leaves, and each leaf will have a thread running it. Each path from a leaf to the root corresponds to a stack that would have occurred in the sequential version of the program, hence the total space is bounded by P*(space for sequential execution). An example of a system without the busy leave property is TBB, where a leaf can stall because of the way TBB maps tasks to threads internally. (Alas a trade off we made for easy of portability.)

It looks like libuv's threading support is essentially the same as pthreads. Atomic operations and user-level scheduling are missing. So libuv seems like a good way to break dependence on pthreads, but we'll still need at least macros for atomics and something for decoupling stack<-->thread bindings.

kpamnany commented 9 years ago

I've created a PR for libuv to add support for thread affinitization. See https://github.com/libuv/libuv/pull/273.

It looks like libuv also has implementations of some atomic operations (used internally only right now). Assuming that the libuv devs are willing, will completing the set of atomic operations there and making them externally usable satisfy portability requirements?

User-level threads and scheduling can, I think, be quite easily implemented on top of the current infrastructure.

tkelman commented 9 years ago

Cool stuff. Note that we're using a fork that's way behind upstream libuv, so anything that we add upstream would need to be backported to JuliaLang/libuv for us to easily use it. A long-standing todo has been to upstream the pipe-related API changes that necessitated a fork in the first place, no one has put much work into doing that yet AFAIK.

tknopp commented 9 years ago

Yes getting this upstream should be a long term goal but should not be a barrier. Its perfectly fine to abstract the missing peaces in the julia source tree in a separate file.

tkelman commented 9 years ago

Good to get code review from libuv devs now though, no harm in that. Hopefully the sections of code touched there aren't too different between our fork and latest upstream.

ViralBShah commented 9 years ago

@kpamnany You may already know this, but we use our own fork of libuv. I expect the patches to apply cleanly.

kpamnany commented 9 years ago

I was, actually, unaware of that, but no harm done. And yes, the affinity patch is trivial.

kpamnany commented 9 years ago

I've verified that the fix for https://github.com/JuliaLang/julia/issues/10527 is working fine, so the threading branch is the only one that anyone should be using. The old threads branch can now be safely deleted I think.

ViralBShah commented 9 years ago

Can you delete it in a couple of days - unless someone says something?

ArchRobison commented 9 years ago

How do people feel about pushing some trivial changes needed for threading into master. By trivial I mean ones that would not impact cod esize or performance of serial execution. Ordinarily, I'm not thrilled with burdening master with work in progress, but I'm thinking a few small changes might:

Reduce pull headaches for the threading branch.
Give maintainers of master some notice of what changes the threading branch will bring on..
Make the eventual merge of the threading branch easier to understand.

For starters, I'm thinking specifically of where global variables need to be marked with __JL_THREAD. We could define it as whitespace in master for now.

tknopp commented 9 years ago

@ArchRobison: I think this is a good idea. If this is to be made opt-in for 0.4 it is IMHO a good idea merging it rather sooner than later.

StefanKarpinski commented 9 years ago

+1

On Mar 23, 2015, at 10:05 PM, Arch D. Robison notifications@github.com wrote:

How do people feel about pushing some trivial changes needed for threading into master. By trivial I mean ones that would not impact cod esize or performance of serial execution. Ordinarily, I'm not thrilled with burdening master with work in progress, but I'm thinking a few small changes might:

Reduce pull headaches for the threading branch. Give maintainers of master some notice of what changes the threading branch will bring on.. Make the eventual merge of the threading branch easier to understand. For starters, I'm thinking specifically of where global variables need to be marked with __JL_THREAD. We could define it as whitespace in master for now.

— Reply to this email directly or view it on GitHub.

kpamnany commented 9 years ago

There have been no further comments or action on the libuv PR. Anyone with OS-X or Windows development systems who can test the patch and report there?

The patch does not apply cleanly to the Julia fork of libuv--some thread related code has moved (was in uv-common.c and is now in src/unix/thread.c and src/win/thread.c). libuv is also now using a user-specializable malloc/free. What's the right answer here? Do the minimal additions necessary on Julia's fork of libuv and diverge further?

@tkelman : can you point me at something that explains these pipe related changes that caused us to fork the library?

tkelman commented 9 years ago

can you point me at something that explains these pipe related changes that caused us to fork the library?

Yep, sure can. See https://github.com/joyent/libuv/pull/451, almost 3 years old now. At the very end there's an alternate implementation plan briefly sketched out by Bert Belder, that should hopefully work.

tkelman commented 9 years ago

Do the minimal additions necessary on Julia's fork of libuv and diverge further?

I think short term we do this if we're in a hurry, but aim to upstream everything eventually if possible, subject to available labor.

garrison commented 9 years ago

Glancing at alloc.c, I noticed a few things that are missing from the list/spreadsheet above: gs_ctr should be made per-thread, and jl_gensym and jl_tagged_genym both return static buffers. There may be other things in this file as well that are not thread safe (I have not checked thoroughly).

ViralBShah commented 9 years ago

Another reliable segfault. @kpamnany Am I using @threads incorrectly?

using Base.Threading
MT = Array(MersenneTwister,Threading.nthreads())
@threads all begin
    MT[Threading.threadid()] = MersenneTwister(Threading.threadid())
end

ViralBShah commented 9 years ago

Another feature request - would be really nice to have println working. Impossible to debug without it. I will try to see if I can have a per-thread file for now.

JeffBezanson commented 8 years ago

I'll start a checklist of steps needed before we can turn threading on by default:

[ ] CI setup and tests pass
[ ] Improve stability (e.g. #13380)
[ ] Make Base thread safe (some functions use global or let-bound temp buffers, should be stack allocated instead)
[ ] Stop-the-world GC

Bonus items:

[ ] Integrate task system (and remove yieldto)
[ ] Implement some approach to I/O

StefanKarpinski commented 8 years ago

That's a good list for things that need to be done before threading can be used but I think far less needs to be done before it can be enabled but not used. In fact I think that none of those are necessary for threading support to be enambled by default.

tkelman commented 8 years ago

@yuyichao's opinion was that at least the GC item should be done before enabling it by default. Does the current setup when threads are enabled result in worse performance in the single threaded case?

ViralBShah commented 8 years ago

I do agree that some performance testing would be great to have and we turn on threading by default carefully.

@jrevels Is this something that your perf testing setup you are planning, be used to track?

JeffBezanson commented 8 years ago

Ok, I agree not all of those are necessary. I lowered the priority of some.

I don't think there should be an intermediate state where threading is enabled by default, but shouldn't be used. To me, enabling it by default sends a signal that everybody should feel free to use it.

JuliaLang / julia

Thread safety tracker #10421

builtins.c

ccall.cpp

cgutils.cpp

codegen.cpp

debuginfo.cpp

dump.c

gc.c

init.c

jltypes.c

llvm-simdloop.cpp

module.c

profile.c

sys.c

task.c

toplevel.c