llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.26k stars 11.67k forks source link

[OMPT] Teams directive with one team causes wrong `parallel_data` in implicit task callback #64734

Open Thyre opened 1 year ago

Thyre commented 1 year ago

Description:

Here's another issue I've encountered while trying to implement support for the teams directive in Score-P. When an application uses the teams directive and requests only a single team (for example via num_teams(1) or OMP_NUM_TEAMS=1), the ompt_callback_implicit_task dispatch for the team's initial task does not pass the parallel_data that was set in ompt_callback_parallel_begin. With any other number of teams, the results are as expected. The same behavior occurs with target teams when x86_64 is selected as the offload target.

Reproducer:

The source code to test this behavior is quite short (aside from the OMPT interface code).

Details

```c #include #include #include #include #include #include #include #include #include #include /* MAIN */ int main( int argc, char** argv ) { omp_set_num_threads(2); omp_set_teams_thread_limit(2); #pragma omp teams default(none) { } } /* OMPT INTERFACE */ #if ( defined( __ppc64__ ) || defined( __powerpc64__ ) || defined( __PPC64__ ) ) #define OMPT_TOOL_CPU_RELAX ( ( void )0 ) #elif ( defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) ) #define OMPT_TOOL_CPU_RELAX __asm__ volatile ( "pause" ) #elif ( defined( __aarch64__ ) || defined( __ARM64__ ) || defined( _M_ARM64 ) ) #define OMPT_TOOL_CPU_RELAX __asm__ volatile ( "yield" ) #else #define OMPT_TOOL_CPU_RELAX ( ( void )0 ) #endif /* Test-and-test-and-set lock. Mutexes are of type bool */ #define OMPT_TOOL_LOCK( MUTEX ) \ while ( true ) \ { \ if ( atomic_flag_test_and_set_explicit( &( MUTEX ), memory_order_acquire ) != true ) \ { \ break; \ } \ OMPT_TOOL_CPU_RELAX; \ } #define OMPT_TOOL_UNLOCK( MUTEX ) atomic_flag_clear_explicit( &( MUTEX ), memory_order_release ); #define OMPT_TOOL_GUARDED_PRINTF( ... ) \ OMPT_TOOL_LOCK( ompt_tool_printf_mutex ) \ printf( __VA_ARGS__ ); \ OMPT_TOOL_UNLOCK( ompt_tool_printf_mutex ) _Thread_local int32_t ompt_tool_tid = -1; /* thread counter. 
>= 1 after thread_begin */ atomic_flag ompt_tool_printf_mutex = ATOMIC_FLAG_INIT; static const char * scope_endpoint2string(ompt_scope_endpoint_t t) { switch (t) { case ompt_scope_begin: return "begin"; case ompt_scope_end: return "end"; case ompt_scope_beginend: return "beginend"; } assert(false); return ""; } static const char* parallel_flag2string( int /* ompt_parallel_flag_t */ t ) { if ( t & ompt_parallel_invoker_program ) { if ( t & ompt_parallel_league ) { assert( t == ( ompt_parallel_invoker_program | ompt_parallel_league ) ); return "invoker_program_league"; } else if ( t & ompt_parallel_team ) { assert( t == ( ompt_parallel_invoker_program | ompt_parallel_team ) ); return "invoker_program_team"; } else { assert( false ); } } else if ( t & ompt_parallel_invoker_runtime ) { if ( t & ompt_parallel_league ) { assert( t == ( ompt_parallel_invoker_runtime | ompt_parallel_league ) ); return "invoker_runtime_league"; } else if ( t & ompt_parallel_team ) { assert( t == ( ompt_parallel_invoker_runtime | ompt_parallel_team ) ); return "invoker_runtime_team"; } else { assert( false ); } } else { assert( false ); } return ""; } static const char* set_result2string( ompt_set_result_t t ) { switch ( t ) { case ompt_set_error: return "error"; case ompt_set_never: return "never"; case ompt_set_impossible: return "impossible"; case ompt_set_sometimes: return "sometimes"; case ompt_set_sometimes_paired: return "sometimes_paired"; case ompt_set_always: return "always"; } assert( false ); return ""; } static const char* task_flag2string( int /* ompt_task_flag_t */ t ) { if ( t & ompt_task_initial ) { assert( t == ompt_task_initial ); return "initial"; } else if ( t & ompt_task_implicit ) { if ( t & ompt_task_undeferred ) { return "implicit_undeferred"; } else if ( t & ompt_task_untied ) { return "implicit_untied"; } else if ( t & ompt_task_final ) { return "implicit_final"; } else if ( t & ompt_task_mergeable ) { return "implicit_mergeable"; } else if ( t & ompt_task_merged ) 
{ return "implicit_merged"; } else { assert( t == ompt_task_implicit ); return "implicit"; } } else if ( t & ompt_task_explicit ) { if ( t & ompt_task_undeferred ) { return "explicit_undeferred"; } else if ( t & ompt_task_untied ) { return "explicit_untied"; } else if ( t & ompt_task_final ) { return "explicit_final"; } else if ( t & ompt_task_mergeable ) { return "explicit_mergeable"; } else if ( t & ompt_task_merged ) { return "explicit_merged"; } else { assert( t == ompt_task_explicit ); return "explicit"; } } else if ( t & ompt_task_target ) { if ( t & ompt_task_undeferred ) { return "target_undeferred"; } else if ( t & ompt_task_untied ) { return "target_untied"; } else if ( t & ompt_task_final ) { return "target_final"; } else if ( t & ompt_task_mergeable ) { return "target_mergeable"; } else if ( t & ompt_task_merged ) { return "target_merged"; } else { assert( t == ompt_task_target ); return "target"; } } else if ( t & ompt_task_taskwait ) { if ( t & ompt_task_undeferred ) { return "taskwait_undeferred"; } else if ( t & ompt_task_untied ) { return "taskwait_untied"; } else if ( t & ompt_task_final ) { return "taskwait_final"; } else if ( t & ompt_task_mergeable ) { return "taskwait_mergeable"; } else if ( t & ompt_task_merged ) { return "taskwait_merged"; } else { assert( t == ompt_task_taskwait ); return "taskwait"; } } else { assert( false ); } return ""; } static const char* thread2string( ompt_thread_t t ) { switch ( t ) { case ompt_thread_initial: return "initial"; case ompt_thread_worker: return "worker"; case ompt_thread_other: return "other"; case ompt_thread_unknown: return "unknown"; } assert( false ); return ""; } void thread_begin_cb( ompt_thread_t thread_type, ompt_data_t* thread_data ) { assert( ompt_tool_tid == -1 ); static atomic_int_least32_t thread_counter = 1; // ompt_tool_tid >= 1 ompt_tool_tid = atomic_fetch_add( &thread_counter, 1 ); thread_data->value = ompt_tool_tid; OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | type = %s\n", 
__FUNCTION__, ompt_tool_tid, thread2string( thread_type ) ); } #define INVALID_TASK 6666666 #define INVALID_PARALLEL 7777777 #define INVALID_THREAD 8888888 void parallel_begin_cb( ompt_data_t* encountering_task_data, const ompt_frame_t* encountering_task_frame, ompt_data_t* parallel_data, unsigned int requested_parallelism, int flags, const void* codeptr_ra ) { static atomic_uint_least64_t parallel_counter = 7770001; parallel_data->value = atomic_fetch_add( ¶llel_counter, 1 ); OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | parallel_data = %" PRIu64 " | %sencountering_task_data = %" PRIu64 " | flags = %s | requested_parallelism = %u | codeptr_ra = %p\n", __FUNCTION__, ompt_tool_tid, parallel_data->value, ( encountering_task_data == NULL || encountering_task_data->value == 0 ) ? "WARNING " : "", encountering_task_data == NULL ? INVALID_TASK : encountering_task_data->value, parallel_flag2string( flags ), requested_parallelism, codeptr_ra ); } void parallel_end_cb( ompt_data_t* parallel_data, ompt_data_t* encountering_task_data, int flags, const void* codeptr_ra ) { OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | %sparallel_data = %" PRIu64 " | %sencountering_task_data = %" PRIu64 " | flags = %s | codeptr_ra = %p\n", __FUNCTION__, ompt_tool_tid, ( parallel_data == NULL || parallel_data->value == 0 ) ? "WARNING " : "", parallel_data == NULL ? INVALID_PARALLEL : parallel_data->value, ( encountering_task_data == NULL || encountering_task_data->value == 0 ) ? "WARNING " : "", encountering_task_data == NULL ? 
INVALID_TASK : encountering_task_data->value, parallel_flag2string( flags ), codeptr_ra ); } static uint64_t new_task( void ) { static atomic_uint_least64_t task_counter = 6660001; return atomic_fetch_add( &task_counter, 1 ); } void implicit_task_cb( ompt_scope_endpoint_t endpoint, ompt_data_t* parallel_data, ompt_data_t* task_data, unsigned int actual_parallelism, unsigned int index, /* For initial tasks, that are not created by a teams construct, this argument is 1. */ int flags ) { if ( endpoint == ompt_scope_begin ) { uint64_t old = task_data->value; task_data->value = new_task(); if( old != 0 ) { // runtime might reuse implicit tasks but doesn't reset // ompt_data_t. Seems to be legal. Seem with NVHPC 23.1. OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | parallel_data = %" PRIu64 " | task_data = %" PRIu64 " (reused: %" PRIu64 ") | endpoint = %s | actual_parallelism = %u | index = %u | flags = %s\n", __FUNCTION__, ompt_tool_tid, parallel_data == NULL ? INVALID_PARALLEL : parallel_data->value, task_data == NULL ? INVALID_TASK : task_data->value, old, scope_endpoint2string( endpoint ), actual_parallelism, index, task_flag2string( flags ) ); return; } } OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | parallel_data = %" PRIu64 " | task_data = %" PRIu64 " | endpoint = %s | actual_parallelism = %u | index = %u | flags = %s\n", __FUNCTION__, ompt_tool_tid, parallel_data == NULL ? INVALID_PARALLEL : parallel_data->value, task_data == NULL ? 
INVALID_TASK : task_data->value, scope_endpoint2string( endpoint ), actual_parallelism, index, task_flag2string( flags ) ); } static int my_initialize_tool( ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t* tool_data ) { OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | initial_device_num %d\n", __FUNCTION__, ompt_tool_tid, initial_device_num ); ompt_set_callback_t set_callback = (ompt_set_callback_t)lookup("ompt_set_callback"); set_callback(ompt_callback_implicit_task, (ompt_callback_t)&implicit_task_cb); set_callback(ompt_callback_thread_begin, (ompt_callback_t)&thread_begin_cb); set_callback(ompt_callback_parallel_begin, (ompt_callback_t)¶llel_begin_cb); set_callback(ompt_callback_parallel_end, (ompt_callback_t)¶llel_end_cb); return 1; /* non-zero indicates success */ } static void my_finalize_tool( ompt_data_t* tool_data ) { OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 "\n", __FUNCTION__, ompt_tool_tid ); } ompt_start_tool_result_t* ompt_start_tool( unsigned int omp_version, const char* runtime_version ) { setbuf( stdout, NULL ); OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | omp_version %d | runtime_version = \'%s\'\n", __FUNCTION__, ompt_tool_tid, omp_version, runtime_version ); static ompt_start_tool_result_t tool = { &my_initialize_tool, &my_finalize_tool, ompt_data_none }; return &tool; } ```

If we compile and run the tool we can see the following:

$ clang --version
clang version 18.0.0 (https://github.com/llvm/llvm-project.git 52ac71f92d38f75df5cb88e9c090ac5fd5a71548)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/software/software/LLVM/git/bin
$ clang -fopenmp reproducer.c 
$ OMP_NUM_TEAMS=1 ./a.out | grep "tid = 1"
[thread_begin_cb] tid = 1 | type = initial
[implicit_task_cb] tid = 1 | parallel_data = 0 | task_data = 6660001 | endpoint = begin | actual_parallelism = 1 | index = 1 | flags = initial
[parallel_begin_cb] tid = 1 | parallel_data = 7770001 | encountering_task_data = 6660001 | flags = invoker_runtime_league | requested_parallelism = 1 | codeptr_ra = 0x5613f5f1f1da
[implicit_task_cb] tid = 1 | parallel_data = 0 | task_data = 6660002 | endpoint = begin | actual_parallelism = 1 | index = 0 | flags = initial
[parallel_begin_cb] tid = 1 | parallel_data = 7770002 | encountering_task_data = 6660002 | flags = invoker_runtime_team | requested_parallelism = 2 | codeptr_ra = (nil)
[implicit_task_cb] tid = 1 | parallel_data = 7770002 | task_data = 6660003 | endpoint = begin | actual_parallelism = 2 | index = 0 | flags = implicit
[implicit_task_cb] tid = 1 | parallel_data = 7777777 | task_data = 6660003 | endpoint = end | actual_parallelism = 2 | index = 0 | flags = implicit
[parallel_end_cb] tid = 1 | parallel_data = 7770002 | encountering_task_data = 6660002 | flags = invoker_runtime_team | codeptr_ra = (nil)
[implicit_task_cb] tid = 1 | parallel_data = 7777777 | task_data = 6660002 | endpoint = end | actual_parallelism = 0 | index = 0 | flags = initial
[parallel_end_cb] tid = 1 | parallel_data = 7770001 | encountering_task_data = 6660001 | flags = invoker_runtime_league | codeptr_ra = 0x5613f5f1f1da
[implicit_task_cb] tid = 1 | parallel_data = 0 | task_data = 6660001 | endpoint = end | actual_parallelism = 0 | index = 1 | flags = initial
[my_finalize_tool] tid = 1
$ OMP_NUM_TEAMS=2 ./a.out | grep "tid = 1"
[thread_begin_cb] tid = 1 | type = initial
[implicit_task_cb] tid = 1 | parallel_data = 0 | task_data = 6660001 | endpoint = begin | actual_parallelism = 1 | index = 1 | flags = initial
[parallel_begin_cb] tid = 1 | parallel_data = 7770001 | encountering_task_data = 6660001 | flags = invoker_runtime_league | requested_parallelism = 2 | codeptr_ra = 0x55a782f901da
[implicit_task_cb] tid = 1 | parallel_data = 7770001 | task_data = 6660002 | endpoint = begin | actual_parallelism = 2 | index = 0 | flags = initial
[parallel_begin_cb] tid = 1 | parallel_data = 7770002 | encountering_task_data = 6660002 | flags = invoker_runtime_team | requested_parallelism = 2 | codeptr_ra = (nil)
[implicit_task_cb] tid = 1 | parallel_data = 7770002 | task_data = 6660003 | endpoint = begin | actual_parallelism = 2 | index = 0 | flags = implicit
[implicit_task_cb] tid = 1 | parallel_data = 7777777 | task_data = 6660003 | endpoint = end | actual_parallelism = 2 | index = 0 | flags = implicit
[parallel_end_cb] tid = 1 | parallel_data = 7770002 | encountering_task_data = 6660002 | flags = invoker_runtime_team | codeptr_ra = (nil)
[implicit_task_cb] tid = 1 | parallel_data = 7777777 | task_data = 6660002 | endpoint = end | actual_parallelism = 0 | index = 0 | flags = initial
[parallel_end_cb] tid = 1 | parallel_data = 7770001 | encountering_task_data = 6660001 | flags = invoker_runtime_league | codeptr_ra = 0x55a782f901da
[implicit_task_cb] tid = 1 | parallel_data = 0 | task_data = 6660001 | endpoint = end | actual_parallelism = 0 | index = 1 | flags = initial
[my_finalize_tool] tid = 1

You can see that, with one team, parallel_data is 0 in the second ompt_callback_implicit_task (begin) callback, whereas with two (or more) teams it carries the expected value (7770001) set in the preceding ompt_callback_parallel_begin.

llvmbot commented 1 year ago

@llvm/issue-subscribers-openmp