Open bertwesarg opened 4 years ago
It would be interesting to know whether that occurs during the run or at the end; ideally, a stack trace would be helpful as well.
I see two possible places:
1) The large transfer types (vector types used to transfer >2G data); 2) The strided and indexed types used to handle the halo transfers (which seems more likely here)
I do not think this will fix it, but here is some cleanup and a check before calling MPI_Type_free:
diff --git i/dart-impl/mpi/include/dash/dart/mpi/dart_communication_priv.h w/dart-impl/mpi/include/dash/dart/mpi/dart_communication_priv.h
index d146865c9..1a66ea3f4 100644 dart-impl/mpi/include/dash/dart/mpi/dart_communication_priv.h
--- i/dart-impl/mpi/include/dash/dart/mpi/dart_communication_priv.h
+++ w/dart-impl/mpi/include/dash/dart/mpi/dart_communication_priv.h
@@ -75,8 +75,6 @@ dart_ret_t dart__mpi__op_fini();
*/
#define MAX_CONTIG_ELEMENTS (INT_MAX)
-#define DART_MPI_TYPE_UNDEFINED (MPI_Datatype)MPI_UNDEFINED
-
typedef enum {
DART_KIND_BASIC = 0,
DART_KIND_STRIDED,
@@ -190,7 +188,7 @@ MPI_Datatype dart__mpi__datatype_maxtype(dart_datatype_t dart_type) {
dart_datatype_struct_t *dts = dart__mpi__datatype_struct(dart_type);
MPI_Datatype res;
if (dart__mpi__datatype_iscontiguous(dart_type)) {
- if (dts->contiguous.max_type == DART_MPI_TYPE_UNDEFINED) {
+ if (dts->contiguous.max_type == MPI_DATATYPE_NULL) {
dts->contiguous.max_type = dart__mpi__datatype_create_max_datatype(
dts->contiguous.mpi_type);
}
diff --git i/dart-impl/mpi/src/dart_communication.c w/dart-impl/mpi/src/dart_communication.c
index b4da40d73..7f3330360 100644 dart-impl/mpi/src/dart_communication.c
--- i/dart-impl/mpi/src/dart_communication.c
+++ w/dart-impl/mpi/src/dart_communication.c
@@ -391,11 +391,11 @@ dart__mpi__put_basic(
CHECK_MPI_RET(
dart__mpi__put(src_ptr,
nchunks,
- dart__mpi__datatype_struct(dtype)->contiguous.max_type,
+ dart__mpi__datatype_maxtype(dtype),
team_unit_id.id,
offset,
nchunks,
- dart__mpi__datatype_struct(dtype)->contiguous.max_type,
+ dart__mpi__datatype_maxtype(dtype),
win,
reqs, num_reqs),
"MPI_Put");
diff --git i/dart-impl/mpi/src/dart_mpi_types.c w/dart-impl/mpi/src/dart_mpi_types.c
index e90c380f3..2ab8a21c9 100644 dart-impl/mpi/src/dart_mpi_types.c
--- i/dart-impl/mpi/src/dart_mpi_types.c
+++ w/dart-impl/mpi/src/dart_mpi_types.c
@@ -312,7 +312,7 @@ dart_type_create_custom(
new_struct->contiguous.size = num_bytes;
new_struct->contiguous.mpi_type = new_mpi_dtype;
// max_type will be created on-demand for custom types
- new_struct->contiguous.max_type = DART_MPI_TYPE_UNDEFINED;
+ new_struct->contiguous.max_type = MPI_DATATYPE_NULL;
*newtype = (dart_datatype_t)new_struct;
DART_LOG_TRACE("Created new custom data type %p with %zu bytes`",
@@ -343,7 +343,7 @@ dart_type_destroy(dart_datatype_t *dart_type_ptr)
MPI_Type_free(&dart_type->indexed.mpi_type);
} else if (dart_type->kind == DART_KIND_CUSTOM) {
MPI_Type_free(&dart_type->contiguous.mpi_type);
- if (dart_type->contiguous.max_type != DART_MPI_TYPE_UNDEFINED) {
+ if (dart_type->contiguous.max_type != MPI_DATATYPE_NULL) {
MPI_Type_free(&dart_type->contiguous.max_type);
}
}
@@ -357,7 +357,8 @@ dart_type_destroy(dart_datatype_t *dart_type_ptr)
static void destroy_basic_type(dart_datatype_t dart_type_id)
{
dart_datatype_struct_t *dart_type = dart__mpi__datatype_struct(dart_type_id);
- MPI_Type_free(&dart_type->contiguous.max_type);
+ if (dart_type->contiguous.max_type != MPI_DATATYPE_NULL)
+ MPI_Type_free(&dart_type->contiguous.max_type);
dart_type->contiguous.max_type = MPI_DATATYPE_NULL;
}
Can you please post a PR for this? :+1:
Sure, do you think it fixes anything related to my problem here?
Hard to say. The confusion between MPI_DATATYPE_NULL and DART_MPI_TYPE_UNDEFINED may be a reason, but I cannot tell for sure.
Actually, the last two lines of the patch might be the culprit (https://github.com/dash-project/dash/pull/709/files#diff-f99e41ced414d50f5b467c4e54685f19R360).
> Actually, the last two lines of the patch might be the culprit (https://github.com/dash-project/dash/pull/709/files#diff-f99e41ced414d50f5b467c4e54685f19R360).
But this should be != MPI_DATATYPE_NULL for all basic types, so it should not matter.
I will push the changes to SPEC and kindly ask whether this fixes the problem on HPE Cray.
Got this error from a SPEC reporter:
Any ideas where this comes from, or what I could request from the reporter?