Closed mpichbot closed 7 years ago
Originally by Klug@astonshell.com on 2009-07-25 13:02:13 -0500
This message has 0 attachment(s)
Originally by jayesh on 2009-07-27 14:55:36 -0500
Hi, thanks for the bug report. Can you try the newer Nemesis channel? (I am seeing the problem only with the shm and sock channels; try "mpiexec -n 2 -channel nemesis foo.exe".) Meanwhile, please follow this ticket for our findings related to the bug.
Regards, Jayesh
Originally by jayesh on 2009-07-28 14:14:37 -0500
Hi, This looks to me to be a bug in the serialization of the classes in the code. The modified code below works for me (with shm, sock, ssm, nemesis).
#include <iostream>
#include <boost/mpi/environment.hpp>
#include <boost/mpi/communicator.hpp>
using namespace std;
namespace mpi = boost::mpi;
// Simple fixed-layout payload used to exercise Boost.MPI serialization.
class TFoo
{
private:
friend class boost::serialization::access;

// Serializes every member; Boost.MPI maps this onto a derived MPI
// datatype because of the BOOST_IS_MPI_DATATYPE declaration below.
template<class Archive>
void serialize(Archive &ar, const unsigned int /*version*/)
{
    ar & a_;
    ar & d_;
    ar & f_;
}

int    a_;
double d_;
float  f_;

public:
// Default construction: all fields zeroed.
TFoo() : a_(0), d_(0.0), f_(0.0) {}

// Field-wise construction.
TFoo(int a, double d, float f) : a_(a), d_(d), f_(f) {}

// Copy construction: member-wise, via the initializer list.
TFoo(const TFoo &other) : a_(other.a_), d_(other.d_), f_(other.f_) {}
};
// Tell Boost.MPI this type can be transmitted as a plain MPI datatype.
BOOST_IS_MPI_DATATYPE(TFoo)
class TBar
{
private:
friend class boost::serialization::access;
template<class Archive>
void serialize(Archive & ar, const unsigned int version)
{
ar & h_[0];
ar & h_[1];
ar & q_[0];
ar & q_[1];
}
int h_[2];
TFoo q_[2];
TFoo *q1_[2];
public:
TBar(){
h_[0] # 0; h_[1]0;
}
TBar(int h_0, int h_1, TFoo q_0, TFoo q_1){
h_[0] = h_0;
h_[1] = h_1;
q_[0] = q_0;
q_[1] = q_1;
q1_[0] = new TFoo(q_0);
q1_[1] = new TFoo(q_1);
}
};
BOOST_IS_MPI_DATATYPE(TBar)
int main(int argc, char* argv[])
{
int i=0;
mpi::environment env(argc, argv);
mpi::communicator world;
std::cout << "I am process " << world.rank() << " of " << world.size()
<< "." << std::endl;
if (world.size() != 2)
{
std::cout << "set world.size() = 2" << std::endl;
return 0;
}
/*
TBar foo =
{
12345678,
3.14159265327,
3.1415926f
};
-/
TFoo foo(12345678, 3.14159265327, 3.1415926f);
TBar bar(12345678, 12345678, foo, foo);
if (world.rank() == 0)
{
world.send(1, 0, foo);//AV here!!!
world.send(1, 0, bar);//AV here!!!
}
else
{
TFoo foo1;
TBar ba[1];
world.recv(0, 0, foo1);
world.recv(0, 0, ba[1]);
}
return 0;
}
Regards, Jayesh
Originally by jayesh on 2009-07-28 16:03:02 -0500
Hi, The following code should also work,
#include <iostream>
#include <boost/mpi/environment.hpp>
#include <boost/mpi/communicator.hpp>
#include <boost/serialization/array.hpp>
#include <boost/serialization/base_object.hpp>
#include <boost/serialization/utility.hpp>
...
// Variant of TBar that serializes q_ as a whole array
// (requires <boost/serialization/array.hpp>).
class TBar
{
private:
friend class boost::serialization::access;
int h_[2];
TFoo q_[2];

// h_ is serialized element-by-element; q_ as a whole array.
template<class Archive>
void serialize(Archive & ar, const unsigned int version)
{
    ar & h_[0];
    ar & h_[1];
    ar & q_;
}
public:
TBar(){
    h_[0] = 0; h_[1] = 0;   // was garbled in transit: "h_[0] # 0; h_[1]0;"
}
TBar(int h_0, int h_1, TFoo q_0, TFoo q_1){
    h_[0] = h_0;
    h_[1] = h_1;
    q_[0] = q_0;
    q_[1] = q_1;
}
};
...
Regards, Jayesh
Originally by jayesh on 2009-07-28 16:45:56 -0500
Hi, However trying to serialize both h and q (instead of individual elements) failed... I am looking into the code and will update you on my findings.
Regards, Jayesh
Originally by jayesh on 2009-07-30 10:01:21 -0500
Hi, The bug seems to be in the boost MPI serialization code. The code in 1.39.0 does not handle serialization of arrays (for MPI datatypes) correctly. The following change (I would recommend contacting the boost MPI developers for the correct fix) in the boost header file "boost\mpi\detail\mpi_datatype_primitive.hpp" is required for programs that serialize arrays to behave correctly. Replace the save_array() member function (line 61-66 in mpi_datatype_primitive.hpp @ 1.39.0) with the code below,
// fast saving of arrays of MPI types
// NOTE(review): quoted replacement for save_array() in
// boost/mpi/detail/mpi_datatype_primitive.hpp (lines 61-66 in Boost 1.39.0).
// It relies on the class members addresses/types/lengths and save_impl(),
// which are not visible in this excerpt.
template<class T>
void save_array(serialization::array<T> const& x, unsigned int /* version */)
{
BOOST_ASSERT (addresses.size() > 0);
BOOST_ASSERT (types.size() > 0);
BOOST_ASSERT (lengths.size() > 0);
// We don't need the array size. Pop it !
// (Drops the bookkeeping entry recorded for the array size so the
// address/type/length triples describe only the array's elements.)
addresses.pop_back();
types.pop_back();
lengths.pop_back();
// Empty arrays contribute nothing to the derived datatype.
if (x.count())
save_impl(x.address(), boost::mpi::get_mpi_datatype(*x.address()), x.count());
}
I used the code below (Run it with 1 MPI process, mpiexec -n 1 foo.exe) for my testing,
#include <iostream>
#include <boost/mpi/environment.hpp>
#include <boost/mpi/communicator.hpp>
#include <boost/serialization/array.hpp>
#include <boost/serialization/base_object.hpp>
#include <boost/serialization/utility.hpp>
using namespace std;
namespace mpi = boost::mpi;
// Test payload: three scalars plus a fixed-size int array, so the
// whole-array serialization path of Boost.MPI gets exercised.
class TFoo
{
private:
friend class boost::serialization::access;

template<class Archive>
void serialize(Archive &ar, const unsigned int version)
{
    ar & a_;
    ar & d_;
    ar & f_;
    ar & foo_arr;   // serialized as a whole array
}

int    a_;
double d_;
float  f_;
int    foo_arr[5];

public:
// Default: zero the scalars, fill foo_arr with 0..4.
TFoo() : a_(0), d_(0.0), f_(0.0)
{
    for (int idx = 0; idx < 5; ++idx)
        foo_arr[idx] = idx;
}

// Field-wise construction; foo_arr is still filled with 0..4.
TFoo(int a, double d, float f) : a_(a), d_(d), f_(f)
{
    for (int idx = 0; idx < 5; ++idx)
        foo_arr[idx] = idx;
}

// Member-wise copy.
TFoo(const TFoo &other) : a_(other.a_), d_(other.d_), f_(other.f_)
{
    for (int idx = 0; idx < 5; ++idx)
        foo_arr[idx] = other.foo_arr[idx];
}

// Dump all members to stdout (scalars on one line, array on the next).
void print(void )
{
    std::cout << a_ << " , " << d_ << " , " << f_ << endl;
    for (int idx = 0; idx < 5; ++idx)
        std::cout << foo_arr[idx] << " , ";
    std::cout << std::endl;
}

// Dump the addresses of the scalar members (layout debugging aid).
void printAddresses(void )
{
    std::cout << &a_ << " , " << &d_ << " , " << &f_ << endl;
}
};
BOOST_IS_MPI_DATATYPE(TFoo)
// Container of two ints and two TFoo values; both members are
// serialized as whole arrays.
class TBar
{
private:
friend class boost::serialization::access;
int h_[2];
TFoo q_[2];

template<class Archive>
void serialize(Archive & ar, const unsigned int version)
{
    ar & h_;
    ar & q_;
}
public:
TBar(){
    h_[0] = 0; h_[1] = 0;   // was garbled in transit: "h_[0] # 0; h_[1]0;"
}
TBar(int h_0, int h_1, TFoo q_0, TFoo q_1){
    h_[0] = h_0;
    h_[1] = h_1;
    q_[0] = q_0;
    q_[1] = q_1;
}
// Dump values: the int pair, then both TFoo elements.
void print(void ){
    cout << h_[0] << " , " << h_[1] << endl;
    q_[0].print(); q_[1].print();
}
// Dump member addresses (layout debugging aid).
void printAddresses(void ){
    std::cout << " &h_ = " << &h_ << " &q_" << &q_ << std::endl;   // "=" was garbled to "#"
    std::cout << &h_[0] << " , " << &h_[1] << std::endl;
    q_[0].printAddresses(); q_[1].printAddresses();
}
};
BOOST_IS_MPI_DATATYPE(TBar)
// Single-process self-send test (run with: mpiexec -n 1 foo.exe).
int main(int argc, char* argv[])
{
    mpi::environment env(argc, argv);
    mpi::communicator world;
    std::cout << "I am process " << world.rank() << " of " << world.size()
              << "." << std::endl;
    TFoo foo(1234, 3.14, 3.14f);
    TFoo foo_next(5678, 4.12, 4.12f);
    TBar bar(1234, 1234, foo, foo_next);
    try{
        if (world.rank() == 0)
        {
            TFoo foo1;
            TBar ba[1];
            world.isend(0, 0, foo);//AV here!!!
            world.isend(0, 0, bar);//AV here!!!
            world.recv(0, 0, foo1);
            world.recv(0, 0, ba[0]);   // was ba[1]: out of bounds on a 1-element array
            std::cout << "FOO ======================" << std::endl;
            foo1.print();
            std::cout << "BAR ======================" << std::endl;
            ba[0].print();             // was ba[1]
        }
    }catch(mpi::exception &exception){
        cout << "Error :" << exception.what();
    }
    return 0;
}
I would recommend contacting the boost MPI developers regarding the appropriate fix.
Regards, Jayesh
Originally by Klug@astonshell.com on 2009-07-31 10:21:22 -0500
Hello!
> The bug seems to be in the boost MPI serialization code. The code in
> 1.39.0 does not handle serialization of arrays (for MPI datatypes)
> correctly...
[skip]
Please look at new code (without any C++ wrapper):
==========================================
#include <stdio.h>
#include <iostream>
#include <mpi.h>
// Plain struct mirrored into an MPI derived datatype in main();
// member order must match the offsets passed to MPI_Type_struct.
struct TFoo
{
int a_;
double d_;
float f_;
int g_[5];
};
// Nests TFoo; its MPI datatype (bar_type) embeds the committed foo_type.
struct TBar
{
int h_[2];
TFoo q_[2];
};
int main(int argc, char** argv)
{
MPI_Init(&argc, &argv);
int rank, size;
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
std::cout << "I am process " << rank << " of " << size
<< "." << std::endl;
if (size != 2)
{
std::cout << "set world.size() = 2" << std::endl;
return 0;
}
TFoo foo =
{
1,
3.14159265327,
3.1415926f,
};
TBar bar =
{
{1,2},
{
1,
3.14159265327,
3.1415926f,
{},
2,
3.14159265327,
3.1415926f,
}
};
MPI_Datatype foo_type;
{
int lens[] = {1,1,1,_countof(foo.g_)};
MPI_Aint offs[_countof(lens)] =
{
(MPI_Aint)((char*)&foo.a_ - (char*)&foo),
(MPI_Aint)((char*)&foo.d_ - (char*)&foo),
(MPI_Aint)((char*)&foo.f_ - (char*)&foo),
(MPI_Aint)((char*)&foo.g_[0] - (char*)&foo)
};
MPI_Datatype types[_countof(lens)] =
{
MPI_INT,
MPI_DOUBLE,
MPI_FLOAT,
MPI_INT
};
MPI_Type_struct(_countof(lens), lens, offs, types, &foo_type);
MPI_Type_commit(&foo_type);
}
MPI_Datatype bar_type;
{
int lens[] = {_countof(bar.h_), _countof(bar.q_)};
MPI_Aint offs[_countof(lens)] =
{
(MPI_Aint)((char*)&bar.h_[0] - (char*)&bar),
(MPI_Aint)((char*)&bar.q_[0] - (char*)&bar)
};
MPI_Datatype types[_countof(lens)] =
{
MPI_INT,
foo_type//it is OK?
};
MPI_Type_struct(_countof(lens), lens, offs, types, &bar_type);
MPI_Type_commit(&bar_type);
}
if (rank == 0)
{
MPI_Send(&foo, 1, foo_type, 1, 0, MPI_COMM_WORLD);//OK
MPI_Send(&bar, 1, bar_type, 1, 0, MPI_COMM_WORLD);//AV here!!!
}
else
{
MPI_Status st;
TFoo foo2 = {};
MPI_Recv(&foo2, 1, foo_type, 0, 0, MPI_COMM_WORLD, &st);
if (foo2.a_ == foo.a_)
std::cout << "foo::OK" << std::endl;
TBar ba[2] = {};
MPI_Recv(&ba[2], 1, bar_type, 0, 0, MPI_COMM_WORLD, &st);
if (ba[2].h_[0] == bar.h_[0])
std::cout << "bar::OK" << std::endl;
}
MPI_Barrier(MPI_COMM_WORLD);
std::cout << "quit process " << rank << std::endl;
MPI_Finalize();
return 0;
}
==========================================
This program works fine with the nemesis channel, but causes an AV with the shm
channel (in mpich2-1.1-win-x86-64).
Now I'm not sure whether it is correct to use "foo_type" in the nested
MPI_Datatype...
---
WBR
Originally by jayesh@mcs.anl.gov on 2009-07-31 10:41:45 -0500
Hi,
Using an MPI datatype after it is committed to create a new datatype is
valid. Let me take a look at your code and get back to you.
Regards,
Jayesh
[attachment:"untitled-part.html"]
Originally by jayesh@mcs.anl.gov on 2009-07-31 10:41:45 -0500
Attachment added: untitled-part.html
(0.5 KiB)
Added by email2trac
Originally by jayesh@mcs.anl.gov on 2009-07-31 13:43:26 -0500
Hi,
I did not get any errors when running your code on a 64-bit machine
(MPICH2 1.1.1) with shm/ssm/nemesis/sock channels .
Try downloading the latest version of MPICH2 and let us know if it works
for you.
Regards,
Jayesh
[attachment:"untitled-part-1.html"]
Originally by jayesh@mcs.anl.gov on 2009-07-31 13:43:26 -0500
Attachment added: untitled-part-1.html
(0.6 KiB)
Added by email2trac
Originally by Klug@astonshell.com on 2009-08-01 11:03:58 -0500
Hello!
> Try downloading the latest version of MPICH2 and let us know if it works
> for you.
Oops, we have the version 1.1.
The new version (1.1.1) works without errors.
Thank you for great support!!
---
WBR
Originally by Oleg Bulychov Klug@astonshell.com on 2009-07-25 13:02:13 -0500