sungho91 opened this issue 9 months ago
Certainly, we can provide assistance by addressing specific questions and offering general guidance. However, we won't have the availability to assist with actual code writing.
@vladotomov, yes, the coding is my work. Addressing questions and offering general guidance would be sufficient.
My plan is to use the existing structure in Laghos. First, I added a new grid function for the stress vector, which has 3*(dim - 1) components, i.e., 3 components in 2D and 6 in 3D. I referenced this grid function into the block vector S, roughly like the sketch below.
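A minimal sketch of the layout I have in mind (the names are illustrative, not the exact Laghos variables; S is the monolithic BlockVector holding x, v, e and the new stress block):
const int H1Vsize = H1.GetVSize();
const int L2Vsize = L2.GetVSize();             // also the size of one stress component
ParGridFunction s_c;
for (int c = 0; c < 3*(dim-1); c++)
{
   // c-th stress component, referenced inside the monolithic vector S
   s_c.MakeRef(&L2, S, 2*H1Vsize + L2Vsize + c*L2Vsize);
   s_c = 0.0;                                  // initial stress
}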
To calculate the stress rate, I tried to mimic ForcePA->Mult(one, rhs), because that operation computes -F*1 (F is an H1-by-L2 operator, so applying it to an L2 vector gives an H1 vector). I made StressPA by copying ForcePA. In StressPA, I tried to multiply the stress rate tensor by the L2 basis functions. Just for simplicity, to check whether the operator works at all, I used P*I (the pressure tensor) as the stress rate tensor. I then called StressPA->Mult(one, s_rhs), where one is a dummy input.
I thought I could just put 1.0 in place of the multiplications by Bt and Gt, which I take to be the H1 shape functions (and their gradients), and skip the multiplication by one (the L2 input). It then boils down to stress rate times the L2 basis.
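In formulas (my notation, just how I read the Laghos operators, so please correct me if this is off):
F_{ij} = \int_\Omega (\sigma : \nabla w_i)\, \phi_j \, dx   (force operator: H1 vector test function w_i, L2 basis \phi_j),
(b_c)_j = \int_\Omega \dot{s}_c \, \phi_j \, dx             (stress-rate RHS I want, with no H1 gradient),
so M_{L2}^{-1} b_c should be the L2 projection of the stress-rate component \dot{s}_c; in particular, for \dot{s} = p I the xx and yy right-hand sides should be identical.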
After this calculation, I divided each component by the mass using Emass. However, when I plot stress_xx and stress_yy, they are different. I think I did something wrong in StressPA, since they should be the same. Any comment would be helpful.
for (int c = 0; c < 3*(dim-1); c++)
{
   ds.MakeRef(&L2, dS_dt, H1Vsize*2 + L2Vsize + c*size);
   ds = 0.0; rhs_s_gf = 0.0;
   rhs_s_gf.MakeRef(&L2, s_rhs, c*size);
   timer.sw_cgL2.Start();
   CG_EMass.Mult(rhs_s_gf, ds);
   timer.sw_cgL2.Stop();
   const HYPRE_Int cg_num_iter = CG_EMass.GetNumIterations();
   timer.L2iter += (cg_num_iter==0) ? 1 : cg_num_iter;
   // Move the memory location of the subvector 'ds' to the memory
   // location of the base vector 'dS_dt'.
   ds.GetMemory().SyncAlias(dS_dt.GetMemory(), ds.Size());
}
StressPAOperator::StressPAOperator(const QuadratureData &qdata,
ParFiniteElementSpace &h1,
ParFiniteElementSpace &l2,
const IntegrationRule &ir) :
Operator(),
dim(h1.GetMesh()->Dimension()),
NE(h1.GetMesh()->GetNE()),
qdata(qdata),
H1(h1),
L2(l2),
H1R(H1.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)),
L2R(L2.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)),
ir1D(IntRules.Get(Geometry::SEGMENT, ir.GetOrder())),
D1D(H1.GetFE(0)->GetOrder()+1),
Q1D(ir1D.GetNPoints()),
L1D(L2.GetFE(0)->GetOrder()+1),
// H1sz(H1.GetVDim() * H1.GetFE(0)->GetDof() * NE),
H1sz(3*(dim-1)*L2.GetVDim() * H1.GetFE(0)->GetDof() * NE),
// L2sz(L2.GetFE(0)->GetDof() * NE),
L2sz(L2.GetVDim() * L2.GetFE(0)->GetDof() * NE),
L2D2Q(&L2.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)),
H1D2Q(&L2.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)),
// H1D2Q(&H1.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)),
X(L2sz), Y(H1sz) { }
template<int DIM, int D1D, int Q1D, int L1D, int NBZ = 1> static
// StressMult2D: adapted from ForceMult2D; builds the RHS for each stress component from the quadrature data.
void StressMult2D(const int NE,
const Array<double> &B_,
const Array<double> &Bt_,
const Array<double> &Gt_,
const DenseTensor &sJit_,
const Vector &x, Vector &y)
{
auto b = Reshape(B_.Read(), Q1D, L1D);
auto bt = Reshape(Bt_.Read(), L1D, Q1D);
auto gt = Reshape(Gt_.Read(), L1D, Q1D);
const double *Stress_rate_JinvT = Read(sJit_.GetMemory(), Q1D*Q1D*NE*DIM*DIM);
auto sJit = Reshape(Stress_rate_JinvT, Q1D, Q1D, NE, DIM, DIM);
auto energy = Reshape(x.Read(), L1D, L1D, NE); // one vector of L2 space
// const double eps1 = std::numeric_limits<double>::epsilon();
// const double eps2 = eps1*eps1;
auto velocity = Reshape(y.Write(), L1D, L1D, 3*(DIM-1), NE); // output: RHS for each stress component (name kept from ForceMult2D)
// auto velocity = Reshape(y.Write(), D1D, D1D, DIM, NE); // rhs vector for external force
MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1,
{
const int z = MFEM_THREAD_ID(z);
MFEM_SHARED double B[Q1D][L1D];
MFEM_SHARED double Bt[L1D][Q1D];
MFEM_SHARED double Gt[L1D][Q1D];
MFEM_SHARED double Ez[NBZ][L1D][L1D];
double (*E)[L1D] = (double (*)[L1D])(Ez + z);
// MFEM_SHARED double LQz[2][NBZ][D1D][Q1D];
// double (*LQ0)[Q1D] = (double (*)[Q1D])(LQz[0] + z);
// double (*LQ1)[Q1D] = (double (*)[Q1D])(LQz[1] + z);
MFEM_SHARED double LQz[2][NBZ][L1D][Q1D];
double (*LQ0)[Q1D] = (double (*)[Q1D])(LQz[0] + z);
double (*LQ1)[Q1D] = (double (*)[Q1D])(LQz[1] + z);
MFEM_SHARED double QQz[3][NBZ][Q1D][Q1D];
double (*QQ)[Q1D] = (double (*)[Q1D])(QQz[0] + z);
double (*QQ0)[Q1D] = (double (*)[Q1D])(QQz[1] + z);
double (*QQ1)[Q1D] = (double (*)[Q1D])(QQz[2] + z);
if (z == 0)
{
MFEM_FOREACH_THREAD(q,x,Q1D)
{
MFEM_FOREACH_THREAD(l,y,Q1D)
{
if (l < L1D) { B[q][l] = b(q,l); }
if (l < L1D) { Bt[l][q] = bt(l,q); }
if (l < L1D) { Gt[l][q] = gt(l,q); }
}
}
}
MFEM_SYNC_THREAD;
// MFEM_FOREACH_THREAD(lx,x,L1D)
// {
// MFEM_FOREACH_THREAD(ly,y,L1D)
// {
// E[lx][ly] = energy(lx,ly,e); // one vector of L2 space
// }
// }
// MFEM_SYNC_THREAD;
// We skip the multiplication by the energy (L2 input) and just use 1.0 below.
MFEM_FOREACH_THREAD(ly,y,L1D)
{
MFEM_FOREACH_THREAD(qx,x,Q1D)
{
double u = 0.0;
for (int lx = 0; lx < L1D; ++lx)
{
// u += B[qx][lx] * E[lx][ly];
u += B[qx][lx] * 1.0;
}
LQ0[ly][qx] = u;
}
}
MFEM_SYNC_THREAD;
MFEM_FOREACH_THREAD(qy,y,Q1D)
{
MFEM_FOREACH_THREAD(qx,x,Q1D)
{
double u = 0.0;
for (int ly = 0; ly < L1D; ++ly)
{
u += B[qy][ly] * LQ0[ly][qx];
}
QQ[qy][qx] = u;
}
}
MFEM_SYNC_THREAD;
// for (int c = 0; c < 3*(DIM-1); ++c)
for (int c = 0; c < 3*(DIM-1); c++)
{
MFEM_FOREACH_THREAD(qy,y,Q1D)
{
MFEM_FOREACH_THREAD(qx,x,Q1D)
{
// const double esx = QQ[qy][qx] * sJit(qx,qy,e,0,0);
if(c == 0)
{
QQ0[qy][qx] = QQ[qy][qx] * sJit(qx,qy,e,0,0);
}
else if (c == 1)
{
QQ0[qy][qx] = QQ[qy][qx] * sJit(qx,qy,e,1,1);
}
else
{
QQ0[qy][qx] = QQ[qy][qx] * sJit(qx,qy,e,0,1);
}
// const double esy = QQ[qy][qx] * sJit(qx,qy,e,1,c);
// QQ1[qy][qx] = esy;
}
}
MFEM_SYNC_THREAD;
MFEM_FOREACH_THREAD(qy,y,Q1D)
{
// MFEM_FOREACH_THREAD(dx,x,D1D)
MFEM_FOREACH_THREAD(dx,x,L1D)
{
double u = 0.0;
// double v = 0.0;
for (int qx = 0; qx < Q1D; ++qx)
{
u += 1.0 * QQ0[qy][qx];
// u += Gt[dx][qx] * QQ0[qy][qx];
// v += Bt[dx][qx] * QQ1[qy][qx];
}
LQ0[dx][qy] = u;
// LQ1[dx][qy] = v;
}
}
MFEM_SYNC_THREAD;
// MFEM_FOREACH_THREAD(dy,y,D1D)
MFEM_FOREACH_THREAD(dy,y,L1D)
{
// MFEM_FOREACH_THREAD(dx,x,D1D)
MFEM_FOREACH_THREAD(dx,x,L1D)
{
double u = 0.0;
// double v = 0.0;
for (int qy = 0; qy < Q1D; ++qy)
{
u += LQ0[dx][qy] * 1.0;
// u += LQ0[dx][qy] * Bt[dy][qy];
// v += LQ1[dx][qy] * Gt[dy][qy];
}
// velocity(dx,dy,c,e) = u + v;
velocity(dx,dy,c,e) = u;
}
}
MFEM_SYNC_THREAD;
}
// for (int c = 0; c < 3*(DIM-1); ++c)
// {
// MFEM_FOREACH_THREAD(dy,y,L1D)
// {
// MFEM_FOREACH_THREAD(dx,x,L1D)
// {
// const double v = velocity(dx,dy,c,e);
// if (fabs(v) < eps2)
// {
// velocity(dx,dy,c,e) = 0.0;
// }
// }
// }
// MFEM_SYNC_THREAD;
// }
});
}
I thought I could just put 1.0 in place of the multiplications by Bt and Gt
I don't understand what you're trying to do. I'm not sure this makes sense; you're not moving properly from quadrature points to dofs.
However, when I plot stress_xx and stress_yy, they are different.
They should be the same if the stress (the quadrature data structure) is indeed pI, i.e., if it doesn't include the artificial viscosity, etc.
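To illustrate what I mean by moving from quadrature points to dofs, here is a rough sketch reusing the names from your kernel above (untested; the point is that the L2 Bt has to appear in both contractions instead of 1.0, and QQ0 should already hold the stress-rate component times the quadrature weight and detJ):
MFEM_FOREACH_THREAD(qy,y,Q1D)
{
   MFEM_FOREACH_THREAD(lx,x,L1D)
   {
      double u = 0.0;
      for (int qx = 0; qx < Q1D; ++qx) { u += Bt[lx][qx] * QQ0[qy][qx]; } // contract over qx
      LQ0[lx][qy] = u;
   }
}
MFEM_SYNC_THREAD;
MFEM_FOREACH_THREAD(ly,y,L1D)
{
   MFEM_FOREACH_THREAD(lx,x,L1D)
   {
      double u = 0.0;
      for (int qy = 0; qy < Q1D; ++qy) { u += LQ0[lx][qy] * Bt[ly][qy]; } // contract over qy
      velocity(lx,ly,c,e) = u; // dof values of the c-th stress-rate RHS
   }
}
MFEM_SYNC_THREAD;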
@vladotomov, I've managed to solve the issue; the reshaping from vector to tensor had some problems.
However, I've encountered another problem related to memory management in CUDA.
I have a vector called Body_Force, which has the same size as rhs (the H1 space vector), and I'm trying to add the two vectors together, i.e., rhs += Body_Force or rhs.Add(1.0, Body_Force).
This addition works correctly on the CPU, but when I run it with CUDA the summation seems incorrect, or perhaps the addition isn't happening at all. I suspect that the Body_Force vector resides only in CPU memory, so it isn't available when I access it in the GPU code. To address this, I tried explicitly marking the vector for device use with Body_Force.UseDevice(true), but this didn't solve the issue as I had hoped.
It seems like the problem lies in how I'm managing memory in CUDA, particularly with ensuring that the necessary data is available on the GPU for computation.
Any comments or suggestions on how to properly handle memory allocation and data transfer between the CPU and GPU would be greatly appreciated.
Have you looked at the Memory manager section in the website docs?
@vladotomov Yes, I did, but it is over my head.
Since rhs is the result of ForcePA->Mult(one, rhs), rhs is allocated in the GPU's memory, isn't it? So I presumed my Body_Force vector has some issue on the GPU. Maybe I should use Body_Force->Read(), Body_Force->Write(), Body_Force->ReadWrite(), or something else, right?
mutable LinearForm *Body_Force;    // body-force RHS on the H1 space
Body_Force(nullptr);               // member initialization (e.g., in the constructor initializer list)
Body_Force = new LinearForm(&H1);
BodyForceIntegrator *bi = new BodyForceIntegrator(qdata);
bi->SetIntRule(&ir);
Body_Force->AddDomainIntegrator(bi);
Body_Force->Assemble();
ForcePA->Mult(one, rhs); // F*1
rhs.Neg(); // -F
rhs += *Body_Force;      // add the body-force contribution
Since rhs is the result of ForcePA->Mult(one, rhs), rhs is allocated in GPU's memory, isn't it?
If rhs is allocated on the device, and ForcePA supports GPU execution, then yes. To get the current data on the host (CPU), you need a HostRead().
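For example, a minimal sketch (assuming rhs is an mfem::Vector that currently lives on the device, and <iostream> is included):
const double *h_rhs = rhs.HostRead();   // syncs device -> host if needed and returns a host pointer
for (int i = 0; i < rhs.Size(); i++) { std::cout << h_rhs[i] << "\n"; }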
You should try to get the partial assembly implementation first, before worrying about GPU execution. PA by itself can be fully tested on the CPU.
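For instance, everything can stay on the host while you debug the PA kernels:
mfem::Device device("cpu");   // or run Laghos with: -pa -d cpu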
@vladotomov Thank you for your comment, but I think I'll need some time to digest it.
I just tested vector operations within MFEM. In Laghos, the velocity is solved component-wise, as shown in the attached code. I printed out B, a vector on the H1c space that is assigned to use the device, and then printed it again after calling B.Neg(). On my terminal, the two printouts are the same, which is really strange. I took a look at Neg() in vector.cpp and I believe it should work correctly on the GPU. Furthermore, rhs.Neg() works correctly. So I am wondering what the rules for vector operations are when the GPU is involved; when I run on the CPU there is no problem and everything is as I expected. Or is there a recommended way to use the operations defined in MFEM on the GPU?
timer.sw_force.Start();
ForcePA->Mult(one, rhs);
timer.sw_force.Stop();
rhs.Neg();
// Partial assembly solve for each velocity component
const int size = H1c.GetVSize();
const Operator *Pconf = H1c.GetProlongationMatrix();
for (int c = 0; c < dim; c++)
{
dvc_gf.MakeRef(&H1c, dS_dt, H1Vsize + c*size);
rhs_c_gf.MakeRef(&H1c, rhs, c*size);
if (Pconf) { Pconf->MultTranspose(rhs_c_gf, B); }
else { B = rhs_c_gf; }
void Vector::Neg()
{
const bool use_dev = UseDevice();
const int N = size;
auto y = ReadWrite(use_dev);
mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE (int i) { y[i] = -y[i]; });
}
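If I follow the same pattern as Neg(), I assume an explicit device-side version of rhs += *Body_Force would look roughly like this (untested sketch; Body_Force is a LinearForm, which is a Vector):
rhs.UseDevice(true);
Body_Force->UseDevice(true);
double *d_rhs = rhs.ReadWrite();            // device pointer when a device is active
const double *d_bf = Body_Force->Read();    // pushes the host data to the device if needed
mfem::forall(rhs.Size(), [=] MFEM_HOST_DEVICE (int i) { d_rhs[i] += d_bf[i]; });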
Can you show the code that prints B? Did you do a HostRead() before the print? There should be no difference between the treatment of rhs and B; they are both Vectors.
Thank you for your comments. B printed out zero, but it printed some values after using HostRead().
I ran a problem (consolidation due to body force) on the CPU and with CUDA. The vertical stress and displacement results look identical to me. However, the horizontal displacement is a bit different. Of course, the horizontal displacement is negligible compared with the vertical displacement, but I was wondering whether this is natural, due to machine differences, or something else. What do you think?
*RXT -> RTX
The differences seem too big, I'd guess it's a bug.
@vladotomov I see, but I don't know the reason for this yet; I'll let you know if I figure it out. By the way, I've tested Laghos with CUDA using different element orders.
Test1: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 2 -ot 1 -tf 0.5 -s 7 -pa -d cuda (dof H1: 8450; dof L2: 4096)
Test2: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 3 -ot 2 -tf 0.5 -s 7 -pa -d cuda (dof H1: 18818; dof L2: 9216)
Test3: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 4 -ot 3 -tf 0.5 -s 7 -pa -d cuda (dof H1: 33282; dof L2: 16384)
Test4: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 5 -ot 4 -tf 0.5 -s 7 -pa -d cuda (dof H1: 51842; dof L2: 25600)
Test5: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 6 -ot 5 -tf 0.5 -s 7 -pa -d cuda (dof H1: 74498; dof L2: 36864)
Test6: ./laghos -p 4 -m data/square_gresho.mesh -rs 4 -ok 7 -ot 6 -tf 0.5 -s 7 -pa -d cuda (dof H1: 101250; dof L2: 50176)
I measured the calculation times: tests 1 to 6 took 3.878s, 6.128s, 8.854s, 1m2.657s, 2m48.928s, and 8m34.461s, respectively. There is a big time jump after Test3. Do you have any thoughts on this? It seems strange to me: the DOFs don't increase drastically, but the time increases a lot.
Strange indeed. Are you running this in your branch? Can I see the code somehow?
I used the master version of Laghos with a few modifications to run high-order elements (order greater than 4). For example, in laghos_assembly.cpp I added {0x290,&ForceMult2D<2,9,16,8>} and {0x290,&ForceMultTranspose2D<2,9,16,8>}, and similarly, in laghos_solver.cpp I added {0x30,&QKernel<2,16>}. The remaining parts are identical to the master version.
(Attachment: laghos_pa.zip)
The problem was that the corresponding mass kernels for the velocity and energy matrices were not in MFEM. Try this PR for the MFEM build, for Q5Q4 2D, and let me know. It removed the timing jump in my tests (I was seeing the same jump without the PR).
Thanks @vladotomov, it works now. It takes 0m17.078s and the computation time now scales almost linearly.
case 0x5A: return SmemPAMassAssembleDiagonal2D<5,10,2>(NE,B,D,Y);
case 0x6A: return SmemPAMassAssembleDiagonal2D<6,10,4>(NE,B,D,Y);
case 0x5A: return SmemPAMassApply2D<5,10,2>(NE,B,Bt,D,X,Y);
case 0x6A: return SmemPAMassApply2D<6,10,4>(NE,B,Bt,D,X,Y);
How can I extend these to Q6Q5, Q7Q6, and Q8Q7? Is there a rule or pattern I can follow based on what you added?
@sungho91, sure, more orders can be added.
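For example, the Q6Q5 analog of the cases above would look roughly like this (a guess, assuming the dispatch key encodes (D1D << 4) | Q1D and that such a run uses Q1D = 12 points per direction; the last template argument is a z-batch size that is only a placeholder to tune; the first two lines go in the diagonal-assembly switch, the last two in the apply switch):
case 0x6C: return SmemPAMassAssembleDiagonal2D<6,12,2>(NE,B,D,Y);
case 0x7C: return SmemPAMassAssembleDiagonal2D<7,12,2>(NE,B,D,Y);
case 0x6C: return SmemPAMassApply2D<6,12,2>(NE,B,Bt,D,X,Y);
case 0x7C: return SmemPAMassApply2D<7,12,2>(NE,B,Bt,D,X,Y);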
Hi,
I'm working on developing a tectonic solver using Laghos. It still has many areas for improvement, but now I can use it for some applications.
The main features I've added to Laghos are elasticity and (brittle) plasticity. For many parts of this, I referred to Dobrev, V.A. et al. (2014), "High order curvilinear finite elements for elastic-plastic Lagrangian dynamics".
However, I was only able to implement the assembly for stress rate in full assembly mode. Therefore, I'm unable to fully leverage the benefits of Laghos. I'm wondering if I can get assistance from the Laghos team to implement stress assembly in partial assembly mode.
Sungho