<html><head><meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; line-break: after-white-space;" class=""><div class=""><br class=""></div><span style="caret-color: rgb(0, 0, 255); color: rgb(0, 0, 255);" class=""> </span><span style="caret-color: rgb(0, 0, 255);" class=""> ierr = VecNormBegin(u,NORM_2,&norm1);</span><br style="caret-color: rgb(0, 0, 255);" class=""><span style="caret-color: rgb(0, 0, 255);" class="">    ierr = PetscCommSplitReductionBegin(PetscObjectComm((PetscObject)Ax)); </span><div class=""><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div class=""><span style="caret-color: rgb(0, 0, 255);" class="">How come you call this on Ax and not on u? For clarity, if nothing else, I think you should call it on u.</span></div><div class=""><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div class=""><span style="caret-color: rgb(0, 0, 255);" class="">comb.c has </span></div><div class=""><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div class=""><div class="">/*</div><div class="">      Split phase global vector reductions with support for combining the</div><div class="">   communication portion of several operations. Using MPI-1.1 support only</div><div class=""><br class=""></div><div class="">      The idea for this and much of the initial code is contributed by</div><div class="">   Victor Eijkhout.</div><div class=""><br class=""></div><div class="">       Usage:</div><div class="">             VecDotBegin(Vec,Vec,PetscScalar *);</div><div class="">             VecNormBegin(Vec,NormType,PetscReal *);</div><div class="">             ....</div><div class="">             VecDotEnd(Vec,Vec,PetscScalar *);</div><div class="">             VecNormEnd(Vec,NormType,PetscReal *);</div><div class=""><br class=""></div><div class="">       Limitations:</div><div class="">         - The order of the xxxEnd() functions MUST be in the same order</div><div class="">           as the xxxBegin(). There is extensive error checking to try to</div><div class="">           insure that the user calls the routines in the correct order</div><div class="">*/</div><div class=""><br class=""></div><div class="">#include <petsc/private/vecimpl.h>    /*I   "petscvec.h"    I*/</div><div class=""><br class=""></div><div class="">static PetscErrorCode MPIPetsc_Iallreduce(void *sendbuf,void *recvbuf,PetscMPIInt count,MPI_Datatype datatype,MPI_Op op,MPI_Comm comm,MPI_Request *request)</div><div class="">{</div><div class="">  PETSC_UNUSED PetscErrorCode ierr;</div><div class=""><br class=""></div><div class="">  PetscFunctionBegin;</div><div class="">#if defined(PETSC_HAVE_MPI_IALLREDUCE)</div><div class="">  ierr = MPI_Iallreduce(sendbuf,recvbuf,count,datatype,op,comm,request);CHKERRMPI(ierr);</div><div class="">#elif defined(PETSC_HAVE_MPIX_IALLREDUCE)</div><div class="">  ierr = MPIX_Iallreduce(sendbuf,recvbuf,count,datatype,op,comm,request);CHKERRQ(ierr);</div><div class="">#else</div><div class="">  ierr = MPIU_Allreduce(sendbuf,recvbuf,count,datatype,op,comm);CHKERRQ(ierr);</div><div class="">  *request = MPI_REQUEST_NULL;</div><div class="">#endif</div><div class="">  PetscFunctionReturn(0);</div><div class="">}</div><div style="color: rgb(0, 0, 255);" class=""><br class=""></div></div><div class=""><font color="#0000ff" class=""><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></font><div>So first check if $PETSC_DIR/include/petscconf.h has </div><div><br class=""></div><div><span style="caret-color: rgb(0, 0, 255);" class="">PETSC_HAVE_MPI_IALLREDUCE</span></div><div><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div><span style="caret-color: rgb(0, 0, 255);" class="">if it does not then the standard MPI reduce is called. </span></div><div><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div><span style="caret-color: rgb(0, 0, 255);" class="">If this is set then any improvement depends on the implementation of iallreduce inside the MPI you are using. </span></div><div><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div><span style="caret-color: rgb(0, 0, 255);" class="">Barry</span></div><div><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></div><div><font color="#0000ff" class=""><span style="caret-color: rgb(0, 0, 255);" class=""><br class=""></span></font><blockquote type="cite" class=""><div class="">On Jan 21, 2021, at 6:52 AM, Viet H.Q.H. <<a href="mailto:hqhviet@tohoku.ac.jp" class="">hqhviet@tohoku.ac.jp</a>> wrote:</div><br class="Apple-interchange-newline"><div class=""><div dir="ltr" class=""><div class=""><br class=""></div><div class="">Hello Petsc developers and supporters,<br class=""><br class="">I would like to confirm the performance of asynchronous computations of inner product computation overlapping with matrix-vector multiplication computation by the below code.<br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div><font color="#0000ff" class=""> PetscLogDouble tt1,tt2;<br class="">    KSP ksp;<br class="">    //ierr = VecSet(c,one);<br class="">    ierr = VecSet(c,one);<br class="">    ierr = VecSet(u,one);<br class="">    ierr = VecSet(b,one);<br class=""><br class="">    ierr = KSPCreate(PETSC_COMM_WORLD,&ksp); CHKERRQ(ierr);<br class="">    ierr = KSP_MatMult(ksp,A,x,Ax); CHKERRQ(ierr);<br class=""><br class=""><br class="">    ierr = PetscTime(&tt1);CHKERRQ(ierr);<br class="">    ierr = VecNormBegin(u,NORM_2,&norm1);<br class="">    ierr = PetscCommSplitReductionBegin(PetscObjectComm((PetscObject)Ax)); <br class="">    ierr = KSP_MatMult(ksp,A,c,Ac); <br class="">    ierr = VecNormEnd(u,NORM_2,&norm1);<br class="">    ierr = PetscTime(&tt2);CHKERRQ(ierr);<br class=""><br class="">    ierr = PetscPrintf(PETSC_COMM_WORLD, "The time used for the asynchronous calculation: %f\n",tt2-tt1); CHKERRQ(ierr);<br class="">    ierr = PetscPrintf(PETSC_COMM_WORLD,"+ |u| =  %g\n",(double) norm1); CHKERRQ(ierr);<br class=""><br class=""><br class="">    ierr = PetscTime(&tt1);CHKERRQ(ierr);<br class="">    ierr = VecNorm(b,NORM_2,&norm2); CHKERRQ(ierr);<br class="">    ierr = KSP_MatMult(ksp,A,c,Ac); <br class="">    ierr = PetscTime(&tt2);CHKERRQ(ierr);<br class=""><br class="">    ierr = PetscPrintf(PETSC_COMM_WORLD, "The time used for the synchronous calculation: %f\n",tt2-tt1); CHKERRQ(ierr);<br class="">    ierr = PetscPrintf(PETSC_COMM_WORLD,"+ |b| =  %g\n",(double) norm2); CHKERRQ(ierr);<br class=""></font><div class=""><font color="#0000ff" class=""><br class=""></font></div><div class=""><br class=""></div><div class="">On a cluster with two or four nodes, the asynchronous computation is always much slower than synchronous computation.<br class=""><br class=""><font color="#ff0000" class="">The time used for the asynchronous calculation: 0.000203<br class="">+ |u| =  100.<br class="">The time used for the synchronous calculation: 0.000006<br class="">+ |b| =  100.</font><br class=""><br class="">Are there any necessary settings on MPI or Petsc to gain performance of asynchronous computation?<br class=""><br class="">Thank you very much for anything you can provide.<br class="">Sincerely,<br class="">Viet.<br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div><div class=""><br class=""></div></div>

</div></blockquote></div><br class=""></div></body></html>