<html>
  <head>
    <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
  </head>
  <body bgcolor="#FFFFFF" text="#000000">
    <div class="moz-cite-prefix">Hi Sherry,<br>
      <br>
      I ran my code through valgrind and gdb as suggested by Barry. I am
      now coming back to some problem I have had while running with
      parallel symbolic factorization. I am attaching a test matrix
      (petsc binary format) that I LU decompose and then use to solve a
      linear system (see code below). I can run on 2 processors with
      parsymbfact or with 4 processors without parsymbfact. However, if
      I run on 4 procs with parsymbfact, the code is just hanging. Below
      is the simplified test case that I have used to test. The matrix A
      and B are built somewhere else in my program. The matrix I am
      attaching is A-sigma*B (see below).<br>
      <br>
      One thing I don't know is, for sparse matrices, what the optimum
      number of processors to use for an LU decomposition is. Does
      it depend on the total number of nonzeros? Do you have an easy way
      to compute it?<br>
      <br>
      Thanks,<br>
      <br>
      Anthony<br>
      <br>
      <br>
      <br>
           Subroutine HowBigLUCanBe(rank)<br>
      <br>
            IMPLICIT NONE<br>
            <br>
            integer(i4b),intent(in) :: rank<br>
            integer(i4b)            :: i,ct<br>
            real(dp)                :: begin,endd <br>
            complex(dpc)            :: sigma<br>
            <br>
            PetscErrorCode ierr <br>
            <br>
            <br>
            if (rank==0) call cpu_time(begin)<br>
            <br>
            if (rank==0) then<br>
               write(*,*)<br>
               write(*,*)'Testing How Big LU Can Be...'<br>
               write(*,*)'============================'<br>
               write(*,*)<br>
            endif<br>
            <br>
            sigma = (1.0d0,0.0d0)<br>
            call MatAXPY(A,-sigma,B,DIFFERENT_NONZERO_PATTERN,ierr) ! on
      exit A = A-sigma*B<br>
      <br>
      !.....Write Matrix to ASCII and Binary Format<br>
            !call
      PetscViewerASCIIOpen(PETSC_COMM_WORLD,"Amat.m",viewer,ierr)<br>
            !call MatView(DXX,viewer,ierr)<br>
            !call PetscViewerDestroy(viewer,ierr)<br>
            <br>
            call
PetscViewerBinaryOpen(PETSC_COMM_WORLD,"Amat_binary.m",FILE_MODE_WRITE,viewer,ierr)<br>
            call MatView(A,viewer,ierr)<br>
            call PetscViewerDestroy(viewer,ierr)<br>
            <br>
      !.....Create Linear Solver Context<br>
            call KSPCreate(PETSC_COMM_WORLD,ksp,ierr)<br>
            <br>
      !.....Set operators. Here the matrix that defines the linear
      system also serves as the preconditioning matrix.<br>
            !call
      KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN,ierr) !aha
      commented and replaced by next line<br>
            call KSPSetOperators(ksp,A,A,ierr) ! remember: here A =
      A-sigma*B<br>
            <br>
      !.....Set Relative and Absolute Tolerances and Uses Default for
      Divergence Tol<br>
            tol = 1.e-10 <br>
            call
KSPSetTolerances(ksp,tol,tol,PETSC_DEFAULT_REAL,PETSC_DEFAULT_INTEGER,ierr)<br>
            <br>
      !.....Set the Direct (LU) Solver<br>
            call KSPSetType(ksp,KSPPREONLY,ierr)<br>
            call KSPGetPC(ksp,pc,ierr)<br>
            call PCSetType(pc,PCLU,ierr)<br>
            call
      PCFactorSetMatSolverPackage(pc,MATSOLVERSUPERLU_DIST,ierr) !
      MATSOLVERSUPERLU_DIST MATSOLVERMUMPS<br>
            <br>
      !.....Create Right-Hand-Side Vector<br>
            call MatCreateVecs(A,frhs,PETSC_NULL_OBJECT,ierr)<br>
            call MatCreateVecs(A,sol,PETSC_NULL_OBJECT,ierr)<br>
            <br>
            allocate(xwork1(IendA-IstartA))<br>
            allocate(loc(IendA-IstartA))<br>
            <br>
            ct=0<br>
            do i=IstartA,IendA-1<br>
               ct=ct+1<br>
               loc(ct)=i<br>
               xwork1(ct)=(1.0d0,0.0d0)<br>
            enddo<br>
            <br>
            call
      VecSetValues(frhs,IendA-IstartA,loc,xwork1,INSERT_VALUES,ierr)<br>
            call VecZeroEntries(sol,ierr)<br>
            <br>
            deallocate(xwork1,loc)<br>
            <br>
      !.....Assemble Vectors<br>
            call VecAssemblyBegin(frhs,ierr)<br>
            call VecAssemblyEnd(frhs,ierr)<br>
            <br>
      !.....Solve the Linear System<br>
            call KSPSolve(ksp,frhs,sol,ierr)<br>
            <br>
            !call VecView(sol,PETSC_VIEWER_STDOUT_WORLD,ierr)<br>
            <br>
            if (rank==0) then    <br>
               call cpu_time(endd)<br>
               write(*,*)<br>
               print '("Total time for HowBigLUCanBe = ",f21.3,"
      seconds.")',endd-begin<br>
            endif<br>
      <br>
            call SlepcFinalize(ierr)<br>
            <br>
            STOP<br>
            <br>
            <br>
          end Subroutine HowBigLUCanBe<br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      <br>
      On 07/08/2015 11:23 AM, Xiaoye S. Li wrote:<br>
    </div>
    <blockquote
cite="mid:CAFvbobXeHtLaPtS1FmN3Z5g2-otcs2Pq3n9VkT4Q3oq56tbKrg@mail.gmail.com"
      type="cite">
      <div dir="ltr">
        <div class="gmail_default"
          style="font-family:arial,helvetica,sans-serif">Indeed, the
          parallel symbolic factorization routine needs power of 2
          processes, however, you can use however many processes you
          need;  internally, we redistribute matrix to nearest power of
          2 processes, do symbolic, then redistribute back to all the
          processes to do factorization, triangular solve etc.  So,
          there is no  restriction from the users viewpoint.<br>
          <br>
        </div>
        <div class="gmail_default"
          style="font-family:arial,helvetica,sans-serif">It's difficult
          to tell what the problem is.  Do you think you can print your
          matrix, then, I can do some debugging by running superlu_dist
          standalone?<br>
          <br>
        </div>
        <div class="gmail_default"
          style="font-family:arial,helvetica,sans-serif">Sherry<br>
          <br>
        </div>
      </div>
      <div class="gmail_extra"><br>
        <div class="gmail_quote">On Wed, Jul 8, 2015 at 10:34 AM,
          Anthony Paul Haas <span dir="ltr">&lt;<a
              moz-do-not-send="true" href="mailto:aph@email.arizona.edu"
              target="_blank">aph@email.arizona.edu</a>&gt;</span>
          wrote:<br>
          <blockquote class="gmail_quote" style="margin:0 0 0
            .8ex;border-left:1px #ccc solid;padding-left:1ex">
            <div dir="ltr">
              <div>
                <div>
                  <div>
                    <div>Hi,<br>
                      <br>
                    </div>
                    I have used the switch -mat_superlu_dist_parsymbfact
                    in my pbs script. However, although my program
                    worked fine with sequential symbolic factorization,
                    I get one of the following 2 behaviors when I run
                    with parallel symbolic factorization (depending on
                    the number of processors that I use):<br>
                    <br>
                  </div>
                  1) the program just hangs (it seems stuck in some
                  subroutine ==> see test.out-hangs)<br>
                </div>
                2) I get a floating point exception ==> see
                test.out-floating-point-exception<br>
                <br>
              </div>
              <div>Note that as suggested in the Superlu manual, I use a
                power of 2 number of procs. Are there any tunable
                parameters for the parallel symbolic factorization? Note
                that when I build my sparse matrix, most elements I add
                are nonzero of course but to simplify the programming, I
                also add a few zero elements in the sparse matrix. I was
                thinking that maybe if the parallel symbolic
                factorization proceeds by block, there could be some
                blocks where the pivot would be zero, hence creating the
                FPE??<br>
                <br>
              </div>
              <div>Thanks,<br>
                <br>
              </div>
              <div>Anthony<br>
              </div>
              <div><br>
              </div>
              <br>
            </div>
            <div class="HOEnZb">
              <div class="h5">
                <div class="gmail_extra"><br>
                  <div class="gmail_quote">On Wed, Jul 8, 2015 at 6:46
                    AM, Xiaoye S. Li <span dir="ltr">&lt;<a
                        moz-do-not-send="true"
                        href="mailto:xsli@lbl.gov" target="_blank">xsli@lbl.gov</a>&gt;</span>
                    wrote:<br>
                    <blockquote class="gmail_quote" style="margin:0 0 0
                      .8ex;border-left:1px #ccc solid;padding-left:1ex">
                      <div dir="ltr">
                        <div class="gmail_default"
                          style="font-family:arial,helvetica,sans-serif">Did
                          you find out how to change option to use
                          parallel symbolic factorization?  Perhaps
                          PETSc team can help. </div>
                        <div class="gmail_default"
                          style="font-family:arial,helvetica,sans-serif"><br>
                        </div>
                        <div class="gmail_default"
                          style="font-family:arial,helvetica,sans-serif">Sherry</div>
                        <div class="gmail_default"
                          style="font-family:arial,helvetica,sans-serif"><br>
                        </div>
                      </div>
                      <div>
                        <div>
                          <div class="gmail_extra"><br>
                            <div class="gmail_quote">On Tue, Jul 7, 2015
                              at 3:58 PM, Xiaoye S. Li <span dir="ltr">&lt;<a
                                  moz-do-not-send="true"
                                  href="mailto:xsli@lbl.gov"
                                  target="_blank">xsli@lbl.gov</a>&gt;</span>
                              wrote:<br>
                              <blockquote class="gmail_quote"
                                style="margin:0 0 0 .8ex;border-left:1px
                                #ccc solid;padding-left:1ex">
                                <div dir="ltr">
                                  <div class="gmail_default"
                                    style="font-family:arial,helvetica,sans-serif">Is
                                    there an inquiry function that tells
                                    you all the available options?<br>
                                    <br>
                                  </div>
                                  <div class="gmail_default"
                                    style="font-family:arial,helvetica,sans-serif">Sherry<br>
                                  </div>
                                </div>
                                <div>
                                  <div>
                                    <div class="gmail_extra"><br>
                                      <div class="gmail_quote">On Tue,
                                        Jul 7, 2015 at 3:25 PM, Anthony
                                        Paul Haas <span dir="ltr">&lt;<a
                                            moz-do-not-send="true"
                                            href="mailto:aph@email.arizona.edu"
                                            target="_blank">aph@email.arizona.edu</a>&gt;</span>
                                        wrote:<br>
                                        <blockquote class="gmail_quote"
                                          style="margin:0 0 0
                                          .8ex;border-left:1px #ccc
                                          solid;padding-left:1ex">
                                          <div dir="ltr">
                                            <div>
                                              <div>
                                                <div>
                                                  <div>
                                                    <div>Hi Sherry,<br>
                                                      <br>
                                                    </div>
                                                    <div>Thanks for your
                                                      message. I have
                                                      used superlu_dist
                                                      default options. I
                                                      did not realize
                                                      that I was doing
                                                      serial symbolic
                                                      factorization.
                                                      That is probably
                                                      the cause of my
                                                      problem. <br>
                                                    </div>
                                                    Each node on Garnet
                                                    has 60GB usable
                                                    memory and I can run
                                                    with 1,2,4,8,16 or
                                                    32 core per node. <br>
                                                    <br>
                                                  </div>
                                                  So I should use: <br>
                                                  <br>
                                                  -mat_superlu_dist_r 20<br>
                                                  -mat_superlu_dist_c 32<b><br>
                                                    <br>
                                                  </b></div>
                                                How do you specify the
                                                parallel symbolic
                                                factorization option? is
                                                it
                                                -mat_superlu_dist_matinput
                                                1<b><br>
                                                  <br>
                                                </b></div>
                                              Thanks,<br>
                                              <br>
                                            </div>
                                            Anthony<br>
                                            <div>
                                              <div>
                                                <div>
                                                  <div><br>
                                                  </div>
                                                </div>
                                              </div>
                                            </div>
                                          </div>
                                          <div>
                                            <div>
                                              <div class="gmail_extra"><br>
                                                <div class="gmail_quote">On
                                                  Tue, Jul 7, 2015 at
                                                  3:08 PM, Xiaoye S. Li
                                                  <span dir="ltr">&lt;<a
moz-do-not-send="true" href="mailto:xsli@lbl.gov" target="_blank">xsli@lbl.gov</a>&gt;</span>
                                                  wrote:<br>
                                                  <blockquote
                                                    class="gmail_quote"
                                                    style="margin:0 0 0
                                                    .8ex;border-left:1px
                                                    #ccc
                                                    solid;padding-left:1ex">
                                                    <div dir="ltr">
                                                      <div
                                                        class="gmail_default"
style="font-family:arial,helvetica,sans-serif">For superlu_dist failure,
                                                        this occurs
                                                        during symbolic
                                                        factorization. 
                                                        Since you are
                                                        using serial
                                                        symbolic
                                                        factorization,
                                                        it requires the
                                                        entire graph of
                                                        A to be
                                                        available in the
                                                        memory of one
                                                        MPI task. How
                                                        much memory do
                                                        you have for
                                                        each MPI task?<br>
                                                        <br>
                                                      </div>
                                                      <div
                                                        class="gmail_default"
style="font-family:arial,helvetica,sans-serif">It won't help even if you
                                                        use more
                                                        processes.  You
                                                        should try to
                                                        use parallel
                                                        symbolic
                                                        factorization
                                                        option.<br>
                                                        <br>
                                                      </div>
                                                      <div
                                                        class="gmail_default"
style="font-family:arial,helvetica,sans-serif">Another point.  You set
                                                        up process grid
                                                        as:<br>
                                                               Process
                                                        grid nprow 32 x
                                                        npcol 20 <br>
                                                      </div>
                                                      <div
                                                        class="gmail_default"
style="font-family:arial,helvetica,sans-serif">For better performance,
                                                        you should swap
                                                        the grid
                                                        dimensions. That
                                                        is, it's better
                                                        to use 20 x 32;
                                                        never make
                                                        nprow larger
                                                        than npcol.<br>
                                                        <br>
                                                        <br>
                                                      </div>
                                                      <div
                                                        class="gmail_default"
style="font-family:arial,helvetica,sans-serif">Sherry<br>
                                                        <br>
                                                      </div>
                                                    </div>
                                                    <div
                                                      class="gmail_extra"><br>
                                                      <div
                                                        class="gmail_quote"><span>On
                                                          Tue, Jul 7,
                                                          2015 at 1:27
                                                          PM, Barry
                                                          Smith <span
                                                          dir="ltr">&lt;<a
moz-do-not-send="true" href="mailto:bsmith@mcs.anl.gov" target="_blank">bsmith@mcs.anl.gov</a>&gt;</span>
                                                          wrote:<br>
                                                        </span>
                                                        <div>
                                                          <div>
                                                          <blockquote
                                                          class="gmail_quote"
                                                          style="margin:0
                                                          0 0
                                                          .8ex;border-left:1px
                                                          #ccc
                                                          solid;padding-left:1ex"><br>
                                                             I would
                                                          suggest
                                                          running a
                                                          sequence of
                                                          problems, 101
                                                          by 101 111 by
                                                          111 etc and
                                                          get the memory
                                                          usage in each
                                                          case (when you
                                                          run out of
                                                          memory you can
                                                          get NO useful
                                                          information
                                                          out about
                                                          memory needs).
                                                          You can then
                                                          plot memory
                                                          usage as a
                                                          function of
                                                          problem size
                                                          to get a
                                                          handle on how
                                                          much memory it
                                                          is using.  You
                                                          can also run
                                                          on more and
                                                          more processes
                                                          (which have a
                                                          total of more
                                                          memory) to see
                                                          how large a
                                                          problem you
                                                          may be able to
                                                          reach.<br>
                                                          <br>
                                                             MUMPS also
                                                          has an "out of
                                                          core" version
                                                          (which we have
                                                          never used)
                                                          that could in
                                                          theory anyways
                                                          let you get to
                                                          large problems
                                                          if you have
                                                          lots of disk
                                                          space, but you
                                                          are on your
                                                          own figuring
                                                          out how to use
                                                          it.<br>
                                                          <br>
                                                            Barry<br>
                                                          <div>
                                                          <div><br>
                                                          > On Jul 7,
                                                          2015, at 2:37
                                                          PM, Anthony
                                                          Paul Haas &lt;<a
moz-do-not-send="true" href="mailto:aph@email.arizona.edu"
                                                          target="_blank">aph@email.arizona.edu</a>&gt;
                                                          wrote:<br>
                                                          ><br>
                                                          > Hi Jose,<br>
                                                          ><br>
                                                          > In my
                                                          code, I use
                                                          once PETSc to
                                                          solve a linear
                                                          system to get
                                                          the baseflow
                                                          (without using
                                                          SLEPc) and
                                                          then I use
                                                          SLEPc to do
                                                          the stability
                                                          analysis of
                                                          that baseflow.
                                                          This is why,
                                                          there are some
                                                          SLEPc options
                                                          that are not
                                                          used in
                                                          test.out-superlu_dist-151x151
                                                          (when I am
                                                          solving for
                                                          the baseflow
                                                          with PETSc
                                                          only). I have
                                                          attached a
                                                          101x101 case
                                                          for which I
                                                          get the
                                                          eigenvalues.
                                                          That case
                                                          works fine.
                                                          However, if I
                                                          increase to
                                                          151x151, I get
                                                          the error that
                                                          you can see in
                                                          test.out-superlu_dist-151x151
                                                          (similar error
                                                          with mumps:
                                                          see
                                                          test.out-mumps-151x151
                                                          line 2918 ).
                                                          If you look at
                                                          the very end
                                                          of the files
                                                          test.out-superlu_dist-151x151
                                                          and
                                                          test.out-mumps-151x151,
                                                          you will see
                                                          that the last
                                                          info message
                                                          printed is:<br>
                                                          ><br>
                                                          > On
                                                          Processor
                                                          (after
                                                          EPSSetFromOptions) 
                                                          0    memory: 
                                                           
                                                          0.65073152000E+08 
                                                                 
                                                          =====> 
                                                          (see line 807
                                                          of
                                                          module_petsc.F90)<br>
                                                          ><br>
                                                          > This
                                                          means that the
                                                          memory error
                                                          probably
                                                          occurs in the
                                                          call to
                                                          EPSSolve (see
                                                          module_petsc.F90
                                                          line 810). I
                                                          would like to
                                                          evaluate how
                                                          much memory is
                                                          required by
                                                          the most
                                                          memory
                                                          intensive
                                                          operation
                                                          within
                                                          EPSSolve.
                                                          Since I am
                                                          solving a
                                                          generalized
                                                          EVP, I would
                                                          imagine that
                                                          it would be
                                                          the LU
                                                          decomposition.
                                                          But is there
                                                          an accurate
                                                          way of doing
                                                          it?<br>
                                                          ><br>
                                                          > Before
                                                          starting with
                                                          iterative
                                                          solvers, I
                                                          would like to
                                                          exploit as
                                                          much as I can
                                                          direct
                                                          solvers. I
                                                          tried GMRES
                                                          with default
                                                          preconditioner
                                                          at some point
                                                          but I had
                                                          convergence
                                                          problems. What
                                                          solver/preconditioner
                                                          would you
                                                          recommend for
                                                          a generalized
                                                          non-Hermitian
                                                          (EPS_GNHEP)
                                                          EVP?<br>
                                                          ><br>
                                                          > Thanks,<br>
                                                          ><br>
                                                          > Anthony<br>
                                                          ><br>
                                                          > On Tue,
                                                          Jul 7, 2015 at
                                                          12:17 AM, Jose
                                                          E. Roman &lt;<a
moz-do-not-send="true" href="mailto:jroman@dsic.upv.es" target="_blank">jroman@dsic.upv.es</a>&gt;
                                                          wrote:<br>
                                                          ><br>
                                                          > El
                                                          07/07/2015, a
                                                          las 02:33,
                                                          Anthony Haas
                                                          escribió:<br>
                                                          ><br>
                                                          > > Hi,<br>
                                                          > ><br>
                                                          > > I am
                                                          computing
                                                          eigenvalues
                                                          using
                                                          PETSc/SLEPc
                                                          and
                                                          superlu_dist
                                                          for the LU
                                                          decomposition
                                                          (my problem is
                                                          a generalized
                                                          eigenvalue
                                                          problem). The
                                                          code runs fine
                                                          for a grid
                                                          with 101x101
                                                          but when I
                                                          increase to
                                                          151x151, I get
                                                          the following
                                                          error:<br>
                                                          > ><br>
                                                          > >
                                                          Can't expand
                                                          MemType 1:
                                                          jcol 16104 
                                                           (and then
                                                          [NID 00037]
                                                          2015-07-06
                                                          19:19:17 Apid
                                                          31025976: OOM
                                                          killer
                                                          terminated
                                                          this process.)<br>
                                                          > ><br>
                                                          > > It
                                                          seems to be a
                                                          memory
                                                          problem. I
                                                          monitor the
                                                          memory usage
                                                          as far as I
                                                          can and it
                                                          seems that
                                                          memory usage
                                                          is pretty low.
                                                          The most
                                                          memory
                                                          intensive part
                                                          of the program
                                                          is probably
                                                          the LU
                                                          decomposition
                                                          in the context
                                                          of the
                                                          generalized
                                                          EVP. Is there
                                                          a way to
                                                          evaluate how
                                                          much memory
                                                          will be
                                                          required for
                                                          that step? I
                                                          am currently
                                                          running the
                                                          debug version
                                                          of the code
                                                          which I would
                                                          assume would
                                                          use more
                                                          memory?<br>
                                                          > ><br>
                                                          > > I
                                                          have attached
                                                          the output of
                                                          the job. Note
                                                          that the
                                                          program uses
                                                          PETSc twice:
                                                          1) to solve a
                                                          linear system
                                                          for which no
                                                          problem
                                                          occurs, and,
                                                          2) to solve
                                                          the
                                                          Generalized
                                                          EVP with
                                                          SLEPc, where I
                                                          get the error.<br>
                                                          > ><br>
                                                          > >
                                                          Thanks<br>
                                                          > ><br>
                                                          > >
                                                          Anthony<br>
                                                          > >
                                                          <test.out-superlu_dist-151x151><br>
                                                          ><br>
                                                          > In the
                                                          output you are
                                                          attaching
                                                          there are no
                                                          SLEPc objects
                                                          in the report
                                                          and SLEPc
                                                          options are
                                                          not used. It
                                                          seems that
                                                          SLEPc calls
                                                          are skipped?<br>
                                                          ><br>
                                                          > Do you
                                                          get the same
                                                          error with
                                                          MUMPS? Have
                                                          you tried to
                                                          solve linear
                                                          systems with a
                                                          preconditioned
                                                          iterative
                                                          solver?<br>
                                                          ><br>
                                                          > Jose<br>
                                                          ><br>
                                                          ><br>
                                                          </div>
                                                          </div>
                                                          >
&lt;module_petsc.F90&gt;&lt;test.out-mumps-151x151&gt;&lt;test.out_superlu_dist-101x101&gt;&lt;test.out-superlu_dist-151x151&gt;<br>
                                                          <br>
                                                          </blockquote>
                                                          </div>
                                                        </div>
                                                      </div>
                                                      <br>
                                                    </div>
                                                  </blockquote>
                                                </div>
                                                <br>
                                              </div>
                                            </div>
                                          </div>
                                        </blockquote>
                                      </div>
                                      <br>
                                    </div>
                                  </div>
                                </div>
                              </blockquote>
                            </div>
                            <br>
                          </div>
                        </div>
                      </div>
                    </blockquote>
                  </div>
                  <br>
                </div>
              </div>
            </div>
          </blockquote>
        </div>
        <br>
      </div>
    </blockquote>
    <br>
  </body>
</html>