<html>

  <head>

    <meta content="text/html; charset=utf-8" http-equiv="Content-Type">

  </head>

  <body bgcolor="#FFFFFF" text="#000000">

    <div class="moz-cite-prefix">Hi Sherry,<br>

      <br>

      I ran my code through valgrind and gdb as suggested by Barry. I am

      now coming back to some problem I have had while running with

      parallel symbolic factorization. I am attaching a test matrix

      (petsc binary format) that I LU decompose and then use to solve a

      linear system (see code below). I can run on 2 processors with

      parsymbfact or with 4 processors without parsymbfact. However, if

      I run on 4 procs with parsymbfact, the code is just hanging. Below

      is the simplified test case that I have used to test. The matrix A

      and B are built somewhere else in my program. The matrix I am

      attaching is A-sigma*B (see below).<br>

      <br>

      One thing is that I don't know for sparse matrices what is the

      optimum number of processors to use for an LU decomposition? Does

      it depend on the total number of nonzeros? Do you have an easy way

      to compute it?<br>

      <br>

      Thanks,<br>

      <br>

      Anthony<br>

      <br>

      <br>

      <br>

           Subroutine HowBigLUCanBe(rank)<br>

      <br>

            IMPLICIT NONE<br>

            <br>

            integer(i4b),intent(in) :: rank<br>

            integer(i4b)            :: i,ct<br>

            real(dp)                :: begin,endd <br>

            complex(dpc)            :: sigma<br>

            <br>

            PetscErrorCode ierr <br>

            <br>

            <br>

            if (rank==0) call cpu_time(begin)<br>

            <br>

            if (rank==0) then<br>

               write(*,*)<br>

               write(*,*)'Testing How Big LU Can Be...'<br>

               write(*,*)'============================'<br>

               write(*,*)<br>

            endif<br>

            <br>

            sigma = (1.0d0,0.0d0)<br>

            call MatAXPY(A,-sigma,B,DIFFERENT_NONZERO_PATTERN,ierr) ! on

      exit A = A-sigma*B<br>

      <br>

      !.....Write Matrix to ASCII and Binary Format<br>

            !call

      PetscViewerASCIIOpen(PETSC_COMM_WORLD,"Amat.m",viewer,ierr)<br>

            !call MatView(DXX,viewer,ierr)<br>

            !call PetscViewerDestroy(viewer,ierr)<br>

            <br>

            call

PetscViewerBinaryOpen(PETSC_COMM_WORLD,"Amat_binary.m",FILE_MODE_WRITE,viewer,ierr)<br>

            call MatView(A,viewer,ierr)<br>

            call PetscViewerDestroy(viewer,ierr)<br>

            <br>

      !.....Create Linear Solver Context<br>

            call KSPCreate(PETSC_COMM_WORLD,ksp,ierr)<br>

            <br>

      !.....Set operators. Here the matrix that defines the linear

      system also serves as the preconditioning matrix.<br>

            !call

      KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN,ierr) !aha

      commented and replaced by next line<br>

            call KSPSetOperators(ksp,A,A,ierr) ! remember: here A =

      A-sigma*B<br>

            <br>

      !.....Set Relative and Absolute Tolerances and Uses Default for

      Divergence Tol<br>

            tol = 1.e-10 <br>

            call

KSPSetTolerances(ksp,tol,tol,PETSC_DEFAULT_REAL,PETSC_DEFAULT_INTEGER,ierr)<br>

            <br>

      !.....Set the Direct (LU) Solver<br>

            call KSPSetType(ksp,KSPPREONLY,ierr)<br>

            call KSPGetPC(ksp,pc,ierr)<br>

            call PCSetType(pc,PCLU,ierr)<br>

            call

      PCFactorSetMatSolverPackage(pc,MATSOLVERSUPERLU_DIST,ierr) !

      MATSOLVERSUPERLU_DIST MATSOLVERMUMPS<br>

            <br>

      !.....Create Right-Hand-Side Vector<br>

            call MatCreateVecs(A,frhs,PETSC_NULL_OBJECT,ierr)<br>

            call MatCreateVecs(A,sol,PETSC_NULL_OBJECT,ierr)<br>

            <br>

            allocate(xwork1(IendA-IstartA))<br>

            allocate(loc(IendA-IstartA))<br>

            <br>

            ct=0<br>

            do i=IstartA,IendA-1<br>

               ct=ct+1<br>

               loc(ct)=i<br>

               xwork1(ct)=(1.0d0,0.0d0)<br>

            enddo<br>

            <br>

            call

      VecSetValues(frhs,IendA-IstartA,loc,xwork1,INSERT_VALUES,ierr)<br>

            call VecZeroEntries(sol,ierr)<br>

            <br>

            deallocate(xwork1,loc)<br>

            <br>

      !.....Assemble Vectors<br>

            call VecAssemblyBegin(frhs,ierr)<br>

            call VecAssemblyEnd(frhs,ierr)<br>

            <br>

      !.....Solve the Linear System<br>

            call KSPSolve(ksp,frhs,sol,ierr)<br>

            <br>

            !call VecView(sol,PETSC_VIEWER_STDOUT_WORLD,ierr)<br>

            <br>

            if (rank==0) then    <br>

               call cpu_time(endd)<br>

               write(*,*)<br>

               print '("Total time for HowBigLUCanBe = ",f21.3,"

      seconds.")',endd-begin<br>

            endif<br>

      <br>

            call SlepcFinalize(ierr)<br>

            <br>

            STOP<br>

            <br>

            <br>

          end Subroutine HowBigLUCanBe<br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      <br>

      On 07/08/2015 11:23 AM, Xiaoye S. Li wrote:<br>

    </div>

    <blockquote

cite="mid:CAFvbobXeHtLaPtS1FmN3Z5g2-otcs2Pq3n9VkT4Q3oq56tbKrg@mail.gmail.com"

      type="cite">

      <div dir="ltr">

        <div class="gmail_default"

          style="font-family:arial,helvetica,sans-serif">Indeed, the

          parallel symbolic factorization routine needs power of 2

          processes, however, you can use however many processes you

          need;  internally, we redistribute matrix to nearest power of

          2 processes, do symbolic, then redistribute back to all the

          processes to do factorization, triangular solve etc.  So,

          there is no  restriction from the users viewpoint.<br>

          <br>

        </div>

        <div class="gmail_default"

          style="font-family:arial,helvetica,sans-serif">It's difficult

          to tell what the problem is.  Do you think you can print your

          matrix, then, I can do some debugging by running superlu_dist

          standalone?<br>

          <br>

        </div>

        <div class="gmail_default"

          style="font-family:arial,helvetica,sans-serif">Sherry<br>

          <br>

        </div>

      </div>

      <div class="gmail_extra"><br>

        <div class="gmail_quote">On Wed, Jul 8, 2015 at 10:34 AM,

          Anthony Paul Haas <span dir="ltr">&lt;<a

              moz-do-not-send="true" href="mailto:aph@email.arizona.edu"

              target="_blank">aph@email.arizona.edu</a>&gt;</span>

          wrote:<br>

          <blockquote class="gmail_quote" style="margin:0 0 0

            .8ex;border-left:1px #ccc solid;padding-left:1ex">

            <div dir="ltr">

              <div>

                <div>

                  <div>

                    <div>Hi,<br>

                      <br>

                    </div>

                    I have used the switch -mat_superlu_dist_parsymbfact

                    in my pbs script. However, although my program

                    worked fine with sequential symbolic factorization,

                    I get one of the following 2 behaviors when I run

                    with parallel symbolic factorization (depending on

                    the number of processors that I use):<br>

                    <br>

                  </div>

                  1) the program just hangs (it seems stuck in some

                  subroutine ==> see test.out-hangs)<br>

                </div>

                2) I get a floating point exception ==> see

                test.out-floating-point-exception<br>

                <br>

              </div>

              <div>Note that as suggested in the Superlu manual, I use a

                power of 2 number of procs. Are there any tunable

                parameters for the parallel symbolic factorization? Note

                that when I build my sparse matrix, most elements I add

                are nonzero of course but to simplify the programming, I

                also add a few zero elements in the sparse matrix. I was

                thinking that maybe if the parallel symbolic

                factorization proceeds by block, there could be some

                blocks where the pivot would be zero, hence creating the

                FPE??<br>

                <br>

              </div>

              <div>Thanks,<br>

                <br>

              </div>

              <div>Anthony<br>

              </div>

              <div><br>

              </div>

              <br>

            </div>

            <div class="HOEnZb">

              <div class="h5">

                <div class="gmail_extra"><br>

                  <div class="gmail_quote">On Wed, Jul 8, 2015 at 6:46

                    AM, Xiaoye S. Li <span dir="ltr">&lt;<a

                        moz-do-not-send="true"

                        href="mailto:xsli@lbl.gov" target="_blank">xsli@lbl.gov</a>&gt;</span>

                    wrote:<br>

                    <blockquote class="gmail_quote" style="margin:0 0 0

                      .8ex;border-left:1px #ccc solid;padding-left:1ex">

                      <div dir="ltr">

                        <div class="gmail_default"

                          style="font-family:arial,helvetica,sans-serif">Did

                          you find out how to change option to use

                          parallel symbolic factorization?  Perhaps

                          PETSc team can help. </div>

                        <div class="gmail_default"

                          style="font-family:arial,helvetica,sans-serif"><br>

                        </div>

                        <div class="gmail_default"

                          style="font-family:arial,helvetica,sans-serif">Sherry</div>

                        <div class="gmail_default"

                          style="font-family:arial,helvetica,sans-serif"><br>

                        </div>

                      </div>

                      <div>

                        <div>

                          <div class="gmail_extra"><br>

                            <div class="gmail_quote">On Tue, Jul 7, 2015

                              at 3:58 PM, Xiaoye S. Li <span dir="ltr">&lt;<a

                                  moz-do-not-send="true"

                                  href="mailto:xsli@lbl.gov"

                                  target="_blank">xsli@lbl.gov</a>&gt;</span>

                              wrote:<br>

                              <blockquote class="gmail_quote"

                                style="margin:0 0 0 .8ex;border-left:1px

                                #ccc solid;padding-left:1ex">

                                <div dir="ltr">

                                  <div class="gmail_default"

                                    style="font-family:arial,helvetica,sans-serif">Is

                                    there an inquiry function that tells

                                    you all the available options?<br>

                                    <br>

                                  </div>

                                  <div class="gmail_default"

                                    style="font-family:arial,helvetica,sans-serif">Sherry<br>

                                  </div>

                                </div>

                                <div>

                                  <div>

                                    <div class="gmail_extra"><br>

                                      <div class="gmail_quote">On Tue,

                                        Jul 7, 2015 at 3:25 PM, Anthony

                                        Paul Haas <span dir="ltr">&lt;<a

                                            moz-do-not-send="true"

                                            href="mailto:aph@email.arizona.edu"

                                            target="_blank">aph@email.arizona.edu</a>&gt;</span>

                                        wrote:<br>

                                        <blockquote class="gmail_quote"

                                          style="margin:0 0 0

                                          .8ex;border-left:1px #ccc

                                          solid;padding-left:1ex">

                                          <div dir="ltr">

                                            <div>

                                              <div>

                                                <div>

                                                  <div>

                                                    <div>Hi Sherry,<br>

                                                      <br>

                                                    </div>

                                                    <div>Thanks for your

                                                      message. I have

                                                      used superlu_dist

                                                      default options. I

                                                      did not realize

                                                      that I was doing

                                                      serial symbolic

                                                      factorization.

                                                      That is probably

                                                      the cause of my

                                                      problem. <br>

                                                    </div>

                                                    Each node on Garnet

                                                    has 60GB usable

                                                    memory and I can run

                                                    with 1,2,4,8,16 or

                                                    32 cores per node. <br>

                                                    <br>

                                                  </div>

                                                  So I should use: <br>

                                                  <br>

                                                  -mat_superlu_dist_r 20<br>

                                                  -mat_superlu_dist_c 32<b><br>

                                                    <br>

                                                  </b></div>

                                                How do you specify the

                                                parallel symbolic

                                                factorization option? is

                                                it

                                                -mat_superlu_dist_matinput

                                                1<b><br>

                                                  <br>

                                                </b></div>

                                              Thanks,<br>

                                              <br>

                                            </div>

                                            Anthony<br>

                                            <div>

                                              <div>

                                                <div>

                                                  <div><br>

                                                  </div>

                                                </div>

                                              </div>

                                            </div>

                                          </div>

                                          <div>

                                            <div>

                                              <div class="gmail_extra"><br>

                                                <div class="gmail_quote">On

                                                  Tue, Jul 7, 2015 at

                                                  3:08 PM, Xiaoye S. Li

                                                  <span dir="ltr">&lt;<a

moz-do-not-send="true" href="mailto:xsli@lbl.gov" target="_blank">xsli@lbl.gov</a>&gt;</span>

                                                  wrote:<br>

                                                  <blockquote

                                                    class="gmail_quote"

                                                    style="margin:0 0 0

                                                    .8ex;border-left:1px

                                                    #ccc

                                                    solid;padding-left:1ex">

                                                    <div dir="ltr">

                                                      <div

                                                        class="gmail_default"

style="font-family:arial,helvetica,sans-serif">For superlu_dist failure,

                                                        this occurs

                                                        during symbolic

                                                        factorization. 

                                                        Since you are

                                                        using serial

                                                        symbolic

                                                        factorization,

                                                        it requires the

                                                        entire graph of

                                                        A to be

                                                        available in the

                                                        memory of one

                                                        MPI task. How

                                                        much memory do

                                                        you have for

                                                        each MPI task?<br>

                                                        <br>

                                                      </div>

                                                      <div

                                                        class="gmail_default"

style="font-family:arial,helvetica,sans-serif">It won't help even if you

                                                        use more

                                                        processes.  You

                                                        should try to

                                                        use parallel

                                                        symbolic

                                                        factorization

                                                        option.<br>

                                                        <br>

                                                      </div>

                                                      <div

                                                        class="gmail_default"

style="font-family:arial,helvetica,sans-serif">Another point.  You set

                                                        up process grid

                                                        as:<br>

                                                               Process

                                                        grid nprow 32 x

                                                        npcol 20 <br>

                                                      </div>

                                                      <div

                                                        class="gmail_default"

style="font-family:arial,helvetica,sans-serif">For better performance,

                                                        you should swap

                                                        the grid

                                                        dimension. That

                                                        is, it's better

                                                        to use 20 x 32,

                                                        never give

                                                        nprow larger

                                                        than npcol.<br>

                                                        <br>

                                                        <br>

                                                      </div>

                                                      <div

                                                        class="gmail_default"

style="font-family:arial,helvetica,sans-serif">Sherry<br>

                                                        <br>

                                                      </div>

                                                    </div>

                                                    <div

                                                      class="gmail_extra"><br>

                                                      <div

                                                        class="gmail_quote"><span>On

                                                          Tue, Jul 7,

                                                          2015 at 1:27

                                                          PM, Barry

                                                          Smith <span

                                                          dir="ltr">&lt;<a

moz-do-not-send="true" href="mailto:bsmith@mcs.anl.gov" target="_blank">bsmith@mcs.anl.gov</a>&gt;</span>

                                                          wrote:<br>

                                                        </span>

                                                        <div>

                                                          <div>

                                                          <blockquote

                                                          class="gmail_quote"

                                                          style="margin:0

                                                          0 0

                                                          .8ex;border-left:1px

                                                          #ccc

                                                          solid;padding-left:1ex"><br>

                                                             I would

                                                          suggest

                                                          running a

                                                          sequence of

                                                          problems, 101

                                                          by 101 111 by

                                                          111 etc and

                                                          get the memory

                                                          usage in each

                                                          case (when you

                                                          run out of

                                                          memory you can

                                                          get NO useful

                                                          information

                                                          out about

                                                          memory needs).

                                                          You can then

                                                          plot memory

                                                          usage as a

                                                          function of

                                                          problem size

                                                          to get a

                                                          handle on how

                                                          much memory it

                                                          is using.  You

                                                          can also run

                                                          on more and

                                                          more processes

                                                          (which have a

                                                          total of more

                                                          memory) to see

                                                          how large a

                                                          problem you

                                                          may be able to

                                                          reach.<br>

                                                          <br>

                                                             MUMPS also

                                                          has an "out of

                                                          core" version

                                                          (which we have

                                                          never used)

                                                          that could in

                                                          theory anyways

                                                          let you get to

                                                          large problems

                                                          if you have

                                                          lots of disk

                                                          space, but you

                                                          are on your

                                                          own figuring

                                                          out how to use

                                                          it.<br>

                                                          <br>

                                                            Barry<br>

                                                          <div>

                                                          <div><br>

                                                          > On Jul 7,

                                                          2015, at 2:37

                                                          PM, Anthony

                                                          Paul Haas &lt;<a

moz-do-not-send="true" href="mailto:aph@email.arizona.edu"

                                                          target="_blank">aph@email.arizona.edu</a>&gt;

                                                          wrote:<br>

                                                          ><br>

                                                          > Hi Jose,<br>

                                                          ><br>

                                                          > In my

                                                          code, I use

                                                          once PETSc to

                                                          solve a linear

                                                          system to get

                                                          the baseflow

                                                          (without using

                                                          SLEPc) and

                                                          then I use

                                                          SLEPc to do

                                                          the stability

                                                          analysis of

                                                          that baseflow.

                                                          This is why,

                                                          there are some

                                                          SLEPc options

                                                          that are not

                                                          used in

                                                          test.out-superlu_dist-151x151

                                                          (when I am

                                                          solving for

                                                          the baseflow

                                                          with PETSc

                                                          only). I have

                                                          attached a

                                                          101x101 case

                                                          for which I

                                                          get the

                                                          eigenvalues.

                                                          That case

                                                          works fine.

                                                          However, if I

                                                          increase to

                                                          151x151, I get

                                                          the error that

                                                          you can see in

                                                          test.out-superlu_dist-151x151

                                                          (similar error

                                                          with mumps:

                                                          see

                                                          test.out-mumps-151x151

                                                          line 2918 ).

                                                          If you look at

                                                          the very end

                                                          of the files

                                                          test.out-superlu_dist-151x151

                                                          and

                                                          test.out-mumps-151x151,

                                                          you will see

                                                          that the last

                                                          info message

                                                          printed is:<br>

                                                          ><br>

                                                          > On

                                                          Processor

                                                          (after

                                                          EPSSetFromOptions) 

                                                          0    memory: 

                                                          0.65073152000E+08 

                                                          =====> 

                                                          (see line 807

                                                          of

                                                          module_petsc.F90)<br>

                                                          ><br>

                                                          > This

                                                          means that the

                                                          memory error

                                                          probably

                                                          occurs in the

                                                          call to

                                                          EPSSolve (see

                                                          module_petsc.F90

                                                          line 810). I

                                                          would like to

                                                          evaluate how

                                                          much memory is

                                                          required by

                                                          the most

                                                          memory

                                                          intensive

                                                          operation

                                                          within

                                                          EPSSolve.

                                                          Since I am

                                                          solving a

                                                          generalized

                                                          EVP, I would

                                                          imagine that

                                                          it would be

                                                          the LU

                                                          decomposition.

                                                          But is there

                                                          an accurate

                                                          way of doing

                                                          it?<br>

                                                          ><br>

                                                          > Before

                                                          starting with

                                                          iterative

                                                          solvers, I

                                                          would like to

                                                          exploit as

                                                          much as I can

                                                          direct

                                                          solvers. I

                                                          tried GMRES

                                                          with default

                                                          preconditioner

                                                          at some point

                                                          but I had

                                                          convergence

                                                          problems. What

                                                          solver/preconditioner

                                                          would you

                                                          recommend for

                                                          a generalized

                                                          non-Hermitian

                                                          (EPS_GNHEP)

                                                          EVP?<br>

                                                          ><br>

                                                          > Thanks,<br>

                                                          ><br>

                                                          > Anthony<br>

                                                          ><br>

                                                          > On Tue,

                                                          Jul 7, 2015 at

                                                          12:17 AM, Jose

                                                          E. Roman &lt;<a

moz-do-not-send="true" href="mailto:jroman@dsic.upv.es" target="_blank">jroman@dsic.upv.es</a>&gt;

                                                          wrote:<br>

                                                          ><br>

                                                          > El

                                                          07/07/2015, a

                                                          las 02:33,

                                                          Anthony Haas

                                                          escribió:<br>

                                                          ><br>

                                                          > > Hi,<br>

                                                          > ><br>

                                                          > > I am

                                                          computing

                                                          eigenvalues

                                                          using

                                                          PETSc/SLEPc

                                                          and

                                                          superlu_dist

                                                          for the LU

                                                          decomposition

                                                          (my problem is

                                                          a generalized

                                                          eigenvalue

                                                          problem). The

                                                          code runs fine

                                                          for a grid

                                                          with 101x101

                                                          but when I

                                                          increase to

                                                          151x151, I get

                                                          the following

                                                          error:<br>

                                                          > ><br>

                                                          > >

                                                          Can't expand

                                                          MemType 1:

                                                          jcol 16104 

                                                           (and then

                                                          [NID 00037]

                                                          2015-07-06

                                                          19:19:17 Apid

                                                          31025976: OOM

                                                          killer

                                                          terminated

                                                          this process.)<br>

                                                          > ><br>

                                                          > > It

                                                          seems to be a

                                                          memory

                                                          problem. I

                                                          monitor the

                                                          memory usage

                                                          as far as I

                                                          can and it

                                                          seems that

                                                          memory usage

                                                          is pretty low.

                                                          The most

                                                          memory

                                                          intensive part

                                                          of the program

                                                          is probably

                                                          the LU

                                                          decomposition

                                                          in the context

                                                          of the

                                                          generalized

                                                          EVP. Is there

                                                          a way to

                                                          evaluate how

                                                          much memory

                                                          will be

                                                          required for

                                                          that step? I

                                                          am currently

                                                          running the

                                                          debug version

                                                          of the code

                                                          which I would

                                                          assume would

                                                          use more

                                                          memory?<br>

                                                          > ><br>

                                                          > > I

                                                          have attached

                                                          the output of

                                                          the job. Note

                                                          that the

                                                          program uses

                                                          PETSc twice:

                                                          1) to solve a

                                                          linear system

                                                          for which no

                                                          problem

                                                          occurs, and,

                                                          2) to solve

                                                          the

                                                          Generalized

                                                          EVP with

                                                          SLEPc, where I

                                                          get the error.<br>

                                                          > ><br>

                                                          > >

                                                          Thanks<br>

                                                          > ><br>

                                                          > >

                                                          Anthony<br>

                                                          > >

                                                          <test.out-superlu_dist-151x151><br>

                                                          ><br>

                                                          > In the

                                                          output you are

                                                          attaching

                                                          there are no

                                                          SLEPc objects

                                                          in the report

                                                          and SLEPc

                                                          options are

                                                          not used. It

                                                          seems that

                                                          SLEPc calls

                                                          are skipped?<br>

                                                          ><br>

                                                          > Do you

                                                          get the same

                                                          error with

                                                          MUMPS? Have

                                                          you tried to

                                                          solve linear

                                                          systems with a

                                                          preconditioned

                                                          iterative

                                                          solver?<br>

                                                          ><br>

                                                          > Jose<br>

                                                          ><br>

                                                          ><br>

                                                          </div>

                                                          </div>

                                                          >

&lt;module_petsc.F90&gt;&lt;test.out-mumps-151x151&gt;&lt;test.out_superlu_dist-101x101&gt;&lt;test.out-superlu_dist-151x151&gt;<br>

                                                          <br>

                                                          </blockquote>

                                                          </div>

                                                        </div>

                                                      </div>

                                                      <br>

                                                    </div>

                                                  </blockquote>

                                                </div>

                                                <br>

                                              </div>

                                            </div>

                                          </div>

                                        </blockquote>

                                      </div>

                                      <br>

                                    </div>

                                  </div>

                                </div>

                              </blockquote>

                            </div>

                            <br>

                          </div>

                        </div>

                      </div>

                    </blockquote>

                  </div>

                  <br>

                </div>

              </div>

            </div>

          </blockquote>

        </div>

        <br>

      </div>

    </blockquote>

    <br>

  </body>

</html>