!
! init_poisson - high level init of physics and solver
!
subroutine init_poisson(grid,psn,iflag_dummy)
  use psn_class
  use grid_class
  use sml_module
  use eq_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: iflag_dummy ! == 3 --obsolete
  integer :: i
  real (kind=8) :: psi,r,z


  if (iflag_dummy/=3) stop 'init_poisson: iflag\=3'
  
  do i=1,grid%nnode
     psi=grid%psi(i)
     r=grid%x(1,i)
     z=grid%x(2,i)
     psn%tempi_ev(i)=eq_ftn(psi,r,z,eq_tempi)  ! use real r & z value for safety
     psn%tempe_ev(i)=eq_ftn(psi,r,z,eq_tempe)
     psn%tite(i)=psn%tempi_ev(i)/psn%tempe_ev(i)
  enddo
  
  ! init_poisson_solver
  if (sml_iter_solver) then
#ifndef USE_NEW_PETSC_SOLVER
     call init_poisson_iter_solvers(grid,psn,iflag_dummy)
#else
     stop 'init_poisson_iter_solvers not supported'
#endif
  else
     call init_poisson_solver(grid,psn,iflag_dummy)
  endif

  if(sml_add_pot0>0) then
     call check_point('add potential offset')
     allocate( psn%add_pot0(grid%nnode))

     ! prepare additional 0 - potential
     if(sml_add_pot0==1) then
        call read_add_pot0(grid,psn)
     elseif(sml_add_pot0==2) then
!        call neo_pot0_simple(grid,psn)
     else
        psn%add_pot0=0D0
     endif
  endif

  call check_point('prepare gyro-averaging matrix')
  if(sml_plane_mype<grid%nrho*2) then
     call init_gyro_avg_mat(grid,psn)
  endif
end subroutine init_poisson

subroutine init_poisson_solver(grid,psn,iflag_dummy)
  use psn_class
  use grid_class
  use sml_module
  use ptl_module, only : ptl_mass
  use eq_module
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: iflag_dummy

 ! type(mat_type) :: matl, matr, matr2
  integer :: nnz_row
  real (kind=8),allocatable :: alpha(:),beta(:),teev(:),den(:),b2(:)
  integer :: itr,nd(3),in_bd_node
  PetscErrorCode :: ierr
  real (kind=8) :: x_center(2),psi
  real (kind=8) :: factor,psi_in_poisson_bd,psi_out_poisson_bd
  real (kind=8), external :: gyro_radius2,psi_interpol,b_interpol
  real (kind=8), parameter :: offset=2D0

  allocate(alpha(grid%ntriangle),beta(grid%ntriangle),teev(grid%ntriangle),den(grid%ntriangle),b2(grid%ntriangle))
  nnz_row=sml_max_mat_width

  ! setup n_0,b^2,Te at centroids of triangles
  do itr=1, grid%ntriangle
     nd=grid%nd(:,itr)
     x_center=(grid%x(:,nd(1))+grid%x(:,nd(2))+grid%x(:,nd(3)))/3D0
     psi=psi_interpol(x_center(1),x_center(2),0,0)
     !tiev(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_tempi)
     teev(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_tempe)
     b2(itr)=b_interpol(x_center(1),x_center(2),0D0)**2
     den(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_den)
  enddo
  
  ! 00 solver
  call check_point('Initialize 00 solver')
  if(.NOT. sml_use_simple00 ) then
     ! **************************************
     ! initialize petsc_solver
     ! **************************************
     if(.true.) then ! redundently solve 00 modes solver
        psn%solver00%comm = sml_plane_comm
        psn%solver00%mype=sml_plane_mype
        psn%solver00%totalpe=sml_plane_totalpe
     else
        psn%solver00%comm = petsc_comm_world
        psn%solver00%mype=sml_mype
        psn%solver00%totalpe=sml_totalpe
     endif
     psn%solver00%prefix=1

     ! RHS
     psn%solver00%n_rhs_mat=1
     call check_point('- 00 solver memory create')
     call create_solver(psn%solver00,grid%nnode,nnz_row,ierr)

     ! LHS - alpha del^2 u 
     alpha=ptl_mass(1)/sml_e_charge*den/b2 ! mn_0/eB^2
     beta=0D0
     call helm_matrix(psn%solver00%Amat,alpha,beta,grid,psn%pbd0_2,.false.,ierr)

     ! init_1/2 solvers setup KSP
#ifdef USE_NEW_PETSC_SOLVER
     if (sml_poisson_solver_type/=0) then
        call init_2field_solver(grid,psn,psn%solver00)
     else
#endif
        call init_1field_solver(grid,psn,psn%solver00)
#ifdef USE_NEW_PETSC_SOLVER
     end if
#endif
     ! for compatability with old solver
     call init_xgc_solver(psn%solver00,grid%nnode,psn%solver00%comm,ierr)
     ! RHS mass matrix
     alpha=0D0  ! no del term on RHS
     beta=1D0  ! diagonal (identity)
     call helm_matrix(psn%solver00%rhs_mat,alpha,beta,grid,psn%cbd0_2,.false.,ierr)
  else ! simple 00
     call init_simple00(grid,psn)
  endif

  ! turb solver
  call check_point('Initialize turb solver')
  if(sml_turb_poisson) then
     ! ************************
     ! initialize petsc_solver
     ! ************************
     psn%solverH%comm = sml_plane_comm
     psn%solverH%mype=sml_plane_mype
     psn%solverH%totalpe=sml_plane_totalpe
     psn%solverH%prefix=2

     ! RHS
     psn%solverH%n_rhs_mat=1
     call check_point('- turb solver memory create')
     call create_solver(psn%solverH,grid%nnode,nnz_row,ierr)

     ! LHS mn_0/eB^2
     alpha=ptl_mass(1)/sml_e_charge*den/b2 ! changed sign with new solver
     beta=den/teev
     call helm_matrix(psn%solverH%Amat,alpha,beta,grid,psn%pbdH_2,.false.,ierr)

     call init_1field_solver(grid,psn,psn%solverH) ! like old solver, sets up KSP
     ! for compatability with old solver
     call init_xgc_solver(psn%solverH,grid%nnode,psn%solverH%comm,ierr)

     alpha=0D0		! no del term on RHS
     beta=1D0		! diagonal (identity)
     call helm_matrix(psn%solverH%rhs_mat,alpha,beta,grid,psn%cbdH_2,.false.,ierr)
  endif

  deallocate(alpha,beta,teev,den,b2)

end subroutine init_poisson_solver

!
! create 1 field solver with A00 built (Amat), just setoperator & setup like 2 field
!
subroutine init_1field_solver(grid,psn,solver)
  use grid_class
  use psn_class
  use xgc_solver_module
  use sml_module, only : sml_mype,sml_intpl_mype,sml_pe_per_plane
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type):: psn
  type(xgc_solver) :: solver
  PetscErrorCode :: ierr
  !
  if(sml_mype==0) print *, 'start init_1field_solver'

  ! create KSP
  call KSPCreate(solver%comm,solver%ksp,ierr)
  CHKERRQ(ierr)
  if (solver%prefix.eq.2) then ! turb
     call KSPSetOptionsPrefix(solver%ksp,'s2_',ierr)
     if (sml_mype/sml_pe_per_plane .gt. 0) then ! not first, no monitor
        call PetscOptionsClearValue('-s2_ksp_monitor',ierr)
        call PetscOptionsClearValue('-s2_ksp_converged_reason',ierr)
     endif
  else ! 1 field 00 solver, no prefix
     call PetscOptionsClearValue('-ksp_monitor',ierr)
     call PetscOptionsClearValue('-ksp_converged_reason',ierr)
  end if
  call KSPSetFromOptions(solver%ksp,ierr)
  CHKERRQ(ierr)

  call KSPSetOperators(solver%ksp,  solver%Amat,  solver%Amat, SAME_NONZERO_PATTERN, ierr )
  CHKERRQ(ierr)  

  ! setup solver now that matrix is complete
  call KSPSetUp( solver%ksp, ierr )
  if(ierr .ne. 0) then
     print *, 'Error in setup of petsc init_1field_solver solver :',ierr
     call MPI_ABORT(solver%comm,1,ierr)
  endif
end subroutine init_1field_solver
!
! create 2 field solver with A00 built (Amat)
!
#ifdef USE_NEW_PETSC_SOLVER
subroutine init_2field_solver(grid,psn,solver)
  use grid_class
  use psn_class
  use xgc_solver_module
  use sml_module, only : sml_mype,sml_poisson_solver_type,sml_iter_solver,sml_intpl_mype,sml_pe_per_plane
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type):: psn
  type(xgc_solver) :: solver
  PetscErrorCode :: ierr
  ! sml_mype/sml_pe_per_plane
  real (kind=8), parameter :: epsilon=0D0
  integer :: comm
  PetscInt :: nflux,j,low,high,N1loc,N2loc
  Mat :: matArray(4)
  KSP :: ksp,ksps(2)
  PC :: pc
  Vec :: x1Vec,x2Vec,x2Vecloc,x1Vecloc
  PetscViewer :: viewer
  PetscInt, parameter :: ione=1
  PetscInt, parameter :: itwo=2
  PetscScalar, parameter :: neg_one = -1.d0
  external FormJacobian,FormFunction
  
  ! set helper vars
  comm = solver%comm
  call MatGetOwnershipRange(solver%Amat,low,high,ierr)
  CHKERRQ(ierr)
  N1loc = high - low

  ! create other matrices and vectors, DM ...
  if (sml_poisson_solver_type/=0 .and. solver%prefix==1) then
     nflux = grid%cnv_2d_00%m
     if (nflux.le.0) stop 'nflux.lt.0'
  else
     ! nflux = 0
     stop '!(sml_poisson_solver_type/=0 .and. solver%prefix==1)'
  endif
  if(sml_mype==0) print *, &
       ' FSA SOLVER: ',nflux,' flux surfaces, (doit=',sml_poisson_solver_type/=0 .and. solver%prefix==1,') make A (A_00) mat.'

  ! store raw A_0 operator for formFunction
  call MatDuplicate(solver%Amat,MAT_COPY_VALUES,solver%A0Mat,ierr)
  CHKERRQ(ierr)

  ! add linearization term to Amat
  call add_11mat(grid,psn,solver%Amat,psn%pbd0_2)
  
  ! Bmat -- tall skinny
  call MatCreate(comm,solver%Bmat,ierr)
  CHKERRQ(ierr)
  call MatSetSizes(solver%Bmat,N1loc,PETSC_DECIDE,PETSC_DECIDE,nflux,ierr)
  CHKERRQ(ierr)
  call MatSetType(solver%Bmat,MATAIJ,ierr)
  CHKERRQ(ierr)
  call MatSeqAIJSetPreallocation(solver%Bmat,itwo,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatMPIAIJSetPreallocation(solver%Bmat,itwo,PETSC_NULL_INTEGER,itwo,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatSetOption(solver%Bmat,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE,ierr)
  CHKERRQ(ierr)
  call MatSetFromOptions(solver%Bmat,ierr)
  CHKERRQ(ierr)
  ! BIdentMat
  call MatCreate(comm,solver%BIdentMat,ierr)
  CHKERRQ(ierr)
  call MatSetSizes(solver%BIdentMat,N1loc,PETSC_DECIDE,PETSC_DECIDE,nflux,ierr)
  CHKERRQ(ierr)
  call MatSetType(solver%BIdentMat,MATAIJ,ierr)
  CHKERRQ(ierr)
  call MatSeqAIJSetPreallocation(solver%BIdentMat,itwo,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatMPIAIJSetPreallocation(solver%BIdentMat,itwo,PETSC_NULL_INTEGER,itwo,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatSetOption(solver%BIdentMat,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE,ierr)
  CHKERRQ(ierr)
  call MatSetFromOptions(solver%BIdentMat,ierr)

  ! Cmat -- short fat, need to preallocate later
  call MatCreate(comm,solver%Cmat,ierr)
  CHKERRQ(ierr)
  call MatSetSizes(solver%Cmat,PETSC_DECIDE,N1loc,nflux,PETSC_DECIDE,ierr)
  CHKERRQ(ierr)
  call MatSetType(solver%Cmat,MATAIJ,ierr)
  CHKERRQ(ierr)

  ! Dmat -- -I
  call MatCreate(comm,solver%Dmat,ierr)
  CHKERRQ(ierr)
  call MatSetSizes(solver%Dmat,PETSC_DECIDE,PETSC_DECIDE,nflux,nflux,ierr)
  CHKERRQ(ierr)
  call MatSetType(solver%Dmat,MATAIJ,ierr)
  CHKERRQ(ierr)
  call MatSeqAIJSetPreallocation(solver%Dmat,ione,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatMPIAIJSetPreallocation(solver%Dmat,ione,PETSC_NULL_INTEGER,ione,PETSC_NULL_INTEGER,ierr)
  CHKERRQ(ierr)
  call MatSetOption(solver%Dmat,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE,ierr)
  CHKERRQ(ierr)
  call MatSetFromOptions(solver%Dmat,ierr)
  CHKERRQ(ierr)

  if( nflux > 0 ) then ! means prefix==1 and solver00
     ! pbd0_2 is hard coded since nflux>0 only with solver00
     call make_12mat(grid,psn,solver%Bmat,solver%BIdentMat,solver,psn%pbd0_2)
     call make_21mat(grid,psn,solver%Cmat,solver%Bmat,psn%pbd0_2)
  else
     ! create_flux_mats has the side effect of setting a prealloc - need something
     call MatSeqAIJSetPreallocation(solver%Cmat,ione,PETSC_NULL_INTEGER,ierr)
     CHKERRQ(ierr)
     call MatMPIAIJSetPreallocation(solver%Cmat,ione,PETSC_NULL_INTEGER,ione,PETSC_NULL_INTEGER,ierr)
     CHKERRQ(ierr)
  endif

  ! Identity
  call MatGetOwnershipRange(solver%Dmat,low,high,ierr)
  N2loc = high-low
  do j=low,high-1
     call MatSetValues(solver%Dmat,ione,j,ione,j,neg_one,INSERT_VALUES,ierr)
     CHKERRQ(ierr)
  enddo
  call MatAssemblyBegin(solver%Bmat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyBegin(solver%BIdentMat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyBegin(solver%Cmat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyBegin(solver%Dmat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyEnd(solver%Bmat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyEnd(solver%BIdentMat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyEnd(solver%Cmat,MAT_FINAL_ASSEMBLY,ierr)
  call MatAssemblyEnd(solver%Dmat,MAT_FINAL_ASSEMBLY,ierr)

  if (sml_intpl_mype == 0 .and. nflux.gt.0 .and. .false.) then
     call  PetscViewerASCIIOpen(comm, 'Amat.m', viewer,ierr)
     call  PetscViewerSetFormat(viewer, PETSC_VIEWER_ASCII_MATLAB,ierr)
     call  MatView(solver%Amat,viewer,ierr)
     call  PetscViewerDestroy(viewer,ierr)
     call  PetscViewerASCIIOpen(comm, 'Bmat.m', viewer,ierr)
     call  PetscViewerSetFormat(viewer, PETSC_VIEWER_ASCII_MATLAB,ierr)
     call  MatView(solver%Bmat,viewer,ierr)
     call  PetscViewerDestroy(viewer,ierr)
     call  PetscViewerASCIIOpen(comm, 'Cmat.m', viewer,ierr)
     call  PetscViewerSetFormat(viewer, PETSC_VIEWER_ASCII_MATLAB,ierr)
     call  MatView(solver%Cmat,viewer,ierr)
     call  PetscViewerDestroy(viewer,ierr)
     call  PetscViewerASCIIOpen(comm, 'Dmat.m', viewer,ierr)
     call  PetscViewerSetFormat(viewer, PETSC_VIEWER_ASCII_MATLAB,ierr)
     call  MatView(solver%Dmat,viewer,ierr)
     call  PetscViewerDestroy(viewer,ierr)
  endif

  ! phi DM
  call VecCreate(comm,x1Vec,ierr)
  call VecSetSizes(x1Vec,N1loc,PETSC_DECIDE,ierr)
  call VecSetFromOptions(x1Vec,ierr)

  call DMShellCreate(comm,solver%daphi,ierr)
  call DMShellSetGlobalVector(solver%daphi,x1Vec,ierr)
  call DMShellSetMatrix(solver%daphi,solver%Amat,ierr)

  call VecCreate(PETSC_COMM_SELF,x1Vecloc,ierr)
  call VecSetSizes(x1Vecloc,N1loc,N1loc,ierr)
  call VecSetFromOptions(x1Vecloc,ierr)
  call DMShellSetLocalVector(solver%daphi,x1Vecloc,ierr)
  call VecDestroy(x1Vec,ierr)
  call VecDestroy(x1Vecloc,ierr)

  ! lambda DM
  call VecCreate(comm,x2Vec,ierr)
  call VecSetSizes(x2Vec,N2loc,nflux,ierr)
  call VecSetFromOptions(x2Vec,ierr)

  call DMShellCreate(comm,solver%dalam,ierr)
  call DMShellSetGlobalVector(solver%dalam,x2Vec,ierr)
  call DMShellSetMatrix(solver%dalam,solver%Dmat,ierr)

  call VecCreate(PETSC_COMM_SELF,x2Vecloc,ierr)
  call VecSetSizes(x2Vecloc,N2loc,N2loc,ierr)
  call VecSetFromOptions(x2Vecloc,ierr)
  call DMShellSetLocalVector(solver%dalam,x2Vecloc,ierr)
  call VecDestroy(x2Vec,ierr)
  call VecDestroy(x2Vecloc,ierr)

  ! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  !  Create field split DM
  ! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  call DMCompositeCreate(comm,solver%da,ierr)
  CHKERRQ(ierr)
  call DMSetOptionsPrefix(solver%daphi,'phi_',ierr)
  call DMCompositeAddDM(solver%da,solver%daphi,ierr)
  CHKERRQ(ierr)
  call DMSetOptionsPrefix(solver%dalam,'lambda_',ierr)
  call DMCompositeAddDM(solver%da,solver%dalam,ierr)
  CHKERRQ(ierr)

  matArray(1) = solver%Amat
  matArray(2) = solver%Bmat
  matArray(3) = solver%Cmat
  matArray(4) = solver%Dmat
  call MatCreateNest(comm,itwo,PETSC_NULL_OBJECT,itwo,PETSC_NULL_OBJECT,matArray,solver%KKTmat,ierr)
  call MatSetFromOptions(solver%KKTmat,ierr)

  !  Extract global and local vectors from DM; then duplicate for remaining
  !     vectors that are the same types
  call MatGetVecs(solver%KKTmat,solver%xVec2,solver%bVec2,ierr)
  call VecDuplicate(solver%bVec2,solver%rVec2,ierr)

  ! create SNES
  call SNESCreate(comm,solver%snes,ierr)
  CHKERRQ(ierr)
  call SNESSetDM(solver%snes,solver%da,ierr)
  CHKERRQ(ierr)
  ! call SNESSetApplicationContext(solver%snes,solver,ierr)
  ! call SNESSetApplicationContext(solver%snes,grid,ierr)

  !  Set function evaluation routine and vector
  call SNESSetFunction(solver%snes,solver%rVec2,FormFunction,solver,ierr)
  CHKERRQ(ierr)
  call SNESSetJacobian(solver%snes,solver%KKTmat,solver%KKTmat,FormJacobian,solver,ierr)
  CHKERRQ(ierr)
  ! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  !  Customize nonlinear solver; set runtime options
  ! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  !  Set runtime options (e.g., -snes_monitor -snes_rtol <rtol> -ksp_type <type>)
  call SNESSetOptionsPrefix(solver%snes, 'fsa_', ierr)
  CHKERRQ(ierr)
  ! clear monitors on all planes except plane 0
  if (sml_mype/sml_pe_per_plane .gt. 0) then 
     call PetscOptionsClearValue('-fsa_snes_monitor',ierr)
     call PetscOptionsClearValue('-fsa_ksp_monitor',ierr)
     call PetscOptionsClearValue('-fsa_fieldsplit_phi_ksp_monitor',ierr)
     call PetscOptionsClearValue('-fsa_fieldsplit_lambda_ksp_monitor',ierr)
     call PetscOptionsClearValue('-fsa_snes_view',ierr)
     call PetscOptionsClearValue('-fsa_snes_converged_reason',ierr)
  endif
  call SNESSetFromOptions(solver%snes,ierr)
  CHKERRQ(ierr)
  !call SNESSetUp(solver%snes,ierr) ! needs new PETSc
  !CHKERRQ(ierr)

end subroutine init_2field_solver

subroutine add_11mat(grid,psn,Amat,bd)
  use grid_class
  use psn_class
  use eq_module
  use sml_module, only : sml_mype
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type):: psn
  Mat :: Amat
  type(boundary2_type), intent(in) :: bd
  !
  real (kind=8),allocatable :: alpha(:), beta(:)
  PetscErrorCode :: ierr
  PetscInt, parameter :: ione=1
  PetscScalar :: value
  real (8) :: pn,wp

  if(sml_mype==0) print *, ' FSA: Add to A mat. with ',grid%ntriangle,' triangles.'

  allocate(alpha(grid%ntriangle),beta(grid%ntriangle))
  alpha=0D0  ! no del term o

  call make_11beta(grid,beta)

  call helm_matrix( Amat, alpha, beta, grid, psn%cbd0_2, .false., ierr)
  CHKERRQ(ierr)

  deallocate(alpha,beta)
end subroutine add_11mat

subroutine make_11beta( &
     grid, &
     beta &                   ! beta u (out)
     )
  use grid_class
  use sml_module
  use eq_module
  implicit  none
  type(grid_type) :: grid
  real (kind=8), intent(out) :: beta(grid%ntriangle)
  !
  real (kind=8) ::  x_center(2),teev,den,psi
  integer :: itr,nd(3)
  real (kind=8), external :: psi_interpol

  do itr=1,grid%ntriangle
     ! setup basic values
     nd=grid%nd(:,itr)
     x_center=( grid%x(:,nd(1)) + grid%x(:,nd(2)) + grid%x(:,nd(3)) )/3D0
#ifdef FLAT_GYRORADIUS
     x_center(1)=eq_axis_r
     x_center(2)=eq_axis_z
#endif
     !rhoi2(itr)=gyro_radius2(x_center)**2
     psi=psi_interpol(x_center(1),x_center(2),0,0)
     !tiev(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_tempi)
     teev=eq_ftn(psi,x_center(1),x_center(2),eq_tempe)
     !b2(itr)=b_interpol(x_center(1),x_center(2),0D0)**2
     den=eq_ftn(psi,x_center(1),x_center(2),eq_den)

     ! e n_0 C /T_e(xx) -- todo: add C
     beta(itr) = den*sml_e_charge/teev
  enddo
end subroutine make_11beta

subroutine make_12mat(grid,psn,Bmat,BIdentMat,solver,bd)
  use grid_class
  use psn_class
  use eq_module
  use sml_module, only : sml_mype, sml_e_charge
  use xgc_solver_module
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type):: psn
  Mat :: Bmat,BIdentMat
  type(xgc_solver) :: solver
  type(boundary2_type), intent(in) :: bd
  integer :: inout
  !
  PetscInt :: km1,im1,ip,ipm1,low,high,idxj(2)
  PetscErrorCode :: ierr
  PetscInt, parameter :: ione=1
  PetscScalar :: value(2),x_center(2),teev,den,psi
  real (8) :: pn,wp
  real (kind=8), external :: psi_interpol
  integer :: st,i

  if(sml_mype==0) print *, ' FSA: create B mat. M,N=',grid%cnv_2d_00%m,grid%cnv_2d_00%n

  !Bmat
  call MatGetOwnershipRange(Bmat,low,high,ierr)
  CHKERRQ(ierr)

  ! allocate and cache n_o and e/T_e
  allocate(solver%n0(low:high-1),solver%e_Te(low:high-1),STAT=st)
  if(sml_mype==0) print *, ' FSA: allocate n_0 & e/T_e. low=',low,', high=',high
  if(st>0) then
     print *, 'insufficient memory for matrix allocation',low,high-1
     stop
  endif

  do i=low,high-1
     !do i=1,grid%nnode
     pn=(grid%psi(i)-grid%psi00min)/grid%dpsi00
     ip=floor(pn)+1
     if(0 < ip .and. ip < grid%npsi00 .and. is_rgn12(grid%x(1,i),grid%x(2,i),grid%psi(i)) ) then
        wp=1D0 - ( pn - real(ip-1,8) )
     elseif( ip <= 0 ) then
        ip=1
        wp=1D0
     else
        ip=grid%npsi00-1
        wp=0D0
     endif

     im1=i-1
     ipm1=ip-1

     ! BC check     
     if( is_inside(i,bd) ) then
        ! inner
        value(1)=wp*grid%node_area(i)
        idxj(1) = ipm1
        ! outer
        value(2)=(1D0-wp)*grid%node_area(i)
        idxj(2) = ip
     else
        value=0.d0
        idxj(1) = ipm1
        idxj(2) = ip
     endif

     ! setup n_0 & e/T_e, we could use sml_e_charge/psn%tempe_ev(i) and save a vector
     x_center = grid%x(:,i)
     psi=psi_interpol(x_center(1),x_center(2),0,0)
     teev=eq_ftn(psi,x_center(1),x_center(2),eq_tempe)
     den=eq_ftn(psi,x_center(1),x_center(2),eq_den)
     solver%n0(i) = den
     solver%e_Te(i) = sml_e_charge/teev

     do inout=1,2
        !store raw B - the identity operator
        call MatSetValues(BIdentMat,ione,im1,ione,idxj(inout),value(inout),INSERT_VALUES,ierr)
        CHKERRQ(ierr)
        ! scale 'value' by -e n_0 C /T_e(xx) -- todo: add C
        value(inout) = -value(inout)*den*sml_e_charge/teev
        call MatSetValues(Bmat,ione,im1,ione,idxj(inout),value(inout),INSERT_VALUES,ierr)
        CHKERRQ(ierr)
        ! from original code
        !v2d(i)=v1d(ip)*wp  + v1d(ip+1)*(1D0-wp)
     enddo
  end do
end subroutine make_12mat

subroutine make_21mat(grid,psn,Cmat,Bmat,bd)
  use grid_class
  use psn_class
  use sml_module, only : sml_mype
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type):: psn
  Mat :: Cmat
  Mat,intent(in) :: Bmat
  type(boundary2_type), intent(in) :: bd
  !
  PetscInt :: j,k,km1,im1,low,high,i,nlocRows,nGlobalCols,nLocCols
  PetscErrorCode :: ierr
  PetscInt, parameter :: ione=1
  PetscScalar :: value
  PetscInt,allocatable :: d_nnz(:),o_nnz(:)

  ! Cmat - count nnz, can't can call MatGetOwnershipRange before prealloc. Use transpose property.
  call MatGetOwnershipRangeColumn(Bmat,low,high,ierr)
  CHKERRQ(ierr)
  nlocRows = high-low
  allocate(d_nnz(nlocRows),o_nnz(nlocRows))
  d_nnz = 0
  o_nnz = 0
  do i=1, grid%cnv_2d_00%n
     do j=1, grid%cnv_2d_00%nelement(i)
        k = grid%cnv_2d_00%eindex(j,i)
        !y(k) = y(k) + mat%value(j,i) * x(i)
        km1=k-1
        !im1=i-1
        if (km1 .ge.low .and. km1 .lt. high ) then
           d_nnz(km1-low+1) = d_nnz(km1-low+1) + 1
           o_nnz(km1-low+1) = o_nnz(km1-low+1) + 1
        end if
     end do
  end do

  ! transpose
  call MatGetSize(Bmat,nGlobalCols,PETSC_NULL_INTEGER,ierr)
  call MatGetLocalSize(Bmat,nLocCols,PETSC_NULL_INTEGER,ierr)
  do i=1,nlocRows
     if (d_nnz(i) .gt. nLocCols) d_nnz(i) = nLocCols
     if (o_nnz(i) .gt. nGlobalCols-nLocCols) o_nnz(i) = nGlobalCols-nLocCols
  end do

  call MatSeqAIJSetPreallocation(Cmat,ione,d_nnz,ierr)
  CHKERRQ(ierr)
  call MatMPIAIJSetPreallocation(Cmat,ione,d_nnz,ione,o_nnz,ierr)
  CHKERRQ(ierr)
  deallocate(d_nnz,o_nnz)
  call MatSetOption(Cmat,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE,ierr)
  CHKERRQ(ierr)
  call MatSetOption(Cmat,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE,ierr)
  CHKERRQ(ierr)
  call MatSetFromOptions(Cmat,ierr)
  CHKERRQ(ierr)

  ! we could: do i=low,high-1 here but its over columns so not a big win
  do i=1,grid%cnv_2d_00%n
     do j=1, grid%cnv_2d_00%nelement(i)
        k = grid%cnv_2d_00%eindex(j,i)
        !y(k) = y(k) + mat%value(j,i) * x(i)
        km1=k-1
        im1=i-1
        if (km1 .ge.low .and. km1 .lt. high ) then
           value=grid%cnv_2d_00%value(j,i)/grid%cnv_norm_1d00(k)
           call MatSetValues(Cmat,ione,km1,ione,im1,value,INSERT_VALUES,ierr)
           CHKERRQ(ierr)
        end if
     enddo
  enddo
end subroutine make_21mat
#endif

!************************************************************************
! solve_poisson - high level solve, init physics, solve
!************************************************************************
subroutine solve_poisson(grid,psn,iflag)
  use psn_class
  use grid_class
  use sml_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer,intent(in) :: iflag 

  if(sml_iter_solver) then
#ifndef USE_NEW_PETSC_SOLVER
     call solve_poisson_iter(grid,psn,iflag)
#else
     stop 'solve_poisson_iter supported'
#endif
  else
     call solve_poisson_private(grid,psn,iflag)
  endif

end subroutine solve_poisson
!
! the solver
!
subroutine solve_poisson_private(grid,psn,iflag)
  use psn_class
  use grid_class
  use sml_module
  use perf_monitor
  use smooth_module
  use omp_module, only: split_indices
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer,intent(in) :: iflag ! -1 for init, 1==00 mode+turb, 2==turb only, unless solver_type/=0
  !
  integer :: i, loop_num,ierr
  integer :: ith,i_beg(sml_nthreads),i_end(sml_nthreads)
  real (kind=8), allocatable :: dentmp0(:),dentmp2(:),dentmp3(:),dentmp4(:)
  real (kind=8), allocatable :: sendl(:),recvr(:)
  real (kind=8) :: t1, t2, t3
  save dentmp0, dentmp2, dentmp3, dentmp4
  save sendl,recvr

  if(iflag==-1) then ! this is a memory leak
     allocate(dentmp0(grid%nnode),dentmp2(grid%nnode),dentmp3(grid%nnode),dentmp4(grid%nnode))
     allocate(sendl(grid%nnode),recvr(grid%nnode))
     dentmp0=0D0 !for safty
     dentmp2=0D0
     dentmp3=0D0
     dentmp4=0D0
     if(sml_mype==0) print *, &
          '**************** solve_poisson_private static variable initialized ****************'
     return
  endif

  ! get n0************************************************************************************
  psn%cden00_1d = psn%iden00_1d-psn%eden00_1d ! not used in poisson. Just for diagnosis.
  
  ! RHS density for n=0 mode
  call t_startf("POISSON_00MODE")
  if(sml_poisson_solver_type/=0) then
     if(sml_electron_on) then
        dentmp0=psn%idensity0-psn%edensity0
     else
        dentmp0=psn%idensity0
     endif
  elseif (iflag==1) then
     ! psn%cden00_1d = psn%iden00_1d-psn%eden00_1d
     call convert_001d_2_grid(grid,psn%cden00_1d,dentmp0)
  endif
  
  call set_boundary2_values(dentmp0,0D0,psn%cbd0_2)
  
  if ((sml_poisson_solver_type==0 .and. iflag==1 .and. sml_00_poisson) .or. &
     (sml_poisson_solver_type/=0 .and. (sml_00_poisson .or. sml_turb_poisson))) then  ! solve 00-mode
      if (sml_poisson_solver_type==0 .and. sml_use_simple00 ) then
         call simple00(grid,psn)
      else
         ! solve 00 (neoclassical) field*************************************************************
         call t_startf("POISSON_00MODE_SOLVE")
         !if(sml_sheath_mode/=0) call apply_wall_boundary_condition(grid,psn,dentmp0)
         call petsc_solve(grid%nnode,dentmp0,0,psn%pot0,psn%solver00%comm,psn%solver00,ierr)
         call t_stopf("POISSON_00MODE_SOLVE")
         ! pot0m has n=0,m=aribtrary component
         ! Flux average of pot0m goes to pot00_1d (psi-space)
         ! And, flux averaged potential copied to psn%pot0
         if(sml_poisson_solver_type/=0) then
            call convert_grid_2_001d(grid,psn%pot0m,psn%pot00_1d)
         end if
         call convert_001d_2_grid(grid,psn%pot00_1d,psn%pot0)
      end if
  end if
  call t_stopf("POISSON_00MODE")

  ! solve turbulence field********************************************************************
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)
  
  if(sml_turb_poisson) then
     call t_startf("POISSON_TURB")
     ! For all poloidal plane - without n=0 mode
     !$OMP PARALLEL DO &
     !$OMP PRIVATE( ITH, I )
     do ith=1,sml_nthreads
        if(iflag==1 .and. sml_electron_on) then !use non-adiabatic electron
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - psn%edensity(i,1) - dentmp0(i))
              !/psn%density0_full(:)  ! ni - ne - <ni - ne>
           enddo
        else ! iflag==2 .or. no electron
           call convert_001d_2_grid(grid,psn%iden00_1d,dentmp0) !dentmp0 becomes flux averaged ion 00 density
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - dentmp0(i))
#ifdef MAX_DDEN
           if(dentmp3(i) > 2d0*dentmp0(i)) dentmp3(i)=2d0*dentmp0(i) ! factor 2 genralize later
#endif
              !  ni - <ni>
           enddo
        endif
     enddo

     ! smoothing of charge density -- RHS
     !call smooth_tr_connect(grid,dentmp3)
     !call smooth_pol(grid,dentmp3)
     call set_boundary2_values(dentmp3,0D0,psn%cbdH_2)
 
     psn%dden(:,1)=dentmp3  ! this can be optimized. redundant memory usage.
     call t_startf("POISSON_TURB_SOLVE")
     call petsc_solve(grid%nnode,dentmp3,0,psn%dpot(:,1),psn%solverH%comm,psn%solverH,ierr)
     call t_stopf("POISSON_TURB_SOLVE")

     if(sml_poisson_solver_type/=0) then
        ! move m/=0 component to dpot
        psn%dpot(:,1)=psn%dpot(:,1) + psn%pot0m - psn%pot0
     else
        call smooth_tr_connect(grid,psn%dpot(:,1))
        !************************* extract 00 mode *********************************************
        call t_startf("EXTRACT_00MODE")
        call extract_00mode(grid,psn%dpot)
        call t_stopf("EXTRACT_00MODE")
     end if

     if(sml_mode_select_on==1) then
        call mode_selection(sml_mode_select_n,grid,psn)
     endif

     call t_stopf("POISSON_TURB")
  endif

  !********************** Phase 4 *********************************************
  !send and receive potential value to and from adjacent PE
  !***************************************************************************
  if(sml_turb_poisson) then
     call t_startf("POISSON_SR_POT")
     call send_recv_potential( psn%dpot, recvr,sendl,grid%nnode)
     call t_stopf("POISSON_SR_POT")
  endif

  !******************** Phase 5 ***********************************************
  ! get space derivative of potential
  ! psn%E_para(node,1:nphi )  psn%E_perp(2 vec, ntriangle, 0:nphi)
  !***************************************************************************
  if(sml_add_pot0/=0) then
     if(sml_replace_pot0) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads
           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%add_pot0(i)
           enddo
        enddo
     else
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads
           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%pot0(i)+psn%add_pot0(i)
           enddo
        enddo
     endif
  endif

end subroutine solve_poisson_private


!***************************************************************************
!*** physics
!***************************************************************************
subroutine read_add_pot0(grid,psn)
  use psn_class
  use grid_class
  use sml_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: num, i

  open(unit=18,file=sml_add_pot0_file,action='read')
  read(18,*) num
  if(num/=grid%nnode) then
     if(sml_mype==0) print *, &
	'Error : Grid number missmatch in read_add_pot0',num, grid%nnode
     stop
  endif
  do i=1, num
     read(18,*) psn%add_pot0(i)
  enddo

  close(18)
end subroutine read_add_pot0

! send and receive other plane potential
subroutine send_recv_potential( dpot, recvr,sendl,nnode)
  use sml_module
  use perf_monitor
  implicit none
  include 'mpif.h'
  integer , intent(in) :: nnode
  real (kind=8) :: recvr(nnode), sendl(nnode), dpot(nnode,-1:2)
  integer :: icount, idest,isource,isendtag,irecvtag
  PetscErrorCode ierr
  integer :: istatus(mpi_status_size)

  ! receive 0
  sendl=dpot(:,1)
  recvr=0D0
  icount=nnode
  isource= modulo(sml_intpl_mype-1,sml_intpl_totalpe)
  idest  = modulo(sml_intpl_mype+1,sml_intpl_totalpe)
  isendtag=sml_intpl_mype
  irecvtag=isource

  call mpi_sendrecv(sendl,icount,MPI_DOUBLE_PRECISION,idest,isendtag,&
       recvr,icount,MPI_DOUBLE_PRECISION,isource,irecvtag,sml_intpl_comm,istatus,ierr)

  dpot(:,0) = recvr(:)

  ! receive -1
  sendl=dpot(:,0)
  recvr=0D0
  icount=nnode
  isource= modulo(sml_intpl_mype-1,sml_intpl_totalpe)
  idest  = modulo(sml_intpl_mype+1,sml_intpl_totalpe)

  isendtag=sml_intpl_mype
  irecvtag=isource

  call mpi_sendrecv(sendl,icount,MPI_DOUBLE_PRECISION,idest,isendtag,&
       recvr,icount,MPI_DOUBLE_PRECISION,isource,irecvtag,sml_intpl_comm,istatus,ierr)

  dpot(:,-1) = recvr(:)

  ! receive 1
  sendl=dpot(:,1)
  recvr=0D0
  icount=nnode
  isource= modulo(sml_intpl_mype+1,sml_intpl_totalpe)
  idest  = modulo(sml_intpl_mype-1,sml_intpl_totalpe)
  isendtag=sml_intpl_mype
  irecvtag=isource
  call mpi_sendrecv(sendl,icount,MPI_DOUBLE_PRECISION,idest,isendtag,&
       recvr,icount,MPI_DOUBLE_PRECISION,isource,irecvtag,sml_intpl_comm,istatus,ierr)

  dpot(:,2) = recvr(:)

end subroutine send_recv_potential


#ifndef EFIELD2
subroutine get_potential_grad(grid,psn)
  use psn_class
  use grid_class
  use sml_module
  use smooth_module
  use omp_module , only : split_indices
  use perf_monitor
  implicit none
  include 'mpif.h'
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: i,iphi,itr,nodes(3),j
  real (kind=8) :: dp1,dp2
  integer :: iphi_p, iphi_m
  real (kind=8) :: dpot_p, dpot_m
  integer :: nodes_p(3),nodes_m(3),nd,sgn
  real (kind=8) :: area_sum, E0(2), E(2,0:1)
  integer :: irho, ipe
  ! mpi
  integer :: idest, isendtag, isource, irecvtag
  PetscErrorCode ierr
  integer, dimension(MPI_STATUS_SIZE) :: istatus
  ! omp
  integer :: itr_beg(sml_nthreads), itr_end(sml_nthreads)
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)

  real (8), allocatable :: E_perp_tr(:,:,:), E_r_node(:,:),E_z_node(:,:), E_para(:,:)
  real (8), allocatable :: pot(:,:,:), E_rho(:,:,:,:)
  ! for electron simulation
  real (8), allocatable :: E_perp00_tr(:,:), E_r00_node(:,:), E_z00_node(:,:)
  character (len=30) :: filename
  real (8) :: turb_on, zero_on

  integer :: nphi, iphim1,iphip1,dir
  real(8) :: pot_mid, pot_i_0,pot_i_1

  integer, parameter :: idebug = 0

!pw  call t_startf("GET_POT_INIT")
  allocate(E_perp_tr(2,grid%ntriangle,0:1))
  allocate(E_r_node(grid%nnode,0:1),E_z_node(grid%nnode,0:1), E_para(grid%nnode,0:1))
  allocate(E_rho(3,grid%nnode,0:1,0:grid%nrho))
  allocate(pot(grid%nnode,0:1,0:grid%nrho))
  if(sml_electron_on) then
     allocate(E_perp00_tr(2,grid%ntriangle), E_r00_node(grid%nnode,0:1), E_z00_node(grid%nnode,0:1))
  endif

  if(sml_turb_efield) then
     turb_on=1D0
  else
     turb_on=0D0
  endif
  if(sml_00_efield) then
     zero_on=1D0
  else
     zero_on=0D0
  endif

  ! pot(:,:,1) as tmp variable
  do iphi=0,1
     pot(:,iphi,1)=zero_on*psn%pot0(:)+turb_on*psn%dpot(:,iphi)
  enddo
#ifdef  USE_CALC_GRADIENT
! ------------------------------------------------------
! copy and communicate psn%pot_phi_real(1:nnode,0:nphim1)
! which is used in computing the gradient
! E_phi_ff(1:3,0:1,1:nnode,0:nphim1)
! ------------------------------------------------------

  call t_startf("GATHER_POT_PHI_COPY")
  do iphi=0,sml_nphi_total-1
    if (sml_intpl_mype == iphi) then
      psn%pot_phi_real(:,iphi) = pot(:,0,1)
    endif
  enddo
  call t_stopf("GATHER_POT_PHI_COPY")

  call t_startf("GATHER_POT_PHI_GATHER")
  call mpi_allgather(MPI_IN_PLACE, grid%nnode,mpi_real8, &
                     psn%pot_phi_real(:,:),grid%nnode, mpi_real8,  &
                     sml_intpl_comm,ierr)
  if (ierr.ne.mpi_success) then
     write(*,*) 'mpi_allgather(psn%pot_phi_real) return ierr=',ierr
     stop '** error in  get_potential_grad'
  endif
  call t_stopf("GATHER_POT_PHI_GATHER")


    if ((idebug.ge.1).and.(sml_mype.eq.0)) then
      write(6,*) 'sml_totalpe ', sml_totalpe
      write(6,*) 'sml_intpl_totalpe ', sml_intpl_totalpe
      write(6,*) 'sml_plane_totalpe ', sml_plane_totalpe
      write(6,*) 'sml_nphi_total ', sml_nphi_total
      write(6,*) 'sml_grid_nrho ', sml_grid_nrho
      write(6,*) 'sml_intpl_mype ', sml_intpl_mype
      write(6,*) 'sml_plane_mype ', sml_plane_mype
      write(6,*) 'sml_mype ', sml_mype
      call flush(6)
    endif


#endif
!pw  call t_stopf("GET_POT_INIT")

  ! Divide triangles among OpenMP threads
  call split_indices(grid%ntriangle, sml_nthreads, itr_beg, itr_end)

  call t_startf("GET_POT_LOOPS")
  !obtain perpendicular E-field -- total E-field
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, ITR, NODES, DP1, DP2, IPHI)
  do ith=1,sml_nthreads

     ! store grad-2d pot0 + pot in E_perp
     do iphi=0, 1
        do itr=itr_beg(ith), itr_end(ith)
           nodes(:)=grid%nd(:,itr)

           dp1=pot(nodes(1),iphi,1)- pot(nodes(3),iphi,1)
           dp2=pot(nodes(2),iphi,1)- pot(nodes(3),iphi,1)

           E_perp_tr(:,itr,iphi)= &
                -( dp1*grid%mapping(1,1:2,itr) + dp2*grid%mapping(2,1:2,itr) )
        enddo
     enddo

  enddo

  !obtain perpendicular 00 E-field -- required for weight evolution of electron
  if(sml_electron_on .and. sml_00_efield) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, ITR, NODES, DP1, DP2)
     do ith=1,sml_nthreads

        ! store grad-2d pot0 + pot in E_perp
        do itr=itr_beg(ith), itr_end(ith)
           nodes(:)=grid%nd(:,itr)

           dp1=psn%pot0(nodes(1))- psn%pot0(nodes(3))
           dp2=psn%pot0(nodes(2))- psn%pot0(nodes(3))

           E_perp00_tr(:,itr)= &
                -( dp1*grid%mapping(1,1:2,itr) + dp2*grid%mapping(2,1:2,itr) )
        enddo

     enddo
  endif


  ! Divide nodes among OpenMP threads
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

  ! Get area averge of perp E-field

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, AREA_SUM, E, J, ITR )
  do ith=1,sml_nthreads

     do i=i_beg(ith),i_end(ith)
        area_sum=0D0
        E=0D0
        do j=1, grid%num_t_node(i)
           itr=grid%tr_node(j,i)
           area_sum=area_sum+grid%tr_area(itr)
           E(:,:)=E(:,:)+ E_perp_tr(:,itr,0:1) * grid%tr_area(itr)
        enddo
        E_r_node(i,:)=E(1,:)/area_sum
        E_z_node(i,:)=E(2,:)/area_sum
     enddo

  enddo

  if(sml_electron_on .and. sml_00_efield) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, AREA_SUM, E, J, ITR )
     do ith=1,sml_nthreads

        do i=i_beg(ith),i_end(ith)
           area_sum=0D0
           E(:,1)=0D0
           do j=1, grid%num_t_node(i)
              itr=grid%tr_node(j,i)
              area_sum=area_sum+grid%tr_area(itr)
              E(:,1)=E(:,1)+ E_perp00_tr(:,itr) * grid%tr_area(itr)
           enddo
           E_r00_node(i,1)=E(1,1)/area_sum
           E_z00_node(i,1)=E(2,1)/area_sum
        enddo

     enddo
     E_r00_node(:,0)=E_r00_node(:,1)
     E_z00_node(:,0)=E_z00_node(:,1)

  endif

  !apply boundary condition of E-field
  !zero E-field (plus zero potential at boundary)

!  call set_boundary2_values(E_r_node(:,0),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_r_node(:,1),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_z_node(:,0),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_z_node(:,1),0D0,psn%pbd0_2)
!  if(sml_electron_on)then
!  call set_boundary2_values(E_r00_node(:,0),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_r00_node(:,1),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_z00_node(:,0),0D0,psn%pbd0_2)
!  call set_boundary2_values(E_z00_node(:,1),0D0,psn%pbd0_2)
!  endif

  call t_stopf("GET_POT_LOOPS")

  ! convert field following coord
  call t_startf("GET_POT_CNVRT")
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,pot(:,:,1),pot(:,:,0))
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_r_node,pot(:,:,1))
  E_r_node=pot(:,:,1)
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_z_node,pot(:,:,1))
  E_z_node=pot(:,:,1)
  if(sml_electron_on) then
     if(sml_00_efield) then
        call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_r00_node,pot(:,:,1))
        E_r00_node=pot(:,:,1)
        call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_z00_node,pot(:,:,1))
        E_z00_node=pot(:,:,1)
     else ! zero efield when sml_00_efield is .false.
        E_r00_node=0D0
        E_z00_node=0D0
     endif
  endif
  call t_stopf("GET_POT_CNVRT")

  ! E-parallel
  ! pot(:,0,1) is tmp variable
  ! obtain E_parallel at half position  --  1_dx is ~ 2 R dphi
  pot(:,0,1)= -sml_bt_sign*(pot(:,1,0)-pot(:,0,0))*psn%bfollow_1_dx(:)*2D0  ! ### replace bfollow_1_dx


  call t_startf("GET_POT_SR")
  ! send left recv right
  idest=mod(sml_intpl_mype-1+sml_intpl_totalpe,sml_intpl_totalpe)
  isendtag=sml_intpl_mype
  isource=mod(sml_intpl_mype+1,sml_intpl_totalpe)
  irecvtag=isource
  call mpi_sendrecv(pot(:,0,1),grid%nnode,MPI_REAL8,  idest,isendtag, &
       E_para(:,1),grid%nnode,MPI_REAL8,isource,irecvtag, &
       sml_intpl_comm,istatus,ierr)

  ! send right recv left
  idest=mod(sml_intpl_mype+1,sml_intpl_totalpe)
  isendtag=sml_intpl_mype
  isource=mod(sml_intpl_mype-1+sml_intpl_totalpe,sml_intpl_totalpe)
  irecvtag=isource
  call mpi_sendrecv(pot(:,0,1),grid%nnode,MPI_REAL8,  idest,isendtag, &
       E_para(:,0),grid%nnode,MPI_REAL8,isource,irecvtag, &
       sml_intpl_comm,istatus,ierr)
  call t_stopf("GET_POT_SR")

  ! require converting of E_para(:,0) and E_perp(:,1) ????
  call t_startf("GET_POT_CNVRT")
  call cnvt_grid_real2ff(grid,psn%ff_1dp_tr,psn%ff_1dp_p,E_para(:,:),pot(:,:,2)) ! ???
  E_para(:,0)=0.5D0*(pot(:,0,2)+pot(:,0,1))
  E_para(:,1)=0.5D0*(pot(:,1,2)+pot(:,0,1))
  call t_stopf("GET_POT_CNVRT")

  ! obtain E_para_ff
  !E_para(:,0)=0.5D0*(E_para(:,0)+pot(:,0,1))
  !E_para(:,1)=0.5D0*(E_para(:,1)+pot(:,0,1))


  ! ----------- end of parallel E-field

  ! assign to E_rho
  E_rho(1,:,:,0)=E_r_node
  E_rho(2,:,:,0)=E_z_node
  E_rho(3,:,:,0)=E_para

  ! obtaion E_rho(:,:,n) and pot(:,:,n)
  if(sml_plane_mype<grid%nrho*2) then
     call t_startf("GET_POT_MAT_MULT")
     iphi=mod(sml_plane_mype,2)
     irho=sml_plane_mype/2  + 1

     call mat_mult(psn%gyro_avg_mat,pot(:,iphi,0),pot(:,iphi,irho))
     call mat_mult_tensor(psn%gyro_avg_mat,E_rho(:,:,iphi,0),3,E_rho(:,:,iphi,irho))
     call t_stopf("GET_POT_MAT_MULT")
  endif

  ! get all of E_rho and pot using bcast
  call t_startf("GET_POT_BCAST")
  do irho=1,grid%nrho
     do iphi=0,1
        ipe=(irho-1)*2 + iphi
        call mpi_bcast(pot(:,iphi,irho),grid%nnode,mpi_real8,ipe,sml_plane_comm,ierr)
        call mpi_bcast(E_rho(:,:,iphi,irho),grid%nnode*3,mpi_real8,ipe,sml_plane_comm,ierr)
     enddo
  enddo
  call t_stopf("GET_POT_BCAST")

#ifdef IDEN_DEBUG
     !for debuging
  if(sml_istep==200 .and. sml_ipc==1 .and. sml_mype==0) then
     do irho=1, grid%nrho
        do iphi=0,1
           write(filename,'("pdebug.",i2.2,".",i1.1,".txt")') irho,iphi
           open(unit=333,file=filename,status='replace')
           do i=1, grid%nnode
              write(333,1000) pot(i,iphi,irho), E_rho(:,i,iphi,irho)
           enddo
        enddo
     enddo
  endif
1000 format(18(e19.13,1x))
#endif


  ! change of indexing order
  call t_startf("GET_POT_IDX_REORD")
  do irho=0,grid%nrho
     do iphi=0,1
        psn%pot_rho_ff(iphi,irho,:)=pot(:,iphi,irho)
        psn%E_rho_ff(:,iphi,irho,:) = E_rho(:,:,iphi,irho)
     enddo
  enddo

  if(sml_electron_on) then
     do iphi=0,1
        psn%E00_ff(1,iphi,:)=E_r00_node(:,iphi)
        psn%E00_ff(2,iphi,:)=E_z00_node(:,iphi)
     enddo
  endif
  call t_stopf("GET_POT_IDX_REORD")

  ! for debugging ######
!  do iphi=0,1
!     psn%E_perp_node(1:2,:,iphi)=E_rho(1:2,:,iphi,0)
!     psn%E_para(:,iphi)     =E_rho(3  ,:,iphi,0)
!  enddo

  deallocate(E_perp_tr,E_r_node,E_z_node,E_para,E_rho,pot)
  if(sml_electron_on) deallocate(E_perp00_tr, E_r00_node, E_z00_node)

end subroutine get_potential_grad
#endif

subroutine get_dpot_ff(grid,psn)
  use psn_class
  use grid_class
  use sml_module
  use smooth_module
  use omp_module , only : split_indices
  use perf_monitor
  implicit none
  include 'mpif.h'
  type(grid_type) :: grid
  type(psn_type) :: psn


  ! convert field following coord
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,psn%dpot(:,0:1),psn%dpot_ff)

end subroutine get_dpot_ff

subroutine zero_out_total_charge(grid,psn,den_org,den_zero)
  use grid_class
  use psn_class
  implicit none
!  integer :: flag
  type(grid_type) :: grid
  type(psn_type) :: psn
  real (kind=8) :: den_org(grid%nnode), den_zero(grid%nnode), total,area
  integer :: ibd, obd

!!$  ibd=psn%cbd0%in%end+1
!!$  obd=psn%cbd0%out1%start-1
!!$
!!$  ! Volume?? Area??
!!$  total=sum( den_org(ibd:obd)*grid%node_area(ibd:obd) )
!!$  area=sum(grid%node_area(ibd:obd))
!!$
!!$  den_zero=den_org - total/area

end subroutine zero_out_total_charge

subroutine set_decaying_boundary(den,grid)
  use grid_class
  use sml_module
  implicit none
!  integer :: flag
  type(grid_type) :: grid
  real (kind=8) :: den(grid%nnode)
  real (kind=8) :: sml_f0_psi_c, sml_f0_1_psi_w, alpha
  integer :: i

  sml_f0_psi_c=0.5*(sml_inpsi+sml_outpsi)
  sml_f0_1_psi_w=1D0/( 0.35*(sml_outpsi-sml_inpsi) )

  do i=1, grid%nnode
    alpha= exp( - ((grid%psi(i) - sml_f0_psi_c)*sml_f0_1_psi_w )**6 )
    den(i)=den(i)* alpha
  enddo

end subroutine


!! return gyro radius with a give mu and position
real (kind=8) function gyro_radius(x,mu)
  use ptl_module
  use eq_module
  implicit none
  real (kind=8),intent(in) :: x(2),mu
  real (kind=8) :: B
  real (kind=8),external :: b_interpol


  B=b_interpol(x(1),x(2),0D0)
#ifdef XGC_FLAT_B
  b=eq_axis_b
#endif
  gyro_radius=sqrt(mu/B/ptl_c2_2m(1))


end function gyro_radius

subroutine gyro_radius_vec(nx,x,mu,gyro_radius)
  use ptl_module
  implicit none
  integer, intent(in) :: nx
  real (kind=8),intent(in) :: x(2,nx),mu
  real (kind=8), intent(inout) :: gyro_radius(nx)

  real (kind=8) :: B(nx), phi(nx)

  interface
     subroutine b_interpol_vec(n,r,z,phi,b_interpol)
       implicit none
       integer, intent(in) :: n
       real(kind=8), dimension(n), intent(in) :: r,z,phi
       real(kind=8), dimension(n), intent(inout) :: b_interpol
     end subroutine b_interpol_vec
  end interface


  phi = 0.0d0
!  call b_interpol_vec(nx, x(1,1:nx), x(2,1:nx), phi, B )
  !  B(1:nx)=b_interpol_vec(x(1,1:nx),x(2,1:nx),0D0)
  gyro_radius=sqrt(mu/(B*ptl_c2_2m(1)))


end subroutine  gyro_radius_vec


!!return gyro radius of a thermal ion at a given position
real (kind=8) function gyro_radius2(x)
  use sml_module
  use ptl_module
  use eq_module
  implicit none
  real (kind=8),intent(in) :: x(2)
  real (kind=8) :: B,psi,tev
  real (kind=8),external :: b_interpol,psi_interpol
!  real (kind=8),external :: init_tempi_ev

  B=b_interpol(x(1),x(2),0D0)
  psi=psi_interpol(x(1),x(2),0,0)
  tev=eq_ftn(psi,0D0,0D0,eq_tempi)

  gyro_radius2=sqrt(Tev*sml_ev2j/ptl_c2_2m(1)/2D0)/B


end function gyro_radius2

real (kind=8) function gyro2_tev(x)
  use sml_module
  use ptl_module
  implicit none
  real (kind=8),intent(in) :: x(2)
  real (kind=8) :: B,psi,tev
  real (kind=8),external :: b_interpol,psi_interpol
!  real (kind=8),external :: init_tempi_ev

  B=b_interpol(x(1),x(2),0D0)
  psi=psi_interpol(x(1),x(2),0,0)
!  tev=init_tempi_ev(psi)

!  gyro2_tev=sml_ev2j/ptl_c2_2m(1)/2D0/B**2
  gyro2_tev=sml_ev2j/(ptl_c2_2m(1)*2D0*B**2)

end function gyro2_tev

!! Write out matrix element for poisson equation
!subroutine output_matrix(grid,psn)
!  use psn_class
!  use grid_class
!  implicit none
!  integer :: i,j
!  type(psn_type) :: psn
!  type(grid_type) :: grid
!
!  open(unit=201,file='poisson.matrix',status='replace')
!  do i=1, grid%nnode
!     do j=1, psn%Amat%nelement(i)
!        write(201,*) i,psn%Amat%eindex(j,i),psn%Amat%value(j,i),psn%Amat%value(j,i)/grid%node_area(i)
!     enddo
!  enddo
!  close(201)
!
!end subroutine output_matrix

! mode selection - works only the nodes between boundaries
! refine ngrid setting for generalization
subroutine mode_selection(nmode,grid,psn)
  use sml_module
  use grid_class
  use psn_class
  implicit none
  include 'mpif.h'
  integer, intent(in) :: nmode
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: ngrid,ngrid2,grid_offset,ngrid_per_pe,nphi_total,nplane_per_pe,npe_per_plane
  integer :: iphi_loc,iphi_offset, iseg,idest,isource,isendtag,irecvtag,istart
  integer :: i,inode,itmp1,itmp2,itmp3,istatus(mpi_status_size)
  PetscErrorCode ierr
  real (kind=8) ,allocatable :: buff1(:), buff2(:,:),buff3(:)
  complex (kind=8), allocatable :: ftc(:),iftc(:)
  complex (kind=8) :: cn

  ! 1. initialization
  nphi_total=sml_nphi_total
  nplane_per_pe=sml_plane_per_pe  ! one
  npe_per_plane=sml_pe_per_plane
  iphi_offset=grid%iphi_offset


  grid_offset=1
  ngrid=grid%nnode

  ! Parallelizaing - ngrid_per_pe : number of grid (inside boundary) per cpu
  ! ngrid2 - number of grid total computed.  Node number which is  > ngrid will be ignored later.
  ngrid_per_pe=ngrid/sml_totalpe
  if(ngrid==sml_totalpe*ngrid_per_pe) then
     ngrid2=ngrid
  else
     ngrid_per_pe=ngrid_per_pe+1
     ngrid2=sml_totalpe*ngrid_per_pe
  endif

  allocate( buff1(ngrid2),buff3(ngrid2),buff2(ngrid_per_pe,nphi_total) )
  allocate( ftc(nphi_total), iftc(nphi_total) )

  ! preparing (inverse) Fourier transform coefficient.
  do i=1, nphi_total
!     ftc(i)=cexp(cmplx(0D0,sml_2pi*real((i-1)*nmode,8)/real(nphi_total,8),8))
     ftc(i)=cexp(cmplx(0D0,sml_2pi*real((i-1)*nmode)/real(nphi_total)))
     iftc(i)=conjg(ftc(i))/real(nphi_total,8)
  enddo

  ! store delta potential value to the buff1 - with grid offset.
  buff1(1:ngrid)=psn%dpot(grid_offset:grid_offset-1+ngrid,1)
  buff1(ngrid+1:ngrid2)=0D0


  ! 2. send - receive field data
  ! send buff1 data (given plane field data) to buff2 data (poloidal angle (or grid number) localized data)
  do iseg=1, nphi_total
     iphi_loc=1+(iseg-1)/sml_totalpe   ! should be one
     idest  =mod(sml_intpl_mype+(iseg-1),sml_intpl_totalpe)
     isource=mod(sml_intpl_mype-(iseg-1),sml_intpl_totalpe)
     isource=mod(isource+sml_intpl_totalpe,sml_intpl_totalpe) ! to preven minus
     isendtag=sml_intpl_mype
     irecvtag=isource
!     istart= 1 + ngrid_per_pe*mod( sml_mype+iseg-1 , sml_totalpe )
     istart= 1 + ngrid_per_pe*idest
     !sending buff1(istart:istart+ngrid_per_pe,iphi)
     !receiving buff2(1:ngrid_per_pe,1+modulo(iphi_offset+iseg-1,nphi_total))
     itmp1=istart
     itmp2=istart+ngrid_per_pe-1
!     itmp3=mod(  iphi_offset + nplane_per_pe*(1-iseg) + iphi_loc-1 , nphi_total   )
!     itmp3= 1 + mod(itmp3 + nphi_total, nphi_total)
     itmp3=iphi_loc + isource*nplane_per_pe/npe_per_plane
     ! send and receive data
     call MPI_SENDRECV( buff1(itmp1:itmp2), ngrid_per_pe, MPI_DOUBLE_PRECISION,idest,isendtag,&
          buff2(1:ngrid_per_pe,itmp3), ngrid_per_pe, MPI_DOUBLE_PRECISION,isource,irecvtag,&
          sml_intpl_comm,istatus,ierr)
  enddo


  do inode=1, ngrid_per_pe

     ! 3. Fourier transform. Single mode direct cal
     cn=0D0
     do i=1, nphi_total
        cn= cn + buff2(inode,i)*ftc(i)
     enddo

     ! 4. inverse fouirer transform. Single mode direct cal
     do i=1, nphi_total
        buff2(inode,i)=real(cn*iftc(i))*2D0 ! 2D0 for -nmode summation : nmode and (-nmode) are complex conjugate and should be summed.
     enddo

  enddo

  ! 5. send - receive filtered data
  buff1=0D0
  do iseg=1, nphi_total
     iphi_loc=1+(iseg-1)/sml_totalpe
     idest  =mod(sml_intpl_mype-(iseg-1),sml_intpl_totalpe)
     idest  =mod(isource+sml_intpl_totalpe,sml_intpl_totalpe) ! to preven minus ! to preven minus
     isource=mod(sml_intpl_mype+(iseg-1),sml_intpl_totalpe)
     isendtag=sml_intpl_mype
     irecvtag=isource
!     istart= 1 + ngrid_per_pe*mod( sml_mype+iseg-1 , sml_totalpe )
     istart= 1 + ngrid_per_pe*isource
     !sending buff1(istart:istart+ngrid_per_pe,iphi)
     !receiving buff2(1:ngrid_per_pe,1+modulo(iphi_offset+iseg-1,nphi_total))
     itmp1=istart
     itmp2=istart+ngrid_per_pe-1
!     itmp3=mod(  iphi_offset + nplane_per_pe*(1-iseg) + iphi_loc-1 , nphi_total   )
!     itmp3= 1 + mod(itmp3 + nphi_total, nphi_total)
     itmp3=iphi_loc + idest*nplane_per_pe/npe_per_plane

     ! send and receive data
     call MPI_SENDRECV( buff2(1:ngrid_per_pe,itmp3), ngrid_per_pe,MPI_DOUBLE_PRECISION,idest,isendtag,&
          buff1(itmp1:itmp2), ngrid_per_pe, MPI_DOUBLE_PRECISION,isource,irecvtag,&
          sml_intpl_comm,istatus,ierr)
  enddo

  ! 6. broad cast between group (for multiple cpu per plane)
  if(npe_per_plane > 1 ) then
     call MPI_ALLREDUCE(buff1,buff3,ngrid2,MPI_DOUBLE_PRECISION, MPI_SUM,sml_plane_comm,ierr)
  else
     buff3=buff1
  endif
  do iphi_loc=1, nplane_per_pe
     psn%dpot(grid_offset:grid_offset-1+ngrid,1)=buff3(1:ngrid)
  enddo
  ! 7. finalize
  deallocate(buff1,buff2,buff3,ftc,iftc)

end subroutine mode_selection

#ifndef TRIANGLE_GYRO_AVG
subroutine init_gyro_avg_mat(grid,psn)
  use sml_module
  use grid_class
  use psn_class
  implicit none
  type(grid_type):: grid
  type(psn_type) :: psn
  integer :: iphi, irho
  real (8) :: rho, phi0, phi
  integer :: i, larmor
  real (8) :: x(2), xff(2), x_ring(2), x_rff(2)
  integer, parameter :: n_gyro=8
  real (8) :: dx_unit(2,n_gyro), dx_unit2(2)
  real (8) :: angle, dpdr, dpdz, dp, cosa, sina
  integer :: itr, node, ip
  real (8) :: p(3), weight
  !
  real (8), external :: psi_interpol

  call new_mat(psn%gyro_avg_mat,grid%nnode,n_gyro*3)

  iphi=mod(sml_plane_mype,2)
  irho=sml_plane_mype/2  + 1

  rho=grid%drho*real(irho)
  phi0=0.5D0*grid%delta_phi
  phi=real(iphi)*grid%delta_phi  ! 0 for 0 , 1 for delta_phi

  do larmor=1, n_gyro
     angle=sml_2pi/real(n_gyro)*real(larmor-1)
     dx_unit(:,larmor)=(/cos(angle),sin(angle)/)
  enddo

  do i=1, grid%nnode
     !find real position -- follow field line
     xff=grid%x(:,i)
     call field_following_pos2(xff,phi0,phi,x)

     ! find gyro-ring position
     dpdr=psi_interpol(x(1),x(2),1,0)  !### get together 3 of them (psi, dpdr, dpdz)
     dpdz=psi_interpol(x(1),x(2),0,1)
     dp=sqrt(dpdr**2 + dpdz**2)
     cosa=dpdr/dp
     sina=dpdz/dp


     do larmor=1, n_gyro

        !find 8-point with rhoi
        dx_unit2(1)= dx_unit(1,larmor)*cosa+dx_unit(2,larmor)*sina
        dx_unit2(2)=-dx_unit(1,larmor)*sina+dx_unit(2,larmor)*cosa

        x_ring = x + rho* dx_unit2(:)

        ! back transform to ff position
        call field_following_pos2(x_ring,phi,phi0,x_rff)

        ! find triangle
        call search_tr2(grid,x_rff,itr,p)

        if(itr>0) then
           do ip=1,3
              weight=p(ip)/real(n_gyro)
              node=grid%nd(ip,itr)
              call set_value(psn%gyro_avg_mat,i,node,weight,1)
           enddo
        endif
     enddo

     ! column normalization is required for boundary nodes
  enddo

end subroutine init_gyro_avg_mat
#else

subroutine init_gyro_avg_mat(grid,psn)
  use sml_module
  use grid_class
  use psn_class
  implicit none
  type(grid_type):: grid
  type(psn_type) :: psn
  integer :: iphi, irho
  real (8) :: rho, phi0, phi
  integer, parameter :: n_gyro=8, nsub=3
  integer :: larmor, i, nd(3), j, k, itmp
  real (8) :: dx_unit(2,n_gyro), dx_unit2(2)
  real (8) :: dx1(2), dx2(2), area, c1, c2, c3
  real (8) :: x(2), xff(2), x_ring(2), x_rff(2)
  real (8) :: angle, dpdr, dpdz, dp, cosa, sina
  integer :: itr, node, ip
  real (8) :: p(3), weight
  !
  real (8), external :: psi_interpol
  !

  ! matrix size??  factor 2 at the end is approximate factor
  call new_mat(psn%gyro_avg_mat,grid%nnode,n_gyro*3*nsub*8)

  iphi=mod(sml_plane_mype,2)
  irho=sml_plane_mype/2  + 1

  rho=grid%drho*real(irho)
  phi0=0.5D0*grid%delta_phi
  phi=real(iphi)*grid%delta_phi  ! 0 for 0 , 1 for delta_phi

  do larmor=1, n_gyro
     angle=sml_2pi/real(n_gyro)*real(larmor-1)
     dx_unit(:,larmor)=(/cos(angle),sin(angle)/)
  enddo


  do i=1, grid%ntriangle ! for all triangle

     nd=grid%nd(:,i)
     dx1= ( grid%x(:,nd(1)) - grid%x(:,nd(3)) )/real(nsub,8)
     dx2= ( grid%x(:,nd(2)) - grid%x(:,nd(3)) )/real(nsub,8)
     area= 0.5D0*abs( dx1(1)*dx2(2) - dx2(1)*dx1(2) )

     do j=1, nsub       ! for all subtriangle
        do k=1, 2*j-1 ! for all subtriangle
           itmp=  (k-1)/2   ! make integer if not integer
           c1=(j-1)- itmp + real(mod(k,2)*2-1,8)/3D0
           itmp= k/2
           c2=itmp + real(mod(k,2)*2-1,8)/3D0
           c3=real(nsub,8) - c1 - c2

           xff=grid%x(:,nd(3)) + c1*dx1 + c2*dx2 ! center of subtriangle
           !volume=area*xff(1)    ! volume of subtriangle (2pi is missing)


           ! coordinate transform
           call field_following_pos2(xff,phi0,phi,x)

           ! find gyro-ring position
           dpdr=psi_interpol(x(1),x(2),1,0)  !### get together 3 of them (psi, dpdr, dpdz)
           dpdz=psi_interpol(x(1),x(2),0,1)
           dp=sqrt(dpdr**2 + dpdz**2)
           cosa=dpdr/dp
           sina=dpdz/dp


           do larmor=1, n_gyro

              !find 8-point with rhoi
              dx_unit2(1)= dx_unit(1,larmor)*cosa+dx_unit(2,larmor)*sina
              dx_unit2(2)=-dx_unit(1,larmor)*sina+dx_unit(2,larmor)*cosa

              x_ring = x + rho* dx_unit2(:)

              ! back transform to ff position
              call field_following_pos2(x_ring,phi,phi0,x_rff)

              ! find triangle
              call search_tr2(grid,x_rff,itr,p)

              if(itr>0) then
                 do ip=1,3
                    weight=p(ip)/real(n_gyro)
                    node=grid%nd(ip,itr)
                    call set_value(psn%gyro_avg_mat,nd(1),node,weight*c1*area,1)
                    call set_value(psn%gyro_avg_mat,nd(2),node,weight*c2*area,1)
                    call set_value(psn%gyro_avg_mat,nd(3),node,weight*c3*area,1)
                 enddo
              endif

           enddo
        enddo
     enddo
  enddo

  !normalize average operation
  call normalize_const(psn%gyro_avg_mat)

contains
  subroutine normalize_const(mat)
    implicit none
    type(mat_type) :: mat
    integer :: i,j
    real (8) :: sum

    do i=1, mat%n
       sum=1D-99
       do j=1, mat%nelement(i)
          sum=sum + mat%value(j,i)
       enddo

       mat%value(:,i)=mat%value(:,i)/sum
    enddo
  end subroutine normalize_const
end subroutine init_gyro_avg_mat

#endif

subroutine get_add_pot0_psi(psi_in,pot,grid,psn)
  use grid_class
  use psn_class
  use sml_module
  use eq_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  real (kind=8), intent(in) :: psi_in
  real (kind=8), intent(out) :: pot
  real (kind=8) :: psi,x(2), p(3)
  integer :: itr, nodes(3), j
  logical, parameter :: USE_SEARCH_TR2 = .true.
  real (kind=8) , external :: get_mid_r

  ! exit with 0
  if(sml_add_pot0<=0) then
     pot=0D0
     return
  endif
  ! set psi
  psi=min(max(psi_in,sml_inpsi),sml_outpsi)

  ! get r,z=0 from psi
  x(1)=get_mid_R(psi)
  x(2)=eq_axis_z

  ! find triangle

  ! find position and save it.
  if (USE_SEARCH_TR2) then
     call search_tr2(grid,x,itr,p)
  else
     call search_tr(grid,x,itr,p)
  endif


  ! get potential value
  pot=0D0
  if( itr > 0 ) then

     nodes=grid%nd(:,itr)

     do j=1, 3
        pot= pot + p(j) * psn%add_pot0(nodes(j))
     enddo
  else
     print *, 'Warning : itr <0 in get_add_pot0_psi', psi, psi_in
     call err_count
  endif


end subroutine get_add_pot0_psi


subroutine extract_00mode(grid,phi)
  use grid_class
  use sml_module
  use smooth_module
  use omp_module , only : split_indices
  use perf_monitor
  implicit none
  type(grid_type) :: grid
  real (kind=8) :: phi(grid%nnode,-1:2)
  real (kind=8) :: tmp(grid%nnode), tmp2(grid%npsi00)
  integer :: i, j
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)

  ! Divide nodes among OpenMP threads
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
  do ith=1,sml_nthreads

     do i=i_beg(ith),i_end(ith)
        tmp(i)=phi(i,1)
     enddo

  enddo
  ! sum-up
  call t_startf("EXTRACT_00MODE_RED")
  call my_mpi_allreduce(tmp,grid%rtmp1,grid%nnode) !rtmp1 is used for a temporory purpose. variable
  call t_stopf("EXTRACT_00MODE_RED")
  ! get density from charge summation
  !call smooth_pol0(grid, grid%rtmp1, smooth00)
  call convert_grid_2_001d(grid,grid%rtmp1,tmp2)
  call convert_001d_2_grid(grid,tmp2,grid%rtmp1)


!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
  do ith=1,sml_nthreads

     do i=i_beg(ith),i_end(ith)
        phi(i,1)=phi(i,1) - grid%rtmp1(i)/(real(sml_totalpe))
     enddo

  enddo

end subroutine extract_00mode

subroutine apply_wall_boundary_condition(grid,psn,den)
  use grid_class
  use psn_class
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  real (kind=8) :: den(grid%nnode)
  integer :: i

  do i=1, psn%nwall
     den(psn%wall_nodes(i))=psn%sheath_pot(i)
  enddo

end subroutine apply_wall_boundary_condition

subroutine  sheath_pot_init(grid,psn)
  use grid_class
  use psn_class
  use sml_module
  use eq_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: i
  real (kind=8) :: x(2), Te, psi
  real (kind=8) :: psi_interpol
! apply sheat potential according to temperature


   do i=1, psn%nwall
      x=grid%x(:,psn%wall_nodes(i))
      psi=psi_interpol(x(1),x(2),0,0)
      ! get initial temperature
      Te=eq_ftn(psi,x(1),x(2),eq_tempe)
      ! apply alpha *  T
      psn%sheath_pot(i)=sml_sheath_init_pot_factor*Te
   enddo


end subroutine


#ifdef EFIELD2
subroutine get_potential_grad(grid,psn)
  use psn_class
  use grid_class
  use sml_module
  use smooth_module
  use omp_module , only : split_indices
  use perf_monitor
  implicit none
  include 'mpif.h'
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: i,iphi,itr,nodes(3),j
  real (kind=8) :: dp1,dp2
  integer :: iphi_p, iphi_m
  real (kind=8) :: dpot_p, dpot_m
  integer :: nodes_p(3),nodes_m(3),nd,sgn
  real (kind=8) :: area_sum, E0(2), E(2,0:1)
  integer :: irho, ipe
  ! mpi
  integer :: idest, isendtag, isource, irecvtag
  PetscErrorCode ierr
  integer, dimension(MPI_STATUS_SIZE) :: istatus
  ! omp
  integer :: itr_beg(sml_nthreads), itr_end(sml_nthreads)
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)

  real (8), allocatable :: E_perp_tr(:,:,:), E_r_node(:,:),E_z_node(:,:), E_para(:,:)
  real (8), allocatable :: pot(:,:,:), E_rho(:,:,:,:)
  ! for electron simulation
  real (8), allocatable :: E_perp00_tr(:,:), E_r00_node(:,:), E_z00_node(:,:)
  real (8) :: turb_on, zero_on


  allocate(E_perp_tr(2,grid%ntriangle,0:1))
  allocate(E_r_node(grid%nnode,0:1),E_z_node(grid%nnode,0:1), E_para(grid%nnode,0:1))
  allocate(E_rho(3,grid%nnode,0:1,0:grid%nrho))
  allocate(pot(grid%nnode,0:1,0:grid%nrho))
  if(sml_electron_on) then
     allocate(E_perp00_tr(2,grid%ntriangle), E_r00_node(grid%nnode,0:1), E_z00_node(grid%nnode,0:1))
  endif


  if(sml_turb_efield) then
     turb_on=1D0
  else
     turb_on=0D0
  endif
  if(sml_00_efield) then
     zero_on=1D0
  else
     zero_on=0D0
  endif

  ! pot(:,:,1) as tmp variable
  do iphi=0,1
     pot(:,iphi,1)=zero_on*psn%pot0(:)+turb_on*psn%dpot(:,iphi)
  enddo


  ! Divide triangles among OpenMP threads
  call split_indices(grid%ntriangle, sml_nthreads, itr_beg, itr_end)

  !obtain perpendicular E-field -- total E-field
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, ITR, NODES, DP1, DP2, IPHI)
  do ith=1,sml_nthreads

     ! store grad-2d pot0 + pot in E_perp
     do iphi=0, 1
        do itr=itr_beg(ith), itr_end(ith)
           nodes(:)=grid%nd(:,itr)

           dp1=pot(nodes(1),iphi,1)- pot(nodes(3),iphi,1)
           dp2=pot(nodes(2),iphi,1)- pot(nodes(3),iphi,1)

           E_perp_tr(:,itr,iphi)= &
                -( dp1*grid%mapping(1,1:2,itr) + dp2*grid%mapping(2,1:2,itr) )
        enddo
     enddo

  enddo


  !obtain perpendicular 00 E-field -- required for weight evolution of electron
  if(sml_electron_on .and. sml_00_efield) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, ITR, NODES, DP1, DP2)
     do ith=1,sml_nthreads

        ! store grad-2d pot0 + pot in E_perp
        do itr=itr_beg(ith), itr_end(ith)
           nodes(:)=grid%nd(:,itr)

           dp1=psn%pot0(nodes(1))- psn%pot0(nodes(3))
           dp2=psn%pot0(nodes(2))- psn%pot0(nodes(3))

           E_perp00_tr(:,itr)= &
                -( dp1*grid%mapping(1,1:2,itr) + dp2*grid%mapping(2,1:2,itr) )
        enddo

     enddo
  endif


  ! Divide nodes among OpenMP threads
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

  ! Get area averge of perp E-field

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, AREA_SUM, E, J, ITR )
  do ith=1,sml_nthreads

     do i=i_beg(ith),i_end(ith)
        area_sum=0D0
        E=0D0
        do j=1, grid%num_t_node(i)
           itr=grid%tr_node(j,i)
           area_sum=area_sum+grid%tr_area(itr)
           E(:,:)=E(:,:)+ E_perp_tr(:,itr,0:1) * grid%tr_area(itr)
        enddo
        E_r_node(i,:)=E(1,:)/area_sum
        E_z_node(i,:)=E(2,:)/area_sum
     enddo

  enddo

  if(sml_electron_on .and. sml_00_efield) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, AREA_SUM, E, J, ITR )
     do ith=1,sml_nthreads

        do i=i_beg(ith),i_end(ith)
           area_sum=0D0
           E(:,1)=0D0
           do j=1, grid%num_t_node(i)
              itr=grid%tr_node(j,i)
              area_sum=area_sum+grid%tr_area(itr)
              E(:,1)=E(:,1)+ E_perp00_tr(:,itr) * grid%tr_area(itr)
           enddo
           E_r00_node(i,1)=E(1,1)/area_sum
           E_z00_node(i,1)=E(2,1)/area_sum
        enddo

     enddo
     E_r00_node(:,0)=E_r00_node(:,1)
     E_z00_node(:,0)=E_z00_node(:,1)

  endif


  ! convert field following coord
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,pot(:,:,1),pot(:,:,0))
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_r_node,pot(:,:,1))
  E_r_node=pot(:,:,1)
  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_z_node,pot(:,:,1))
  E_z_node=pot(:,:,1)
  if(sml_electron_on) then
     if(sml_00_efield) then
        call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_r00_node,pot(:,:,1))
        E_r00_node=pot(:,:,1)
        call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_z00_node,pot(:,:,1))
        E_z00_node=pot(:,:,1)
     else ! zero efield when sml_00_efield is .false.
        E_r00_node=0D0
        E_z00_node=0D0
     endif
  endif


  !------- Parallel field -- difference of EFIELD2 starts here

  if(sml_bt_sign>0D0) then
     sgn=1
  else
     sgn=-1
  endif

  ! store grad_para (phi0 + phi) in E_para

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, IPHI, IPHI_P, IPHI_M, NODES_P, NODES_M, DPOT_P, DPOT_M, ND )
  do ith=1,sml_nthreads

     do i=i_beg(ith),i_end(ith)
        do iphi=0, 1
           iphi_p=iphi+sgn
           iphi_m=iphi-sgn
           nodes_p=grid%nd(:,psn%bfollow_tr(1,i))
           nodes_m=grid%nd(:,psn%bfollow_tr(2,i))
           dpot_p=0D0
           dpot_m=0D0
           do nd=1,3
              dpot_p=dpot_p + psn%dpot(nodes_p(nd),iphi_p)*psn%bfollow_p(nd,1,i)
              dpot_m=dpot_m + psn%dpot(nodes_m(nd),iphi_m)*psn%bfollow_p(nd,2,i)
           enddo
           E_para(i,iphi) = -(dpot_p-dpot_m)*psn%bfollow_1_dx(i)    !(psn%pot0(i)
        enddo
     enddo

  enddo

  call cnvt_grid_real2ff(grid,psn%ff_hdp_tr,psn%ff_hdp_p,E_para,pot(:,:,1))
  E_para=pot(:,:,1)


  !--------- end of parallel E-field


  ! assign to E_rho
  E_rho(1,:,:,0)=E_r_node
  E_rho(2,:,:,0)=E_z_node
  E_rho(3,:,:,0)=E_para

  ! obtaion E_rho(:,:,n) and pot(:,:,n)
  if(sml_plane_mype<grid%nrho*2) then
     iphi=mod(sml_plane_mype,2)
     irho=sml_plane_mype/2  + 1

     call mat_mult(psn%gyro_avg_mat,pot(:,iphi,0),pot(:,iphi,irho))
     call mat_mult_tensor(psn%gyro_avg_mat,E_rho(:,:,iphi,0),3,E_rho(:,:,iphi,irho))
  endif

  ! get all of E_rho and pot using bcast
  do irho=1,grid%nrho
     do iphi=0,1
        ipe=(irho-1)*2 + iphi
        call mpi_bcast(pot(:,iphi,irho),grid%nnode,mpi_real8,ipe,sml_plane_comm,ierr)
        call mpi_bcast(E_rho(:,:,iphi,irho),grid%nnode*3,mpi_real8,ipe,sml_plane_comm,ierr)
     enddo
  enddo

  ! change of indexing order
  do irho=0,grid%nrho
     do iphi=0,1
        psn%pot_rho_ff(iphi,irho,:)=pot(:,iphi,irho)
        psn%E_rho_ff(:,iphi,irho,:) = E_rho(:,:,iphi,irho)
     enddo
  enddo

  if(sml_electron_on) then
     do iphi=0,1
        psn%E00_ff(1,iphi,:)=E_r00_node(:,iphi)
        psn%E00_ff(2,iphi,:)=E_z00_node(:,iphi)
     enddo
  endif

  ! for debugging ######
!  do iphi=0,1
!     psn%E_perp_node(1:2,:,iphi)=E_rho(1:2,:,iphi,0)
!     psn%E_para(:,iphi)     =E_rho(3  ,:,iphi,0)
!  enddo

  deallocate(E_perp_tr,E_r_node,E_z_node,E_para,E_rho,pot)
  if(sml_electron_on) deallocate(E_perp00_tr, E_r00_node, E_z00_node)

end subroutine get_potential_grad
#endif

subroutine smooth_tr_connect(grid,var)
  use sml_module, only : sml_nthreads
  use grid_class
  use omp_module, only : split_indices
  implicit none
  type(grid_type) :: grid
  real (8) :: var(grid%nnode)
  real (8), allocatable :: tmp(:)
  integer :: nodes(3)
  integer :: i,j, itr
  real (8) :: area_sum, nval
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)

  integer :: ismooth
  integer, parameter :: nsmooth=1

  allocate(tmp(grid%ntriangle))

  do ismooth=1, nsmooth

     ! average value of each triangle

     call split_indices(grid%ntriangle, sml_nthreads, i_beg, i_end)

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, NODES )
     do ith=1,sml_nthreads
        do itr=i_beg(ith), i_end(ith)
           nodes(:)=grid%nd(:,itr)
           ! get triangle average value
           tmp(itr)=(var(nodes(1))+var(nodes(2))+var(nodes(3)))/3D0
        enddo
     enddo


     ! angle average of triangle value at a node

     call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I, AREA_SUM, NVAL, J, ITR )
     do ith=1,sml_nthreads
        do i=i_beg(ith),i_end(ith)
           area_sum=0D0
           nval=0D0
           do j=1, grid%num_t_node(i)
              itr=grid%tr_node(j,i)
              area_sum=area_sum+         grid%tr_area(itr)
              nval    =nval    +tmp(itr)*grid%tr_area(itr)
           enddo
           var(i)=nval/area_sum
        enddo
     enddo
     !
  enddo

  ! finalize
  deallocate(tmp)

end subroutine smooth_tr_connect

subroutine simple00(grid,psn)
  use sml_module
  use grid_class
  use psn_class
  use eq_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn

  real (8) :: area(grid%npsi00), dadp(grid%npsi00), dpdp00(grid%npsi00)
  integer :: npsi, i, bd1, bd2, bd3, bd4
  real (8) :: psi, in_bd_psi, out_bd_psi


  npsi = grid%npsi00

  ! prepare coefficient
  dadp = psn%vol00 / (sml_2pi*eq_axis_r*grid%dpsi00)  ! not real area -- approx sense.

  area(1)=dadp(1)  ! area defined in the middle point

  do i=2, npsi
     area(i)=area(i-1) + dadp(i)
  enddo

  ! prepare boundary
  bd1=1   !potential boundary - where potential is zero
  bd2=1
  bd3=npsi
  bd4=npsi
  do i=1, npsi
     psi=grid%psi00min + real(i-1)*grid%dpsi00

     in_bd_psi  = sml_inpsi  + sml_bd_ext_delta1*eq_x_psi
     out_bd_psi = sml_outpsi - sml_bd_ext_delta3*eq_x_psi
!     if(sml_rgn1_pot0_only) out_bd_psi=min(eq_x_psi*(1D0-sml_bd_ext_delta3), out_bd_psi)
     if(.true.) out_bd_psi=min(eq_x_psi*(1D0-sml_bd_ext_delta3), out_bd_psi) !enforce region1 only

     if(sml_zero_inner_bd/=1) in_bd_psi=-99D0  ! some minus number
     if( psi <= in_bd_psi  .and. psi + grid%dpsi00 >   in_bd_psi ) bd1=i
     if( psi < out_bd_psi  .and. psi + grid%dpsi00 >= out_bd_psi ) bd3=i+1

     in_bd_psi  = sml_inpsi  + sml_bd_ext_delta2*eq_x_psi
     out_bd_psi = sml_outpsi - sml_bd_ext_delta4*eq_x_psi
!     if(sml_rgn1_pot0_only) out_bd_psi=min(eq_x_psi*(1D0-sml_bd_ext_delta4), out_bd_psi)
     if(.true.) out_bd_psi=min(eq_x_psi*(1D0-sml_bd_ext_delta4), out_bd_psi) !enforce region1 only

     if(sml_zero_inner_bd/=1) in_bd_psi=-99D0  ! some minus number
     if( psi <= in_bd_psi  .and. psi + grid%dpsi00 >   in_bd_psi ) bd2=i
     if( psi < out_bd_psi  .and. psi + grid%dpsi00 >= out_bd_psi ) bd4=i+1
  enddo


  dpdp00=0D0  ! defined in the middle point
  do i=bd2 + 1, bd4  ! bd3?
     dpdp00(i) = dpdp00(i-1) + psn%cden00_1d(i)*dadp(i)/sml_2pi * grid%dpsi00
  enddo

  dpdp00(1:npsi-1)=dpdp00(1:npsi-1)*0.5D0*(dadp(1:npsi-1) + dadp(2:npsi)) / ( 2D0*area(1:npsi-1)*psn%mn_eb2(1:npsi-1) )


  psn%pot00_1d=0D0
  do i=bd1 + 1, bd3
     psn%pot00_1d(i) = psn%pot00_1d(i-1) + dpdp00(i-1)
  enddo

  psn%pot00_1d(1:bd3)=psn%pot00_1d(1:bd3) - psn%pot00_1d(bd3)

end subroutine simple00

subroutine init_simple00(grid,psn)
  use sml_module
  use grid_class
  use psn_class
  use eq_module
  use ptl_module
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: i
  real (8) :: psi, den

  allocate(psn%mn_eb2(grid%npsi00))

  do i=1, grid%npsi00

     psi=grid%psi00min + (real(i)-0.5D0)*grid%dpsi00
     den=eq_ftn(psi,eq_axis_r,eq_axis_z,eq_den)

     psn%mn_eb2(i)=-ptl_mass(1)/sml_e_charge*den/(eq_axis_b*eq_axis_b)
  enddo


end subroutine init_simple00


!***************************************************************************
!***************************************************************************
!***************************************************************************
!**************************************************************************
!***************************************************************************
!***************************************************************************


! smooth_pol and related routines are out of date
!
subroutine smooth_pol(grid,var)
  use grid_class
  use smooth_module
  implicit none
  type(grid_type):: grid
  real (8) :: var(grid%nnode)
  real (8), allocatable :: tmp(:)
  logical :: first=.true.
  save first

  print *, 'smoth_pol is out of date. Do not use it'
  stop


  if(first) then
     call init_smooth_pol_mat(grid)
     first=.false.
  endif

  allocate(tmp(grid%nnode))

  call mat_mult(smooth_pol_mat,var,tmp)
  var=tmp

  deallocate(tmp)
end subroutine smooth_pol

subroutine init_smooth_pol_mat(grid)
  use sml_module
  use grid_class
  use smooth_module
  implicit none
  type(grid_type):: grid
  integer :: iphi, irho
  real (8) :: rho, phi0, phi
  integer :: i, larmor
  real (8) :: x(2), x_ring(2)
  integer, parameter :: n_gyro=8
  real (8) :: dx_unit(2,n_gyro), dx_unit2(2)
  real (8) :: angle, dpdr, dpdz, dp, cosa, sina
  integer :: itr, node, ip
  real (8) :: p(3), weight
  !
  real (8), external :: psi_interpol

  call new_mat(smooth_pol_mat,grid%nnode,n_gyro*3+1)

  rho=grid%drho*real(irho)
  phi0=0.5D0*grid%delta_phi

  do larmor=1, n_gyro
     angle=sml_2pi/real(n_gyro)*real(larmor-1)
     dx_unit(:,larmor)=(/cos(angle),sin(angle)/)
  enddo

  do i=1, grid%nnode
     x=grid%x(:,i)

     ! self : 0.5-- minimum diagonal
     weight=0.5D0
     call set_value(smooth_pol_mat,i,i,weight,1)


     do larmor=1, n_gyro
        !find 8-point with rhoi
        dx_unit2(1)= dx_unit(1,larmor)
        dx_unit2(2)= dx_unit(2,larmor)

        x_ring = x + rho* dx_unit2(:)

        ! find triangle
        call search_tr2(grid,x_ring,itr,p)

        if(itr>0) then
           do ip=1,3
              weight=p(ip)/real(n_gyro)*0.5D0
              node=grid%nd(ip,itr)
              call set_value(smooth_pol_mat,i,node,weight,1)
           enddo
        endif
     enddo

  enddo

end subroutine init_smooth_pol_mat
!
! deprecated
!
#ifndef USE_NEW_PETSC_SOLVER
subroutine init_poisson_iter_solvers(grid,psn,iflag)
  use psn_class
  use grid_class
  use sml_module
  use ptl_module, only : ptl_mass
  use eq_module
  implicit none
#include <finclude/petsc.h>
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer :: iflag

 ! type(mat_type) :: matl, matr, matr2
  real (kind=8),allocatable :: alpha(:), beta(:), tiev(:), teev(:), den(:),b2(:),rhoi2(:)
  integer :: itr,nd(3),in_bd_node,nnz_row
  PetscErrorCode ierr
  real (kind=8) :: x_center(2),psi
  real (kind=8) :: factor, psi_in_poisson_bd, psi_out_poisson_bd
  real (kind=8), external :: gyro_radius2,psi_interpol,  b_interpol
  real (kind=8), parameter :: offset=2D0
  allocate(alpha(grid%ntriangle),beta(grid%ntriangle),tiev(grid%ntriangle),teev(grid%ntriangle),den(grid%ntriangle),b2(grid%ntriangle),rhoi2(grid%ntriangle) )
  nnz_row=sml_max_mat_width

  ! setup basic values
  do itr=1, grid%ntriangle
     nd=grid%nd(:,itr)
     x_center=( grid%x(:,nd(1)) + grid%x(:,nd(2)) + grid%x(:,nd(3)) )/3D0
#ifdef FLAT_GYRORADIUS
     x_center(1)=eq_axis_r
     x_center(2)=eq_axis_z
#endif
     rhoi2(itr)=gyro_radius2(x_center)**2
     psi=psi_interpol(x_center(1),x_center(2),0,0)
     tiev(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_tempi)
     teev(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_tempe)
     b2(itr)=b_interpol(x_center(1),x_center(2),0D0)**2
     den(itr)=eq_ftn(psi,x_center(1),x_center(2),eq_den)
  enddo

  ! 00 solver
  call check_point('Initialize 00 solver')
  if(.NOT. sml_use_simple00 ) then
!  if( .true. ) then
     ! **************************************
     ! initialize petsc_solver
     ! **************************************
     if(.true.) then ! redundently solve 00 modes solver
       psn%solver00%comm = sml_plane_comm
       psn%solver00%mype=sml_plane_mype
       psn%solver00%totalpe=sml_plane_totalpe
     else
       psn%solver00%comm = petsc_comm_world
       psn%solver00%mype=sml_mype
       psn%solver00%totalpe=sml_totalpe
     endif
     psn%solver00%prefix=1

     ! RHS
     psn%solver00%n_rhs_mat=2 ! only 00 solver needs two RHS
     call check_point('- 00 solver memory create')
     call create_solver(psn%solver00,grid%nnode,nnz_row,ierr)

     ! LHS
     ! Alpha and Beta
     alpha=ptl_mass(1)/sml_e_charge*den/b2 ! changed sign with new solver
     beta=den/teev  ! add diagonal term
     call helm_matrix( psn%solver00%Amat, alpha, beta, grid, psn%pbd0_2, sml_sheath_mode/=0,ierr)
     !        call helm_matrix( psn%solver00%Amat, alpha, beta, grid, psn%pbd0_2, .false. ,ierr)

     ! RHS - first
     alpha=0D0  ! no del term on RHS -- pade term removed
     beta=1D0  ! diagonal (identity)
     call helm_matrix(psn%solver00%rhs_mat,alpha,beta,grid,psn%cbd0_2,.false.,ierr)

     ! RHS - second
     alpha=0D0  ! no del term on RHS -- pade term removed
     beta=den/teev  ! diagona term same as LHS
     call helm_matrix(psn%solver00%rhs2_mat,alpha,beta,grid,psn%cbd0_2,.false.,ierr)
     !call init_solver_with_11mat(grid,psn,psn%solver00)
     call init_simple00(grid,psn)  ! use for initial condition
  else ! simple 00
     call init_simple00(grid,psn)
  endif
 
  ! turb solver
  call check_point('Initialize turb solver')
  if(sml_turb_poisson) then
     ! ************************
     ! initialize petsc_solver
     ! ************************
     psn%solverH%comm = sml_plane_comm
     psn%solverH%mype=sml_plane_mype
     psn%solverH%totalpe=sml_plane_totalpe
     psn%solverH%prefix=2

     ! RHS
     psn%solverH%n_rhs_mat=1
     call check_point('- turb solver memory create')
     call create_solver(psn%solverH,grid%nnode,nnz_row,ierr)

     ! LHS
     alpha=ptl_mass(1)/sml_e_charge*den/b2 ! changed sign with new solver
     beta=den/teev

     !	   call helm_matrix( psn%solverH%Amat,alpha, beta, grid, psn%pbdH_2, sml_sheath_mode/=0, ierr)
     call helm_matrix( psn%solverH%Amat,alpha, beta, grid, psn%pbdH_2, .false. , ierr)

     alpha=0D0		! no del term on RHS
     beta=1D0		! diagonal (identity)
     call helm_matrix( psn%solverH%rhs_mat, alpha, beta, grid, psn%cbdH_2, .false., ierr)

     !call init_solver_with_11mat(grid,psn,psn%solverH)
  endif

  deallocate(alpha,beta,tiev,teev,den,b2,rhoi2)

end subroutine init_poisson_iter_solvers

!
! deprecated
!
subroutine solve_poisson_iter(grid,psn,iflag)
  use psn_class
  use grid_class
  use sml_module
  use perf_monitor
  use smooth_module
  use omp_module, only: split_indices
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer,intent(in) :: iflag
  integer :: i, loop_num, iter
  PetscErrorCode ierr
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)
  real (kind=8), allocatable :: dentmp(:), dentmp2(:), dentmp3(:), dentmp4(:)
  real (kind=8), allocatable :: sendl(:),recvr(:)
  real (kind=8) :: t1, t2, t3
  save dentmp, dentmp2, dentmp3, dentmp4
  save sendl,recvr

  if(iflag==-1) then
     allocate(dentmp(grid%nnode),dentmp2(grid%nnode),dentmp3(grid%nnode),dentmp4(grid%nnode))
     allocate(sendl(grid%nnode),recvr(grid%nnode))
     dentmp=0D0 !for safty
     dentmp2=0D0
     dentmp3=0D0
     dentmp4=0D0
     if(sml_mype==0) print *, &
	'**************** Poisson_iter_solvers is used ******************'
     return
  endif

  call t_startf("POISSON_00MODE")


  ! get n0*******************************************************************************
  ! RHS density for n=0 mode --> save it as dentmp
  psn%cden00_1d = psn%iden00_1d-psn%eden00_1d


  if(iflag==1 .and. sml_electron_on) then
     dentmp=psn%idensity0-psn%edensity0
  else
     dentmp=psn%idensity0
  endif

  call set_boundary2_values(dentmp,0D0,psn%cbd0_2)


  ! solve n=0 field iteratively ************************************************************
!  if(sml_00_poisson) then ! solve n=0 poisson only when it is .true. -- for speed up
  if(.true.) then ! always true
     !if(sml_sheath_mode/=0) call apply_wall_boundary_condition(grid,psn,dentmp)


     ! set initial value - from simple00
     call simple00(grid,psn)
     call convert_001d_2_grid(grid,psn%pot00_1d,psn%pot0)

     ! solve iterativley
     do iter=1, sml_iter_solver_niter

        ! update RHS
        call convert_grid_2_001d(grid,psn%pot0,psn%pot00_1d)
        call convert_001d_2_grid(grid,psn%pot00_1d,dentmp2)  !dentmp2 is flux averaged solution

        ! write out 1d solutions - convergence test
        if(.false.) then
           if(sml_mype==0) then
              if(iter==1) open(unit=567,file='itersol.txt',status='replace')
              write(567,1000) psn%pot00_1d
1000          format(400(e19.12,1x))
              if(iter==sml_iter_solver_niter) close(567)
           endif
        endif
        ! solve poisson eq
        call t_startf("POISSON_00MODE_SOLVE")
        call petsc_solve(grid%nnode,dentmp,dentmp2,psn%pot0,psn%solver00%comm,psn%solver00,ierr)
        call t_stopf("POISSON_00MODE_SOLVE")

     enddo


     !extract 00 solution
     call convert_grid_2_001d(grid,psn%pot0,psn%pot00_1d)
     call convert_001d_2_grid(grid,psn%pot00_1d,dentmp2)  !dentmp2 is flux averaged solution
     psn%pot0m=psn%pot0
     ! (pot0 - dentmp2) has poloidal variations. This should be moved to dpot
  endif

  call t_stopf("POISSON_00MODE")

  ! solve turbulence field**********************************************************************************
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

  if(sml_turb_poisson) then
     call t_startf("POISSON_TURB")

     ! For all toroidal section
     ! the case when phitmp=>psn%pot0 can be eliminated. No function now.

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
     do ith=1,sml_nthreads
        if(iflag==1 .and. sml_electron_on) then
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - psn%edensity(i,1) - dentmp(i))
              !/psn%density0_full(:)  ! ni - ne - <ni - ne>
           enddo
        else ! iflag==2 .or. no electron
!           call convert_001d_2_grid(grid,psn%iden00_1d,dentmp) !dentmp becomes flux averaged ion 00 density
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - psn%idensity0(i))
              !  ni - <ni>
           enddo
        endif
     enddo

     ! smoothing of charge density -- RHS
     !call smooth_tr_connect(grid,dentmp3)
     !call smooth_pol(grid,dentmp3)
     call set_boundary2_values(dentmp3,0D0,psn%cbdH_2)

     psn%dden(:,1)=dentmp3  ! this can be optimized. redundant memory usage.
     call t_startf("POISSON_TURB_SOLVE")
     call petsc_solve(grid%nnode,dentmp3,0,psn%dpot(:,1),psn%solverH%comm,psn%solverH,ierr)
     call t_stopf("POISSON_TURB_SOLVE")


     ! move m/=0 component to dpot
     psn%dpot(:,1)=psn%dpot(:,1)+psn%pot0 - dentmp2
     psn%pot0=dentmp2


     ! call smooth_pol0(grid,psn%dpot(:,1),smoothH)
     ! call smooth_r(psn%dpot(:,1),grid%rtmp1,smooth_r1,grid)
     ! call smooth_nearx(psn%dpot(:,1),smooth_nearx1,grid)
     !call smooth_tr_connect(grid,psn%dpot(:,1))
     !call smooth_pol(grid,psn%dpot(:,1))


     !************************* extract 00 mode **********************************************************
     !call t_startf("EXTRACT_00MODE")
     !call extract_00mode(grid,psn%dpot)
     !call t_stopf("EXTRACT_00MODE")

     !**************************** mode selection*********************************************************
     if(sml_mode_select_on==1) then
        call mode_selection(sml_mode_select_n,grid,psn)
     endif

     call t_stopf("POISSON_TURB")
  endif


  !********************** Phase 4 *********************************************
  !send and receive potential value to and from adjacent PE
  !***************************************************************************
  if(sml_turb_poisson) then
     call t_startf("POISSON_SR_POT")
     call send_recv_potential( psn%dpot, recvr,sendl,grid%nnode)
     call t_stopf("POISSON_SR_POT")
  endif

  !******************** Phase 5 ***********************************************
  ! get space derivative of potential
  ! psn%E_para(node,1:nphi )  psn%E_perp(2 vec, ntriangle, 0:nphi)
  !***************************************************************************
  if(sml_add_pot0/=0) then
     if(sml_replace_pot0) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads

           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%add_pot0(i)
           enddo

        enddo
     else
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads

           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%pot0(i)+psn%add_pot0(i)
           enddo

        enddo
     endif
  endif

end subroutine solve_poisson_iter
#endif
! 
! not used
!
subroutine solve_poisson_private_old_simple(grid,psn,iflag)
  use psn_class
  use grid_class
  use sml_module
  use perf_monitor
  use smooth_module
  use omp_module, only: split_indices
  implicit none
  type(grid_type) :: grid
  type(psn_type) :: psn
  integer,intent(in) :: iflag
  integer :: i, loop_num,ierr
  integer :: ith, i_beg(sml_nthreads), i_end(sml_nthreads)
  real (kind=8), allocatable :: dentmp(:), dentmp2(:), dentmp3(:), dentmp4(:)
  real (kind=8), allocatable :: sendl(:),recvr(:)
  real (kind=8) :: t1, t2, t3
  save dentmp, dentmp2, dentmp3, dentmp4
  save sendl,recvr

  if(iflag==-1) then
     allocate(dentmp(grid%nnode),dentmp2(grid%nnode),dentmp3(grid%nnode),dentmp4(grid%nnode))
     allocate(sendl(grid%nnode),recvr(grid%nnode))
     dentmp=0D0 !for safty
     dentmp2=0D0
     dentmp3=0D0
     dentmp4=0D0
     if(sml_mype==0) print *, &
	'**************** 1 field Poisson static variables initialized  ******************'
     return
  endif

  ! get n0***********************************************************************************
  ! RHS density for 00 mode

  if(iflag==1) then  ! solve 00-mode
     call t_startf("POISSON_00MODE")
     psn%cden00_1d = psn%iden00_1d-psn%eden00_1d
     call convert_001d_2_grid(grid,psn%cden00_1d,dentmp)

     !if(sml_zero_out_total_charge) then  ! need opimization
     !   call zero_out_total_charge(grid,psn,dentmp,dentmp4) ! zero out totoal charge - dentmp4= dentmp - int(dentmp)/total_vol
     !else
     !   dentmp4=dentmp
     !endif

     call set_boundary2_values(dentmp,0D0,psn%cbd0_2)


     ! solve 00 (neoclassical) field*********************************************************
     if(sml_00_poisson) then ! solve 00 poisson only when it is .true. -- for speed up
!        if(sml_sheath_mode/=0) call apply_wall_boundary_condition(grid,psn,dentmp)
        if(.not. sml_use_simple00) then
           call t_startf("POISSON_00MODE_SOLVE")
           call petsc_solve(grid%nnode,dentmp,0,psn%pot0,psn%solver00%comm,psn%solver00,ierr)
           call t_stopf("POISSON_00MODE_SOLVE")

           call convert_grid_2_001d(grid,psn%pot0,psn%pot00_1d)
        else
           call simple00(grid,psn)
        endif
        call convert_001d_2_grid(grid,psn%pot00_1d,psn%pot0)
     endif
     call t_stopf("POISSON_00MODE")
  endif

  ! solve turbulence field*******************************************************************
  call split_indices(grid%nnode, sml_nthreads, i_beg, i_end)

  if(sml_turb_poisson) then
     call t_startf("POISSON_TURB")

     ! For all toroidal section
     ! the case when phitmp=>psn%pot0 can be eliminated. No function now.

!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
     do ith=1,sml_nthreads
        if(iflag==1 .and. sml_electron_on) then
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - psn%edensity(i,1) - dentmp(i))
              !/psn%density0_full(:)  ! ni - ne - <ni - ne>
           enddo
        else ! iflag==2 .or. no electron
           call convert_001d_2_grid(grid,psn%iden00_1d,dentmp) !dentmp becomes flux averaged ion 00 density
           do i=i_beg(ith),i_end(ith)
              dentmp3(i)= (psn%idensity(i,1) - dentmp(i))
#ifdef MAX_DDEN
           if(dentmp3(i) > 2d0*dentmp(i)) dentmp3(i)=2d0*dentmp(i) ! factor 2 genralize later
#endif
              !  ni - <ni>
           enddo
        endif
     enddo

     ! smoothing of charge density -- RHS
     !call smooth_tr_connect(grid,dentmp3)
     !call smooth_pol(grid,dentmp3)
     call set_boundary2_values(dentmp3,0D0,psn%cbdH_2)

     psn%dden(:,1)=dentmp3  ! this can be optimized. redundant memory usage.
     call t_startf("POISSON_TURB_SOLVE")
     call petsc_solve(grid%nnode,dentmp3,0,psn%dpot(:,1),psn%solverH%comm,psn%solverH,ierr)
     call t_stopf("POISSON_TURB_SOLVE")

     ! call smooth_pol0(grid,psn%dpot(:,1),smoothH)
     ! call smooth_r(psn%dpot(:,1),grid%rtmp1,smooth_r1,grid)
     ! call smooth_nearx(psn%dpot(:,1),smooth_nearx1,grid)
     call smooth_tr_connect(grid,psn%dpot(:,1))
     !call smooth_pol(grid,psn%dpot(:,1))


     !************************* extract 00 mode *********************************************
     call t_startf("EXTRACT_00MODE")
     call extract_00mode(grid,psn%dpot)
     call t_stopf("EXTRACT_00MODE")

     !**************************** mode selection********************************************
     if(sml_mode_select_on==1) then
        call mode_selection(sml_mode_select_n,grid,psn)
     endif

     call t_stopf("POISSON_TURB")
  endif


  !********************** Phase 4 *********************************************
  !send and receive potential value to and from adjacent PE
  !***************************************************************************
  if(sml_turb_poisson) then
     call t_startf("POISSON_SR_POT")
     call send_recv_potential( psn%dpot, recvr,sendl,grid%nnode)
     call t_stopf("POISSON_SR_POT")
  endif

  !******************** Phase 5 ***********************************************
  ! get space derivative of potential
  ! psn%E_para(node,1:nphi )  psn%E_perp(2 vec, ntriangle, 0:nphi)
  !***************************************************************************
  if(sml_add_pot0/=0) then
     if(sml_replace_pot0) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads
           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%add_pot0(i)
           enddo
        enddo
     else
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
        do ith=1,sml_nthreads
           do i=i_beg(ith),i_end(ith)
              psn%pot0(i)=psn%pot0(i)+psn%add_pot0(i)
           enddo
        enddo
     endif
  endif

end subroutine solve_poisson_private_old_simple