!=======================================================================
! Message-passing routines for parallel processing using MPI
!
! messenger_init()
! messenger_free()
!
! barrier()
! bcast(A, na)
! globalSum(A, na)
! globalMax(A, na)
! globalMin(A, na)
! globalDirSum(A, na, ixyz)
! globalDirMax(A, na, ixyz)
!
! scatter(A, B)
! gather(A, B)
! allGather(A, B)
! scatterXY(A, B, nk)
! gatherXY(A, B, nk)
! allGatherXY(A, B, nk)
! transpose(R1, R2, isign)
! transposeA(R1, R2, isign)
! updateBorder(A, n1,n2,n3, ixyz, ijk)
! updateBorder2(A, n1,n2,n3, ixyz, ijk)
! updateBoundary(A, n1,n2,n3, ixyz, ijk)
!
! triDiagonalM_(ixyz, a, b, c, r, n, lot)
! triDiagonalM_BC_(ixyz, a, b, c, r, n, lot)
! triDiagonalLUM_(ixyz, a, b, c, work, n, lot)
! triLUSolveM_(ixyz, a, b, c, r, work, n, lot)
! triLUSolveM_BC_(ixyz, a, b, c, r, work, n, lot)
! pentaDiagonalM_(ixyz, a, b, c, d, e, r, n, lot)
!
module messenger
    use data_export

    integer myid            ! my process rank
    integer idroot          ! rank of root process

    ! Grid topology
    integer icomm_grid      ! comm for grid topology
    integer icomm_xyz(3)    ! directional subcomms
    integer icoord(3,nproc) ! proc grid coordinates

    real wallTime

end module

!=======================================================================
subroutine messenger_init()
    use messenger
    include "mpif.inc"

    integer idims(3)
    logical Lperiodic(3)
    logical Lremain_dims(3)

    ! Initialize MPI
    call MPI_init (ierr)
    call MPI_comm_size (MPI_COMM_WORLD, np, ierr)
    call MPI_comm_rank (MPI_COMM_WORLD, myid, ierr)
    ! write(*,*) np,nproc
    if (np .ne. nproc) stop "Wrong number of processors"

    ! Grid topology
    ndims = 3
    idims(1) = npx
    idims(2) = npy
    idims(3) = 1
    Lperiodic = .true.
    call MPI_Cart_create(MPI_COMM_WORLD, ndims, idims, Lperiodic, .false., &
                         icomm_grid, ierr)
    do ip=1,nproc
        call MPI_Cart_coords(icomm_grid, ip-1, ndims, icoord(1,ip), ierr)
    end do
    icoord = icoord + 1
    call MPI_comm_rank (icomm_grid, irank, ierr)
    irank = irank + 1
    iblock = icoord(1, irank)
    jblock = icoord(2, irank)
    kblock = icoord(3, irank)

    ! Directional subcomms
    do ixyz=1,3
        Lremain_dims(:) = .false.
        Lremain_dims(ixyz) = .true.
        call MPI_Cart_sub (icomm_grid, Lremain_dims, icomm_xyz(ixyz), ierr)
    end do

    ! Root process at coordinates (0,0,0)
    idims = 0
    call MPI_Cart_rank(icomm_grid, idims, idroot, ierr)
    iroot = idroot + 1

    ! Save current time
    wallTime = MPI_wtime()

    return
end

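! Note on the conventions established above:
!  - the process grid is npx x npy x 1 and is created periodic in all
!    directions; the coordinates in icoord and the indices iblock,
!    jblock, kblock are 1-based,
!  - idroot is the rank of the process at Cartesian coordinates (0,0,0)
!    and is used as the root of the broadcast and scatter/gather
!    operations below,
!  - icomm_xyz(1:3) are the directional sub-communicators used by the
!    directional reductions and the distributed line solvers.
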
subroutine messenger_free()
    use messenger
    include "mpif.inc"

    ! Report time used
    print "(a,f8.2)", "time: ", MPI_wtime() - wallTime

    ! Finalize MPI
    call MPI_finalize (ierr)

    return
end

!=======================================================================
subroutine barrier()
    use messenger
    include "mpif.inc"

    call MPI_barrier (MPI_COMM_WORLD, ierr)

    return
end

subroutine bcast(A, na)
    use messenger
    include "mpif.inc"
    real A(na)

    call MPI_bcast (A, na, MPI_REAL8, idroot, MPI_COMM_WORLD, ierr)

    return
end

subroutine globalSum(A, na)
    use messenger
    include "mpif.inc"
    real A(na)
    real buf(na)

    call MPI_AllReduce (A, buf, na, MPI_REAL8, &
                        MPI_SUM, MPI_COMM_WORLD, ierr)
    A = buf

    return
end

subroutine globalMax(A, na)
    use messenger
    include "mpif.inc"
    real A(na)
    real buf(na)

    call MPI_AllReduce (A, buf, na, MPI_REAL8, &
                        MPI_MAX, MPI_COMM_WORLD, ierr)
    A = buf

    return
end

subroutine globalMin(A, na)
    use messenger
    include "mpif.inc"
    real A(na)
    real buf(na)

    call MPI_AllReduce (A, buf, na, MPI_REAL8, &
                        MPI_MIN, MPI_COMM_WORLD, ierr)
    A = buf

    return
end

subroutine globalDirSum(A, na, ixyz)
    use messenger
    include "mpif.inc"
    real A(na)
    real buf(na)

    call MPI_AllReduce (A, buf, na, MPI_REAL8, &
                        MPI_SUM, icomm_xyz(ixyz), ierr)
    A = buf

    return
end

subroutine globalDirMax(A, na, ixyz)
    use messenger
    include "mpif.inc"
    real A(na)
    real buf(na)

    call MPI_AllReduce (A, buf, na, MPI_REAL8, &
                        MPI_MAX, icomm_xyz(ixyz), ierr)
    A = buf

    return
end

!=======================================================================
subroutine scatter(A, B)
    use messenger
    include "mpif.inc"
    real A(nx,ny,nz), B(nx_1,ny_1,nz_1)
    real, allocatable :: sendbuf(:)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc), &
            k1(nproc), k2(nproc)

    ! Scatter an array among the processors
    ! including overlapping borders

    if (myid == idroot) then
        do ip=1,nproc
            i1(ip) = ibmino_1(icoord(1,ip))
            j1(ip) = jbmino_1(icoord(2,ip))
            k1(ip) = kbmino_1(icoord(3,ip))
        end do
        i2 = i1 + nx_1 - 1
        j2 = j1 + ny_1 - 1
        k2 = k1 + nz_1 - 1

        allocate (sendbuf(nproc*nxyz_1))

        L = 0
        do ip=1,nproc
            do k=k1(ip),k2(ip)
            do j=j1(ip),j2(ip)
            do i=i1(ip),i2(ip)
                L = L + 1
                sendbuf(L) = A(i,j,k)
            end do
            end do
            end do
        end do
    end if

    call MPI_scatter (sendbuf, nxyz_1, MPI_REAL8, &
                      B, nxyz_1, MPI_REAL8, &
                      idroot, icomm_grid, ierr)

    call updateBorder(B, nx_1,ny_1,nz_1, 1, 1)
    call updateBorder(B, nx_1,ny_1,nz_1, 2, 2)
    call updateBorder(B, nx_1,ny_1,nz_1, 3, 3)

    if (myid == idroot) deallocate (sendbuf)

    return
end

subroutine gather(A, B)
    use messenger
    include "mpif.inc"
    real A(nx_1,ny_1,nz_1), B(nx,ny,nz)
    real, allocatable :: recvbuf(:,:,:,:)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc), &
            k1(nproc), k2(nproc)

    ! Gather a distributed array to the root process
    ! including overlapping borders

    call updateBorder(A, nx_1,ny_1,nz_1, 1, 1)
    call updateBorder(A, nx_1,ny_1,nz_1, 2, 2)
    call updateBorder(A, nx_1,ny_1,nz_1, 3, 3)

    if (myid == idroot) allocate (recvbuf(nx_1,ny_1,nz_1,nproc))

    call MPI_gather (A, nxyz_1, MPI_REAL8, &
                     recvbuf, nxyz_1, MPI_REAL8, &
                     idroot, MPI_COMM_WORLD, ierr)

    if (myid == idroot) then
        do ip=1,nproc
            i1(ip) = ibmino_1(icoord(1,ip))
            j1(ip) = jbmino_1(icoord(2,ip))
            k1(ip) = kbmino_1(icoord(3,ip))
            i2(ip) = ibmaxo_1(icoord(1,ip))
            j2(ip) = jbmaxo_1(icoord(2,ip))
            k2(ip) = kbmaxo_1(icoord(3,ip))
        end do

        do ip=1,nproc
            do k=k1(ip),k2(ip)
            do j=j1(ip),j2(ip)
            do i=i1(ip),i2(ip)
                kk = k-k1(ip)+1
                jj = j-j1(ip)+1
                ii = i-i1(ip)+1
                B(i,j,k) = recvbuf(ii,jj,kk,ip)
            end do
            end do
            end do
        end do

        deallocate (recvbuf)
    end if

    return
end

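!-----------------------------------------------------------------------
! Illustrative sketch (not called anywhere in this code): how a
! directional reduction with globalDirSum might be used to sum a
! per-process scalar over the x-direction sub-communicator, i.e. over
! all blocks that share the same y- and z-coordinates.  The routine
! name and its argument are placeholders introduced only for
! illustration.
subroutine dirSumExample(a)
    use messenger
    real a
    real buf(1)

    buf(1) = a
    call globalDirSum(buf, 1, 1)   ! sum over the x-direction subcomm
    a = buf(1)

    return
end
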
subroutine allGather(A, B)
    use messenger
    include "mpif.inc"
    real A(nx_1,ny_1,nz_1), B(nx,ny,nz)
    real, allocatable :: recvbuf(:,:,:,:)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc), &
            k1(nproc), k2(nproc)

    ! Gather a distributed array onto all processes
    ! including overlapping borders

    call updateBorder(A, nx_1,ny_1,nz_1, 1, 1)
    call updateBorder(A, nx_1,ny_1,nz_1, 2, 2)
    call updateBorder(A, nx_1,ny_1,nz_1, 3, 3)

    allocate (recvbuf(nx_1,ny_1,nz_1,nproc))

    call MPI_allGather (A, nxyz_1, MPI_REAL8, &
                        recvbuf, nxyz_1, MPI_REAL8, &
                        MPI_COMM_WORLD, ierr)

    do ip=1,nproc
        i1(ip) = ibmino_1(icoord(1,ip))
        j1(ip) = jbmino_1(icoord(2,ip))
        k1(ip) = kbmino_1(icoord(3,ip))
        i2(ip) = ibmaxo_1(icoord(1,ip))
        j2(ip) = jbmaxo_1(icoord(2,ip))
        k2(ip) = kbmaxo_1(icoord(3,ip))
    end do

    do ip=1,nproc
        do k=k1(ip),k2(ip)
        do j=j1(ip),j2(ip)
        do i=i1(ip),i2(ip)
            kk = k-k1(ip)+1
            jj = j-j1(ip)+1
            ii = i-i1(ip)+1
            B(i,j,k) = recvbuf(ii,jj,kk,ip)
        end do
        end do
        end do
    end do

    deallocate (recvbuf)

    return
end

!=======================================================================
subroutine scatterXY(A, B, nk)
    use messenger
    include "mpif.inc"
    real A(nx,ny,nk), B(nx_1,ny_1,nk)
    real sendbuf(nproc*nx_1*ny_1*nk)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc)

    ! Scatter an XY-array among the processors
    ! including overlapping borders

    if (myid == idroot) then
        do ip=1,nproc
            i1(ip) = ibmino_1(icoord(1,ip))
            j1(ip) = jbmino_1(icoord(2,ip))
        end do
        i2 = i1 + nx_1 - 1
        j2 = j1 + ny_1 - 1

        L = 0
        do ip=1,nproc
            do k=1,nk
            do j=j1(ip),j2(ip)
            do i=i1(ip),i2(ip)
                L = L + 1
                sendbuf(L) = A(i,j,k)
            end do
            end do
            end do
        end do
    end if

    call MPI_scatter (sendbuf, nx_1*ny_1*nk, MPI_REAL8, &
                      B, nx_1*ny_1*nk, MPI_REAL8, &
                      idroot, icomm_grid, ierr)

    call updateBorder(B, nx_1,ny_1,nk, 1, 1)
    call updateBorder(B, nx_1,ny_1,nk, 2, 2)

    return
end

subroutine gatherXY(A, B, nk)
    use messenger
    include "mpif.inc"
    real A(nx_1,ny_1,nk), B(nx,ny,nk)
    real recvbuf(nx_1,ny_1,nk,nproc)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc)

    ! Gather a distributed XY-array to the root process
    ! including overlapping borders

    call updateBorder(A, nx_1,ny_1,nk, 1, 1)
    call updateBorder(A, nx_1,ny_1,nk, 2, 2)

    call MPI_gather (A, nx_1*ny_1*nk, MPI_REAL8, &
                     recvbuf, nx_1*ny_1*nk, MPI_REAL8, &
                     idroot, MPI_COMM_WORLD, ierr)

    if (myid == idroot) then
        do ip=1,nproc
            i1(ip) = ibmino_1(icoord(1,ip))
            j1(ip) = jbmino_1(icoord(2,ip))
            i2(ip) = ibmaxo_1(icoord(1,ip))
            j2(ip) = jbmaxo_1(icoord(2,ip))
        end do

        do ip=1,nproc
            do k=1,nk
            do j=j1(ip),j2(ip)
            do i=i1(ip),i2(ip)
                jj = j-j1(ip)+1
                ii = i-i1(ip)+1
                B(i,j,k) = recvbuf(ii,jj,k,ip)
            end do
            end do
            end do
        end do
    end if

    return
end

subroutine allGatherXY(A, B, nk)
    use messenger
    include "mpif.inc"
    real A(nx_1,ny_1,nk), B(nx,ny,nk)
    real recvbuf(nx_1,ny_1,nk,nproc)
    integer i1(nproc), i2(nproc), &
            j1(nproc), j2(nproc)

    ! Gather a distributed XY-array onto all processes
    ! including overlapping borders

    call updateBorder(A, nx_1,ny_1,nk, 1, 1)
    call updateBorder(A, nx_1,ny_1,nk, 2, 2)

    call MPI_allGather (A, nx_1*ny_1*nk, MPI_REAL8, &
                        recvbuf, nx_1*ny_1*nk, MPI_REAL8, &
                        MPI_COMM_WORLD, ierr)

    do ip=1,nproc
        i1(ip) = ibmino_1(icoord(1,ip))
        j1(ip) = jbmino_1(icoord(2,ip))
        i2(ip) = ibmaxo_1(icoord(1,ip))
        j2(ip) = jbmaxo_1(icoord(2,ip))
    end do

    do ip=1,nproc
        do k=1,nk
        do j=j1(ip),j2(ip)
        do i=i1(ip),i2(ip)
            jj = j-j1(ip)+1
            ii = i-i1(ip)+1
            B(i,j,k) = recvbuf(ii,jj,k,ip)
        end do
        end do
        end do
    end do

    return
end

!=======================================================================
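! The two transpose routines below redistribute data for the Poisson
! solver between a layout in which each process holds the full z extent
! (R1, x and y distributed) and one in which it holds the full x extent
! (R2, z distributed).  The exchange is an MPI_AllToAll over a
! sub-communicator spanning the x and z directions of the process grid;
! since the grid has extent 1 in z, this is the row of npx processes
! that share the same y block.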
subroutine transpose(R1, R2, isign)
    use messenger
    include "mpif.inc"
    real R1(nz,nx_1,ny_1), R2(nx,ny_2,nz_2)
    real sendbuf(nx_1,ny_1,niz_2,nbz_2), &
         recvbuf(nx_1,ny_2,niz_2,nbx_1)
    integer i1(nbx_1), i2(nbx_1), &
            k1(nbz_2), k2(nbz_2)
    logical Lremain_dims(3)
    integer, save :: icomm = MPI_COMM_NULL

    ! Transpose for the poisson equation
    ! A(block,block,*) ==> B(*,block,block)

    if (icomm == MPI_COMM_NULL) then
        Lremain_dims(1) = .true.
        Lremain_dims(2) = .false.
        Lremain_dims(3) = .true.
        call MPI_Cart_sub (icomm_grid, Lremain_dims, icomm, ierr)
    end if

    do ib=1,nbx_1
        i1(ib) = ibmino_1(ib)
        i2(ib) = ibmaxo_1(ib)
    end do
    do kb=1,nbz_2
        k1(kb) = kbmin_2(kb)
        k2(kb) = kbmax_2(kb)
    end do

    isendcount = nx_1*ny_1*niz_2
    irecvcount = nx_1*ny_2*niz_2

    if (isign == +1) then

        do ip=1,npx
            do k=k1(ip),k2(ip)
            do j=1,ny_1
            do i=1,nx_1
                kk = k-k1(ip)+1
                sendbuf(i,j,kk,ip) = R1(k,i,j)
            end do
            end do
            end do
        end do

        call MPI_AllToAll (sendbuf, isendcount, MPI_REAL8, &
                           recvbuf, irecvcount, MPI_REAL8, &
                           icomm, ierr)

        do ip=1,npx
            do k=1,niz_2
            do j=1,ny_2
            do i=i1(ip),i2(ip)
                ii = i-i1(ip)+1
                R2(i,j,k+noz) = recvbuf(ii,j,k,ip)
            end do
            end do
            end do
        end do

    else if (isign == -1) then

        do ip=1,npx
            do k=1,niz_2
            do j=1,ny_2
            do i=i1(ip),i2(ip)
                ii = i-i1(ip)+1
                recvbuf(ii,j,k,ip) = R2(i,j,k+noz)
            end do
            end do
            end do
        end do

        call MPI_AllToAll (recvbuf, irecvcount, MPI_REAL8, &
                           sendbuf, isendcount, MPI_REAL8, &
                           icomm, ierr)

        do ip=1,npx
            do k=k1(ip),k2(ip)
            do j=1,ny_1
            do i=1,nx_1
                kk = k-k1(ip)+1
                R1(k,i,j) = sendbuf(i,j,kk,ip)
            end do
            end do
            end do
        end do

    end if

    return
end

subroutine transposeA(R1, R2, isign)
    use messenger
    include "mpif.inc"
    real R1(nx_1,ny_1,nz), R2(nx,ny_2,nz_2)
    real sendbuf(nx_1,ny_1,niz_2,nbz_2), &
         recvbuf(nx_1,ny_2,niz_2,nbx_1)
    integer i1(nbx_1), i2(nbx_1), &
            k1(nbz_2), k2(nbz_2)
    logical Lremain_dims(3)
    integer, save :: icomm = MPI_COMM_NULL

    ! Transpose for the poisson equation
    ! A(block,block,*) ==> B(*,block,block)

    if (icomm == MPI_COMM_NULL) then
        Lremain_dims(1) = .true.
        Lremain_dims(2) = .false.
        Lremain_dims(3) = .true.
        call MPI_Cart_sub (icomm_grid, Lremain_dims, icomm, ierr)
    end if

    do ib=1,nbx_1
        i1(ib) = ibmino_1(ib)
        i2(ib) = ibmaxo_1(ib)
    end do
    do kb=1,nbz_2
        k1(kb) = kbmin_2(kb)
        k2(kb) = kbmax_2(kb)
    end do

    isendcount = nx_1*ny_1*niz_2
    irecvcount = nx_1*ny_2*niz_2

    if (isign == +1) then

        do ip=1,npx
            do k=k1(ip),k2(ip)
            do j=1,ny_1
            do i=1,nx_1
                kk = k-k1(ip)+1
                sendbuf(i,j,kk,ip) = R1(i,j,k)
            end do
            end do
            end do
        end do

        call MPI_AllToAll (sendbuf, isendcount, MPI_REAL8, &
                           recvbuf, irecvcount, MPI_REAL8, &
                           icomm, ierr)

        do ip=1,npx
            do k=1,niz_2
            do j=1,ny_2
            do i=i1(ip),i2(ip)
                ii = i-i1(ip)+1
                R2(i,j,k+noz) = recvbuf(ii,j,k,ip)
            end do
            end do
            end do
        end do

    else if (isign == -1) then

        do ip=1,npx
            do k=1,niz_2
            do j=1,ny_2
            do i=i1(ip),i2(ip)
                ii = i-i1(ip)+1
                recvbuf(ii,j,k,ip) = R2(i,j,k+noz)
            end do
            end do
            end do
        end do

        call MPI_AllToAll (recvbuf, irecvcount, MPI_REAL8, &
                           sendbuf, isendcount, MPI_REAL8, &
                           icomm, ierr)

        do ip=1,npx
            do k=k1(ip),k2(ip)
            do j=1,ny_1
            do i=1,nx_1
                kk = k-k1(ip)+1
                R1(i,j,k) = sendbuf(i,j,kk,ip)
            end do
            end do
            end do
        end do

    end if

    return
end

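! updateBorder exchanges the overlapping border planes of a block with
! its lower and upper neighbours in one grid direction (ixyz) along one
! array index (ijk), using MPI_Cart_shift to locate the neighbours and
! MPI_sendrecv for the exchange.  The z direction is periodic; in x and
! y the exchange is skipped at the first and last block.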
subroutine updateBorder(A, n1,n2,n3, ixyz, ijk)
    use messenger
    include "mpif.inc"
    real A(n1,n2,n3)
    real, allocatable :: buf1(:,:,:), buf2(:,:,:)
    integer istatus(MPI_STATUS_SIZE)

    ! Update overlapping borders

    i1 = imap_1(ibmin_1(iblock_1))
    i2 = imap_1(ibmax_1(iblock_1))
    j1 = jmap_1(jbmin_1(jblock_1))
    j2 = jmap_1(jbmax_1(jblock_1))
    k1 = kmap_1(kbmin_1(kblock_1))
    k2 = kmap_1(kbmax_1(kblock_1))

    select case (ixyz)
    case (1)
        no = nox
        nn = nx_1
        ia = i1
        ib = i2
        ilower = 0
        iupper = 0
        if (iblock_1 .ne. 1) ilower = 1
        if (iblock_1 .ne. nbx_1) iupper = 1
    case (2)
        no = noy
        nn = ny_1
        ia = j1
        ib = j2
        ilower = 0
        iupper = 0
        if (jblock_1 .ne. 1) ilower = 1
        if (jblock_1 .ne. nby_1) iupper = 1
    case (3)
        no = noz
        nn = nz_1
        ia = k1
        ib = k2
        ! Periodic direction
        ilower = 1
        iupper = 1
    case default
        stop "updateBorder: invalid value for ixyz"
    end select

    select case (ijk)
    case (1)
        if (n1 .ne. nn) stop "updateBorder: bad value for n1"
        allocate(buf1(no,n2,n3), buf2(no,n2,n3))
        icount = no*n2*n3
        buf1 = A(ia:ia+no-1,:,:)
    case (2)
        if (n2 .ne. nn) stop "updateBorder: bad value for n2"
        allocate(buf1(n1,no,n3), buf2(n1,no,n3))
        icount = n1*no*n3
        buf1 = A(:,ia:ia+no-1,:)
    case (3)
        if (n3 .ne. nn) stop "updateBorder: bad value for n3"
        allocate(buf1(n1,n2,no), buf2(n1,n2,no))
        icount = n1*n2*no
        buf1 = A(:,:,ia:ia+no-1)
    case default
        stop "updateBorder: invalid value for ijk"
    end select

    ! Send to lower neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, -1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (iupper == 1) &
            A(ib+1:ib+no,:,:) = buf2
        A(ib+no+1:n1,:,:) = 0.
        buf1 = A(ib-no+1:ib,:,:)
    case (2)
        if (iupper == 1) &
            A(:,ib+1:ib+no,:) = buf2
        A(:,ib+no+1:n2,:) = 0.
        buf1 = A(:,ib-no+1:ib,:)
    case (3)
        if (iupper == 1) &
            A(:,:,ib+1:ib+no) = buf2
        A(:,:,ib+no+1:n3) = 0.
        buf1 = A(:,:,ib-no+1:ib)
    end select

    ! Send to upper neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, +1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (ilower == 1) &
            A(ia-no:ia-1,:,:) = buf2
    case (2)
        if (ilower == 1) &
            A(:,ia-no:ia-1,:) = buf2
    case (3)
        if (ilower == 1) &
            A(:,:,ia-no:ia-1) = buf2
    end select

    deallocate(buf1, buf2)

    return
end

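! updateBorder2 performs the same exchange for arrays carrying one
! extra ghost cell in the given direction (declared extent nx_1+2,
! ny_1+2 or nz_1+2), so the overlap width is no+1; at non-periodic
! block edges the outermost ghost plane is set to zero instead of
! being received from a neighbour.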
subroutine updateBorder2(A, n1,n2,n3, ixyz, ijk)
    use messenger
    include "mpif.inc"
    real A(n1,n2,n3)
    real, allocatable :: buf1(:,:,:), buf2(:,:,:)
    integer istatus(MPI_STATUS_SIZE)

    ! Update overlapping borders

    i1 = imap_1(ibmin_1(iblock_1))
    i2 = imap_1(ibmax_1(iblock_1))
    j1 = jmap_1(jbmin_1(jblock_1))
    j2 = jmap_1(jbmax_1(jblock_1))
    k1 = kmap_1(kbmin_1(kblock_1))
    k2 = kmap_1(kbmax_1(kblock_1))

    select case (ixyz)
    case (1)
        no = nox + 1
        nn = nx_1 + 2
        ia = i1 + 1
        ib = i2 + 1
        ilower = 0
        iupper = 0
        if (iblock_1 .ne. 1) ilower = 1
        if (iblock_1 .ne. nbx_1) iupper = 1
    case (2)
        no = noy + 1
        nn = ny_1 + 2
        ia = j1 + 1
        ib = j2 + 1
        ilower = 0
        iupper = 0
        if (jblock_1 .ne. 1) ilower = 1
        if (jblock_1 .ne. nby_1) iupper = 1
    case (3)
        no = noz + 1
        nn = nz_1 + 2
        ia = k1 + 1
        ib = k2 + 1
        ! Periodic direction
        ilower = 1
        iupper = 1
    case default
        stop "updateBorder2: invalid value for ixyz"
    end select

    select case (ijk)
    case (1)
        if (n1 .ne. nn) stop "updateBorder2: bad value for n1"
        allocate(buf1(no,n2,n3), buf2(no,n2,n3))
        icount = no*n2*n3
        buf1 = A(ia:ia+no-1,:,:)
    case (2)
        if (n2 .ne. nn) stop "updateBorder2: bad value for n2"
        allocate(buf1(n1,no,n3), buf2(n1,no,n3))
        icount = n1*no*n3
        buf1 = A(:,ia:ia+no-1,:)
    case (3)
        if (n3 .ne. nn) stop "updateBorder2: bad value for n3"
        allocate(buf1(n1,n2,no), buf2(n1,n2,no))
        icount = n1*n2*no
        buf1 = A(:,:,ia:ia+no-1)
    case default
        stop "updateBorder2: invalid value for ijk"
    end select

    ! Send to lower neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, -1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (iupper == 1) then
            A(ib+1:ib+no,:,:) = buf2
        else
            A(ib+no,:,:) = 0.
        end if
        A(ib+no+1:n1,:,:) = 0.
        buf1 = A(ib-no+1:ib,:,:)
    case (2)
        if (iupper == 1) then
            A(:,ib+1:ib+no,:) = buf2
        else
            A(:,ib+no,:) = 0.
        end if
        A(:,ib+no+1:n2,:) = 0.
        buf1 = A(:,ib-no+1:ib,:)
    case (3)
        if (iupper == 1) then
            A(:,:,ib+1:ib+no) = buf2
        else
            A(:,:,ib+no) = 0.
        end if
        A(:,:,ib+no+1:n3) = 0.
        buf1 = A(:,:,ib-no+1:ib)
    end select

    ! Send to upper neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, +1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (ilower == 1) then
            A(ia-no:ia-1,:,:) = buf2
        else
            A(ia-no,:,:) = 0.
        end if
    case (2)
        if (ilower == 1) then
            A(:,ia-no:ia-1,:) = buf2
        else
            A(:,ia-no,:) = 0.
        end if
    case (3)
        if (ilower == 1) then
            A(:,:,ia-no:ia-1) = buf2
        else
            A(:,:,ia-no) = 0.
        end if
    end select

    deallocate(buf1, buf2)

    return
end

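! updateBoundary performs the same border exchange as updateBorder but
! derives the interior index range directly from the array bounds
! (ia = 1+no, ib = n-no) rather than from the global block maps.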
subroutine updateBoundary(A, n1,n2,n3, ixyz, ijk)
    use messenger
    include "mpif.inc"
    real A(n1,n2,n3)
    real, allocatable :: buf1(:,:,:), buf2(:,:,:)
    integer istatus(MPI_STATUS_SIZE)

    ! Update border values

    select case (ixyz)
    case (1)
        no = nox
        ilower = 0
        iupper = 0
        if (iblock_1 .ne. 1) ilower = 1
        if (iblock_1 .ne. nbx_1) iupper = 1
    case (2)
        no = noy
        ilower = 0
        iupper = 0
        if (jblock_1 .ne. 1) ilower = 1
        if (jblock_1 .ne. nby_1) iupper = 1
    case (3)
        no = noz
        ! Periodic direction
        ilower = 1
        iupper = 1
    case default
        stop "updateBoundary: invalid value for ixyz"
    end select

    select case (ijk)
    case (1)
        allocate(buf1(no,n2,n3), buf2(no,n2,n3))
        icount = no*n2*n3
        ia = 1+no
        ib = n1-no
        buf1 = A(ia:ia+no-1,:,:)
    case (2)
        allocate(buf1(n1,no,n3), buf2(n1,no,n3))
        icount = n1*no*n3
        ia = 1+no
        ib = n2-no
        buf1 = A(:,ia:ia+no-1,:)
    case (3)
        allocate(buf1(n1,n2,no), buf2(n1,n2,no))
        icount = n1*n2*no
        ia = 1+no
        ib = n3-no
        buf1 = A(:,:,ia:ia+no-1)
    case default
        stop "updateBoundary: invalid value for ijk"
    end select

    ! Send to lower neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, -1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (iupper == 1) &
            A(ib+1:ib+no,:,:) = buf2
        A(ib+no+1:n1,:,:) = 0.
        buf1 = A(ib-no+1:ib,:,:)
    case (2)
        if (iupper == 1) &
            A(:,ib+1:ib+no,:) = buf2
        A(:,ib+no+1:n2,:) = 0.
        buf1 = A(:,ib-no+1:ib,:)
    case (3)
        if (iupper == 1) &
            A(:,:,ib+1:ib+no) = buf2
        A(:,:,ib+no+1:n3) = 0.
        buf1 = A(:,:,ib-no+1:ib)
    end select

    ! Send to upper neighbor
    call MPI_Cart_shift(icomm_grid, ixyz-1, +1, isource, idest, ierr)
    call MPI_sendrecv(buf1, icount, MPI_REAL8, idest, 0, &
                      buf2, icount, MPI_REAL8, isource, 0, &
                      icomm_grid, istatus, ierr)

    select case (ijk)
    case (1)
        if (ilower == 1) &
            A(ia-no:ia-1,:,:) = buf2
    case (2)
        if (ilower == 1) &
            A(:,ia-no:ia-1,:) = buf2
    case (3)
        if (ilower == 1) &
            A(:,:,ia-no:ia-1) = buf2
    end select

    deallocate(buf1, buf2)

    return
end

!=======================================================================
subroutine triDiagonalM_(ixyz, a, b, c, r, n, lot)
    use messenger
    include "mpif.inc"

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call triDiagonalM (a, b, c, r, n, lot)
    else
        call MPI_triDiagonalM (a, b, c, r, n, lot, icomm_xyz(ixyz), 0)
    end if

    return
end

subroutine triDiagonalM_BC_(ixyz, a, b, c, r, n, lot)
    use messenger
    include "mpif.inc"

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call triDiagonalM (a, b, c, r, n, lot)
    else
        call MPI_triDiagonalM (a, b, c, r, n, lot, icomm_xyz(ixyz), 1)
    end if

    return
end

subroutine triDiagonalLUM_(ixyz, a, b, c, work, n, lot)
    use messenger
    include "mpif.inc"
    real work(lot,2*n+20)

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call triDiagonalLUM (a, b, c, n, lot)
    else
        call MPI_triDiagonalLUM (a, b, c, work(1,1), work(1,1+n), &
                                 work(1,1+2*n), n, lot, icomm_xyz(ixyz))
    end if

    return
end

subroutine triLUSolveM_(ixyz, a, b, c, r, work, n, lot)
    use messenger
    include "mpif.inc"
    real work(lot,2*n+20)

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call triLUSolveM (a, b, c, r, n, lot)
    else
        call MPI_triLUSolveM (a, b, c, work(1,1), work(1,1+n), &
                              work(1,1+2*n), r, n, lot, icomm_xyz(ixyz), 0)
    end if

    return
end

subroutine triLUSolveM_BC_(ixyz, a, b, c, r, work, n, lot)
    use messenger
    include "mpif.inc"
    real work(lot,2*n+20)

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call triLUSolveM (a, b, c, r, n, lot)
    else
        call MPI_triLUSolveM (a, b, c, work(1,1), work(1,1+n), &
                              work(1,1+2*n), r, n, lot, icomm_xyz(ixyz), 1)
    end if

    return
end

subroutine pentaDiagonalM_(ixyz, a, b, c, d, e, r, n, lot)
    use messenger
    include "mpif.inc"

    call MPI_comm_size (icomm_xyz(ixyz), isize, ierr)

    if (isize == 1) then
        call pentaDiagonalM (a, b, c, d, e, r, n, lot)
    else
        call MPI_pentaDiagonalM (a, b, c, d, e, r, n, lot, icomm_xyz(ixyz))
    end if

    return
end
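
!=======================================================================
! Illustrative usage sketch (not called anywhere in this code): the
! typical calling sequence assumed by the routines in this file.  The
! subroutine name and the variable names below are placeholders
! introduced only for illustration.
subroutine messengerUsageExample()
    use messenger
    real Aglobal(nx,ny,nz)        ! global field (significant on root)
    real Alocal(nx_1,ny_1,nz_1)   ! local block with overlapping borders
    real rsum(1)

    call messenger_init()            ! set up the process grid
    call scatter(Aglobal, Alocal)    ! distribute the global field
    rsum(1) = sum(Alocal)            ! local contribution to a global sum
    call globalSum(rsum, 1)          ! reduce across all processes
    call gather(Alocal, Aglobal)     ! collect the field on the root
    call messenger_free()            ! report wall time and finalize

    return
end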