[petsc-dev] pbjacobi error, in parallel, on kokkos/cuda

Mark Adams mfadams at lbl.gov
Tue Feb 18 10:24:19 CST 2025


And here is the ex55 diffs:

diff --git a/src/ksp/ksp/tutorials/ex55.c b/src/ksp/ksp/tutorials/ex55.c
index dded9cae13b..18bdcc8efaf 100644
--- a/src/ksp/ksp/tutorials/ex55.c
+++ b/src/ksp/ksp/tutorials/ex55.c
@@ -6,6 +6,7 @@ Load of 1.0 in x direction on all nodes (not a true uniform
load).\n\
   -alpha <v>      : scaling of material coefficient in embedded
circle\n\n";

 #include <petscksp.h>
+#include "../../../../src/ksp/pc/impls/gamg/gamg.h"            /*I
"petscpc.h" I*/

 int main(int argc, char **args)
 {
@@ -221,6 +222,30 @@ int main(int argc, char **args)

   PetscCall(VecSet(xx, .0));

+  PC pc;
+  PetscCall(KSPGetPC(ksp, &pc));
+  PC_MG   *mg      = (PC_MG *)pc->data;
+  PC_MG_Levels **mglevels = mg->levels;
+  Mat P = mglevels[mg->nlevels-1]->interpolate;
+  PetscCall(MatViewFromOptions(mglevels[mg->nlevels-1]->A, NULL,
"-rap_mat_view"));
+  PetscCall(MatViewFromOptions(Amat, NULL, "-rap_mat_view"));
+  KSP ksp2;
+  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp2));
+  PetscCall(KSPSetOptionsPrefix(ksp2, "rap_"));
+  PetscCall(KSPSetFromOptions(ksp2));
+  PetscCall(KSPGetPC(ksp2, &pc));
+  PetscCall(KSPSetOperators(ksp2, Amat, Amat));
+  PetscCall(PCMGSetGalerkin(pc, PC_MG_GALERKIN_PMAT));
+  PetscCall(PCMGSetInterpolation(pc, 1, P));
+  PetscCall(VecSet(bb, 1.0));
+  PetscCall(PetscLogStagePush(stage[1]));
+  PetscCall(KSPSolve(ksp2, bb, xx));
+  //PetscCall(MatViewFromOptions(mglevels[0]->A, NULL, "-rap_mat_view"));
+  PetscCall(PetscLogStagePop());
+  PetscCall(PetscFinalize());
+  exit(12);
+
+
   PetscCall(PetscLogStagePush(stage[1]));
   PetscCall(KSPSolve(ksp, bb, xx));
   PetscCall(PetscLogStagePop());

On Tue, Feb 18, 2025 at 11:21 AM Mark Adams <mfadams at lbl.gov> wrote:

> And, I forgot that the GAMG coarse grid (and this tiny grid) are forced to
> one processor, hence valgrind errors only on one process.
> Add: -pc_gamg_parallel_coarse_grid_solver -ne 13
> If you want to see valgrind errors on all 4 processors.
>
> On Tue, Feb 18, 2025 at 11:07 AM Mark Adams <mfadams at lbl.gov> wrote:
>
>> Also, this uses the branch: adams/mat-rap-blocksize
>> that has fixes to get the block sizes moved up in P'AP.
>>
>> On Tue, Feb 18, 2025 at 9:29 AM Mark Adams <mfadams at lbl.gov> wrote:
>>
>>> I've got a bug in pbjacobi that only shows up on *the Galerkin coarse
>>> grid* (I have not been able to reproduce it on a fine grid at least), and
>>> in *parallel*, and on *GPUs*/kokkos.
>>>
>>> I have modified ex55 to take P from GAMG and give it (one) to PCMG with
>>> Galerkin coarse grids, and solve (code and command lines appended).
>>> I see this with ex56 (bs=3 & 6), ex55 (bs=2), but ex54 (bs=1) is fine
>>> (does pbjacobi switch to jacobi?)
>>>
>>> With 4 processors I get these valgrind errors only on these bad solves
>>> (no false positives). Note that *only one process has errors*, and note
>>> some solver output before and after:
>>>
>>> I'm going to keep digging but ideas are welcome,
>>> Thanks,
>>> Mark
>>>
>>> [0] <pc:gamg> PCSetUp_GAMG(): (null): 1) N=4, n data cols=2, nnz/row
>>> (ave)=4, 1 active pes
>>> [0] <pc:gamg> PCSetUp_GAMG(): (null): 2 levels, operator complexity =
>>> 1.04
>>> [0] <pc:gamg> PCSetUp_GAMG(): (null): PCSetUp_GAMG: call
>>> KSPChebyshevSetEigenvalues on level 0 (N=32) with emax = 2.26125 emin =
>>> 0.0198344
>>> [0] <pc:gamg> PCSetUp_MG(): Using outer operators to define finest grid
>>> operator
>>>   because PCMGGetSmoother(pc,nlevels-1,&ksp);KSPSetOperators(ksp,...);
>>> was not called.
>>> [0] <pc:mg> PCSetUp_MG(): Using outer operators to define finest grid
>>> operator
>>>   because PCMGGetSmoother(pc,nlevels-1,&ksp);KSPSetOperators(ksp,...);
>>> was not called.
>>> ==978424== Invalid read of size 16
>>> ==978424==    at 0x42B9AFA2: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42445504: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42417A04: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42468730: cudaMemcpy (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x4896979: cuda_memcpy_wrapper<>
>>> (Kokkos_Cuda_Instance.hpp:365)
>>> ==978424==    by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void
>>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62)
>>> ==978424==  Address 0xcb722dfc is 1,644 bytes inside a block of size
>>> 1,652 alloc'd
>>> ==978424==    at 0x4E0A926: memalign (in
>>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
>>> ==978424==    by 0x4E0AA69: posix_memalign (in
>>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
>>> ==978424==    by 0x57E3BD9: PetscMallocAlign (mal.c:52)
>>> ==978424==    by 0x57E892F: PetscTrMallocDefault (mtr.c:175)
>>> ==978424==    by 0x57E5A77: PetscMallocA (mal.c:421)
>>> ==978424==    by 0x63B02F5: MatInvertBlockDiagonal_SeqAIJ (aij.c:3333)
>>> ==978424==    by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908)
>>> ==978424==    by 0x629D585: MatInvertBlockDiagonal_MPIAIJ (mpiaij.c:2588)
>>> ==978424==    by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908)
>>> ==978424==    by 0x7C7F726: PCSetUp_PBJacobi_Host (pbjacobi.c:256)
>>> ==978424==    by 0x7D737D8: PCSetUp_PBJacobi_Kokkos
>>> (pbjacobi_kok.kokkos.cxx:90)
>>> ==978424==    by 0x7C803F8: PCSetUp_PBJacobi (pbjacobi.c:296)
>>> ==978424==
>>> ==978424== Invalid read of size 16
>>> ==978424==    at 0x42B9AFB4: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42445504: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42417A04: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42468730: cudaMemcpy (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x4896979: cuda_memcpy_wrapper<>
>>> (Kokkos_Cuda_Instance.hpp:365)
>>> ==978424==    by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void
>>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62)
>>> ==978424==  Address 0xcb722e0c is 8 bytes after a block of size 1,652
>>> alloc'd
>>> ==978424==    at 0x4E0A926: memalign (in
>>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
>>> ==978424==    by 0x4E0AA69: posix_memalign (in
>>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
>>> ==978424==    by 0x57E3BD9: PetscMallocAlign (mal.c:52)
>>> ==978424==    by 0x57E892F: PetscTrMallocDefault (mtr.c:175)
>>> ==978424==    by 0x57E5A77: PetscMallocA (mal.c:421)
>>> ==978424==    by 0x63B02F5: MatInvertBlockDiagonal_SeqAIJ (aij.c:3333)
>>> ==978424==    by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908)
>>> ==978424==    by 0x629D585: MatInvertBlockDiagonal_MPIAIJ (mpiaij.c:2588)
>>> ==978424==    by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908)
>>> ==978424==    by 0x7C7F726: PCSetUp_PBJacobi_Host (pbjacobi.c:256)
>>> ==978424==    by 0x7D737D8: PCSetUp_PBJacobi_Kokkos
>>> (pbjacobi_kok.kokkos.cxx:90)
>>> ==978424==    by 0x7C803F8: PCSetUp_PBJacobi (pbjacobi.c:296)
>>> ==978424==
>>> ==978424== Invalid read of size 4
>>> ==978424==    at 0x42B9B103: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42445504: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42417A04: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42468730: cudaMemcpy (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x4896979: cuda_memcpy_wrapper<>
>>> (Kokkos_Cuda_Instance.hpp:365)
>>> ==978424==    by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void
>>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62)
>>> ==978424==  Address 0xcb722e1c is 12 bytes after a block of size 1,664
>>> in arena "client"
>>> ==978424==
>>> ==978424== Invalid read of size 4
>>> ==978424==    at 0x42B9B107: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08)
>>> ==978424==    by 0x42445504: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42417A04: ??? (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x42468730: cudaMemcpy (in
>>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53)
>>> ==978424==    by 0x4896979: cuda_memcpy_wrapper<>
>>> (Kokkos_Cuda_Instance.hpp:365)
>>> ==978424==    by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void
>>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62)
>>> ==978424==  Address 0xcb722e1c is 12 bytes after a block of size 1,664
>>> in arena "client"
>>> ==978424==
>>>       Residual norms for rap_mg_coarse_ solve.
>>>       0 KSP Residual norm 1.118121970641e+01
>>>       1 KSP Residual norm 3.575439993035e-01
>>> [0]PETSC ERROR: --------------------- Error Message
>>> --------------------------------------------------------------
>>> [0]PETSC ERROR: Diverged due to indefinite preconditioner, beta
>>> -0.00553023, betaold 5.27744
>>>
>>>
>>> ksp/ex55:
>>>
>>>
>>>     PC pc;
>>>   PetscCall(KSPGetPC(ksp, &pc));
>>>   PC_MG   *mg      = (PC_MG *)pc->data;
>>>   PC_MG_Levels **mglevels = mg->levels;
>>>   Mat P = mglevels[mg->nlevels-1]->interpolate;
>>>   PetscCall(MatViewFromOptions(mglevels[mg->nlevels-1]->A, NULL,
>>> "-rap_mat_view"));
>>>   PetscCall(MatViewFromOptions(Amat, NULL, "-rap_mat_view"));
>>>   KSP ksp2;
>>>   PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp2));
>>>   PetscCall(KSPSetOptionsPrefix(ksp2, "rap_"));
>>>   PetscCall(KSPSetFromOptions(ksp2));
>>>   PetscCall(KSPGetPC(ksp2, &pc));
>>>   PetscCall(KSPSetOperators(ksp2, Amat, Amat));
>>>   PetscCall(PCMGSetGalerkin(pc, PC_MG_GALERKIN_PMAT));
>>>   PetscCall(PCMGSetInterpolation(pc, 1, P));
>>>   PetscCall(VecSet(bb, 1.0));
>>>   PetscCall(PetscLogStagePush(stage[1]));
>>>   PetscCall(KSPSolve(ksp2, bb, xx));
>>>   PetscCall(PetscLogStagePop());
>>>   PetscCall(PetscFinalize());
>>>   exit(12);
>>>
>>>
>>>   PetscCall(PetscLogStagePush(stage[1])); // original ex55 code
>>>   PetscCall(KSPSolve(ksp, bb, xx));
>>>
>>>
>>> *$ srun -n 4 valgrind --tool=memcheck --leak-check=no ./ex55 -ne 3
>>> -pc_type gamg -rap_pc_type mg -rap_ksp_monitor -rap_mg_levels_pc_type
>>> jacobi -rap_mg_coarse_pc_type pbjacobi -rap_mg_coarse_ksp_monitor
>>> -options_left -rap_pc_mg_levels 2 -rap_mg_coarse_ksp_type cg -mat_type
>>> aijkokkos -fp_trap -ksp_monitor -rap_ksp_viewxx -info :pc,dm
>>> -rap_mg_coarse_ksp_error_if_not_converged *
>>>
>>>
>>>
>>>
>>>
>>>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/petsc-dev/attachments/20250218/b8891528/attachment-0001.html>


More information about the petsc-dev mailing list