[petsc-users] kokkos gpu/cpu copy

Steven Dargaville dargaville.steven at gmail.com
Wed Apr 2 19:11:06 CDT 2025


Hi

I have some code that does a solve with a PCMAT preconditioner. The Mat
used is a shell, and inside the shell's MatMult it calls VecPointwiseDivide
with a vector "diag" that holds the diagonal of a matrix, filled outside the
shell and stored in the shell context.

If I use cuda mat/vec types, this happens without any GPU/CPU copies, as I
would expect. If, however, I use kokkos mat/vec types, a GPU/CPU copy occurs
at every iteration of the solve. This seems to be triggered by the offload
mask of the vector "diag": it stays at 1, and hence a copy occurs in
VecPointwiseDivide.

I would have expected the offload mask to be 256 (kokkos) after the first
iteration, given that with cuda the offload mask of "diag" changes to 3 after
the first iteration.
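
(For reference, the mask values I'm quoting are from PetscOffloadMask in
petscdevicetypes.h, which as far as I understand is roughly:

typedef enum {
  PETSC_OFFLOAD_UNALLOCATED = 0x0,
  PETSC_OFFLOAD_CPU         = 0x1,
  PETSC_OFFLOAD_GPU         = 0x2,
  PETSC_OFFLOAD_BOTH        = 0x3,
  PETSC_OFFLOAD_KOKKOS      = 0x100
} PetscOffloadMask;

so 1 means host-only, 3 means host and device are in sync, and 256 is the
Kokkos dual-view state.)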

Is this the expected behaviour with Kokkos, or is there something I need to
do to mark that "diag" has its values on the GPU, so as to prevent the
copies? I have example C++ code that demonstrates this below. You can see
the difference with PETSc 3.23.0 when run with either "-log_view -mat_type
aijcusparse -vec_type cuda" or "-log_view -mat_type aijkokkos -vec_type
kokkos".

Thanks for your help
Steven

Example C++ code:

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

static char help[] = "Tests Kokkos for SHELL matrices\n\n";

#include <iostream>
#include <petscksp.h>
#include <petsclog.h>

typedef struct _n_User *User;
struct _n_User {
  Mat A;
  Vec diag;
};

static PetscErrorCode MatMult_User(Mat A, Vec X, Vec Y)
{
  User user;

  PetscFunctionBegin;
  PetscCall(MatShellGetContext(A, &user));

  // Print the offload mask inside the matmult
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload inside X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload inside Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload inside diag " << offloadmask << std::endl;

  PetscCall(VecPointwiseDivide(Y, X, user->diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **args)
{
  const PetscScalar xvals[] = {11, 13}, yvals[] = {17, 19};
  const PetscInt    inds[]  = {0, 1};
  PetscScalar       avals[] = {2, 3, 5, 7};
  Mat               S1, A;
  Vec               X, Y, diag;
  KSP               ksp;
  PC                pc;
  User              user;
  PetscLogStage  stage1, gpu_copy;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, NULL, help));

  // Build a matrix and vectors
  PetscCall(MatCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, 2, 2, &A));
  PetscCall(MatSetUp(A));
  PetscCall(MatSetValues(A, 2, inds, 2, inds, avals, INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(VecCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, &X));
  PetscCall(VecSetValues(X, 2, inds, xvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(X));
  PetscCall(VecAssemblyEnd(X));
  PetscCall(VecDuplicate(X, &Y));
  PetscCall(VecDuplicate(X, &diag));
  PetscCall(VecSetValues(Y, 2, inds, yvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(Y));
  PetscCall(VecAssemblyEnd(Y));

  // Create a shell matrix
  PetscCall(MatGetDiagonal(A, diag));
  PetscCall(PetscNew(&user));
  user->A = A;
  user->diag = diag;
  PetscCall(MatCreateShell(PETSC_COMM_WORLD, 2, 2, 2, 2, user, &S1));
  PetscCall(MatSetUp(S1));
  PetscCall(MatShellSetOperation(S1, MATOP_MULT, (void (*)(void))MatMult_User));
  PetscCall(MatAssemblyBegin(S1, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(S1, MAT_FINAL_ASSEMBLY));

  // Do a solve
  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
  // Give the KSP a PCMAT preconditioner, with the shell as the preconditioning matrix
  PetscCall(KSPSetOperators(ksp, A, S1));
  PetscCall(KSPSetType(ksp, KSPRICHARDSON));
  PetscCall(KSPSetFromOptions(ksp));
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PCSetType(pc, PCMAT));
  PetscCall(KSPSetUp(ksp));

  // Print the offload mask before our solve
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload diag " << offloadmask << std::endl;

  // Trigger any gpu copies in the first solve
  PetscCall(PetscLogStageRegister("gpu_copy",&gpu_copy));
  PetscCall(PetscLogStagePush(gpu_copy));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  // There should be no copies in this solve
  PetscCall(PetscLogStageRegister("no copy",&stage1));
  PetscCall(PetscLogStagePush(stage1));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  PetscCall(KSPDestroy(&ksp));
  PetscCall(MatDestroy(&S1));
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&X));
  PetscCall(VecDestroy(&Y));
  PetscCall(VecDestroy(&diag));
  PetscCall(PetscFree(user));
  PetscCall(PetscFinalize());
  return 0;
}
