[petsc-dev] PETSc init eats too much CUDA memory

Zhang, Hong hongzhang at anl.gov
Fri Jan 7 10:51:03 CST 2022


Commenting out the block containing the PetscDeviceContextXXX calls reduces the memory cost from 1.9GB to 1.5GB.
Commenting out PetscDeviceInitializeTypeFromOptions_Private() reduces it to 0GB.

diff --git a/src/sys/objects/device/interface/device.cxx b/src/sys/objects/device/interface/device.cxx
index a682f16b696..1b2c7210dfe 100644
--- a/src/sys/objects/device/interface/device.cxx
+++ b/src/sys/objects/device/interface/device.cxx
@@ -422,7 +422,7 @@ PetscErrorCode PetscDeviceInitializeFromOptions_Internal(MPI_Comm comm)
     const auto deviceType = static_cast<PetscDeviceType>(i);
     auto initType         = defaultInitType;

-    ierr = PetscDeviceInitializeTypeFromOptions_Private(comm,deviceType,defaultDevice,defaultView,&initType);CHKERRQ(ierr);
+    //ierr = PetscDeviceInitializeTypeFromOptions_Private(comm,deviceType,defaultDevice,defaultView,&initType);CHKERRQ(ierr);
     if (PetscDeviceConfiguredFor_Internal(deviceType) && (initType == PETSC_DEVICE_INIT_EAGER)) {
       initializeDeviceContextEagerly = PETSC_TRUE;
       deviceContextInitDevice        = deviceType;
@@ -433,11 +433,13 @@ PetscErrorCode PetscDeviceInitializeFromOptions_Internal(MPI_Comm comm)

     /* somewhat inefficient here as the device context is potentially fully set up twice (once
      * when retrieved then the second time if setfromoptions makes changes) */
+    /*
     ierr = PetscInfo1(PETSC_NULLPTR,"Eagerly initializing PetscDeviceContext with %s device\n",PetscDeviceTypes[deviceContextInitDevice]);CHKERRQ(ierr);
     ierr = PetscDeviceContextSetRootDeviceType_Internal(deviceContextInitDevice);CHKERRQ(ierr);
     ierr = PetscDeviceContextGetCurrentContext(&dctx);CHKERRQ(ierr);
     ierr = PetscDeviceContextSetFromOptions(comm,"root_",dctx);CHKERRQ(ierr);
     ierr = PetscDeviceContextSetUp(dctx);CHKERRQ(ierr);
+    */
   }
   PetscFunctionReturn(0);
 }
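
If the installed PETSc also accepts a -device_enable option (an assumption on my part for this build; please check -help output before relying on it), a config-level variant of the attached test, just a sketch and not verified, would be to request lazy device initialization from petsc4py instead of editing device.cxx, and compare the NVML readings with and without -log_view:

# Sketch only: assumes the installed PETSc understands -device_enable (lazy/eager).
import sys
import os

import nvidia_smi
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
before = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used

petsc4py_path = os.path.join(os.environ['PETSC_DIR'], os.environ['PETSC_ARCH'], 'lib')
sys.path.append(petsc4py_path)
import petsc4py
# Request lazy device initialization; pass -log_view on the command line as
# before to see whether it still forces the eager path.
petsc4py.init(sys.argv + ['-device_enable', 'lazy'])

after = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used
print('CUDA memory taken by petsc4py.init: %.3fGB' % ((after - before)/1e9))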

On Jan 7, 2022, at 10:24 AM, Barry Smith <bsmith at petsc.dev> wrote:


Without -log_view it does not load any cuBLAS/cuSOLVER immediately; with -log_view it loads all that stuff at startup. You need to go into the PetscInitialize() routine, find where it loads cuBLAS and cuSOLVER, comment out those lines, and then run with -log_view.
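
As a complementary data point, independent of PETSc (a rough sketch that reuses the torch and nvidia_smi modules already imported by the attached test script), one can measure what a bare CUDA context plus cuBLAS initialization cost on the same GPU, which is part of what gets loaded at startup with -log_view per the note above:

import torch
import nvidia_smi

nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

used0 = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used
x = torch.zeros(1, device='cuda')   # allocating a tensor creates the CUDA context
used1 = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used
torch.cuda.current_blas_handle()    # forces cuBLAS to be initialized
used2 = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used

print('CUDA context + caching allocator: %.3fGB' % ((used1 - used0)/1e9))
print('cuBLAS initialization           : %.3fGB' % ((used2 - used1)/1e9))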


On Jan 7, 2022, at 11:14 AM, Zhang, Hong via petsc-dev <petsc-dev at mcs.anl.gov> wrote:

When PETSc is initialized, it takes about 2GB of CUDA memory. This is way too much for doing nothing. A test script is attached to reproduce the issue. If I remove the first line "import torch", PETSc consumes about 0.73GB, which is still significant. Does anyone have any idea about this behavior?

Thanks,
Hong


hongzhang@gpu02:/gpfs/jlse-fs0/users/hongzhang/Projects/pnode/examples (caidao22/update-examples)$ python3 test.py
CUDA memory before PETSc 0.000GB
CUDA memory after PETSc 0.004GB
hongzhang@gpu02:/gpfs/jlse-fs0/users/hongzhang/Projects/pnode/examples (caidao22/update-examples)$ python3 test.py -log_view :0.txt
CUDA memory before PETSc 0.000GB
CUDA memory after PETSc 1.936GB


import torch  # importing torch first is part of the reproducer; removing it drops the usage to ~0.73GB
import sys
import os

# Query the GPU's used memory through NVML before PETSc is initialized
import nvidia_smi
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
print('CUDA memory before PETSc %.3fGB' % (info.used/1e9))

# Make petsc4py from $PETSC_DIR/$PETSC_ARCH importable, then initialize PETSc
# with the command-line arguments (e.g. -log_view :0.txt)
petsc4py_path = os.path.join(os.environ['PETSC_DIR'],os.environ['PETSC_ARCH'],'lib')
sys.path.append(petsc4py_path)
import petsc4py
petsc4py.init(sys.argv)

# Query again to see how much device memory PETSc initialization consumed
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
print('CUDA memory after PETSc %.3fGB' % (info.used/1e9))


