#include #include #include #include #include const char* libcublas = "/opt/cuda/lib64/libcublas.so"; cublasStatus cublasInit() { void* library = dlopen(libcublas, RTLD_LAZY); if (!library) { fprintf(stderr, "Cannot open shared library %s\n", libcublas); return CUBLAS_STATUS_INTERNAL_ERROR; } cublasStatus (*cublasInit_)(); cublasInit_ = dlsym(library, "cublasInit"); if (!cublasInit_) { fprintf(stderr, "Cannot load symbol cublasInit from library %s\n", libcublas); dlclose(library); return CUBLAS_STATUS_INTERNAL_ERROR; } // Get the MPI process rank. int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get the total number of available CUDA-enabled GPUs. int ndevices; int cuda_status = cudaGetDeviceCount(&ndevices); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get CUDA device count, status = %d\n", cuda_status); return CUBLAS_STATUS_INTERNAL_ERROR; } // Set device MPI process to work with. int idevice = rank % ndevices; cuda_status = cudaSetDevice(idevice); if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot set CUDA device to %d, status = %d\n", rank, cuda_status); return CUBLAS_STATUS_INTERNAL_ERROR; } else { printf("Assigned CUDA device %d to MPI process %d\n", idevice, rank); } cublasStatus result = (*cublasInit_)(); dlclose(library); return result; }