exporting patch: # HG changeset patch # User Paul Mullowney # Date 1362859983 25200 # Node ID 6be0d0f06ad18794377015b5ec468e11105b58b1 # Parent 6d60df3aadfe49f0bf640fa0a41fe3a423b88cf6 Implementation of ICC in this patch. diff -r 6d60df3aadfe -r 6be0d0f06ad1 src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu --- a/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu Mon Feb 04 11:42:47 2013 -0700 +++ b/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu Sat Mar 09 13:13:03 2013 -0700 @@ -1,12 +1,13 @@ /* - Defines the basic matrix operations for the AIJ (compressed row) + Defines the basic matrix operations for the AIJ (compressed row) matrix storage format. */ #include "petscconf.h" PETSC_CUDA_EXTERN_C_BEGIN #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ -//#include "petscbt.h" +#include <../src/mat/impls/sbaij/seq/sbaij.h> + #include "../src/vec/vec/impls/dvecimpl.h" #include "petsc-private/vecimpl.h" PETSC_CUDA_EXTERN_C_END @@ -53,12 +54,28 @@ EXTERN_C_BEGIN extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); EXTERN_C_END -/* - MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices - on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp - Level: beginner -*/ + +/*MC + MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices + on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported + algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer + performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the + CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these + algorithms are not recommended. This class does NOT support direct solver operations. + + ./configure --download-txpetscgpu to install PETSc to use CUSPARSE + + Consult CUSPARSE documentation for more information about the matrix storage formats + which correspond to the options database keys below. + + Options Database Keys: ++ -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. + + Level: beginner + +.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation +M*/ EXTERN_C_BEGIN #undef __FUNCT__ @@ -69,20 +86,16 @@ PetscFunctionBegin; ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); + ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); + ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { - ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); - ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); - ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); - (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { - ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); - ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); - ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); + ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); (*B)->factortype = ftype; PetscFunctionReturn(0); } @@ -281,7 +294,7 @@ } cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); - stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); + stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); @@ -298,7 +311,7 @@ } #undef __FUNCT__ -#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLUUpperTriMatrix" +#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; @@ -348,7 +361,7 @@ } cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); - stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); + stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); @@ -404,12 +417,119 @@ + + +#undef __FUNCT__ +#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" +PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) +{ + Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; + Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; + GPU_Matrix_Ifc * cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; + GPU_Matrix_Ifc * cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; + cusparseStatus_t stat; + PetscErrorCode ierr; + PetscInt *AiUp, *AjUp; + PetscScalar *AAUp; + PetscScalar *AALo; + PetscInt nzUpper=a->nz,n=A->rmap->n,i,offset,nz,j; + Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; + const PetscInt *ai=b->i,*aj=b->j,*vj; + const MatScalar *aa=b->a,*v; + + + + PetscFunctionBegin; + if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { + try { + /* Allocate Space for the upper triangular matrix */ + ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); + ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); + ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); + ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); + + /* Fill the upper triangular matrix */ + AiUp[0]=(PetscInt) 0; + AiUp[n]=nzUpper; + offset = 0; + for (i=0; i0) { + ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); + ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); + for (j=offset; jformat]); + stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); + ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); + stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat); + ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp; + + /* Build the lower triangular piece */ + cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); + stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr); + stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat); + ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo; + + /* set this flag ... for performance logging */ + ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE; + + A->valid_GPU_matrix = PETSC_CUSP_BOTH; + ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); + ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); + ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); + ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); + } catch(char *ex) { + SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); + } + } + PetscFunctionReturn(0); +} + + #undef __FUNCT__ #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) { + PetscErrorCode ierr; + Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; + Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; + IS ip=a->row; + const PetscInt *rip; + PetscBool perm_identity; + PetscInt n = A->rmap->n; + PetscFunctionBegin; - SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ICC and Cholesky Factorization not yet supported for CUSPARSE matrices"); + ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); + cusparseTriFactors->tempvec = new CUSPARRAY; + cusparseTriFactors->tempvec->resize(n); + /*lower triangular indices */ + ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); + ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); + if (!perm_identity) { + ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr); + ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr); + } + ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); PetscFunctionReturn(0); } @@ -447,15 +567,14 @@ { PetscErrorCode ierr; Mat_SeqAIJ *b =(Mat_SeqAIJ*)B->data; - IS isrow = b->row,iscol = b->col; - PetscBool row_identity,col_identity; - + IS ip=b->row; + PetscBool perm_identity; PetscFunctionBegin; ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); + /* determine which version of MatSolve needs to be used. */ - ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); - ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); - if (row_identity && col_identity) { + ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); + if (perm_identity) { B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; } else { @@ -478,17 +597,17 @@ GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; cusparseStatus_t stat; PetscFunctionBegin; - stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, - CUSPARSE_MATRIX_TYPE_TRIANGULAR, - CUSPARSE_FILL_MODE_UPPER, - TRANSPOSE);CHKERRCUSP(stat); - stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat); + stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle, + CUSPARSE_MATRIX_TYPE_TRIANGULAR, + CUSPARSE_FILL_MODE_UPPER, + CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); + stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat); - stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, - CUSPARSE_MATRIX_TYPE_TRIANGULAR, - CUSPARSE_FILL_MODE_LOWER, - TRANSPOSE);CHKERRCUSP(stat); - stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat); + stat = cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle, + CUSPARSE_MATRIX_TYPE_TRIANGULAR, + CUSPARSE_FILL_MODE_LOWER, + CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat); PetscFunctionReturn(0); } @@ -501,8 +620,13 @@ Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; cusparseStatus_t stat; PetscFunctionBegin; - stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat); - ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr); + if (cusparseMat->isSymmOrHerm) { + stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + } + else { + stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + } + ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr); PetscFunctionReturn(0); } @@ -532,15 +656,20 @@ /* solve with reordering */ ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); - stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat); - stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat); + stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat); + stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr); /* restore */ ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + + if (cusparseTriFactors->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -569,14 +698,18 @@ ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); /* solve */ - stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat); - stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat); + stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat); + stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); /* restore */ ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + if (cusparseTriFactors->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -608,7 +741,11 @@ ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + if (cusparseTriFactors->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -640,7 +777,11 @@ ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + if (cusparseTriFactors->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -653,7 +794,8 @@ Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscInt m = A->rmap->n,*ii,*ridx; PetscErrorCode ierr; - + PetscBool symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE; + PetscBool symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE; PetscFunctionBegin; if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { @@ -706,8 +848,28 @@ ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); - /* FILL MODE UPPER is irrelevant */ - cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); + ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest); + if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) { + /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ + cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + cusparseMat->isSymmOrHerm = PETSC_FALSE; + } else { + ierr = MatIsSymmetric(A,0.0,&symmetryTest); + if (symmetryTest) { + cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + cusparseMat->isSymmOrHerm = PETSC_TRUE; + } else { + ierr = MatIsHermitian(A,0.0,&hermitianTest); + if (hermitianTest) { + cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + cusparseMat->isSymmOrHerm = PETSC_TRUE; + } else { + /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ + cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); + cusparseMat->isSymmOrHerm = PETSC_FALSE; + } + } + } /* lastly, build the matrix */ ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); @@ -778,7 +940,11 @@ if (!cusparseMat->mat->hasNonZeroStream()) { ierr = WaitForGPU();CHKERRCUSP(ierr); } - ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); + if (cusparseMat->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -802,7 +968,7 @@ ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); try { - ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); + ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr); } catch (char *ex) { SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); } @@ -811,7 +977,11 @@ if (!cusparseMat->mat->hasNonZeroStream()) { ierr = WaitForGPU();CHKERRCUSP(ierr); } - ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); + if (cusparseMat->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -844,7 +1014,11 @@ SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); } ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); + if (cusparseMat->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -871,7 +1045,7 @@ ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); /* multiply add with matrix transpose */ - ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); + ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr); ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); @@ -881,7 +1055,11 @@ SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); } ierr = WaitForGPU();CHKERRCUSP(ierr); - ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); + if (cusparseMat->isSymmOrHerm) { + ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); + } else { + ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); + } PetscFunctionReturn(0); } @@ -1028,6 +1206,7 @@ ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0; ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE; + ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE; } else { /* NEXT, set the pointers to the triangular factors */ B->spptr = new Mat_SeqAIJCUSPARSETriFactors; @@ -1037,6 +1216,7 @@ ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec = 0; ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose = PETSC_FALSE; + ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm = PETSC_FALSE; } /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */ if (!MAT_cusparseHandle) { diff -r 6d60df3aadfe -r 6be0d0f06ad1 src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h --- a/src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h Mon Feb 04 11:42:47 2013 -0700 +++ b/src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h Sat Mar 09 13:13:03 2013 -0700 @@ -24,6 +24,7 @@ CUSPARRAY *tempvec; MatCUSPARSEStorageFormat format; /* the storage format for the matrix on the device */ PetscBool hasTranspose; /* boolean describing whether a transpose has been built or not */ + PetscBool isSymmOrHerm; /* boolean describing whether the matrix is symmetric (hermitian) or not. This is used mostly for performance logging purposes. */ }; struct Mat_SeqAIJCUSPARSE { @@ -32,6 +33,7 @@ PetscInt nonzerorow; /* number of nonzero rows ... used in the flop calculations */ MatCUSPARSEStorageFormat format; /* the storage format for the matrix on the device */ PetscBool hasTranspose; /* boolean describing whether a transpose has been built or not */ + PetscBool isSymmOrHerm; /* boolean describing whether the matrix is symmetric (hermitian) or not. This is used mostly for performance logging purposes. */ }; extern PetscErrorCode MatCUSPARSECopyToGPU(Mat);