// Copyright (c) 1997-2000 by Mark F. Adams // $Id: prom_mat_prod.C,v 1.4 2011-04-23 19:26:47 madams Exp $ // Author: Mark F. Adams // Version: 2.0 // Filename: prom_mat_prod.C // #include "prom_grid.hh" #undef __FUNCT__ #define __FUNCT__ "MakeScalarNextMat" /* PromGrid::MakeScalarNextMat *********************************** * * - use native PETSc methods for smoothing * * INPUT: * - nextG: other version of this grid * * SIDE EFFECTS: * - changes the prolongators matrix for 'nextG' * * RETURN: * - PETSc error code */ int PromGrid::MakeScalarNextMat( PromGrid *nnext ) { int ierr,vb=options_.verbose_,mype = comm_.mype(); assert( !isTop() ); Mat A1, P = nnext->prolongator_->mat_, A0 = stiffness_->mat_; PetscFunctionBegin; perf_mon_.EventBegin(PromContext::prom_logs_[LOG_MAT_TRI_PROD]); if( vb > 1) { PetscLogDouble rss; PetscMemoryGetCurrentUsage(&rss); PetscPrintf(comm_(),"[%d]%s(%d) start: loc=%d glob=%d Mb=%d\n", mype,__FUNCT__,getLevel(),nLocalNd_,nglobal(), (int)(rss/1.e6+.5)); } // RAP to next grid assert(nnext != NULL); PromMatrix *nextA = nnext->stiffness_; if( nextA != NULL ) { A1 = nextA->mat_; ierr = MatDestroy( &A1 ); CHKERRQ(ierr); ierr = MatPtAP( A0, P, MAT_INITIAL_MATRIX, 1.0, &A1 ); CHKERRQ(ierr); nextA->mat_ = A1; } else{ ierr = MatPtAP( A0, P, MAT_INITIAL_MATRIX, 3.0, &A1 ); CHKERRQ(ierr); ierr = nnext->MakeMatrix( false, false, 0, NULL, A1 ); CHKERRQ(ierr); nextA = nnext->stiffness_; } // if( nextA->gid_ndf_ == NULL ) { nextA->gid_ndf_ = new PromTable((3*nnext->nlocal())/2+117); } if( gid_extraParents_ == NULL ) { ierr = nextA->SetOption( MAT_NEW_NONZERO_ALLOCATION_ERR ); CHKERRQ(ierr); } if( vb > 2) { PetscLogDouble rss; PetscMemoryGetCurrentUsage(&rss); PetscPrintf(comm_(),"[%d]%s(%d) loc=%d glob=%d Mb=%d\n", mype,__FUNCT__,getLevel(),nLocalNd_,nglobal(), (int)(rss/1.e6+.5)); } // singular on coarse grids (-pc_lu_nonzeros_along_diagonal 1.e-100) ierr = nextA->Assembly(); CHKERRQ(ierr); double nrm; ierr = nextA->NormInf( &nrm ); CHKERRQ(ierr); if( nrm*PROM_RAP_SHIFT < 1.e300 ) { ierr = nextA->Shift( nrm*PROM_RAP_SHIFT ); CHKERRQ(ierr); } ierr = nextA->SetOption( MAT_STRUCTURALLY_SYMMETRIC ); CHKERRQ(ierr); perf_mon_.EventEnd(PromContext::prom_logs_[LOG_MAT_TRI_PROD]); if( vb > 1 ) { PetscLogDouble rss; PetscMemoryGetCurrentUsage(&rss); PetscPrintf(comm_(),"[%d]%s(%d) done Mb=%d - norm(A)=%e\n", mype,__FUNCT__,getLevel(),(int)(rss/1.e6+.5),nrm); } PetscFunctionReturn(0); } //#define PROM_CLEAN_LISTS // destroy hash tables -- this will cause a hang because P_ij entries are clean after first RAP but maps are not redone - fix: redo maps //#define PROM_CLEAN_VALS #define PROM_PETSC_LOCAL // !use my hash table to collect ON-proc stuff //#define PROM_PETSC_GLOBAL // !use my hash table to collect OFF-proc stuff #define PROM_USE_CACHE #undef __FUNCT__ #define __FUNCT__ "MakeNextMat" /* PromGrid::MakeNextMat ****************************************************** * * * INPUT: * - nnext: * * SIDE EFFECTS: * * RETURN: * - PETSc error code */ int PromGrid::MakeNextMat( PromGrid *nnext ) { int ierr,vb=options_.verbose_,mype = comm_.mype(); assert( !isTop() ); // RAP to next grid assert(nnext != NULL); PromMatrix *nextA = nnext->stiffness_; PetscFunctionBegin; if( nextA != NULL ) { ierr = nextA->PromMatrix::ZeroEntries(); CHKERRQ(ierr); } else{ ierr = nnext->MakeMatrix(); CHKERRQ(ierr); nextA = nnext->stiffness_; } if( nextA->gid_ndf_ == NULL ) { nextA->gid_ndf_ = new PromTable((3*nnext->nlocal())/2+117); } if( gid_extraParents_ == NULL ) { ierr = nextA->SetOption( MAT_NEW_NONZERO_ALLOCATION_ERR ); CHKERRQ(ierr); } if( vb > 2) { PetscLogDouble rss; PetscMemoryGetCurrentUsage(&rss); PetscPrintf(comm_(),"[%d]%s(%d) loc=%d glob=%d Mb=%d\n", mype,__FUNCT__,getLevel(),nLocalNd_,nglobal(), (int)(rss/1.e6+.5)); } // make next (coarse) matrix perf_mon_.EventBegin(PromContext::prom_logs_[LOG_MAT_TRI_PROD]); perf_mon_.EventBegin( PromContext::prom_logs_[LOG_MIS6] ); ierr = MakeNextMat_private( nnext ); CHKERRQ(ierr); perf_mon_.EventEnd( PromContext::prom_logs_[LOG_MIS6] ); #if !defined(PROM_PETSC_GLOBAL) perf_mon_.EventBegin( PromContext::prom_logs_[LOG_MIS7]); ierr = nnext->AssembleTable( this ); CHKERRQ(ierr); perf_mon_.EventEnd( PromContext::prom_logs_[LOG_MIS7]); #endif // singular on coarse grids (-pc_lu_nonzeros_along_diagonal 1.e-100) ierr = nextA->Assembly(); CHKERRQ(ierr); double nrm; ierr = nextA->NormInf( &nrm ); CHKERRQ(ierr); if( nrm*PROM_RAP_SHIFT < 1.e300 ) { ierr = nextA->Shift( nrm*PROM_RAP_SHIFT ); CHKERRQ(ierr); } ierr = nextA->SetOption( MAT_STRUCTURALLY_SYMMETRIC ); CHKERRQ(ierr); perf_mon_.EventEnd(PromContext::prom_logs_[LOG_MAT_TRI_PROD]); if( vb > 1 ) { PetscLogDouble rss; PetscMemoryGetCurrentUsage(&rss); PetscPrintf(comm_(),"[%d]%s(%d) done Mb=%d - norm(A)=%e\n", mype,__FUNCT__,getLevel(),(int)(rss/1.e6+.5),nrm); } PetscFunctionReturn(0); } #if defined(PROM_USE_CACHE) #undef __FUNCT__ #define __FUNCT__ "GetCacheEntry" /* PromGrid::GetCacheEntry *************************************************** * * - get cache entry * * INPUT: * - cache[PROM_MAXNIEGH*nParI*cndf2]: cache to add products (in/out) * - nParI: number of rows in cache * - I_ceq: * - J_ceq: (in/out) * - nCorsCols: current top of cache. (int/out) * - dirtyCache: flags for dirty cells in cache. (in/out) * - parGIDcache: parents. (in/out) * - J: global column * - Jndf: * - II: parent row index into cache (0-3 for geo mG) * * SIDE EFFECTS: * * RETURN: * - cache entry */ inline double* PromGrid::GetCacheEntry( double cache[], const int nParI, const int I_ceq[PROM_MAXPAR_HARD], int J_ceq[PROM_MAXNIEGH], int &nCorsCols, bool dirtyCache[], int parGIDcache[PROM_MAXNIEGH], const int J, const int Jndf, const int II ) { int k,tt,coljj; // find column if it exists for( k = 0, coljj = -9 ; k < nCorsCols ; k++ ) { if( parGIDcache[k] == J ){ coljj = k; break; } } if( coljj == -9 ) { parGIDcache[nCorsCols] = J; // clear dirtyCache J_ceq[nCorsCols+1] = Jndf + J_ceq[nCorsCols]; for(k = 0 ; k < nParI ; k++ ) dirtyCache[k*PROM_MAXNIEGH + nCorsCols] = 0; coljj = nCorsCols++; // new column } // get pointer -- col. storage!! int eqJ0=J_ceq[coljj],ndfJ=J_ceq[coljj+1]-eqJ0,neqI=I_ceq[nParI],eqI0=I_ceq[II]; tt = neqI*eqJ0 + eqI0*ndfJ; double *tp = &cache[tt]; // if( dirtyCache[II*PROM_MAXNIEGH + coljj] == 0 ) { dirtyCache[II*PROM_MAXNIEGH + coljj] = 1; int ndf = I_ceq[II+1] - eqI0, sz = ndfJ*ndf; for(k=0;kstiffness_, *AA = stiffness_; double Bi, Bj, scale; assert(AA!=NULL && LnA!=NULL); const int fact=AA->isComplex() ? 2:1; assert(AA->isComplex() == LnA->isComplex()); double *Ai[PROM_MAXNIEGH]; PromComplexMatrix *K; int nCorsCols,numpartot=0,adjacs[PROM_MAXNIEGH]; double numops = 0.0, *aIJ; #if defined(PROM_USE_CACHE) int parGIDcache[PROM_MAXNIEGH], J_ceq[PROM_MAXNIEGH]; int parGIDcache_i[PROM_MAXNIEGH], J_ceq_i[PROM_MAXNIEGH]; int I_ceq[PROM_MAXPAR_HARD], nCorsCols_i; double *cache, *cache_i; I_ceq[0] = J_ceq[0] = J_ceq_i[0] = 0; #else int I,J,i,j,k,kk; #endif PetscFunctionBegin; #if defined(PROM_USE_CACHE) II = nnext->isConstNDF() ? nnext->constNDFSize() : PROM_MAXNDF; int maxarrsz = 15*PROM_MAXPAR_SOFT, buffsz = maxarrsz*II*II; ierr = PetscMalloc((buffsz+1)*sizeof(double),&cache); while( ierr != 0 && buffsz > 5*PROM_MAXPAR_SOFT*II*II ){ buffsz = buffsz/2+1; ierr = PetscMalloc((buffsz+1)*sizeof(double),&cache); } CHKERRQ(ierr); cache[buffsz] = -13.; II = PROM_MAXPAR_HARD*PROM_MAXNIEGH; bool *dirtyCache, *dirtyCache_i; ierr = PetscMalloc(II*sizeof(bool),&dirtyCache); CHKERRQ(ierr); if( AA->isComplex() ) { ierr = PetscMalloc(II*sizeof(bool),&dirtyCache_i); CHKERRQ(ierr); ierr = PetscMalloc((buffsz+1)*sizeof(double),&cache_i); CHKERRQ(ierr); cache_i[buffsz] = -13.; K = dynamic_cast(LnA); assert(K != NULL); } else{ K = NULL; cache_i = NULL; dirtyCache_i = NULL; } #endif // loop over all the fine (curr) nodes (i) for( xx = 0 ; xx < nLocal ; xx++ ) { const PromNode *finei = nodes_[xx]; const int fndfi = finei->getNDF(); PromParent * const * parI = finei->parents_->data_; const int nparI = finei->nParents(); #if defined(PROM_USE_CACHE) for( II = 0 ; II < nparI ; II++) { I_ceq[II+1] = I_ceq[II] + parI[II]->getNDF(); // cndf } #endif // ROW OF FINE MATRIX ierr = AA->GetLocalNodeRow( xx, PROM_MAXNIEGH, ncols, adjacs, Ai ); CHKERRQ(ierr); if ( ncols > PROM_MAXNIEGH-100 ){ PetscPrintf(PETSC_COMM_SELF,"\t\t\t[%d]%s WARNING ncols=%d/%d\n", mype,__FUNCT__,ncols,PROM_MAXNIEGH); } // loop over all the fine nodes (j) - includes self for( nCorsCols_i = nCorsCols = 0, col = 0; col < ncols ; col++ ) { const int gidj = adjacs[col], lid = gidj - my0; PromParentArr *pararrJ=NULL; PromNode *finej=NULL; // put fndfj --> 'II' if( lid >= 0 && lid < nLocal ) { finej=nodes_[lid]; pararrJ = nodes_[lid]->parents_; II=finej->getNDF(); } else { finej = ghosts_.find(gidj+1); if( finej != NULL ){ pararrJ = finej->parents_; II = finej->getNDF(); } else { // constriant edges assert( gid_extraParents_ != NULL ); PromParentArr_extra *gpararr = gid_extraParents_->find(gidj+1); assert(gpararr!=NULL); pararrJ = gpararr; //assert(gpararr->getGID() == gidj); II = gpararr->getNDF(); } } const int fndfj = II, fndf2 = fndfi*fndfj, nparJ = pararrJ->getSize(); const PromParent * const * parJ = pararrJ->data_; // for all coarse nodes (J) total = 0.0; for( int JJ = 0 ; JJ < nparJ ; JJ++ ){ const PromParent * const parJJ = parJ[JJ]; const bool scalarJshape = (parJJ->getPsz() == 1); int cndfJ = parJJ->getNDF(), parJgID = parJJ->global(); const shpfloat *shjT, *shiT; if( !scalarJshape ) { if( parJJ->DAP_ != NULL ) shjT = parJJ->DAP_; else{ shjT = parJJ->P0_; if( shjT == NULL ) continue; // outer loop } } else shjT = &parJJ->shape_; // for all coarse nodes (I) for( II = 0 ; II < nparI ; II++ ){ const PromParent * const parII = parI[II]; const bool scalarshape = (scalarJshape && (parII->getPsz()==1)); int cndfI = parII->getNDF(); // check for zero shape -- before get cache entry!!! if( scalarshape ) { Bj = (parJJ->DAP_!=NULL) ? *parJJ->DAP_ : parJJ->getScalarP0(); Bi = (parII->DAP_!=NULL) ? *parII->DAP_ : parII->getScalarP0(); scale = Bi * Bj; //total += scale; if( fabs(scale) < 1.e-12 ) continue; } else { if( parII->DAP_ != NULL ) shiT = parII->DAP_; else { shiT = parII->P0_; if(shiT == NULL) continue; // inner loop } } // rap_IJ into aIJ // add to cache - see if a new I,J for( int qq = 0 ; qq < fact ; qq++ ) { #if defined(PROM_USE_CACHE) if( qq == 0 ) { aIJ = GetCacheEntry(cache, nparI, I_ceq, J_ceq, nCorsCols, dirtyCache,parGIDcache, parJgID,cndfJ,II); CHKERRQ(ierr); } else { aIJ = GetCacheEntry(cache_i, nparI, I_ceq, J_ceq_i, nCorsCols_i, dirtyCache_i,parGIDcache_i,parJgID,cndfJ,II); CHKERRQ(ierr); assert(cache_i[buffsz] == -13.); } #else double aIJ_bf__[4*PROM_MAXNDF*PROM_MAXNDF]; // buffer aIJ = aIJ_bf__; k = cndfI*cndfJ; while(k--) aIJ[k] = 0.0; // zero out #endif // get buffer 'tAij' double tAij[4*PROM_MAXNDF*PROM_MAXNDF]; { const double *aijT = Ai[col]; // transpose if( fact == 1 ) { for(int j,i=0,id=0,idi=0 ; i < fndfi ; i++ ){ for(j=0;jglobal(); // add #if !defined(PROM_PETSC_GLOBAL) ierr = nnext->AddValueTable(I, J, cndfI, cndfJ, aIJ, parII->proc(), (qq==0) ? false : true ); CHKERRQ(ierr); #else // add in manually if( qq == 0 ) { ierr = LnA->SetValuesBlocked(1, &I, 1, &J, aIJ, ADD_VALUES); CHKERRQ(ierr); } else { ierr = K->SetValuesBlocked_i(1, &I, 1, &J, aIJ, ADD_VALUES); CHKERRQ(ierr); } #endif #endif // check cache size if( (nCorsCols+1)*nparI > maxarrsz ){ if(vb>1){ PetscPrintf( (vb>2) ? PETSC_COMM_SELF : PETSC_COMM_WORLD, "[%d]%s rehash cache %d/%d\n", mype,__FUNCT__,(nCorsCols+1)*nparI,maxarrsz); } // copy over assert(cache[buffsz] == -13.); ierr = nnext->AddCache( cache, nparI, I_ceq, J_ceq, nCorsCols, parI, dirtyCache, parGIDcache ); CHKERRQ(ierr); maxarrsz = (nCorsCols+30)*nparI; nCorsCols = 0; PetscFree(cache); if( AA->isComplex() ) { assert(cache_i[buffsz] == -13.); ierr = nnext->AddCache(cache_i, nparI, I_ceq,J_ceq_i,nCorsCols_i, parI, dirtyCache_i, parGIDcache_i, true ); CHKERRQ(ierr); nCorsCols_i = 0; PetscFree(cache_i); } // make new buffer double *tcach; int ndf=nnext->isConstNDF() ? nnext->constNDFSize() : PROM_MAXNDF; buffsz = maxarrsz*ndf*ndf; ierr = PetscMalloc((buffsz+1)*sizeof(double),&tcach);CHKERRQ(ierr); cache = tcach; cache[buffsz] = -13.; if( AA->isComplex() ) { ierr = PetscMalloc((buffsz+1)*sizeof(double), &tcach); CHKERRQ(ierr); cache_i = tcach; cache_i[buffsz] = -13.; } } } // 'qq' complex loop } // coarse nodes II } // coarse nodes JJ } // col assert(I_ceq[0] == 0 && J_ceq[0] == 0 && J_ceq_i[0] == 0); // add cache #if defined(PROM_USE_CACHE) ierr = nnext->AddCache( cache, nparI, I_ceq, J_ceq, nCorsCols, parI, dirtyCache, parGIDcache ); CHKERRQ(ierr); if( AA->isComplex() ) { assert(cache_i[buffsz] == -13.); ierr = nnext->AddCache( cache_i, nparI, I_ceq, J_ceq_i, nCorsCols_i, parI, dirtyCache_i, parGIDcache_i, true ); CHKERRQ(ierr); } #endif numpartot += nCorsCols; } // fine nodes (i) #if defined(PROM_USE_CACHE) assert(cache[buffsz] == -13.); PetscFree(cache); PetscFree(dirtyCache); if( AA->isComplex() ) { assert(cache_i[buffsz] == -13.); PetscFree(cache_i); PetscFree(dirtyCache_i); } #endif /* diagnostics */ numops *= 2.0; if( AA->isComplex() ) numops *= 2.0; while( numops > 2e9 ){ perf_mon_.Flops(2000000000); numops -= 2e9; } perf_mon_.Flops( (int)numops ); PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "AddCache" /* PromGrid::AddCache *************************************************** * * - add cache to a matrix, zero fixed * * INPUT: * - cache[PROM_MAXNIEGH*nParI*cndf2]: cache to add products (in/out) * - nParI: * - I_ceq: * - J_ceq: * - nCorsCols: * - parI: * - dirtyCache: flags for dirty cells in cache. * - parGIDcache: global parents. * - iscomplex: * * SIDE EFFECTS: * RETURN: * - PETSc error code */ int PromGrid::AddCache(double cache[], const int nParI, const int I_ceq[], const int J_ceq[], const int nCorsCols, const PromParent * const * parI, const bool dirtyCache[], const int parGIDcache[PROM_MAXNIEGH],const bool iscmplx ) { int II,ierr,coljj; double *pv; const int neqI = I_ceq[nParI]; // add cache to A_{i+1} for( II = 0 ; II < nParI ; II++ ){ int eq0I = I_ceq[II], cndfI = I_ceq[II+1] - eq0I; const PromParent * const pari = parI[II]; int I = pari->global(), proci = pari->proc(); for( coljj = 0 ; coljj < nCorsCols ; coljj++ ) { if( dirtyCache[II*PROM_MAXNIEGH + coljj] ) { int J = parGIDcache[coljj]; int eq0J = J_ceq[coljj], cndfJ = J_ceq[coljj+1] - eq0J; // add pv = &cache[neqI*eq0J + eq0I*cndfJ]; #if !defined(PROM_PETSC_GLOBAL) ierr = AddValueTable( I, J, cndfI, cndfJ, pv, proci, iscmplx ); CHKERRQ(ierr); #else // standard assembly if( iscomplex ) { PromComplexMatrix *K = dynamic_cast(stiffness_); assert(K != NULL); ierr = K->SetValuesBlocked_i(1, &I, 1, &J, pv, ADD_VALUES); CHKERRQ(ierr); } else{ ierr = stiffness_->SetValuesBlocked(1,&I,1,&J,pv,ADD_VALUES); CHKERRQ(ierr); } #endif } } } return 0; } #endif #undef __FUNCT__ #define __FUNCT__ "AddValueTable" /* PromGrid::AddValueTable ************************************ * * - add (block) value to processor cache * * INPUT: * - I: global row * - J: global column * - ndfI: * - ndfJ: * - aIJ: value * - proci: proc to add to * - iscomplex: * * SIDE EFFECTS: * - adds to stiffness_->gid_ndf_ * * RETURN: * - PETSc error code */ int PromGrid::AddValueTable( const int I, const int J, const int ndfI, const int ndfJ, double *aIJ, const int proci, const bool iscomplex ) { int ierr,ii; PromTable*> *rowi_table_rowj; #if defined(PROM_PETSC_LOCAL) if( proci == comm_.mype() ) { int ii = I, jj = J; if( iscomplex ) { PromComplexMatrix *K = dynamic_cast(stiffness_); assert(K != NULL); ierr = K->SetValuesBlocked_i( 1 ,&ii ,1 ,&jj ,aIJ ,ADD_VALUES); CHKERRQ(ierr); } else { ierr = stiffness_->SetValuesBlocked( 1 ,&ii ,1 ,&jj ,aIJ ,ADD_VALUES); CHKERRQ(ierr); } return 0; } #endif if( iscomplex ) { rowi_table_rowj = stiffness_->proci_table_gidi_tablej_i_.find(proci+1); if( rowi_table_rowj == NULL ) { rowi_table_rowj = new PromTable*>(nlocal()/2+319); CHKNEW(rowi_table_rowj,1); stiffness_->proci_table_gidi_tablej_i_.add( rowi_table_rowj, proci+1 ); } } else{ rowi_table_rowj = stiffness_->proci_table_gidi_tablej_.find(proci+1); if( rowi_table_rowj == NULL ) { rowi_table_rowj = new PromTable*>(nlocal()/2+319); CHKNEW(rowi_table_rowj,1); stiffness_->proci_table_gidi_tablej_.add( rowi_table_rowj, proci+1 ); } } PromTable *rowj_vals = rowi_table_rowj->find(I+1); if( !rowj_vals ) { rowj_vals = new PromTable(191); CHKNEW(rowj_vals,1); rowi_table_rowj->add( rowj_vals, I+1 ); stiffness_->gid_ndf_->add( ndfI, I+1 ); // has complex 'factor' } double *vals = rowj_vals->find( J+1 ); if( vals == NULL ) stiffness_->gid_ndf_->add( ndfJ, J+1 ); if( ndfI == 1 && ndfJ == 1 ) { if( vals==NULL ) { ierr = PetscMalloc( sizeof(double), &vals ); CHKERRQ(ierr); rowj_vals->add( vals, J+1 ); vals[0] = aIJ[0]; } else vals[0] += aIJ[0]; } else if( ndfI == 3 && ndfJ == 3 ) { if( vals==NULL ) { ierr = PetscMalloc( sizeof(double)*9, &vals ); CHKERRQ(ierr); rowj_vals->add( vals, J+1 ); vals[0] = aIJ[0]; vals[1] = aIJ[1]; vals[2] = aIJ[2]; vals[3] = aIJ[3]; vals[4] = aIJ[4]; vals[5] = aIJ[5]; vals[6] = aIJ[6]; vals[7] = aIJ[7]; vals[8] = aIJ[8]; } else { vals[0] += aIJ[0]; vals[1] += aIJ[1]; vals[2] += aIJ[2]; vals[3] += aIJ[3]; vals[4] += aIJ[4]; vals[5] += aIJ[5]; vals[6] += aIJ[6]; vals[7] += aIJ[7]; vals[8] += aIJ[8]; } } else if ( ndfI == 6 && ndfJ == 6 ){ if( vals==NULL ) { ierr = PetscMalloc( sizeof(double)*36, &vals ); CHKERRQ(ierr); rowj_vals->add( vals, J+1 ); vals[0] = aIJ[0]; vals[1] = aIJ[1]; vals[2] = aIJ[2]; vals[3] = aIJ[3]; vals[4] = aIJ[4]; vals[5] = aIJ[5]; vals[6] = aIJ[6]; vals[7] = aIJ[7]; vals[8] = aIJ[8]; vals[9] = aIJ[9]; vals[10] = aIJ[10]; vals[11] = aIJ[11]; vals[12] = aIJ[12]; vals[13] = aIJ[13]; vals[14] = aIJ[14]; vals[15] = aIJ[15]; vals[16] = aIJ[16]; vals[17] = aIJ[17]; vals[18] = aIJ[18]; vals[19] = aIJ[19]; vals[20] = aIJ[20]; vals[21] = aIJ[21]; vals[22] = aIJ[22]; vals[23] = aIJ[23]; vals[24] = aIJ[24]; vals[25] = aIJ[25]; vals[26] = aIJ[26]; vals[27] = aIJ[27]; vals[28] = aIJ[28]; vals[29] = aIJ[29]; vals[30] = aIJ[30]; vals[31] = aIJ[31]; vals[32] = aIJ[32]; vals[33] = aIJ[33]; vals[34] = aIJ[34]; vals[35] = aIJ[35]; } else { vals[0] += aIJ[0]; vals[1] += aIJ[1]; vals[2] += aIJ[2]; vals[3] += aIJ[3]; vals[4] += aIJ[4]; vals[5] += aIJ[5]; vals[6] += aIJ[6]; vals[7] += aIJ[7]; vals[8] += aIJ[8]; vals[9] += aIJ[9]; vals[10] += aIJ[10]; vals[11] += aIJ[11]; vals[12] += aIJ[12]; vals[13] += aIJ[13]; vals[14] += aIJ[14]; vals[15] += aIJ[15]; vals[16] += aIJ[16]; vals[17] += aIJ[17]; vals[18] += aIJ[18]; vals[19] += aIJ[19]; vals[20] += aIJ[20]; vals[21] += aIJ[21]; vals[22] += aIJ[22]; vals[23] += aIJ[23]; vals[24] += aIJ[24]; vals[25] += aIJ[25]; vals[26] += aIJ[26]; vals[27] += aIJ[27]; vals[28] += aIJ[28]; vals[29] += aIJ[29]; vals[30] += aIJ[30]; vals[31] += aIJ[31]; vals[32] += aIJ[32]; vals[33] += aIJ[33]; vals[34] += aIJ[34]; vals[35] += aIJ[35]; } } else { int sz = ndfJ*ndfI; if( vals == NULL ) { ierr = PetscMalloc( sz*sizeof(double), &vals ); CHKERRQ(ierr); rowj_vals->add( vals, J+1 ); for( ii = 0 ; ii < sz ; ii++ ) vals[ii] = aIJ[ii]; } else { for( ii = 0 ; ii < sz ; ii++ ) vals[ii] += aIJ[ii]; } } return 0; } #undef __FUNCT__ #define __FUNCT__ "AssembleTable" /* PromGrid::AssembleTablse *********************************************** * * - do assembly ops -- off(all) processor values * * INPUT: * - lastG: * * SIDE EFFECTS: * * RETURN: * - PETSc error code */ int PromGrid::AssembleTable( PromGrid *lastG ) { if( !lastG->isActive()) return 0; const int mype = comm_.mype(), np = comm_.np(), vb = options_.verbose_; int ierr,colj,ii,proc,rowi,ndfI,ndfJ,tt=masterProc(mype); const int fsz=sizeof(double)/sizeof(unsigned int); MPI_Status status; const int tag = lastG->getNewTag(); TablePos tpos; tt = sizeof(MPI_Request); ii = sizeof(double); const int reqsz = (tt/ii + ((tt%ii == 0) ? 0 : 1)) * fsz; PromList smessages, rmessages; PromTable*> *rowi_table_rowj,*rowi_table_rowj_i; PromTable *rowj_vals; int sbuf[PROM_BSZ],rbuf[PROM_BSZ]; bool determinate = FALSE; ierr = options_.HasOptionName("-prometheus_use_deterministic",&determinate); // send sizes if( stiffness_->pRecvDone_ == FALSE ) { stiffness_->pRecvDone_ = TRUE; // count recv procs const int lnp = np/lastG->factor_; for( ii = 0 ; ii < lnp ; ii++ ) sbuf[ii] = 0; tpos = stiffness_->proci_table_gidi_tablej_.getHeadPosition(); while( tpos ) { rowi_table_rowj=stiffness_->proci_table_gidi_tablej_.getNext(tpos,&proc); proc--; if( proc != mype ){ assert(sbuf[proc/lastG->factor_] == 0); TablePos tpos2 = rowi_table_rowj->getHeadPosition(); tt = 0; while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext( tpos2, &rowi ); ndfI = stiffness_->gid_ndf_->find( rowi ); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { rowj_vals->getNext( tpos3, &colj ); ndfJ = stiffness_->gid_ndf_->find(colj); tt += 2 + ndfJ*ndfI*fsz; } } sbuf[proc/lastG->factor_] = tt + 2; } } // imaginary part tpos = stiffness_->proci_table_gidi_tablej_i_.getHeadPosition(); while( tpos ) { rowi_table_rowj=stiffness_->proci_table_gidi_tablej_i_.getNext(tpos,&proc); proc--; if( proc != mype ){ TablePos tpos2 = rowi_table_rowj->getHeadPosition(); tt = 0; while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext( tpos2, &rowi ); ndfI = stiffness_->gid_ndf_->find( rowi ); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { rowj_vals->getNext( tpos3, &colj ); ndfJ = stiffness_->gid_ndf_->find(colj); tt += 2 + ndfJ*ndfI*fsz; } } sbuf[proc/lastG->factor_] += tt; // plus! } } // global!!! MPI_Alltoall( sbuf, 1, MPI_INT, rbuf, 1, MPI_INT, lastG->gComm_ ); for(ii=0,stiffness_->numRecv_=0;iinumRecv_++; } if( stiffness_->numRecv_ > 0 ) { if( stiffness_->recvSzs_ != NULL ){ PetscFree(stiffness_->recvSzs_); PetscFree(stiffness_->recvParts_); } ierr = PetscMalloc(stiffness_->numRecv_*sizeof(int),&stiffness_->recvSzs_ ); CHKERRQ(ierr); ierr = PetscMalloc(stiffness_->numRecv_*sizeof(int),&stiffness_->recvParts_ ); CHKERRQ(ierr); } else stiffness_->recvSzs_ = stiffness_->recvParts_ = NULL; for( ii = 0, stiffness_->numRecv_ = 0 ; ii < lnp ; ii++ ) { if( rbuf[ii] != 0 ){ stiffness_->recvSzs_[stiffness_->numRecv_] = rbuf[ii]; stiffness_->recvParts_[stiffness_->numRecv_] = ii; stiffness_->numRecv_++; } } if(vb>1)PetscPrintf(PETSC_COMM_WORLD,"[%d]%s(%d) numRecv_=%d np=%d\n", mype,__FUNCT__,getLevel(),stiffness_->numRecv_, stiffness_->proci_table_gidi_tablej_.getCount() ); } // post recvs for( ii = 0 ; ii < stiffness_->numRecv_ ; ii++ ) { tt = stiffness_->recvSzs_[ii]; MPI_Request *rreq; ierr = PetscMalloc( (tt+reqsz)*sizeof(int), &rreq ); CHKERRQ(ierr); int *buff = (int*)rreq + reqsz,proc=stiffness_->recvParts_[ii]; MPI_Irecv(buff,tt,MPI_UNSIGNED,proc,tag,lastG->gComm_,rreq); rmessages.addTail( rreq ); } // send dat tpos = stiffness_->proci_table_gidi_tablej_.getHeadPosition(); while( tpos ) { rowi_table_rowj = stiffness_->proci_table_gidi_tablej_.getNext(tpos,&tt); const int proc = tt - 1; assert(proc%factor_ == 0); if( proc == mype ) continue; // count data tt = 0; TablePos tpos2 = rowi_table_rowj->getHeadPosition(); while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext(tpos2,&rowi); ndfI = stiffness_->gid_ndf_->find(rowi); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { rowj_vals->getNext( tpos3, &colj ); ndfJ = stiffness_->gid_ndf_->find(colj); tt += 2 + ndfJ*ndfI*fsz; } } tt += 2; // flag // imaginary part if((rowi_table_rowj_i=stiffness_->proci_table_gidi_tablej_i_.find(proc+1))){ tpos2 = rowi_table_rowj_i->getHeadPosition(); while( tpos2 ) { rowj_vals = rowi_table_rowj_i->getNext(tpos2,&rowi); ndfI = stiffness_->gid_ndf_->find(rowi); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { rowj_vals->getNext( tpos3, &colj ); ndfJ = stiffness_->gid_ndf_->find(colj); tt += 2 + ndfJ*ndfI*fsz; } } } MPI_Request *sreq; ierr = PetscMalloc((tt+reqsz)*sizeof(int),&sreq); CHKERRQ(ierr); smessages.addTail( sreq ); int *buff = (int*)sreq + reqsz, *pb = buff; // sends data tpos2 = rowi_table_rowj->getHeadPosition(); while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext(tpos2, &rowi); rowi--; ndfI = stiffness_->gid_ndf_->find(rowi+1); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { double *vals = rowj_vals->getNext( tpos3, &colj ); colj--; ndfJ = stiffness_->gid_ndf_->find(colj+1); // copy in *pb++ = (int)(ndfI-1) + (int)rowi * PROM_MAXNDF; *pb++ = (int)(ndfJ-1) + (int)colj * PROM_MAXNDF; double *fp = (double*)pb, *vp = vals; int sz = ndfJ*ndfI; for( int hh = 0 ; hh < sz ; hh++ ) *fp++ = *vp++; pb = (int*)fp; #if defined(PROM_CLEAN_VALS) PetscFree(vals); #else PetscMemzero( (char*)vals, sz*sizeof(double) ); #endif } // j #if defined(PROM_CLEAN_VALS) rowj_vals->removeAll(); // will get delte of cle... #endif } // send imaginary if( rowi_table_rowj_i == NULL ){ *pb++ = -2; *pb++ = -3; } else{ *pb++ = -1; *pb++ = -3; tpos2 = rowi_table_rowj_i->getHeadPosition(); while( tpos2 ) { rowj_vals = rowi_table_rowj_i->getNext(tpos2, &rowi); rowi--; ndfI = stiffness_->gid_ndf_->find(rowi+1); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { double *vals = rowj_vals->getNext( tpos3, &colj ); colj--; ndfJ = stiffness_->gid_ndf_->find(colj+1); // copy in *pb++ = (int)(ndfI-1) + (int)rowi * PROM_MAXNDF; *pb++ = (int)(ndfJ-1) + (int)colj * PROM_MAXNDF; double *fp = (double*)pb, *vp = vals; int sz = ndfJ*ndfI; for( int hh = 0 ; hh < sz ; hh++ ) *fp++ = *vp++; pb = (int*)fp; #if defined(PROM_CLEAN_VALS) PetscFree(vals); #else PetscMemzero( (char*)vals, sz*sizeof(double) ); #endif } // j #if defined(PROM_CLEAN_VALS) rowj_vals->removeAll(); // will get delte of cle... #endif } } assert(pb-buff == tt); // send(tag) *< gidI, gidJ, data > MPI_Isend(buff,tt,MPI_UNSIGNED,proc/lastG->factor_,tag,lastG->gComm_,sreq); // should clean out rowi_table_rowj if dynamic } // master -- receive while( !rmessages.isEmpty() ) { MPI_Request *rreq = rmessages.removeHead(); if( determinate ) { MPI_Wait( rreq, &status ); } else { MPI_Test( rreq, &ii, &status ); if( !ii ){ rmessages.addTail( rreq ); continue; } } MPI_Get_count( &status, MPI_INT, &tt ); int *buff = (int*)rreq + reqsz, *pb = buff, iscmplx=0; // recv(tag) *< gidI, gidJ, data > while( pb-buff < tt ) { int urowi = *pb++, ucolj = *pb++; if( urowi < 0 ) { if( urowi == -2 ) break; else{ assert(iscmplx==0); iscmplx=1; urowi = *pb++; ucolj = *pb++; } } ndfI = urowi % PROM_MAXNDF + 1; rowi = urowi / PROM_MAXNDF; ndfJ = ucolj % PROM_MAXNDF + 1; colj = ucolj / PROM_MAXNDF; int sz = ndfJ*ndfI; double *vp = (double*)pb; if( iscmplx == 0 ) { ierr = stiffness_->SetValuesBlocked(1,&rowi,1,&colj,vp,ADD_VALUES); CHKERRQ(ierr); } else { PromComplexMatrix *K = dynamic_cast(stiffness_); assert(K != NULL); ierr = K->SetValuesBlocked_i(1, &rowi, 1, &colj, vp, ADD_VALUES); CHKERRQ(ierr); } pb = (int*)(vp + sz); } if(pb-buff != tt){ SETERRQ2(PETSC_COMM_SELF,1,"pb-buff(%d) != tt(%d)",pb-buff,tt); } PetscFree( rreq ); } // do real local assembly (!PROM_PETSC_LOCAL) rowi_table_rowj = stiffness_->proci_table_gidi_tablej_.find(mype+1); if( rowi_table_rowj ){ TablePos tpos2 = rowi_table_rowj->getHeadPosition(); assert(tpos2!=0); while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext(tpos2, &rowi); rowi--; ndfI = stiffness_->gid_ndf_->find( rowi + 1 ); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { double *vs = rowj_vals->getNext( tpos3, &colj ); colj--; ndfJ = stiffness_->gid_ndf_->find( colj + 1 ); ierr = stiffness_->SetValuesBlocked(1, &rowi, 1, &colj, vs, ADD_VALUES); CHKERRQ(ierr); // clean up #if defined(PROM_CLEAN_VALS) PetscFree(vs); #else int sz = ndfJ*ndfI; PetscMemzero( (char*)vs, sz*sizeof(double) ); #endif } #if defined(PROM_CLEAN_VALS) rowj_vals->removeAll(); // will get delte of cle... #endif } } // do imaginary rowi_table_rowj = stiffness_->proci_table_gidi_tablej_i_.find(mype+1); if( rowi_table_rowj ){ PromComplexMatrix *K = dynamic_cast(stiffness_); assert(K != NULL); TablePos tpos2 = rowi_table_rowj->getHeadPosition(); assert(tpos2!=0); while( tpos2 ) { rowj_vals = rowi_table_rowj->getNext(tpos2, &rowi); rowi--; ndfI = stiffness_->gid_ndf_->find( rowi + 1 ); TablePos tpos3 = rowj_vals->getHeadPosition(); while( tpos3 ) { double *vs = rowj_vals->getNext( tpos3, &colj ); colj--; ndfJ = stiffness_->gid_ndf_->find( colj + 1 ); ierr = K->SetValuesBlocked_i(1, &rowi, 1, &colj, vs, ADD_VALUES); CHKERRQ(ierr); // clean up #if defined(PROM_CLEAN_VALS) PetscFree(vs); #else int sz = ndfJ*ndfI; PetscMemzero( (char*)vs, sz*sizeof(double) ); #endif } #if defined(PROM_CLEAN_VALS) rowj_vals->removeAll(); // will get delte of cle... #endif } } // clean out, for memory -- keep for efficiency? ii = stiffness_->proci_table_gidi_tablej_.getCount(); #if defined(PROM_CLEAN_LISTS) #if !defined(PROM_CLEAN_VALS) #error requires 'PROM_CLEAN_VALS' #endif while( !stiffness_->proci_table_gidi_tablej_.isEmpty() ) { PromTable*> *rowi_table_rowj; rowi_table_rowj = stiffness_->proci_table_gidi_tablej_.removeHead(); while( !rowi_table_rowj->isEmpty() ) { PromTable *rowj_vals = rowi_table_rowj->removeHead(); assert(rowj_vals->isEmpty()); delete rowj_vals; } delete rowi_table_rowj; } stiffness_->proci_table_gidi_tablej_.removeAll(); // while( !stiffness_->proci_table_gidi_tablej_i_.isEmpty() ) { PromTable*> *rowi_table_rowj; rowi_table_rowj = stiffness_->proci_table_gidi_tablej_i_.removeHead(); while( !rowi_table_rowj->isEmpty() ) { PromTable *rowj_vals = rowi_table_rowj->removeHead(); assert(rowj_vals->isEmpty()); delete rowj_vals; } delete rowi_table_rowj; } stiffness_->proci_table_gidi_tablej_i_.removeAll(); #endif // free while( !smessages.isEmpty() ) { MPI_Request *sreq = smessages.removeHead(); MPI_Wait( sreq, &status ); PetscFree( sreq ); } MPI_Iprobe( MPI_ANY_SOURCE, tag, lastG->gComm_, &tt, &status ); assert(!tt); return 0; }