#include "petscmat.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char help[]=
  "Reads in a matrix in LibSVM format and dumps them into Petsc binary files \n"
  "Input parameters are:\n"
  "  -in : input file in libsvm format\n"
  "  -tin : input test file in libsvm format (optional)\n"
  "  -data : output file which contains data in Petsc binary format\n"
  "  -labels : output file which contains labels in Petsc binary format\n"
  "  -tdata : output test file which contains data in Petsc binary format (optional)\n"
  "  -tlabels : output test file which contains labels in Petsc binary format (optional)\n\n";

// Everything the converter knows about one LibSVM file.
typedef struct{
  PetscInt dim;        // Number of columns (largest index seen + 1, + 1 more with bias)
  PetscInt m;          // Number of data points (rows)
  PetscInt maxnnz;     // Maximum number of nonzeros in any row
  int maxlen;          // Current line-buffer capacity in characters; it is
                       // an int because readline() grows it through an int*
  Vec labels;          // Store labels
  Mat data;            // Store data
  // Intermediate storage: nonzeros per row. Owned by malloc/realloc/free,
  // never by PetscMalloc/PetscFree, because it is grown with realloc.
  PetscInt *nnz_array;
  // Add an extra (bias) dimension?
  PetscTruth flg;
} AppCtx;

// Read one complete line from input, growing the buffer as needed.
//
// line:   buffer of *maxlen characters obtained from malloc/realloc.
// maxlen: in/out capacity of line; doubled each time the line does not fit.
// input:  file to read from.
//
// Return: pointer to the (possibly relocated) buffer holding the line, or
//         NULL when nothing could be read (end of file or read error). On
//         NULL the buffer passed in is untouched and still owned by the
//         caller, so callers must not overwrite their only pointer to it
//         with the return value before checking for NULL.
static char* readline(char* line,int* maxlen,FILE *input){
  if (fgets(line,*maxlen,input)==NULL) return NULL;

  // No newline in the buffer means fgets stopped because the buffer was
  // full (or the last line of the file has no terminating newline).
  while (strrchr(line,'\n')==NULL){
    int new_len=*maxlen*2;
    char* grown=(char *) realloc(line,(size_t) new_len);
    if (grown==NULL){
      // A truncated row would silently corrupt the matrix, so give up.
      fprintf(stderr,"readline: unable to grow line buffer to %d characters\n",new_len);
      MPI_Abort(PETSC_COMM_WORLD,1);
      return NULL;
    }
    line=grown;
    *maxlen=new_len;

    // Append the remainder of the line after what has been read so far.
    size_t read_len=strlen(line);
    if (fgets(line+read_len,*maxlen-(int) read_len,input)==NULL) break;
  }
  return line;
}

#undef __FUNCT__
#define __FUNCT__ "parse_file"
// First pass over a LibSVM file: count rows, nonzeros per row, the largest
// column index and the longest line. Fills user->dim, user->m,
// user->maxnnz, user->maxlen and user->nnz_array (malloc-owned; the caller
// releases it with free). user->flg must be set before the call. The file
// position is left at end of file.
static PetscErrorCode parse_file(FILE* input,AppCtx* user){
  user->maxlen=1024;
  user->dim=-1;
  user->m=0;
  user->maxnnz=0;
  user->nnz_array=NULL;

  char* line=(char *) malloc((size_t) user->maxlen);
  if (line==NULL) SETERRQ(PETSC_ERR_MEM,"Unable to allocate line buffer");

  PetscInt m_guess=128;
  user->nnz_array=(PetscInt *) malloc((size_t) m_guess*sizeof(PetscInt));
  if (user->nnz_array==NULL){
    free(line);
    SETERRQ(PETSC_ERR_MEM,"Unable to allocate nonzero counts");
  }

  char* next;
  while ((next=readline(line,&user->maxlen,input))!=NULL){
    line=next;
    // Read and ignore the label
    strtok(line," \t");

    // Count the index:value pairs on the rest of the line
    PetscInt nnz=0;
    while (1){
      char* pidx=strtok(NULL,":");
      char* pval=strtok(NULL," \t");
      if (pval==NULL) break;
      PetscInt idx=(PetscInt) strtol(pidx,NULL,10);
      if (idx>user->dim) user->dim=idx;
      nnz++;
    }
    // The bias column contributes one more nonzero to every row
    if (user->flg) nnz++;

    if (user->m>=m_guess){
      m_guess*=2;
      PetscInt* grown=(PetscInt *) realloc(user->nnz_array,(size_t) m_guess*sizeof(PetscInt));
      if (grown==NULL){
        free(line);
        SETERRQ(PETSC_ERR_MEM,"Unable to grow nonzero counts");
      }
      user->nnz_array=grown;
    }
    user->nnz_array[user->m]=nnz;
    if (nnz>user->maxnnz) user->maxnnz=nnz;
    user->m++;
  }

  // Indices are used directly as column numbers, so the column count is
  // the largest index plus one, plus one more for the bias column.
  user->dim+=user->flg ? 2 : 1;

  // Trim the array to its final size. Skipped for an empty file because
  // realloc(p,0) is implementation-defined.
  if (user->m>0){
    PetscInt* trimmed=(PetscInt *) realloc(user->nnz_array,(size_t) user->m*sizeof(PetscInt));
    if (trimmed!=NULL) user->nnz_array=trimmed;
  }

  free(line);
  return 0;
}

// Split one LibSVM line into its label and its index/value pairs.
//
// line:  the line to parse; modified in place by strtok.
// idxs:  output column indices; needs room for user->nnz_array[m] entries.
// vals:  output values, same length as idxs.
// label: output label. Parsed as a real number so that regression targets
//        are not truncated; 0 when the line carries no label token.
// m:     global row number of this line, used to cross-check the count
//        recorded by parse_file.
static void parse_line(char* line,PetscInt* idxs,PetscScalar* vals,PetscScalar* label,PetscInt m,AppCtx* user){
  // Read the label
  char *plabel=strtok(line," \t");
  *label=(plabel!=NULL) ? strtod(plabel,NULL) : 0.0;

  // Process the rest of the line
  PetscInt nnz=0;
  while (1){
    char* pidx=strtok(NULL,":");
    char* pval=strtok(NULL," \t");
    if (pval==NULL) break;
    vals[nnz]=strtod(pval,NULL);
    idxs[nnz]=(PetscInt) strtol(pidx,NULL,10);
    nnz++;
  }

  // Add the extra dimension and set it to one
  if (user->flg){
    vals[nnz]=1.0;
    idxs[nnz]=user->dim-1;
    nnz++;
  }

  // The file must not have changed since parse_file counted it
  assert(user->nnz_array[m]==nnz);
}

#undef __FUNCT__
#define __FUNCT__ "assemble_matrix"
// Second pass: read rows [begin,end) of input into user->labels and
// user->data, then assemble both. locm is the expected number of local
// rows (end-begin). input must be positioned at the start of the file.
static PetscErrorCode assemble_matrix(FILE* input,PetscInt begin,PetscInt end,PetscInt locm,AppCtx* user){
  PetscErrorCode info;

  char* line=(char *) malloc((size_t) user->maxlen);
  if (line==NULL) SETERRQ(PETSC_ERR_MEM,"Unable to allocate line buffer");

  PetscScalar* vals=0;
  info=PetscMalloc(user->maxnnz*sizeof(PetscScalar),&vals);CHKERRQ(info);
  PetscInt* idxs=0;
  info=PetscMalloc(user->maxnnz*sizeof(PetscInt),&idxs);CHKERRQ(info);

  PetscInt m=0;   // global row number of the line just read
  PetscInt ii=0;  // number of local rows stored so far
  char* next;
  while ((next=readline(line,&user->maxlen,input))!=NULL){
    line=next;
    // Skip the lines for which this processor is not responsible
    if (m<begin){
      m++;
      continue;
    }
    if (m>=end) break;

    PetscScalar label=0;
    parse_line(line,idxs,vals,&label,m,user);
    // Each row is written exactly once, so insert rather than accumulate
    info=VecSetValues(user->labels,1,&m,&label,INSERT_VALUES);CHKERRQ(info);
    info=MatSetValues(user->data,1,&m,user->nnz_array[m],idxs,vals,INSERT_VALUES);CHKERRQ(info);
    m++;
    ii++;
  }
  assert(ii==locm);

  info=PetscFree(vals);CHKERRQ(info);
  info=PetscFree(idxs);CHKERRQ(info);
  free(line);

  info=VecAssemblyBegin(user->labels);CHKERRQ(info);
  info=VecAssemblyEnd(user->labels);CHKERRQ(info);
  info=MatAssemblyBegin(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);
  info=MatAssemblyEnd(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "fill_arrays"
// Single-process path: create the label vector and a sequential AIJ
// matrix preallocated from user->nnz_array, then fill them from input.
static PetscErrorCode fill_arrays(FILE* input,AppCtx* user){
  PetscErrorCode info;

  // Allocate space for the labels
  info=VecCreate(PETSC_COMM_WORLD,&user->labels);CHKERRQ(info);
  info=VecSetSizes(user->labels,PETSC_DECIDE,user->m);CHKERRQ(info);
  info=VecSetFromOptions(user->labels);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->labels,"Labels");CHKERRQ(info);

  // Allocate space for the data
  info=MatCreate(PETSC_COMM_WORLD,&user->data);CHKERRQ(info);
  info=MatSetSizes(user->data,PETSC_DECIDE,PETSC_DECIDE,user->m,user->dim);CHKERRQ(info);
  info=MatSetFromOptions(user->data);CHKERRQ(info);
  info=MatSetType(user->data,MATSEQAIJ);CHKERRQ(info);
  info=MatSeqAIJSetPreallocation(user->data,0,user->nnz_array);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->data,"Data");CHKERRQ(info);

  info=assemble_matrix(input,0,user->m,user->m,user);CHKERRQ(info);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "reparse_file"
// Parallel preallocation pass: for every locally owned row, count how many
// nonzeros fall inside this process's column ownership range (diag_nnz)
// and how many fall outside it (offdiag_nnz). Both arrays need one entry
// per local row. input must be positioned at the start of the file; the
// caller rewinds it afterwards.
static PetscErrorCode reparse_file(FILE* input,PetscInt* diag_nnz,PetscInt* offdiag_nnz,AppCtx* user){
  PetscErrorCode info;

  char* line=(char *) malloc((size_t) user->maxlen);
  if (line==NULL) SETERRQ(PETSC_ERR_MEM,"Unable to allocate line buffer");

  PetscInt begin,end;
  info=MatGetOwnershipRange(user->data,&begin,&end);CHKERRQ(info);
  PetscInt locm=end-begin;
  PetscInt cbegin,cend;
  info=MatGetOwnershipRangeColumn(user->data,&cbegin,&cend);CHKERRQ(info);

  for (PetscInt i=0; i<locm; i++){
    diag_nnz[i]=0;
    offdiag_nnz[i]=0;
  }

  PetscInt m=0;   // global row number of the line just read
  PetscInt ii=0;  // local row number
  char* next;
  while ((next=readline(line,&user->maxlen,input))!=NULL){
    line=next;
    // Skip the lines for which this processor is not responsible
    if (m<begin){
      m++;
      continue;
    }
    if (m>=end) break;

    // Read and ignore the label
    strtok(line," \t");

    // Now process the rest of the line
    while (1){
      char* pidx=strtok(NULL,":");
      char* pval=strtok(NULL," \t");
      if (pval==NULL) break;
      PetscInt idx=(PetscInt) strtol(pidx,NULL,10);
      if (idx>=cbegin && idx<cend) diag_nnz[ii]++;
    }
    // The bias column lives at index dim-1
    if (user->flg && user->dim-1>=cbegin && user->dim-1<cend) diag_nnz[ii]++;

    // Whatever is not in the diagonal block is in the off-diagonal block
    offdiag_nnz[ii]=user->nnz_array[m]-diag_nnz[ii];
    m++;
    ii++;
  }

  free(line);
  assert(ii==locm);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "fill_arrays_parallel"
// Multi-process path: create an MPI label vector and an MPI AIJ matrix,
// preallocate the matrix from a counting pass over the local rows, then
// fill both from input.
static PetscErrorCode fill_arrays_parallel(FILE* input,AppCtx* user){
  PetscErrorCode info;

  // Create labels vector
  info=VecCreate(PETSC_COMM_WORLD,&user->labels);CHKERRQ(info);
  info=VecSetSizes(user->labels,PETSC_DECIDE,user->m);CHKERRQ(info);
  info=VecSetFromOptions(user->labels);CHKERRQ(info);
  info=VecSetType(user->labels,VECMPI);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->labels,"Labels");CHKERRQ(info);

  // Create data matrix
  info=MatCreate(PETSC_COMM_WORLD,&user->data);CHKERRQ(info);
  info=MatSetSizes(user->data,PETSC_DECIDE,PETSC_DECIDE,user->m,user->dim);CHKERRQ(info);
  info=MatSetFromOptions(user->data);CHKERRQ(info);
  info=MatSetType(user->data,MATMPIAIJ);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->data,"Data");CHKERRQ(info);

  // Allocate space for the data
  PetscInt begin,end;
  info=MatGetOwnershipRange(user->data,&begin,&end);CHKERRQ(info);
  PetscInt locm=end-begin;

  // Heap rather than stack: the local row count can be very large
  PetscInt* diag_nnz=0;
  PetscInt* offdiag_nnz=0;
  info=PetscMalloc(locm*sizeof(PetscInt),&diag_nnz);CHKERRQ(info);
  info=PetscMalloc(locm*sizeof(PetscInt),&offdiag_nnz);CHKERRQ(info);

  info=reparse_file(input,diag_nnz,offdiag_nnz,user);CHKERRQ(info);
  rewind(input);

  info=MatMPIAIJSetPreallocation(user->data,0,diag_nnz,0,offdiag_nnz);CHKERRQ(info);
  info=assemble_matrix(input,begin,end,locm,user);CHKERRQ(info);

  info=PetscFree(diag_nnz);CHKERRQ(info);
  info=PetscFree(offdiag_nnz);CHKERRQ(info);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "broadcast_ctx"
// Send the sizes and per-row nonzero counts gathered on rank 0 to every
// other rank. On ranks other than 0 this allocates ctx->nnz_array with
// malloc; the caller releases it with free.
static PetscErrorCode broadcast_ctx(AppCtx* ctx,PetscMPIInt rank){
  PetscErrorCode info;

  // PetscInt fields must travel as MPIU_INT, which tracks the width of
  // PetscInt; MPI_INT is only right for plain int.
  info=MPI_Bcast(&ctx->dim,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  info=MPI_Bcast(&ctx->m,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  info=MPI_Bcast(&ctx->maxnnz,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  info=MPI_Bcast(&ctx->maxlen,1,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);

  int flg=(int) ctx->flg;
  info=MPI_Bcast(&flg,1,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  ctx->flg=flg ? PETSC_TRUE : PETSC_FALSE;

  if (rank!=0){
    ctx->nnz_array=(PetscInt *) malloc((size_t) ctx->m*sizeof(PetscInt));
    if (ctx->m>0 && ctx->nnz_array==NULL) SETERRQ(PETSC_ERR_MEM,"Unable to allocate nonzero counts");
  }
  info=MPI_Bcast(ctx->nnz_array,(PetscMPIInt) ctx->m,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "main"
// Convert a LibSVM training file (and optionally a test file) into PETSc
// binary matrix and vector files. Rank 0 scans each file once to size
// things, the sizes are broadcast, and then every rank reads its own rows.
int main(int argc,char **args)
{
  AppCtx user;
  AppCtx tuser;
  PetscErrorCode info;
  char data_path[PETSC_MAX_PATH_LEN],bdata_path[PETSC_MAX_PATH_LEN],blabels_path[PETSC_MAX_PATH_LEN];
  char tdata_path[PETSC_MAX_PATH_LEN],btdata_path[PETSC_MAX_PATH_LEN],btlabels_path[PETSC_MAX_PATH_LEN];
  PetscTruth data_flg,bdata_flg,blabels_flg;
  PetscTruth tdata_flg,btdata_flg,btlabels_flg;
  FILE *data_file;
  FILE *tdata_file;
  PetscViewer view;

  info=PetscInitialize(&argc,&args,(char *)0,help);CHKERRQ(info);

  PetscMPIInt size,rank;
  info=MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(info);
  info=MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(info);

  // Read command line arguments
  info=PetscOptionsGetString(PETSC_NULL,"-in",data_path,PETSC_MAX_PATH_LEN-1,&data_flg);CHKERRQ(info);
  info=PetscOptionsGetString(PETSC_NULL,"-data",bdata_path,PETSC_MAX_PATH_LEN-1,&bdata_flg);CHKERRQ(info);
  info=PetscOptionsGetString(PETSC_NULL,"-labels",blabels_path,PETSC_MAX_PATH_LEN-1,&blabels_flg);CHKERRQ(info);
  if (!data_flg) SETERRQ(PETSC_ERR_SUP,"No libsvm input file specified!");
  if (!bdata_flg||!blabels_flg) SETERRQ(PETSC_ERR_SUP,"No output files specified!");

  // Read more command line arguments
  info=PetscOptionsGetString(PETSC_NULL,"-tin",tdata_path,PETSC_MAX_PATH_LEN-1,&tdata_flg);CHKERRQ(info);
  if (!tdata_flg){
    info=PetscPrintf(PETSC_COMM_WORLD,"No libsvm test file specified!\n");CHKERRQ(info);
  }else{
    info=PetscOptionsGetString(PETSC_NULL,"-tdata",btdata_path,PETSC_MAX_PATH_LEN-1,&btdata_flg);CHKERRQ(info);
    info=PetscOptionsGetString(PETSC_NULL,"-tlabels",btlabels_path,PETSC_MAX_PATH_LEN-1,&btlabels_flg);CHKERRQ(info);
    // Check the test output flags here, not the training ones
    if (!btdata_flg||!btlabels_flg) SETERRQ(PETSC_ERR_SUP,"No output test files specified!");
  }

  // Should we add an extra dimension?
  user.flg=PETSC_FALSE;
  info=PetscOptionsGetTruth(PETSC_NULL,"-add_bias",&user.flg,PETSC_NULL);CHKERRQ(info);
  tuser.flg=user.flg;

  info=PetscPrintf(PETSC_COMM_WORLD,"\n Reading libsvm train file at %s\n",data_path);CHKERRQ(info);
  info=PetscFOpen(PETSC_COMM_SELF,data_path,"r",&data_file);CHKERRQ(info);

  user.nnz_array=NULL;
  if (rank==0){
    info=parse_file(data_file,&user);CHKERRQ(info);
    rewind(data_file);  // Set file pointer to beginning of file
  }

  tuser.nnz_array=NULL;
  if (tdata_flg){
    info=PetscPrintf(PETSC_COMM_WORLD,"\n Reading libsvm test file at %s\n",tdata_path);CHKERRQ(info);
    info=PetscFOpen(PETSC_COMM_SELF,tdata_path,"r",&tdata_file);CHKERRQ(info);

    if (rank==0){
      info=parse_file(tdata_file,&tuser);CHKERRQ(info);
      rewind(tdata_file);  // Set file pointer to beginning of file
      // Train and test must share a column count: use the larger of the two
      if (user.dim>tuser.dim) tuser.dim=user.dim;
      else user.dim=tuser.dim;
    }
    info=broadcast_ctx(&tuser,rank);CHKERRQ(info);
  }
  info=broadcast_ctx(&user,rank);CHKERRQ(info);

  if (size==1){
    info=fill_arrays(data_file,&user);CHKERRQ(info);
  }else{
    info=fill_arrays_parallel(data_file,&user);CHKERRQ(info);
  }
  free(user.nnz_array);
  user.nnz_array=NULL;
  info=PetscFClose(PETSC_COMM_SELF,data_file);CHKERRQ(info);

  // Write the data and labels in Petsc binary file
  info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing data in binary format to %s \n",bdata_path);CHKERRQ(info);
  info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,bdata_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
  info=MatView(user.data,view);CHKERRQ(info);
  info=PetscViewerDestroy(view);CHKERRQ(info);
  info=MatDestroy(user.data);CHKERRQ(info);

  info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing labels in binary format to %s \n",blabels_path);CHKERRQ(info);
  info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,blabels_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
  info=VecView(user.labels,view);CHKERRQ(info);
  info=PetscViewerDestroy(view);CHKERRQ(info);
  info=VecDestroy(user.labels);CHKERRQ(info);

  if (tdata_flg){
    if (size==1){
      info=fill_arrays(tdata_file,&tuser);CHKERRQ(info);
    }else{
      info=fill_arrays_parallel(tdata_file,&tuser);CHKERRQ(info);
    }
    free(tuser.nnz_array);
    tuser.nnz_array=NULL;
    info=PetscFClose(PETSC_COMM_SELF,tdata_file);CHKERRQ(info);

    // Write the test data and labels in Petsc binary file
    info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing test data in binary format to %s \n",btdata_path);CHKERRQ(info);
    info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,btdata_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
    info=MatView(tuser.data,view);CHKERRQ(info);
    info=PetscViewerDestroy(view);CHKERRQ(info);
    info=MatDestroy(tuser.data);CHKERRQ(info);

    info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing test labels in binary format to %s \n",btlabels_path);CHKERRQ(info);
    info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,btlabels_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
    info=VecView(tuser.labels,view);CHKERRQ(info);
    info=PetscViewerDestroy(view);CHKERRQ(info);
    info=VecDestroy(tuser.labels);CHKERRQ(info);
  }

  info=PetscFinalize();CHKERRQ(info);
  return 0;
}