#include "petscmat.h"
// NOTE(review): the names of the four system headers that followed
// petscmat.h were missing from the source text. The list below is
// inferred from the library calls used in this file (<limits.h> is
// needed for INT_MAX in readline) -- confirm against version control.
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char help[]="Reads in a matrix in LibSVM format and dumps them into Petsc binary files \n"
  "Input parameters are:\n"
  "  -in : input file in libsvm format\n"
  "  -tin : input test file in libsvm format (optional)\n"
  "  -data : output file which contains data in Petsc binary format\n"
  "  -labels : output file which contains labels in Petsc binary format\n"
  "  -tdata : output test file which contains data in Petsc binary format (optional)\n"
  "  -tlabels : output test file which contains labels in Petsc binary format (optional)\n\n";

// State shared between the counting pass (parse_file) and the filling
// pass (fill_arrays / fill_arrays_parallel) for one input file.
typedef struct{
  PetscInt dim;    // Dimensions (largest column index seen + 1, + 1 more if flg)
  PetscInt m;      // Number of data points
  Vec labels;      // Store labels
  Mat data;        // Store data

  // These variables are used only for intermediate storage
  int *nnz_array;  // Number of stored entries in each row (length m)

  // Add an extra dimension?
  PetscTruth flg;
} AppCtx;

// Reads one complete line from `input`, growing the buffer as needed.
//
// line:    heap buffer (malloc/calloc/realloc family) of *max_len bytes.
// max_len: in: current size of `line`; out: size of the returned
//          buffer, or 0 when NULL is returned.
// input:   stream to read from.
//
// Return: pointer to the buffer holding the line (it may differ from
// `line` because the buffer can be reallocated), or NULL at end of
// file / read error / allocation failure. Whenever NULL is returned
// the buffer has already been freed, so the caller must not free or
// reuse the pointer it passed in.
char* readline(char* line,int* max_len,FILE *input){
  // First try to read a line from the input file stream. max_len
  // represents our best guess for the maximum length of the line.
  if (fgets(line,*max_len,input)==NULL){
    *max_len=0;
    free(line);
    return NULL;
  }

  // No '\n' in the buffer means fgets stopped because the buffer was
  // full (or the last line of the file has no terminator).
  while (strrchr(line,'\n')==NULL){
    int read_len=(int) strlen(line);

    // Double the buffer size until it is large enough to hold the
    // entire input. Keep the old pointer until realloc has succeeded
    // so that it can still be released on failure.
    char* grown=NULL;
    if (*max_len<=INT_MAX/2){
      *max_len*=2;
      grown=(char *) realloc(line,(size_t) *max_len*sizeof(char));
    }
    if (grown==NULL){
      free(line);
      *max_len=0;
      return NULL;
    }
    line=grown;

    // Read the rest of the line into the reallocated buffer. A NULL
    // here means end of file was hit in the middle of the last line;
    // what has been read so far is still returned.
    if (fgets(line+read_len,*max_len-read_len,input)==NULL) break;
  }
  return line;
}

// First pass over a libsvm file: counts the rows, the entries in each
// row and the largest column index.
//
// input: stream positioned at the start of the file; it is left at end
//        of file (the caller rewinds it).
// ptr:   AppCtx*. Reads ->flg; writes ->m, ->dim and ->nnz_array (a
//        malloc'ed array of ->m ints which the caller must free).
void parse_file(FILE* input,void* ptr){
  AppCtx *user=(AppCtx *) ptr;

  int max_len=1024;
  char* line=(char *) calloc(max_len,sizeof(char));
  assert(line!=NULL);

  int m_guess=128;
  user->nnz_array=(int *) calloc(m_guess,sizeof(int));
  assert(user->nnz_array!=NULL);

  user->dim=-1;
  user->m=0;

  while ((line=readline(line,&max_len,input))!=NULL){
    // Read and ignore the label
    strtok(line," \t");

    int nnz=0;
    // Now process the rest of the line: idx:val pairs
    while (1){
      char* pidx=strtok(NULL,":");
      char* pval=strtok(NULL," \t");
      if (pval==NULL) break;
      int idx=(int) strtol(pidx,NULL,10);
      if (idx > user->dim) user->dim=idx;
      nnz++;
    }

    if (user->m >= m_guess){
      m_guess*=2;
      int* grown=(int *) realloc(user->nnz_array,m_guess*sizeof(int));
      assert(grown!=NULL);
      user->nnz_array=grown;
    }
    user->nnz_array[user->m]=nnz;
    // Add one extra dimension if needed
    if (user->flg){
      user->nnz_array[user->m]++;
    }
    user->m++;
  }

  // adjust for last index
  user->dim+=1;
  // Add one extra dimension if needed
  if (user->flg){
    user->dim+=1;
  }

  // Adjust to final size of the array. Skipped for an empty file:
  // realloc(p,0) may free p and return NULL. A failed shrink keeps the
  // (larger, still valid) buffer.
  if (user->m>0){
    int* shrunk=(int *) realloc(user->nnz_array,user->m*sizeof(int));
    if (shrunk!=NULL) user->nnz_array=shrunk;
  }

  // readline has already released the line buffer when it returned NULL.
  return;
}

// Tokenises one libsvm line in place and stores it as global row `m`:
// the label goes into user->labels and the idx:val pairs (plus the
// constant-one column when user->flg is set) into user->data.
// Returns 0 or a PETSc error code.
// NOTE(review): `int`/`double` are passed where PETSc expects
// PetscInt/PetscScalar, as in the original code -- this assumes a
// default (32-bit index, real double) PETSc build.
static int insert_row(AppCtx* user,char* line,int m){
  int info;

  // Read the label. strtod keeps real-valued labels intact; a line
  // with no tokens at all gets label 0.
  char* plabel=strtok(line," \t");
  double label=(plabel!=NULL) ? strtod(plabel,NULL) : 0.0;
  info=VecSetValues(user->labels,1,&m,&label,ADD_VALUES);CHKERRQ(info);

  // Row length recorded by parse_file. At least one slot is requested
  // so that calloc cannot legitimately return NULL.
  int expected=user->nnz_array[m];
  double* vals=(double *) calloc(expected>0 ? expected : 1,sizeof(double));
  int* idxs=(int *) calloc(expected>0 ? expected : 1,sizeof(int));
  assert(vals!=NULL && idxs!=NULL);

  // process the rest of the line
  int nnz=0;
  while (1){
    char* pidx=strtok(NULL,":");
    char* pval=strtok(NULL," \t");
    if (pval==NULL) break;
    // Guards the writes below if the file changed between passes
    assert(nnz<expected);
    idxs[nnz]=(int) strtol(pidx,NULL,10);
    vals[nnz]=strtod(pval,NULL);
    nnz++;
  }

  // Add an extra dimension and set it to one
  if (user->flg){
    assert(nnz<expected);
    vals[nnz]=1.0;
    idxs[nnz]=user->dim-1;
    nnz++;
  }
  assert(expected==nnz);

  info=MatSetValues(user->data,1,&m,expected,idxs,vals,INSERT_VALUES);
  // Release the scratch arrays before the error check can return.
  free(vals);
  free(idxs);
  CHKERRQ(info);
  return 0;
}

// Second pass, single process: creates user->labels and user->data
// (MATSEQAIJ, preallocated from user->nnz_array) and fills them from
// `input`, which must be positioned at the start of the file.
//
// ptr: AppCtx* already populated by parse_file.
// Return: 0 or a PETSc error code.
int fill_arrays(FILE* input,void *ptr){
  AppCtx *user=(AppCtx *) ptr;
  int info;

  // Allocate space for the labels
  info=VecCreate(PETSC_COMM_WORLD,&user->labels);CHKERRQ(info);
  info=VecSetSizes(user->labels,PETSC_DECIDE,user->m);CHKERRQ(info);
  info=VecSetFromOptions(user->labels);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->labels,"Labels");CHKERRQ(info);

  // Allocate space for the data
  info=MatCreate(PETSC_COMM_WORLD,&user->data);CHKERRQ(info);
  info=MatSetSizes(user->data,PETSC_DECIDE,PETSC_DECIDE,user->m,user->dim);CHKERRQ(info);
  info=MatSetFromOptions(user->data);CHKERRQ(info);
  info=MatSetType(user->data,MATSEQAIJ);CHKERRQ(info);
  info=MatSeqAIJSetPreallocation(user->data,0,user->nnz_array);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->data,"Data");CHKERRQ(info);

  int max_len=1024;
  char* line=(char *) calloc(max_len,sizeof(char));
  assert(line != NULL);

  int m=0;
  while ((line=readline(line,&max_len,input)) != NULL){
    // nnz_array only has user->m entries
    assert(m<user->m);
    info=insert_row(user,line,m);CHKERRQ(info);
    m++;
  }

  info=VecAssemblyBegin(user->labels);CHKERRQ(info);
  info=VecAssemblyEnd(user->labels);CHKERRQ(info);

  info=MatAssemblyBegin(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);
  info=MatAssemblyEnd(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);

  assert(user->m==m);

  // readline has already released the line buffer when it returned NULL.
  return 0;
}

// Counts, for rows [begin,end) of the file, how many entries have a
// column index inside [begin,end) (diag_nnz) and outside it
// (offdiag_nnz). Both output arrays have `locm` entries; `nnz` holds
// the per-row totals recorded by parse_file, indexed by global row.
//
// NOTE(review): large parts of this function were missing from the
// source text (the loop that initialises the counters, the test that
// skips rows outside [begin,end), everything after the `idx>=begin &&`
// comparison, and any use of `nnz`). Those parts are reconstructed
// from the signature, the call site and the matching loop in
// fill_arrays_parallel -- confirm against version control.
void parse_file_parallel(FILE* input,int begin,int end,int locm,int* diag_nnz,int* offdiag_nnz,int* nnz){
  int max_len=1024;
  char* line=(char *) calloc(max_len,sizeof(char));
  assert(line!=NULL);

  int m=0;   // global row number of the line just read
  int ii=0;  // local row number, m-begin

  for (int i=0; i<locm; i++){
    diag_nnz[i]=0;
    offdiag_nnz[i]=0;
  }

  while ((line=readline(line,&max_len,input))!=NULL){
    // skip the lines for which this processor is not responsible
    if (m<begin){
      m++;
      continue;
    }
    if (m>=end) break;

    // Read and ignore the label
    strtok(line," \t");

    // Now process the rest of the line
    int seen=0;
    while (1){
      char* pidx=strtok(NULL,":");
      char* pval=strtok(NULL," \t");
      if (pval==NULL) break;
      int idx=(int) strtol(pidx,NULL,10);
      if (idx>=begin && idx<end) diag_nnz[ii]++;
      else offdiag_nnz[ii]++;
      seen++;
    }

    // NOTE(review): reconstructed. Slots that parse_file counted but
    // that do not appear in the text (the optional constant-one
    // column) are reserved in the off-diagonal count so that the row
    // is not under-allocated.
    if (nnz[m]>seen) offdiag_nnz[ii]+=nnz[m]-seen;

    m++;
    ii++;
  }

  // NULL (already released by readline) if end of file was reached.
  free(line);
  return;
}

// Second pass, several processes: every rank reads the whole file but
// stores only its own contiguous slice of rows. Creates user->labels
// (VECMPI) and user->data (MATMPIAIJ) and fills them from `input`,
// which must be positioned at the start of the file.
//
// ptr: AppCtx* whose m, dim, flg and nnz_array are valid on this rank.
// Return: 0 or a PETSc error code.
//
// NOTE(review): the head of this function (signature, the rank/size
// queries, the start of the `locm` computation) was missing from the
// source text and is reconstructed from the call in main and from the
// variables the surviving body uses.
int fill_arrays_parallel(FILE* input,void *ptr){
  AppCtx *user=(AppCtx *) ptr;
  int info;

  PetscMPIInt size,rank;
  info=MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(info);
  info=MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(info);

  // Rows are split evenly; the last rank also takes the remainder.
  int locm=user->m/size;
  int begin=rank*locm;
  int end=begin+locm;
  if (rank==size-1){
    end=user->m;
    locm=user->m-begin;
  }

  // Heap arrays instead of variable-length arrays: locm is 0 on some
  // ranks when there are fewer rows than processes.
  int* diag_nnz=(int *) calloc(locm>0 ? locm : 1,sizeof(int));
  int* offdiag_nnz=(int *) calloc(locm>0 ? locm : 1,sizeof(int));
  assert(diag_nnz!=NULL && offdiag_nnz!=NULL);

  // NOTE(review): the original had a shortcut `if (user->dim<...)`
  // that copied user->nnz_array into diag_nnz and zeroed offdiag_nnz
  // without scanning the file. Its condition was lost from the source
  // text, so the file is always scanned here; this only affects the
  // preallocation hints, not the values stored in the matrix.
  parse_file_parallel(input,begin,end,locm,diag_nnz,offdiag_nnz,user->nnz_array);
  rewind(input);

  // Allocate space for the labels
  info=VecCreate(PETSC_COMM_WORLD,&user->labels);CHKERRQ(info);
  info=VecSetSizes(user->labels,locm,user->m);CHKERRQ(info);
  info=VecSetFromOptions(user->labels);CHKERRQ(info);
  info=VecSetType(user->labels,VECMPI);CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->labels,"Labels");CHKERRQ(info);

  // Allocate space for the data
  // NOTE(review): the local column count is set to the global count on
  // every rank, as in the original -- confirm this is intended.
  info=MatCreate(PETSC_COMM_WORLD,&user->data);CHKERRQ(info);
  info=MatSetSizes(user->data,locm,user->dim,user->m,user->dim);CHKERRQ(info);
  info=MatSetFromOptions(user->data);CHKERRQ(info);
  info=MatSetType(user->data,MATMPIAIJ);CHKERRQ(info);
  info=MatMPIAIJSetPreallocation(user->data,0,diag_nnz,0,offdiag_nnz);
  // The counts are no longer needed; release them before the error
  // check can return.
  free(diag_nnz);
  free(offdiag_nnz);
  CHKERRQ(info);
  info=PetscObjectSetName((PetscObject) user->data,"Data");CHKERRQ(info);

  // The buffer must come from the malloc family because readline
  // grows it with realloc and releases it with free.
  int max_len=1024;
  char* line=(char *) calloc(max_len,sizeof(char));
  assert(line!=NULL);

  int m=0;   // global row number of the line just read
  int ii=0;  // number of rows stored by this rank
  while ((line=readline(line,&max_len,input)) != NULL){
    // skip the lines for which this processor is not responsible
    if (m<begin){
      m++;
      continue;
    }
    if (m>=end) break;

    info=insert_row(user,line,m);CHKERRQ(info);
    m++;
    ii++;
  }
  // Every rank must have stored exactly its share of the rows.
  assert(ii==locm);

  info=VecAssemblyBegin(user->labels);CHKERRQ(info);
  info=VecAssemblyEnd(user->labels);CHKERRQ(info);

  info=MatAssemblyBegin(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);
  info=MatAssemblyEnd(user->data,MAT_FINAL_ASSEMBLY);CHKERRQ(info);

  // NULL (already released by readline) if end of file was reached,
  // still allocated if the loop stopped at row `end`.
  free(line);
  return 0;
}

#undef __FUNCT__
#define __FUNCT__ "main"
// Converts a libsvm training file (and optionally a test file) into
// PETSc binary matrix/vector files. Rank 0 does the counting pass and
// broadcasts the sizes; every rank then reads the file and stores its
// own rows.
int main(int argc,char **args)
{
  AppCtx user;
  AppCtx tuser;

  PetscErrorCode info;

  char data_path[PETSC_MAX_PATH_LEN],bdata_path[PETSC_MAX_PATH_LEN],blabels_path[PETSC_MAX_PATH_LEN];
  char tdata_path[PETSC_MAX_PATH_LEN],btdata_path[PETSC_MAX_PATH_LEN],btlabels_path[PETSC_MAX_PATH_LEN];
  PetscTruth data_flg,bdata_flg,blabels_flg;
  PetscTruth tdata_flg,btdata_flg,btlabels_flg;

  FILE *data_file;
  FILE *tdata_file=NULL;

  PetscViewer view;

  PetscInitialize(&argc,&args,(char *)0,help);

  PetscMPIInt size,rank;
  info=MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(info);
  info=MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(info);

  // Read command line arguments
  info=PetscOptionsGetString(PETSC_NULL,"-in",data_path,PETSC_MAX_PATH_LEN-1,&data_flg);CHKERRQ(info);
  info=PetscOptionsGetString(PETSC_NULL,"-data",bdata_path,PETSC_MAX_PATH_LEN-1,&bdata_flg);CHKERRQ(info);
  info=PetscOptionsGetString(PETSC_NULL,"-labels",blabels_path,PETSC_MAX_PATH_LEN-1,&blabels_flg);CHKERRQ(info);

  if (!data_flg) SETERRQ(PETSC_ERR_SUP,"No libsvm input file specified!");
  if (!bdata_flg||!blabels_flg) SETERRQ(PETSC_ERR_SUP,"No output files specified!");

  // Read more command line arguments
  info=PetscOptionsGetString(PETSC_NULL,"-tin",tdata_path,PETSC_MAX_PATH_LEN-1,&tdata_flg);CHKERRQ(info);
  if (!tdata_flg){
    info=PetscPrintf(PETSC_COMM_WORLD,"No libsvm test file specified!\n");CHKERRQ(info);
  }else{
    info=PetscOptionsGetString(PETSC_NULL,"-tdata",btdata_path,PETSC_MAX_PATH_LEN-1,&btdata_flg);CHKERRQ(info);
    info=PetscOptionsGetString(PETSC_NULL,"-tlabels",btlabels_path,PETSC_MAX_PATH_LEN-1,&btlabels_flg);CHKERRQ(info);
    // Check the flags of the *test* outputs (the train flags were
    // already validated above).
    if (!btdata_flg||!btlabels_flg) SETERRQ(PETSC_ERR_SUP,"No output test files specified!");
  }

  // Should we add an extra dimension?
  user.flg=PETSC_FALSE;
  info=PetscOptionsGetTruth(PETSC_NULL,"-add_bias",&user.flg,PETSC_NULL);CHKERRQ(info);
  tuser.flg=user.flg;

  info=PetscPrintf(PETSC_COMM_WORLD,"\n Reading libsvm train file at %s\n",data_path);CHKERRQ(info);
  info=PetscFOpen(PETSC_COMM_SELF,data_path,"r",&data_file);CHKERRQ(info);
  user.nnz_array=NULL;
  if (rank==0){
    parse_file(data_file,&user);
    rewind(data_file);  // Set file pointer to beginning of file
  }

  tuser.nnz_array=NULL;
  if (tdata_flg){
    info=PetscPrintf(PETSC_COMM_WORLD,"\n Reading libsvm test file at %s\n",tdata_path);CHKERRQ(info);
    info=PetscFOpen(PETSC_COMM_SELF,tdata_path,"r",&tdata_file);CHKERRQ(info);
    if (rank==0){
      parse_file(tdata_file,&tuser);
      rewind(tdata_file);  // Set file pointer to beginning of file
      // Choose the larger of the two dimensions
      if (user.dim > tuser.dim) tuser.dim=user.dim;
      else user.dim=tuser.dim;
    }
    // dim and m are PetscInt, so they travel as MPIU_INT; nnz_array
    // holds plain ints.
    info=MPI_Bcast(&tuser.dim,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
    info=MPI_Bcast(&tuser.m,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
    if (rank!=0) tuser.nnz_array=(int *) calloc(tuser.m,sizeof(int));
    info=MPI_Bcast(tuser.nnz_array,(int) tuser.m,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
    info=MPI_Bcast(&tuser.flg,1,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  }

  info=MPI_Bcast(&user.dim,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  info=MPI_Bcast(&user.m,1,MPIU_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  if (rank!=0) user.nnz_array=(int *) calloc(user.m,sizeof(int));
  info=MPI_Bcast(user.nnz_array,(int) user.m,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);
  info=MPI_Bcast(&user.flg,1,MPI_INT,0,PETSC_COMM_WORLD);CHKERRQ(info);

  if (size==1){
    info=fill_arrays(data_file,&user);CHKERRQ(info);
  }else{
    // All processors should finish reading the file
    MPI_Barrier(PETSC_COMM_WORLD);
    info=fill_arrays_parallel(data_file,&user);CHKERRQ(info);
  }
  free(user.nnz_array);
  user.nnz_array=NULL;
  PetscFClose(PETSC_COMM_SELF,data_file);

  // Write the data and labels in Petsc binary file
  info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing data in binary format to %s \n",bdata_path);CHKERRQ(info);
  info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,bdata_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
  info=MatView(user.data,view);CHKERRQ(info);
  info=PetscViewerDestroy(view);CHKERRQ(info);
  info=MatDestroy(user.data);CHKERRQ(info);

  info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing labels in binary format to %s \n",blabels_path);CHKERRQ(info);
  info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,blabels_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
  info=VecView(user.labels,view);CHKERRQ(info);
  info=PetscViewerDestroy(view);CHKERRQ(info);
  info=VecDestroy(user.labels);CHKERRQ(info);

  if (tdata_flg){
    if (size==1){
      info=fill_arrays(tdata_file,&tuser);CHKERRQ(info);
    }else{
      // All processors should finish writing the train file
      MPI_Barrier(PETSC_COMM_WORLD);
      info=fill_arrays_parallel(tdata_file,&tuser);CHKERRQ(info);
    }
    free(tuser.nnz_array);
    tuser.nnz_array=NULL;
    PetscFClose(PETSC_COMM_SELF,tdata_file);

    // Write the test data and labels in Petsc binary file
    info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing test data in binary format to %s \n",btdata_path);CHKERRQ(info);
    info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,btdata_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
    info=MatView(tuser.data,view);CHKERRQ(info);
    info=PetscViewerDestroy(view);CHKERRQ(info);
    info=MatDestroy(tuser.data);CHKERRQ(info);

    info=PetscPrintf(PETSC_COMM_WORLD,"\n Writing test labels in binary format to %s \n",btlabels_path);CHKERRQ(info);
    info=PetscViewerBinaryOpen(PETSC_COMM_WORLD,btlabels_path,FILE_MODE_WRITE,&view);CHKERRQ(info);
    info=VecView(tuser.labels,view);CHKERRQ(info);
    info=PetscViewerDestroy(view);CHKERRQ(info);
    info=VecDestroy(tuser.labels);CHKERRQ(info);
  }

  info=PetscFinalize();CHKERRQ(info);

  return 0;
}