Index: src/lib/getput.m4
===================================================================
--- src/lib/getput.m4	(revision 1343)
+++ src/lib/getput.m4	(working copy)
@@ -697,7 +697,55 @@
 dnl string is not yet supported
 
 
+/* Build a single MPI datatype that describes 'nbytes' contiguous bytes of
+ * memory, for requests whose byte count does not fit in the 'int' count
+ * argument of the MPI-IO calls (a single read or write of 2 GiB or more).
+ *
+ * input:   'nbytes': number of bytes (MPI_BYTE elements) to describe
+ * returns: a committed datatype spanning all 'nbytes' bytes, meant to be
+ *          used with a count of 1; it is not freed here, so the caller has to
+ *          MPI_Type_free() it.  MPI_DATATYPE_NULL when MPI_Aint is no wider
+ *          than 'int'.
+ */
+static MPI_Datatype make_largexfer_type(MPI_Offset nbytes)
+{
+    int typechunk_size = 1024*1024; /* in bytes: TODO: figure out how big a
+                                       chunk is really needed */
+    int chunk_count;
+    int remainder=0;
+    MPI_Datatype memtype, chunktype;
+
+    /* chunks of 1 MiB plus an additional remainder.  Does require 8 byte
+     * MPI_Aint, which should have been checked for earlier */
+    if (sizeof(MPI_Aint) <= sizeof(int)) {
+        return MPI_DATATYPE_NULL;
+    }
+
+    chunk_count = nbytes/typechunk_size;
+    remainder = nbytes % typechunk_size;
+    MPI_Type_contiguous(typechunk_size, MPI_BYTE, &chunktype);
+
+    if (remainder == 0) {
+        /* no tail bytes: the type is used with a count of 1, so it must
+         * span every chunk -- the bare chunk type would move only 1 MiB */
+        MPI_Type_contiguous(chunk_count, chunktype, &memtype);
+    } else {
+        /* struct type: some number of chunks plus remaining bytes tacked
+         * on at end */
+        int lens[] = {chunk_count, remainder};
+        MPI_Aint disp[] = {0, (MPI_Aint) typechunk_size * (MPI_Aint)chunk_count};
+        MPI_Datatype types[] = {chunktype, MPI_BYTE};
+
+        /* MPI_Type_struct is deprecated; this is its MPI-2 replacement */
+        MPI_Type_create_struct(2, lens, disp, types, &memtype);
+    }
+    MPI_Type_commit(&memtype);
+    MPI_Type_free(&chunktype); /* memtype no longer depends on it */
+    return memtype;
+}
+
+
 
 /* for write case, buf needs to swapped back if swapped previously */
 #define FINAL_CLEAN_UP { \
     if (is_buf_swapped) /* byte-swap back to buf's original contents */ \
@@ -746,11 +794,11 @@
 {
     void *xbuf=NULL, *cbuf=NULL;
     int el_size, buftype_is_contig, mpireturn, need_swap=0, is_buf_swapped=0;
-    int isderived;
+    int isderived, type_count;
     int warning, err, status; /* err is for API abort and status is not */
     MPI_Offset fnelems=1, bnelems, nbytes, offset=0;
     MPI_Status mpistatus;
-    MPI_Datatype ptype, filetype=MPI_BYTE;
+    MPI_Datatype ptype, filetype=MPI_BYTE, memtype;
     MPI_File fh;
 
     /* "API error" will abort this API call, but not the entire program */
@@ -780,8 +828,10 @@
     err = NCMPII_ECHAR(varp->type, ptype);
     if (err != NC_NOERR) goto err_check;
 
-    CHECK_NELEMS(varp, fnelems, count, bnelems, bufcount, nbytes, err)
-    if (nbytes != (int)nbytes) {
+    CHECK_NELEMS(varp, fnelems, count, bnelems, bufcount, nbytes, err);
+    /* nelems could be > 32 bits.  MPI can deal with this (with some help) as
+     * long as MPI_Aint is 8 bytes */
+    if ( (nbytes != (int)nbytes) && (sizeof(MPI_Aint) <= sizeof(int))) {
         err = NC_EINTOVERFLOW;
         if (io_method == INDEP_IO) return err;
         goto err_check;
@@ -920,9 +970,17 @@
     if (filetype != MPI_BYTE)
         MPI_Type_free(&filetype);
 
+    if (nbytes != (int)nbytes) {
+        memtype = make_largexfer_type(nbytes);
+        type_count = 1;
+    } else {
+        memtype = MPI_BYTE;
+        type_count = nbytes;
+    }
+
     if (rw_flag == WRITE_REQ) {
         if (io_method == COLL_IO) {
-            mpireturn = MPI_File_write_all(fh, xbuf, nbytes, MPI_BYTE, &mpistatus);
+            mpireturn = MPI_File_write_all(fh, xbuf, type_count, memtype, &mpistatus);
             if (mpireturn != MPI_SUCCESS) {
                 ncmpii_handle_error(mpireturn, "MPI_File_write_all");
                 /* return the first encountered error if there is any */
@@ -930,7 +988,7 @@
             }
         }
         else { /* io_method == INDEP_IO */
-            mpireturn = MPI_File_write(fh, xbuf, nbytes, MPI_BYTE, &mpistatus);
+            mpireturn = MPI_File_write(fh, xbuf, type_count, memtype, &mpistatus);
             if (mpireturn != MPI_SUCCESS) {
                 ncmpii_handle_error(mpireturn, "MPI_File_write");
                 /* return the first encountered error if there is any */
@@ -943,7 +1001,7 @@
     }
     else { /* rw_flag == READ_REQ */
         if (io_method == COLL_IO) {
-            mpireturn = MPI_File_read_all(fh, xbuf, nbytes, MPI_BYTE, &mpistatus);
+            mpireturn = MPI_File_read_all(fh, xbuf, type_count, memtype, &mpistatus);
             if (mpireturn != MPI_SUCCESS) {
                 ncmpii_handle_error(mpireturn, "MPI_File_read_all");
                 /* return the first encountered error if there is any */
@@ -951,15 +1009,20 @@
             }
         }
         else { /* io_method == INDEP_IO */
-            mpireturn = MPI_File_read(fh, xbuf, nbytes, MPI_BYTE, &mpistatus);
+            mpireturn = MPI_File_read(fh, xbuf, type_count, memtype, &mpistatus);
             if (mpireturn != MPI_SUCCESS) {
                 ncmpii_handle_error(mpireturn, "MPI_File_read");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = NC_EREAD;
             }
         }
+#ifdef HAVE_MPI_GET_COUNT_X
+        MPI_Count get_size;
+        MPI_Get_count_x(&mpistatus, memtype, &get_size);
+#else
         int get_size;
-        MPI_Get_count(&mpistatus, MPI_BYTE, &get_size);
+        MPI_Get_count(&mpistatus, memtype, &get_size);
+#endif
         ncp->nciop->get_size += get_size;
     }
 