diff -ruNp romio-orig/adio/common/ad_read_coll.c romio/adio/common/ad_read_coll.c --- romio-orig/adio/common/ad_read_coll.c 2007-07-13 08:59:55.000454903 -0400 +++ romio/adio/common/ad_read_coll.c 2007-07-13 09:29:53.000527998 -0400 @@ -133,6 +133,8 @@ void ADIOI_GEN_ReadStridedColl(ADIO_File if (fd->hints->cb_read == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) { + int filerange_is_contig = 0; + /* don't do aggregation */ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); @@ -143,8 +145,13 @@ void ADIOI_GEN_ReadStridedColl(ADIO_File fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, file_ptr_type, + datatype, count, &filerange_is_contig); + + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { - if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, diff -ruNp romio-orig/adio/common/ad_read_coll.c.bak romio/adio/common/ad_read_coll.c.bak --- romio-orig/adio/common/ad_read_coll.c.bak 1969-12-31 19:00:00.000000000 -0500 +++ romio/adio/common/ad_read_coll.c.bak 2007-07-13 09:29:53.000142088 -0400 @@ -0,0 +1,1032 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * + * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "adio.h" +#include "adio_extern.h" +#ifdef PROFILE +#include "mpe.h" +#endif + +/* prototypes of functions used for collective reads only. */ +static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype + datatype, int nprocs, + int myrank, ADIOI_Access + *others_req, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, + ADIO_Offset + min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *buf_idx, int *error_code); +static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node + *flat_buf, ADIO_Offset *offset_list, int + *len_list, int *send_size, int *recv_size, + int *count, int *start_pos, + int *partial_send, + int *recd_from_proc, int nprocs, + int myrank, int + buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, + ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, + int iter, + MPI_Aint buftype_extent, int *buf_idx); +static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node + *flat_buf, char **recv_buf, ADIO_Offset + *offset_list, int *len_list, + int *recv_size, + MPI_Request *requests, MPI_Status *statuses, + int *recd_from_proc, int nprocs, + int contig_access_count, + ADIO_Offset min_st_offset, + ADIO_Offset fd_size, ADIO_Offset *fd_start, + ADIO_Offset *fd_end, + MPI_Aint buftype_extent); + + +void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code) +{ +/* Uses a generalized version of the extended two-phase method described + in "An Extended Two-Phase Method for Accessing Sections of + Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, + Scientific Programming, (5)4:301--317, Winter 1996. + http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ + + ADIOI_Access *my_req; + /* array of nprocs structures, one for each other process in + whose file domain this process's request lies */ + + ADIOI_Access *others_req; + /* array of nprocs structures, one for each other process + whose request lies in this process's file domain. */ + + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; + int contig_access_count=0, interleave_count = 0, buftype_is_contig; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; + ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off; + ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, + *fd_end = NULL, *end_offsets = NULL; + int *len_list = NULL, *buf_idx = NULL; + +#ifdef HAVE_STATUS_SET_BYTES + int bufsize, size; +#endif + +#ifdef PROFILE + MPE_Log_event(13, 0, "start computation"); +#endif + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* number of aggregators, cb_nodes, is stored in the hints */ + nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* only check for interleaving if cb_read isn't disabled */ + if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ + + /* Note: end_offset points to the last byte-offset that will be accessed. + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ + + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, + &offset_list, &len_list, &start_offset, + &end_offset, &contig_access_count); + + /* for (i=0; icomm); + MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, + ADIO_OFFSET, fd->comm); + + /* are the accesses of different processes interleaved? */ + for (i=1; ihints->cb_read == ADIOI_HINT_DISABLE + || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) + { + int filerange_is_contig = 0; + + /* don't do aggregation */ + if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { + ADIOI_Free(offset_list); + ADIOI_Free(len_list); + ADIOI_Free(st_offsets); + ADIOI_Free(end_offsets); + } + + fd->fp_ind = orig_fp; + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, file_ptr_type, + datatype, count, &filerange_is_contig); + + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { + + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { + off = fd->disp + (fd->etype_size) * offset; + ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, + off, status, error_code); + } + else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, + 0, status, error_code); + } + else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type, + offset, status, error_code); + + return; + } + + /* We're going to perform aggregation of I/O. Here we call + * ADIOI_Calc_file_domains() to determine what processes will handle I/O + * to what regions. We pass nprocs_for_coll into this function; it is + * used to determine how many processes will perform I/O, which is also + * the number of regions into which the range of bytes must be divided. + * These regions are called "file domains", or FDs. + * + * When this function returns, fd_start, fd_end, fd_size, and + * min_st_offset will be filled in. fd_start holds the starting byte + * location for each file domain. fd_end holds the ending byte location. + * min_st_offset holds the minimum byte location that will be accessed. + * + * Both fd_start[] and fd_end[] are indexed by an aggregator number; this + * needs to be mapped to an actual rank in the communicator later. + * + */ + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, + nprocs_for_coll, &min_st_offset, + &fd_start, &fd_end, &fd_size); + + /* calculate where the portions of the access requests of this process + * are located in terms of the file domains. this could be on the same + * process or on other processes. this function fills in: + * count_my_req_procs - number of processes (including this one) for which + * this process has requests in their file domain + * count_my_req_per_proc - count of requests for each process, indexed + * by rank of the process + * my_req[] - array of data structures describing the requests to be + * performed by each process (including self). indexed by rank. + * buf_idx[] - array of locations into which data can be directly moved; + * this is only valid for contiguous buffer case + */ + ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, + min_st_offset, fd_start, fd_end, fd_size, + nprocs, &count_my_req_procs, + &count_my_req_per_proc, &my_req, + &buf_idx); + + /* perform a collective communication in order to distribute the + * data calculated above. fills in the following: + * count_others_req_procs - number of processes (including this + * one) which have requests in this process's file domain. + * count_others_req_per_proc[] - number of separate contiguous + * requests from proc i lie in this process's file domain. + */ + ADIOI_Calc_others_req(fd, count_my_req_procs, + count_my_req_per_proc, my_req, + nprocs, myrank, &count_others_req_procs, + &others_req); + + /* my_req[] and count_my_req_per_proc aren't needed at this point, so + * let's free the memory + */ + ADIOI_Free(count_my_req_per_proc); + for (i=0; ifp_sys_posn = -1; /* set it to null. */ +} + +void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype + datatype, int file_ptr_type, ADIO_Offset + offset, ADIO_Offset **offset_list_ptr, int + **len_list_ptr, ADIO_Offset *start_offset_ptr, + ADIO_Offset *end_offset_ptr, int + *contig_access_count_ptr) +{ + int filetype_size, buftype_size, etype_size; + int i, j, k, frd_size=0, old_frd_size=0, st_index=0; + int n_filetypes, etype_in_filetype; + ADIO_Offset abs_off_in_filetype=0; + int bufsize, sum, n_etypes_in_filetype, size_in_filetype; + int contig_access_count, *len_list, flag, filetype_is_contig; + MPI_Aint filetype_extent, filetype_lb; + ADIOI_Flatlist_node *flat_file; + ADIO_Offset *offset_list, off, end_offset=0, disp; + +/* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ + + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + + MPI_Type_size(fd->filetype, &filetype_size); + MPI_Type_extent(fd->filetype, &filetype_extent); + MPI_Type_lb(fd->filetype, &filetype_lb); + MPI_Type_size(datatype, &buftype_size); + etype_size = fd->etype_size; + + if ( ! filetype_size ) { + *contig_access_count_ptr = 0; + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); + *len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int)); + /* 2 is for consistency. everywhere I malloc one more than needed */ + + offset_list = *offset_list_ptr; + len_list = *len_list_ptr; + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : + fd->disp + etype_size * offset; + len_list[0] = 0; + *start_offset_ptr = offset_list[0]; + *end_offset_ptr = offset_list[0] + len_list[0] - 1; + + return; + } + + if (filetype_is_contig) { + *contig_access_count_ptr = 1; + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); + *len_list_ptr = (int *) ADIOI_Malloc(2*sizeof(int)); + /* 2 is for consistency. everywhere I malloc one more than needed */ + + offset_list = *offset_list_ptr; + len_list = *len_list_ptr; + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : + fd->disp + etype_size * offset; + len_list[0] = bufcount * buftype_size; + *start_offset_ptr = offset_list[0]; + *end_offset_ptr = offset_list[0] + len_list[0] - 1; + + /* update file pointer */ + if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1; + } + + else { + + /* First calculate what size of offset_list and len_list to allocate */ + + /* filetype already flattened in ADIO_Open or ADIO_Fcntl */ + flat_file = ADIOI_Flatlist; + while (flat_file->type != fd->filetype) flat_file = flat_file->next; + disp = fd->disp; + + if (file_ptr_type == ADIO_INDIVIDUAL) { + offset = fd->fp_ind; /* in bytes */ + n_filetypes = -1; + flag = 0; + while (!flag) { + n_filetypes++; + for (i=0; icount; i++) { + if (disp + flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + + flat_file->blocklens[i] >= offset) + { + st_index = i; + frd_size = (int) (disp + flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + + flat_file->blocklens[i] - offset); + flag = 1; + break; + } + } + } + } + else { + n_etypes_in_filetype = filetype_size/etype_size; + n_filetypes = (int) (offset / n_etypes_in_filetype); + etype_in_filetype = (int) (offset % n_etypes_in_filetype); + size_in_filetype = etype_in_filetype * etype_size; + + sum = 0; + for (i=0; icount; i++) { + sum += flat_file->blocklens[i]; + if (sum > size_in_filetype) { + st_index = i; + frd_size = sum - size_in_filetype; + abs_off_in_filetype = flat_file->indices[i] + + size_in_filetype - (sum - flat_file->blocklens[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + + abs_off_in_filetype; + } + + /* calculate how much space to allocate for offset_list, len_list */ + + old_frd_size = frd_size; + contig_access_count = i = 0; + j = st_index; + bufsize = buftype_size * bufcount; + frd_size = ADIOI_MIN(frd_size, bufsize); + while (i < bufsize) { + if (frd_size) contig_access_count++; + i += frd_size; + j = (j + 1) % flat_file->count; + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); + } + + /* allocate space for offset_list and len_list */ + + *offset_list_ptr = (ADIO_Offset *) + ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset)); + *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count+1)*sizeof(int)); + /* +1 to avoid a 0-size malloc */ + + offset_list = *offset_list_ptr; + len_list = *len_list_ptr; + + /* find start offset, end offset, and fill in offset_list and len_list */ + + *start_offset_ptr = offset; /* calculated above */ + + i = k = 0; + j = st_index; + off = offset; + frd_size = ADIOI_MIN(old_frd_size, bufsize); + while (i < bufsize) { + if (frd_size) { + offset_list[k] = off; + len_list[k] = frd_size; + k++; + } + i += frd_size; + end_offset = off + frd_size - 1; + + /* Note: end_offset points to the last byte-offset that will be accessed. + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ + + if (off + frd_size < disp + flat_file->indices[j] + + flat_file->blocklens[j] + + (ADIO_Offset) n_filetypes*filetype_extent) + { + off += frd_size; + /* did not reach end of contiguous block in filetype. + * no more I/O needed. off is incremented by frd_size. + */ + } + else { + if (j < (flat_file->count - 1)) j++; + else { + /* hit end of flattened filetype; + * start at beginning again + */ + j = 0; + n_filetypes++; + } + off = disp + flat_file->indices[j] + + (ADIO_Offset) n_filetypes*filetype_extent; + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); + } + } + + /* update file pointer */ + if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; + + *contig_access_count_ptr = contig_access_count; + *end_offset_ptr = end_offset; + } +} + +static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype + datatype, int nprocs, + int myrank, ADIOI_Access + *others_req, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, ADIO_Offset + min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *buf_idx, int *error_code) +{ +/* Read in sizes of no more than coll_bufsize, an info parameter. + Send data to appropriate processes. + Place recd. data in user buf. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were read all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to read a distributed + array from a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + int i, j, m, size, ntimes, max_ntimes, buftype_is_contig; + ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off; + char *read_buf = NULL, *tmp_buf; + int *curr_offlen_ptr, *count, *send_size, *recv_size; + int *partial_send, *recd_from_proc, *start_pos, for_next_iter; + int real_size, req_len, flag, for_curr_iter, rank; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf=NULL; + MPI_Aint buftype_extent; + int coll_bufsize; + + *error_code = MPI_SUCCESS; /* changed below if error */ + /* only I/O errors are currently reported */ + +/* calculate the number of reads of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. + coll_bufsize is obtained from the hints object. */ + + coll_bufsize = fd->hints->cb_buffer_size; + + /* grab some initial values for st_loc and end_loc */ + for (i=0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + /* now find the real values */ + for (i=0; i < nprocs; i++) + for (j=0; jcomm); + + if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize); + + curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* its use is explained below. calloc initializes to 0. */ + + count = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + /* to store count of how many off-len pairs per proc are satisfied + in an iteration. */ + + partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* if only a portion of the last off-len pair is sent to a process + in a particular iteration, the length sent is stored here. + calloc initializes to 0. */ + + send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be sent to each proc. in an iteration */ + + recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + /* total size of data to be recd. from each proc. in an iteration. + Of size nprocs so that I can use MPI_Alltoall later. */ + + recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* amount of data recd. so far from each proc. Used in + ADIOI_Fill_user_buffer. initialized to 0 here. */ + + start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* used to store the starting value of curr_offlen_ptr[i] in + this iteration */ + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + if (!buftype_is_contig) { + ADIOI_Flatten_datatype(datatype); + flat_buf = ADIOI_Flatlist; + while (flat_buf->type != datatype) flat_buf = flat_buf->next; + } + MPI_Type_extent(datatype, &buftype_extent); + + done = 0; + off = st_loc; + for_curr_iter = for_next_iter = 0; + + MPI_Comm_rank(fd->comm, &rank); + +#ifdef PROFILE + MPE_Log_event(14, 0, "end computation"); +#endif + + for (m=0; mcomm); + + nprocs_recv = 0; + for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++; + + nprocs_send = 0; + for (i=0; icomm, requests+j); + j++; + buf_idx[i] += recv_size[i]; + } + } + else { +/* allocate memory for recv_buf and post receives */ + recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*)); + for (i=0; i < nprocs; i++) + if (recv_size[i]) recv_buf[i] = + (char *) ADIOI_Malloc(recv_size[i]); + + j = 0; + for (i=0; i < nprocs; i++) + if (recv_size[i]) { + MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, + myrank+i+100*iter, fd->comm, requests+j); + j++; + /* FPRINTF(stderr, "node %d, recv_size %d, tag %d \n", + myrank, recv_size[i], myrank+i+100*iter); */ + } + } + +/* create derived datatypes and send data */ + + j = 0; + for (i=0; icomm, requests+nprocs_recv+j); + MPI_Type_free(&send_type); + if (partial_send[i]) others_req[i].lens[k] = tmp; + j++; + } + } + + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \ + sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + + /* wait on the receives */ + if (nprocs_recv) { +#ifdef NEEDS_MPI_TEST + j = 0; + while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses); +#else + MPI_Waitall(nprocs_recv, requests, statuses); +#endif + + /* if noncontiguous, to the copies from the recv buffers */ + if (!buftype_is_contig) + ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf, + offset_list, len_list, recv_size, + requests, statuses, recd_from_proc, + nprocs, contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, + buftype_extent); + } + + /* wait on the sends*/ + MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv); + + ADIOI_Free(statuses); + ADIOI_Free(requests); + + if (!buftype_is_contig) { + for (i=0; i < nprocs; i++) + if (recv_size[i]) ADIOI_Free(recv_buf[i]); + ADIOI_Free(recv_buf); + } +} + + +#define ADIOI_BUF_INCR \ +{ \ + while (buf_incr) { \ + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = flat_buf->indices[flat_buf_idx] + \ + n_buftypes*buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + buf_incr -= size_in_buf; \ + } \ +} + + +#define ADIOI_BUF_COPY \ +{ \ + while (size) { \ + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ + memcpy(((char *) buf) + user_buf_idx, \ + &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ + recv_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = flat_buf->indices[flat_buf_idx] + \ + n_buftypes*buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + } \ + ADIOI_BUF_INCR \ +} + + +static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node + *flat_buf, char **recv_buf, ADIO_Offset + *offset_list, int *len_list, + int *recv_size, + MPI_Request *requests, MPI_Status *statuses, + int *recd_from_proc, int nprocs, + int contig_access_count, + ADIO_Offset min_st_offset, + ADIO_Offset fd_size, ADIO_Offset *fd_start, + ADIO_Offset *fd_end, + MPI_Aint buftype_extent) +{ +/* this function is only called if buftype is not contig */ + + int i, p, flat_buf_idx, size, buf_incr; + int flat_buf_sz, size_in_buf, n_buftypes; + ADIO_Offset off, len, rem_len, user_buf_idx; + int *curr_from_proc, *done_from_proc, *recv_buf_idx; + + ADIOI_UNREFERENCED_ARG(requests); + ADIOI_UNREFERENCED_ARG(statuses); + +/* curr_from_proc[p] = amount of data recd from proc. p that has already + been accounted for so far + done_from_proc[p] = amount of data already recd from proc. p and + filled into user buffer in previous iterations + user_buf_idx = current location in user buffer + recv_buf_idx[p] = current location in recv_buf of proc. p */ + curr_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + done_from_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + recv_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + + for (i=0; i < nprocs; i++) { + recv_buf_idx[i] = curr_from_proc[i] = 0; + done_from_proc[i] = recd_from_proc[i]; + } + + user_buf_idx = flat_buf->indices[0]; + flat_buf_idx = 0; + n_buftypes = 0; + flat_buf_sz = flat_buf->blocklens[0]; + + /* flat_buf_idx = current index into flattened buftype + flat_buf_sz = size of current contiguous component in + flattened buf */ + + for (i=0; i done_from_proc[p]) { + if (done_from_proc[p] > curr_from_proc[p]) { + size = (int)ADIOI_MIN(curr_from_proc[p] + len - + done_from_proc[p], recv_size[p]-recv_buf_idx[p]); + buf_incr = done_from_proc[p] - curr_from_proc[p]; + ADIOI_BUF_INCR + buf_incr = (int)(curr_from_proc[p]+len-done_from_proc[p]); + curr_from_proc[p] = done_from_proc[p] + size; + ADIOI_BUF_COPY + } + else { + size = (int)ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]); + buf_incr = (int)len; + curr_from_proc[p] += size; + ADIOI_BUF_COPY + } + } + else { + curr_from_proc[p] += (int)len; + buf_incr = (int)len; + ADIOI_BUF_INCR + } + } + else { + buf_incr = (int)len; + ADIOI_BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (i=0; i < nprocs; i++) + if (recv_size[i]) recd_from_proc[i] = curr_from_proc[i]; + + ADIOI_Free(curr_from_proc); + ADIOI_Free(done_from_proc); + ADIOI_Free(recv_buf_idx); +} diff -ruNp romio-orig/adio/common/ad_write_coll.c romio/adio/common/ad_write_coll.c --- romio-orig/adio/common/ad_write_coll.c 2007-07-13 08:59:55.000611711 -0400 +++ romio/adio/common/ad_write_coll.c 2007-07-13 09:30:07.000435793 -0400 @@ -129,6 +129,8 @@ void ADIOI_GEN_WriteStridedColl(ADIO_Fil if (fd->hints->cb_write == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) { + int filerange_is_contig = 0; + /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); @@ -139,8 +141,12 @@ void ADIOI_GEN_WriteStridedColl(ADIO_Fil fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, file_ptr_type, + datatype, count, &filerange_is_contig); - if (buftype_is_contig && filetype_is_contig) { + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, diff -ruNp romio-orig/adio/common/ad_write_coll.c.bak romio/adio/common/ad_write_coll.c.bak --- romio-orig/adio/common/ad_write_coll.c.bak 1969-12-31 19:00:00.000000000 -0500 +++ romio/adio/common/ad_write_coll.c.bak 2007-07-13 09:30:01.000919147 -0400 @@ -0,0 +1,1051 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * + * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "adio.h" +#include "adio_extern.h" +#ifdef PROFILE +#include "mpe.h" +#endif + +/* prototypes of functions used for collective writes only. */ +static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype + datatype, int nprocs, int myrank, ADIOI_Access + *others_req, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, ADIO_Offset + min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *buf_idx, int *error_code); +static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf, + ADIOI_Flatlist_node *flat_buf, ADIO_Offset + *offset_list, int *len_list, int *send_size, + int *recv_size, ADIO_Offset off, int size, + int *count, int *start_pos, int *partial_recv, + int *sent_to_proc, int nprocs, + int myrank, int + buftype_is_contig, int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + ADIOI_Access *others_req, + int *send_buf_idx, int *curr_to_proc, + int *done_to_proc, int *hole, int iter, + MPI_Aint buftype_extent, int *buf_idx, int *error_code); +static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node + *flat_buf, char **send_buf, ADIO_Offset + *offset_list, int *len_list, int *send_size, + MPI_Request *requests, int *sent_to_proc, + int nprocs, int myrank, + int contig_access_count, ADIO_Offset + min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *send_buf_idx, int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent); +static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements); + + +void ADIOI_GEN_WriteStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code) +{ +/* Uses a generalized version of the extended two-phase method described + in "An Extended Two-Phase Method for Accessing Sections of + Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, + Scientific Programming, (5)4:301--317, Winter 1996. + http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ + + ADIOI_Access *my_req; + /* array of nprocs access structures, one for each other process in + whose file domain this process's request lies */ + + ADIOI_Access *others_req; + /* array of nprocs access structures, one for each other process + whose request lies in this process's file domain. */ + + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; + int contig_access_count=0, interleave_count = 0, buftype_is_contig; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; + ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off; + ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, + *fd_end = NULL, *end_offsets = NULL; + int *buf_idx = NULL, *len_list = NULL; + int old_error, tmp_error; + + +#ifdef PROFILE + MPE_Log_event(13, 0, "start computation"); +#endif + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + +/* the number of processes that actually perform I/O, nprocs_for_coll, + * is stored in the hints off the ADIO_File structure + */ + nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* only check for interleaving if cb_write isn't disabled */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ + + /* Note: end_offset points to the last byte-offset that will be accessed. + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ + + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, + &offset_list, &len_list, &start_offset, + &end_offset, &contig_access_count); + + /* each process communicates its start and end offsets to other + processes. The result is an array each of start and end offsets stored + in order of process rank. */ + + st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); + end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset)); + + MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, + ADIO_OFFSET, fd->comm); + MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, + ADIO_OFFSET, fd->comm); + + /* are the accesses of different processes interleaved? */ + for (i=1; ihints->cb_write == ADIOI_HINT_DISABLE || + (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) + { + int filerange_is_contig; + + /* use independent accesses */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + ADIOI_Free(offset_list); + ADIOI_Free(len_list); + ADIOI_Free(st_offsets); + ADIOI_Free(end_offsets); + } + + fd->fp_ind = orig_fp; + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, file_ptr_type, + datatype, count, &filerange_is_contig); + + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { + off = fd->disp + (fd->etype_size) * offset; + ADIO_WriteContig(fd, buf, count, datatype, + ADIO_EXPLICIT_OFFSET, + off, status, error_code); + } + else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, + 0, status, error_code); + } + else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, + offset, status, error_code); + + return; + } + +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. */ + + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, + nprocs_for_coll, &min_st_offset, + &fd_start, &fd_end, &fd_size); + + +/* calculate what portions of the access requests of this process are + located in what file domains */ + + ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count, + min_st_offset, fd_start, fd_end, fd_size, + nprocs, &count_my_req_procs, + &count_my_req_per_proc, &my_req, + &buf_idx); + +/* based on everyone's my_req, calculate what requests of other + processes lie in this process's file domain. + count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + + ADIOI_Calc_others_req(fd, count_my_req_procs, + count_my_req_per_proc, my_req, + nprocs, myrank, + &count_others_req_procs, &others_req); + + ADIOI_Free(count_my_req_per_proc); + for (i=0; i < nprocs; i++) { + if (my_req[i].count) { + ADIOI_Free(my_req[i].offsets); + ADIOI_Free(my_req[i].lens); + } + } + ADIOI_Free(my_req); + +/* exchange data and write in sizes of no more than coll_bufsize. */ + ADIOI_Exch_and_write(fd, buf, datatype, nprocs, myrank, + others_req, offset_list, + len_list, contig_access_count, min_st_offset, + fd_size, fd_start, fd_end, buf_idx, error_code); + + /* If this collective write is followed by an independent write, + * it's possible to have those subsequent writes on other processes + * race ahead and sneak in before the read-modify-write completes. + * We carry out a collective communication at the end here so no one + * can start independent i/o before collective I/O completes. + * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information */ + + old_error = *error_code; + if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; + + /* optimization: if only one process performing i/o, we can perform + * a less-expensive Bcast */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(error_code, 1, MPI_INT, + fd->hints->ranklist[0], fd->comm); + else { + tmp_error = *error_code; + MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, + MPI_MAX, fd->comm); + } + + if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) ) + *error_code = old_error; + + + if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); + +/* free all memory allocated for collective I/O */ + + for (i=0; ifp_sys_posn = -1; /* set it to null. */ +} + + + +/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error + * code is created and returned in error_code. + */ +static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype + datatype, int nprocs, int myrank, + ADIOI_Access + *others_req, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, + ADIO_Offset + min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *buf_idx, int *error_code) +{ +/* Send data to appropriate processes and write in sizes of no more + than coll_bufsize. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were written all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to write a distributed + array to a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + /* TODO: 'hole' not used outside of ADIOI_W_Exchange_data */ + int hole, i, j, m, size=0, ntimes, max_ntimes, buftype_is_contig; + ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off; + char *write_buf=NULL; + int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size; + int *partial_recv, *sent_to_proc, *start_pos, flag; + int *send_buf_idx, *curr_to_proc, *done_to_proc; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf=NULL; + MPI_Aint buftype_extent; + int info_flag, coll_bufsize; + char *value; + static char myname[] = "ADIOI_EXCH_AND_WRITE"; + + *error_code = MPI_SUCCESS; /* changed below if error */ + /* only I/O errors are currently reported */ + +/* calculate the number of writes of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. */ + + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, + &info_flag); + coll_bufsize = atoi(value); + ADIOI_Free(value); + + + for (i=0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + for (i=0; i < nprocs; i++) + for (j=0; j < others_req[i].count; j++) { + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]); + end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + +/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/ + + ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize); + + if ((st_loc==-1) && (end_loc==-1)) { + ntimes = 0; /* this process does no writing. */ + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, + fd->comm); + + if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize); + + curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* its use is explained below. calloc initializes to 0. */ + + count = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* to store count of how many off-len pairs per proc are satisfied + in an iteration. */ + + partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* if only a portion of the last off-len pair is recd. from a process + in a particular iteration, the length recd. is stored here. + calloc initializes to 0. */ + + send_size = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* total size of data to be sent to each proc. in an iteration. + Of size nprocs so that I can use MPI_Alltoall later. */ + + recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* total size of data to be recd. from each proc. in an iteration.*/ + + sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + /* amount of data sent to each proc so far. Used in + ADIOI_Fill_send_buffer. initialized to 0 here. */ + + send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* Above three are used in ADIOI_Fill_send_buffer*/ + + start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int)); + /* used to store the starting value of curr_offlen_ptr[i] in + this iteration */ + + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + if (!buftype_is_contig) { + ADIOI_Flatten_datatype(datatype); + flat_buf = ADIOI_Flatlist; + while (flat_buf->type != datatype) flat_buf = flat_buf->next; + } + MPI_Type_extent(datatype, &buftype_extent); + + +/* I need to check if there are any outstanding nonblocking writes to + the file, which could potentially interfere with the writes taking + place in this collective write call. Since this is not likely to be + common, let me do the simplest thing possible here: Each process + completes all pending nonblocking operations before completing. */ + + ADIOI_Complete_async(error_code); + if (*error_code != MPI_SUCCESS) return; + MPI_Barrier(fd->comm); + + done = 0; + off = st_loc; + +#ifdef PROFILE + MPE_Log_event(14, 0, "end computation"); +#endif + + for (m=0; m < ntimes; m++) { + /* go through all others_req and check which will be satisfied + by the current write */ + + /* Note that MPI guarantees that displacements in filetypes are in + monotonically nondecreasing order and that, for writes, the + filetypes cannot specify overlapping regions in the file. This + simplifies implementation a bit compared to reads. */ + + /* off = start offset in the file for the data to be written in + this iteration + size = size of data written (bytes) corresponding to off + req_off = off in file for a particular contiguous request + minus what was satisfied in previous iteration + req_size = size corresponding to req_off */ + + /* first calculate what should be communicated */ + +#ifdef PROFILE + MPE_Log_event(13, 0, "start computation"); +#endif + for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0; + + size = (int) (ADIOI_MIN(coll_bufsize, end_loc-st_loc+1-done)); + + for (i=0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + for (j=curr_offlen_ptr[i]; jcomm); + + /* create derived datatypes for recv */ + + nprocs_recv = 0; + for (i=0; iatomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + requests = (MPI_Request *) + ADIOI_Malloc((nprocs_send+1)*sizeof(MPI_Request)); + send_req = requests; + } + else { + requests = (MPI_Request *) + ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); + /* +1 to avoid a 0-size malloc */ + + /* post receives */ + j = 0; + for (i=0; icomm, requests+j); + j++; + } + } + send_req = requests + nprocs_recv; + } + +/* post sends. if buftype_is_contig, data can be directly sent from + user buf at location given by buf_idx. else use send_buf. */ + + if (buftype_is_contig) { + j = 0; + for (i=0; i < nprocs; i++) + if (send_size[i]) { + MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], + MPI_BYTE, i, myrank+i+100*iter, fd->comm, + send_req+j); + j++; + buf_idx[i] += send_size[i]; + } + } + else if (nprocs_send) { + /* buftype is not contig */ + send_buf = (char **) ADIOI_Malloc(nprocs*sizeof(char*)); + for (i=0; i < nprocs; i++) + if (send_size[i]) + send_buf[i] = (char *) ADIOI_Malloc(send_size[i]); + + ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf, + offset_list, len_list, send_size, + send_req, + sent_to_proc, nprocs, myrank, + contig_access_count, + min_st_offset, fd_size, fd_start, fd_end, + send_buf_idx, curr_to_proc, done_to_proc, iter, + buftype_extent); + /* the send is done in ADIOI_Fill_send_buffer */ + } + + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + j = 0; + for (i=0; icomm, &wkl_status); + j++; + } + } + } + + for (i=0; iatomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+1) * \ + sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + } + else { + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \ + sizeof(MPI_Status)); + /* +1 to avoid a 0-size malloc */ + } + +#ifdef NEEDS_MPI_TEST + i = 0; + if (fd->atomicity) { + /* bug fix from Wei-keng Liao and Kenin Coloma */ + while (!i) MPI_Testall(nprocs_send, send_req, &i, statuses); + } + else { + while (!i) MPI_Testall(nprocs_send+nprocs_recv, requests, &i, statuses); + } +#else + if (fd->atomicity) + /* bug fix from Wei-keng Liao and Kenin Coloma */ + MPI_Waitall(nprocs_send, send_req, statuses); + else + MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses); +#endif + + ADIOI_Free(statuses); + ADIOI_Free(requests); + if (!buftype_is_contig && nprocs_send) { + for (i=0; i < nprocs; i++) + if (send_size[i]) ADIOI_Free(send_buf[i]); + ADIOI_Free(send_buf); + } +} + + +#define ADIOI_BUF_INCR \ +{ \ + while (buf_incr) { \ + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = flat_buf->indices[flat_buf_idx] + \ + n_buftypes*buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + buf_incr -= size_in_buf; \ + } \ +} + + +#define ADIOI_BUF_COPY \ +{ \ + while (size) { \ + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ + memcpy(&(send_buf[p][send_buf_idx[p]]), \ + ((char *) buf) + user_buf_idx, size_in_buf); \ + send_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + if (!flat_buf_sz) { \ + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ + else { \ + flat_buf_idx = 0; \ + n_buftypes++; \ + } \ + user_buf_idx = flat_buf->indices[flat_buf_idx] + \ + n_buftypes*buftype_extent; \ + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ + } \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + } \ + ADIOI_BUF_INCR \ +} + + + +static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node + *flat_buf, char **send_buf, ADIO_Offset + *offset_list, int *len_list, int *send_size, + MPI_Request *requests, int *sent_to_proc, + int nprocs, int myrank, + int contig_access_count, + ADIO_Offset min_st_offset, ADIO_Offset fd_size, + ADIO_Offset *fd_start, ADIO_Offset *fd_end, + int *send_buf_idx, int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent) +{ +/* this function is only called if buftype is not contig */ + + int i, p, flat_buf_idx, size; + int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes; + ADIO_Offset off, len, rem_len, user_buf_idx; + +/* curr_to_proc[p] = amount of data sent to proc. p that has already + been accounted for so far + done_to_proc[p] = amount of data already sent to proc. p in + previous iterations + user_buf_idx = current location in user buffer + send_buf_idx[p] = current location in send_buf of proc. p */ + + for (i=0; i < nprocs; i++) { + send_buf_idx[i] = curr_to_proc[i] = 0; + done_to_proc[i] = sent_to_proc[i]; + } + jj = 0; + + user_buf_idx = flat_buf->indices[0]; + flat_buf_idx = 0; + n_buftypes = 0; + flat_buf_sz = flat_buf->blocklens[0]; + + /* flat_buf_idx = current index into flattened buftype + flat_buf_sz = size of current contiguous component in + flattened buf */ + + for (i=0; i done_to_proc[p]) { + if (done_to_proc[p] > curr_to_proc[p]) { + size = (int)ADIOI_MIN(curr_to_proc[p] + len - + done_to_proc[p], send_size[p]-send_buf_idx[p]); + buf_incr = done_to_proc[p] - curr_to_proc[p]; + ADIOI_BUF_INCR + buf_incr = (int)(curr_to_proc[p] + len - done_to_proc[p]); + curr_to_proc[p] = done_to_proc[p] + size; + ADIOI_BUF_COPY + } + else { + size = (int)ADIOI_MIN(len,send_size[p]-send_buf_idx[p]); + buf_incr = (int)len; + curr_to_proc[p] += size; + ADIOI_BUF_COPY + } + if (send_buf_idx[p] == send_size[p]) { + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, + myrank+p+100*iter, fd->comm, requests+jj); + jj++; + } + } + else { + curr_to_proc[p] += (int)len; + buf_incr = (int)len; + ADIOI_BUF_INCR + } + } + else { + buf_incr = (int)len; + ADIOI_BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (i=0; i < nprocs; i++) + if (send_size[i]) sent_to_proc[i] = curr_to_proc[i]; +} + + + +static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements) +{ + typedef struct { + ADIO_Offset *off_list; + int *len_list; + int nelem; + } heap_struct; + + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + + a = (heap_struct *) ADIOI_Malloc((nprocs_recv+1)*sizeof(heap_struct)); + + j = 0; + for (i=0; i=0; i--) { + /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 + modified for a heap with smallest element at root. I have + removed the recursion so that there are no function calls. + Function calls are too expensive. */ + k = i; + for(;;) { + l = 2*(k+1) - 1; + r = 2*(k+1); + + if ((l < heapsize) && + (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else smallest = k; + + if ((r < heapsize) && + (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } + else break; + } + } + + for (i=0; itype != fd->filetype) flat_file = flat_file->next; + disp = fd->disp; + + MPI_Type_size(fd->filetype, &filetype_size); + MPI_Type_extent(fd->filetype, &filetype_extent); + etype_size = fd->etype_size; + + if (file_ptr_type == ADIO_INDIVIDUAL) { + offset = fd->fp_ind; /* in bytes */ + n_filetypes = -1; + flag = 0; + while (!flag) { + n_filetypes++; + for (i=0; icount; i++) { + if (disp + flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] + >= offset) { + st_index = i; + st_io_size = (int) (disp + flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + + flat_file->blocklens[i] - offset); + flag = 1; + break; + } + } + } + } else { + n_etypes_in_filetype = filetype_size/etype_size; + n_filetypes = (int) (offset / n_etypes_in_filetype); + etype_in_filetype = (int) (offset % n_etypes_in_filetype); + size_in_filetype = etype_in_filetype * etype_size; + + sum = 0; + for (i=0; icount; i++) { + sum += flat_file->blocklens[i]; + if (sum > size_in_filetype) { + st_index = i; + st_io_size = sum - size_in_filetype; + abs_off_in_filetype = flat_file->indices[i] + + size_in_filetype - (sum - flat_file->blocklens[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; + } + + *start_index = st_index; + *start_io_size = st_io_size; + *start_offset = offset; + *start_ftype = n_filetypes; +} + +void ADIOI_Filetype_range_iscontig(ADIO_File fd, ADIO_Offset offset, + int file_ptr_type, MPI_Datatype datatype, int count, int *flag) +{ + int srclen, datatype_size; + int st_index, st_ftype, st_offset, st_io_size; + + MPI_Type_size(datatype, &datatype_size); + srclen = datatype_size * count; + + ADIOI_Filetype_range_start(fd, offset, file_ptr_type, + &st_index, &st_ftype, &st_offset, &st_io_size); + *flag = st_io_size >= srclen ? 1 : 0; +} + diff -ruNp romio-orig/adio/include/adioi.h romio/adio/include/adioi.h --- romio-orig/adio/include/adioi.h 2007-07-13 08:59:56.000018889 -0400 +++ romio/adio/include/adioi.h 2007-07-13 09:02:25.000302029 -0400 @@ -304,6 +304,10 @@ void *ADIOI_Calloc_fn(size_t nelem, size void *ADIOI_Realloc_fn(void *ptr, size_t size, int lineno, char *fname); void ADIOI_Free_fn(void *ptr, int lineno, char *fname); void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag); +void ADIOI_Filetype_range_iscontig(ADIO_File fd, ADIO_Offset offset, + int file_ptr_type, MPI_Datatype datatype, int count, int *flag); +void ADIOI_Filetype_range_start(ADIO_File fd, ADIO_Offset offset, int file_ptr_type, + int *start_index, int *start_ftype, int *start_offset, int *start_io_size); void ADIOI_Get_position(ADIO_File fd, ADIO_Offset *offset); void ADIOI_Get_eof_offset(ADIO_File fd, ADIO_Offset *eof_offset); void ADIOI_Get_byte_offset(ADIO_File fd, ADIO_Offset offset,