[mpich-discuss] patch for ad_lustre_wrcoll.c
Rob Latham
robl at mcs.anl.gov
Mon Sep 12 16:05:59 CDT 2011
On Wed, Aug 10, 2011 at 01:56:33PM -0600, Martin Pokorny wrote:
> I have modified ad_lustre_wrcoll.c to improve the performance of the
> ADIO Lustre code under workloads typical in my application.
Hi Martin. Thanks for the patch. I wanted to let you know that I am
going to look more closely at it, but haven't had time yet.
==rob
> This
> patch should be correct in all cases, so I thought that I ought to
> submit my changes back to the code maintainers. The changes are
> relatively simple, and I have not detected any errors in the files
> generated using this code; nevertheless, further testing in other
> applications would be in order.
>
> These changes improve performance by reducing the number of system
> 'write' calls in the ADIO Lustre collective write code, and perhaps
> also by keeping the writes ordered. This is especially effective in
> my application, in which the data are highly interleaved among the
> processes in the group calling the MPI-IO collective write
> functions.
>
> It's difficult for me to get clean performance numbers with the
> system on which my application runs. However, by way of example,
> prior to these changes my application could not sustain 25 MB/s (on
> our 1 Gb/s Ethernet-connected, 4-OSS Lustre filesystem); with these
> changes 25 MB/s is no problem, and there are indications that 50
> MB/s or better might be possible.
>
> --- ad_lustre_wrcoll.c.orig 2010-02-23 10:33:53.000000000 -0700
> +++ ad_lustre_wrcoll.c.new 2011-08-10 10:38:24.000000000 -0600
> @@ -52,7 +52,9 @@
> int *curr_to_proc,
> int *done_to_proc, int *hole,
> int iter, MPI_Aint buftype_extent,
> - int *buf_idx, int *error_code);
> + int *buf_idx,
> + ADIO_Offset **srt_off, int **srt_len, int *srt_num,
> + int *error_code);
> void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
> ADIO_Offset *srt_off, int *srt_len, int *start_pos,
> int nprocs, int nprocs_recv, int total_elements);
> @@ -317,6 +319,11 @@
> MPI_Aint buftype_extent;
> int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
> int data_sieving = 0;
> + ADIO_Offset *srt_off = NULL;
> + int *srt_len = NULL;
> + int srt_num = 0;
> + ADIO_Offset block_offset;
> + int block_len;
>
> *error_code = MPI_SUCCESS; /* changed below if error */
> /* only I/O errors are currently reported */
> @@ -513,7 +520,9 @@
> buftype_is_contig, contig_access_count,
> striping_info, others_req, send_buf_idx,
> curr_to_proc, done_to_proc, &hole, m,
> - buftype_extent, this_buf_idx, error_code);
> + buftype_extent, this_buf_idx,
> + &srt_off, &srt_len, &srt_num, error_code);
> +
> if (*error_code != MPI_SUCCESS)
> goto over;
>
> @@ -537,23 +546,42 @@
> ADIO_EXPLICIT_OFFSET, off, &status,
> error_code);
> } else {
> - for (i = 0; i < nprocs; i++) {
> - if (others_req[i].count) {
> - for (j = 0; j < others_req[i].count; j++) {
> - if (others_req[i].offsets[j] < off + real_size &&
> - others_req[i].offsets[j] >= off) {
> + block_offset = -1;
> + block_len = 0;
> + for (i = 0; i < srt_num; ++i) {
> + if (srt_off[i] < off + real_size &&
> + srt_off[i] >= off) {
> + if (block_offset == -1) {
> + block_offset = srt_off[i];
> + block_len = srt_len[i];
> + } else {
> + if (srt_off[i] == block_offset + block_len) {
> + block_len += srt_len[i];
> + } else {
> ADIO_WriteContig(fd,
> - write_buf + others_req[i].offsets[j] - off,
> - others_req[i].lens[j],
> + write_buf + block_offset - off,
> + block_len,
> MPI_BYTE, ADIO_EXPLICIT_OFFSET,
> - others_req[i].offsets[j], &status,
> + block_offset, &status,
> error_code);
> if (*error_code != MPI_SUCCESS)
> goto over;
> + block_offset = srt_off[i];
> + block_len = srt_len[i];
> }
> }
> }
> }
> + if (block_offset != -1) {
> + ADIO_WriteContig(fd,
> + write_buf + block_offset - off,
> + block_len,
> + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
> + block_offset, &status,
> + error_code);
> + if (*error_code != MPI_SUCCESS)
> + goto over;
> + }
> }
> }
> if (*error_code != MPI_SUCCESS)
> @@ -562,6 +590,10 @@
> iter_st_off += max_size;
> }
> over:
> + if (srt_off)
> + ADIOI_Free(srt_off);
> + if (srt_len)
> + ADIOI_Free(srt_len);
> if (ntimes)
> ADIOI_Free(write_buf);
> ADIOI_Free(recv_curr_offlen_ptr);
> @@ -598,15 +630,16 @@
> int *curr_to_proc, int *done_to_proc,
> int *hole, int iter,
> MPI_Aint buftype_extent,
> - int *buf_idx, int *error_code)
> + int *buf_idx,
> + ADIO_Offset **srt_off, int **srt_len, int *srt_num,
> + int *error_code)
> {
> int i, j, nprocs_recv, nprocs_send, err;
> char **send_buf = NULL;
> MPI_Request *requests, *send_req;
> MPI_Datatype *recv_types;
> MPI_Status *statuses, status;
> - int *srt_len, sum, sum_recv;
> - ADIO_Offset *srt_off;
> + int sum_recv;
> int data_sieving = *hole;
> static char myname[] = "ADIOI_W_EXCHANGE_DATA";
>
> @@ -638,20 +671,26 @@
> * For this, merge the (sorted) offset lists others_req using a heap-merge.
> */
>
> - sum = 0;
> + *srt_num = 0;
> for (i = 0; i < nprocs; i++)
> - sum += count[i];
> - srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
> - srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
> + *srt_num += count[i];
> + if (*srt_off)
> + *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
> + else
> + *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
> + if (*srt_len)
> + *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
> + else
> + *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
> /* +1 to avoid a 0-size malloc */
>
> - ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
> - nprocs, nprocs_recv, sum);
> + ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
> + nprocs, nprocs_recv, *srt_num);
>
> /* check if there are any holes */
> *hole = 0;
> - for (i = 0; i < sum - 1; i++) {
> - if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
> + for (i = 0; i < *srt_num - 1; i++) {
> + if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
> *hole = 1;
> break;
> }
> @@ -681,14 +720,10 @@
> MPI_ERR_IO,
> "**ioRMWrdwr", 0);
> ADIOI_Free(recv_types);
> - ADIOI_Free(srt_off);
> - ADIOI_Free(srt_len);
> return;
> }
> // --END ERROR HANDLING--
> }
> - ADIOI_Free(srt_off);
> - ADIOI_Free(srt_len);
>
> nprocs_send = 0;
> for (i = 0; i < nprocs; i++)
--
Rob Latham
Mathematics and Computer Science Division
Argonne National Lab, IL USA
More information about the mpich-discuss mailing list