[mpich-discuss] patch for ad_lustre_wrcoll.c

Rob Latham robl at mcs.anl.gov
Mon Sep 12 16:05:59 CDT 2011


On Wed, Aug 10, 2011 at 01:56:33PM -0600, Martin Pokorny wrote:
> I have modified ad_lustre_wrcoll.c to improve the performance of the
> ADIO Lustre code under workloads typical in my application.

Hi Martin.  Thanks for the patch.  I wanted to let you know that I am
going to look more closely at it, but I haven't had time yet.

==rob

> This
> patch should be correct in all cases, so I thought that I ought to
> submit my changes back to the code maintainers. The changes are
> relatively simple, and I have not detected any errors in the files
> generated using this code; nevertheless, further testing in other
> applications would be in order.
> 
> These changes improve performance by reducing the number of system
> 'write' calls in the ADIO Lustre collective write code, and perhaps
> also by keeping the writes ordered. This is especially effective in
> my application, in which the data are highly interleaved among the
> processes in the group calling the MPI-IO collective write
> functions.
> 
> It's difficult for me to get clean performance numbers with the
> system on which my application runs. However, by way of example,
> prior to these changes my application could not sustain 25 MB/s (on
> our 1 Gb Ethernet-connected, 4 OSS Lustre filesystem); with these
> changes 25 MB/s is no problem, and there are indications that 50
> MB/s or better might be possible.
> 

> --- ad_lustre_wrcoll.c.orig	2010-02-23 10:33:53.000000000 -0700
> +++ ad_lustre_wrcoll.c.new	2011-08-10 10:38:24.000000000 -0600
> @@ -52,7 +52,9 @@
>  					 int *curr_to_proc,
>  					 int *done_to_proc, int *hole,
>  					 int iter, MPI_Aint buftype_extent,
> -					 int *buf_idx, int *error_code);
> +					 int *buf_idx,
> +					 ADIO_Offset **srt_off, int **srt_len, int *srt_num,
> +					 int *error_code);
>  void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
>                        ADIO_Offset *srt_off, int *srt_len, int *start_pos,
>                        int nprocs, int nprocs_recv, int total_elements);
> @@ -317,6 +319,11 @@
>      MPI_Aint buftype_extent;
>      int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
>      int data_sieving = 0;
> +    ADIO_Offset *srt_off = NULL;
> +    int *srt_len = NULL;
> +    int srt_num = 0;
> +    ADIO_Offset block_offset;
> +    int block_len;
>  
>      *error_code = MPI_SUCCESS;	/* changed below if error */
>      /* only I/O errors are currently reported */
> @@ -513,7 +520,9 @@
>                                       buftype_is_contig, contig_access_count,
>                                       striping_info, others_req, send_buf_idx,
>                                       curr_to_proc, done_to_proc, &hole, m,
> -                                     buftype_extent, this_buf_idx, error_code);
> +                                  buftype_extent, this_buf_idx,
> +                                  &srt_off, &srt_len, &srt_num, error_code);
> +
>  	if (*error_code != MPI_SUCCESS)
>              goto over;
>  
> @@ -537,23 +546,42 @@
>                                       ADIO_EXPLICIT_OFFSET, off, &status,
>                                       error_code);
>                  } else {
> -                    for (i = 0; i < nprocs; i++) {
> -                        if (others_req[i].count) {
> -                            for (j = 0; j < others_req[i].count; j++) {
> -                                if (others_req[i].offsets[j] < off + real_size &&
> -                                    others_req[i].offsets[j] >= off) {
> +                    block_offset = -1;
> +                    block_len = 0;
> +                    for (i = 0; i < srt_num; ++i) {
> +                        if (srt_off[i] < off + real_size &&
> +                            srt_off[i] >= off) {
> +                            if (block_offset == -1) {
> +                                block_offset = srt_off[i];
> +                                block_len = srt_len[i];
> +                            } else {
> +                                if (srt_off[i] == block_offset + block_len) {
> +                                    block_len += srt_len[i];
> +                                } else {
>                                      ADIO_WriteContig(fd,
> -                                                     write_buf + others_req[i].offsets[j] - off,
> -                                                     others_req[i].lens[j],
> +                                                     write_buf + block_offset - off,
> +                                                     block_len,
>                                                       MPI_BYTE, ADIO_EXPLICIT_OFFSET,
> -                                                     others_req[i].offsets[j], &status,
> +                                                     block_offset, &status,
>                                                       error_code);
>  	                            if (*error_code != MPI_SUCCESS)
>  		                        goto over;
> +                                    block_offset = srt_off[i];
> +                                    block_len = srt_len[i];
>                                  }
>                              }
>                          }
>                      }
> +                    if (block_offset != -1) {
> +                        ADIO_WriteContig(fd,
> +                                         write_buf + block_offset - off,
> +                                         block_len,
> +                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
> +                                         block_offset, &status,
> +                                         error_code);
> +                        if (*error_code != MPI_SUCCESS)
> +                            goto over;
> +                    }
>                  }
>              }
>  	    if (*error_code != MPI_SUCCESS)
> @@ -562,6 +590,10 @@
>          iter_st_off += max_size;
>      }
>  over:
> +    if (srt_off)
> +        ADIOI_Free(srt_off);
> +    if (srt_len)
> +        ADIOI_Free(srt_len);
>      if (ntimes)
>  	ADIOI_Free(write_buf);
>      ADIOI_Free(recv_curr_offlen_ptr);
> @@ -598,15 +630,16 @@
>  					 int *curr_to_proc, int *done_to_proc,
>                                           int *hole, int iter,
>                                           MPI_Aint buftype_extent,
> -					 int *buf_idx, int *error_code)
> +					 int *buf_idx,
> +                          ADIO_Offset **srt_off, int **srt_len, int *srt_num,
> +                          int *error_code)
>  {
>      int i, j, nprocs_recv, nprocs_send, err;
>      char **send_buf = NULL;
>      MPI_Request *requests, *send_req;
>      MPI_Datatype *recv_types;
>      MPI_Status *statuses, status;
> -    int *srt_len, sum, sum_recv;
> -    ADIO_Offset *srt_off;
> +    int sum_recv;
>      int data_sieving = *hole;
>      static char myname[] = "ADIOI_W_EXCHANGE_DATA";
>  
> @@ -638,20 +671,26 @@
>       * For this, merge the (sorted) offset lists others_req using a heap-merge.
>       */
>  
> -    sum = 0;
> +    *srt_num = 0;
>      for (i = 0; i < nprocs; i++)
> -	sum += count[i];
> -    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
> -    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
> +        *srt_num += count[i];
> +    if (*srt_off)
> +        *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
> +    else
> +        *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
> +    if (*srt_len)
> +        *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
> +    else
> +        *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
>      /* +1 to avoid a 0-size malloc */
>  
> -    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
> -		     nprocs, nprocs_recv, sum);
> +    ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
> +		     nprocs, nprocs_recv, *srt_num);
>  
>      /* check if there are any holes */
>      *hole = 0;
> -    for (i = 0; i < sum - 1; i++) {
> -        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
> +    for (i = 0; i < *srt_num - 1; i++) {
> +        if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
>              *hole = 1;
>  	    break;
>  	}
> @@ -681,14 +720,10 @@
>                                                 MPI_ERR_IO,
>                                                 "**ioRMWrdwr", 0);
>              ADIOI_Free(recv_types);
> -            ADIOI_Free(srt_off);
> -            ADIOI_Free(srt_len);
>              return;
>          }
>          // --END ERROR HANDLING--
>      }
> -    ADIOI_Free(srt_off);
> -    ADIOI_Free(srt_len);
>  
>      nprocs_send = 0;
>      for (i = 0; i < nprocs; i++)


-- 
Rob Latham
Mathematics and Computer Science Division
Argonne National Lab, IL USA


More information about the mpich-discuss mailing list