[mpich-discuss] Facing problem while using MPI_File_set_view

Christina Patrick christina.subscribes at gmail.com
Wed May 27 15:11:21 CDT 2009


Hi Everybody,

I tried it with three versions of MPICH2, and they all give me errors when
I use a PVFS file system. The stack traces for the different versions are
pasted below:

mpich2-1.0.8/mpich2-1.1rc1

3:  (gdb) bt
0:  #0  0x00000000 in ?? ()
1-3:  Couldn't get registers: No such process.
1-3:  (gdb) 0:  #1  0x0022c096 in gettimeofday () from /lib/libc.so.6
0:  #2  0x0810dd24 in job_testcontext (out_id_array_p=0xbfaf1e40,
0:      inout_count_p=0xbfaf2640, returned_user_ptr_array=0xbfaeea40,
0:      out_status_array_p=0xbfaeee40, timeout_ms=10, context_id=0)
0:      at src/io/job/job.c:4490
0:  #3  0x080c3fa7 in PINT_client_state_machine_test (op_id=20,
0:      error_code=0xbfaf26e8) at src/client/sysint/client-state-machine.c:639
0:  #4  0x080c4441 in PINT_client_wait_internal (op_id=20,
0:      in_op_str=0x8155101 "io", out_error=0xbfaf26e8,
0:      in_class_str=0x81523d4 "sys")
0:      at src/client/sysint/client-state-machine.c:798
0:  #5  0x080c46b5 in PVFS_sys_wait (op_id=20, in_op_str=0x8155101 "io",
0:      out_error=0xbfaf26e8) at src/client/sysint/client-state-machine.c:960
0:  #6  0x080cd7ff in PVFS_sys_io (ref=
0:        {handle = 2305843009213693944, fs_id = 1298105808, __pad1 = 0},
0:      file_req=0x82c7b70, file_req_offset=0, buffer=0xb77d8008,
0:      mem_req=0x82c7b20, credentials=0x82b0a90, resp_p=0xbfaf2890,
0:      io_type=PVFS_IO_READ, hints=0x0) at src/client/sysint/sys-io.sm:381
0:  #7  0x08098fd4 in ADIOI_PVFS2_ReadStrided (fd=0x82b1b70, buf=0xb77d8008,
0:      count=1048576, datatype=1275070475, file_ptr_type=101, offset=0,
0:      status=0xbfaf2bc0, error_code=0xbfaf2b28) at ad_pvfs2_read.c:527
0:  #8  0x080590f1 in ADIOI_GEN_ReadStridedColl (fd=0x82b1b70, buf=0xb77d8008,
0:      count=1048576, datatype=1275070475, file_ptr_type=101, offset=0,
0:      status=0xbfaf2bc0, error_code=0xbfaf2b28) at ad_read_coll.c:149
0:  #9  0x08054402 in MPIOI_File_read_all (mpi_fh=0x82b1b70, offset=0,
0:      file_ptr_type=101, buf=0xb77d8008, count=1048576, datatype=1275070475,
0:      myname=0x8169df8 "MPI_FILE_READ_ALL", status=0xbfaf2bc0) at read_all.c:106
0:  #10 0x080544fd in PMPI_File_read_all (mpi_fh=0x82b1b70, buf=0xb77d8008,
0:      count=1048576, datatype=1275070475, status=0xbfaf2bc0) at read_all.c:52
0:  #11 0x0804b5a2 in main (argc=1, argv=0xbfaf3144) at row.c:95



mpich2-1.0.8p1

1:  (gdb) 1:  (gdb) bt
0:  #0  0x00000000 in ?? ()
1-3:  Couldn't get registers: No such process.
1-3:  (gdb) 0:  #1  0x00271723 in readv () from /lib/libc.so.6
0:  #2  0x08127441 in BMI_sockio_nbvector (s=14, vector=0x8192640, count=1,
0:      recv_flag=1) at src/io/bmi/bmi_tcp/sockio.c:298
0:  #3  0x08126c7d in payload_progress (s=14, buffer_list=0x8a0046c,
0:      size_list=0x89f83c0, list_count=4, total_size=262144,
0:      list_index=0x8a00604, current_index_complete=0x8a00608,
0:      send_recv=BMI_RECV, enc_hdr=0x0, env_amt_complete=0x0)
0:      at src/io/bmi/bmi_tcp/bmi-tcp.c:3953
0:  #4  0x08125fd3 in work_on_recv_op (my_method_op=0x8a00598,
0:      stall_flag=0xbff49850) at src/io/bmi/bmi_tcp/bmi-tcp.c:3317
0:  #5  0x0812525d in tcp_do_work_recv (map=0x89e2368, stall_flag=0xbff49850)
0:      at src/io/bmi/bmi_tcp/bmi-tcp.c:2976
0:  #6  0x08124e65 in tcp_do_work (max_idle_time=10)
0:      at src/io/bmi/bmi_tcp/bmi-tcp.c:2791
0:  #7  0x081228d0 in BMI_tcp_testcontext (incount=5, out_id_array=0x81923e0,
0:      outcount=0xbff49d04, error_code_array=0x8192408,
0:      actual_size_array=0x8192420, user_ptr_array=0x8192448, max_idle_time=10,
0:      context_id=0) at src/io/bmi/bmi_tcp/bmi-tcp.c:1336
0:  #8  0x080fcc92 in BMI_testcontext (incount=5, out_id_array=0x81923e0,
0:      outcount=0x81922fc, error_code_array=0x8192408,
0:      actual_size_array=0x8192420, user_ptr_array=0x8192448,
0:      max_idle_time_ms=10, context_id=0) at src/io/bmi/bmi.c:1081
0:  #9  0x0810f1fe in bmi_thread_function (ptr=0x0) at src/io/job/thread-mgr.c:248
0:  #10 0x0810fb8c in PINT_thread_mgr_bmi_push (max_idle_time=10)
0:      at src/io/job/thread-mgr.c:847
0:  #11 0x0810e9cf in do_one_work_cycle_all (idle_time_ms=10)
0:      at src/io/job/job.c:5411
0:  #12 0x0810dc7b in job_testcontext (out_id_array_p=0xbff4d280,
0:      inout_count_p=0xbff4da80, returned_user_ptr_array=0xbff49e80,
0:      out_status_array_p=0xbff4a280, timeout_ms=10, context_id=0)
0:      at src/io/job/job.c:4452
0:  #13 0x080c3fa7 in PINT_client_state_machine_test (op_id=20,
0:      error_code=0xbff4db28) at src/client/sysint/client-state-machine.c:639
0:  #14 0x080c4441 in PINT_client_wait_internal (op_id=20,
0:      in_op_str=0x8155101 "io", out_error=0xbff4db28,
0:      in_class_str=0x81523d4 "sys")
0:      at src/client/sysint/client-state-machine.c:798
0:  #15 0x080c46b5 in PVFS_sys_wait (op_id=20, in_op_str=0x8155101 "io",
0:      out_error=0xbff4db28) at src/client/sysint/client-state-machine.c:960
0:  #16 0x080cd7ff in PVFS_sys_io (ref=
0:        {handle = 2305843009213693944, fs_id = 1298105808, __pad1 = 0},
0:      file_req=0x89e4b70, file_req_offset=0, buffer=0xb77bb008,
0:      mem_req=0x89e4b20, credentials=0x89cb930, resp_p=0xbff4dcd0,
0:      io_type=PVFS_IO_READ, hints=0x0) at src/client/sysint/sys-io.sm:381
0:  #17 0x08098fd4 in ADIOI_PVFS2_ReadStrided (fd=0x89cb698, buf=0xb77bb008,
0:      count=1048576, datatype=1275070475, file_ptr_type=101, offset=0,
0:      status=0xbff4e000, error_code=0xbff4df68) at ad_pvfs2_read.c:527
0:  #18 0x080590f1 in ADIOI_GEN_ReadStridedColl (fd=0x89cb698, buf=0xb77bb008,
0:      count=1048576, datatype=1275070475, file_ptr_type=101, offset=0,
0:      status=0xbff4e000, error_code=0xbff4df68) at ad_read_coll.c:149
0:  #19 0x08054402 in MPIOI_File_read_all (mpi_fh=0x89cb698, offset=0,
0:      file_ptr_type=101, buf=0xb77bb008, count=1048576, datatype=1275070475,
0:      myname=0x8169df8 "MPI_FILE_READ_ALL", status=0xbff4e000) at read_all.c:106
0:  #20 0x080544fd in PMPI_File_read_all (mpi_fh=0x89cb698, buf=0xb77bb008,
0:      count=1048576, datatype=1275070475, status=0xbff4e000) at read_all.c:52
0:  #21 0x0804b5a2 in main (argc=1, argv=0xbff4e584) at row.c:95


Thanks and Regards,
Christina.

On Wed, May 27, 2009 at 3:23 PM, Christina Patrick
<christina.subscribes at gmail.com> wrote:
> I will try it with the latest release, 1.1rc1, and let you know.
>
> Thanks,
> Christina.
>
> On Wed, May 27, 2009 at 3:04 PM, Christina Patrick
> <christina.subscribes at gmail.com> wrote:
>> I have a feeling that there is a bug in the MPI code when we use
>> MPI_Type_create_subarray() to create a row file view as specified below.
>> If you change the buffer size in my program below to something smaller
>> than the entire view, you will get an error. I also checked using the
>> MPI_Type_create_subarray() API to create column and block file views;
>> those work fine.
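>>
>> To be concrete, here is a minimal sketch of what I mean by a column file
>> view (assuming the same 16384 x 16384 array and reusing the variables
>> from the program further down; only the subsizes and starts differ from
>> the row case):
>>
>> array_size[0]    = ROWS;              /* 16384 rows               */
>> array_size[1]    = COLS;              /* 16384 cols               */
>> array_subsize[0] = ROWS;              /* every row ...            */
>> array_subsize[1] = COLS / nprocs;     /* ... but only my columns  */
>> array_start[0]   = 0;
>> array_start[1]   = (COLS / nprocs) * mynod;
>> MPI_Type_create_subarray(DIMS, array_size, array_subsize, array_start,
>>                          MPI_ORDER_C, MPI_DATATYPE, &subarray);
>> MPI_Type_commit(&subarray);
>> MPI_File_set_view(fhandle, 0, MPI_DATATYPE, subarray, "native",
>>                   MPI_INFO_NULL);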
>>
>> If somebody could please take a look at this problem, it would be
>> really helpful.
>>
>> Thanks and Regards,
>> Christina.
>>
>> On Mon, May 25, 2009 at 5:19 PM, Christina Patrick
>> <christina.subscribes at gmail.com> wrote:
>>> Hi Everybody,
>>>
>>> I am writing a program to read a 16384 x 16384 array of type double
>>> (~2 GB file size).
>>> I am using the file view as follows:
>>>
>>> ------------------------------
>>> |                             |    P0
>>> ------------------------------
>>> |                             |    P1
>>> ------------------------------
>>> |                             |    P2
>>> ------------------------------
>>> |                             |    P3
>>> ------------------------------
>>>
>>> I create the file view using MPI_Type_create_subarray() and read it
>>> with the collective I/O call MPI_File_read_all().
>>>
>>> When I read the entire file view of P0 (P1, P2, P3) into a 512 MB
>>> buffer in a single call, the program works fine.
>>> However, I do not want to use a buffer as big as 512 MB. When I use a
>>> smaller buffer (such as 8 MB) and iterate over the file view of P0 (P1,
>>> P2, P3), my program starts throwing errors or segmentation faults:
>>> [E 16:49:28.377861] Error: payload_progress: Bad address
>>> OR
>>> a segmentation fault in ADIOI_Calc_my_off_len() at the line
>>> while (flat_file->type != fd->filetype) flat_file = flat_file->next;
>>> because flat_file becomes 0x0 (NULL) in one of the processes.
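>>>
>>> For concreteness, with 4 processes as in the picture above: the file is
>>> 16384 x 16384 x 8 bytes = 2 GB, so each process's view is 512 MB (4096
>>> rows of 16384 doubles). With an 8 MB buffer, each MPI_File_read_all()
>>> call covers 1048576 doubles = 64 rows, so the loop does 4096 / 64 = 64
>>> iterations per process.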
>>>
>>> If I use a column view and logically do the same thing, I do not face
>>> this problem. (I hope that I have been able to explain the problem. In
>>> case of doubt, please let me know.)
>>>
>>> Could somebody please help me?
>>>
>>> Thanks and Regards,
>>> Christina.
>>>
>>> PS: I am pasting the program below:
>>>
>>> #include "mpi.h"
>>> #include <stdio.h>
>>> #include <string.h>
>>> #include <stdlib.h>
>>> #include <math.h>
>>> #include <errno.h>
>>>
>>> #define ROWS                (16384)
>>> #define COLS                (16384)
>>> #define MPI_DATATYPE        (MPI_DOUBLE)
>>> #define C_DATATYPE          double
>>> #define DIMS                (2)
>>> #define COLL_BUFSIZE        (536870912)
>>>
>>> int main(int argc, char **argv) {
>>>  char          fname[] = "pvfs2:/home/mdl/patrick/pvfs2/testfile";
>>>  int           i = 0, nprocs = 0, mynod = 0, provided = 0,
>>>                c_size = 0, mpi_size = 0, iterations = 0,
>>>                array_size[] = {0, 0}, array_subsize[] = {0, 0},
>>>                array_start[] = {0, 0};
>>>  long          rows = 0l, cols = 0l, coll_bufsize = 0l,
>>>                rows_view = 0l, cols_view = 0l, rows_collbuf = 0l,
>>>                cols_collbuf = 0l, elts_collbuf = 0l;
>>>  unsigned long filesize = 0l;
>>>  double        *buffer = NULL;
>>>  MPI_File      fhandle;
>>>  MPI_Status    status;
>>>  MPI_Datatype  subarray;
>>>
>>>  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
>>>  MPI_Comm_rank(MPI_COMM_WORLD, &mynod);
>>>  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
>>>
>>>  MPI_Type_size(MPI_DATATYPE, &mpi_size);
>>>  c_size     = sizeof(C_DATATYPE);
>>>  if(c_size != mpi_size) {
>>>    fprintf(stderr, "Datatypes in MPI and C do not match\n");
>>>    MPI_Abort(MPI_COMM_WORLD, EIO);
>>>  }
>>>
>>>  rows             = ROWS;
>>>  cols             = COLS;
>>>  coll_bufsize     = COLL_BUFSIZE;
>>>  elts_collbuf     = coll_bufsize / mpi_size;
>>>  rows_view        = rows / nprocs;
>>>  cols_view        = cols;
>>>  cols_collbuf     = cols_view;
>>>  rows_collbuf     = elts_collbuf / cols_collbuf;
>>>  filesize         = rows * cols * mpi_size;
>>>  array_size[0]    = rows;
>>>  array_size[1]    = cols;
>>>  array_subsize[0] = rows_view;
>>>  array_subsize[1] = cols_view;
>>>  array_start[0]   = rows_view * mynod;
>>>  array_start[1]   = 0;
>>>
>>>  buffer = (C_DATATYPE *)malloc(coll_bufsize);
>>>  if(!buffer) {
>>>    fprintf(stderr, "calloc error\n");
>>>    MPI_Abort(MPI_COMM_WORLD, ENOMEM);
>>>  }
>>>
>>>  MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_RDONLY, MPI_INFO_NULL,
>>>                &fhandle);
>>>
>>>  MPI_Type_create_subarray(DIMS, array_size, array_subsize, array_start,
>>>                           MPI_ORDER_C, MPI_DATATYPE, &subarray);
>>>  MPI_Type_commit(&subarray);
>>>  MPI_File_set_view(fhandle, 0, MPI_DATATYPE, subarray, "native",
>>>                    MPI_INFO_NULL);
>>>
>>>  iterations = rows_view / rows_collbuf;
>>>
>>>  for(i = 0; i < iterations; i++)
>>>    MPI_File_read_all(fhandle, buffer, elts_collbuf, MPI_DATATYPE, &status);
>>>
>>>  MPI_File_close(&fhandle);
>>>  free(buffer);
>>>  MPI_Finalize();
>>>
>>>  return 0;
>>> }
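>>>
>>> For reference, I build and run it along these lines (compiler wrapper,
>>> path, and process count are site-specific, of course):
>>>
>>> mpicc row.c -o row
>>> mpiexec -n 4 ./row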
>>>
>>
>

