[MPICH] mpich 1.2.7 hangs on MPID_P4_Init

Adam Zhang Adam.Zhang at Sun.com
Fri Jun 29 00:37:48 CDT 2007


Hi all,

I am using mpqc (version 2.3.1) with mpich 1.2.7p1. When I run one of the 
mpqc test examples, it hangs in MPID_P4_Init. I wonder if this is 
a problem with MPI sends and receives. Below are the stacks of the running processes:

Stack:
adam at bishop # pgrep mpqc
29125
29126
29130
29129
adam at bishop # ptree 29125
411   /usr/lib/ssh/sshd
  28740 /usr/lib/ssh/sshd
    28747 /usr/lib/ssh/sshd
      28749 -bash
        29064 /bin/sh /export/home/adam/local/bin/mpirun -np 2 bin/mpqc 
data/h2
          29125 /export/home/adam/amanda/bin/mpqc 
data/h2o_mp200sto3gc1.in -p4p
            29126 /export/home/adam/amanda/bin/mpqc 
data/h2o_mp200sto3gc1.in -p
            29127 rsh bishop -l adam -n 
/export/home/adam/amanda/bin/mpqc bisho
adam at bishop # ptree 29129
291   /usr/lib/inet/inetd start
  29128 /usr/sbin/in.rshd
    29129 /export/home/adam/amanda/bin/mpqc bishop 34021 -p4amslave 
-p4yourname
      29130 /export/home/adam/amanda/bin/mpqc bishop 34021 -p4amslave 
-p4yourna


adam at bishop # pstack 29126  |  /opt/SUNWspro/bin/c++filt
29126:  /export/home/adam/amanda/bin/mpqc data/h2o_mp200sto3gc1.in -p4pg 
/expo
 fda20a57 pollsys  (8044ee0, 2, 0, 0)
 fd9cee0a pselect  (9, 8044fc0, fda49868, fda49868, 0, 0) + 18e
 fd9cf100 select   (9, 8044fc0, 0, 0, 0) + 82
 0856a838 listener (8047b6c, 8728648, 8761218, fda48000, 8045084, 
fd9c3023) + 238
 0856370f create_bm_processes (87af588) + 50f
 085630f5 p4_startup (87af588) + 125
 08562fae p4_create_procgroup (8047b6c, 8728648, 8761218, 8761218, 
8045158, 8581fd6) + 8e
 0856fd4a MPID_P4_Init (8047b6c, 8047b70) + 4a
 0856efcf MPID_CH_InitMsgPass (8047b6c, 8047b70, 4000, 1f400) + ef
 0856c235 MPID_Init (8047b6c, 8047b70, 0, 8045260) + 1f5
 0854a120 MPIR_Init (8047b6c, 8047b70) + 130
 08549fd6 MPI_Init_thread (8047b6c, 8047b70, 1, 80452ec) + 26
 0851da81 void sc::MPIMessageGrp::init(int,int*,char***) (8761218, 5b, 
8047b6c, 8047b70) + 19d
 0851d61e sc::MPIMessageGrp::MPIMessageGrp #Nvariant 1(int*,char***) 
(8761218, 8047b6c, 8047b70) + 46
 08131c2a int try_main(int,char**) (2, 8047bcc) + 10a
 08137aae main     (6, 8047bcc, 8047be8) + 1a
 08130c1a _start   (6, 8047cb4, 8047cd6, 0, 8047cf5, 0) + 7a

adam at bishop # pstack 29130  |  /opt/SUNWspro/bin/c++filt
29130:  /export/home/adam/amanda/bin/mpqc bishop 34021 -p4amslave 
-p4yourname
 fda20a57 pollsys  (80432e0, 2, 0, 0)
 fd9cee0a pselect  (a, 80433cc, fda49868, fda49868, 0, 0) + 18e
 fd9cf100 select   (a, 80433cc, 0, 0, 0) + 82
 0856a838 listener (8047ddc, 8728648, 8761218, 0, 0, 0) + 238
 08564ba3 create_rm_processes (1, 6) + 653
 085642ed rm_start (8047ddc, 8047e3c) + 43d
 085610f5 p4_initenv (8047ddc, 8047e3c) + 185
 0856fd24 MPID_P4_Init (8047ddc, 8047de0) + 24
 0856efcf MPID_CH_InitMsgPass (8047ddc, 8047de0, 4000, 1f400) + ef
 0856c235 MPID_Init (8047ddc, 8047de0, 0, 80454d0) + 1f5
 0854a120 MPIR_Init (8047ddc, 8047de0) + 130
 08549fd6 MPI_Init_thread (8047ddc, 8047de0, 1, 804555c) + 26
 0851da81 void sc::MPIMessageGrp::init(int,int*,char***) (8761218, 5b, 
8047ddc, 8047de0) + 19d
 0851d61e sc::MPIMessageGrp::MPIMessageGrp #Nvariant 1(int*,char***) 
(8761218, 8047ddc, 8047de0) + 46
 08131c2a int try_main(int,char**) (4, 8047e3c) + 10a
 08137aae main     (8, 8047e3c, 8047e60) + 1a
 08130c1a _start   (8, 8047eec, 8047f0e, 8047f15, 8047f1b, 0) + 7a


adam at bishop # pstack 29125  |  /opt/SUNWspro/bin/c++filt
29125:  /export/home/adam/amanda/bin/mpqc data/h2o_mp200sto3gc1.in -p4pg 
/expo
-----------------  lwp# 1 / thread# 1  --------------------
 fda20d87 lwp_wait (2, 803e7f4)
 fda1cfd2 _thrp_join (2, 0, 803e844, 1) + 5a
 fda1d151 pthread_join (2, 803e844) + 2b
 08522d33 int sc::PthreadThreadGrp::wait_threads() (87719b8) + 3f
 08522630 void sc::MTMPIMemoryGrp::deactivate() (8813e60) + a8
 08522675 void sc::MTMPIMemoryGrp::sync() (8813e60) + 3d
 08152c4c void sc::MBPT2::compute_cs_grad() (8811940) + 1a58
 0814dbc4 void sc::MBPT2::compute() (8811940) + 43c
 0852660a void sc::AccResultInfo::update() (8811980) + 36
 0844a256 double sc::Function::value() (8811940) + 16
 083c8080 double sc::MolecularEnergy::energy() (8811940) + 14
 0813505f int try_main(int,char**) (2, 8047bcc) + 353f
 08137aae main     (6, 8047bcc, 8047be8) + 1a
 08130c1a _start   (6, 8047cb4, 8047cd6, 0, 8047cf5, 0) + 7a
-----------------  lwp# 2 / thread# 2  --------------------
 fda20a57 pollsys  (fd5e7bd0, 1, fd5e7c68, 0)
 fd9cee0a pselect  (8, fd5e7cac, fda49868, fda49868, fd5e7c68, 0) + 18e
 fd9cf100 select   (8, fd5e7cac, 0, 0, fd5e7d2c) + 82
 08569403 socket_recv (1) + 1c3
 0857b3b3 recv_message (873ec78, 873ec74) + 33
 0857b1af p4_recv  (873ec78, 873ec74, fd5e7dc8, 873efbc) + 6f
 0858234b MPID_CH_Check_incoming (8761290, 1) + 2ab
 0857e079 MPID_RecvComplete (fd5ebe5c, fd5ebf80, fd5ebf20) + d9
 0856d2cd MPID_RecvDatatype (8814170, fd5ebf68, 18, 873ea60, fffffffe, 
3a99) + 8d
 08549be8 MPI_Recv (fd5ebf68, 18, 3, fffffffe, 3a99, 89) + 218
 0852122d int sc::MTMPIThread::run_one() (881d398) + 2d
 085211f4 void sc::MTMPIThread::run() (881d398) + 10
 08519f95 void*sc::Thread::run_Thread_run(void*) (881d398) + 15
 08519ece Thread__run_Thread_run (881d398) + e
 fda1fd36 _thr_setup (fd8e2400) + 4e
 fda20020 _lwp_start (fd8e2400, 0, 0, fd5ebff8, fda20020, fd8e2400)
adam at bishop #
adam at bishop # pstack 29129  |  /opt/SUNWspro/bin/c++filt
29129:  /export/home/adam/amanda/bin/mpqc bishop 34021 -p4amslave 
-p4yourname
-----------------  lwp# 1 / thread# 1  --------------------
 fda20d87 lwp_wait (2, 803ea64)
 fda1cfd2 _thrp_join (2, 0, 803eab4, 1) + 5a
 fda1d151 pthread_join (2, 803eab4) + 2b
 08522d33 int sc::PthreadThreadGrp::wait_threads() (87719b8) + 3f
 08522630 void sc::MTMPIMemoryGrp::deactivate() (87bb918) + a8
 08522675 void sc::MTMPIMemoryGrp::sync() (87bb918) + 3d
 08152c4c void sc::MBPT2::compute_cs_grad() (87beeb0) + 1a58
 0814dbc4 void sc::MBPT2::compute() (87beeb0) + 43c
 0852660a void sc::AccResultInfo::update() (87beef0) + 36
 0844a256 double sc::Function::value() (87beeb0) + 16
 083c8080 double sc::MolecularEnergy::energy() (87beeb0) + 14
 0813505f int try_main(int,char**) (2, 877ece8) + 353f
 08137aae main     (8, 8047e3c, 8047e60) + 1a
 08130c1a _start   (8, 8047eec, 8047f0e, 8047f15, 8047f1b, 0) + 7a
-----------------  lwp# 2 / thread# 2  --------------------
 fda20a57 pollsys  (fd5e7be0, 1, fd5e7c68, 0)
 fd9cee0a pselect  (7, fd5e7cac, fda49868, fda49868, fd5e7c68, 0) + 18e
 fd9cf100 select   (7, fd5e7cac, 0, 0, fd5e7d2c) + 82
 08569403 socket_recv (1) + 1c3
 0857b3b3 recv_message (873ec78, 873ec74) + 33
 0857b1af p4_recv  (873ec78, 873ec74, fd5e7dc8, 873efbc) + 6f
 0858234b MPID_CH_Check_incoming (8761290, 1) + 2ab
 0857e079 MPID_RecvComplete (fd5ebe5c, fd5ebf80, fd5ebf20) + d9
 0856d2cd MPID_RecvDatatype (87bbc28, fd5ebf68, 18, 873ea60, fffffffe, 
3a99) + 8d
 08549be8 MPI_Recv (fd5ebf68, 18, 3, fffffffe, 3a99, 89) + 218
 0852122d int sc::MTMPIThread::run_one() (87c77c0) + 2d
 085211f4 void sc::MTMPIThread::run() (87c77c0) + 10
 08519f95 void*sc::Thread::run_Thread_run(void*) (87c77c0) + 15
 08519ece Thread__run_Thread_run (87c77c0) + e
 fda1fd36 _thr_setup (fd8e2400) + 4e
 fda20020 _lwp_start (fd8e2400, 0, 0, fd5ebff8, fda20020, fd8e2400)


Environment: Solaris 10 for AMD64.
Compiler: Sun Studio 11
The MPQC configure command is:
./configure --with-cc="mpicc" --with-cxx="mpicxx" --with-f77="mpif77" 
--with-libs="-lsunperf -lmpich" --with-libdirs='-L/opt/mpich/lib' 
--prefix=/opt/mpqc --enable-always-use-mpi --with-default-parallel=mpi 
--with-mpi-thread="funneled"

Regards,
Adam




More information about the mpich-discuss mailing list