<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">

<head>
<meta http-equiv=Content-Type content="text/html; charset=us-ascii">
<meta name=Generator content="Microsoft Word 12 (filtered medium)">
<style>
<!--
 /* Font Definitions */
 @font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:Tahoma;
        panose-1:2 11 6 4 3 5 4 4 2 4;}
 /* Style Definitions */
 p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0in;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman","serif";}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:purple;
        text-decoration:underline;}
p
        {mso-style-priority:99;
        mso-margin-top-alt:auto;
        margin-right:0in;
        mso-margin-bottom-alt:auto;
        margin-left:0in;
        font-size:12.0pt;
        font-family:"Times New Roman","serif";}
span.subsectiontoc
        {mso-style-name:subsectiontoc;}
span.EmailStyle19
        {mso-style-type:personal-reply;
        font-family:"Calibri","sans-serif";
        color:#1F497D;}
.MsoChpDefault
        {mso-style-type:export-only;}
@page Section1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
div.Section1
        {page:Section1;}
-->
</style>
<!--[if gte mso 9]><xml>
 <o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
 <o:shapelayout v:ext="edit">
  <o:idmap v:ext="edit" data="1" />
 </o:shapelayout></xml><![endif]-->
</head>

<body lang=EN-US link=blue vlink=purple>

<div class=Section1>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Krishna, thanks for the suggestion &#8211; but setting
MV2_USE_SHMEM_COLL to zero did not seem to change the stack trace much:<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p>&nbsp;</o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Node 0:<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p>&nbsp;</o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>0x00002aaaaab5d8b7 in MPIDI_CH3I_MRAILI_Cq_poll
(vbuf_handle=0x7fffcb46d698,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; vc_req=0x0, receiving=0, is_blocking=1) at
ibv_channel_manager.c:529<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>529&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;for (; i
&lt; rdma_num_hcas; ++i) {<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>(gdb) where<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#0&nbsp; 0x00002aaaaab5d8b7 in MPIDI_CH3I_MRAILI_Cq_poll (<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; vbuf_handle=0x7fffcb46d698, vc_req=0x0,
receiving=0, is_blocking=1)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at ibv_channel_manager.c:529<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#1&nbsp; 0x00002aaaaab177fa in MPIDI_CH3I_read_progress
(vc_pptr=0x7fffcb46d6a0,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; v_ptr=0x7fffcb46d698, is_blocking=1) at
ch3_read_progress.c:143<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#2&nbsp; 0x00002aaaaab17464 in MPIDI_CH3I_Progress
(is_blocking=1,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; state=&lt;value optimized out&gt;) at
ch3_progress.c:202<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#3&nbsp; 0x00002aaaaab5bc4e in MPIC_Wait
(request_ptr=0x2aaaaae19800)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at helper_fns.c:269<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#4&nbsp; 0x00002aaaaab5c043 in MPIC_Sendrecv
(sendbuf=0x10993a80, sendcount=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendtype=1275069445, dest=1, sendtag=7,
recvbuf=0x10993a88, recvcount=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvtype=1275069445, source=1, recvtag=7,
comm=1140850688,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; status=0x7fffcb46d820) at helper_fns.c:125<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#5&nbsp; 0x00002aaaaaafe387 in MPIR_Allgather (sendbuf=&lt;value
optimized out&gt;,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendcount=&lt;value optimized out&gt;,
sendtype=1275069445, recvbuf=0x10993a80,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvcount=2, recvtype=1275069445,
comm_ptr=0x2aaaaae1c1e0)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at allgather.c:192<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#6&nbsp; 0x00002aaaaaafeff9 in PMPI_Allgather
(sendbuf=0xffffffffffffffff,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendcount=2, sendtype=1275069445,
recvbuf=0x10993a80, recvcount=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvtype=1275069445, comm=1140850688) at
allgather.c:866<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#7&nbsp; 0x00002aaaaab3b00b in PMPI_Comm_split (comm=1140850688,
color=0, key=0,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; newcomm=0x2aaaaae1c2f4) at comm_split.c:196<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#8&nbsp; 0x00002aaaaab3cd84 in create_2level_comm
(comm=1140850688, size=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>---Type &lt;return&gt; to continue, or q &lt;return&gt; to
quit---<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; my_rank=&lt;value optimized out&gt;) at
create_2level_comm.c:142<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#9&nbsp; 0x00002aaaaab6877d in PMPI_Init (argc=0x7fffcb46db3c,
argv=0x7fffcb46db30)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at init.c:146<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#10 0x0000000000400b2f in main (argc=3, argv=0x7fffcb46dc78) at
bw.c:27<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p>&nbsp;</o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Node 1:<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p>&nbsp;</o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>MPIDI_CH3I_read_progress (vc_pptr=0x7fff0b10bb50,
v_ptr=0x7fff0b10bb48,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; is_blocking=1) at ch3_read_progress.c:143<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>143&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; type =
MPIDI_CH3I_MRAILI_Cq_poll(v_ptr, NULL, 0, is_blocking);<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>(gdb) where<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#0&nbsp; MPIDI_CH3I_read_progress (vc_pptr=0x7fff0b10bb50,
v_ptr=0x7fff0b10bb48,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; is_blocking=1) at ch3_read_progress.c:143<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#1&nbsp; 0x00002afc9fb21f44 in MPIDI_CH3I_Progress (is_blocking=1,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; state=&lt;value optimized out&gt;) at
ch3_progress.c:202<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#2&nbsp; 0x00002afc9fb6660e in MPIC_Wait
(request_ptr=0x2afc9fd242a0)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at helper_fns.c:269<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#3&nbsp; 0x00002afc9fb66a03 in MPIC_Sendrecv (sendbuf=0xf77028,
sendcount=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendtype=1275069445, dest=0, sendtag=7,
recvbuf=0xf77020, recvcount=4,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvtype=1275069445, source=0, recvtag=7,
comm=1140850688,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; status=0x7fff0b10bcd0) at helper_fns.c:125<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#4&nbsp; 0x00002afc9fb08ddb in MPIR_Allgather (sendbuf=&lt;value
optimized out&gt;,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendcount=&lt;value optimized out&gt;,
sendtype=1275069445, recvbuf=0xf77020,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvcount=2, recvtype=1275069445,
comm_ptr=0x2afc9fd26c80)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at allgather.c:192<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#5&nbsp; 0x00002afc9fb09a45 in PMPI_Allgather
(sendbuf=0xffffffffffffffff,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; sendcount=2, sendtype=1275069445, recvbuf=0xf77020,
recvcount=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; recvtype=1275069445, comm=1140850688) at
allgather.c:866<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#6&nbsp; 0x00002afc9fb4591b in PMPI_Comm_split (comm=1140850688,
color=1, key=0,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; newcomm=0x2afc9fd26d94) at comm_split.c:196<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#7&nbsp; 0x00002afc9fb478f4 in create_2level_comm (comm=1140850688,
size=2,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; my_rank=&lt;value optimized out&gt;) at
create_2level_comm.c:142<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#8&nbsp; 0x00002afc9fb730a5 in PMPI_Init (argc=0x7fff0b10bfec,
argv=0x7fff0b10bfe0)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>&nbsp;&nbsp;&nbsp; at init.c:146<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>---Type &lt;return&gt; to continue, or q &lt;return&gt; to
quit---<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>#9&nbsp; 0x0000000000400bcf in main (argc=3,
argv=0x7fff0b10c128) at bw.c:27<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'><o:p>&nbsp;</o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'>Any suggestions would be appreciated.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'>--</span><span style='color:#1F497D'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'>Michael Heinz</span><span style='color:#1F497D'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'>Principal Engineer, Qlogic Corporation</span><span
style='color:#1F497D'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Arial","sans-serif";
color:#1F497D'>King of Prussia, Pennsylvania</span><span style='font-size:11.0pt;
font-family:"Calibri","sans-serif";color:#1F497D'><o:p></o:p></span></p>

<div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span
style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>
kris.c1986@gmail.com [mailto:kris.c1986@gmail.com] <b>On Behalf Of </b>Krishna
Chaitanya<br>
<b>Sent:</b> Tuesday, July 14, 2009 6:39 PM<br>
<b>To:</b> Mike Heinz<br>
<b>Cc:</b> Todd Rimmer; mvapich-discuss@cse.ohio-state.edu;
mpich2-dev@mcs.anl.gov<br>
<b>Subject:</b> Re: [mvapich-discuss] [mpich2-dev] Need a hint in debugging a
problem that only affects a few machines in our cluster.<o:p></o:p></span></p>

</div>

<p class=MsoNormal><o:p>&nbsp;</o:p></p>

<p class=MsoNormal style='margin-bottom:12.0pt'>Mike,<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; The hang seems to be occurring
when the MPI library is trying to create the 2-level communicator, during the
init phase. Can you try running the test with <span class=subsectiontoc><a
href="http://mvapich.cse.ohio-state.edu/support/user_guide_mvapich2-1.4rc1.html#x1-16000011.74"
id=QQ2-1-160>MV2_USE_SHMEM_COLL</a></span>=0. This will ensure that a flat
communicator is used for the subsequent MPI calls. This might help us isolate
the problem. <br>
<br>
Thanks,<br>
Krishna <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <o:p></o:p></p>

<div>

<p class=MsoNormal>On Tue, Jul 14, 2009 at 5:04 PM, Mike Heinz &lt;<a
href="mailto:michael.heinz@qlogic.com">michael.heinz@qlogic.com</a>&gt; wrote:<o:p></o:p></p>

<div>

<div>

<p>We&#8217;re having a very odd problem with our fabric, where, out of the
entire cluster, machine &#8220;A&#8221; can&#8217;t run mvapich2 programs with
machine &#8220;B&#8221;, and machine &#8220;C&#8221; can&#8217;t run
programs with machine &#8220;D&#8221; &#8211; even though &#8220;A&#8221; can
run with &#8220;D&#8221; and &#8220;B&#8221; can run with &#8220;C&#8221;
&#8211; and the rest of the fabric works fine.<o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p style='text-indent:-.25in'>1)<span style='font-size:7.0pt'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span>There are no IB errors anywhere on the fabric that I can find, and the
machines in question all work correctly with mvapich1 and low-level IB tests.<o:p></o:p></p>

<p style='text-indent:-.25in'>2)<span style='font-size:7.0pt'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span>The problem occurs whether using mpd or rsh.<o:p></o:p></p>

<p style='text-indent:-.25in'>3)<span style='font-size:7.0pt'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span>If I attach to the running processes, both machines appear to be waiting
for a read operation to complete. (See below)<o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p>Can anyone make a suggestion on how to debug this? <o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p>Stack trace for node 0:<o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p>#0&nbsp; 0x000000361160abb5 in pthread_spin_lock () from
/lib64/libpthread.so.0<o:p></o:p></p>

<p>#1&nbsp; 0x00002aaaab08fb6c in mthca_poll_cq (ibcq=0x2060980, ne=1,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; wc=0x7fff9d835900) at src/cq.c:468<o:p></o:p></p>

<p>#2&nbsp; 0x00002aaaaab5d8d8 in MPIDI_CH3I_MRAILI_Cq_poll (<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; vbuf_handle=0x7fff9d8359d8, vc_req=0x0, receiving=0,
is_blocking=1)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at /usr/include/infiniband/verbs.h:934<o:p></o:p></p>

<p>#3&nbsp; 0x00002aaaaab177fa in MPIDI_CH3I_read_progress
(vc_pptr=0x7fff9d8359e0,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; v_ptr=0x7fff9d8359d8, is_blocking=1) at
ch3_read_progress.c:143<o:p></o:p></p>

<p>#4&nbsp; 0x00002aaaaab17464 in MPIDI_CH3I_Progress (is_blocking=1,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; state=&lt;value optimized out&gt;) at ch3_progress.c:202<o:p></o:p></p>

<p>#5&nbsp; 0x00002aaaaab5bc4e in MPIC_Wait (request_ptr=0x2aaaaae19800)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at helper_fns.c:269<o:p></o:p></p>

<p>#6&nbsp; 0x00002aaaaab5c043 in MPIC_Sendrecv (sendbuf=0x217fc50,
sendcount=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendtype=1275069445, dest=1, sendtag=7,
recvbuf=0x217fc58, recvcount=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvtype=1275069445, source=1, recvtag=7,
comm=1140850688,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; status=0x7fff9d835b60) at helper_fns.c:125<o:p></o:p></p>

<p>#7&nbsp; 0x00002aaaaaafe387 in MPIR_Allgather (sendbuf=&lt;value optimized
out&gt;,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendcount=&lt;value optimized out&gt;,
sendtype=1275069445, recvbuf=0x217fc50,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvcount=2, recvtype=1275069445,
comm_ptr=0x2aaaaae1c1e0)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at allgather.c:192<o:p></o:p></p>

<p>#8&nbsp; 0x00002aaaaaafeff9 in PMPI_Allgather (sendbuf=0xffffffffffffffff,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendcount=2, sendtype=1275069445, recvbuf=0x217fc50,
recvcount=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvtype=1275069445, comm=1140850688) at allgather.c:866<o:p></o:p></p>

<p>---Type &lt;return&gt; to continue, or q &lt;return&gt; to quit---<o:p></o:p></p>

<p>#9&nbsp; 0x00002aaaaab3b00b in PMPI_Comm_split (comm=1140850688, color=0,
key=0,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; newcomm=0x2aaaaae1c2f4) at comm_split.c:196<o:p></o:p></p>

<p>#10 0x00002aaaaab3cd84 in create_2level_comm (comm=1140850688, size=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; my_rank=&lt;value optimized out&gt;) at
create_2level_comm.c:142<o:p></o:p></p>

<p>#11 0x00002aaaaab6877d in PMPI_Init (argc=0x7fff9d835e7c,
argv=0x7fff9d835e70)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at init.c:146<o:p></o:p></p>

<p>#12 0x0000000000400b2f in main (argc=3, argv=0x7fff9d835fb8) at bw.c:27<o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p>Stack trace for node 1:<o:p></o:p></p>

<p>&nbsp;<o:p></o:p></p>

<p>#0&nbsp; 0x00002ac3cbdac2d2 in MPIDI_CH3I_read_progress
(vc_pptr=0x7fffdee81020,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; v_ptr=0x7fffdee81018, is_blocking=1) at
ch3_read_progress.c:143<o:p></o:p></p>

<p>#1&nbsp; 0x00002ac3cbdabf44 in MPIDI_CH3I_Progress (is_blocking=1,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; state=&lt;value optimized out&gt;) at ch3_progress.c:202<o:p></o:p></p>

<p>#2&nbsp; 0x00002ac3cbdf060e in MPIC_Wait (request_ptr=0x2ac3cbfae2a0)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at helper_fns.c:269<o:p></o:p></p>

<p>#3&nbsp; 0x00002ac3cbdf0a03 in MPIC_Sendrecv (sendbuf=0xf79028, sendcount=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendtype=1275069445, dest=0, sendtag=7, recvbuf=0xf79020,
recvcount=4,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvtype=1275069445, source=0, recvtag=7,
comm=1140850688,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; status=0x7fffdee811a0) at helper_fns.c:125<o:p></o:p></p>

<p>#4&nbsp; 0x00002ac3cbd92ddb in MPIR_Allgather (sendbuf=&lt;value optimized
out&gt;,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendcount=&lt;value optimized out&gt;,
sendtype=1275069445, recvbuf=0xf79020,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvcount=2, recvtype=1275069445,
comm_ptr=0x2ac3cbfb0c80)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at allgather.c:192<o:p></o:p></p>

<p>#5&nbsp; 0x00002ac3cbd93a45 in PMPI_Allgather (sendbuf=0xffffffffffffffff,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; sendcount=2, sendtype=1275069445, recvbuf=0xf79020,
recvcount=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; recvtype=1275069445, comm=1140850688) at allgather.c:866<o:p></o:p></p>

<p>#6&nbsp; 0x00002ac3cbdcf91b in PMPI_Comm_split (comm=1140850688, color=1,
key=0,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; newcomm=0x2ac3cbfb0d94) at comm_split.c:196<o:p></o:p></p>

<p>#7&nbsp; 0x00002ac3cbdd18f4 in create_2level_comm (comm=1140850688, size=2,<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; my_rank=&lt;value optimized out&gt;) at
create_2level_comm.c:142<o:p></o:p></p>

<p>#8&nbsp; 0x00002ac3cbdfd0a5 in PMPI_Init (argc=0x7fffdee814bc,
argv=0x7fffdee814b0)<o:p></o:p></p>

<p>&nbsp;&nbsp;&nbsp; at init.c:146<o:p></o:p></p>

<p>---Type &lt;return&gt; to continue, or q &lt;return&gt; to quit---<o:p></o:p></p>

<p>#9&nbsp; 0x0000000000400bcf in main (argc=3, argv=0x7fffdee815f8) at bw.c:27<o:p></o:p></p>

<p><span style='font-size:10.0pt'>--</span><o:p></o:p></p>

<p><span style='font-size:10.0pt'>Michael Heinz</span><o:p></o:p></p>

<p><span style='font-size:10.0pt'>Principal Engineer, Qlogic Corporation</span><o:p></o:p></p>

<p><span style='font-size:10.0pt'>King of Prussia, Pennsylvania</span><o:p></o:p></p>

</div>

</div>

<p class=MsoNormal style='margin-bottom:12.0pt'><br>
_______________________________________________<br>
mvapich-discuss mailing list<br>
<a href="mailto:mvapich-discuss@cse.ohio-state.edu">mvapich-discuss@cse.ohio-state.edu</a><br>
<a href="http://mail.cse.ohio-state.edu/mailman/listinfo/mvapich-discuss"
target="_blank">http://mail.cse.ohio-state.edu/mailman/listinfo/mvapich-discuss</a><o:p></o:p></p>

</div>

<p class=MsoNormal><br>
<br clear=all>
<br>
-- <br>
In the middle of difficulty, lies opportunity<o:p></o:p></p>

</div>

</body>

</html>