[SCore-users-jp] [54] MTMI: fatal error (0x1): score_attach_network() failed: 12 <13> SCORE: Program signaled (Aborted).について

鈴木 陽介 yosukeys @ yahoo.co.jp
2012年 7月 13日 (金) 19:26:25 JST


亀山さま
神奈川大学 鈴木陽介です。

メール有難うございます。
最初から設定を見直してみました。問題はありませんでしたが

結果、条件は不明ですが、[5] MTMI: fatal errorが発生したら
以下手順で、再操作すると、16x4も実行できます。

有難うございました。頂いた計算方法はじっくり確認してみます。
> 16x4 ですと, 4 MB 強ぐらいでしょうか?

また、以下の部分も再検討してみます。
> dmesg | grep new_pm_ethernet_context
 

なお、まだよく理解できていませんが

気になっているのが、以下のSTEPです。\マーク以下、scorehosts.dbまでの
コマンド実行です。

# rsh-all -q -s -P -g machinefile /opt/score/sbin/scbdrec 2> \
>             /dev/null > /opt/score/etc/scorehosts.db

#cd /opt/score/7.0.1/etc/
# cat scorehosts.db
上記の操作で、scorehosts.db も確認してみました。
問題無いようです。

■rootユーザーでログイン(server3.pccluster.org)
#cd /home/sc01
#scout -g machinefile
SCOUT: session started. の応答確認する。
# mpicc hello.c
scrun -g=machinefile -node=4x4 -network=ethernet ./a.out

SCORE{1} 16 nodes (4x4) ready.
[5] MTMI: fatal error (0x1): score_attach_network() failed: 12
<1> SCORE: Program signaled (Aborted).


■再設定実行

scorehosts.dbの定義内容を確認する。

#cd /opt/score/7.0.1/etc/ 
#ls scorehosts.db
#cat scorehosts.db

[root @ server3 sc01]# scrun -g=pcc -node=4x4 -network=ethernet ./a.out
SCore (7.0.1) Connected
SCORE{1} 16 nodes (4x4) ready.
[5] MTMI: fatal error (0x1): score_attach_network() failed: 12
<1> SCORE: Program signaled (Aborted).
[root @ server3 sc01]# scrun -g=pcc -node=4x4 -network=ethernet ./a.out

エラーコード[5] MTMI: fatal error (0x1):終了!

■score関連の環境変数の確認からリトライ

# printenv | grep SC

SCORE_PROGFILE_TEMPLATE=/opt/score/7.0.1/bin/obj.%s/scout.exe
SCOUT_CSH_OPTION=-f
SCORE_ACCEPT_PLATFORM=x86_64-rhel5-linux2_6
SCORE_PREFIX_COMMAND=
SCOUT_TCSH_OPTION=-f
SCBDSERV=server3.pccluster.org
SCOUT_NHOSTS=16
SCORE_RSH=/usr/bin/rsh
SCOUTPORT=/var/tmp//SCore7.root/server3.pccluster.org//scout/scout-3da32

■server3でマウントされているディレクトリとクライアントをリストする。

# showmount -a
All mount points on server3.pccluster.org:
192.168.0.11:/home
192.168.0.11:/opt/score
192.168.0.12:/home
192.168.0.12:/opt/score
192.168.0.13:/home
192.168.0.13:/opt/score
192.168.0.13:/root
192.168.0.14:/home
192.168.0.14:/opt/score
192.168.0.15:/home
192.168.0.15:/opt/score
192.168.0.16:/home
192.168.0.16:/opt/score
192.168.0.17:/home
192.168.0.17:/opt/score
192.168.0.18:/home
192.168.0.18:/opt/score
192.168.0.19:/home
192.168.0.19:/opt/score
192.168.0.203:/home
192.168.0.203:/opt/score
192.168.0.20:/home
192.168.0.20:/opt/score
192.168.0.21:/home
192.168.0.21:/opt/score
192.168.0.22:/home
192.168.0.22:/opt/score
192.168.0.23:/home
192.168.0.23:/opt/score
192.168.0.24:/home
192.168.0.24:/opt/score
192.168.0.25:/home
192.168.0.25:/opt/score
192.168.0.26:/home
192.168.0.26:/opt/score

# showmount -d
Directories on server3.pccluster.org:
/home
/opt/score
/root

# showmount -e
Export list for server3.pccluster.org:
/home 192.168.0.0/255.255.255.0

■計算ホスト16台の再起動の実行

#rsh host001
#reboot
#exit

# cd /home/sc01
# bash
# export SCORE_RSH=rsh
# . /etc/profile.d/score.sh
# sceptic -g machinefile -v
host005.pccluster.org: OK
host002.pccluster.org: OK
host009.pccluster.org: OK
host006.pccluster.org: OK
host012.pccluster.org: OK
host015.pccluster.org: OK
host003.pccluster.org: OK
host013.pccluster.org: OK
host004.pccluster.org: OK
host008.pccluster.org: OK
host011.pccluster.org: OK
host014.pccluster.org: OK
host010.pccluster.org: OK
host001.pccluster.org: OK
host016.pccluster.org: OK
host007.pccluster.org: OK

# printenv | grep SC
SCOUT_CSH_OPTION=-f
SCOUT_TCSH_OPTION=-f
SCBDSERV=server3.pccluster.org
SCORE_RSH=/usr/bin/rsh

# SCORE_RSH=/usr/bin/rsh
#  . /etc/profile.d/score.sh
# rsh-all -q -s -P -g machinefile uptime 2> /dev/null

 11:02:15 up 10 min,  0 users,  load average: 0.00, 0.05, 0.06
 11:02:15 up 10 min,  0 users,  load average: 0.00, 0.03, 0.03
 11:02:15 up 10 min,  0 users,  load average: 0.04, 0.05, 0.04
 11:02:14 up 9 min,  0 users,  load average: 0.00, 0.04, 0.05
 11:02:15 up 9 min,  0 users,  load average: 0.00, 0.05, 0.05
 09:53:11 up 9 min,  0 users,  load average: 0.00, 0.05, 0.05
 10:02:00 up 9 min,  0 users,  load average: 0.02, 0.08, 0.07
 09:55:38 up 9 min,  0 users,  load average: 0.00, 0.06, 0.05
 11:02:14 up 9 min,  0 users,  load average: 0.00, 0.27, 0.28
 11:00:50 up 7 min,  0 users,  load average: 0.00, 0.07, 0.06
 10:01:54 up 7 min,  0 users,  load average: 0.00, 0.07, 0.06
 11:02:08 up 7 min,  0 users,  load average: 0.00, 0.08, 0.06
 11:01:51 up 7 min,  0 users,  load average: 0.00, 0.19, 0.14
 11:03:24 up 7 min,  0 users,  load average: 0.00, 0.17, 0.12
 11:02:15 up 7 min,  0 users,  load average: 0.00, 0.09, 0.06
 11:03:33 up 7 min,  0 users,  load average: 0.02, 0.10, 0.07

# bash
# rsh-all -q -s -P -g machinefile /opt/score/sbin/scbdrec 2> \
>             /dev/null > /opt/score/etc/scorehosts.db

# reboot


■serverに、rootユーザーでログイン(server3.pccluster.org)
#cd /home/sc01

■scorehosts.dbの定義内容を確認する。

#cd /opt/score/7.0.1/etc/ 
#ls scorehosts.db
#cat scorehosts.db

# cat scorehosts.db
host001.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host002.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host003.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host004.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host005.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host006.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host007.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host008.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host009.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host010.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host011.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host012.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host013.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host014.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host015.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc
host016.pccluster.org   socks=1  cores=4  speed=2100 \
        network=etherhxb,ethernet,sctp \
        group=_scoreall_,pcc

#cd /home/sc01

# scout -g machinefile
SCOUT: session started.
#mpicc hello.c

# scrun -g=machinefile -node=16x4 -network=ethernet ./a.out
SCore (7.0.1) Connected
SCORE{1} 64 nodes (16x4) ready.
Hello !! from host001.pccluster.org @ 0/64
Hello !! from host001.pccluster.org @ 1/64
Hello !! from host001.pccluster.org @ 2/64
Hello !! from host004.pccluster.org @ 12/64
Hello !! from host002.pccluster.org @ 4/64
Hello !! from host005.pccluster.org @ 16/64
Hello !! from host009.pccluster.org @ 32/64
Hello !! from host003.pccluster.org @ 8/64
Hello !! from host008.pccluster.org @ 28/64
Hello !! from host004.pccluster.org @ 13/64
Hello !! from host002.pccluster.org @ 5/64
Hello !! from host009.pccluster.org @ 33/64
Hello !! from host005.pccluster.org @ 18/64
Hello !! from host008.pccluster.org @ 29/64
Hello !! from host004.pccluster.org @ 14/64
Hello !! from host009.pccluster.org @ 35/64
Hello !! from host010.pccluster.org @ 36/64
Hello !! from host008.pccluster.org @ 31/64
Hello !! from host003.pccluster.org @ 9/64
Hello !! from host014.pccluster.org @ 52/64
Hello !! from host003.pccluster.org @ 10/64
Hello !! from host010.pccluster.org @ 37/64
Hello !! from host016.pccluster.org @ 60/64
Hello !! from host014.pccluster.org @ 54/64
Hello !! from host011.pccluster.org @ 40/64
Hello !! from host016.pccluster.org @ 61/64
Hello !! from host014.pccluster.org @ 55/64
Hello !! from host011.pccluster.org @ 42/64
Hello !! from host010.pccluster.org @ 39/64
Hello !! from host016.pccluster.org @ 62/64
Hello !! from host011.pccluster.org @ 43/64
Hello !! from host001.pccluster.org @ 3/64
Hello !! from host005.pccluster.org @ 17/64
Hello !! from host005.pccluster.org @ 19/64
Hello !! from host012.pccluster.org @ 44/64
Hello !! from host012.pccluster.org @ 45/64
Hello !! from host012.pccluster.org @ 46/64
Hello !! from host016.pccluster.org @ 63/64
Hello !! from host008.pccluster.org @ 30/64
Hello !! from host014.pccluster.org @ 53/64
Hello !! from host011.pccluster.org @ 41/64
Hello !! from host015.pccluster.org @ 56/64
Hello !! from host012.pccluster.org @ 47/64
Hello !! from host015.pccluster.org @ 57/64
Hello !! from host009.pccluster.org @ 34/64
Hello !! from host004.pccluster.org @ 15/64
Hello !! from host006.pccluster.org @ 20/64
Hello !! from host007.pccluster.org @ 24/64
Hello !! from host007.pccluster.org @ 25/64
Hello !! from host006.pccluster.org @ 21/64
Hello !! from host007.pccluster.org @ 26/64
Hello !! from host006.pccluster.org @ 22/64
Hello !! from host006.pccluster.org @ 23/64
Hello !! from host007.pccluster.org @ 27/64
Hello !! from host010.pccluster.org @ 38/64
Hello !! from host015.pccluster.org @ 58/64
Hello !! from host015.pccluster.org @ 59/64
Hello !! from host002.pccluster.org @ 6/64
Hello !! from host013.pccluster.org @ 48/64
Hello !! from host013.pccluster.org @ 49/64
Hello !! from host013.pccluster.org @ 50/64
Hello !! from host002.pccluster.org @ 7/64
Hello !! from host013.pccluster.org @ 51/64
Hello !! from host003.pccluster.org @ 11/64

# scrun -g=machinefile -node=16x4 -network=ethernet ./a.out | sort
■実行結果をソートしてみました。


SCore (7.0.1) Connected
SCORE{1} 64 nodes (16x4) ready.
Hello !! from host001.pccluster.org @ 0/64
Hello !! from host001.pccluster.org @ 1/64
Hello !! from host001.pccluster.org @ 2/64
Hello !! from host001.pccluster.org @ 3/64
Hello !! from host002.pccluster.org @ 4/64
Hello !! from host002.pccluster.org @ 5/64
Hello !! from host002.pccluster.org @ 6/64
Hello !! from host002.pccluster.org @ 7/64
Hello !! from host003.pccluster.org @ 10/64
Hello !! from host003.pccluster.org @ 11/64
Hello !! from host003.pccluster.org @ 8/64
Hello !! from host003.pccluster.org @ 9/64
Hello !! from host004.pccluster.org @ 12/64
Hello !! from host004.pccluster.org @ 13/64
Hello !! from host004.pccluster.org @ 14/64
Hello !! from host004.pccluster.org @ 15/64
Hello !! from host005.pccluster.org @ 16/64
Hello !! from host005.pccluster.org @ 17/64
Hello !! from host005.pccluster.org @ 18/64
Hello !! from host005.pccluster.org @ 19/64
Hello !! from host006.pccluster.org @ 20/64
Hello !! from host006.pccluster.org @ 21/64
Hello !! from host006.pccluster.org @ 22/64
Hello !! from host006.pccluster.org @ 23/64
Hello !! from host007.pccluster.org @ 24/64
Hello !! from host007.pccluster.org @ 25/64
Hello !! from host007.pccluster.org @ 26/64
Hello !! from host007.pccluster.org @ 27/64
Hello !! from host008.pccluster.org @ 28/64
Hello !! from host008.pccluster.org @ 29/64
Hello !! from host008.pccluster.org @ 30/64
Hello !! from host008.pccluster.org @ 31/64
Hello !! from host009.pccluster.org @ 32/64
Hello !! from host009.pccluster.org @ 33/64
Hello !! from host009.pccluster.org @ 34/64
Hello !! from host009.pccluster.org @ 35/64
Hello !! from host010.pccluster.org @ 36/64
Hello !! from host010.pccluster.org @ 37/64
Hello !! from host010.pccluster.org @ 38/64
Hello !! from host010.pccluster.org @ 39/64
Hello !! from host011.pccluster.org @ 40/64
Hello !! from host011.pccluster.org @ 41/64
Hello !! from host011.pccluster.org @ 42/64
Hello !! from host011.pccluster.org @ 43/64
Hello !! from host012.pccluster.org @ 44/64
Hello !! from host012.pccluster.org @ 45/64
Hello !! from host012.pccluster.org @ 46/64
Hello !! from host012.pccluster.org @ 47/64
Hello !! from host013.pccluster.org @ 48/64
Hello !! from host013.pccluster.org @ 49/64
Hello !! from host013.pccluster.org @ 50/64
Hello !! from host013.pccluster.org @ 51/64
Hello !! from host014.pccluster.org @ 52/64
Hello !! from host014.pccluster.org @ 53/64
Hello !! from host014.pccluster.org @ 54/64
Hello !! from host014.pccluster.org @ 55/64
Hello !! from host015.pccluster.org @ 56/64
Hello !! from host015.pccluster.org @ 57/64
Hello !! from host015.pccluster.org @ 58/64
Hello !! from host015.pccluster.org @ 59/64
Hello !! from host016.pccluster.org @ 60/64
Hello !! from host016.pccluster.org @ 61/64
Hello !! from host016.pccluster.org @ 62/64
Hello !! from host016.pccluster.org @ 63/64

--- On Thu, 2012/7/12, Kameyama Toyohisa <kameyama @ riken.jp> wrote:

> 亀山です.
> 
> (2012年07月11日 17:25), 鈴木 陽介 wrote:
> >>> A.OUT @ host008[30/64:2/4]#7840:   CALL pmEthernetSetupContext() at >ethernet.c:178 !!ERROR!! cannot set key 16. errno is 12
> >
> >> PMX/Ethernet は通信のための memory を確保しているのですが,
> >> その memory の確保に失敗しているようです.
> >
> > そうですか、計算ホストごとに、どれくらい空きメモリが必要でしょうか?
> 
> kernel で割り当てる容量ですが,
>      scrun -nodes=XxY
> で起動したとき, 大体
>     1  + X + (384 + 16) * 2 + (1 + X * Y + (384 + 16) * 2) * Y
> KB になります.
> 
> 16x4 ですと, 4 MB 強ぐらいでしょうか?
> 
> >> (dmesg で見ればどの memory の確保で失敗したかが分かりますが...)
> > dmesgコマンドは理解できていないので、
> > 確認方法をご教授いただけますか?
> 
>      dmesg | grep new_pm_ethernet_context
> あたりで見てください.
> 
> > なお、代わりに、cat /proc/meminfoコマンドで確認しました。
> >
> >> 計算ホストにどれくらいの memory がありますでしょうか?
> >> 他の process やドライバがメモリを占有していないでしょうか?
> >
> > ●host001~016まで
> > MemTotalは、すべて 各計算ホスト 1025756 kBです。
> >
> > MemFreeは,以下のとおりです。
> > フリーメモリにバラツキがあるので詳細別途確認しますが
> >
> 
> 搭載memory が 1 GB で free が 50 MB から 100 MB ぐらい
> という感じですね.
> 十分そうな気はしますが...
> 
> Kameyama Toyohisa
> 



SCore-users-jp メーリングリストの案内