# PBS batch job: fetch myprog.c, build it with mpicc, copy the binary to every
# allocated node and run it with mpiexec.
#
# Exit status:
#   0      - program ran and mpiexec reported success
#   1      - cannot cd to $TMPDIR (or TMPDIR is empty)
#   2      - cannot fetch the source with scp
#   3      - cannot prepare a node or copy the executable to it
#   4      - the executable could not be built / is missing
#   other  - exit status of mpiexec
#
# "defq" is the default and route only queue with
# targets: common and ib queues
#PBS -q defq
# Ask for 20 minutes of wall clock time for the whole job
#PBS -l walltime=00:20:00
# Ask for 5 minutes CPU time for the whole job
#PBS -l cput=00:05:00
# Ask for 10 CPUs in total
#PBS -l nodes=10
## Ask for 10 hosts x 4 cpus
##PBS -l nodes=wn001.jinr.ru:ppn=4+wn002.jinr.ru:ppn=4+wn003.jinr.ru:ppn=4+wn004.jinr.ru:ppn=4+wn005.jinr.ru:ppn=4+wn006.jinr.ru:ppn=4+wn007.jinr.ru:ppn=4+wn008.jinr.ru:ppn=4+wn009.jinr.ru:ppn=4+wn010.jinr.ru:ppn=4
# Job is not restartable
#PBS -r n
# No checkpoint for the job (not implemented on linux)
#PBS -c n
#
# start of the real jobs from here
#
echo "=================================================="

# Print user name
myname=$(whoami)
echo "whoami=$myname"

# Print execution host (mother superior); the name is reused below to skip
# this host when distributing the executable.
this_host=$(hostname -f)
echo "Mother Superior host: $this_host"
echo "Number of allocated CPU: $(wc -l < "$PBS_NODEFILE")"
echo "All allocated hosts:"
sort "$PBS_NODEFILE" | uniq

# Print current directory
echo "pwd=$(pwd 2>&1)"

# Check for PAG
echo "id=$(id 2>&1)"

# Check for my procs and environment (diagnostics only; failures are ignored)
echo "==================== klist ======================"
klist -5 2>&1 | grep -Ev '^[[:space:]]*$'
echo "==================== tokens ======================"
tokens 2>&1 | grep -Ev '^[[:space:]]*$'
echo "============== my processes ======================"
ps -ef | grep -F -- "$myname" | grep -v 'grep ' | grep -v 'ps -ef'
echo "============ /tmp/tkt* /tmp/krb5cc* ============"
/bin/ls -lt /tmp/tkt* /tmp/krb5cc* 2>&1 | grep -F -- "$myname" | head -5
echo "=================================================="

# The program source is in lxpub05:/scr/u/vmi/myprog.c
# Go to $TMPDIR and copy myprog.c
echo "cd \$TMPDIR"
# An empty TMPDIR must be rejected explicitly: a bare "cd" would silently
# succeed by switching to $HOME.
if test -z "$TMPDIR" || ! cd "$TMPDIR" 2>&1 ; then
  echo "ERROR: can not cd to \"$TMPDIR\""
  exit 1
fi

# Print current directory; the same path is created on every other node.
workdir=$(pwd)
echo "pwd=$workdir"

if ! scp -p2 lxpub05:/scr/u/vmi/myprog.c . ; then
  echo "ERROR: can not scp lxpub05:/scr/u/vmi/myprog.c"
  exit 2
fi
echo "Got myprog.c from lxpub05:/scr/u/vmi/myprog.c"

# Build the executable.
echo "mpicc: $(which mpicc 2>&1)"
echo "mpicc -i-dynamic -o myprog myprog.c"
if ! mpicc -i-dynamic -o myprog myprog.c 2>&1 ; then
  echo "ERROR: mpicc failed to build myprog"
  exit 4
fi

# Distribute program to all allocated nodes exclude this one.
# A "for" over the word-split host list is used on purpose: a "while read"
# loop would have its input consumed by ssh.
for node in $(sort "$PBS_NODEFILE" | uniq) ; do
  test X"$node" = X"$this_host" && continue
  # Without the directory, scp would create a plain file named like the
  # directory and still report success, so check the mkdir explicitly.
  if ! ssh -2x "$node" "mkdir -p '$workdir'" ; then
    echo "ERROR: can not create $workdir on $node"
    exit 3
  fi
  # Trailing slash: the destination must be an existing directory.
  if ! scp -p2 myprog "${node}:${workdir}/" ; then
    echo "ERROR: can not scp myprog to ${node}:${workdir}"
    exit 3
  fi
  echo "Copy done to ${node}:${workdir}/myprog"
done

# And run it in all allocated nodes
run_status=0
echo "mpiexec: $(which mpiexec 2>&1)"
if test -x myprog ; then
  # is infiniband loaded on the node
  if /sbin/lsmod | grep -q '^ib_core ' ; then
    # with infiniband - default
    echo "ulimit -l 262144"
    ulimit -l 262144 2>&1
    echo "================= run program ===================="
    echo "mpiexec ./myprog"
    mpiexec ./myprog 2>&1 || run_status=$?
  else
    # no infiniband - disable openib
    echo "================= run program ===================="
    echo "mpiexec --mca btl ^openib ./myprog"
    mpiexec --mca btl '^openib' ./myprog 2>&1 || run_status=$?
  fi
else
  echo "myprog (executable) not found in $workdir"
  run_status=4
fi
echo "=================================================="

# That's all; report the outcome of the run as the job's exit status.
echo "done"
exit "$run_status"