#! /bin/bash
# Script enabling multi-processor execution under PBS.
# E. J. Valeo 03/31/04
#
# It distributes nmx single-processor jobs among the nproc
# processors assigned by PBS.
# nproc jobs are started initially. Processors are reassigned to
# the next job on the list as they complete their assigned tasks.
# Customization occurs in the command_script() function.
#
# Required environment (normally provided by PBS):
#   PBS_NODEFILE   file listing one host name per assigned processor
#   PBS_O_WORKDIR  directory the job was submitted from
#   PBS_JOBID      job identifier; the part before ".transp" names the
#                  lock directory and the bookkeeping files

# submit to 1 node, 2 processors
#PBS -l nodes=1:ppn=2:gigabit
#PBS -N orbit3d

# ---------------------------------------------------------------------------
# Global state shared by the functions below.
# ---------------------------------------------------------------------------
declare -a proc=()             # proc[i]: host name of processor slot i
declare -a in_use=()           # in_use[i]: "yes" once slot i has been given a job
declare -a stream_number=()    # stream_number[i]: job currently assigned to slot i
declare -a finish_file_name=() # finish_file_name[j]: finish marker file of job j
host=                          # host selected by the last get_processor call
got_one=                       # "yes" once get_processor has issued a processor
job=                           # job number currently being dispatched
job_nr=                        # PBS job number (PBS_JOBID without ".transp...")
lock_dir=                      # directory holding the start.* / finish.* markers
CHECK_JOB_FILE=                # log file receiving one "job N done" line per job
nproc=0                        # number of processor slots read from PBS_NODEFILE

#######################################
# Strip the ".transp..." server suffix from a PBS job id.
# Arguments: $1 - PBS job id, e.g. 12345.transp01.example.org
# Outputs:   the job id up to (not including) the first ".transp"; the id
#            unchanged when it contains no ".transp"
#######################################
job_number_from_id() {
  # The dot is matched literally here; the former unquoted sed pattern let
  # the shell drop the backslash so "." matched any character.
  printf '%s\n' "${1%%.transp*}"
}

#######################################
# Load the list of assigned hosts from $PBS_NODEFILE.
# Globals:   PBS_NODEFILE (read); proc, in_use, nproc (written)
# Notes:     blank lines are skipped; arrays are reset on every call.
#######################################
load_proc_arrays() {
  local hn
  local count=0
  proc=()
  in_use=()
  # "|| [[ -n $hn ]]" keeps a last line that lacks a trailing newline.
  while read -r hn || [[ -n "$hn" ]]; do
    [[ -n "$hn" ]] || continue
    proc[$count]=$hn
    in_use[$count]=no
    count=$((count + 1))
  done < "$PBS_NODEFILE"
  nproc=${#proc[@]}
}

#######################################
# Bind a job to a processor slot.
# Globals:   proc, lock_dir (read); host, in_use, stream_number,
#            finish_file_name (written)
# Arguments: $1 - processor slot index
#            $2 - job number
#######################################
assign_host() {
  local ip=$1
  local job_=$2
  host=${proc[$ip]}
  in_use[$ip]=yes
  stream_number[$ip]=${job_}
  finish_file_name[${job_}]=${lock_dir}/finish.${host}.${job_}
}

#######################################
# Re-bind a processor slot whose previous job has finished.
# Arguments: $1 - processor slot index
#            $2 - job number
#######################################
re_assign_host() {
  assign_host "$1" "$2"
}

#######################################
# Get a processor for the next job in the list.
# Does not return until a processor has been issued: polls once per second
# for a never-used slot or for a slot whose finish marker file exists.
# Globals:   job, nproc, proc, in_use, stream_number, lock_dir (read);
#            host, got_one (written)
# Returns:   0 once a slot has been assigned; 1 if there are no slots at all
#######################################
get_processor() {
  local i
  local tstfile
  if ((nproc < 1)); then
    # Without this guard the loop below would sleep forever.
    echo "get_processor: no processors available" >&2
    return 1
  fi
  while true; do
    got_one=
    host=
    for ((i = 0; i < nproc; i++)); do
      if [[ "${in_use[$i]}" == yes ]]; then
        # This slot has had a job started on it; check whether it finished.
        tstfile=${lock_dir}/finish.${proc[$i]}.${stream_number[$i]}
        if [[ -f "$tstfile" ]]; then
          # Finished: reassign the slot to the next job.
          re_assign_host "$i" "$job"
          got_one=yes
          return 0
        fi
      else
        # This slot has not been assigned yet: assign it.
        assign_host "$i" "$job"
        got_one=yes
        return 0
      fi
    done
    # Wait for a host to become free.
    sleep 1
  done
}

#######################################
# Initialization: create (or clean) the lock directory and load the hosts.
# Globals:   lock_dir, PBS_NODEFILE (read); proc, in_use, nproc (written)
# Returns:   0 on success; 1 if lock_dir is empty, the directory cannot be
#            created, or no processors are listed
#######################################
setup() {
  if [[ -z "$lock_dir" ]]; then
    # An empty lock_dir would turn the cleanup below into "rm -f /*".
    echo "setup: lock_dir is empty; refusing to continue" >&2
    return 1
  fi
  if [[ ! -d "$lock_dir" ]]; then
    # Create the lock file directory if necessary.
    mkdir -p -- "$lock_dir" || return 1
  else
    # Clean a pre-existing directory.
    rm -f -- "${lock_dir}"/*
  fi
  load_proc_arrays # load the list of assigned hosts
  if ((nproc < 1)); then
    echo "setup: no processors listed in '${PBS_NODEFILE}'" >&2
    return 1
  fi
}

#######################################
# Example payload: sleeps a random 0-32 s, then echoes a message locally and
# on the assigned host via ssh. Not called by the main loop.
# Globals:   job, host (read)
#######################################
command_script_template() {
  local s
  local cmd
  s=$((RANDOM / 1000))
  sleep "$s"
  cmd="echo running job number ${job} on host ${host}"
  echo "$cmd"
  ssh -f "${host}" "($cmd)"
}

#######################################
# Payload for one job: write orbin.<job> from the _orbin template, then run
# ./orbit3d <job> on the assigned host via ssh.
# In _orbin lines whose second field is "!wmlt", the first field is scaled by
# a factor that runs linearly from -1 (job 0) to +1 (job nmx-1); all other
# lines are copied unchanged.
# Globals:   job, nmx, host, PBS_O_WORKDIR (read)
# Returns:   1 if orbin.<job> could not be generated, else the ssh status
#######################################
command_script() {
  local n=${job}
  local cmd
  local workdir
  echo "writing orbin.$n"
  # With a single job (nmx == 1) the offset term is exactly zero, so a scale
  # of 0 gives the same result while avoiding awk's division-by-zero abort.
  if ! awk -v var="$n" -v varmx="$nmx" '
    BEGIN { scale = (varmx > 1) ? 2 / (varmx - 1) : 0 }
    {
      if ($2 == "!wmlt") {
        printf "%14.4e !wmlt\n", $1 * scale * ((-varmx + 1) / 2 + var)
      } else {
        print
      }
    }' _orbin > "orbin.$n"; then
    echo "command_script: could not generate orbin.$n from _orbin" >&2
    return 1
  fi
  echo "executing orbit3d $n"
  # Quote the directory for the remote shell and stop there if cd fails, so
  # orbit3d is never started from the wrong directory.
  workdir=$(printf '%q' "$PBS_O_WORKDIR")
  cmd="hostname ; cd ${workdir} || exit 1; pwd; ./orbit3d $n"
  ssh "${host}" "($cmd)"
}

#######################################
# Template launcher (not called by the main loop): records a start marker,
# then runs command_script in the background and writes the finish marker.
# "command_script" should be replaced by your command.
# Globals:   lock_dir, host, job (read)
#######################################
launch_job_template() {
  local fn
  local startfn
  # Use the same finish.<host>.<job> name that get_processor polls for;
  # any other name would leave the processor marked busy forever.
  fn=${lock_dir}/finish.${host}.${job}
  startfn=${lock_dir}/start.${host}.${job}
  # The start file is used only to allow for chronology verification.
  date > "$startfn"
  (
    command_script "$host" "$job"
    date > "$fn"
  ) &
}

#######################################
# Launch the current job on the current host in the background.
# Globals:   lock_dir, host, job (read)
# Notes:     the finish file is written even when command_script fails, so
#            a failed job still releases its processor.
#######################################
launch_job() {
  local finishfn
  local startfn
  # Existence of the finish file is checked by get_processor to
  # reassign ${host}.
  finishfn=${lock_dir}/finish.${host}.${job}
  startfn=${lock_dir}/start.${host}.${job}
  # The start file is used only to allow for chronology verification.
  date > "$startfn"
  # "command_script" should be replaced by your command.
  (
    command_script
    # NOTE(review): purpose of this delay is not documented; presumably it
    # lets output settle before the slot is released - confirm.
    sleep 2
    date > "${finishfn}"
  ) &
}

#######################################
# Block until every job 0..nmx-1 has written its finish marker, appending
# "job N done" to $CHECK_JOB_FILE for each one, in job order.
# Globals:   nmx, finish_file_name, CHECK_JOB_FILE (read)
#######################################
wait_for_completion() {
  local cj=0
  local finish_file
  while ((cj < nmx)); do
    finish_file=${finish_file_name[$cj]}
    if [[ -z "$finish_file" ]]; then
      # No marker was ever registered, so there is nothing to wait for.
      echo "wait_for_completion: job $cj was never scheduled; skipping" >&2
      cj=$((cj + 1))
    elif [[ -f "$finish_file" ]]; then
      echo "job $cj done" >> "$CHECK_JOB_FILE"
      cj=$((cj + 1))
    else
      # Poll once per second instead of spinning on the filesystem.
      sleep 1
    fi
  done
}

#######################################
# Entry point: validate the PBS environment, report the assigned nodes,
# dispatch nmx jobs and wait for all of them to finish.
# Returns:   0 on success, 1 on a setup or environment error
#######################################
main() {
  local var
  local job_counter=0

  for var in PBS_NODEFILE PBS_O_WORKDIR PBS_JOBID; do
    if [[ -z "${!var}" ]]; then
      echo "error: ${var} is not set; this script must run under PBS" >&2
      return 1
    fi
  done

  echo "Show PBS nodes assigned"
  echo "--------------------------------------------"
  /bin/cat "$PBS_NODEFILE"
  echo "--------------------------------------------"
  echo "PBS_O_WORKDIR= $PBS_O_WORKDIR"

  job_nr=$(job_number_from_id "$PBS_JOBID")
  if ! cd "$PBS_O_WORKDIR"; then
    echo "error: cannot cd to '${PBS_O_WORKDIR}'" >&2
    return 1
  fi
  # Keep a copy of the node list next to the results (best effort).
  cp "$PBS_NODEFILE" "PI$job_nr"
  CHECK_JOB_FILE=check_job_file.$job_nr
  lock_dir=$job_nr

  setup || return 1 # initialization routine

  nmx=10 # set number of jobs in queue
  while ((job_counter < nmx)); do
    job=${job_counter}
    get_processor || return 1
    launch_job
    job_counter=$((job_counter + 1))
  done

  wait_for_completion
  # Reap the background launchers; they have all written their finish files.
  wait
}

main "$@"