#!/bin/sh
#
#
# (c) 2002 Sun Microsystems, Inc. Use is subject to license terms.  

# Positional arguments used in this part of the script:
#   $1 - job identifier; names the per-job directory ckpt.$1
#   $3 - checkpoint directory
ckpt_dir=$3

# Create the shared checkpoint log on first use. It is made mode 666 so
# that it stays writable no matter which user created it.
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
   touch "$ckpt_dir/ckpt.log"
   chmod 666 "$ckpt_dir/ckpt.log"
fi

# create temp directory for holding checkpoint info

tmpdir=$ckpt_dir/ckpt.$1
mkdir -p "$tmpdir"

# Everything that follows uses paths relative to this directory, so do not
# carry on from the wrong place if it cannot be entered. 100 is the same
# status this script reports when a job's status cannot be determined.
cd "$tmpdir" || {
   echo "$0: cannot change directory to $tmpdir" >&2
   exit 100
}

# create log file

F=$tmpdir/checkpoint.log
touch "$F"

# Log header. The command substitutions are intentionally left unquoted so
# the log line layout (single-space separated words) stays as before;
# "$*" is quoted so arguments are logged literally, without glob expansion.
echo -------------------------------------------------------------  >> "$F" 2>&1
echo `basename "$0"` called at `date`      >> "$F" 2>&1
echo called by: `id`			 >> "$F" 2>&1
echo "with args: $*"			 >> "$F" 2>&1

# restore the O.S. job identifier to the jobs directory
# (osjobid is read relative to the current directory)

job_dir=`dirname "$JOB_SCRIPT"`/../active_jobs/$1
job_id=`cat osjobid`
echo "$job_id" > "$job_dir/osjobid"

echo `date +"%D %T"` Job "$1" "(osjobid=$job_id)" restarting >> "$ckpt_dir/ckpt.log"

# Run the restart and capture its status on the very next line, so that no
# command inserted later can overwrite $? before it is saved.
/usr/bin/restart -w -f "chkpnt_$1"  >> "$F" 2>&1
exit_status=$?

# Now be careful: The restart command is the parent process of the restarted
# job, so its own exit status does not directly tell us how the job ended.
# When restart reports status 1, the job's status is looked up in the
# accounting data and, if non-zero, reported as 128 + status: per the
# original author's note, SGE treats an exit status > 128 analogous to a
# KILL (e.g. the job was killed due to a migration request).

echo Exit status of restart command: "$exit_status" >> "$F" 2>&1

if [ "$exit_status" = 1 ]
then
   jstat=`/bin/acctcom -j "$job_id" -b -p -Z -f -v /usr/adm/acct/day/pacct | "$SGE_ROOT/ckpt/cray_parse_job_status" "$2"`
   echo "jobstatus $job_id $2 = $jstat" >> "$F" 2>&1
   if [ "$jstat" = "" ]
   then
      # No status could be determined for the job.
      exit_status=100
   elif [ "$jstat" = "0" ]
   then
      exit_status=0
   else
      # $jstat is deliberately unquoted here so that surrounding whitespace
      # is dropped exactly as before.
      exit_status=`expr $jstat + 128`
      # expr prints nothing when $jstat is not a number. An empty status
      # would make the final exit report success, so treat it like the
      # "status unknown" case instead.
      if [ -z "$exit_status" ]
      then
         echo "cannot compute exit status from jobstatus '$jstat'" >> "$F" 2>&1
         exit_status=100
      fi
   fi
fi

# Record the final status in both logs and exit with it, so that the caller
# sees the status determined for the restarted job.
#
# Guard against an empty exit_status (e.g. a failed status computation):
# a bare "exit" would return the status of the preceding echo, i.e. 0, and
# so report success. Fall back to 100, the status this script uses when the
# job's status cannot be determined.
exit_status=${exit_status:-100}

echo `date +"%D %T"` Job "$1" "(osjobid=$job_id) exiting, status=$exit_status" >> "$ckpt_dir/ckpt.log"

echo Exiting with exit status: "$exit_status"      >> "$F" 2>&1
exit "$exit_status"
