#!/bin/sh
#
#
# (c) 2002 Sun Microsystems, Inc. Use is subject to license terms.  

# Run permissively: an unset parameter expands to nothing instead of being
# treated as an error (+u), and a command returning a bad status does not
# make the script exit (+e).
set +eu

# Third positional argument: the checkpoint directory. It holds the shared
# ckpt.log and the per-job ckpt.<jobid> subdirectory used further down.
ckpt_dir=$3

# Make sure the shared log exists; mode 666 makes it readable and writable
# by every user. The path is quoted so a directory name containing
# whitespace or glob characters is still treated as one path.
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
   touch "$ckpt_dir/ckpt.log"
   chmod 666 "$ckpt_dir/ckpt.log"
fi

# Derive the two job identifiers from JOB_ID and SGE_TASK_ID.
#   jobid  - used for the ckpt.<jobid> directory, the chkpnt_<jobid> restart
#            file and the log messages; carries a ".<task>" suffix only when
#            SGE_TASK_ID holds a real value
#   jobdir - name of the job's directory under active_jobs; always carries a
#            suffix (".1" when SGE_TASK_ID is empty or "undefined")
# A case statement is used instead of `[ ... -o ... ]`: the -o operator is
# obsolescent and parses ambiguously when an operand looks like an operator.
case "$SGE_TASK_ID" in
   undefined|"")
      jobid=$JOB_ID
      jobdir=$JOB_ID.1
      ;;
   *)
      jobid=$JOB_ID.$SGE_TASK_ID
      jobdir=$JOB_ID.$SGE_TASK_ID
      ;;
esac

#
# Create the per-job directory that holds the checkpoint info and make it
# the working directory.
#

tmpdir=$ckpt_dir/ckpt.$jobid
mkdir -p "$tmpdir"

# Everything below uses paths relative to this directory (job_pid, osjobid,
# chkpnt_*), so carrying on from some other directory would read or move the
# wrong files. Stop here if the directory cannot be entered.
cd "$tmpdir" || {
   echo "$0: cannot cd to $tmpdir" >&2
   exit 1
}

#
# Create the per-job log file and send all further output (stdout and
# stderr) to it.
#

F=$tmpdir/checkpoint.log
touch "$F"
exec >> "$F" 2>&1

# Log header: which script was called, when, by whom and with which
# arguments. The command outputs are captured in variables first so they can
# be quoted without nesting quotes inside backticks.
self=`basename "$0"`
now=`date`
caller_id=`id`
echo -------------------------------------------------------------
echo "$self called at $now"
echo "called by: $caller_id"
# "$*" logs the arguments verbatim; left unquoted, an argument such as "*"
# would be expanded to the file names in the current directory.
echo "with args: $*"

#
# Get original job_pid and osjobid from the files of the same name in the
# current (checkpoint) directory.
#

job_pid=`cat job_pid`
osjobid=`cat osjobid`

# For diagnosis only: log the saved values, then list and print the job_pid
# and osjobid files currently in the job's active_jobs directory. Nothing is
# written back; the two restoring lines at the end are commented out.
# NOTE: do we need to restore osjobid?
# NOTE: do we need to restore job_pid?
job_dir=`dirname "$JOB_SCRIPT"`/../active_jobs/$jobdir
echo "original job_pid=$job_pid"
echo "original osjobid=$osjobid"
ls -la "$job_dir/job_pid"
cat "$job_dir/job_pid"
ls -la "$job_dir/osjobid"
cat "$job_dir/osjobid"
#echo $job_pid > $job_dir/job_pid
#echo $osjobid > $job_dir/osjobid

# Record the restart in the shared checkpoint log.
now=`date +"%D %T"`
echo "$now Job $jobid (job_pid=$job_pid, osjobid=$osjobid) restarting" >> "$ckpt_dir/ckpt.log"

#
# If a previous restart file (chkpnt_<jobid>.save) exists, we assume that we
# died during a checkpoint and recover by restarting from that file.
#

saved_file=chkpnt_$jobid.save
if [ -f "$saved_file" ]; then
   mv "$saved_file" "chkpnt_$jobid"
fi

#
# Register the restart file, just in case it is not registered. This could
# happen if the host died during the last checkpoint.
#

# Log the command line first, then run it.
echo /usr/bin/rsvresf "chkpnt_$jobid"
/usr/bin/rsvresf "chkpnt_$jobid"

#
# Now restart the job from the restart file and wait for it to complete.
#

# Log the command line first, then run it.
echo /usr/bin/restart -f -i -w "chkpnt_$jobid"
/usr/bin/restart -f -i -w "chkpnt_$jobid"
# Capture $? immediately; the echo below would overwrite it.
exit_status=$?
echo "Exit status of restart command: $exit_status"

# Now be careful: the restart command is the parent process of the restarted
# job, and Grid Engine is the parent process of the restart command.
# If the job was killed (probably due to a migration request), we need to
# tell our parent by killing ourselves. Grid Engine also treats an exit
# status > 128 as analogous to a KILL.

#if [ $exit_status = 1 ]
#then
#   jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
#   echo "jobstatus $job_id $2 = $jstat"
#   if [ "$jstat" = "" ]
#   then
#      exit_status=100
#   elif [ "$jstat" = "0" ]
#   then
#      exit_status=0
#   else
#      exit_status=`expr $jstat + 128`
#   fi
#fi

# No self-kill or status translation is active (the block above is commented
# out), so log the exit status of our child, the restart command, and exit
# with it unchanged.

now=`date +"%D %T"`
echo "$now Job $jobid (job_pid=$job_pid, osjobid=$osjobid) exiting, status=$exit_status" >> "$ckpt_dir/ckpt.log"

echo "Exiting with exit status: $exit_status"
# Left unquoted on purpose: the value is the numeric $? captured right after
# the restart command.
exit $exit_status

