#!/bin/ksh
#
#
# (c) 2002 Sun Microsystems, Inc. Use is subject to license terms.  

# Unset variables expand to the empty string instead of raising an error;
# optional environment variables such as OSJOBID and SGE_CELL rely on this.
set +u

# Third argument: directory holding the shared checkpoint log (ckpt.log)
# and the per-job checkpoint subdirectories.
ckpt_dir=$3

# Without a checkpoint directory every path below would resolve under "/",
# so stop here with a diagnostic instead.
if [ -z "$ckpt_dir" ]; then
   print -u2 -- "${0##*/}: missing checkpoint directory (argument 3)"
   exit 2
fi

# Create the shared log on first use; mode 666 makes it readable and
# writable by every user.
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
   touch "$ckpt_dir/ckpt.log"
   chmod 666 "$ckpt_dir/ckpt.log"
fi

# Locate the Grid Engine installation.  SGE is consulted first, as before;
# SGE_ROOT is used as a fallback when SGE is unset or empty.
# NOTE(review): SGE_ROOT is the variable Grid Engine normally exports --
# confirm whether SGE is ever set in this environment.
sge_root=${SGE:-$SGE_ROOT}
sge_cell=${SGE_CELL}

# workaround to force job to restart on same queue (svd)
# Source the cell's settings file only when it is readable: "." is a special
# built-in, so a missing file would abort this non-interactive script before
# the checkpoint is taken.
sge_settings=$sge_root/${sge_cell:-default}/common/settings.sh
if [ -r "$sge_settings" ]; then
   . "$sge_settings"
else
   print -u2 -- "${0##*/}: cannot read $sge_settings"
fi
qalter -q "$QUEUE" "$JOB_ID"

# create temp directory for holding checkpoint info

tmpdir=$ckpt_dir/ckpt.$1

# The currcpr state file and the cpr_* checkpoints are all addressed
# relative to the current directory, so give up if the per-job directory
# cannot be created or entered rather than checkpointing somewhere else.
mkdir -p "$tmpdir" && cd "$tmpdir" || {
   print -u2 -- "${0##*/}: cannot use checkpoint directory $tmpdir"
   exit 1
}

# create log file (in the invoking user's home directory)

#F=$tmpdir/checkpoint.log
F=~/$REQNAME.co$1
touch "$F"

# Append a banner to the per-job log file $F.
# "print --" ends option parsing so the dashed rule is always treated as
# text, and the expansions are quoted so that arguments containing glob
# characters or backslashes are logged verbatim (-r) instead of being
# expanded against the current directory.
{
   print -- -------------------------------------------------------------
   print -- "$(basename "$0") called at $(date)"
   print -- "called by: $(id)"
   print -r -- "with args: $*"
} >> "$F" 2>&1

# Ping-pong between two checkpoint slots (1 and 2), just in case we go down
# while checkpointing.  The file "currcpr" in the current directory names
# the slot recorded by the last successful run; select the other slot as
# the target and remember the recorded one as the previous slot.  Anything
# other than "2" (including a missing file) selects slot 2.

case "$(cat currcpr)" in
    2)
        currcpr=1
        prevcpr=2
        ;;
    *)
        currcpr=2
        prevcpr=1
        ;;
esac

# Build the argument for cpr's -p option: checkpoint by ASH when OSJOBID
# is set and non-empty, otherwise fall back to the process group ID
# passed as the second argument.

case "$OSJOBID" in
    "")
        popt="$2:GID"
        ;;
    *)
        popt="$OSJOBID:ASH"
        ;;
esac

# Run cpr into the slot selected earlier, logging the exact command line
# and all of cpr's output to $F.  cc keeps cpr's exit status for the
# summary line written at the end of the script.
ckpt_new=cpr_$1.$currcpr
ckpt_old=cpr_$1.$prevcpr

print -- "Checkpoint command: cpr -c $ckpt_new -p $popt -f -g" >> "$F" 2>&1
cpr -c "$ckpt_new" -p "$popt" -f -g >> "$F" 2>&1
cc=$?
if [ "$cc" -eq 0 ]; then
   # Record the new slot first, and remove the previous checkpoint only
   # after that record was written successfully; otherwise the old
   # checkpoint is kept so the state file never points at a deleted one.
   if print -- "$currcpr" > currcpr; then
      if [ -d "$ckpt_old" ]; then
         print Deleting old checkpoint file >> "$F" 2>&1
         cpr -D "$ckpt_old" >> "$F" 2>&1
      fi
   else
      print "Could not update currcpr; keeping old checkpoint" >> "$F" 2>&1
   fi
fi

# Append a one-line summary (timestamp, job, pid, cpr exit status) to the
# shared checkpoint log.
stamp=$(date +"%D %T")
print $stamp Job $1 "(pid=$2) checkpointed, status=$cc" >> $ckpt_dir/ckpt.log
