Child pages
  • Slurm - job dependency and dynamic node claiming
Skip to end of metadata
Go to start of metadata

Job Dependency

#!/bin/bash
#
# multi.sh
#
# DESCRIPTION:
#   Entry point for submitting a chain of 15 slurm jobs.  Each job
#   depends on its predecessor finishing successfully, and each job
#   dynamically scales between 1 and 4 nodes (see --nodes=1-4 below),
#   taking whatever is available in that range.
#
# Authors:  Justin Talbot and Ralph Bean

set -euo pipefail

# Constant used throughout the script to name our jobs meaningfully.
readonly jobname="test-dependency-job"

# The slurm submission file that this script submits.
readonly jobfile="slurm-job-file.sh"

# Submit the first job; it depends on no other job.  sbatch prints a line
# like:
#                 Submitted batch job 4023
# so we grab the fourth word (the job id, 4023 here) with awk and store
# it in latest_id for the next job's --dependency.
outfile="output-iteration-0.out"
latest_id=$(sbatch --output "$outfile" --error "$outfile" \
                   --nodes=1-4 --job-name "$jobname-0" "$jobfile" \
            | awk '{ print $4 }')

# Now submit the remaining jobs so that each depends on its predecessor.
for i in {1..14}; do
    # A little output to see what's up as this executes.
    echo "Submitting my ${i}th job that depends on SLURM job $latest_id"

    # Name files dynamically for this iteration's stdout and stderr.
    outfile="output-iteration-$i.out"
    errorfile="error-iteration-$i.err"

    # Submit the next job and:
    #   1) Consume between 1 and 4 nodes, depending on what's available
    #   2) Do not start until the previous job finished SUCCESSFULLY
    #      (that is what --dependency=afterok means)
    #   3) Name it based on the sequence of jobs
    #   4) Use $outfile / $errorfile for standard output and error
    #   5) Use the job described in file $jobfile
    #
    # Capture the job id that sbatch actually reports, rather than
    # assuming ids are sequential: the old "latest_id + 1" guess breaks
    # as soon as anyone else submits a job between two of ours.
    latest_id=$(sbatch \
            --nodes=1-4 \
            --dependency="afterok:$latest_id" \
            --job-name "$jobname-$i" \
            --output="$outfile" \
            --error="$errorfile" \
            "$jobfile" \
        | awk '{ print $4 }')
done



Dynamic Node Claiming

#!/bin/bash -l
# NOTE the -l flag!  It makes this a login shell so the cluster
# environment (modules, paths) is loaded for the job.
#
# slurm-job-file.sh — the payload script submitted repeatedly by the
# submission script; all #SBATCH lines below are parsed by slurm.

# To send emails, set the address below and remove one of the "#" signs.
##SBATCH --mail-user=<user id>@rit.edu

# notify on state change: BEGIN, END, FAIL or ALL
#SBATCH --mail-type=ALL

# 5 days run time MAX, anything over will be KILLED
# Request 1 hour time limit day-hrs:min:sec
#SBATCH --time 1:0:0

#  Put the job in the partition associated with the account and request one core
#  Note that no number of nodes is specified here.  The submission
#  script passes --nodes on the sbatch command line instead.
#SBATCH --account <account>
#SBATCH --partition <partition>

# Job memory requirements in MB=m (default), GB=g, TB=t
#SBATCH --mem=6000

####################################################################
# Below here goes the code we actually want to run.  The 'payload' #
####################################################################

# Since we have 4 cpus on each node, tell our program to use
# '4 * the-number-of-nodes' cpus.  Arithmetic expansion with a default
# of 1 node replaces the old `echo $SLURM_NNODES*4 | bc`, which could
# glob-expand '*4' and produce garbage when SLURM_NNODES was unset.
cores=$(( ${SLURM_NNODES:-1} * 4 ))

echo "Here I need to put code to tell slurm job $SLURM_JOB_ID"
echo "  which has been allocated $SLURM_NNODES 'nodes', and"
echo "  which means it can make use $cores cores to actually"
echo "  execute my program specified to use only $cores cores."
echo ""

# Sleep for 10 seconds, then keep outputting stuff with 'echo'
sleep 10

# NOTE: the submitter's loop counter ($i) is not visible here, so the
# message names the output files by their pattern instead.
echo "If you check in any of the output-iteration-*.out files,"
echo "  you should see these messages."
echo ""


Job Submitting

#!/bin/bash
#
# submit-many-jobs.sh
#
# DESCRIPTION:
#   Entry point for submitting a chain of 15 slurm jobs.  Each job
#   depends on its predecessor finishing successfully, and each job
#   dynamically scales between 1 and 4 nodes (see --nodes=1-4 below),
#   taking whatever is available in that range.
#
# Authors:  Justin Talbot and Ralph Bean

set -euo pipefail

# Constant used throughout the script to name our jobs meaningfully.
readonly jobname="test-dependency-job"

# The slurm submission file that this script submits.
readonly jobfile="slurm-job-file.sh"

# Submit the first job; it depends on no other job.  sbatch prints a line
# like:
#                 Submitted batch job 4023
# so we grab the fourth word (the job id, 4023 here) with awk and store
# it in latest_id for the next job's --dependency.
outfile="output-iteration-0.out"
latest_id=$(sbatch --output "$outfile" --error "$outfile" \
                   --nodes=1-4 --job-name "$jobname-0" "$jobfile" \
            | awk '{ print $4 }')

# Now submit the remaining jobs so that each depends on its predecessor.
for i in {1..14}; do
    # A little output to see what's up as this executes.
    echo "Submitting my ${i}th job that depends on SLURM job $latest_id"

    # Name files dynamically for this iteration's stdout and stderr.
    outfile="output-iteration-$i.out"
    errorfile="error-iteration-$i.err"

    # Submit the next job and:
    #   1) Consume between 1 and 4 nodes, depending on what's available
    #   2) Do not start until the previous job finished SUCCESSFULLY
    #      (that is what --dependency=afterok means)
    #   3) Name it based on the sequence of jobs
    #   4) Use $outfile / $errorfile for standard output and error
    #   5) Use the job described in file $jobfile
    #
    # Capture the job id that sbatch actually reports, rather than
    # assuming ids are sequential: the old "latest_id + 1" guess breaks
    # as soon as anyone else submits a job between two of ours.
    latest_id=$(sbatch \
            --nodes=1-4 \
            --dependency="afterok:$latest_id" \
            --job-name "$jobname-$i" \
            --output="$outfile" \
            --error="$errorfile" \
            "$jobfile" \
        | awk '{ print $4 }')
done



  • No labels