
Job Dependency
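The wrapper script below (multi.sh) submits a chain of jobs in which each job waits for its predecessor to finish successfully.  The underlying mechanism is sbatch's --dependency flag; for example, the following (with a made-up job ID) would hold a job until job 4023 completes with an exit code of zero:

sbatch --dependency=afterok:4023 slurm-job-file.sh

Other dependency types exist as well, e.g. afterany (start after the job ends, regardless of status) and afternotok (start only if it failed).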

#!/bin/bash

#multi.sh

# DESCRIPTION:
#   This is the entry point for submitting a series of jobs.
#
#   It will submit 15 jobs, each of which will depend on its predecessor
#   and each of which will dynamically scale to consume as many nodes
#   (up to 4) as it can, starting once at least 1 node is available.
#
# Authors:  Justin Talbot and Ralph Bean
#

# A constant variable used throughout the script to name our jobs
#   in a meaningful way.
jobname="test-dependency-job"

# Another constant variable naming the slurm submission file that
#   this script is going to submit to slurm.
jobfile="slurm-job-file.sh"

# Store the command to execute in the variable $command for easy reading.
#   More information on this below, inside the 'for' loop.
outfile=output-iteration-0.out
command="sbatch --output $outfile --error $outfile --nodes=1-4 --job-name $jobname-0 $jobfile"


# Use the command above to submit our first job.  It doesn't depend on any
#   other jobs.  Its output to the terminal will be something like:
#                 Submitted batch job 4023
#   We grab the job number 4023 (the fourth word in the output, hence
#   the $4) and store it in the variable $latest_id.
latest_id=$($command | awk '{ print $4 }')
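
# (Aside: recent versions of sbatch also support a --parsable flag that
#   prints just the job ID, which would let us skip the awk step, e.g.:
#       latest_id=$(sbatch --parsable --output $outfile ... $jobfile)
#   where '...' stands for the same options used above.)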


# Now submit the remaining jobs so that they depend on their predecessors.
# Loop from 1 to 14, putting the value in the variable $i.
for i in $(seq 1 14) ; do

    # Do a little output to see what's up as this executes.
    echo "Submitting job number $i, which depends on SLURM job $latest_id"

    # Dynamically name the files where we want all of our 'messages' to go.
    outfile=output-iteration-$i.out
    errorfile=error-iteration-$i.err

    # Submit the next job and..
    #   1) Consume at least 1 and up to 4 nodes, depending on what's available
    #   2) Do not start the job until the last one finished SUCCESSFULLY
    #   3) Name it based on the sequence of jobs
    #   4) Use $outfile for standard output and $errorfile for standard error
    #   5) Use the job described in the file $jobfile
    # The backslashes '\' only allow us to break the command across
    #   multiple lines to make it easier for humans to read.
    # We capture the new job's ID from sbatch's output, just as we did for
    #   the first job; job IDs are not guaranteed to be sequential (other
    #   users submit jobs too), so simply adding 1 would be unreliable.
    latest_id=$(sbatch \
            --nodes=1-4 \
            --dependency=afterok:$latest_id \
            --job-name $jobname-$i \
            --output=$outfile \
            --error=$errorfile \
            $jobfile | awk '{ print $4 }')

done
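
To kick off the whole chain, run the wrapper script and then watch the queue.  A minimal sketch (the squeue format string is just one reasonable choice; %E shows the dependency each job is still waiting on):

bash multi.sh
squeue --user=$USER --format="%.10i %.24j %.10T %E"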



Dynamic Node Claiming
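
"Dynamic node claiming" means handing sbatch a node range rather than a fixed count: with --nodes=<min>-<max>, the job waits until at least <min> nodes are available and claims up to <max>.  For example:

sbatch --nodes=1-4 slurm-job-file.sh

The job script below then reads the size of the allocation it actually received from the $SLURM_NNODES environment variable.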

#!/bin/bash -l
# NOTE the -l flag!

#slurm-job-file.sh

# To send emails, set the address below and remove one of the "#" signs.
##SBATCH --mail-user=slpits@rit.edu

# notify on state change: BEGIN, END, FAIL or ALL
#SBATCH --mail-type=ALL

# Maximum run time is 5 days; anything over will be KILLED.
# Here we request a 1 hour time limit (format: days-hrs:min:sec).
#SBATCH --time 1:0:0

#  Put the job in the partition associated with the account.
#  Note that no number of nodes is specified here.  We do that in the
#  other script.
#SBATCH --account <account>
#SBATCH --partition <partition>

# Job memory requirement, in megabytes by default; append K, M, G, or T
#   for other units.
#SBATCH --mem=6000

####################################################################
# Below here goes the code we actually want to run.  The 'payload' #
####################################################################

# Since we have 4 cpus on each node, we want to tell
#   our program to use '4 * the-number-of-nodes' cpus.
cores=$((SLURM_NNODES * 4))

echo "Here I need to put the code to run as slurm job $SLURM_JOB_ID,"
echo "  which has been allocated $SLURM_NNODES nodes, and"
echo "  which can therefore make use of $cores cores to"
echo "  actually execute my program."
echo ""

# Sleep for 10 seconds, then keep outputting stuff with 'echo'
sleep 10

echo "If you check in any of the output-iteration-$i.out files,"
echo "  you should see these messages."
echo ""


