#!/usr/bin/perl -w
#
# CMU Dynamic User Groups
#
# This script monitors LSF, PBS, and/or SLURM job queues and
# dynamically creates CMU User Groups for running jobs. This
# makes it easy for CMU system administrators to monitor the
# resources of individual user jobs on the cluster.
#
# This script is designed to be run on the CMU Management
# Node and expects to be able to query the resource manager
# from that node. It is designed to be launched as a
# standalone daemon.
#
# **********************************************************
# NOTE: You must configure the following variables correctly
# as needed for proper operation.
#***********************************************************
#
# The following variables must be full paths! Each path is
# tested for existence before it is executed.
#
# MAKE SURE THAT THESE PATHS ARE CORRECT! Or set them to an
# invalid path to disable querying the specific resource
# manager.
#
# 'bjobs' is the Platform LSF job status command. Give its
# full path; LSF is only queried when this path names an
# existing regular file.
my $LSFJOBS = "/opt/lsf/8.0/linux2.6-glibc2.3-x86_64/bin/bjobs";
# The following string is placed ahead of the LSFJOBS command
# to set up the LSF environment. LSF commands expect these
# environment variables to be properly set when they are
# invoked.
#
my $LSFENV = "/bin/env LSF_ENVDIR=/opt/lsf/conf LSF_SERVERDIR=/opt/lsf";
# SLEEP_INTERVAL is the number of seconds to sleep between
# checking the job queues and updating the CMU User Groups.
my $SLEEP_INTERVAL = 5;
###########################################
# SHOULD BE NO NEED TO EDIT BELOW THIS LINE
###########################################
# CMU commands (full paths to the CMU command-line tools that
# this script runs)
my $CMU_SHOW_GROUPS = "/opt/cmu/bin/cmu_show_user_groups";
my $CMU_ADD_GROUP = "/opt/cmu/bin/cmu_add_user_group";
my $CMU_ADD_NODES = "/opt/cmu/bin/cmu_add_to_user_group";
my $CMU_DEL_GROUP = "/opt/cmu/bin/cmu_del_user_group";
my $CMU_SHOW_NODES = "/opt/cmu/bin/cmu_show_nodes";
# function declarations
# Forward declarations (with empty prototypes) so that the
# subs can be called before their definitions appear.
# NOTE(review): only addLSFjobsToHash is defined and called in
# the part of the file reviewed here; confirm whether the
# SLURM and PBS variants exist elsewhere.
sub addSLURMjobsToHash();
sub addLSFjobsToHash();
sub addPBSjobsToHash();
# global variables
# %jobhash maps a CMU user group name to the whitespace-
# separated list of nodes that belong in that group. It is
# emptied and refilled on every pass of the main loop.
my %jobhash;
# @cmu_nodes holds the node names printed by cmu_show_nodes;
# it is refreshed by get_cmu_nodes().
my @cmu_nodes;
# Refresh @cmu_nodes with the whitespace-separated node names
# printed by the cmu_show_nodes command.
#
# Takes no arguments. If the command cannot be run or prints
# nothing, @cmu_nodes ends up empty.
sub get_cmu_nodes() {
    my $output = `$CMU_SHOW_NODES`;
    # Backticks yield undef when the command could not be
    # started; treat that like empty output instead of handing
    # undef to split (which warns under -w).
    $output = "" unless defined $output;
    # split ' ' skips leading whitespace, so output that starts
    # with a blank or newline does not produce an empty node
    # name (which split /\s+/ would).
    @cmu_nodes = split ' ', $output;
}
# Tell whether a host name is one of the nodes currently held
# in @cmu_nodes.
#
# Argument: the host name to look up (compared as an exact
#           string).
# Returns:  1 when the name is present, 0 otherwise.
sub is_cmu_node($) {
    my $candidate = shift;
    # In scalar context grep gives the number of matching entries.
    my $matches = grep { $_ eq $candidate } @cmu_nodes;
    return $matches ? 1 : 0;
}
# Run an external program with the given argument list and
# throw away whatever it prints on stdout (stderr is left
# alone, as with backticks).
#
# The list form of a piped open is used so that no shell ever
# parses the arguments: group names contain text taken from
# user-chosen LSF job names and must not be interpreted as
# shell syntax.
#
# Arguments: program path, followed by its arguments.
# Returns:   nothing; a failure to start the program is
#            reported with warn() and otherwise ignored.
sub run_cmu_command {
    my ($program, @args) = @_;
    my $pid = open(my $output, '-|', $program, @args);
    unless ($pid) {
        warn "cannot run $program: $!\n";
        return;
    }
    my @discarded = <$output>;   # drain the pipe so the child can finish
    close $output;
    return;
}

#
# Main daemon loop: every $SLEEP_INTERVAL seconds, make the set
# of CMU user groups match the set of running jobs.
#
while (1) {
    #
    # Get existing CMU Nodes and User Groups
    #
    get_cmu_nodes();
    my $curgroups = `$CMU_SHOW_GROUPS`;
    $curgroups = "" unless defined $curgroups;   # command could not be run
    my @cmugroups = split /\n/, $curgroups;

    #
    # Reload the jobhash
    #
    %jobhash = ();
    addLSFjobsToHash() if ( -f "$LSFJOBS");

    #
    # Delete every existing user group that does not correspond
    # to a current job.
    #
    # NOTE: the check that used to limit deletion to groups
    # starting with 'SLURM_', 'LSF_' or 'PBS_' is disabled, so
    # ANY user group whose name is not a key of %jobhash is
    # removed, whoever created it.
    #
    my %have_group;
    foreach my $g (@cmugroups) {
        if (exists $jobhash{$g}) {
            $have_group{$g} = 1;
            next;
        }
        # Split on whitespace the way the shell used to, but
        # without letting a shell interpret metacharacters.
        # Blank lines yield no words and are skipped.
        my @words = split ' ', $g;
        next unless @words;
        run_cmu_command($CMU_DEL_GROUP, @words);
    }

    #
    # add new user groups
    #
    foreach my $j (keys %jobhash) {
        # FIXME confirm/update the node list of existing groups
        next if $have_group{$j};

        # create the CMU user group and add the nodelist to it
        run_cmu_command($CMU_ADD_GROUP, $j);
        my @nodes = split ' ', $jobhash{$j};
        run_cmu_command($CMU_ADD_NODES, '-t', $j, @nodes) if @nodes;
    }

    sleep $SLEEP_INTERVAL;
}
# Query LSF for running jobs and record one entry per job in
# %jobhash, plus one pseudo-entry listing the idle compute
# nodes.
#
# Takes no arguments and reads the output of
#   $LSFENV $LSFJOBS -r -u all -w
#
# Entries written to %jobhash:
#   {user}{cpus}p{nodes}n_{jobid}[_{array id}]
#       => space-separated CMU nodes of that job
#   zzz{cpus}p{nodes}n
#       => space-separated compute nodes that no running job
#          uses
#
# Only hosts accepted by is_cmu_node() are put in a job's node
# list. The idle-node entry is built from the site-specific
# node table below and is not filtered through is_cmu_node().
sub addLSFjobsToHash() {
    # Site-specific description of the compute partition.
    my $CPUS_PER_NODE = 12;
    my @compute_nodes = map { sprintf 'cn%02d', $_ } 1 .. 64;

    my $curjobs = `$LSFENV $LSFJOBS -r -u all -w`;
    $curjobs = "" unless defined $curjobs;   # command could not be run

    my $jname = "";   # group name of the job being collected
    my @job_nodes;    # its CMU nodes, in first-seen order
    my %in_job;       # membership test for @job_nodes
    my %busy;         # every host named in a RUN line

    foreach my $line (split /\n/, $curjobs) {
        my @elements = split /\s+/, $line;

        if (@elements == 1 && $jname ne "") {
            # This must be a node from the current job.
            # If LSB_SHORT_HOSTLIST is enabled, then remove the
            # prepended cpu count.
            # NOTE(review): split /\s+/ yields a leading empty
            # field for an indented line, so an indented
            # continuation line has two fields and never
            # reaches this branch -- confirm the intended
            # input format.
            my $host = (split /\*/, $elements[0])[-1];
            if (defined $host && !$in_job{$host} && is_cmu_node($host)) {
                push @job_nodes, $host;
                $in_job{$host} = 1;
            }
            next;
        }

        # Fields read below: 0 = job id, 1 = user, 2 = state,
        # 5 = execution host list, 6 = job name (presumably the
        # column layout of 'bjobs -w').
        next unless @elements > 7 && $elements[2] eq "RUN";

        # We've found a new job, so add the previous one to
        # the hash before starting over.
        $jobhash{$jname} = join(' ', @job_nodes) if $jname;
        @job_nodes = ();
        %in_job    = ();

        my $cpus = 0;
        my %job_hosts;   # distinct hosts of this job
        foreach my $entry (split /:/, $elements[5]) {
            # An entry is either 'host' or '{cpus}*host'.
            my @items = split /\*/, $entry;
            next unless @items;
            my $host = $items[-1];
            $cpus += (@items == 1) ? 1 : $items[0];

            $job_hosts{$host} = 1;
            # Remember the host by exact name. A regex
            # substitution on the node list would also hit
            # names that merely contain this one, and would
            # treat '.' and similar characters as metacharacters.
            $busy{$host} = 1;

            if (!$in_job{$host} && is_cmu_node($host)) {
                push @job_nodes, $host;
                $in_job{$host} = 1;
            }
        }
        # Count distinct hosts: a host can appear once per slot
        # in the host list when it is not shortened.
        my $nodecnt = scalar keys %job_hosts;

        # name = {user}{cpus}p{nodes}n_{jobid}[_{array_id}]
        $jname = $elements[1] . $cpus . 'p' . $nodecnt . 'n_' . $elements[0];

        # check for an array ID, i.e. 'foo[array ID]'
        if ($elements[6] =~ /\[([^\[\]]*)\]\z/) {
            my $array_id = $1;
            # The job name is chosen by the submitting user and
            # the group name is later handed to external
            # commands, so keep only harmless characters.
            $array_id =~ tr/A-Za-z0-9_.-//cd;
            $jname .= '_' . $array_id if length $array_id;
        }
    }

    # store the last job seen
    $jobhash{$jname} = join(' ', @job_nodes) if $jname;

    # Pseudo-group for the idle compute nodes. The count is
    # derived from the list itself so that shared nodes and
    # hosts outside the compute partition cannot skew it.
    my @free       = grep { !$busy{$_} } @compute_nodes;
    my $nodes_free = scalar @free;
    $jobhash{ 'zzz' . ($CPUS_PER_NODE * $nodes_free) . 'p' . $nodes_free . 'n' }
        = join(' ', @free);
}