Cluster Rundown
Recently, I have had problems accessing our computing cluster because it was overloaded with users and batch jobs. I discussed this with current and former students, and a few of them had ideas for how to get a rundown of cluster resources, so I wrote something up for anyone who wants to check it out.
(NB: this is probably configured for our specific cluster setup and is not that general. Also, I'm aware this can be done solely in bash and doesn't need to be done in R. Most of our users know R, not as many know `awk`, `sed`, etc., and I'm not as proficient in them. Also, some formats of the output are not so friendly to line-by-line readers.)
Here's the code:
library(reshape2)
library(plyr)
suppressMessages(library(zoo))
### Return the element(s) of `x` found at position(s) `slot`.
### Plain `[` indexing wrapped in a named function, so it can be handed to
### sapply() together with a `slot` argument. Positions past the end of `x`
### come back as NA, exactly as with `[`.
getslot = function(x, slot) {
  picked = x[slot]
  return(picked)
}
### Summarise the running jobs reported by `qstat -u "*"`.
###
### Arguments:
###   username  optional user name; when supplied, `user.tab` holds that
###             user's running-job counts by queue.
###   all.q     if TRUE, `user.tab` gets a column for every queue that
###             appears among the running jobs; if FALSE, only the queues
###             that `username` has jobs on.
###
### Value: a list with
###   user.q      data.frame(user, queue), one row per running qstat line
###   standard    the rows of user.q whose queue is "standard"
###   q.users     for each queue, a table of job counts by user (sorted)
###   user.table  job counts by user over all queues (sorted)
###   user.tab    user-by-queue table for `username`, or NULL
get.rundown = function(username = NULL, all.q = TRUE) {
  out = system("qstat -u \"*\"", intern = TRUE)
  ### drop the two header lines; negative indexing stays empty when qstat
  ### prints nothing, whereas 3:length(out) would count backwards
  out = out[-(1:2)]
  ### strip leading/trailing blanks before splitting: a line that starts
  ### with padding would otherwise get an empty first field and shift every
  ### slot by one (assumes qstat may pad short job IDs; a no-op otherwise)
  out = gsub("^ +| +$", "", out)
  out = gsub(" +", " ", out)
  ### keep only ones that are running
  fields = strsplit(out, " ")
  state = vapply(fields, getslot, character(1), slot = 5)
  ### %in% treats lines with no 5th field (NA) as "not running"
  fields = fields[state %in% "r"]
  ### just grab username and queue
  user.q = data.frame(
    user = vapply(fields, getslot, character(1), slot = 4),
    queue = vapply(fields, getslot, character(1), slot = 8),
    stringsAsFactors = FALSE
  )
  ### don't want the node - just the queue
  user.q$queue = sub("@.*$", "", user.q$queue)
  ### taking off the trailing .q (and only the trailing one)
  user.q$queue = sub("\\.q$", "", user.q$queue)
  ### table of number of jobs by each queue
  indiv.q = tapply(user.q$user, user.q$queue, function(x) {
    sort(table(x), decreasing = TRUE)
  })
  user.table = sort(table(user.q$user), decreasing = TRUE)
  standard = user.q[user.q$queue %in% "standard", ]
  user.tab = NULL
  if (!is.null(username)) {
    dat = user.q
    ### a factor keeps every queue as a level, so queues the user is not
    ### on still show up as zero-count columns
    if (all.q) {
      dat$queue = factor(dat$queue)
    }
    dat = dat[dat$user %in% username, ]
    ### user has nothing running: add a placeholder row so the table still
    ### has a row for them (the NA queue is not counted)
    if (nrow(dat) == 0) {
      dat[1, ] = c(username, NA)
    }
    user.tab = table(dat$user, dat$queue, useNA = "no")
  }
  return(list(user.q = user.q, standard = standard, q.users = indiv.q,
    user.table = user.table, user.tab = user.tab))
}
What does it do? It pulls the job list for every user, keeps only running jobs (including interactive QRLOGIN sessions), extracts the username and queue, and then creates a summary table for a specific user if desired. With `all.q = TRUE` that table shows every queue that currently has running jobs; with `all.q = FALSE` it shows only the queues that user is on.
The output below shows my breakdown by queue (I'm just on the interactive queue, running this script).
### Rundown for a single user. The argument is spelled out in full
### (`username`) instead of relying on partial matching of `user`.
out = get.rundown(username = "jmuschel")
print(out$user.tab)
#          cegs chaklab download gwas interactive jabba mathias ozone sas
# jmuschel    0       0        0    0           1     0       0     0   0
#          standard stanley
# jmuschel        0       0
In addition to the number of slots being used, sometimes you want to know which queues or nodes have a specific amount of memory free. So below I made an attempt at it.
#### get resources for different queue and nodes
### Collect memory, swap and virtual-memory figures per node from
### `qstat -u "*" -F`, converted to gigabytes.
###
### Value: a list with
###   out      long data.frame: node, queue, var (e.g. mem/swap/virtual),
###            value (in G) and tf (e.g. free/total/used)
###   varwide  one row per queue + node + var, one column per tf
###   tfwide   one row per queue + node + tf, one column per var
###   wide     one row per queue + node, one column per var_tf
###   byqueue  column sums of `wide` within each queue
###
### Stops if a value ends in anything other than 0, G, K, M or T.
get.resource = function() {
  out = system("qstat -u \"*\" -F", intern = TRUE)
  ### drop the header line; negative indexing is safe on empty output
  out = out[-1]
  ### the line following each dashed separator is the queue@node line
  seps = grep("-----------", out)
  qs = seps + 1
  ### a separator on the very last line has nothing after it
  qs = qs[qs <= length(out)]
  node = rep(NA_character_, length(out))
  node[qs] = out[qs]
  out = data.frame(info = out, node = node, stringsAsFactors = FALSE)
  ### carry each queue@node line down over the resource lines below it
  out$node = na.locf(out$node, na.rm = FALSE)
  ### the first space-separated token is the queue@node name
  out$node = vapply(strsplit(out$node, " "), getslot, character(1), slot = 1)
  ### remove the separators and the queue@node lines themselves; setdiff
  ### keeps all rows when there is nothing to remove (out[-integer(0), ]
  ### would drop every row)
  out = out[setdiff(seq_len(nrow(out)), c(seps, qs)), ]
  ### dot escaped so only a literal ".q@" splits queue from node
  out$queue = sub("(.*)\\.q@(.*)", "\\1", out$node)
  for (prefix in c("\thl:", "\tqf:", "\tqc:")) {
    out$info = gsub(prefix, "", out$info, fixed = TRUE)
  }
  keeprows = grepl("mem_|swap_|virtual_|^cpu", out$info)
  out = out[keeprows, ]
  ### split "name=value"
  out$var = sub("(.*)=(.*)", "\\1", out$info)
  out$value = sub("(.*)=(.*)", "\\2", out$info)
  ### cpu rows are not memory figures
  out = out[out$var != "cpu", ]
  ### split e.g. "mem_free" into var = "mem" and tf = "free"
  out$tf = sub("(.*)_(.*)", "\\2", out$var)
  out$var = sub("(.*)_(.*)", "\\1", out$var)
  ### last character of the value is the unit suffix (or a digit 0)
  out$gorm = sub("(.*)(.)$", "\\2", out$value)
  ### T is future hopes...
  stopifnot(all(out$gorm %in% c("0", "G", "K", "M", "T")))
  ### all output is in gigabytes; a value ending in 0 is left unscaled.
  ### T is stripped and scaled too, so terabyte values no longer become NA.
  to.gb = c("0" = 1, G = 1, M = 1/1024, K = 1/(1024 * 1024), T = 1024)
  out$value = as.numeric(sub("[GKMT]$", "", out$value))
  out$value = out$value * unname(to.gb[out$gorm])
  out$gorm = NULL
  out$info = NULL
  varwide = dcast(out, queue + node + var ~ tf, value.var = "value")
  tfwide = dcast(out, queue + node + tf ~ var, value.var = "value")
  wide = dcast(out, queue + node ~ var + tf, value.var = "value")
  agg = wide[, !colnames(wide) %in% "node"]
  agg = ddply(.data = agg, .(queue), function(x) {
    colSums(x[, !colnames(x) %in% "queue"], na.rm = TRUE)
  })
  return(list(out = out, varwide = varwide, tfwide = tfwide, wide = wide,
    byqueue = agg))
}
The output is the data by node (`out`), reshaped versions depending on the layout wanted (`varwide` uses the node and the type of memory as “ids”, so a node plus one of mem/swap/virtual defines a row; `tfwide` uses the node and free/used/total as the identifiers; `wide` has one row per node), and an aggregate summation of resources by queue (`byqueue`).
Here's some example output from our standard queue (all terms are in Gb).
### Fetch the full resource breakdown, then print only the per-queue
### totals for the standard queue.
x = get.resource()
agg = x[["byqueue"]]
agg[agg[["queue"]] == "standard", ]
#       queue mem_free mem_total mem_used swap_free swap_total swap_used
# 16 standard  5233.08  6564.179 1331.098  6526.928   6568.344  41.41569
#    virtual_free virtual_total virtual_used
# 16     11760.01      13132.52     1372.515
Overall, I hope this helps you see which nodes fit your needs for the most beneficial experience for all users.