[Swift-commit] cog r3432
swift at ci.uchicago.edu
swift at ci.uchicago.edu
Sun Jul 29 17:30:05 CDT 2012
------------------------------------------------------------------------
r3432 | davidkelly999 | 2012-07-29 17:28:10 -0500 (Sun, 29 Jul 2012) | 2 lines
Initial commit of slurm scheduler
------------------------------------------------------------------------
Index: modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/SlurmExecutor.java
===================================================================
--- modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/SlurmExecutor.java (revision 0)
+++ modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/SlurmExecutor.java (revision 3432)
@@ -0,0 +1,374 @@
+package org.globus.cog.abstraction.impl.scheduler.slurm;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Iterator;
+
+import org.apache.log4j.Logger;
+import org.globus.cog.abstraction.impl.common.execution.WallTime;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractExecutor;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractProperties;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractQueuePoller;
+import org.globus.cog.abstraction.impl.scheduler.common.Job;
+import org.globus.cog.abstraction.impl.scheduler.common.ProcessListener;
+import org.globus.cog.abstraction.interfaces.FileLocation;
+import org.globus.cog.abstraction.interfaces.JobSpecification;
+import org.globus.cog.abstraction.interfaces.Task;
+
+public class SlurmExecutor extends AbstractExecutor {
+ public static final Logger logger = Logger.getLogger(SlurmExecutor.class);
+
+ /**
+ Number of program invocations
+ */
+ int count = 1;
+
+ /**
+ PBS processes-per-node
+ */
+ int ppn = 1;
+
+ /**
+ PBS mppdepth: number of available threads per node
+ */
+ int depth = 1;
+
+ /**
+ Unique number for automatic task names
+ */
+ private static int unique = 0;
+
+ /**
+ Creates an executor for the given task.
+ Both arguments are handed to the superclass constructor unchanged.
+ @param task the task this executor works on
+ @param listener the listener passed on to the superclass
+ */
+ public SlurmExecutor(Task task, ProcessListener listener) {
+ super(task, listener);
+ }
+
+ private static NumberFormat IDF = new DecimalFormat("000000");
+
+    /**
+     * Makes sure the task carries a usable job name.
+     * <p>
+     * A task without a name receives a generated one of the form
+     * {@code cog-NNNNNN}. A name longer than 15 characters is truncated,
+     * because the job name is limited to 15 characters:
+     * http://doesciencegrid.org/public/pbs/qsub.html
+     *
+     * @param task the task whose name is checked and, if necessary, replaced
+     */
+    protected void validate(Task task) {
+        String name = task.getName();
+        if (name == null) {
+            // The counter and the shared DecimalFormat (which is not
+            // synchronized) are both used under the class lock.
+            synchronized (SlurmExecutor.class) {
+                name = "cog-" + IDF.format(unique++);
+            }
+            if (logger.isDebugEnabled()) {
+                logger.debug("Slurm name: for: " + task.getIdentity() +
+                             " is: " + name);
+            }
+            // Store the generated name on the task; writeScript() reads it
+            // back through task.getName() for the "-N" directive.
+            task.setName(name);
+        }
+        else if (name.length() > 15) {
+            task.setName(name.substring(0, 15));
+        }
+    }
+
+    /**
+     * Emits a "#PBS" directive line for a job specification attribute,
+     * provided the attribute is set (non-null).
+     *
+     * @param attrName name of the attribute looked up in the job specification
+     * @param arg option text written immediately before the attribute value
+     * @param wr destination of the generated script
+     * @throws IOException if writing to {@code wr} fails
+     */
+    protected void writeAttr(String attrName, String arg, Writer wr)
+            throws IOException {
+        Object attr = getSpec().getAttribute(attrName);
+        if (attr == null) {
+            return;
+        }
+        StringBuilder directive = new StringBuilder("#PBS ");
+        directive.append(arg).append(attr).append('\n');
+        wr.write(directive.toString());
+    }
+
+    /**
+     * Emits a "#PBS" directive line for a job specification attribute,
+     * provided the attribute is set and its string form is not empty.
+     *
+     * @param attrName name of the attribute looked up in the job specification
+     * @param arg option text written immediately before the attribute value
+     * @param wr destination of the generated script
+     * @throws IOException if writing to {@code wr} fails
+     */
+    protected void writeNonEmptyAttr(String attrName, String arg,
+                                     Writer wr)
+            throws IOException {
+        Object attr = getSpec().getAttribute(attrName);
+        if (attr == null) {
+            return;
+        }
+        String text = String.valueOf(attr);
+        if (text.length() == 0) {
+            return;
+        }
+        wr.write("#PBS " + arg + text + '\n');
+    }
+
+    /**
+     * Emits the "-l walltime=" directive when the job specification has a
+     * "maxwalltime" attribute. The value is normalized to the "pbs-native"
+     * format before it is written.
+     *
+     * @param wr destination of the generated script
+     * @throws IOException if writing to {@code wr} fails
+     */
+    protected void writeWallTime(Writer wr) throws IOException {
+        Object maxWallTime = getSpec().getAttribute("maxwalltime");
+        if (maxWallTime == null) {
+            return;
+        }
+        wr.write("#PBS -l walltime="
+            + WallTime.normalize(maxWallTime.toString(), "pbs-native")
+            + '\n');
+    }
+
+    /**
+     * Converts an attribute value to an int.
+     * Leading and trailing whitespace in the value's string form is ignored.
+     *
+     * @param obj the attribute value; its {@code toString()} form is parsed
+     * @param name attribute name, used in the error message only
+     * @return the parsed integer
+     * @throws IllegalArgumentException if {@code obj} is null or is not an
+     *         integer; a parse failure is attached as the cause
+     */
+    private int parseAndValidateInt(Object obj, String name) {
+        // Explicit check instead of an assert, which is a no-op unless the
+        // JVM runs with assertions enabled.
+        if (obj == null) {
+            throw new IllegalArgumentException("Illegal value for " + name + ". Must be an integer.");
+        }
+        try {
+            return Integer.parseInt(obj.toString().trim());
+        }
+        catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Illegal value for " + name + ". Must be an integer.", e);
+        }
+    }
+
+    /**
+     * Reads the job size settings from the job specification, stores them
+     * in the {@code count}, {@code ppn} and {@code depth} fields and writes
+     * the corresponding "#PBS -l" line.
+     * <p>
+     * Attributes consulted:
+     * <ul>
+     * <li>count: number of processes; the field default of 1 is kept when unset</li>
+     * <li>ppn: processes per node; the field default of 1 is kept when unset</li>
+     * <li>depth: threads per node; the field default of 1 is kept when unset</li>
+     * <li>pbs.mpp: when set, mppwidth/mppnppn/mppdepth are written instead
+     *     of nodes/ppn</li>
+     * <li>slurm.properties: extra text appended to the line after a ':'</li>
+     * </ul>
+     * Note that the semantics differ for the pbs.mpp setting: mppwidth is
+     * the total number of cores while nodes is the number of nodes.
+     * <p>
+     * http://www.clusterresources.com/torquedocs/2.1jobsubmission.shtml
+     *
+     * @param spec the job specification to read the attributes from
+     * @param wr destination of the generated script
+     * @return true if count is different from 1
+     * @throws IOException if writing to {@code wr} fails
+     */
+    protected boolean writeCountAndPPN(JobSpecification spec,
+                                       Writer wr)
+            throws IOException {
+        // Number of program invocations
+        Object countAttr = getSpec().getAttribute("count");
+        if (countAttr != null) {
+            count = parseAndValidateInt(countAttr, "count");
+        }
+
+        Object ppnAttr = spec.getAttribute("ppn");
+        if (ppnAttr != null) {
+            ppn = parseAndValidateInt(ppnAttr, "ppn");
+        }
+
+        Object depthAttr = spec.getAttribute("depth");
+        if (depthAttr != null) {
+            depth = parseAndValidateInt(depthAttr, "depth");
+        }
+
+        String extraProperties =
+            (String) getSpec().getAttribute("slurm.properties");
+
+        StringBuilder line = new StringBuilder(512);
+        line.append("#PBS -l ");
+        if (spec.getAttribute("pbs.mpp") != null) {
+            line.append("mppwidth=").append(count)
+                .append(",mppnppn=").append(ppn)
+                .append(",mppdepth=").append(depth);
+        }
+        else {
+            line.append("nodes=").append(count)
+                .append(":ppn=").append(ppn);
+        }
+
+        if (extraProperties != null && extraProperties.length() > 0) {
+            line.append(':').append(extraProperties);
+        }
+        line.append('\n');
+
+        wr.write(line.toString());
+
+        return count != 1;
+    }
+
+ /*
+ private boolean parseAndValidateBool(Object obj, String name)
+ {
+ try {
+ return Boolean.parseBoolean(obj.toString());
+ }
+ catch (NumberFormatException e) {
+ throw new IllegalArgumentException
+ ("Illegal value for " + name + ". Must be true/false.");
+ }
+ }
+ */
+
+ /**
+ Writes the submit script for this executor's task to the given writer.
+ The script consists of the CoG header, the "#PBS" directives, the
+ exported environment variables, an optional wrapper command, an
+ optional "cd", the executable with its arguments, an optional stdin
+ redirection and the exit code handling. The writer is closed once the
+ script has been written completely.
+ @param wr destination of the script
+ @param exitcodefile file the generated script writes the exit code to
+ @param stdout path written to the "-o" directive
+ @param stderr path written to the "-e" directive
+ @throws IOException if writing to wr fails
+ */
+ @Override
+ protected void writeScript(Writer wr, String exitcodefile, String stdout,
+ String stderr)
+ throws IOException {
+ Task task = getTask();
+ JobSpecification spec = getSpec();
+ Properties properties = Properties.getProperties();
+
+ getSpec().unpackProviderAttributes();
+
+ validate(task);
+ writeHeader(wr);
+
+ wr.write("#PBS -S /bin/bash\n");
+ wr.write("#PBS -N " + task.getName() + '\n');
+ wr.write("#PBS -m n\n");
+ writeNonEmptyAttr("project", "-A ", wr);
+ // writeCountAndPPN reports true when count differs from 1
+ boolean multiple = writeCountAndPPN(spec, wr);
+ writeWallTime(wr);
+ writeNonEmptyAttr("queue", "-q ", wr);
+ wr.write("#PBS -o " + quote(stdout) + '\n');
+ wr.write("#PBS -e " + quote(stderr) + '\n');
+
+ for (String name : spec.getEnvironmentVariableNames()) {
+ // "export" is necessary on the Cray XT5 Crow
+ wr.write("export ");
+ wr.write(name);
+ wr.write('=');
+ wr.write(quote(spec.getEnvironmentVariable(name)));
+ wr.write('\n');
+ }
+
+ // NOTE(review): the "-v" directive here and the "-l" directive below are
+ // written after the "export" lines. qsub reportedly stops scanning for
+ // directives at the first executable line, so these two may be ignored -
+ // confirm before relying on them.
+ if (spec.getEnvironmentVariableNames().size() > 0) {
+ wr.write("#PBS -v " + makeList(spec.getEnvironmentVariableNames()) + '\n');
+ }
+
+ String resources =
+ (String) spec.getAttribute("slurm.resource_list");
+ if (resources != null && resources.length() > 0) {
+ if (logger.isDebugEnabled())
+ logger.debug("slurm.resource_list: " + resources);
+ wr.write("#PBS -l " + resources + '\n');
+ }
+
+ // aprun option specifically for Cray Beagle, Franklin
+ boolean aprun = false;
+ if (spec.getAttribute("pbs.aprun") != null)
+ aprun = true;
+
+ // An explicit jobType of "multiple" or "single" overrides the value
+ // derived from count; aprun always forces single-job mode.
+ String type = (String) spec.getAttribute("jobType");
+ if (logger.isDebugEnabled())
+ logger.debug("Job type: " + type);
+ if ("multiple".equals(type))
+ multiple = true;
+ else if("single".equals(type))
+ multiple = false;
+ if (aprun)
+ multiple = false;
+ if (multiple)
+ writeMultiJobPreamble(wr, exitcodefile);
+
+ // Optional wrapper command, configured as the "wrapper.<jobType>"
+ // property, written in front of the command line.
+ if (type != null) {
+ String wrapper =
+ properties.getProperty("wrapper." + type);
+ if (logger.isDebugEnabled()) {
+ logger.debug("Wrapper: " + wrapper);
+ }
+ if (wrapper != null) {
+ wrapper = replaceVars(wrapper);
+ wr.write(wrapper);
+ wr.write(' ');
+ }
+ if (logger.isDebugEnabled()) {
+ logger.debug("Wrapper after variable substitution: " + wrapper);
+ }
+ }
+ if (spec.getDirectory() != null) {
+ wr.write("cd " + quote(spec.getDirectory()) + " && ");
+ }
+
+ // With aprun the executable and its arguments are wrapped in
+ // /bin/sh -c '...'; the closing quote is written after the arguments.
+ if (aprun)
+ wr.write("aprun -n " + count + " -N 1 -cc none -d " +
+ depth + " -F exclusive /bin/sh -c '");
+
+ wr.write(quote(spec.getExecutable()));
+ writeQuotedList(wr, spec.getArgumentsAsList());
+
+ if (aprun)
+ wr.write("'");
+
+ if (spec.getStdInput() != null) {
+ wr.write(" < " + quote(spec.getStdInput()));
+ }
+ // Multi-job scripts are finished by the postamble; otherwise the exit
+ // code of the command is echoed into the exit code file.
+ if (multiple) {
+ writeMultiJobPostamble(wr);
+ }
+ else {
+ wr.write('\n');
+ wr.write("/bin/echo $? >" + exitcodefile + '\n');
+ }
+ wr.close();
+ }
+
+    /**
+     * Writes the three "#CoG" comment lines that open every generated
+     * script: a fixed banner, the generating class and the current date,
+     * followed by an empty line.
+     *
+     * @param writer destination of the generated script
+     * @throws IOException if writing to {@code writer} fails
+     */
+    void writeHeader(Writer writer)
+            throws IOException {
+        String generatedBy = "#CoG by class: " + SlurmExecutor.class + '\n';
+        String generatedOn = "#CoG on date: " + new Date() + "\n\n";
+        writer.write("#CoG This script generated by CoG\n");
+        writer.write(generatedBy);
+        writer.write(generatedOn);
+    }
+
+
+    /**
+     * Joins the given names into one string, separated by ", ".
+     *
+     * @param names the names to join, in iteration order
+     * @return the joined names; an empty string for an empty collection
+     */
+    private String makeList(Collection<String> names) {
+        StringBuilder joined = new StringBuilder();
+        String separator = "";
+        for (String name : names) {
+            joined.append(separator).append(name);
+            separator = ", ";
+        }
+        return joined.toString();
+    }
+
+    /**
+     * Writes the opening part of a multi-job script: the node list is read
+     * from $PBS_NODEFILE, and a loop over the nodes is started that writes
+     * a per-node exit code file and opens an ssh command line. The command
+     * itself is written by the caller after this preamble.
+     *
+     * @param wr destination of the generated script
+     * @param exitcodefile value assigned to the ECF variable in the script
+     * @throws IOException if writing to {@code wr} fails
+     */
+    protected void writeMultiJobPreamble(Writer wr, String exitcodefile)
+            throws IOException {
+        String[] fragments = {
+            "NODES=`cat $PBS_NODEFILE`\n",
+            "ECF=" + exitcodefile + "\n",
+            "INDEX=0\n",
+            "for NODE in $NODES; do\n",
+            " echo \"N\" >$ECF.$INDEX\n",
+            " ssh $NODE /bin/bash -c \\\" \""
+        };
+        for (String fragment : fragments) {
+            wr.write(fragment);
+        }
+    }
+
+
+    /**
+     * Returns the name of this executor.
+     * This is "Slurm", the same name the Slurm TaskHandlerImpl reports,
+     * rather than the "PBS" label left over from the class this one was
+     * copied from.
+     *
+     * @return the executor name
+     */
+    @Override
+    protected String getName() {
+        return "Slurm";
+    }
+
+ /**
+ Returns the shared provider properties obtained from
+ Properties.getProperties().
+ */
+ @Override
+protected AbstractProperties getProperties() {
+ return Properties.getProperties();
+ }
+
+ /**
+ Creates the Job object for a submitted job.
+ All arguments are passed to the Job constructor unchanged.
+ */
+ @Override
+protected Job createJob(String jobid, String stdout,
+ FileLocation stdOutputLocation, String stderr,
+ FileLocation stdErrorLocation, String exitcode,
+ AbstractExecutor executor) {
+ return new Job(jobid, stdout, stdOutputLocation, stderr,
+ stdErrorLocation, exitcode, executor);
+ }
+
+    /** The queue poller shared by all executors; created on first use. */
+    private static QueuePoller poller;
+
+    /**
+     * Returns the shared queue poller. On the first call the poller is
+     * created from this executor's properties and started. Creation and
+     * lookup happen while holding the SlurmExecutor class lock.
+     *
+     * @return the shared poller
+     */
+    @Override
+    protected AbstractQueuePoller getQueuePoller() {
+        synchronized (SlurmExecutor.class) {
+            if (poller != null) {
+                return poller;
+            }
+            QueuePoller created = new QueuePoller(getProperties());
+            poller = created;
+            created.start();
+            return created;
+        }
+    }
+}
Index: modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/QueuePoller.java
===================================================================
--- modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/QueuePoller.java (revision 0)
+++ modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/QueuePoller.java (revision 3432)
@@ -0,0 +1,168 @@
+package org.globus.cog.abstraction.impl.scheduler.slurm;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractProperties;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractQueuePoller;
+import org.globus.cog.abstraction.impl.scheduler.common.Job;
+
+public class QueuePoller extends AbstractQueuePoller {
+ public static final Logger logger = Logger.getLogger(QueuePoller.class);
+ public static final int FULL_LIST_THRESHOLD = 16;
+
+ private Set processed;
+
+ public QueuePoller(AbstractProperties properties) {
+ super("PBS provider queue poller", properties);
+ processed = new HashSet();
+ }
+
+    /** Cached poll command without job IDs, built on first use. */
+    private static String[] CMDARRAY;
+
+    /**
+     * Builds the command line used to poll the queue.
+     * While at most FULL_LIST_THRESHOLD jobs are tracked, the ID of every
+     * tracked job is appended to the command. Above that threshold the
+     * cached command without job IDs is returned.
+     *
+     * @return the poll command followed by "-f" and, possibly, job IDs
+     */
+    protected synchronized String[] getCMDArray() {
+        int jobCount = getJobs().size();
+        if (jobCount > FULL_LIST_THRESHOLD) {
+            if (CMDARRAY == null) {
+                CMDARRAY = new String[] { getProperties().getPollCommand(), "-f" };
+            }
+            return CMDARRAY;
+        }
+
+        String[] command = new String[2 + jobCount];
+        command[0] = getProperties().getPollCommand();
+        command[1] = "-f";
+        int next = 2;
+        for (Job job : getJobs().values()) {
+            command[next] = job.getJobID();
+            next++;
+        }
+        return command;
+    }
+
+    /**
+     * Decides whether a non-zero exit code of the poll command is a real
+     * error. The exit code is treated as success (0) when every line of
+     * stderr contains "Unknown Job Id"; any other line makes the original
+     * exit code the result.
+     *
+     * @param ec exit code of the poll command
+     * @param stderr text the poll command wrote to stderr; when null, the
+     *        exit code is returned unchanged
+     * @return 0 if the failure can be ignored, otherwise {@code ec}
+     */
+    @Override
+    protected int getError(int ec, String stderr) {
+        // Nothing to inspect: either no failure, or no stderr text to
+        // excuse the failure with.
+        if (ec == 0 || stderr == null) {
+            return ec;
+        }
+        BufferedReader reader = new BufferedReader(new StringReader(stderr));
+        try {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                if (!line.contains("Unknown Job Id")) {
+                    return ec;
+                }
+            }
+        }
+        catch (IOException e) {
+            // should not occur while reading from a string reader
+            logger.warn("Could not read the stderr text of the poll command", e);
+        }
+        return 0;
+    }
+
+ /**
+ Parses the output of the poll command and updates the tracked jobs.
+ A line starting with "Job Id: " begins the record of one job; the job
+ is looked up with getJob() and the following "job_state = " and
+ "exit_status = " lines are applied to it. Lines belonging to a job that
+ getJob() does not return are skipped. After the whole output has been
+ read, every tracked job whose ID did not appear in it is set to
+ STATE_DONE.
+ @param is stream carrying the poll command's standard output
+ @throws IOException if reading from the stream fails
+ */
+ protected void processStdout(InputStream is) throws IOException {
+ BufferedReader br = new BufferedReader(new InputStreamReader(is));
+ // start a fresh record of the job IDs seen in this output
+ processed.clear();
+ String line;
+ String currentJobID = null;
+ Job currentJob = null;
+ do {
+ line = br.readLine();
+ if (line != null) {
+ try {
+ line = line.trim();
+ if (line.startsWith("Job Id: ")) {
+ currentJobID = line.substring("Job Id: ".length());
+ processed.add(currentJobID);
+ currentJob = getJob(currentJobID);
+ continue;
+ }
+ if (currentJob != null) {
+ if (line.startsWith("job_state = ")) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Status line: " + line);
+ }
+ // only the first character of the state value is examined;
+ // characters other than Q, R and C leave the job unchanged
+ switch (line.substring("job_state = ".length())
+ .charAt(0)) {
+ case 'Q': {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Status for "
+ + currentJobID + " is Q");
+ }
+ currentJob.setState(Job.STATE_QUEUED);
+ break;
+ }
+ case 'R': {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Status for "
+ + currentJobID + " is R");
+ }
+ currentJob.setState(Job.STATE_RUNNING);
+ break;
+ }
+ case 'C': {
+ // for sites where keep_completed is there,
+ // don't wait
+ // for the job to be removed from the queue
+ if (logger.isDebugEnabled()) {
+ logger.debug("Status for "
+ + currentJobID + " is C");
+ }
+ addDoneJob(currentJob.getJobID());
+ break;
+ }
+ }
+ }
+ else if (line.startsWith("exit_status = ")) {
+ try {
+ int ec = Integer.parseInt(line.substring(
+ "exit_status = ".length()).trim());
+ currentJob.setExitcode(ec);
+ }
+ catch (Exception e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Could not parse exit_status",
+ e);
+ }
+ }
+ }
+ }
+ }
+ // a failure on one line is logged and parsing continues with the next
+ catch (Exception e) {
+ logger.warn("Exception caught while handling "
+ + getProperties().getPollCommandName()
+ + " output: " + line, e);
+ }
+ }
+ } while (line != null);
+ // tracked jobs that did not show up in the output are treated as done
+ Iterator i = getJobs().entrySet().iterator();
+ while (i.hasNext()) {
+ Map.Entry e = (Map.Entry) i.next();
+ String id = (String) e.getKey();
+ if (!processed.contains(id)) {
+ Job job = (Job) e.getValue();
+ if (logger.isDebugEnabled()) {
+ logger.debug("Status for " + id + " is Done");
+ }
+ job.setState(Job.STATE_DONE);
+ // the job is only reported as done if the state change took effect
+ if (job.getState() == Job.STATE_DONE) {
+ addDoneJob(id);
+ }
+ }
+ }
+ }
+
+ /**
+ Does nothing; the given stream is not read by this method.
+ @param is stream carrying the poll command's standard error
+ */
+ protected void processStderr(InputStream is) throws IOException {
+ }
+}
Index: modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/TaskHandlerImpl.java
===================================================================
--- modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/TaskHandlerImpl.java (revision 0)
+++ modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/TaskHandlerImpl.java (revision 3432)
@@ -0,0 +1,21 @@
+package org.globus.cog.abstraction.impl.scheduler.slurm.execution;
+
+import org.globus.cog.abstraction.interfaces.DelegatedTaskHandler;
+
+/**
+ *Provides a local Slurm <code>TaskHandler</code>
+ *for job submission to the local resource without
+ *any security context. Submission is delegated to a
+ *<code>JobSubmissionTaskHandler</code>.
+ *
+ */
+public class TaskHandlerImpl extends
+ org.globus.cog.abstraction.impl.common.execution.TaskHandlerImpl {
+
+ protected DelegatedTaskHandler newDelegatedTaskHandler() {
+ return new JobSubmissionTaskHandler();
+ }
+
+ protected String getName() {
+ return "Slurm";
+ }
+}
\ No newline at end of file
Index: modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/JobSubmissionTaskHandler.java
===================================================================
--- modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/JobSubmissionTaskHandler.java (revision 0)
+++ modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/execution/JobSubmissionTaskHandler.java (revision 3432)
@@ -0,0 +1,13 @@
+package org.globus.cog.abstraction.impl.scheduler.slurm.execution;
+
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractExecutor;
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractJobSubmissionTaskHandler;
+import org.globus.cog.abstraction.impl.scheduler.slurm.SlurmExecutor;
+import org.globus.cog.abstraction.interfaces.Task;
+
+/**
+ * Job submission handler of the Slurm provider. It supplies a
+ * SlurmExecutor for each task.
+ */
+public class JobSubmissionTaskHandler extends AbstractJobSubmissionTaskHandler {
+ /**
+ Creates the executor for a task.
+ @param task the task to execute
+ @param th the handler passed to the executor as its listener
+ @return a new SlurmExecutor
+ */
+ protected AbstractExecutor newExecutor(Task task,
+ AbstractJobSubmissionTaskHandler th) {
+ return new SlurmExecutor(task, th);
+ }
+}
\ No newline at end of file
Index: modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/Properties.java
===================================================================
--- modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/Properties.java (revision 0)
+++ modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/slurm/Properties.java (revision 3432)
@@ -0,0 +1,47 @@
+package org.globus.cog.abstraction.impl.scheduler.slurm;
+
+import org.globus.cog.abstraction.impl.scheduler.common.AbstractProperties;
+
+/**
+ * Settings of the Slurm provider, loaded once from
+ * "provider-slurm.properties" and shared through {@link #getProperties()}.
+ */
+public class Properties extends AbstractProperties {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Name of the properties file the settings are loaded from. */
+    public static final String PROPERTIES = "provider-slurm.properties";
+
+    public static final String POLL_INTERVAL = "poll.interval";
+    public static final String QSUB = "qsub";
+    public static final String QSTAT = "qstat";
+    public static final String QDEL = "qdel";
+    public static final String USE_MPPWIDTH = "use.mppwidth";
+
+    /** The shared instance; created by the first getProperties() call. */
+    private static Properties properties;
+
+    /**
+     * Returns the shared instance, creating and loading it on first use.
+     *
+     * @return the shared provider properties
+     */
+    public static synchronized Properties getProperties() {
+        if (properties != null) {
+            return properties;
+        }
+        properties = new Properties();
+        properties.load(PROPERTIES);
+        return properties;
+    }
+
+    /**
+     * Sets the built-in defaults: a poll interval of 5 and the submit,
+     * poll and remove commands "qsub", "qstat" and "qdel".
+     */
+    protected void setDefaults() {
+        setPollInterval(5);
+        setSubmitCommand(QSUB);
+        setPollCommand(QSTAT);
+        setRemoveCommand(QDEL);
+    }
+
+    /** @return the name of the poll command, "qstat" */
+    public String getPollCommandName() {
+        return QSTAT;
+    }
+
+    /** @return the name of the remove command, "qdel" */
+    public String getRemoveCommandName() {
+        return QDEL;
+    }
+
+    /** @return the name of the submit command, "qsub" */
+    public String getSubmitCommandName() {
+        return QSUB;
+    }
+}
Index: modules/provider-localscheduler/resources/cog-provider.properties
===================================================================
--- modules/provider-localscheduler/resources/cog-provider.properties (revision 3431)
+++ modules/provider-localscheduler/resources/cog-provider.properties (working copy)
@@ -23,3 +23,9 @@
sandbox=false
executionTaskHandler=org.globus.cog.abstraction.impl.scheduler.sge.execution.TaskHandlerImpl
securityContext=org.globus.cog.abstraction.impl.common.task.SecurityContextImpl
+
+provider=slurm
+sandbox=false
+executionTaskHandler=org.globus.cog.abstraction.impl.scheduler.slurm.execution.TaskHandlerImpl
+securityContext=org.globus.cog.abstraction.impl.common.task.SecurityContextImpl
+
Index: modules/provider-localscheduler/etc/provider-slurm.properties
===================================================================
--- modules/provider-localscheduler/etc/provider-slurm.properties (revision 0)
+++ modules/provider-localscheduler/etc/provider-slurm.properties (revision 3432)
@@ -0,0 +1,33 @@
+#
+# The interval, in seconds, at which the provider will poll the batch
+# queue for status updates. There is at most one poll thread per JVM,
+# which is shared by all the jobs submitted through the Slurm provider.
+#
+poll.interval=5
+
+#
+# The path to qsub. The default assumes that qsub is in PATH
+#
+qsub=notqsub
+
+#
+# The path to qstat. The default assumes that qstat is in PATH
+#
+qstat=notqstat
+
+#
+# The path to qdel. The default assumes that qdel is in PATH
+#
+qdel=notqdel
+
+# If true, use "#PBS -l mppwidth=" instead of "#PBS -l nodes="
+# in PBS script
+use.mppwidth=false
+
+# If the jobType attribute is specified, then the Slurm provider
+# will look for a property named "wrapper.<jobType>" and prepend
+# that to the executable line in the generated script. It will also
+# substitute the values of attributes in the job specification, using
+# the "$attrName" notation.
+#
+wrapper.mpi=mpirun -np $count
More information about the Swift-commit
mailing list