/// <summary>
/// Node state event handler. When a node transitions to Offline, removes it from
/// NodeInfoCache (which holds the ISchedulerNode reference) so the instance can be
/// collected; if the node comes back online it is re-added to the cache elsewhere.
/// </summary>
/// <param name="sender">The ISchedulerNode whose state changed.</param>
/// <param name="e">Event data carrying the new node state.</param>
private static void Node_OnStateChange(object sender, NodeStateEventArg e)
{
    if (e.NewState != NodeState.Offline)
    {
        return;
    }

    LockNodeInfoCache.AcquireWriterLock(Timeout.Infinite);
    try
    {
        ISchedulerNode nodeInfo = (ISchedulerNode)sender;
        if (!NodeInfoCache.ContainsKey(nodeInfo.Name))
        {
            TraceHelper.TraceWarning(0, "[JobMonitorEntry] Node info cache inconsistent.");
            return;
        }

        // remove node info. step 1. deregister node state event so this handler
        // no longer keeps the offline node alive.
        nodeInfo.OnNodeState -= Node_OnStateChange;

        // remove node info. step 2. remove node info from NodeInfoCache
        NodeInfoCache.Remove(nodeInfo.Name);
    }
    finally
    {
        // FIX: pair AcquireWriterLock with ReleaseWriterLock. The original called
        // ReleaseLock(), which releases the lock regardless of how many times the
        // thread acquired it and is meant to be used with RestoreLock.
        LockNodeInfoCache.ReleaseWriterLock();
    }
}
/// <summary>
/// Looks up the ISchedulerNode whose name matches the host portion of the given
/// monitored system name (the text before the first '.').
/// </summary>
/// <param name="monitoredSystemName">Monitored system name, possibly fully qualified.</param>
/// <returns>The matching node, or null when no node with that name exists.</returns>
private ISchedulerNode GetSchedulerNode(string monitoredSystemName)
{
    string nodeName = monitoredSystemName.Split('.')[0];
    checkClusterConnection();
    IScheduler scheduler = (IScheduler)(((HpcClusterConnection)clusterConnection.CopyConnection()).GetConnection());
    foreach (ISchedulerNode node in scheduler.GetNodeList(null, null))
    {
        if (node.Name.Equals(nodeName))
        {
            // FIX: return as soon as the node is found instead of scanning the
            // remainder of the (potentially large) node list. Node names are
            // assumed unique within the cluster — TODO confirm.
            return node;
        }
    }
    return null;
}
// For best results, run this sample code in queued scheduling mode.
//
// Demonstrates job "grow": submit a job restricted to NodeGroup1, then move a
// node from NodeGroup2 into NodeGroup1 via PowerShell and check whether the
// running job expands onto the newly added node.
//
// NOTE(review): relies on a wait handle named `running` and the
// `job_OnTaskState` handler defined elsewhere in this file — presumably the
// handler signals `running` when the task starts; verify against that code.
static void Main(string[] args)
{
    string clusterName = Environment.GetEnvironmentVariable("CCP_SCHEDULER");
    using (IScheduler scheduler = new Scheduler())
    {
        Console.WriteLine("Connecting to {0}", clusterName);
        scheduler.Connect(clusterName);

        // Assume you have two node groups, NodeGroup1 and NodeGroup2.
        IStringCollection nodeGroup1 = scheduler.GetNodesInNodeGroup("NodeGroup1");
        IStringCollection nodeGroup2 = scheduler.GetNodesInNodeGroup("NodeGroup2");
        if (nodeGroup1.Count == 0 || nodeGroup2.Count == 0)
        {
            Console.WriteLine("Node groups are not set up correctly");
            return;
        }

        // Pick a node that is in NodeGroup2 but not already in NodeGroup1
        // (assumes the groups are not identical).
        string nodeToMove = "";
        foreach (string node in nodeGroup2)
        {
            if (!nodeGroup1.Contains(node))
            {
                nodeToMove = node;
                break;
            }
        }
        if (string.IsNullOrEmpty(nodeToMove))
        {
            Console.WriteLine("No eligible nodes to move");
            return;
        }

        // Create a job to run on NodeGroup1.
        ISchedulerJob job = scheduler.CreateJob();
        job.NodeGroups.Add("NodeGroup1");

        // Set unit type to node, but let it autocalculate resources.
        job.UnitType = JobUnitType.Node;

        ISchedulerTask task = job.CreateTask();
        task.CommandLine = "ver";
        task.Type = TaskType.Service;
        job.AddTask(task);

        // Subscribe to task state changes (handler defined elsewhere in this file).
        job.OnTaskState += new EventHandler<TaskStateEventArg>(job_OnTaskState);

        Console.WriteLine("Submitting job on NodeGroup1");
        scheduler.SubmitJob(job, null, null);
        Console.WriteLine("Job {0} Submitted", job.Id);

        // Wait for the job to start running.
        running.WaitOne();
        job.Refresh();
        int allocationCount = job.AllocatedNodes.Count;
        Console.WriteLine("Number of allocated nodes: {0}", allocationCount);

        // Check the status of NodeGroup1 nodes.
        int idleCores = 0;
        foreach (string nodename in nodeGroup1)
        {
            ISchedulerNode node = scheduler.OpenNodeByName(nodename);
            idleCores += node.GetCounters().IdleCoreCount;
        }

        // There are no more idle cores remaining in this node group, so we'll
        // place one of the nodes from NodeGroup2 into it and allow the job to grow.
        if (idleCores == 0)
        {
            running.Reset();

            // Changing node groups is available through the UI or PowerShell.
            string powershellScript = String.Format("add-pssnapin microsoft.hpc; " +
                "add-hpcgroup -scheduler {0} -name {1} -nodename {2}",
                clusterName, "NodeGroup1", nodeToMove);
            using (PowerShell ps = PowerShell.Create())
            {
                ps.AddScript(powershellScript, true);
                ps.Invoke();
            }

            // Wait to be signaled again, then give the scheduler time to settle.
            running.WaitOne();
            Console.WriteLine("(Waiting 5 seconds for job to update the scheduler)");
            Thread.Sleep(5 * 1000);
            job.Refresh();
            int newAllocationCount = job.AllocatedNodes.Count;

            // Verify that the job has grown.
            if (newAllocationCount > allocationCount)
            {
                Console.WriteLine("Job has grown to {0} nodes", newAllocationCount);
            }
        }
        else
        {
            Console.WriteLine("There are still idle cores in the nodegroup");
        }
    }
}
// Entry point for the sweep-setup utility:
//   - sets up nodeDir on each compute node (removes any existing copy)
//   - copies the client utility to the nodes
//   - makes a single (localMaster) copy of the run files on the nodes
//
// NOTE(review): relies on helpers defined elsewhere in this file:
// parse_cmd_args, convert, get_node_list, get_up_level_dir, get_master_unc,
// submit_job.
static void Main(string[] args)
{
    Console.WriteLine("Utility to setup a sweep run\n");
    Console.WriteLine("-- sets up nodeDir (removes if existing)\n");
    Console.WriteLine("-- copies client utility to nodes\n");
    Console.WriteLine("-- makes a single (localMaster) copy of files on nodes\n");

    // required args
    // path to runfiles
    string filePath = null;
    // root dir on slaves
    string nodeDir = null;
    // username
    string userName = null;
    // password
    string password = null;
    // client
    string clientExe = "hpc_client_util.exe";
    // node file
    string nodeFile = null;
    // cluster UNC name
    string clusterName = null;
    bool updateOnly = false;

    // Parse the command line; on failure, print usage and exit.
    if (parse_cmd_args(args, ref filePath, ref nodeFile, ref nodeDir, ref clusterName,
                       ref userName, ref password, ref updateOnly) == false)
    {
        Console.WriteLine("parse cmd args fail...");
        Console.WriteLine("required commandline args: -filePath:path to folder with complete set of files");
        Console.WriteLine(" -nodeDir:working dir on the compute nodes");
        Console.WriteLine(" -userName: domain user name");
        //Console.WriteLine(" -password: domain password");
        Console.WriteLine("\noptional (default) -nodeFile:file with node UNC name(s) to use(all)");
        Console.WriteLine(" negative for numCores less than max ");
        Console.WriteLine(" -clusterName:cluster UNC name (babeshn010)");
        Console.WriteLine(" -updateOnly:updates existing dir structure with new files");
        return;
    }

    // set clusterName (site-specific default)
    if (clusterName == null)
    {
        clusterName = "IGSBABESHN010";
    }

    // get files in filePath
    string[] dataFiles = null;
    try
    {
        dataFiles = Directory.GetFiles(filePath);
    }
    catch (Exception e)
    {
        Console.WriteLine("Unable to get file list for filePath:\n " + filePath);
        Console.WriteLine(e);
        return;
    }

    // try to find a copy of the node-side client in the current directory
    string clientPath = null;
    if (clientPath == null) // NOTE(review): always true at this point; kept as-is
    {
        string[] localFiles = null;
        try
        {
            localFiles = Directory.GetFiles(".\\");
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to get local file list");
            Console.WriteLine(e);
            return;
        }
        // last match wins if multiple files share the client name
        foreach (string file in localFiles)
        {
            if (Path.GetFileName(file) == clientExe)
            {
                clientPath = Path.GetFullPath(file);
            }
        }
    }

    // if still not found, give up
    if (clientPath == null)
    {
        Console.WriteLine("could not find client in local folder: " + clientExe);
        return;
    }

    //
    //
    //**********************HPC portion*************************
    //
    //
    IScheduler scheduler = null;
    try
    {
        // Make the scheduler and connect to the local host.
        scheduler = new Scheduler();
        scheduler.Connect(clusterName);
    }
    catch (Exception e)
    {
        Console.WriteLine("Unable to connect to cluster:\n " + clusterName);
        Console.WriteLine(e);
        return;
    }

    List<string> clusterNodes = new List<string>();
    // Get all the nodes in the compute node group.
    try
    {
        clusterNodes = convert(scheduler.GetNodesInNodeGroup("ComputeNodes"));
    }
    catch (Exception e)
    {
        Console.WriteLine("Unable to get cluster node list:\n " + clusterName);
        Console.WriteLine(e);
        return;
    }

    // get the nodes in the nodeFile (defaults to every cluster node)
    List<string> fileNodes = new List<string>();
    if (nodeFile != null)
    {
        try
        {
            fileNodes = get_node_list(nodeFile);
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to get node list from nodeFile:\n " + nodeFile);
            Console.WriteLine(e);
            return;
        }
    }
    else
    {
        fileNodes = clusterNodes;
    }

    // build requestedNodes: case-insensitive intersection of fileNodes and
    // clusterNodes, keeping only nodes the scheduler reports as reachable
    List<string> requestedNodes = new List<string>();
    foreach (string fnode in fileNodes)
    {
        foreach (string cnode in clusterNodes)
        {
            if (fnode.ToUpper() == cnode.ToUpper())
            {
                ISchedulerNode node = scheduler.OpenNodeByName(fnode);
                if (node.Reachable == true)
                {
                    requestedNodes.Add(fnode);
                    Console.WriteLine("compute node added: " + fnode);
                }
                else
                {
                    Console.WriteLine("compute node not reachable: " + fnode);
                }
            }
        }
    }
    if (requestedNodes.Count == 0)
    {
        Console.WriteLine("no usable compute nodes found");
        return;
    }

    string task, localHost;
    bool success;
    localHost = Environment.MachineName;
    string[] oneDirLevelUp = get_up_level_dir(nodeDir);

    if (!updateOnly)
    {
        // first remove existing node dir
        //
        success = true;
        task = @"rmdir " + oneDirLevelUp[1] + " /S /Q";
        Console.WriteLine("removing (possibly) existing nodeDir");
        success = submit_job(scheduler, task, oneDirLevelUp[0], requestedNodes, userName, password, true);
        if (success == false)
        {
            return;
        }

        // now make the dir
        //
        task = @"mkdir " + oneDirLevelUp[1];
        Console.WriteLine("making new nodeDir");
        success = submit_job(scheduler, task, oneDirLevelUp[0], requestedNodes, userName, password, true);
        if (success == false)
        {
            return;
        }

        // now copy client to slaves
        //
        string clientUnc = get_master_unc(localHost, clientPath);
        task = @"copy " + clientUnc;
        Console.WriteLine("Copying client to nodes");
        success = submit_job(scheduler, task, nodeDir, requestedNodes, userName, password, true);
        if (success == false)
        {
            return;
        }
    }
    else
    {
        Console.WriteLine("using existing directory structure and hpc_client_util, updating files only...");
    }

    // now finally run client to make one localMaster copy
    //
    string masterUnc = get_master_unc(localHost, Path.GetFullPath(filePath));
    Console.WriteLine("starting client");
    task = clientExe + " -src:" + masterUnc + " ";
    task = task + " -n:0";
    if (updateOnly)
    {
        task = task + " -updateOnly";
    }
    success = submit_job(scheduler, task, nodeDir, requestedNodes, userName, password, false);
    if (success == false)
    {
        return;
    }
    return;
}
// Entry point for the BeoPEST launcher utility: optionally starts a local
// beopest master process, then uses the HPC scheduler to prepare each compute
// node (clean nodeDir, recreate it, copy the client utility) and finally run
// the node-side client — either on all nodes at once, or one node at a time
// when stagger mode is selected.
//
// NOTE(review): relies on helpers defined elsewhere in this file:
// parse_cmd_args, convert, get_node_list, get_up_level_dir, get_master_unc,
// submit_job, run_wait, copy_folder.
//
// NOTE(review): a region of this method (password prompt and retrieval of the
// filePath file list, including the declaration of `dataFiles`) appears to
// have been redacted or corrupted in this copy of the source — see the
// "******" token below. The code is preserved byte-for-byte and will not
// compile as-is; recover the original from version control.
static void Main(string[] args)
{
    // required args
    // path to runfiles
    string filePath = null;
    // root dir on slaves
    string nodeDir = null;
    // pest case
    string pestCase = null;
    // username
    string userName = null;
    // password
    string password = null;

    // optional
    // master dir
    string masterDir = null;
    // node file
    string nodeFile = null;
    // exec name
    string execName = null;
    // execArgs
    string execArgs = null;
    // client command
    string clientExe = null;
    // client command line
    string clientArgs = null;
    // cluster UNC name
    string clusterName = null;
    // number of cores to use on each node (-999 is the "unset" sentinel)
    int numCores = -999;
    // port number (-999 is the "unset" sentinel)
    int portNum = -999;
    // master start delay
    int delay = 0;
    // flag to potentially not starting a master
    bool masterFlag = true;
    // flag to potentially stagger start nodes
    bool staggerFlag = false;

    // Parse the command line; on failure, print usage and exit.
    if (parse_cmd_args(args, ref filePath, ref masterDir, ref nodeFile, ref nodeDir,
                       ref execName, ref numCores, ref pestCase, ref portNum,
                       ref clientExe, ref clientArgs, ref clusterName, ref userName,
                       ref password, ref delay, ref masterFlag, ref staggerFlag) == false)
    {
        Console.WriteLine("parse cmd args fail...");
        Console.WriteLine("required commandline args: -filePath:path to folder with complete set of files");
        Console.WriteLine(" -nodeDir:root dir on the compute nodes");
        Console.WriteLine(" -pestCase: pest case name");
        Console.WriteLine(" -userName: domain user name");
        Console.WriteLine("\noptional (default) -nodeFile:file with node UNC name(s) to use(all)");
        Console.WriteLine(" -masterDir:directory to run for master(.\\master)");
        Console.WriteLine(" if not passed existing, \".\\master\" is removed!");
        Console.WriteLine(" -execName:executable name (\"beopest64.exe\")");
        Console.WriteLine(" -numCores:number of cores per node (processor count)");
        Console.WriteLine(" negative for numCores less than max ");
        Console.WriteLine(" -portNum:TCP/IP port number (4004)");
        Console.WriteLine(" -clientExe:node side client (hpc_client_util.exe)");
        Console.WriteLine(" -clientCmdLine:passed only if client passed");
        Console.WriteLine(" use \" \" if clientCmdLine contains spaces");
        Console.WriteLine(" -clusterName:cluster UNC name (babeshn010)");
        Console.WriteLine(" -delay:time to wait after master start (0 seconds)");
        Console.WriteLine(" -noMaster:if passed, no master started, only slaves");
        //Console.WriteLine(" -stagger:if passed, each node will be start sequentially");
        return;
    }

    // set clusterName (site-specific default)
    if (clusterName == null)
    {
        clusterName = "IGSBABESHN010";
    }
    // set execName
    if (execName == null)
    {
        execName = "beopest64.exe";
    }
    // set clientExe
    if (clientExe == null)
    {
        clientExe = "hpc_client_util.exe";
    }
    // set port number
    if (portNum == -999)
    {
        portNum = 4004;
    }

    // NOTE(review): corrupted/redacted region — see the method header comment.
    if (password == null) { Console.WriteLine("Enter network password:"******"Unable to get file list for filePath:\n " + filePath); Console.WriteLine(e); return; }

    // make sure pestCase.pst exists and execCmd exists
    bool execFlag = false, pstFlag = false;
    foreach (string file in dataFiles)
    {
        if (Path.GetFileName(file) == execName)
        {
            execFlag = true;
        }
        else if (Path.GetFileNameWithoutExtension(file) == pestCase && Path.GetExtension(file) == ".pst")
        {
            pstFlag = true;
        }
    }
    if (execFlag == false)
    {
        Console.WriteLine("executable not found in file path folder:\n " + execName);
        return;
    }
    if (pstFlag == false)
    {
        Console.WriteLine("pestCase.pst not found in file path folder:\n " + pestCase);
        return;
    }

    // set numCores
    if (numCores == -999)
    {
        numCores = Environment.ProcessorCount;
    }
    else if (numCores < 0)
    {
        // NOTE(review): numCores is negative here, so subtracting it ADDS to the
        // processor count; the usage text says "less than max", which suggests
        // this should be ProcessorCount + numCores — verify intent.
        numCores = Environment.ProcessorCount - numCores;
    }

    // setup master dir
    string currentDir = Directory.GetCurrentDirectory();
    bool newMaster = false;
    // NOTE(review): default master dir is created only when masterFlag is false
    // (no master to be started) — confirm this condition is intended.
    if ((masterDir == null) && (masterFlag == false))
    {
        masterDir = currentDir + @"\" + "master";
        newMaster = true;
        if (Directory.Exists(masterDir))
        {
            Console.WriteLine("master dir already exists:\n " + masterDir + "...removing...");
            try
            {
                Directory.Delete(masterDir, true);
            }
            catch (Exception e)
            {
                Console.WriteLine("Unable to remove default master dir:\n " + masterDir);
                Console.WriteLine(e);
                return;
            }
        }
        try
        {
            Directory.CreateDirectory(masterDir);
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to create master dir:\n " + masterDir);
            Console.WriteLine(e);
            return;
        }
    }
    else
    {
        if (Directory.Exists(masterDir) == false)
        {
            Console.WriteLine("Unable to find existing master dir:\n " + masterDir);
            return;
        }
    }

    // try to find a copy of the node-side client
    string clientPath = null;
    // first filePath files (last match wins)
    foreach (string file in dataFiles)
    {
        if (Path.GetFileName(file) == clientExe)
        {
            clientPath = Path.GetFullPath(file);
        }
    }
    // next look in the current dir
    if (clientPath == null)
    {
        string[] localFiles = null;
        try
        {
            localFiles = Directory.GetFiles(".\\");
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to get local file list");
            Console.WriteLine(e);
            return;
        }
        foreach (string file in localFiles)
        {
            if (Path.GetFileName(file) == clientExe)
            {
                clientPath = Path.GetFullPath(file);
            }
        }
    }
    // if still not found, give up
    if (clientPath == null)
    {
        Console.WriteLine("could not find client in filePath folder\n or local folder: " + clientExe);
        return;
    }

    // copy files to master (only when we just created the default master dir)
    if (newMaster == true)
    {
        try
        {
            copy_folder(filePath, masterDir);
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to copy files to master dir:\n " + masterDir);
            Console.WriteLine(e);
            return;
        }
    }

    // start beopest master; on failure retry once with masterDir made absolute
    Process master = new Process();
    if (masterFlag)
    {
        string masterCmd = " " + pestCase + " /h :" + portNum;
        try
        {
            master = run_wait(masterDir, execName, masterCmd, delay);
            Console.WriteLine(master.Id);
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to start master successfully,\n adding full path to masterDir and retrying");
            Console.WriteLine(e);
            masterDir = currentDir + @"\" + masterDir;
            try
            {
                master = run_wait(masterDir, execName, masterCmd, delay);
                Console.WriteLine(master.Id);
            }
            catch (Exception e2)
            {
                Console.WriteLine("Still unable to start master successfully");
                Console.WriteLine(e2);
                return;
            }
        }
        Console.WriteLine("Master started successfully in " + masterDir);
    }
    else
    {
        Console.WriteLine("No master started. Adding current path to masterDir");
        masterDir = currentDir + @"\" + masterDir;
    }

    //
    //
    //**********************HPC portion*************************
    //
    //
    IScheduler scheduler = null;
    try
    {
        // Make the scheduler and connect to the local host.
        scheduler = new Scheduler();
        scheduler.Connect(clusterName);
    }
    catch (Exception e)
    {
        Console.WriteLine("Unable to connect to cluster:\n " + clusterName);
        Console.WriteLine(e);
        master.Kill();
        return;
    }

    List<string> clusterNodes = new List<string>();
    // Get all the nodes in the compute node group.
    try
    {
        clusterNodes = convert(scheduler.GetNodesInNodeGroup("ComputeNodes"));
    }
    catch (Exception e)
    {
        Console.WriteLine("Unable to get cluster node list:\n " + clusterName);
        Console.WriteLine(e);
        master.Kill();
        return;
    }

    // get the nodes in the nodeFile (defaults to every cluster node)
    List<string> fileNodes = new List<string>();
    if (nodeFile != null)
    {
        try
        {
            fileNodes = get_node_list(nodeFile);
        }
        catch (Exception e)
        {
            Console.WriteLine("Unable to get node list from nodeFile:\n " + nodeFile);
            Console.WriteLine(e);
            master.Kill();
            return;
        }
    }
    else
    {
        fileNodes = clusterNodes;
    }

    // build requestedNodes: case-insensitive intersection of fileNodes and
    // clusterNodes, keeping only nodes the scheduler reports as reachable
    List<string> requestedNodes = new List<string>();
    foreach (string fnode in fileNodes)
    {
        foreach (string cnode in clusterNodes)
        {
            if (fnode.ToUpper() == cnode.ToUpper())
            {
                ISchedulerNode node = scheduler.OpenNodeByName(fnode);
                if (node.Reachable == true)
                {
                    requestedNodes.Add(fnode);
                    Console.WriteLine("compute node added: " + fnode);
                }
                else
                {
                    Console.WriteLine("compute node not reachable: " + fnode);
                }
            }
        }
    }
    if (requestedNodes.Count == 0)
    {
        Console.WriteLine("no usable compute nodes found");
        if (masterFlag)
        {
            master.Kill();
        }
        return;
    }

    if (!staggerFlag)
    {
        // first remove existing node dir
        //
        bool success = false;
        string[] oneDirLevelUp = get_up_level_dir(nodeDir);
        string task = @"rmdir " + oneDirLevelUp[1] + " /S /Q";
        Console.WriteLine("removing (possibly) existing nodeDir");
        success = submit_job(scheduler, task, oneDirLevelUp[0], requestedNodes, userName, password, true);
        if (success == false)
        {
            if (masterFlag)
            {
                master.Kill();
            }
            return;
        }

        // now make the dir
        //
        task = @"mkdir " + oneDirLevelUp[1];
        Console.WriteLine("making new nodeDir");
        success = submit_job(scheduler, task, oneDirLevelUp[0], requestedNodes, userName, password, true);
        if (success == false)
        {
            if (masterFlag)
            {
                master.Kill();
            }
            return;
        }

        // now copy slaveCopyRun to slaves
        //
        string localHost = Environment.MachineName;
        string clientUnc = get_master_unc(localHost, clientPath);
        task = @"copy " + clientUnc;
        Console.WriteLine("Copying client to nodes");
        success = submit_job(scheduler, task, nodeDir, requestedNodes, userName, password, true);
        if (success == false)
        {
            if (masterFlag)
            {
                master.Kill();
            }
            return;
        }

        // now finally run slaveCopyRun
        //
        string masterUnc = get_master_unc(localHost, masterDir);
        Console.WriteLine("starting client util");
        if (execArgs == null)
        {
            execArgs = " " + pestCase + " /h " + localHost + ":" + portNum;
        }
        task = clientExe + " -src:" + masterUnc + " ";
        task = task + " -cmdExec:" + execName + " -cmdArgs:\"" + execArgs + "\"";
        success = submit_job(scheduler, task, nodeDir, requestedNodes, userName, password, false);
        if (success == false)
        {
            // NOTE(review): unconditional Kill here, unlike earlier failure paths
            // that first check masterFlag — confirm whether the guard is needed.
            master.Kill();
            return;
        }
    }
    // if nodes are stagger started
    else
    {
        // Same four-step sequence as above, but executed per node.
        for (int i = 0; i < requestedNodes.Count; i++)
        {
            List<string> rnode = new List<string> { requestedNodes[i] };

            // first remove existing node dir
            //
            bool success = false;
            string[] oneDirLevelUp = get_up_level_dir(nodeDir);
            string task = @"rmdir " + oneDirLevelUp[1] + " /S /Q";
            Console.WriteLine("removing (possibly) existing nodeDir");
            success = submit_job(scheduler, task, oneDirLevelUp[0], rnode, userName, password, true);
            if (success == false)
            {
                if (masterFlag)
                {
                    master.Kill();
                }
                return;
            }

            // now make the dir
            //
            task = @"mkdir " + oneDirLevelUp[1];
            Console.WriteLine("making new nodeDir");
            success = submit_job(scheduler, task, oneDirLevelUp[0], rnode, userName, password, true);
            if (success == false)
            {
                if (masterFlag)
                {
                    master.Kill();
                }
                return;
            }

            // now copy slaveCopyRun to slaves
            //
            string localHost = Environment.MachineName;
            string clientUnc = get_master_unc(localHost, clientPath);
            task = @"copy " + clientUnc;
            Console.WriteLine("Copying client to nodes");
            success = submit_job(scheduler, task, nodeDir, rnode, userName, password, true);
            if (success == false)
            {
                if (masterFlag)
                {
                    master.Kill();
                }
                return;
            }

            // now finally run slaveCopyRun
            //
            string masterUnc = get_master_unc(localHost, masterDir);
            Console.WriteLine("starting slaveCopyRun");
            if (execArgs == null)
            {
                execArgs = " " + pestCase + " /h " + localHost + ":" + portNum;
            }
            task = clientExe + " -src:" + masterUnc + " ";
            task = task + " -cmdExec:" + execName + " -cmdArgs:\"" + execArgs + "\"";
            success = submit_job(scheduler, task, nodeDir, rnode, userName, password, false);
            if (success == false)
            {
                master.Kill();
                return;
            }
        }
    }

    // Spin until the master process exits; any exception from HasExited
    // (e.g. when no master was ever started) also breaks out of the loop.
    // NOTE(review): this is a busy-wait loop with no sleep — it will consume a
    // full core while waiting; consider master.WaitForExit().
    while (true)
    {
        try
        {
            if (master.HasExited)
            {
                break;
            }
        }
        catch (Exception e)
        {
            break;
        }
    }
    return;
}