/// <summary>
/// Asks the resource whether the task is still running and builds the matching
/// <see cref="TaskStateInfo"/>. When the task is no longer reported as running its
/// outputs are copied to the exchange storage and the state becomes Completed.
/// </summary>
/// <param name="task">Run context of the task being polled.</param>
/// <returns>State info with process statistics and node name filled in.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    // Resolve the node once and reuse it for NodeName below (previously it was
    // looked up twice and an unused task id local was kept around).
    var node = GetNode(task);

    string result;
    TaskStateInfo info;

    if (GetFromResourceTaskStateInfo(task, out result))
    {
        info = new TaskStateInfo(TaskState.Started, result);
    }
    // todo: else if (Aborted, Failed)
    else
    {
        CopyOutputsToExchange(task);
        info = new TaskStateInfo(TaskState.Completed, result);
    }

    //todo nbutakov change
    info.ProcessInfo = GetCurrentTaskInfo(task);
    info.NodeName = node.NodeName;

    return info;
}
/// <summary>
/// Copies the task inputs to the cluster, prepares the launch script and executes it.
/// </summary>
/// <param name="task">Run context of the task to start.</param>
/// <returns>The job id: the first non-empty line of the launch output.</returns>
/// <exception cref="InvalidOperationException">The launch command produced no output lines.</exception>
public object Run(TaskRunContext task)
{
    var node = GetNode(task);
    var pack = PackageByName(node, task.PackageName);
    ulong taskId = task.TaskId;

    Log.Info("Locking operation");
    var operationHolder = LockOperation(task.TaskId, TaskLock.WRITE_OPERATION_EXECUTED);

    try
    {
        string fileNames;
        string clusterHomeFolder = CopyInputFiles(task, out fileNames);

        string cmdLine = String.Format(task.CommandLine, pack.AppPath, taskId, fileNames.Trim());
        Log.Info("cmdline = " + cmdLine);

        String scriptPath;
        Log.Info("Preparing script");
        ScriptPrepare(pack, cmdLine, node, clusterHomeFolder, out scriptPath);
        Log.Info("Script prepared. Executing it.");

        var result = ExecuteRun(node, scriptPath);

        string jobId = result
            .Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault();
        if (jobId == null)
        {
            // Same exception type First() used to raise, but with a message that names the cause.
            throw new InvalidOperationException("Launch script returned no job id for task " + taskId);
        }

        Log.Info("Exec done. Job id = " + jobId);
        return jobId;
    }
    finally
    {
        // Release the operation lock even when copying, script preparation or execution fails;
        // previously an exception left the lock held.
        UnLockOperation(task.TaskId, operationHolder);
        Log.Info("Operation unlocked");
    }
}
/// <summary>
/// Rolls back the per-node bookkeeping (submitted task count and reserved cores)
/// for every node listed in the task's node configuration.
/// Counters that would drop below zero are reported and clamped to zero.
/// </summary>
/// <param name="task">Task whose reservations are being returned.</param>
private void RevokeTask(TaskRunContext task)
{
    lock (task.Lock)
    lock (_nodeStateCacheLock)
    {
        var statesOfResource = _nodeStateCache[task.Resource.ResourceName];

        foreach (var requested in task.NodesConfig)
        {
            var state = statesOfResource.Single(n => n.NodeName == requested.NodeName);

            state.TasksSubmitted--;
            state.CoresReserved -= requested.Cores;

            if (state.TasksSubmitted < 0)
            {
                Log.Warn();
                state.TasksSubmitted = 0;
            }

            if (state.CoresReserved < 0)
            {
                Log.Warn();
                state.CoresReserved = 0;
            }
        }
    }
}
/// <summary>
/// Polls the resource for the task's state. A task that is not reported as running
/// has its outputs copied to the exchange and is returned as Completed.
/// </summary>
/// <param name="task">Run context of the task being polled.</param>
/// <returns>State info carrying current process statistics and the node name.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    // The node is resolved up front exactly as before; its value is not needed until the end.
    GetNode(task);

    string resourceAnswer;
    bool stillRunning = GetFromResourceTaskStateInfo(task, out resourceAnswer);

    // todo: handle Aborted / Failed
    if (!stillRunning)
    {
        CopyOutputsToExchange(task);
    }

    var stateInfo = new TaskStateInfo(
        stillRunning ? TaskState.Started : TaskState.Completed,
        resourceAnswer);

    //todo nbutakov change
    stateInfo.ProcessInfo = GetCurrentTaskInfo(task);
    stateInfo.NodeName = GetNode(task).NodeName;

    return stateInfo;
}
/// <summary>
/// Runs the task-state command on the task's node and reports whether the job is
/// still running or queued.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> is passed to the state command.</param>
/// <param name="result">Lower-cased raw output of the state command.</param>
/// <returns><c>true</c> when the output contains <c>job_state = r</c> or <c>job_state = q</c>.</returns>
protected virtual bool GetFromResourceTaskStateInfo(TaskRunContext task, out string result)
{
    var node = GetNode(task);

    result = SshExec(node, GetTaskStateCommand(), (string)task.LocalId, new PbsErrorResolver())
        .ToLowerInvariant();

    // The output is already lower-cased, so only lower-case markers can ever match;
    // the former upper-case "R"/"Q" comparisons were dead conditions.
    return result.Contains("job_state = r") || result.Contains("job_state = q");
}
/// <summary>
/// Queries SLURM for the job state and maps the textual answer to a <see cref="TaskState"/>.
/// Token groups are checked in priority order: running, aborted, failed, completed.
/// An unrecognised answer is logged and treated as Completed.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> is the SLURM job id.</param>
/// <returns>State info containing the mapped state and the raw command output.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    var node = GetNode(task);
    string result = SshExec(node, SshCommands.GetTaskState, (string)task.LocalId);
    string upperCased = result.ToUpperInvariant();

    string[] running = new[] { "CONFIGURING", "COMPLETING", "PENDING", "RUNNING", "SUSPENDED" };
    string[] aborted = new[] { "CANCELLED", "TIMEOUT" };
    string[] failed = new[] { "FAILED", "NODE_FAIL", "PREEMPTED" };
    string[] completed = new[] { "COMPLETED", "Invalid job id specified".ToUpperInvariant() };

    TaskState mapped;
    if (running.Any(token => upperCased.Contains(token)))
    {
        mapped = TaskState.Started;
    }
    else if (aborted.Any(token => upperCased.Contains(token)))
    {
        mapped = TaskState.Aborted;
    }
    else if (failed.Any(token => upperCased.Contains(token)))
    {
        mapped = TaskState.Failed;
    }
    else
    {
        if (!completed.Any(token => upperCased.Contains(token)))
        {
            Log.Warn("Wnknown responce from SLURM. Hoping task was completed: " + result);
        }

        // Both the recognised and the unrecognised "completed" paths collect outputs.
        CopyOutputsToExchange(task);
        mapped = TaskState.Completed;
    }

    return new TaskStateInfo(mapped, result);
}
/// <summary>
/// Starts a task on the resource named by its first node config: validates the node
/// configuration, acquires the nodes, delegates to the resource controller and
/// registers the task in the task cache. Runs under the resources read lock.
/// </summary>
/// <param name="task">Task to run; its Resource, Controller and LocalId are filled in here.</param>
public void Run(TaskRunContext task)
{
    _resourcesLock.EnterReadLock();
    try
    {
        Log.Info("Running task " + task.ToString());

        string resourceName = task.NodesConfig.First().ResourceName;
        var cached = ResourceCache.GetByName(resourceName);

        lock (cached.StateLock)
        {
            CheckNodeConfigConsistency(task.TaskId, task.NodesConfig, cached.Resource);
            task.Resource = cached.Resource;
            task.Controller = cached.Controller;
        }

        try
        {
            // todo : m.b. move under resourceCache.StateLock?
            cached.Acquire(task.NodesConfig);

            string attemptMessage = String.Format(
                "Trying to run task {0} on resource {1}",
                task.TaskId, task.Resource.ResourceName);
            Log.Info(attemptMessage);

            task.LocalId = task.Controller.Run(task);

            string startedMessage = String.Format(
                "Task {0} ({1}) started on resource {2} with localId = {3}",
                task.TaskId, task.PackageName, task.Resource.ResourceName, task.LocalId);
            Log.Info(startedMessage);

            var startedState = new TaskStateInfo(TaskState.Started, task.LocalId.ToString());
            TaskCache.AddTask(task, startedState);
        }
        catch (Exception e)
        {
            // Give the nodes back before reporting the failure.
            cached.Release(task.NodesConfig);
            Log.Error(String.Format("Unable to run task {0}: {1}", task.TaskId, e));
            throw;
        }
    }
    catch (Exception e)
    {
        Log.Error(String.Format("Exception on Farm.Run(task {0}): {1}", task.TaskId, e));
        throw;
    }
    finally
    {
        _resourcesLock.ExitReadLock();
    }

    //todo for mock
    if (CacheCollectorFactory.CheckMockMode())
    {
        CacheCollectorFactory.GetInstance().SendTask(task);
    }
}
/// <summary>
/// Creates a cache entry for a task with an already known state.
/// The entry starts outside of an update cycle and is stamped with the current time.
/// </summary>
/// <param name="context">Run context of the cached task.</param>
/// <param name="state">Initial state info.</param>
private TaskCache(TaskRunContext context, TaskStateInfo state)
{
    this.Context = context;
    this.StateInfo = state;

    this._isUpdating = false;
    this._lastUpdateTime = DateTime.Now;
}
} // mutable

/// <summary>
/// Creates a cache entry from a bare state and comment. The last-update timestamp is
/// set to UPDATE_INTERVAL plus 50 ms in the past.
/// </summary>
/// <param name="context">Run context of the cached task.</param>
/// <param name="state">Initial task state; Started by default.</param>
/// <param name="stateComment">Comment stored with the state; empty by default.</param>
private TaskCache(TaskRunContext context, TaskState state = TaskState.Started, string stateComment = "")
{
    this.Context = context;
    this.StateInfo = new TaskStateInfo(state, stateComment);
    this._isUpdating = false;

    // NOTE(review): presumably back-dated so the first refresh check sees the entry as stale — confirm.
    var backdateBy = UPDATE_INTERVAL + TimeSpan.FromMilliseconds(50);
    this._lastUpdateTime = DateTime.Now - backdateBy;
}
/// <summary>
/// Registers (or replaces) the cache entry for the given task.
/// </summary>
/// <param name="context">Run context of the task; its TaskId is the cache key.</param>
/// <param name="state">Initial task state; Started by default.</param>
/// <param name="stateComment">Comment stored with the state; empty by default.</param>
public static void AddTask(TaskRunContext context, TaskState state = TaskState.Started, string stateComment = "")
{
    // Build the entry outside the lock; only the dictionary write is guarded.
    var entry = new TaskCache(context, state, stateComment);

    lock (_globalLock)
    {
        _cache[context.TaskId] = entry;
    }
}
/// <summary>
/// Registers (or replaces) the cache entry for the given task, wiring it to the
/// task cache collector obtained from <c>CacheCollectorFactory</c>.
/// </summary>
/// <param name="context">Run context of the task; its TaskId is the cache key.</param>
/// <param name="state">Initial state info of the task.</param>
public static void AddTask(TaskRunContext context, TaskStateInfo state) //, TaskState state = TaskState.Started, string stateComment = "")
{
    var collector = CacheCollectorFactory.GetInstance().GetTaskCacheCollector();

    // autosaves
    var entry = new TaskCache(context, state, collector);

    lock (_globalLock)
    {
        _cache[context.TaskId] = entry;
    }
}
/// <summary>
/// Cancels the grid job behind the task. A trailing "/a" is removed from the stored
/// local id before it is passed to the cancel command.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> holds the job URL.</param>
public void Abort(TaskRunContext task)
{
    lock (_gridLock)
    {
        RefreshCertificate();

        string localId = (string)task.LocalId;

        // Ordinal comparison: the suffix is a technical marker, not linguistic text.
        if (localId.EndsWith("/a", System.StringComparison.Ordinal))
        {
            localId = localId.Remove(localId.Length - 2);
        }

        // The command output was never used, so it is no longer stored.
        SshExec(PilotCommands.CancelJob, localId);
    }
}
/// <summary>
/// Stops the process behind the task through the node's execute service.
/// The first section of <c>LocalId</c> (sections are separated by '\n') is the process id.
/// </summary>
/// <param name="task">Run context of the task to stop.</param>
public void Abort(TaskRunContext task)
{
    var node = GetNode(task);
    var esService = GetExecuteServiceClient(node);

    try
    {
        var providedWords = ((string)task.LocalId).Split(new[] { '\n' }); // todo : string -> string[]
        if (providedWords.Length > 2)
        {
            Log.Warn(String.Format("Too many sections in provided task id for win PC: {0}", task.LocalId));
        }

        string pid = providedWords[0];
        esService.StopTaskRunning(int.Parse(pid));

        // Close the client on success; previously it was left open.
        esService.Close();
    }
    catch
    {
        // Tear the channel down on any failure, then let the caller see the original error.
        esService.Abort();
        throw;
    }
}
private TaskStateInfo _stateInfo; //todo : BsonElement("StateInfo")

#endregion Fields

#region Constructors

/// <summary>
/// Creates a cache entry, initialises its bookkeeping under the global lock,
/// applies the initial state and finally stores the collector.
/// </summary>
/// <param name="context">Run context of the cached task.</param>
/// <param name="state">Initial state, applied through <c>SetState</c>.</param>
/// <param name="collector">Optional global cache collector.</param>
private TaskCache(TaskRunContext context, TaskStateInfo state, ITaskGlobalCacheCollector collector = null)
{
    lock (_globalLock)
    {
        // needs to be explicitly before SetState, which triggers Save
        // (i.e. makes object publicly available in memory)
        this.StateLock = new object();

        this._isUpdating = false;

        var backdateBy = UPDATE_INTERVAL + TimeSpan.FromMilliseconds(50);
        this._lastUpdateTime = DateTime.Now - backdateBy;

        this.Context = context;
    }

    this.SetState(state);
    this.gcCollector = collector;
}
/// <summary>
/// Builds a cache entry for a task: sets up the state lock and timestamps under the
/// global lock, then applies the initial state and remembers the collector.
/// </summary>
/// <param name="context">Run context of the cached task.</param>
/// <param name="state">Initial state, applied through <c>SetState</c>.</param>
/// <param name="collector">Optional global cache collector.</param>
private TaskCache(TaskRunContext context, TaskStateInfo state, ITaskGlobalCacheCollector collector = null)
{
    lock (_globalLock)
    {
        // needs to be explicitly before SetState, which triggers Save
        // (i.e. makes object publicly available in memory)
        StateLock = new object();
        _isUpdating = false;

        TimeSpan staleMargin = UPDATE_INTERVAL + TimeSpan.FromMilliseconds(50);
        _lastUpdateTime = DateTime.Now - staleMargin;

        Context = context;
    }

    SetState(state);

    // NOTE(review): the collector is assigned only after SetState — confirm SetState does not need it.
    gcCollector = collector;
}
/// <summary>
/// Attempts to abort the task on its node. Any exception is logged and swallowed.
/// </summary>
/// <param name="task">Run context of the task to abort.</param>
public void Abort(TaskRunContext task)
{
    try
    {
        // todo : Abort, not GetTaskState?
        SshExec(GetNode(task), GetTaskStateCommand(), (string)task.LocalId);
    }
    catch (Exception e)
    {
        string failure = String.Format(
            "Failed to abort task {1} on resource {2}: {3}{0}{4}",
            Environment.NewLine,
            task.TaskId,
            task.Resource.ResourceName,
            e.Message,
            e.StackTrace);

        Log.Error(failure);
        // todo : throw;
    }
}
/// <summary>
/// Reserves cores on every node requested by the task, then hands the task to its
/// controller. If the reservation overloads a node or the controller fails, the
/// reservation is revoked, the error is logged and the exception is rethrown.
/// </summary>
/// <param name="task">Task to submit; LocalId and CachedRunInfo are set on success.</param>
private void SubmitTask(TaskRunContext task)
{
    lock (task.Lock)
    {
        try
        {
            lock (_nodeStateCacheLock)
            {
                var statesOfResource = _nodeStateCache[task.Resource.ResourceName];
                bool overloaded = false;

                foreach (var requested in task.NodesConfig)
                {
                    var state = statesOfResource.Single(n => n.NodeName == requested.NodeName);

                    // Checked before the reservation is added, as in the original flow.
                    overloaded |= state.CoresAvailable <= requested.Cores;

                    state.TasksSubmitted++;
                    state.CoresReserved += requested.Cores;
                }

                if (overloaded)
                {
                    Log.Error("Nodes overload for resource " + task.Resource.ResourceName);
                    throw new Exception("Wrong config for task " + task.TaskId.ToString() + ". Selected nodes are overloaded");
                }
            }

            task.LocalId = task.Controller.Run(task);
            task.CachedRunInfo = new TaskRunInfo(TaskState.Started);
        }
        catch (Exception e)
        {
            RevokeTask(task);

            string failure = String.Format(
                "Unable to run task {1}: {2}{0}{3}",
                Environment.NewLine, task.TaskId, e.Message, e.StackTrace);
            Log.Error(failure);

            throw;
        }
    }
}
/// <summary>
/// Sends the abort command for the task's job to its node.
/// Any exception is logged and swallowed.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> identifies the job.</param>
public void Abort(TaskRunContext task)
{
    try
    {
        // todo : Abort, not GetTaskState?
        SshExec(GetNode(task), SshCommands.Abort, (string)task.LocalId);
    }
    catch (Exception e)
    {
        string failure = String.Format(
            "Failed to abort task {1} on resource {2}: {3}{0}{4}",
            Environment.NewLine,
            task.TaskId,
            task.Resource.ResourceName,
            e.Message,
            e.StackTrace);

        Log.Error(failure);
        // todo : throw;
    }
}
/// <summary>
/// Validates the task's node configuration against the cached resources, binds the
/// resource and its controller to the task, submits it and records it in the task cache.
/// </summary>
/// <param name="task">Task to run.</param>
/// <exception cref="ArgumentException">
/// Node configs refer to different resources, or the resource has no controller.
/// </exception>
/// <exception cref="Exception">The task references nodes unknown to the resource.</exception>
public void Run(TaskRunContext task)
{
    lock (task.Lock)
    {
        Log.Info("Running task " + task.ToString());

        string resourceName = task.NodesConfig.First().ResourceName;
        if (task.NodesConfig.Any(node => node.ResourceName != resourceName))
        {
            Log.Error("Node configs have different resources: " + String.Join(", ", task.NodesConfig.Select(c => c.ResourceName)));
            throw new ArgumentException("All node configs should have the same resource name");
        }

        lock (_resourcesCacheLock)
        {
            if (!_resourcesCache.ContainsKey(resourceName))
            {
                Log.Error("No controller for resource " + resourceName);
                throw new ArgumentException("No such resource controller");
            }

            // Materialise once: the deferred query used to be re-run for Any() and for each Join().
            var unknownNodes = task.NodesConfig
                .Select(n => n.NodeName)
                .Except(_nodeStateCache[resourceName].Select(n => n.NodeName))
                .ToArray();

            if (unknownNodes.Length > 0)
            {
                string unknownList = String.Join(", ", unknownNodes);

                Log.Error(String.Format(
                    "Task {0} has unknown nodes for resource {1}: {2}",
                    task.TaskId, resourceName, unknownList));

                throw new Exception("Wrong node config for task " + task.TaskId.ToString() + ": " + unknownList);
            }

            task.Resource = _resourcesCache[resourceName];
            task.Controller = _controllers[resourceName];
        }

        // POSSIBLE DATA RACE?! if state is inside controller
        SubmitTask(task);

        lock (_tasksCacheLock)
        {
            _tasksCache.Add(task.TaskId, task);
        }
    }
}
/// <summary>
/// Creates the task folder on the cluster node and uploads every input file into it.
/// Each file is first downloaded from storage to a local temp file.
/// </summary>
/// <param name="task">Run context providing the task id and input files.</param>
/// <param name="fileNames">
/// Receives the cluster paths of all uploaded files, each preceded by a single space.
/// </param>
/// <returns>The task's folder on the cluster node.</returns>
public string CopyInputFiles(TaskRunContext task, out string fileNames)
{
    var node = GetNode(task);
    string clusterHomeFolder = IncarnationParams.IncarnatePath(node.DataFolders.LocalFolder, task.TaskId, CopyPhase.None);

    try
    {
        Log.Info(Thread.CurrentThread.ManagedThreadId + " entered.");
        SshExec(node, "mkdir " + clusterHomeFolder);
        Log.Info(Thread.CurrentThread.ManagedThreadId + " exited.");
    }
    catch (Exception e)
    {
        // Deliberately best effort: a mkdir failure is only logged.
        Log.Warn(e.ToString());
    }

    Log.Info("Copying input files for task " + task.TaskId.ToString());

    fileNames = "";

    // Loop-invariant prefix, computed once.
    string folderPrefix = clusterHomeFolder.TrimEnd(new[] { '/', '\\' }) + "/";

    foreach (var file in task.InputFiles)
    {
        string tmpFile = Path.GetTempFileName();
        try
        {
            IOProxy.Storage.Download(file.StorageId, tmpFile);

            string fileOnCluster = folderPrefix + file.FileName;
            fileNames += " " + fileOnCluster;

            Log.Info("Copying file " + fileOnCluster);
            UploadFile(node, fileOnCluster, tmpFile);
        }
        finally
        {
            // Remove the temp file even when download or upload throws;
            // previously a failure left it behind.
            File.Delete(tmpFile);
        }
    }

    return clusterHomeFolder;
}
/// <summary>
/// Decides whether the task is still running: the state command output must mention
/// the task's local id, and the task's output folder must not yet contain the finish
/// marker file.
/// </summary>
/// <param name="task">Run context of the task being polled.</param>
/// <param name="result">
/// Lower-cased output of the state command, or a fixed error text when that command throws.
/// </param>
/// <returns><c>true</c> while the task is considered running.</returns>
protected override bool GetFromResourceTaskStateInfo(TaskRunContext task, out string result)
{
    var node = GetNode(task);

    try
    {
        string rawState = SshExec(node, GetTaskStateCommand(), (string)task.LocalId, null);
        result = rawState.ToLowerInvariant();
    }
    catch (Exception e)
    {
        Log.Error(String.Format("Exception while updating task's {0} state: {1}", task.TaskId, e));
        result = "SshExec error while updating task's state";
    }

    string outputFolder = IncarnationParams.IncarnatePath(node.DataFolders.LocalFolder, task.TaskId, CopyPhase.Out);
    string folderListing = SshExec(node, SshUnixCommands.Ls, outputFolder);

    if (!result.Contains(task.LocalId.ToString()))
    {
        return false;
    }

    return !folderListing.Contains(ClavireFinishFileName);
}
/// <summary>
/// Asks SLURM for the job state and translates the answer into a task state.
/// Running tokens win over aborted, aborted over failed, failed over completed;
/// anything else is logged and assumed to be completed.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> is the SLURM job id.</param>
/// <returns>State info with the translated state and the raw output.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    var node = GetNode(task);
    string result = SshExec(node, SshCommands.GetTaskState, (string)task.LocalId);
    string answer = result.ToUpperInvariant();

    string[] runningTokens = new[] { "CONFIGURING", "COMPLETING", "PENDING", "RUNNING", "SUSPENDED" };
    string[] abortedTokens = new[] { "CANCELLED", "TIMEOUT" };
    string[] failedTokens = new[] { "FAILED", "NODE_FAIL", "PREEMPTED" };
    string[] completedTokens = new[] { "COMPLETED", "Invalid job id specified".ToUpperInvariant() };

    if (runningTokens.Any(answer.Contains))
    {
        return new TaskStateInfo(TaskState.Started, result);
    }

    if (abortedTokens.Any(answer.Contains))
    {
        return new TaskStateInfo(TaskState.Aborted, result);
    }

    if (failedTokens.Any(answer.Contains))
    {
        return new TaskStateInfo(TaskState.Failed, result);
    }

    bool recognisedAsCompleted = completedTokens.Any(answer.Contains);
    if (!recognisedAsCompleted)
    {
        Log.Warn("Wnknown responce from SLURM. Hoping task was completed: " + result);
    }

    CopyOutputsToExchange(task);
    return new TaskStateInfo(TaskState.Completed, result);
}
/// <summary>
/// Buffers one process statistics sample for a task, grouped by node name.
/// Samples for controllers that are not serviced are ignored.
/// </summary>
/// <param name="context">Run context of the task (controller and resource are read).</param>
/// <param name="taskId">Key of the task in the buffer.</param>
/// <param name="info">State info carrying the node name and the process statistics.</param>
public void push(TaskRunContext context, ulong taskId, TaskStateInfo info)
{
    Common.Utility.LogInfo("TaskCacheCollector.push taskId=" + taskId + " info=" + info.ProcessInfo.TimeSnapshot);

    if (!IsServicedController(context.Controller))
    {
        return;
    }

    var resName = context.Resource.ResourceName;

    lock (_lock)
    {
        TaskStatInfo taskStats;
        if (!bufferTaskInfo.TryGetValue(taskId, out taskStats))
        {
            taskStats = new TaskStatInfo(new Dictionary<string, List<ProcessStatInfo>>(), resName);
            bufferTaskInfo.Add(taskId, taskStats);
        }

        List<ProcessStatInfo> nodeSamples;
        if (!taskStats.ProcessInfoCollection.TryGetValue(info.NodeName, out nodeSamples))
        {
            nodeSamples = new List<ProcessStatInfo>();
            taskStats.ProcessInfoCollection.Add(info.NodeName, nodeSamples);
        }

        nodeSamples.Add(info.ProcessInfo);
    }
}
/// <summary>
/// Checks through the node's remote execution service whether the task's process is
/// still alive. <c>LocalId</c> is expected to be "pid\nnodeName".
/// </summary>
/// <param name="task">Run context of the task being polled.</param>
/// <returns>Started while the process runs, Completed otherwise.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    string[] sections = ((string)task.LocalId).Split(new char[] { '\n' }); // todo : string -> string[]
    if (sections.Length > 2)
    {
        logger.Warn("Too many sections in provided task id for win PC: {0}", task.LocalId);
    }

    string pid = sections[0];
    string nodeName = sections[1];

    var node = task.Resource.Nodes.First(n => n.NodeName == nodeName);
    var rexService = GetREx(node.Services.ExecutionUrl);

    try
    {
        //rexService.InnerChannel.OperationTimeout = TimeSpan.FromSeconds(10);
        bool isRunning = rexService.IsProcessRunning(Int32.Parse(pid));
        rexService.Close();

        TaskState observed = isRunning ? TaskState.Started : TaskState.Completed;
        return new TaskStateInfo(observed, "");
    }
    catch (Exception e)
    {
        rexService.Abort();

        string message = string.Format(
            "Exception while getting task '{0}' state (local id = {1}): ",
            task.TaskId, task.LocalId);
        logger.WarnException(message, e);

        throw;
        //return new TaskStateInfo(TaskState.Started, "");
    }
}
/// <summary>
/// Launches a task on the grid: installs the user's proxy certificate on the helper
/// host (if one is supplied), uploads a generated <c>run.sh</c> launcher, copies the
/// inputs to GridFTP, builds the JSON job description, uploads it to the helper host
/// and submits it. The user's certificate is overwritten on the helper host afterwards.
/// </summary>
/// <param name="task">Run context of the task to launch.</param>
/// <returns>The job URL parsed from the submit output, with "a" appended.</returns>
public object Run(TaskRunContext task)
{
    lock (_gridLock)
    {
        RefreshCertificate();

        string tmpFileName = null;
        if (task.UserCert != null)
        {
            Log.Info("Using user's certificate");

            tmpFileName = Path.GetTempFileName();
            IOProxy.Storage.Download(task.UserCert, tmpFileName);

            var scpForCert = new SSH.Scp(HELPER_SSH_HOST, HELPER_SSH_USER, HELPER_SSH_PASS);
            scpForCert.Connect();
            scpForCert.Recursive = true;
            scpForCert.Put(tmpFileName, "/tmp/x509up_u500");
            scpForCert.Close();

            File.Delete(tmpFileName);
            SshExec(PilotCommands.SetPermissionsOnProxyCertFile);
        }
        else
        {
            Log.Info("Using system's certificate");
        }

        try
        {
            long coresToUse = task.NodesConfig.Sum(cfg => cfg.Cores);
            var node = GetNode(task);
            var pack = node.PackageByName(task.PackageName);

            // todo : remove
            string commandLine = task.CommandLine;
            commandLine = commandLine.Replace("java -jar ", "");
            if (task.PackageName.ToLowerInvariant() == "cnm")
                commandLine = commandLine.Replace("{0}", "ru.ifmo.hpc.main.ExtendedModel");
            else if (task.PackageName.ToLowerInvariant() == "ism")
                commandLine = commandLine.Replace("{0}", "ru.ifmo.hpc.main.SpreadModel");
            else //if (task.PackageName.ToLowerInvariant() == "orca")
                commandLine = commandLine.Replace("{0}", "");

            string ftpFolderFromSystem = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromSystem, task.TaskId, CopyPhase.In);
            string ftpFolderFromResource = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromResource, task.TaskId, CopyPhase.In);
            string gridFtpFolder = IncarnationParams.IncarnatePath(node.DataFolders.LocalFolder, task.TaskId, CopyPhase.None);

            SshExec(PilotCommands.MakeFolderOnGridFtp, gridFtpFolder);

            string endl = "\n";

            // First add the missing input file (the script that launches the package on the cluster).
            string scriptName = pack.AppPath;
            //if (pack.EnvVars.Any())
            {
                // Script that exports the package's environment variables before running it.
                scriptName = "run.sh";

                var scriptContent = new StringBuilder();
                scriptContent.Append("#!/bin/bash" + endl);

                foreach (var pair in pack.EnvVars)
                    scriptContent.AppendFormat("export {0}={1}" + endl, pair.Key, pair.Value);

                scriptContent.Append(pack.AppPath);
                scriptContent.Append(" " + commandLine);

                string scriptLocalPath = Path.GetTempFileName();
                File.WriteAllText(scriptLocalPath, scriptContent.ToString());
                IOProxy.Ftp.UploadLocalFile(scriptLocalPath, ftpFolderFromSystem, scriptName);
                File.Delete(scriptLocalPath);
            }

            // Copy the input files from FTP to GridFTP.
            SshExec(PilotCommands.CopyFilesToGridFtp, ftpFolderFromResource + " " + gridFtpFolder);
            SshExec(PilotCommands.MakeFilesExecutableOnGridFtp, gridFtpFolder + "*");

            // Build the job description for the grid.
            var jobFileContent = new StringBuilder();

            jobFileContent.AppendFormat(@"{{ ""version"": 2, ""description"": ""{0}""," + endl, task.TaskId);
            jobFileContent.AppendFormat(@" ""default_storage_base"": ""{0}""," + endl, gridFtpFolder);
            jobFileContent.AppendFormat(@" ""tasks"": [ {{ ""id"": ""a"", ""description"": ""task"", ""definition"": {{ ""version"": 2," + endl);

            jobFileContent.AppendFormat(@" ""executable"": ""{0}""," + endl, scriptName);

            jobFileContent.AppendFormat(@" ""input_files"": {{" + endl);
            if (scriptName == "run.sh") // todo : if no input files?
                jobFileContent.AppendFormat(@" ""run.sh"": ""run.sh""," + endl);

            // Append (not AppendFormat): file names are data, and a '{' or '}' inside one
            // would otherwise be interpreted as a format placeholder and throw.
            jobFileContent.Append(@" " + String.Join(
                "," + endl + " ",
                task.InputFiles.Select(
                    file => String.Format(@"""{0}"": ""{0}""", file.FileName)
                )
            ));
            jobFileContent.AppendFormat(endl + @" }}," + endl);

            jobFileContent.AppendFormat(@" ""output_files"": {{" + endl);

            if (task.PackageName.ToLowerInvariant() == "ism")
                jobFileContent.AppendFormat(@" ""output.dat"": ""output.dat""" + endl);
            else if (task.PackageName.ToLowerInvariant() == "orca")
            {
                jobFileContent.AppendFormat(@" ""orca.out"": ""orca.out""," + endl);
                jobFileContent.AppendFormat(@" ""eldens.cube"": ""eldens.cube""" + endl);
            }
            else
            {
                // Same reasoning as for the input files: plain Append for data-driven text.
                jobFileContent.Append(@" " + String.Join(
                    "," + endl + " ",
                    task.ExpectedOutputFileNames
                        .Where(name => name != "std.out" && name != "std.err")
                        .Select(
                            name => String.Format(@"""{0}"": ""{0}""", name)
                        )
                ) + endl);
            }

            jobFileContent.AppendFormat(@" }}," + endl);

            jobFileContent.AppendFormat(@" ""stdout"": ""std.out"", ""stderr"": ""std.err"", " + endl);

            jobFileContent.AppendFormat(@" ""count"": {0}" + endl, coresToUse);

            if (pack.Params.ContainsKey("requirements"))
                jobFileContent.AppendFormat(@" ,""requirements"": {0}" + endl, pack.Params["requirements"]);

            jobFileContent.AppendFormat(@" }} }} ]," + endl);

            jobFileContent.AppendFormat(@" ""requirements"": {{ ""hostname"": [""{0}""]", node.NodeAddress);

            jobFileContent.AppendFormat(@"}}" + endl + "}}", node.NodeAddress);

            // The format string used to lack the {1} placeholder, so the JSON was never logged.
            Log.Debug(String.Format("Task's '{0}' grid job JSON: {1}", task.TaskId, jobFileContent));

            string jobFileName = "job_" + task.TaskId.ToString() + ".js";
            string jobFilePathOnHelper = JOBS_FOLDER_ON_HELPER + jobFileName;

            string jobFilePathLocal = Path.GetTempFileName();
            File.WriteAllText(jobFilePathLocal, jobFileContent.ToString());

            // Write the job description to the server that hosts Pilot.
            var scp = new SSH.Scp(HELPER_SSH_HOST, HELPER_SSH_USER, HELPER_SSH_PASS);

            scp.Connect();
            scp.Recursive = true;
            scp.Put(jobFilePathLocal, jobFilePathOnHelper);
            scp.Close();

            File.Delete(jobFilePathLocal); // todo : remove files on helper and gridftp

            // Launch.
            Log.Info(String.Format(
                "Trying to exec task {0} on grid cluster {1}",
                task.TaskId, node.NodeName
            ));

            string launchResult = SshExec(PilotCommands.SubmitJob, jobFilePathOnHelper, pilotUrl: node.Services.ExecutionUrl);

            // NOTE(review): if the output contains no "https://", Substring(-1) throws
            // and the failure is reported through the catch block below.
            int urlPos = launchResult.IndexOf("https://");
            string jobUrl = launchResult.Substring(urlPos).Trim() + "a";
            Log.Debug(jobUrl);

            Log.Info(String.Format(
                "Task {0} launched on grid with jobUrl = {1}",
                task.TaskId, jobUrl
            ));

            return jobUrl;
        }
        catch (Exception e)
        {
            Log.Error(String.Format(
                "Error while starting task {0} in grid: {1}\n{2}",
                task.TaskId, e.Message, e.StackTrace
            ));

            throw;
        }
        finally
        {
            if (task.UserCert != null)
            {
                // Overwrite the user's proxy certificate on the helper host with a placeholder.
                Log.Info("Wiping user's certificate");

                tmpFileName = Path.GetTempFileName();
                File.WriteAllText(tmpFileName, "Wiped by Easis system");

                var scpForCert = new SSH.Scp(HELPER_SSH_HOST, HELPER_SSH_USER, HELPER_SSH_PASS);
                scpForCert.Connect();
                scpForCert.Recursive = true;
                scpForCert.Put(tmpFileName, "/tmp/x509up_u500");
                scpForCert.Close();

                File.Delete(tmpFileName);
                SshExec(PilotCommands.SetPermissionsOnProxyCertFile);
            }
        }
    }
}
/// <summary>
/// Reads the grid job status. New, starting and running jobs are reported as Started.
/// For any other status the job folder is copied to the FTP output folder and the
/// task is reported as Completed ("is finished") or Failed.
/// </summary>
/// <param name="task">Run context; <c>LocalId</c> identifies the grid job.</param>
/// <returns>State info with the mapped state and the lower-cased status text.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    lock (_gridLock)
    {
        RefreshCertificate();

        ulong taskId = task.TaskId;
        string localId = (string)task.LocalId;
        string state = SshExec(PilotCommands.JobStatus, localId).ToLower();

        // "is new" used to map to Scheduled; it is reported as Started like the running states.
        bool stillActive = state.Contains("is new")
            || state.Contains("is running")
            || state.Contains("is starting");
        if (stillActive)
        {
            return new TaskStateInfo(TaskState.Started, state);
        }

        var node = GetNode(task);
        var folders = node.DataFolders;

        string ftpOutFolderFromSystem = IncarnationParams.IncarnatePath(folders.ExchangeUrlFromSystem, taskId, CopyPhase.Out);
        string ftpOutFolderFromResource = IncarnationParams.IncarnatePath(folders.ExchangeUrlFromResource, taskId, CopyPhase.Out);
        string gridFolder = IncarnationParams.IncarnatePath(folders.LocalFolder, taskId, CopyPhase.None);

        IOProxy.Ftp.MakePath(ftpOutFolderFromSystem);
        SshExec(PilotCommands.CopyFilesToGridFtp, gridFolder + " " + ftpOutFolderFromResource);

        TaskState finalState = state.Contains("is finished")
            ? TaskState.Completed
            : TaskState.Failed;

        return new TaskStateInfo(finalState, state);
    }
}
/// <summary>
/// Asks the node's execute service whether the task's process is still running.
/// <c>LocalId</c> is expected to be "pid\nnodeName". When the process has ended the
/// outputs are copied to the exchange and Completed is returned; otherwise a
/// default-constructed <see cref="TaskStateInfo"/> is returned.
/// </summary>
/// <param name="task">Run context of the task being polled.</param>
/// <returns>The task's state info.</returns>
public TaskStateInfo GetTaskStateInfo(TaskRunContext task)
{
    string[] sections = ((string)task.LocalId).Split(new[] { '\n' }); // todo : string -> string[]
    if (sections.Length > 2)
    {
        Log.Warn(String.Format("Too many sections in provided task id for win PC: {0}", task.LocalId));
    }

    string pid = sections[0];
    string nodeName = sections[1];

    var farmId = task.Resource.Controller.FarmId;
    var node = task.Resource.Nodes.First(n => n.NodeName == nodeName);

    Log.Info(String.Format("Getting task {0} info...", pid));

    var esClient = GetExecuteServiceClient(node);
    try
    {
        var isRunning = esClient.IsTaskRunning((int.Parse(pid)));
        esClient.Close();

        if (isRunning)
        {
            Log.Info(String.Format("task {0} running is : {1} ", pid, isRunning));
            return new TaskStateInfo();
        }

        CopyOutputsToExchange(task, farmId);
        return new TaskStateInfo(TaskState.Completed, "");
    }
    catch (Exception e)
    {
        esClient.Abort();

        string warning = String.Format(
            "Exception while getting task '{0}' state (local id = {1}): {2}",
            task.TaskId, task.LocalId, e);
        Log.Warn(warning);

        throw;
    }
}
/// <summary>
/// Runs the task-info command on the task's node and converts its output into a
/// <see cref="ProcessStatInfo"/>.
/// </summary>
/// <param name="task">Run context used to resolve the node.</param>
/// <returns>The process statistics obtained from the command output.</returns>
public ProcessStatInfo GetCurrentTaskInfo(TaskRunContext task)
{
    var commandOutput = SshExec(GetNode(task), GetTaskInfoCommand());
    object parsed = ObtainInfo(commandOutput, typeof(ProcessStatInfo));

    return (ProcessStatInfo)parsed;
}
/// <summary>
/// Placeholder: aborting is not supported by this controller. Only a warning is logged.
/// </summary>
/// <param name="task">Ignored.</param>
public void Abort(TaskRunContext task)
{
    const string notImplemented = "Abort is not implemented on windows controller!";
    logger.Warn(notImplemented);
}
/// <summary>
/// Resolves the node for a task by delegating to the (resource, nodes config) overload.
/// </summary>
/// <param name="task">Run context supplying the resource and the nodes configuration.</param>
/// <returns>The node selected by the delegated overload.</returns>
protected ResourceNode GetNode(TaskRunContext task)
{
    var resource = task.Resource;
    var nodesConfig = task.NodesConfig;

    return GetNode(resource, nodesConfig);
}
/// <summary>
/// Pairs every resource node used by the task with a random count taken from
/// <c>Random.Next(2, 5)</c>.
/// </summary>
/// <param name="task">Run context supplying the resource nodes and the nodes configuration.</param>
/// <returns>One (node, count) tuple per node whose name appears in the task's node configs.</returns>
private List<Tuple<ResourceNode, int>> ExtractInfoCountPerNode(TaskRunContext task)
{
    var rng = new Random();
    var pairs = new List<Tuple<ResourceNode, int>>();

    foreach (var candidate in task.Resource.Nodes)
    {
        bool usedByTask = task.NodesConfig.Any(cfg => cfg.NodeName == candidate.NodeName);
        if (usedByTask)
        {
            pairs.Add(new Tuple<ResourceNode, int>(candidate, rng.Next(2, 5)));
        }
    }

    return pairs;
}
//for mock
/// <summary>
/// Hook for mock mode. The base implementation does nothing; subclasses may override it.
/// </summary>
/// <param name="task">Run context of the task that was started.</param>
public virtual void SendTask(TaskRunContext task)
{
    // Intentionally empty.
}
/// <summary>
/// Mock implementation: starts a repeated process that generates mock statistics for
/// the task's nodes and pushes them to the task cache collector.
/// </summary>
/// <param name="task">Run context of the task for which mock data is produced.</param>
public override void SendTask(TaskRunContext task)
{
    //todo rewrite all this sht later
    // Lazily obtain the collector from the base factory.
    if (taskCacheCollector == null)
    {
        taskCacheCollector = base.GetTaskCacheCollector();
    }

    // One (node, count) pair per node of the task; `iter` walks over them and
    // `current` counts down the remaining samples for the node under the cursor.
    var data = ExtractInfoCountPerNode(task);
    var iter = data.GetEnumerator();
    int current = 0;
    var t = 0;
    // NOTE(review): coeff is computed but never read in this method.
    var coeff = Math.Sin(t);
    var angleRandom = new Random();

    // NOTE(review): the meaning of the arguments 1000 and false is defined by
    // CreateAndRunRepeatedProcess, which is not visible here — presumably an interval in ms; confirm.
    Utility.CreateAndRunRepeatedProcess(1000, false,
        // First callback: generate a sample for the node under the cursor and push it.
        () =>
        {
            Common.Utility.ExceptionablePlaceWrapper(() =>
            {
                var info = GenerateTaskMockData(iter.Current.Item1.NodeName, t);
                taskCacheCollector.push(task, task.TaskId, info);
            },
            " Exception while creating and pushing task mock data for taskId=" + task.TaskId + " in MockCacheCollectorFactory",
            " Mock statistic data for task with taskId=" + task.TaskId + " have been generated and pushed",
            false);
        }
        ,
        // Second callback: returns true once all nodes are exhausted (presumably the stop
        // condition — confirm against CreateAndRunRepeatedProcess); otherwise advances the
        // cursor when the current node's count is used up, bumps t and decrements the counter.
        () =>
        {
            if (current == 0)
            {
                if (!iter.MoveNext())
                {
                    t = 0;
                    return true;
                }
                current = iter.Current.Item2;
            }
            t += angleRandom.Next(5,20);
            --current;
            return false;
        });
}
/// <summary>
/// Copies every file reported by the execute service for the task into the exchange
/// output folder (an FTP URL or a local/UNC folder), recreating sub-directories first.
/// A failure on one file is logged and does not stop the remaining files.
/// </summary>
/// <param name="task">Run context of the finished task.</param>
/// <param name="farmId">Farm identifier passed to the execute service and to the download.</param>
public void CopyOutputsToExchange(TaskRunContext task, string farmId)
{
    ulong taskId = task.TaskId;
    var node = GetNode(task);
    var pack = PackageByName(node, task.PackageName);

    // temporary hack: files are not pushed from resource => using path from resource for scp copying
    string outFolderFromSystem = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromResource, taskId, CopyPhase.Out);
    //string outFolderFromSystem = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromSystem, taskId, CopyPhase.Out);

    bool copyingOutsToFtp = outFolderFromSystem.StartsWith("ftp://");
    if (copyingOutsToFtp && !outFolderFromSystem.EndsWith("/"))
        outFolderFromSystem += '/';
    if (!copyingOutsToFtp && !outFolderFromSystem.EndsWith("\\"))
        outFolderFromSystem += '\\';

    string clusterFolder = IncarnationParams.IncarnatePath(
        (!String.IsNullOrEmpty(pack.LocalDir)) ? String.Format(pack.LocalDir, task.TaskId) : node.DataFolders.LocalFolder,
        taskId, CopyPhase.Out);
    if (!clusterFolder.EndsWith("\\"))
        clusterFolder += "\\";

    var exClient = GetExecuteServiceClient(node);
    string[] fileNames = exClient.GetAllFileNames(farmId, taskId);

    foreach (var output in task.ExpectedOutputFileNames)
    {
        Log.Info(output);
    }

    // Every distinct sub-directory mentioned in the file names, plus the output root itself.
    var dirStructure = fileNames
        .Where(name => name.Contains('/') || name.Contains('\\')) // inside subdir
        .Select(name => name.Remove(name.LastIndexOfAny(new[] { '\\', '/' })))
        .Distinct()
        .Select(file => outFolderFromSystem + file)
        .Union(new[] { outFolderFromSystem });

    foreach (string dir in dirStructure)
    {
        if (copyingOutsToFtp)
            IOProxy.Ftp.MakePath(dir);
        else
        {
            Log.Debug("Creating dir " + dir);
            Directory.CreateDirectory(dir);
        }
    }

    Log.Info("Copying output files");
    foreach (string fileName in fileNames)
    {
        string tmpFile = Path.GetTempFileName();
        try
        {
            Log.Info("Copying file " + clusterFolder + fileName);
            DownloadFile(node, clusterFolder + fileName, tmpFile, taskId, farmId);

            if (copyingOutsToFtp)
                IOProxy.Ftp.UploadLocalFile(tmpFile, outFolderFromSystem, fileName, shouldCreatePath: false);
            else
                File.Copy(tmpFile, outFolderFromSystem + fileName);

            Log.Info("File copied " + fileName);
        }
        catch (Exception e)
        {
            Log.Warn(String.Format("Exception on file '{0}' copy: {1}", clusterFolder + fileName, e));
        }
        finally
        {
            // Always remove the temp file; it used to be left behind whenever the copy failed.
            // A failing delete stays a warning, as it was when the delete sat inside the try.
            try
            {
                File.Delete(tmpFile);
            }
            catch (Exception e)
            {
                Log.Warn(String.Format("Exception on temp file '{0}' delete: {1}", tmpFile, e));
            }
        }
    }
}
/// <summary>
/// Uploads every input file of the task into its home folder on the resource.
/// Each file is first downloaded from storage into a local temp file.
/// </summary>
/// <param name="task">Run context providing the input files, task id and farm id.</param>
/// <param name="resorceHomeFolder">Task folder on the resource; trailing slashes are ignored.</param>
private void CopyInputFiles(TaskRunContext task, string resorceHomeFolder)
{
    var node = GetNode(task);
    Log.Info("Copying input files for task " + task.TaskId);

    // Loop-invariant prefix, computed once.
    var folderPrefix = resorceHomeFolder.TrimEnd(new[] { '/', '\\' }) + "\\";

    foreach (var file in task.InputFiles)
    {
        var tmpFile = Path.GetTempFileName();
        try
        {
            try
            {
                IOProxy.Storage.Download(file.StorageId, tmpFile);
            }
            catch (Exception exp)
            {
                // NOTE(review): a failed download is only logged, so the (empty) temp file
                // is still uploaded below — confirm this best-effort behaviour is intended.
                Log.Error("Error " + exp);
            }

            var fileOnResource = folderPrefix + file.FileName;
            Log.Info("Copying file " + fileOnResource);
            UploadFile(node, fileOnResource, tmpFile, task.TaskId.ToString(), task.Resource.Controller.FarmId);
        }
        finally
        {
            // Remove the temp file even when the upload throws; previously it was leaked.
            File.Delete(tmpFile);
        }
    }

    Log.Info(String.Format("Copying input files for task {0} finished.", task.TaskId));
}
/// <summary>
/// Prepares the task's environment on the resource, uploads its input files and
/// starts it through the node's execute service.
/// </summary>
/// <param name="task">Run context of the task to start.</param>
/// <returns>The execute service result and the node name, joined by '\n'.</returns>
public virtual object Run(TaskRunContext task)
{
    var node = GetNode(task);
    var pack = PackageByName(node, task.PackageName);
    var taskId = task.TaskId;
    var farmId = task.Resource.Controller.FarmId;

    var esService = GetExecuteServiceClient(node);
    try
    {
        var resorceHomeFolder = IncarnationParams.IncarnatePath(node.DataFolders.LocalFolder, task.TaskId, farmId, CopyPhase.None);

        PrepareEnviroment(esService, pack, resorceHomeFolder, farmId);
        CopyInputFiles(task, resorceHomeFolder);

        string cmdLine = String.Format(task.CommandLine, pack.AppPath, taskId);
        Log.Info("cmdline = " + cmdLine);

        var result = esService.ExecuteTaskOnFarm(taskId, farmId, cmdLine);
        Log.Info("Exec done. Job id = " + result);

        esService.Close();

        return result + "\n" + node.NodeName;
    }
    catch
    {
        // Previously a failure in any step left the service client open.
        esService.Abort();
        throw;
    }
}
/// <summary>
/// Generates a Windows batch script for the task, uploads it to the task's FTP folder and asks
/// the node's remote execution service to start it.
/// </summary>
/// <remarks>
/// The script, in order: creates and enters the local working folder, copies the package's
/// startup files and the shared input folder into it, sets the package environment variables,
/// runs the command line via <c>cmd.exe /c</c>, copies the "cleanup ignore" paths to the shared
/// output folder, deletes the cleanup paths, copies the working folder to the shared output
/// folder and finally writes and copies the <c>clavire_script_finished</c> marker.
/// If the <c>DEBUG_PAUSE_PARAM_NAME</c> app setting holds a non-negative integer, a <c>pause</c>
/// line is inserted before the script line with that index.
/// </remarks>
/// <param name="task">Task to start.</param>
/// <returns>The pid returned by the execution service, a line feed, and the node name.</returns>
public object Run(TaskRunContext task)
{
    ulong taskId = task.TaskId;
    var node = GetNode(task);

    string ftpFolder = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromSystem, taskId, CopyPhase.In);
    string jobFtpFolder = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromSystem, taskId, CopyPhase.None);
    string sharedInputFolder = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromResource, taskId, CopyPhase.In);
    string sharedOutputFolder = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromResource, taskId, CopyPhase.Out);
    string tmpFolder = IncarnationParams.IncarnatePath(node.DataFolders.LocalFolder, taskId, CopyPhase.None);

    IOProxy.Ftp.MakePath(ftpFolder);
    IOProxy.Ftp.MakePath(jobFtpFolder);

    string jobFileName = "job_" + taskId + ".cmd";
    logger.Info("Trying to exec task {0} on win PC {1}.{2}", taskId, node.ResourceName, node.NodeName);

    var pack = node.Packages.First(p => String.Equals(p.Name, task.PackageName, StringComparison.InvariantCultureIgnoreCase));

    // Folder paths are used in the script without a trailing separator.
    char[] slashes = { '/', '\\' };
    string tmpFolderTrimmed = tmpFolder.TrimEnd(slashes);
    string sharedInputTrimmed = sharedInputFolder.TrimEnd(slashes);
    string sharedOutputTrimmed = sharedOutputFolder.TrimEnd(slashes);

    // {0} = source, {1} = destination folder (a backslash is appended to the destination).
    const string xcopyIntoFolderFormat = @"xcopy {0} {1}\ /z /s /e /c /i /h /r /y";

    string batchContent = "";
    batchContent += "mkdir " + tmpFolderTrimmed + Environment.NewLine;

    if (Path.IsPathRooted(tmpFolder)) // change drive if needed
    {
        batchContent += Path.GetPathRoot(tmpFolder).TrimEnd(slashes) + Environment.NewLine;
    }

    batchContent += String.Format(@"cd {0}" + Environment.NewLine, tmpFolderTrimmed);
    batchContent += "echo %time% > clavire_script_started" + Environment.NewLine;

    foreach (string copyPath in pack.CopyOnStartup)
    {
        batchContent += String.Format(
            xcopyIntoFolderFormat + Environment.NewLine,
            copyPath.TrimEnd(slashes),
            tmpFolderTrimmed);
    }

    batchContent += String.Format(
        xcopyIntoFolderFormat + Environment.NewLine,
        sharedInputTrimmed,
        tmpFolderTrimmed);

    foreach (var envVar in pack.EnvVars)
    {
        batchContent += "set " + envVar.Key + "=" + envVar.Value + Environment.NewLine;
    }

    string commandLine = String.Format(task.CommandLine, pack.AppPath);

    batchContent += "echo %time% > clavire_task_started" + Environment.NewLine;
    batchContent += "cmd.exe /c " + commandLine + Environment.NewLine;
    batchContent += "echo %time% > clavire_task_finished" + Environment.NewLine;

    // Copy the paths listed in CleanupIgnore from the working folder to the shared output
    // folder; this happens before the Cleanup deletions below.
    foreach (string copyPath in pack.CleanupIgnore)
    {
        string relativePath = copyPath.TrimStart(slashes);
        batchContent += String.Format(
            @"xcopy {1} {0} /z /s /e /c /i /h /r /y" + Environment.NewLine,
            (sharedOutputTrimmed + "/" + relativePath).Replace("/", "\\"),
            (tmpFolderTrimmed + "/" + relativePath).Replace("/", "\\"));
    }

    foreach (string delPath in pack.Cleanup)
    {
        batchContent += String.Format(
            @"rmdir /s /q {0}" + Environment.NewLine +
            @"del /f /s /q {0}" + Environment.NewLine,
            tmpFolder + delPath // todo: delPath.TrimStart
        );
    }

    batchContent += String.Format(
        xcopyIntoFolderFormat + Environment.NewLine,
        tmpFolderTrimmed,
        sharedOutputTrimmed);

    batchContent += String.Format(
        @"echo %time% > clavire_script_finished" + Environment.NewLine +
        @"xcopy clavire_script_finished {1}\ /z /s /e /c /i /h /r /y" + Environment.NewLine +
        @"cd {0}" + Environment.NewLine +
        @"cd .." + Environment.NewLine,
        tmpFolderTrimmed,
        sharedOutputTrimmed);

    // Int32.TryParse writes 0 into the out argument when parsing fails, so the failure case
    // must be mapped back to "disabled" explicitly; otherwise a malformed or empty setting
    // would put a "pause" at the very top of the script.
    int pauseLine;
    if (!Int32.TryParse(Config.AppSettings[DEBUG_PAUSE_PARAM_NAME] ?? "-1", out pauseLine))
    {
        pauseLine = -1;
    }

    if (pauseLine >= 0)
    {
        var batchLines = batchContent.Replace("\r", "").Split(new[] { '\n' });
        batchContent =
            String.Join(Environment.NewLine, batchLines.Take(pauseLine)) + Environment.NewLine +
            "pause" + Environment.NewLine +
            String.Join(Environment.NewLine, batchLines.Skip(pauseLine));
    }

    IOProxy.Ftp.UploadFileContent(batchContent, jobFtpFolder, jobFileName);

    var rexService = GetREx(node.Services.ExecutionUrl); // todo : close service client!
    int pid = rexService.Exec(taskId);

    logger.Info("Task {0} ({1}) started on pc {2}.{3} with pid = {4}", taskId, pack.Name, node.ResourceName, node.NodeName, pid);

    return pid + "\n" + node.NodeName;
}
/// <summary>
/// Empty implementation: all three arguments are ignored and nothing is done.
/// </summary>
/// <param name="context">Unused.</param>
/// <param name="taskId">Unused.</param>
/// <param name="info">Unused.</param>
public void push(TaskRunContext context, ulong taskId, TaskStateInfo info)
{
    // No-op.
}
/// <summary>
/// Copies the task's input files, generates the launch script and executes it on the node over SSH
/// while holding the task's write-operation lock.
/// </summary>
/// <param name="task">Task to start.</param>
/// <returns>
/// The job id: the last whitespace-separated token of the SSH command output.
/// </returns>
public object Run(TaskRunContext task)
{
    var node = GetNode(task);
    var pack = PackageByName(node, task.PackageName);
    ulong taskId = task.TaskId;

    Log.Info("Locking operation");
    var operationHolder = LockOperation(task.TaskId, TaskLock.WRITE_OPERATION_EXECUTED);

    string result;
    try
    {
        string fileNames;
        string clusterHomeFolder = CopyInputFiles(task, out fileNames);

        string cmdLine = String.Format(task.CommandLine, pack.AppPath, taskId, fileNames.Trim());
        Log.Debug("cmdline = " + cmdLine);

        Log.Info("Preparing script");
        string scriptPath = MakeScript(pack, cmdLine, node, clusterHomeFolder);

        Log.Info("Script prepared. Executing it.");
        result = SshExec(node, SshCommands.Run, scriptPath);
    }
    finally
    {
        // Release the lock on every path; without this a failure in copying, script
        // generation or the SSH call would leave the operation locked.
        UnLockOperation(task.TaskId, operationHolder);
        Log.Info("Operation unlocked");
    }

    string jobId = result
        .Split(new[] { '\r', '\n', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries)
        .Last();
    Log.Info("Exec done. Job id = " + jobId);

    return jobId;
}
/// <summary>
/// Creates a cache entry for the given task context with an initial state and comment.
/// </summary>
/// <param name="context">Task context stored in <c>Context</c>.</param>
/// <param name="state">Initial task state; defaults to <c>TaskState.Started</c>.</param>
/// <param name="stateComment">Initial state comment; defaults to an empty string.</param>
private TaskCache(TaskRunContext context, TaskState state = TaskState.Started, string stateComment = "")
{
    Context = context;
    StateInfo = new TaskStateInfo(state, stateComment);

    // The last-update timestamp is backdated by one UPDATE_INTERVAL plus 50 ms, i.e. it is
    // already older than the interval — presumably so the first update check treats the
    // entry as stale (confirm against the code that reads _lastUpdateTime).
    var extraMargin = TimeSpan.FromMilliseconds(50);
    _lastUpdateTime = DateTime.Now - UPDATE_INTERVAL - extraMargin;
    _isUpdating = false;
}
/// <summary>
/// Reserves cores for the task on each of its configured nodes and starts the task through its controller.
/// </summary>
/// <remarks>
/// Under <c>task.Lock</c> and <c>_nodeStateCacheLock</c>, every node named in
/// <c>task.NodesConfig</c> gets its submitted-task counter incremented and its reserved cores
/// increased. If any node was flagged as overloaded, an exception is thrown after all nodes
/// were updated. On any failure the error is logged, <c>RevokeTask</c> is called and the
/// original exception is rethrown.
/// </remarks>
/// <param name="task">Task to submit.</param>
/// <exception cref="Exception">The selected nodes are overloaded, or the controller failed to run the task.</exception>
private void SubmitTask(TaskRunContext task)
{
    lock (task.Lock)
    {
        try
        {
            lock (_nodeStateCacheLock)
            {
                bool nodesOverloaded = false;
                var nodeStates = _nodeStateCache[task.Resource.ResourceName];

                foreach (var nodeConfig in task.NodesConfig)
                {
                    var nodeState = nodeStates.Single(n => n.NodeName == nodeConfig.NodeName);

                    // NOTE(review): "<=" also rejects a task that needs exactly the available
                    // number of cores — confirm against the definition of CoresAvailable.
                    if (nodeState.CoresAvailable <= nodeConfig.Cores)
                    {
                        nodesOverloaded = true;
                    }

                    nodeState.TasksSubmitted++;
                    nodeState.CoresReserved += nodeConfig.Cores;
                }

                if (nodesOverloaded)
                {
                    Log.Error("Nodes overload for resource " + task.Resource.ResourceName);
                    throw new Exception("Wrong config for task " + task.TaskId.ToString() + ". Selected nodes are overloaded");
                }
            }

            task.LocalId = task.Controller.Run(task);
            task.CachedRunInfo = new TaskRunInfo(TaskState.Started);
        }
        catch (Exception e)
        {
            // Log first: the rollback below can throw as well, and the root cause must not be lost.
            Log.Error(String.Format("Unable to run task {1}: {2}{0}{3}",
                Environment.NewLine,
                task.TaskId, e.Message, e.StackTrace
            ));

            try
            {
                RevokeTask(task);
            }
            catch (Exception revokeError)
            {
                // Keep the original exception as the one callers see.
                Log.Error("Unable to revoke task " + task.TaskId.ToString() + ": " + revokeError);
            }

            throw;
        }
    }
}
/// <summary>
/// Copies the task's output files from the cluster folder into the task's exchange output folder.
/// </summary>
/// <remarks>
/// The file list comes from running <c>SSH_FIND_COMMAND</c> in the cluster folder over SSH; names
/// containing ".rst" or ".esav" are skipped. Each file is downloaded into a local temp file and
/// then either uploaded to FTP (when the exchange path starts with "ftp://") or copied into the
/// target directory. A failure on one file is logged as a warning and does not stop the others.
/// </remarks>
/// <param name="task">Task whose outputs are collected.</param>
public void CopyOutputsToExchange(TaskRunContext task)
{
    ulong taskId = task.TaskId;
    var node = GetNode(task);
    var pack = PackageByName(node, task.PackageName);

    // temporary hack: files are not pushed from resource => using path from resource for scp copying
    string outFolderFromSystem = IncarnationParams.IncarnatePath(node.DataFolders.ExchangeUrlFromResource, taskId, CopyPhase.Out);

    // The target folder must end with the separator that matches its kind (FTP url vs. file path).
    bool copyingOutsToFtp = outFolderFromSystem.StartsWith("ftp://");
    if (copyingOutsToFtp && !outFolderFromSystem.EndsWith("/"))
    {
        outFolderFromSystem += '/';
    }
    if (!copyingOutsToFtp && !outFolderFromSystem.EndsWith("\\"))
    {
        outFolderFromSystem += '\\';
    }

    // A package-specific LocalDir (formatted with the task id) wins over the node's LocalFolder.
    string clusterFolder = IncarnationParams.IncarnatePath(
        !String.IsNullOrEmpty(pack.LocalDir)
            ? String.Format(pack.LocalDir, task.TaskId)
            : node.DataFolders.LocalFolder,
        taskId,
        CopyPhase.Out);
    if (!clusterFolder.EndsWith("/"))
    {
        clusterFolder += "/";
    }

    // Materialised once: the list is walked twice below (directory structure, then copy loop).
    var fileNames = SshExec(node, "cd " + clusterFolder + "; " + SSH_FIND_COMMAND, "")
        .Replace("./", "/")
        .Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries)
        .Where(st => !st.Contains(".rst") && !st.Contains(".esav"))
        .Select(st => st.Trim(new[] { '/', '\\' }))
        .ToList();

    // Every distinct sub-directory mentioned in the file names, plus the output root itself.
    var dirStructure = fileNames
        .Where(name => name.Contains('/') || name.Contains('\\')) // inside subdir
        .Select(name => name.Remove(name.LastIndexOfAny(new[] { '\\', '/' })))
        .Distinct()
        .Select(file => outFolderFromSystem + file)
        .Union(new[] { outFolderFromSystem });

    foreach (string dir in dirStructure)
    {
        if (copyingOutsToFtp)
        {
            IOProxy.Ftp.MakePath(dir);
        }
        else
        {
            Log.Debug("Creating dir " + dir);
            Directory.CreateDirectory(dir);
        }
    }

    Log.Info("Copying output files");
    foreach (string fileName in fileNames)
    {
        string tmpFile = Path.GetTempFileName();
        try
        {
            Log.Info("Copying file " + clusterFolder + fileName);
            DownloadFile(node, clusterFolder + fileName, tmpFile);

            if (copyingOutsToFtp)
            {
                IOProxy.Ftp.UploadLocalFile(tmpFile, outFolderFromSystem, fileName, shouldCreatePath: false);
            }
            else
            {
                File.Copy(tmpFile, outFolderFromSystem + fileName);
            }

            Log.Info("File copied " + fileName);
        }
        catch (Ssh.SshTransferException e)
        {
            Log.Warn(String.Format("During coping file {0} for task {1} from error was happend: {2}", fileName, taskId, e));
        }
        catch (Exception e)
        {
            // Best effort: one failed file must not abort the rest.
            Log.Warn(String.Format("Exception on file '{0}' copy: {1}", clusterFolder + fileName, e));
        }
        finally
        {
            // Always remove the temp file, including when the download or copy failed;
            // a cleanup failure is only logged so the loop keeps going.
            try
            {
                File.Delete(tmpFile);
            }
            catch (Exception cleanupError)
            {
                Log.Warn(String.Format("Unable to delete temp file '{0}': {1}", tmpFile, cleanupError));
            }
        }
    }
}
/// <summary>
/// Resolves the node for a task by forwarding the task's resource and nodes configuration
/// to the two-argument <c>GetNode</c> overload.
/// </summary>
/// <param name="task">Task whose <c>Resource</c> and <c>NodesConfig</c> are used.</param>
/// <returns>The node returned by the two-argument overload.</returns>
protected ResourceNode GetNode(TaskRunContext task)
{
    var resource = task.Resource;
    var nodesConfig = task.NodesConfig;
    return GetNode(resource, nodesConfig);
}
/// <summary>
/// Requests the task's process statistics recorded since <paramref name="startTime"/> from the
/// statistics service of the task's node.
/// </summary>
/// <param name="taskId">Task identifier; it is cast to <c>int</c> for the service call.</param>
/// <param name="startTime">Start time passed to the statistics service.</param>
/// <param name="task">Task used to resolve the node.</param>
/// <returns>
/// A dictionary with one entry keyed by the node name, or an empty dictionary when the service
/// call ended with a <c>FaultException</c>.
/// </returns>
/// <exception cref="Exception">Any failure other than <c>FaultException</c> is logged and rethrown.</exception>
public Dictionary<string, List<ProcessStatInfo>> GetTaskInfoStartWith(ulong taskId, DateTime startTime, TaskRunContext task)
{
    //todo ask about getting all nodes
    var node = GetNode(task);
    var client = GetStatisticsServiceClient(node);
    var result = new Dictionary<string, List<ProcessStatInfo>>();

    try
    {
        var data = client.GetAllTaskInfoStartedWith((int) taskId, startTime);
        result.Add(node.NodeName, data);
    }
    catch (FaultException ex)
    {
        //todo this Exception can be linked with state of a task
        //resolve it later
        // Deliberately not rethrown: the caller gets an empty result.
        Log.Error(" Exception while trying to get cacheable task info " + ex.ToString());
    }
    catch (Exception ex)
    {
        Log.Error(" Exception while trying to get cacheable task info " + ex.ToString());
        // "throw;" keeps the original stack trace; "throw ex;" would reset it to this frame.
        throw;
    }

    return result;
}