/// <summary>
/// Handles one <c>TaskEventMessage</c>: picks the processor that matches the message's
/// <c>EventVerb</c> ("cancel" or "start"), copies this object's state into it when it is a
/// <c>ServerObject</c>, and delegates the work to it.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>TaskEventMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the processor and to the job update.</param>
/// <returns>
/// The processor's result on success; <c>false</c> when an <see cref="OperationCanceledException"/>
/// is caught; <c>true</c> after any other exception has been logged and recorded on the job.
/// </returns>
public override async T.Task<bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    var message = taskItem.GetMessage<TaskEventMessage>();
    this.Logger.Information("Do work for TaskEvent {0}, {1}, {2}, message {3}", message.Id, message.JobType, message.EventVerb, taskItem.Id);

    try
    {
        // TODO: refactor the processor design.
        JobTaskProcessor processor = null;
        switch (message.EventVerb)
        {
            case "cancel":
                processor = this.Provider.GetService<CancelJobOrTaskProcessor>();
                break;

            case "start":
                processor = this.Provider.GetService<StartJobAndTaskProcessor>();
                break;

            default:
                break;
        }

        // An unknown verb (or an unregistered processor) leaves `processor` null. Fail with a
        // descriptive error instead of a NullReferenceException; the catch below still records
        // the failure on the job, now with a message that names the offending verb.
        if (processor == null)
        {
            throw new InvalidOperationException($"No processor available for task event verb '{message.EventVerb}'.");
        }

        if (processor is ServerObject so)
        {
            so.CopyFrom(this);
        }

        var result = await processor.ProcessAsync(message, taskItem.GetInsertionTime(), token);
        this.Logger.Information("Finished process {0} {1} {2}, result {3}", message.EventVerb, message.JobId, message.Id, result);
        return result;
    }
    catch (OperationCanceledException)
    {
        return false;
    }
    catch (Exception ex)
    {
        this.Logger.Error("Exception occurred when process {0}, {1}, {2}, {3}", message.EventVerb, message.JobId, message.Id, ex);
        await this.Utilities.UpdateJobAsync(
            message.JobType,
            message.JobId,
            j =>
            {
                // Keep Canceled/Finished as they are; any other state becomes Failed.
                j.State = (j.State == JobState.Canceled || j.State == JobState.Finished) ? j.State : JobState.Failed;
                (j.Events ?? (j.Events = new List<Event>())).Add(new Event()
                {
                    Content = $"Exception occurred when process job {message.JobId} {message.JobType} {message.EventVerb}. {ex}",
                    Source = EventSource.Job,
                    Type = EventType.Alert,
                });
            },
            token,
            this.Logger);
    }

    return true;
}
/// <summary>
/// Handles one <c>TaskEventMessage</c>: picks the processor that matches the message's
/// <c>EventVerb</c> ("cancel" or "start"), copies this object's state into it when it is a
/// <c>ServerObject</c>, and delegates the work to it.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>TaskEventMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the processor and to <c>FailJobWithEventAsync</c>.</param>
/// <returns>
/// The processor's result on success; <c>false</c> when an <see cref="OperationCanceledException"/>
/// is caught; <c>true</c> after any other exception has been logged and passed to
/// <c>FailJobWithEventAsync</c>.
/// </returns>
public override async T.Task<bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    var message = taskItem.GetMessage<TaskEventMessage>();
    this.Logger.Information("Do work for TaskEvent {0}, {1}, {2}, message {3}", message.Id, message.JobType, message.EventVerb, taskItem.Id);

    try
    {
        // TODO: refactor the processor design.
        JobTaskProcessor processor = null;
        switch (message.EventVerb)
        {
            case "cancel":
                processor = this.Provider.GetService<CancelJobOrTaskProcessor>();
                break;

            case "start":
                processor = this.Provider.GetService<StartJobAndTaskProcessor>();
                break;

            default:
                break;
        }

        // An unknown verb (or an unregistered processor) leaves `processor` null. Fail with a
        // descriptive error instead of a NullReferenceException; the catch below still reports
        // the failure on the job, now with a message that names the offending verb.
        if (processor == null)
        {
            throw new InvalidOperationException($"No processor available for task event verb '{message.EventVerb}'.");
        }

        if (processor is ServerObject so)
        {
            so.CopyFrom(this);
        }

        var result = await processor.ProcessAsync(message, taskItem.GetInsertionTime(), token);
        this.Logger.Information("Finished process {0} {1} {2}, result {3}", message.EventVerb, message.JobId, message.Id, result);
        return result;
    }
    catch (OperationCanceledException)
    {
        return false;
    }
    catch (Exception ex)
    {
        this.Logger.Error("Exception occurred when process {0}, {1}, {2}, {3}", message.EventVerb, message.JobId, message.Id, ex);
        await this.Utilities.FailJobWithEventAsync(
            message.JobType,
            message.JobId,
            $"Exception occurred when process job {message.JobId} {message.JobType} {message.EventVerb}. {ex}",
            token);
    }

    return true;
}
/// <summary>
/// Runs a job to completion for one <c>RunningJobMessage</c>: resolves a
/// <c>JobTaskDispatcherWorker</c>, copies this object's state into it, initializes it for the
/// job named in the message and awaits its <c>DoWorkAsync</c>.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>RunningJobMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the worker.</param>
/// <returns>Always <c>true</c> when the worker returns without throwing.</returns>
public override async T.Task<bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    var request = taskItem.GetMessage<RunningJobMessage>();
    this.Logger.Information(
        "Do work for job {0}, requeueCount {1}, message {2}",
        request.JobId,
        request.RequeueCount,
        taskItem.Id);

    // The worker is resolved per message and inherits this object's state before it starts.
    var dispatcher = this.Provider.GetRequiredService<JobTaskDispatcherWorker>();
    dispatcher.CopyFrom(this);

    await dispatcher.InitializeAsync(request.JobType, request.JobId, token);
    await dispatcher.DoWorkAsync(token);

    this.Logger.Information(
        "Finished running for job {0}, requeueCount {1}, message {2}",
        request.JobId,
        request.RequeueCount,
        taskItem.Id);

    return true;
}
/// <summary>
/// Handles a script sync message by running the diagnostics-script sync and the metric-script
/// sync concurrently.
/// </summary>
/// <param name="taskItem">Queue item carrying a <c>ScriptSyncMessage</c>; may be null.</param>
/// <param name="token">Cancellation token forwarded to both sync operations.</param>
/// <returns>
/// <c>true</c> when both syncs report success, <c>false</c> when either reports failure, and
/// <c>true</c> when an exception was thrown (the exception is logged and swallowed).
/// </returns>
public override async T.Task<bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    this.Logger.Information("Do work for Script sync message {0}", taskItem?.Id);

    try
    {
        // The payload itself is not used, but reading it is kept inside the try so that a
        // failure to read it is logged below exactly as before.
        var syncMessage = taskItem?.GetMessage<ScriptSyncMessage>();

        var diagSync = this.SyncDiagScriptsAsync(token);
        var metricSync = this.SyncMetricScriptsAsync(token);
        var outcomes = await T.Task.WhenAll(diagSync, metricSync);

        return outcomes.All(succeeded => succeeded);
    }
    catch (Exception ex)
    {
        this.Logger.Error("Exception occurred when process script sync message {0}, {1}", taskItem?.Id, ex);
        return true;
    }
}
/// <summary>
/// Dispatches the job named by a <c>JobDispatchMessage</c>: retrieves the job entity from the
/// job table, sets the deserialized job's state to Running, enqueues the serialized
/// <c>InternalJob</c> on the dispatch queue of every target node, and then replaces the table
/// entity.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>JobDispatchMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the table and queue calls.</param>
/// <returns>
/// <c>false</c> when the retrieved result is not a <c>JsonTableEntity</c>; otherwise whether the
/// final replace operation returned a successful status code.
/// </returns>
public override async Task<bool> DoWorkAsync(TaskItem taskItem, CancellationToken token)
{
    var dispatchMessage = taskItem.GetMessage<JobDispatchMessage>();

    using (this.logger.BeginScope("Do work for JobDispatchMessage {0}", dispatchMessage.Id))
    {
        var retrieveJob = TableOperation.Retrieve<JsonTableEntity>(
            this.utilities.GetJobPartitionKey($"{dispatchMessage.Type}", dispatchMessage.Id),
            this.utilities.JobEntryKey);
        var tableResult = await this.jobTable.ExecuteAsync(retrieveJob, null, null, token);
        this.logger.LogInformation("Queried job table entity for job id {0}, result {1}", dispatchMessage.Id, tableResult.HttpStatusCode);

        // Nothing to dispatch when the lookup did not yield a job entity.
        if (!(tableResult.Result is JsonTableEntity jobEntity))
        {
            this.logger.LogWarning("The entity queried is not of <JobTableEntity> type, {0}", tableResult.Result);
            return false;
        }

        var job = jobEntity.GetObject<Job>();
        job.State = JobState.Running;
        var internalJob = InternalJob.CreateFrom(job);

        // One queue message per target node, all sent concurrently.
        var enqueueOperations = internalJob.TargetNodes.Select(async node =>
        {
            var nodeQueue = await this.utilities.GetOrCreateNodeDispatchQueueAsync(node, token);
            var payload = new CloudQueueMessage(JsonConvert.SerializeObject(internalJob));
            await nodeQueue.AddMessageAsync(payload, null, null, null, null, token);
        });
        await Task.WhenAll(enqueueOperations);

        tableResult = await this.jobTable.ExecuteAsync(TableOperation.Replace(jobEntity), null, null, token);
        this.logger.LogInformation("Dispatched job, update job result code {0}", tableResult.HttpStatusCode);
        return tableResult.IsSuccessfulStatusCode();
    }
}
/// <summary>
/// Executes every command line of an <c>InternalJob</c> on this node. For each command it
/// creates an output blob, starts a monitor that appends output to that blob, records the
/// task result as Dispatching, starts the task through the communicator, records it as
/// Running, waits for the monitor's execution result, and records it as Finished.
/// Each state is written to the jobs table first and then to the nodes table.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>InternalJob</c>.</param>
/// <param name="token">Cancellation token forwarded to the blob, table and communicator calls.</param>
/// <returns>
/// <c>true</c> when every command completed and all table writes returned a successful status
/// code; <c>false</c> as soon as any table write for a command was unsuccessful.
/// </returns>
public override async Task<bool> DoWorkAsync(TaskItem taskItem, CancellationToken token)
{
    var job = taskItem.GetMessage<InternalJob>();
    var nodeName = this.Configuration.GetValue<string>(Constants.HpcHostNameEnv);

    using (this.logger.BeginScope("Do work for InternalJob {0} on node {1}", job.Id, nodeName))
    {
        // TODO: make sure invisible.
        logger.LogInformation("Executing job {0}", job.Id);

        var commandRuns = Enumerable.Range(0, job.CommandLines.Length).Select(async taskId =>
        {
            var commandLine = job.CommandLines[taskId];
            logger.LogInformation("Executing command {0}, job {1}", commandLine, job.Id);

            var taskKey = this.utilities.GetTaskKey(job.Id, taskId, job.RequeueCount);
            var resultKey = this.utilities.GetJobResultKey(nodeName, taskKey);
            var outputBlob = await this.utilities.CreateOrReplaceTaskOutputBlobAsync(job.Id, resultKey, token);

            using (var monitor = this.Monitor.StartMonitorTask(taskKey, async (output, cancellationToken) =>
            {
                // Output appends are best-effort: a failure is logged and does not stop the task.
                try
                {
                    await outputBlob.AppendTextAsync(output, Encoding.UTF8, null, null, null, cancellationToken);
                }
                catch (Exception ex)
                {
                    this.logger.LogError(ex, "Error happened when append to blob {0}", outputBlob.Name);
                }
            }))
            {
                this.logger.LogInformation("Call startjobandtask for job {0}, task {1}", job.Id, taskKey);
                var jobPartitionName = this.utilities.GetJobPartitionKey($"{job.Type}", job.Id);
                var nodePartitionName = this.utilities.GetNodePartitionKey(nodeName);

                // Writes the given result to the jobs table and then to the nodes table;
                // stops after the first unsuccessful write and reports false.
                async Task<bool> SaveTaskResultAsync(ComputeNodeTaskCompletionEventArgs args)
                {
                    var jobsEntity = new JsonTableEntity(jobPartitionName, resultKey, args);
                    var saved = await jobsTable.ExecuteAsync(TableOperation.InsertOrReplace(jobsEntity), null, null, token);
                    this.logger.LogInformation("Saved task result {0} to jobs table, status code {1}", resultKey, saved.HttpStatusCode);
                    if (!saved.IsSuccessfulStatusCode())
                    {
                        return false;
                    }

                    var nodesEntity = new JsonTableEntity(nodePartitionName, resultKey, args);
                    saved = await nodesTable.ExecuteAsync(TableOperation.InsertOrReplace(nodesEntity), null, null, token);
                    this.logger.LogInformation("Saved task result {0} to nodes table, status code {1}", resultKey, saved.HttpStatusCode);
                    return saved.IsSuccessfulStatusCode();
                }

                var dispatchingArgs = new ComputeNodeTaskCompletionEventArgs(nodeName, job.Id, null) { State = TaskState.Dispatching };
                if (!await SaveTaskResultAsync(dispatchingArgs))
                {
                    return false;
                }

                await this.communicator.StartJobAndTaskAsync(
                    nodeName,
                    new StartJobAndTaskArg(new int[0], job.Id, taskId),
                    "",
                    "",
                    new ProcessStartInfo(
                        commandLine,
                        "",
                        "",
                        $"{this.communicator.Options.AgentUriBase}/output/{taskKey}",
                        "",
                        new System.Collections.Hashtable(),
                        new long[0],
                        job.RequeueCount),
                    token);

                var runningArgs = new ComputeNodeTaskCompletionEventArgs(nodeName, job.Id, null) { State = TaskState.Running };
                if (!await SaveTaskResultAsync(runningArgs))
                {
                    return false;
                }

                this.logger.LogInformation("Wait for response for job {0}, task {1}", job.Id, taskKey);
                ComputeNodeTaskCompletionEventArgs finishedArgs = await monitor.Execution;

                this.logger.LogInformation("Saving result for job {0}, task {1}", job.Id, taskKey);
                finishedArgs.State = TaskState.Finished;
                return await SaveTaskResultAsync(finishedArgs);
            }
        });

        var outcomes = await Task.WhenAll<bool>(commandRuns);
        return outcomes.All(succeeded => succeeded);
    }
}
/// <summary>
/// Processes a batch of task completion messages. The messages are grouped by job partition key
/// and, for each job, this method: updates the job's completed-task count, optionally runs the
/// diagnostics task-result filter script over the completed tasks, fails the job when a task
/// exited non-zero and <c>FailJobOnTaskFailure</c> is set, and finally removes the completed
/// tasks from the parent lists of their children, dispatching every child that has no parents left.
/// </summary>
/// <param name="taskItem">Item whose message is an array of <c>TaskItem</c>, each carrying a <c>TaskCompletionMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the table, blob, file and queue calls.</param>
/// <returns>
/// <c>true</c> when every job group reported success; <c>false</c> when any task result hook
/// returned <c>false</c> for some group.
/// </returns>
public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    var taskItems = taskItem.GetMessage <TaskItem[]>();

    // Lazy projection: the per-message log line is written when the sequence is enumerated.
    var messages = taskItems.Select(ti =>
    {
        var msg = ti.GetMessage <TaskCompletionMessage>();
        this.Logger.Information("Do work for job {0}, task {1}, message {2}", msg.JobId, msg.Id, ti.Id);
        return(msg);
    });

    var jobGroups = messages.GroupBy(msg => this.Utilities.GetJobPartitionKey(msg.JobType, msg.JobId));

    // Each job group is processed concurrently.
    var results = await T.Task.WhenAll(jobGroups.Select(async jg =>
    {
        var jobPartitionKey = jg.Key;
        this.Logger.Information("Do work for job {0}, tasks finished: {1}", jobPartitionKey, string.Join(",", jg.Select(t => t.Id)));
        var job = await this.jobsTable.RetrieveAsync <Job>(jobPartitionKey, this.Utilities.JobEntryKey, token);

        // Completions for a missing or non-running job are ignored and count as handled.
        if (job == null || job.State != JobState.Running)
        {
            this.Logger.Warning("Skip processing the task completion of {0}. Job state {1}.", jobPartitionKey, job?.State);
            return(true);
        }

        // Messages whose requeue count differs from the job's are only logged, not processed.
        var skippedTasks = string.Join(",", jg.Where(msg => msg.RequeueCount != job.RequeueCount).Select(msg => $"{msg.Id}.{msg.RequeueCount}"));
        if (!string.IsNullOrEmpty(skippedTasks))
        {
            this.Logger.Warning("Skip processing the task completion, job requeueCount {0}, tasks {1}.", job.RequeueCount, skippedTasks);
        }

        var tasks = jg.Where(msg => msg.RequeueCount == job.RequeueCount).ToList();

        // Ids 0 and int.MaxValue are excluded from the count; int.MaxValue is treated as the end
        // task further below, 0 is presumably the matching start task — TODO confirm.
        var completedCount = tasks.Count(t => t.Id != 0 && t.Id != int.MaxValue);
        if (completedCount > 0)
        {
            await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
            {
                // Capped at TaskCount.
                j.CompletedTaskCount = Math.Min(j.CompletedTaskCount + completedCount, j.TaskCount);
            }, token, this.Logger);
        }

        if (job.Type == JobType.Diagnostics)
        {
            string diagKey = job.DiagnosticTest.Category + job.DiagnosticTest.Name;

            // Diagnostics test definitions are cached in diagTests by category + name.
            if (!this.diagTests.TryGetValue(diagKey, out InternalDiagnosticsTest diagTest))
            {
                diagTest = await this.jobsTable.RetrieveAsync <InternalDiagnosticsTest>(
                    this.Utilities.GetDiagPartitionKey(job.DiagnosticTest.Category),
                    job.DiagnosticTest.Name,
                    token);
                this.diagTests.TryAdd(diagKey, diagTest);
            }

            if (diagTest?.TaskResultFilterScript?.Name != null)
            {
                // The filter script text is cached in taskFilterScript under the same key.
                if (!this.taskFilterScript.TryGetValue(diagKey, out string script))
                {
                    var scriptBlob = this.Utilities.GetBlob(diagTest.TaskResultFilterScript.ContainerName, diagTest.TaskResultFilterScript.Name);
                    using (var stream = new MemoryStream())
                    {
                        await scriptBlob.DownloadToStreamAsync(stream, null, null, null, token);
                        stream.Seek(0, SeekOrigin.Begin);
                        using (StreamReader sr = new StreamReader(stream, true))
                        {
                            script = await sr.ReadToEndAsync();
                        }
                    }

                    this.taskFilterScript.TryAdd(diagKey, script);
                }

                // The script is written to a temp file for the hooks and removed afterwards,
                // even when a hook throws.
                var path = Path.GetTempFileName();
                try
                {
                    await File.WriteAllTextAsync(path, script, token);
                    var hookResults = await T.Task.WhenAll(tasks.Select(tid => this.TaskResultHook(job, tid.Id, path, token)));
                    if (hookResults.Any(r => !r))
                    {
                        return(false);
                    }
                }
                finally
                {
                    File.Delete(path);
                }
            }
        }

        if (job.FailJobOnTaskFailure && tasks.Any(t => t.ExitCode != 0))
        {
            this.Logger.Information("Fail the job because some tasks failed {0}", job.Id);
            await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
            {
                j.State = JobState.Failed;
                (j.Events ?? (j.Events = new List <Event>())).Add(new Event() { Content = $"Fail the job because some tasks failed", Source = EventSource.Job, Type = EventType.Alert });
            }, token, this.Logger);
            return(true);
        }

        // Child ids come from the message when present, otherwise they are loaded via Utilities.
        var childIds = await T.Task.WhenAll(tasks.Select(async t => new { t.Id, ChildIds = t.ChildIds ?? await this.Utilities.LoadTaskChildIdsAsync(t.Id, job.Id, job.RequeueCount, token) }));

        foreach (var cids in childIds)
        {
            this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} completed, child ids {4}", job.Type, job.Id, job.RequeueCount, cids.Id, string.Join(",", cids.ChildIds));
        }

        // Invert parent -> children into child -> completed parents, so each child record is
        // updated once per batch.
        var childIdGroups = childIds
            .SelectMany(ids => ids.ChildIds.Select(cid => new { ParentId = ids.Id, ChildId = cid }))
            .GroupBy(idPair => idPair.ChildId)
            .Select(g => new { ChildId = g.Key, Count = g.Count(), ParentIds = g.Select(idPair => idPair.ParentId).ToList() }).ToList();

        var childResults = await T.Task.WhenAll(childIdGroups.Select(async cid =>
        {
            this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} has {4} ancestor tasks completed {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, cid.Count, string.Join(",", cid.ParentIds));
            var childTaskKey = this.Utilities.GetTaskKey(job.Id, cid.ChildId, job.RequeueCount);

            // These three locals are assigned inside the update callback below and read after it.
            // NOTE(review): `Task` here appears to be the project's task record type (it exposes
            // Node, JobId, JobType), distinct from `T.Task`.
            bool unlocked = false;
            bool isEndTask = false;
            Task childTask = null;

            if (!await this.Utilities.UpdateTaskAsync(jobPartitionKey, childTaskKey, t =>
            {
                // The remaining parent ids are stored compressed as a comma-separated list.
                var unzippedParentIds = Compress.UnZip(t.ZippedParentIds);
                this.Logger.Information("{0} Job {1} task {2}, ZippedParentIds {3}, unzipped {4}", job.Type, job.Id, cid.ChildId, t.ZippedParentIds, unzippedParentIds);
                HashSet <int> parentIds;
                try
                {
                    parentIds = new HashSet <int>(unzippedParentIds.Split(',', StringSplitOptions.RemoveEmptyEntries).Select(_ => int.Parse(_)));
                }
                catch (FormatException ex)
                {
                    // Logged for diagnosis, then rethrown.
                    this.Logger.Error("Error happened {0}, input string {1}, len {2}", ex, unzippedParentIds, unzippedParentIds.Length);
                    throw;
                }

                var oldParentIdsCount = parentIds.Count;
                this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} has {4} parent tasks {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, oldParentIdsCount, string.Join(",", parentIds));
                cid.ParentIds.ForEach(_ => parentIds.Remove(_));
                var newParentIdsStr = string.Join(",", parentIds);
                this.Logger.Information("{0} Job {1} requeuecount {2}, after remove, task {3} has {4} parent tasks {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, parentIds.Count, newParentIdsStr);

                // Sanity check only: warns when some completed parent was not in the stored set.
                if (parentIds.Count + cid.Count != oldParentIdsCount)
                {
                    this.Logger.Warning("{0} Job {1} requeuecount {2}, task {3}, ids mismatch!", job.Type, job.Id, job.RequeueCount, cid.ChildId);
                }

                t.ZippedParentIds = Compress.GZip(newParentIdsStr);

                // A child is unlocked once no parents remain; the end task (id int.MaxValue)
                // goes straight to Finished, any other task to Dispatching.
                unlocked = parentIds.Count == 0;
                isEndTask = t.Id == int.MaxValue;
                if (unlocked)
                {
                    t.State = isEndTask ? TaskState.Finished : TaskState.Dispatching;
                }

                childTask = t;
            }, token, this.Logger))
            {
                // The task record could not be updated: fail the job, but report the item as handled.
                await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                {
                    j.State = JobState.Failed;
                    // TODO: make event separate.
                    (j.Events ?? (j.Events = new List <Event>())).Add(new Event() { Content = $"Unable to update task record {cid.ChildId}", Source = EventSource.Job, Type = EventType.Alert });
                }, token, this.Logger);
                return(true);
            }

            if (unlocked)
            {
                if (isEndTask)
                {
                    // End task unlocked: move a Running job to Finishing and enqueue a "finish" job event.
                    await this.Utilities.UpdateJobAsync(job.Type, job.Id, j => j.State = j.State == JobState.Running ? JobState.Finishing : j.State, token, this.Logger);
                    var jobEventQueue = this.Utilities.GetJobEventQueue();
                    await jobEventQueue.AddMessageAsync(
                        // todo: event message generation.
                        new CloudQueueMessage(JsonConvert.SerializeObject(new JobEventMessage() { Id = job.Id, EventVerb = "finish", Type = job.Type })),
                        null, null, null, null, token);
                }
                else
                {
                    // Regular task unlocked: enqueue a "start" event on the dispatch queue of its node.
                    var queue = this.Utilities.GetNodeDispatchQueue(childTask.Node);
                    await queue.AddMessageAsync(
                        new CloudQueueMessage(JsonConvert.SerializeObject(new TaskEventMessage() { EventVerb = "start", Id = childTask.Id, JobId = childTask.JobId, JobType = childTask.JobType, RequeueCount = job.RequeueCount }, Formatting.Indented)),
                        null, null, null, null, token);
                    this.Logger.Information("Dispatched job {0} task {1} to node {2}", childTask.JobId, childTask.Id, childTask.Node);
                }
            }

            return(true);
        }));

        return(childResults.All(r => r));
    }));

    return(results.All(r => r));
}
/// <summary>
/// Handles one <c>JobEventMessage</c>: loads the job, resolves the action handler registered
/// for the message's <c>EventVerb</c> and the type handler registered for the job type, and lets
/// the action handler process the job. When no handlers can be resolved, or when processing
/// throws, the job is marked Failed with an alert event.
/// </summary>
/// <param name="taskItem">Queue item carrying the <c>JobEventMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the table, handler and job update calls.</param>
/// <returns><c>false</c> when the job could not be retrieved; otherwise <c>true</c>.</returns>
public override async T.Task<bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    var jobEvent = taskItem.GetMessage<JobEventMessage>();
    this.Logger.Information("Do work for JobEvent {0}, {1}, {2}, message {3}", jobEvent.Id, jobEvent.Type, jobEvent.EventVerb, taskItem.Id);

    var partitionKey = this.Utilities.GetJobPartitionKey(jobEvent.Type, jobEvent.Id);
    var entryKey = this.Utilities.JobEntryKey;
    var job = await this.jobsTable.RetrieveAsync<Job>(partitionKey, entryKey, token);
    this.Logger.Information("Queried job table entity for job id {0}", jobEvent.Id);

    // A job event for a job that cannot be loaded is unexpected.
    if (job == null)
    {
        Debug.Assert(false);
        this.Logger.Warning("The entity queried is not of job type, {0}", jobEvent.Id);
        return false;
    }

    try
    {
        IJobActionHandler actionHandler = null;
        IJobTypeHandler typeHandler = null;

        // Both the handler types and their service instances must resolve; evaluation stops at
        // the first step that fails.
        bool handlersResolved =
            this.ActionHandlerTypes.TryGetValue(jobEvent.EventVerb, out Type actionType)
            && this.JobTypeHandlers.TryGetValue(jobEvent.Type, out Type jobHandlerType)
            && (actionHandler = (IJobActionHandler)this.Provider.GetService(actionType)) != null
            && (typeHandler = (IJobTypeHandler)this.Provider.GetService(jobHandlerType)) != null;

        if (!handlersResolved)
        {
            this.Logger.Warning("No processors found for job type {0}, {1}, {2}", job.Type, job.Id, jobEvent.EventVerb);
            await this.Utilities.UpdateJobAsync(
                job.Type,
                job.Id,
                j =>
                {
                    j.State = JobState.Failed;
                    (j.Events ?? (j.Events = new List<Event>())).Add(new Event()
                    {
                        Content = $"No processors found for job type {j.Type}, event {jobEvent.EventVerb}",
                        Source = EventSource.Job,
                        Type = EventType.Alert,
                    });
                },
                token,
                this.Logger);
        }
        else
        {
            ((ServerObject)actionHandler).CopyFrom(this);
            ((ServerObject)typeHandler).CopyFrom(this);
            actionHandler.JobTypeHandler = typeHandler;

            await actionHandler.ProcessAsync(job, jobEvent, token);
            this.Logger.Information("Processed {0} job {1} {2}", job.Type, job.Id, job.State);
        }
    }
    catch (Exception ex)
    {
        this.Logger.Error("Exception occurred when process job {0}, {1}, {2}, {3}", job.Id, job.Type, jobEvent.EventVerb, ex);
        await this.Utilities.UpdateJobAsync(
            job.Type,
            job.Id,
            j =>
            {
                j.State = JobState.Failed;
                (j.Events ?? (j.Events = new List<Event>())).Add(new Event()
                {
                    Content = $"Exception occurred when process job {job.Id} {job.Type} {jobEvent.EventVerb}. {ex}",
                    Source = EventSource.Job,
                    Type = EventType.Alert,
                });
            },
            token,
            this.Logger);
    }

    return true;
}
/// <summary>
/// Processes one batch of task completion messages for the job this object is tracking.
/// In order, it: deletes the pending timeout messages of tasks that completed without timing
/// out, updates the task states held in <c>tasksDict</c>, marks timed-out tasks Canceled in
/// storage, refreshes job progress on every tenth batch, optionally runs the diagnostics
/// task-result filter script, fails the job when a task exited non-zero and
/// <c>FailJobOnTaskFailure</c> is set, and finally removes the completed tasks from the
/// remaining-parent sets of their children, dispatching every child that has no parents left.
/// </summary>
/// <remarks>
/// NOTE(review): <c>job</c> and <c>jobPartitionKey</c> are not declared in this method; they are
/// presumably members of the enclosing class (the method also uses <c>this.job</c> and
/// <c>this.jobPartitionKey</c>) — confirm.
/// </remarks>
/// <param name="taskItem">Item whose message is an array of <c>TaskItem</c>, each carrying a <c>TaskCompletionMessage</c>.</param>
/// <param name="token">Cancellation token forwarded to the queue, table, blob and file calls.</param>
/// <returns>Always <c>true</c> when the method returns without throwing.</returns>
public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
{
    this.batchId++;
    var taskItems = taskItem.GetMessage <TaskItem[]>();
    this.Logger.Information("Entering batch {0}, size {1}", this.batchId, taskItems.Length);

    // Materialized with ToList so the per-message log line is written exactly once.
    var messages = taskItems.Select(ti =>
    {
        var msg = ti.GetMessage <TaskCompletionMessage>();
        this.Logger.Information(" Do work for job {0}, task {1}, message {2}", msg.JobId, msg.Id, ti.Id);
        return(msg);
    }).ToList();

    this.Logger.Information("Do work for job {0}, tasks finished: {1}", jobPartitionKey, string.Join(",", messages.Select(t => t.Id)));

    // Messages whose requeue count differs from the job's are only logged, not processed.
    var skippedTasks = string.Join(",", messages.Where(msg => msg.RequeueCount != job.RequeueCount).Select(msg => $"{msg.Id}.{msg.RequeueCount}"));
    if (!string.IsNullOrEmpty(skippedTasks))
    {
        this.Logger.Warning("Skip processing the task completion, job requeueCount {0}, tasks {1}.", job.RequeueCount, skippedTasks);
    }

    var tasks = messages.Where(msg => msg.RequeueCount == job.RequeueCount).ToList();

    this.Logger.Information("Deleting timeout guard {0}", jobPartitionKey);
    var jobTaskCompletionQueue = this.Utilities.GetJobTaskCompletionQueue(job.Id);

    // Tasks that completed on their own no longer need their pending timeout messages.
    await T.Task.WhenAll(tasks.Where(t => !t.Timeouted).Select(async t =>
    {
        // Task id 0 is skipped here (presumably the start task, which has no timeout messages — TODO confirm).
        if (t.Id == 0)
        {
            return;
        }

        // Removes the remembered timeout message for `id` from `dict` and deletes it from `queue`.
        // A not-found error is logged as information, a cancellation is ignored, and any other
        // storage error is logged as a warning.
        async T.Task DeleteTimeoutAsync(ConcurrentDictionary <int, CloudQueueMessage> dict, int jobId, int id, CloudQueue queue)
        {
            if (dict.TryRemove(id, out var msg))
            {
                try
                {
                    await queue.DeleteMessageAsync(msg.Id, msg.PopReceipt, null, null, token);
                    this.Logger.Information(" Deleted {0} timeout message for job {1}, task {2}, message {3}", queue.Name, jobId, id, msg.Id);
                }
                catch (StorageException ex)
                {
                    if (ex.IsNotFound())
                    {
                        this.Logger.Information(" Not found the {0} timeout message {1} for job {2}, task {3}", queue.Name, msg.Id, jobId, id);
                    }
                    else if (ex.IsCancellation())
                    {
                        return;
                    }
                    else
                    {
                        this.Logger.Warning(ex, " Unable to delete the {0} timeout message {1} for job {2}, task {3}", queue.Name, msg.Id, jobId, id);
                    }
                }
            }
            else
            {
                // NOTE(review): this branch logs the captured job.Id / t.Id rather than the
                // jobId / id parameters, and says "node timeout" for either dictionary.
                this.Logger.Information(" Cannot find the node timeout message in memory for job {0}, task {1}", job.Id, t.Id);
            }
        }

        if (!this.tasksDict.TryGetValue(t.Id, out Task tt))
        {
            this.Logger.Information(" Cannot find task for job {0}, task {1}", job.Id, t.Id);
            return;
        }

        // Two timeout messages are tracked per task: one on the node cancel queue and one on the
        // job's task completion queue.
        var nodeCancelQueue = this.Utilities.GetNodeCancelQueue(tt.Node);
        await T.Task.WhenAll(DeleteTimeoutAsync(this.taskNodeTimeoutMessages, job.Id, t.Id, nodeCancelQueue), DeleteTimeoutAsync(this.taskTimeoutMessages, job.Id, t.Id, jobTaskCompletionQueue));
    }));

    this.Logger.Information("Updating tasks state in memory {0}", jobPartitionKey);
    foreach (var t in tasks)
    {
        // Indexer lookup: unlike the TryGetValue above, this throws when the task id is not in tasksDict.
        var tt = this.tasksDict[t.Id];
        this.Logger.Information(" {0} Job {1} requeuecount {2}, task {3} on {4} completed, timeout {5}, currentState {6}, child ids {7}", job.Type, job.Id, job.RequeueCount, t.Id, tt.Node, t.Timeouted, tt.State, string.Join(",", this.tasksDict[t.Id].ChildIds));

        // A task already Finished, Canceled or Failed keeps its state; otherwise a timeout maps to
        // Canceled, exit code 0 to Finished and any other exit code to Failed.
        bool alreadyFinished = tt.State == TaskState.Finished || tt.State == TaskState.Canceled || tt.State == TaskState.Failed;
        tt.State = alreadyFinished ? tt.State : (t.Timeouted ? TaskState.Canceled : (t.ExitCode == 0 ? TaskState.Finished : TaskState.Failed));
    }

    this.Logger.Information("Updating tasks state for timeouted tasks in storage {0}", jobPartitionKey);
    await T.Task.WhenAll(tasks.Where(t => t.Timeouted).Select(async t =>
    {
        var key = this.Utilities.GetTaskKey(job.Id, t.Id, job.RequeueCount);
        TaskState state = TaskState.Canceled;
        await this.Utilities.UpdateTaskAsync(this.jobPartitionKey, key, task =>
        {
            // Only a task not yet Canceled, Failed or Finished is moved to Canceled; the
            // resulting state is captured for the log line below.
            state = task.State = task.State != TaskState.Canceled && task.State != TaskState.Failed && task.State != TaskState.Finished ? TaskState.Canceled : task.State;
        }, token, this.Logger);
        this.Logger.Information(" Updated {0}, task {1} state to {2}", job.Id, t.Id, state);
    }));

    // Job progress is refreshed on every tenth batch.
    if (this.batchId % 10 == 0)
    {
        await this.UpdateJobProgress(token);
    }

    // NOTE(review): `job` has already been dereferenced above, so the null-conditional here
    // cannot be what guards against a null job.
    if (job?.State != JobState.Running)
    {
        this.shouldExit = true;
        this.Logger.Warning("Skip processing the task completion of {0}. Job state {1}.", jobPartitionKey, job?.State);
        return(true);
    }

    if (job.Type == JobType.Diagnostics)
    {
        this.Logger.Information("Processing task filters for job {0}", jobPartitionKey);
        string diagKey = job.DiagnosticTest.Category + job.DiagnosticTest.Name;

        // Diagnostics test definitions are cached in diagTests by category + name.
        if (!this.diagTests.TryGetValue(diagKey, out InternalDiagnosticsTest diagTest))
        {
            diagTest = await this.jobsTable.RetrieveAsync <InternalDiagnosticsTest>(
                this.Utilities.GetDiagPartitionKey(job.DiagnosticTest.Category),
                job.DiagnosticTest.Name,
                token);
            this.diagTests.TryAdd(diagKey, diagTest);
        }

        if (diagTest?.TaskResultFilterScript?.Name != null && diagTest.RunTaskResultFilter)
        {
            this.Logger.Information("Run task filters for job {0}", jobPartitionKey);

            // The filter script text is cached in taskFilterScript under the same key.
            if (!this.taskFilterScript.TryGetValue(diagKey, out string script))
            {
                var scriptBlob = this.Utilities.GetBlob(diagTest.TaskResultFilterScript.ContainerName, diagTest.TaskResultFilterScript.Name);
                using (var stream = new MemoryStream())
                {
                    await scriptBlob.DownloadToStreamAsync(stream, null, null, null, token);
                    stream.Seek(0, SeekOrigin.Begin);
                    using (StreamReader sr = new StreamReader(stream, true))
                    {
                        script = await sr.ReadToEndAsync();
                    }
                }

                this.taskFilterScript.TryAdd(diagKey, script);
            }

            // The script is written to a temp file for the hooks and removed afterwards.
            // The hook results are awaited but not inspected here.
            var path = Path.GetTempFileName();
            try
            {
                await File.WriteAllTextAsync(path, script, token);
                await T.Task.WhenAll(tasks.Select(tid => this.TaskResultHook(job, tid.Id, path, token)));
            }
            finally
            {
                File.Delete(path);
            }
        }
    }

    this.Logger.Information("Check FailOnTaskFailure for job {0}", jobPartitionKey);
    if (job.FailJobOnTaskFailure && tasks.Any(t => t.ExitCode != 0))
    {
        this.Logger.Information("Fail the job because some tasks failed {0}", job.Id);
        await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
        {
            j.State = JobState.Failed;
            (j.Events ?? (j.Events = new List <Event>())).Add(new Event() { Content = $"Fail the job because some tasks failed", Source = EventSource.Job, Type = EventType.Alert });
            // Keep the in-memory job in step with the updated record.
            this.job = j;
        }, token, this.Logger);
        return(true);
    }

    this.Logger.Information("Fetching finished tasks for job {0}", jobPartitionKey);
    var finishedTasks = tasks.Select(t => this.tasksDict[t.Id]);

    this.Logger.Information("Converting to child Ids view for job {0}", jobPartitionKey);

    // Invert parent -> children into child -> completed parents, so each child is visited once per batch.
    var childIdGroups = finishedTasks
        .SelectMany(ids => ids.ChildIds.Select(cid => new { ParentId = ids.Id, ChildId = cid }))
        .GroupBy(idPair => idPair.ChildId)
        .Select(g => new { ChildId = g.Key, Count = g.Count(), ParentIds = g.Select(idPair => idPair.ParentId).ToList() }).ToList();
    this.Logger.Information("Converted to child Ids view for job {0}, children count {1}", jobPartitionKey, childIdGroups.Count);

    await T.Task.WhenAll(childIdGroups.Select(async cid =>
    {
        this.Logger.Information(" {0} Job {1} requeuecount {2}, task {3} has {4} ancestor tasks completed {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, cid.Count, string.Join(",", cid.ParentIds));
        var toBeUnlocked = this.tasksDict[cid.ChildId];
        bool isEndTask = toBeUnlocked.Id == int.MaxValue;

        // Only Queued tasks are candidates for dispatch; the end task is handled regardless of its state.
        if (!isEndTask && (toBeUnlocked.State != TaskState.Queued))
        {
            this.Logger.Information(" {0} Job {1} requeuecount {2}, task {3} is in state {4}, skip dispatching.", job.Type, job.Id, job.RequeueCount, cid.ChildId, toBeUnlocked.State);
            return;
        }

        var oldParentIdsCount = toBeUnlocked.RemainingParentIds.Count;
        var oldParents = string.Join(',', toBeUnlocked.RemainingParentIds);
        cid.ParentIds.ForEach(pid => toBeUnlocked.RemainingParentIds.Remove(pid));

        // NOTE(review): the template labels argument {1} "requeueCount" but job.Request is passed
        // (here and in the warning below); the other log lines pass job.RequeueCount — confirm.
        this.Logger.Information(" Job {0}, requeueCount {1}, task {2} had {3} parents, remaining {4} parents, removed {5}", job.Id, job.Request, cid.ChildId, oldParentIdsCount, toBeUnlocked.RemainingParentIds.Count, cid.ParentIds.Count);

        // Sanity check only: warns when some completed parent was not in the remaining set.
        if (cid.ParentIds.Count + toBeUnlocked.RemainingParentIds.Count != oldParentIdsCount)
        {
            this.Logger.Warning(" Job {0}, requeueCount {1}, task {2} mismatch! old {3}, remaining {4}, removed {5}.", job.Id, job.Request, cid.ChildId, oldParents, string.Join(',', toBeUnlocked.RemainingParentIds), string.Join(',', cid.ParentIds));
        }

        if (toBeUnlocked.RemainingParentIds.Count == 0)
        {
            // unlocked
            var targetState = isEndTask ? TaskState.Finished : TaskState.Dispatching;
            var childTaskKey = this.Utilities.GetTaskKey(job.Id, cid.ChildId, job.RequeueCount);
            Task childTask = toBeUnlocked;
            if (isEndTask)
            {
                // End task unlocked: refresh progress, move a Running job to Finishing, enqueue a
                // "finish" job event and set shouldExit.
                await this.UpdateJobProgress(token);
                await this.Utilities.UpdateJobAsync(job.Type, job.Id, j => j.State = j.State == JobState.Running ? JobState.Finishing : j.State, token, this.Logger);
                var jobEventQueue = this.Utilities.GetJobEventQueue();
                await jobEventQueue.AddMessageAsync(
                    // todo: event message generation.
                    new CloudQueueMessage(JsonConvert.SerializeObject(new JobEventMessage() { Id = job.Id, EventVerb = "finish", Type = job.Type })),
                    null, null, null, null, token);
                this.shouldExit = true;
            }
            else
            {
                // Enqueues the "start" event on the dispatch queue of the task's node.
                // NOTE(review): MaximumRuntimeSeconds is passed as the second argument here but as
                // the third in cancel()/complete(); presumably timeToLive versus
                // initialVisibilityDelay of CloudQueue.AddMessageAsync — confirm.
                async T.Task dispatch()
                {
                    var dispatchQueue = this.Utilities.GetNodeDispatchQueue(childTask.Node);
                    await dispatchQueue.AddMessageAsync(
                        new CloudQueueMessage(JsonConvert.SerializeObject(new TaskEventMessage() { EventVerb = "start", Id = childTask.Id, JobId = childTask.JobId, JobType = childTask.JobType, RequeueCount = job.RequeueCount }, Formatting.Indented)),
                        TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds), null, null, null, token);
                };

                // Enqueues a "timeout" event on the node cancel queue and remembers the message in
                // taskNodeTimeoutMessages, unless one is already remembered for this task.
                async T.Task cancel()
                {
                    if (!this.taskNodeTimeoutMessages.ContainsKey(childTask.Id))
                    {
                        var taskTimeoutMessage = new CloudQueueMessage(
                            JsonConvert.SerializeObject(new TaskEventMessage() { EventVerb = "timeout", Id = childTask.Id, JobId = childTask.JobId, JobType = childTask.JobType, RequeueCount = job.RequeueCount }, Formatting.Indented));
                        var cancelQueue = this.Utilities.GetNodeCancelQueue(childTask.Node);
                        await cancelQueue.AddMessageAsync(
                            taskTimeoutMessage, null, TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds), null, null, token);
                        this.taskNodeTimeoutMessages.TryAdd(childTask.Id, taskTimeoutMessage);
                    }
                    else
                    {
                        this.Logger.Warning(" Cannot add taskNodeTimeout for job {0} task {1}", job.Id, childTask.Id);
                    }
                };

                // Enqueues a completion message flagged Timeouted (ExitCode -1) on the job's task
                // completion queue and remembers it in taskTimeoutMessages, unless one is already
                // remembered for this task.
                async T.Task complete()
                {
                    if (!this.taskTimeoutMessages.ContainsKey(childTask.Id))
                    {
                        var taskTimeoutMessage = new CloudQueueMessage(
                            JsonConvert.SerializeObject(new TaskCompletionMessage() { ChildIds = childTask.ChildIds, ExitCode = -1, Id = childTask.Id, JobId = childTask.JobId, JobType = childTask.JobType, RequeueCount = childTask.RequeueCount, Timeouted = true }, Formatting.Indented));
                        await jobTaskCompletionQueue.AddMessageAsync(
                            taskTimeoutMessage, null, TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds), null, null, token);
                        this.taskTimeoutMessages.TryAdd(childTask.Id, taskTimeoutMessage);
                    }
                    else
                    {
                        this.Logger.Warning(" Cannot add taskTimeout for job {0} task {1}", job.Id, childTask.Id);
                    }
                };

                // The dispatch and both timeout guards are enqueued concurrently.
                await T.Task.WhenAll(dispatch(), cancel(), complete());
                this.Logger.Information(" Dispatched job {0} task {1} to node {2}", childTask.JobId, childTask.Id, childTask.Node);
            }

            // The in-memory state is updated only after the queue operations above have completed.
            toBeUnlocked.State = targetState;
            this.Logger.Information(" Updated job {0} task {1} state to {2} in memory", childTask.JobId, childTask.Id, childTask.State);
        }
    }));

    this.Logger.Information("Finished to process the batch of tasks of job {0}", jobPartitionKey);
    return(true);
}