예제 #1
0
        /// <summary>
        /// Handles one task event message by handing it to the processor that matches its event verb
        /// ("cancel" or "start").
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is a <c>TaskEventMessage</c>.</param>
        /// <param name="token">Token passed to the processor and to the job update on failure.</param>
        /// <returns>
        /// The processor's result when it completes; <c>false</c> when an
        /// <see cref="OperationCanceledException"/> is caught; <c>true</c> after any other exception has been
        /// logged and recorded on the job.
        /// </returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            var message = taskItem.GetMessage <TaskEventMessage>();

            this.Logger.Information("Do work for TaskEvent {0}, {1}, {2}, message {3}", message.Id, message.JobType, message.EventVerb, taskItem.Id);

            try
            {
                // TODO: refactor the processor design.
                JobTaskProcessor processor = null;
                switch (message.EventVerb)
                {
                case "cancel":
                    processor = this.Provider.GetService <CancelJobOrTaskProcessor>();
                    break;

                case "start":
                    processor = this.Provider.GetService <StartJobAndTaskProcessor>();
                    break;

                default:
                    break;
                }

                // An unrecognised verb leaves processor null (as would a GetService call that yields null).
                // Raise a descriptive error instead of letting the call below fail with a
                // NullReferenceException; the generic catch still records the failure on the job.
                if (processor == null)
                {
                    throw new InvalidOperationException($"No processor found for task event verb '{message.EventVerb}'.");
                }

                if (processor is ServerObject so)
                {
                    so.CopyFrom(this);
                }

                var result = await processor.ProcessAsync(message, taskItem.GetInsertionTime(), token);

                this.Logger.Information("Finished process {0} {1} {2}, result {3}", message.EventVerb, message.JobId, message.Id, result);
                return(result);
            }
            catch (OperationCanceledException) { return(false); }
            catch (Exception ex)
            {
                this.Logger.Error("Exception occurred when process {0}, {1}, {2}, {3}", message.EventVerb, message.JobId, message.Id, ex);
                await this.Utilities.UpdateJobAsync(message.JobType, message.JobId, j =>
                {
                    // Keep a Canceled or Finished state as is; any other state becomes Failed.
                    j.State = (j.State == JobState.Canceled || j.State == JobState.Finished) ? j.State : JobState.Failed;
                    (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                    {
                        Content = $"Exception occurred when process job {message.JobId} {message.JobType} {message.EventVerb}. {ex}",
                        Source  = EventSource.Job,
                        Type    = EventType.Alert,
                    });
                }, token, this.Logger);
            }

            return(true);
        }
예제 #2
0
        /// <summary>
        /// Handles one task event message by handing it to the processor that matches its event verb
        /// ("cancel" or "start").
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is a <c>TaskEventMessage</c>.</param>
        /// <param name="token">Token passed to the processor and to the fail-job call.</param>
        /// <returns>
        /// The processor's result when it completes; <c>false</c> when an
        /// <see cref="OperationCanceledException"/> is caught; <c>true</c> after any other exception has been
        /// logged and reported through <c>FailJobWithEventAsync</c>.
        /// </returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            var message = taskItem.GetMessage <TaskEventMessage>();

            this.Logger.Information("Do work for TaskEvent {0}, {1}, {2}, message {3}", message.Id, message.JobType, message.EventVerb, taskItem.Id);

            try
            {
                // TODO: refactor the processor design.
                JobTaskProcessor processor = null;
                switch (message.EventVerb)
                {
                case "cancel":
                    processor = this.Provider.GetService <CancelJobOrTaskProcessor>();
                    break;

                case "start":
                    processor = this.Provider.GetService <StartJobAndTaskProcessor>();
                    break;

                default:
                    break;
                }

                // An unrecognised verb leaves processor null (as would a GetService call that yields null).
                // Raise a descriptive error instead of letting the call below fail with a
                // NullReferenceException; the generic catch still reports the failure on the job.
                if (processor == null)
                {
                    throw new InvalidOperationException($"No processor found for task event verb '{message.EventVerb}'.");
                }

                if (processor is ServerObject so)
                {
                    so.CopyFrom(this);
                }

                var result = await processor.ProcessAsync(message, taskItem.GetInsertionTime(), token);

                this.Logger.Information("Finished process {0} {1} {2}, result {3}", message.EventVerb, message.JobId, message.Id, result);
                return(result);
            }
            catch (OperationCanceledException) { return(false); }
            catch (Exception ex)
            {
                this.Logger.Error("Exception occurred when process {0}, {1}, {2}, {3}", message.EventVerb, message.JobId, message.Id, ex);

                await this.Utilities.FailJobWithEventAsync(
                    message.JobType,
                    message.JobId,
                    $"Exception occurred when process job {message.JobId} {message.JobType} {message.EventVerb}. {ex}",
                    token);
            }

            return(true);
        }
예제 #3
0
        /// <summary>
        /// Resolves a <c>JobTaskDispatcherWorker</c>, initializes it for the job named in the queued
        /// <c>RunningJobMessage</c>, and runs it.
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is a <c>RunningJobMessage</c>.</param>
        /// <param name="token">Token passed to the worker's initialization and work calls.</param>
        /// <returns>Always <c>true</c> when the worker runs without throwing.</returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            var jobMsg = taskItem.GetMessage <RunningJobMessage>();
            this.Logger.Information("Do work for job {0}, requeueCount {1}, message {2}", jobMsg.JobId, jobMsg.RequeueCount, taskItem.Id);

            // The worker comes from the service provider and is given this object's state before use.
            var dispatcher = this.Provider.GetRequiredService <JobTaskDispatcherWorker>();
            dispatcher.CopyFrom(this);

            await dispatcher.InitializeAsync(jobMsg.JobType, jobMsg.JobId, token);
            await dispatcher.DoWorkAsync(token);

            this.Logger.Information("Finished running for job {0}, requeueCount {1}, message {2}", jobMsg.JobId, jobMsg.RequeueCount, taskItem.Id);
            return true;
        }
예제 #4
0
        /// <summary>
        /// Runs the diagnostics-script sync and the metric-script sync together in response to a script sync message.
        /// </summary>
        /// <param name="taskItem">Queue item for the sync request; may be null.</param>
        /// <param name="token">Token passed to both sync operations.</param>
        /// <returns>
        /// <c>true</c> when both syncs report success or when an exception was logged;
        /// <c>false</c> when either sync reports failure.
        /// </returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            this.Logger.Information("Do work for Script sync message {0}", taskItem?.Id);

            try
            {
                // The payload is not used below, but it is still read so that a failure while reading it
                // is handled by the catch exactly as before.
                taskItem?.GetMessage <ScriptSyncMessage>();

                var diagSync   = this.SyncDiagScriptsAsync(token);
                var metricSync = this.SyncMetricScriptsAsync(token);
                var outcomes   = await T.Task.WhenAll(diagSync, metricSync);

                return outcomes.All(ok => ok);
            }
            catch (Exception ex)
            {
                // Best effort: the failure is logged and the message is still reported as handled.
                this.Logger.Error("Exception occurred when process script sync message {0}, {1}", taskItem?.Id, ex);
                return true;
            }
        }
예제 #5
0
        /// <summary>
        /// Dispatches the job named in a <c>JobDispatchMessage</c>: reads its entry from the job table,
        /// enqueues the job on the dispatch queue of every target node, then writes the entity back.
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is a <c>JobDispatchMessage</c>.</param>
        /// <param name="token">Token passed to all table and queue operations.</param>
        /// <returns>
        /// <c>false</c> when the table lookup does not yield a <c>JsonTableEntity</c>; otherwise whether the
        /// final replace operation returned a successful status code.
        /// </returns>
        public override async Task <bool> DoWorkAsync(TaskItem taskItem, CancellationToken token)
        {
            var dispatchMessage = taskItem.GetMessage <JobDispatchMessage>();

            using (this.logger.BeginScope("Do work for JobDispatchMessage {0}", dispatchMessage.Id))
            {
                var partitionKey = this.utilities.GetJobPartitionKey($"{dispatchMessage.Type}", dispatchMessage.Id);
                var retrieve     = TableOperation.Retrieve <JsonTableEntity>(partitionKey, this.utilities.JobEntryKey);
                var lookup       = await this.jobTable.ExecuteAsync(retrieve, null, null, token);

                this.logger.LogInformation("Queried job table entity for job id {0}, result {1}", dispatchMessage.Id, lookup.HttpStatusCode);

                // Guard: nothing to dispatch unless the lookup produced a job entity.
                if (!(lookup.Result is JsonTableEntity entity))
                {
                    this.logger.LogWarning("The entity queried is not of <JobTableEntity> type, {0}", lookup.Result);
                    return false;
                }

                var job = entity.GetObject <Job>();

                // NOTE(review): the state is set on the object returned by GetObject, while the replace below
                // writes `entity`. Whether that change reaches the entity is not visible here - confirm.
                job.State = JobState.Running;
                var internalJob = InternalJob.CreateFrom(job);

                // One queue message per target node, all sent concurrently.
                var enqueueTasks = internalJob.TargetNodes.Select(async node =>
                {
                    var nodeQueue = await this.utilities.GetOrCreateNodeDispatchQueueAsync(node, token);
                    var payload   = new CloudQueueMessage(JsonConvert.SerializeObject(internalJob));
                    await nodeQueue.AddMessageAsync(payload, null, null, null, null, token);
                });
                await Task.WhenAll(enqueueTasks);

                var replaced = await this.jobTable.ExecuteAsync(TableOperation.Replace(entity), null, null, token);

                this.logger.LogInformation("Dispatched job, update job result code {0}", replaced.HttpStatusCode);
                return replaced.IsSuccessfulStatusCode();
            }
        }
예제 #6
0
        /// <summary>
        /// Executes every command line of an <c>InternalJob</c> on this node. For each command it creates an
        /// output blob, starts a monitor that appends output to that blob, records the task result as
        /// Dispatching, starts the task through the communicator, records it as Running, waits for the
        /// monitored execution to complete, and records the final result as Finished.
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is an <c>InternalJob</c>.</param>
        /// <param name="token">Token passed to the blob, table and communicator calls.</param>
        /// <returns>
        /// <c>true</c> only when, for every command, all result writes to both the jobs table and the nodes
        /// table returned a successful status code.
        /// </returns>
        public override async Task <bool> DoWorkAsync(TaskItem taskItem, CancellationToken token)
        {
            var job      = taskItem.GetMessage <InternalJob>();
            var nodeName = this.Configuration.GetValue <string>(Constants.HpcHostNameEnv);

            using (this.logger.BeginScope("Do work for InternalJob {0} on node {1}", job.Id, nodeName))
            {
                // TODO: make sure invisible.
                this.logger.LogInformation("Executing job {0}", job.Id);
                var tasks = Enumerable.Range(0, job.CommandLines.Length).Select(async taskId =>
                {
                    var cmd = job.CommandLines[taskId];
                    this.logger.LogInformation("Executing command {0}, job {1}", cmd, job.Id);
                    var taskKey        = this.utilities.GetTaskKey(job.Id, taskId, job.RequeueCount);
                    var resultKey      = this.utilities.GetJobResultKey(nodeName, taskKey);
                    var taskResultBlob = await this.utilities.CreateOrReplaceTaskOutputBlobAsync(job.Id, resultKey, token);
                    using (var monitor = this.Monitor.StartMonitorTask(taskKey, async(output, cancellationToken) =>
                    {
                        try
                        {
                            await taskResultBlob.AppendTextAsync(output, Encoding.UTF8, null, null, null, cancellationToken);
                        }
                        catch (Exception ex)
                        {
                            // A failed append is logged and otherwise ignored, so it does not fail the task.
                            this.logger.LogError(ex, "Error happened when append to blob {0}", taskResultBlob.Name);
                        }
                    }))
                    {
                        this.logger.LogInformation("Call startjobandtask for job {0}, task {1}", job.Id, taskKey);
                        var jobPartitionName  = this.utilities.GetJobPartitionKey($"{job.Type}", job.Id);
                        var nodePartitionName = this.utilities.GetNodePartitionKey(nodeName);

                        // Writes one task result to the jobs table and then to the nodes table.
                        // Stops at the first unsuccessful write and reports false.
                        Func <ComputeNodeTaskCompletionEventArgs, Task <bool> > saveTaskResultAsync = async args =>
                        {
                            var jobEntity = new JsonTableEntity(jobPartitionName, resultKey, args);
                            var saved     = await jobsTable.ExecuteAsync(TableOperation.InsertOrReplace(jobEntity), null, null, token);
                            this.logger.LogInformation("Saved task result {0} to jobs table, status code {1}", resultKey, saved.HttpStatusCode);
                            if (!saved.IsSuccessfulStatusCode())
                            {
                                return(false);
                            }

                            var nodeEntity = new JsonTableEntity(nodePartitionName, resultKey, args);
                            saved = await nodesTable.ExecuteAsync(TableOperation.InsertOrReplace(nodeEntity), null, null, token);
                            this.logger.LogInformation("Saved task result {0} to nodes table, status code {1}", resultKey, saved.HttpStatusCode);
                            return(saved.IsSuccessfulStatusCode());
                        };

                        // Record the task as Dispatching before asking the node to start it.
                        var dispatchingArgs = new ComputeNodeTaskCompletionEventArgs(nodeName, job.Id, null)
                        {
                            State = TaskState.Dispatching
                        };
                        if (!await saveTaskResultAsync(dispatchingArgs))
                        {
                            return(false);
                        }

                        await this.communicator.StartJobAndTaskAsync(
                            nodeName,
                            new StartJobAndTaskArg(new int[0], job.Id, taskId),
                            "", "", new ProcessStartInfo(cmd, "", "", $"{this.communicator.Options.AgentUriBase}/output/{taskKey}",
                                                         "", new System.Collections.Hashtable(), new long[0], job.RequeueCount), token);

                        // The start call returned: record the task as Running.
                        var runningArgs = new ComputeNodeTaskCompletionEventArgs(nodeName, job.Id, null)
                        {
                            State = TaskState.Running
                        };
                        if (!await saveTaskResultAsync(runningArgs))
                        {
                            return(false);
                        }

                        this.logger.LogInformation("Wait for response for job {0}, task {1}", job.Id, taskKey);
                        ComputeNodeTaskCompletionEventArgs completionArgs = await monitor.Execution;

                        this.logger.LogInformation("Saving result for job {0}, task {1}", job.Id, taskKey);

                        completionArgs.State = TaskState.Finished;
                        return(await saveTaskResultAsync(completionArgs));
                    }
                });

                var results = await Task.WhenAll <bool>(tasks);

                return(results.All(r => r));
            }
        }
예제 #7
0
        /// <summary>
        /// Processes a batch of task completion messages. The messages are grouped per job; for each job the
        /// completed-task counter is updated, the optional diagnostics result filter script is run, the job is
        /// failed if a task failed and the job requests that, and every child task whose parents have all
        /// completed is dispatched (or, for the end task, the job is moved to Finishing).
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is an array of <c>TaskItem</c>, each carrying a <c>TaskCompletionMessage</c>.</param>
        /// <param name="token">Token passed to all table, blob, file and queue operations.</param>
        /// <returns>
        /// <c>true</c> when every job group was handled; <c>false</c> when a task result hook reported failure
        /// for any job group.
        /// </returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            var taskItems = taskItem.GetMessage <TaskItem[]>();
            // Lazy projection: the log line below is written when the sequence is enumerated.
            var messages  = taskItems.Select(ti =>
            {
                var msg = ti.GetMessage <TaskCompletionMessage>();
                this.Logger.Information("Do work for job {0}, task {1}, message {2}", msg.JobId, msg.Id, ti.Id);
                return(msg);
            });

            // One group per job partition key, so each job is processed once for the whole batch.
            var jobGroups = messages.GroupBy(msg => this.Utilities.GetJobPartitionKey(msg.JobType, msg.JobId));

            var results = await T.Task.WhenAll(jobGroups.Select(async jg =>
            {
                var jobPartitionKey = jg.Key;
                this.Logger.Information("Do work for job {0}, tasks finished: {1}", jobPartitionKey, string.Join(",", jg.Select(t => t.Id)));
                var job = await this.jobsTable.RetrieveAsync <Job>(jobPartitionKey, this.Utilities.JobEntryKey, token);
                // Completions for a missing job or a job that is not Running are dropped (reported as handled).
                if (job == null || job.State != JobState.Running)
                {
                    this.Logger.Warning("Skip processing the task completion of {0}. Job state {1}.", jobPartitionKey, job?.State);
                    return(true);
                }

                // Messages whose requeue count differs from the job's are only logged, not processed.
                var skippedTasks = string.Join(",", jg.Where(msg => msg.RequeueCount != job.RequeueCount).Select(msg => $"{msg.Id}.{msg.RequeueCount}"));
                if (!string.IsNullOrEmpty(skippedTasks))
                {
                    this.Logger.Warning("Skip processing the task completion, job requeueCount {0}, tasks {1}.", job.RequeueCount, skippedTasks);
                }

                var tasks = jg.Where(msg => msg.RequeueCount == job.RequeueCount).ToList();

                // Ids 0 and int.MaxValue are excluded from the completed count. int.MaxValue is the end task
                // (see isEndTask below); id 0 is presumably the matching start marker - TODO confirm.
                var completedCount = tasks.Count(t => t.Id != 0 && t.Id != int.MaxValue);
                if (completedCount > 0)
                {
                    await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                    {
                        // Never let the counter exceed the job's task count.
                        j.CompletedTaskCount = Math.Min(j.CompletedTaskCount + completedCount, j.TaskCount);
                    }, token, this.Logger);
                }

                if (job.Type == JobType.Diagnostics)
                {
                    // The diagnostics test definition is looked up in this.diagTests first and loaded from the
                    // jobs table on a miss.
                    string diagKey = job.DiagnosticTest.Category + job.DiagnosticTest.Name;
                    if (!this.diagTests.TryGetValue(diagKey, out InternalDiagnosticsTest diagTest))
                    {
                        diagTest = await this.jobsTable.RetrieveAsync <InternalDiagnosticsTest>(
                            this.Utilities.GetDiagPartitionKey(job.DiagnosticTest.Category),
                            job.DiagnosticTest.Name,
                            token);
                        // NOTE(review): whatever RetrieveAsync returned is cached, including null if it can return null - confirm.
                        this.diagTests.TryAdd(diagKey, diagTest);
                    }

                    if (diagTest?.TaskResultFilterScript?.Name != null)
                    {
                        // The filter script text is looked up in this.taskFilterScript first and downloaded
                        // from blob storage on a miss.
                        if (!this.taskFilterScript.TryGetValue(diagKey, out string script))
                        {
                            var scriptBlob = this.Utilities.GetBlob(diagTest.TaskResultFilterScript.ContainerName, diagTest.TaskResultFilterScript.Name);
                            using (var stream = new MemoryStream())
                            {
                                await scriptBlob.DownloadToStreamAsync(stream, null, null, null, token);
                                // Rewind so the reader starts at the beginning of the downloaded content.
                                stream.Seek(0, SeekOrigin.Begin);
                                using (StreamReader sr = new StreamReader(stream, true))
                                {
                                    script = await sr.ReadToEndAsync();
                                }
                            }

                            this.taskFilterScript.TryAdd(diagKey, script);
                        }

                        // The script is written to a temp file that is shared by all hook invocations of this
                        // group and deleted afterwards, even when a hook throws.
                        var path = Path.GetTempFileName();
                        try
                        {
                            await File.WriteAllTextAsync(path, script, token);
                            var hookResults = await T.Task.WhenAll(tasks.Select(tid => this.TaskResultHook(job, tid.Id, path, token)));
                            if (hookResults.Any(r => !r))
                            {
                                return(false);
                            }
                        }
                        finally
                        {
                            File.Delete(path);
                        }
                    }
                }

                // A non-zero exit code fails the whole job when the job asks for that; no children are unlocked.
                if (job.FailJobOnTaskFailure && tasks.Any(t => t.ExitCode != 0))
                {
                    this.Logger.Information("Fail the job because some tasks failed {0}", job.Id);
                    await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                    {
                        j.State = JobState.Failed;
                        (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                        {
                            Content = $"Fail the job because some tasks failed",
                            Source  = EventSource.Job,
                            Type    = EventType.Alert
                        });
                    }, token, this.Logger);

                    return(true);
                }

                // Child ids come from the message when present, otherwise they are loaded through Utilities.
                var childIds = await T.Task.WhenAll(tasks.Select(async t => new { t.Id, ChildIds = t.ChildIds ?? await this.Utilities.LoadTaskChildIdsAsync(t.Id, job.Id, job.RequeueCount, token) }));

                foreach (var cids in childIds)
                {
                    this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} completed, child ids {4}", job.Type, job.Id, job.RequeueCount, cids.Id, string.Join(",", cids.ChildIds));
                }

                // Invert parent -> children into child -> parents completed in this batch, so each child task
                // record is updated once.
                var childIdGroups = childIds
                                    .SelectMany(ids => ids.ChildIds.Select(cid => new { ParentId = ids.Id, ChildId = cid }))
                                    .GroupBy(idPair => idPair.ChildId)
                                    .Select(g => new { ChildId = g.Key, Count = g.Count(), ParentIds = g.Select(idPair => idPair.ParentId).ToList() }).ToList();

                var childResults = await T.Task.WhenAll(childIdGroups.Select(async cid =>
                {
                    this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} has {4} ancestor tasks completed {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, cid.Count, string.Join(",", cid.ParentIds));
                    var childTaskKey = this.Utilities.GetTaskKey(job.Id, cid.ChildId, job.RequeueCount);

                    // These three locals are assigned inside the update callback below and read after it.
                    bool unlocked  = false;
                    bool isEndTask = false;
                    Task childTask = null;
                    if (!await this.Utilities.UpdateTaskAsync(jobPartitionKey, childTaskKey, t =>
                    {
                        // The remaining parent ids are stored compressed as a comma-separated list.
                        var unzippedParentIds = Compress.UnZip(t.ZippedParentIds);
                        this.Logger.Information("{0} Job {1} task {2}, ZippedParentIds {3}, unzipped {4}", job.Type, job.Id, cid.ChildId, t.ZippedParentIds, unzippedParentIds);
                        HashSet <int> parentIds;
                        try
                        {
                            parentIds = new HashSet <int>(unzippedParentIds.Split(',', StringSplitOptions.RemoveEmptyEntries).Select(_ => int.Parse(_)));
                        }
                        catch (FormatException ex)
                        {
                            // Log the offending input for diagnosis, then let the exception propagate.
                            this.Logger.Error("Error happened {0}, input string {1}, len {2}", ex, unzippedParentIds, unzippedParentIds.Length);
                            throw;
                        }

                        var oldParentIdsCount = parentIds.Count;
                        this.Logger.Information("{0} Job {1} requeuecount {2}, task {3} has {4} parent tasks {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, oldParentIdsCount, string.Join(",", parentIds));
                        // Remove the parents that completed in this batch.
                        cid.ParentIds.ForEach(_ => parentIds.Remove(_));
                        var newParentIdsStr = string.Join(",", parentIds);
                        this.Logger.Information("{0} Job {1} requeuecount {2}, after remove, task {3} has {4} parent tasks {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, parentIds.Count, newParentIdsStr);
                        // Warn (only) when some completed parent was not in the stored set or was counted more than once.
                        if (parentIds.Count + cid.Count != oldParentIdsCount)
                        {
                            this.Logger.Warning("{0} Job {1} requeuecount {2}, task {3}, ids mismatch!", job.Type, job.Id, job.RequeueCount, cid.ChildId);
                        }

                        t.ZippedParentIds = Compress.GZip(newParentIdsStr);
                        // The child is unlocked once no parent remains.
                        unlocked = parentIds.Count == 0;
                        isEndTask = t.Id == int.MaxValue;
                        if (unlocked)
                        {
                            // The end task is marked Finished directly; any other task becomes Dispatching.
                            t.State = isEndTask ? TaskState.Finished : TaskState.Dispatching;
                        }

                        childTask = t;
                    }, token,
                                                              this.Logger))
                    {
                        // The task record could not be updated: fail the job and report this child as handled.
                        await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                        {
                            j.State = JobState.Failed;
                            // TODO: make event separate.
                            (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                            {
                                Content = $"Unable to update task record {cid.ChildId}",
                                Source  = EventSource.Job,
                                Type    = EventType.Alert
                            });
                        }, token, this.Logger);

                        return(true);
                    }

                    if (unlocked)
                    {
                        if (isEndTask)
                        {
                            // End task unlocked: move a Running job to Finishing and queue a "finish" job event.
                            await this.Utilities.UpdateJobAsync(job.Type, job.Id, j => j.State = j.State == JobState.Running ? JobState.Finishing : j.State, token, this.Logger);
                            var jobEventQueue = this.Utilities.GetJobEventQueue();
                            await jobEventQueue.AddMessageAsync(
                                // todo: event message generation.
                                new CloudQueueMessage(JsonConvert.SerializeObject(new JobEventMessage()
                            {
                                Id = job.Id, EventVerb = "finish", Type = job.Type
                            })),
                                null, null, null, null,
                                token);
                        }
                        else
                        {
                            // Ordinary task unlocked: queue a "start" task event on the dispatch queue of its node.
                            var queue = this.Utilities.GetNodeDispatchQueue(childTask.Node);
                            await queue.AddMessageAsync(
                                new CloudQueueMessage(JsonConvert.SerializeObject(new TaskEventMessage()
                            {
                                EventVerb = "start", Id = childTask.Id, JobId = childTask.JobId, JobType = childTask.JobType, RequeueCount = job.RequeueCount
                            }, Formatting.Indented)),
                                null, null, null, null, token);
                            this.Logger.Information("Dispatched job {0} task {1} to node {2}", childTask.JobId, childTask.Id, childTask.Node);
                        }
                    }

                    return(true);
                }));

                return(childResults.All(r => r));
            }));

            return(results.All(r => r));
        }
예제 #8
0
        /// <summary>
        /// Handles one job event message: loads the job from the jobs table, resolves the action handler for
        /// the event verb and the type handler for the job type, and lets the action handler process the job.
        /// </summary>
        /// <param name="taskItem">Queue item whose payload is a <c>JobEventMessage</c>.</param>
        /// <param name="token">Token passed to the table lookup, the handler and the job updates.</param>
        /// <returns>
        /// <c>false</c> when the job cannot be found or when processing was cancelled through
        /// <paramref name="token"/>; otherwise <c>true</c>, including when the job was marked Failed because no
        /// handlers were found or because processing threw.
        /// </returns>
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            var message = taskItem.GetMessage <JobEventMessage>();

            this.Logger.Information("Do work for JobEvent {0}, {1}, {2}, message {3}", message.Id, message.Type, message.EventVerb, taskItem.Id);

            var jobPartitionKey = this.Utilities.GetJobPartitionKey(message.Type, message.Id);
            var jobEntryKey     = this.Utilities.JobEntryKey;
            var job             = await this.jobsTable.RetrieveAsync <Job>(
                jobPartitionKey,
                jobEntryKey,
                token);

            this.Logger.Information("Queried job table entity for job id {0}", message.Id);

            // Guard: without a job record there is nothing to process.
            if (job == null)
            {
                Debug.Assert(false);
                this.Logger.Warning("The entity queried is not of job type, {0}", message.Id);
                return(false);
            }

            try
            {
                IJobTypeHandler   typeHandler;
                IJobActionHandler actionHandler;
                if (this.ActionHandlerTypes.TryGetValue(message.EventVerb, out Type actionType) &&
                    this.JobTypeHandlers.TryGetValue(message.Type, out Type jobType) &&
                    (null != (actionHandler = (IJobActionHandler)this.Provider.GetService(actionType))) &&
                    (null != (typeHandler = (IJobTypeHandler)this.Provider.GetService(jobType))))
                {
                    // Both handlers receive this object's state before the action handler runs.
                    ((ServerObject)actionHandler).CopyFrom(this);
                    ((ServerObject)typeHandler).CopyFrom(this);
                    actionHandler.JobTypeHandler = typeHandler;
                    await actionHandler.ProcessAsync(job, message, token);

                    this.Logger.Information("Processed {0} job {1} {2}", job.Type, job.Id, job.State);
                }
                else
                {
                    this.Logger.Warning("No processors found for job type {0}, {1}, {2}", job.Type, job.Id, message.EventVerb);

                    await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                    {
                        j.State = JobState.Failed;
                        (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                        {
                            Content = $"No processors found for job type {j.Type}, event {message.EventVerb}",
                            Source  = EventSource.Job,
                            Type    = EventType.Alert,
                        });
                    }, token, this.Logger);
                }
            }
            catch (OperationCanceledException) when (token.IsCancellationRequested)
            {
                // Cancellation of our own token is not a job failure: report the message as not processed
                // instead of marking the job Failed. Cancellations from any other source still reach the
                // generic handler below.
                return(false);
            }
            catch (Exception ex)
            {
                this.Logger.Error("Exception occurred when process job {0}, {1}, {2}, {3}", job.Id, job.Type, message.EventVerb, ex);
                await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                {
                    j.State = JobState.Failed;
                    (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                    {
                        Content = $"Exception occurred when process job {job.Id} {job.Type} {message.EventVerb}. {ex}",
                        Source  = EventSource.Job,
                        Type    = EventType.Alert,
                    });
                }, token, this.Logger);
            }

            return(true);
        }
예제 #9
0
        public override async T.Task <bool> ProcessTaskItemAsync(TaskItem taskItem, CancellationToken token)
        {
            this.batchId++;
            var taskItems = taskItem.GetMessage <TaskItem[]>();

            this.Logger.Information("Entering batch {0}, size {1}", this.batchId, taskItems.Length);
            var messages = taskItems.Select(ti =>
            {
                var msg = ti.GetMessage <TaskCompletionMessage>();
                this.Logger.Information("    Do work for job {0}, task {1}, message {2}", msg.JobId, msg.Id, ti.Id);
                return(msg);
            }).ToList();

            this.Logger.Information("Do work for job {0}, tasks finished: {1}", jobPartitionKey, string.Join(",", messages.Select(t => t.Id)));
            var skippedTasks = string.Join(",", messages.Where(msg => msg.RequeueCount != job.RequeueCount).Select(msg => $"{msg.Id}.{msg.RequeueCount}"));

            if (!string.IsNullOrEmpty(skippedTasks))
            {
                this.Logger.Warning("Skip processing the task completion, job requeueCount {0}, tasks {1}.", job.RequeueCount, skippedTasks);
            }

            var tasks = messages.Where(msg => msg.RequeueCount == job.RequeueCount).ToList();

            this.Logger.Information("Deleting timeout guard {0}", jobPartitionKey);
            var jobTaskCompletionQueue = this.Utilities.GetJobTaskCompletionQueue(job.Id);
            await T.Task.WhenAll(tasks.Where(t => !t.Timeouted).Select(async t =>
            {
                if (t.Id == 0)
                {
                    return;
                }

                // Removes the timeout message tracked for task `id` from `dict` and, when one was
                // tracked, deletes that message from `queue`.
                //   dict  - in-memory map from task id to the queued timeout message.
                //   jobId - job id, used for logging only.
                //   id    - id of the task whose timeout message should be removed.
                //   queue - queue the timeout message was added to.
                // A StorageException raised by the delete is handled here: "not found" and other
                // failures are logged, a cancellation is swallowed silently. Other exception types
                // are not caught.
                async T.Task DeleteTimeoutAsync(ConcurrentDictionary <int, CloudQueueMessage> dict, int jobId, int id, CloudQueue queue)
                {
                    if (!dict.TryRemove(id, out var msg))
                    {
                        // Log through the parameters and the queue name: this helper is called for
                        // more than one dictionary/queue pair, so the entry must say which timeout
                        // message was missing instead of always claiming it was the node one.
                        this.Logger.Information("    Cannot find the {0} timeout message in memory for job {1}, task {2}", queue.Name, jobId, id);
                        return;
                    }

                    try
                    {
                        await queue.DeleteMessageAsync(msg.Id, msg.PopReceipt, null, null, token);
                        this.Logger.Information("    Deleted {0} timeout message for job {1}, task {2}, message {3}", queue.Name, jobId, id, msg.Id);
                    }
                    catch (StorageException ex)
                    {
                        if (ex.IsNotFound())
                        {
                            // The message is no longer in the queue; nothing left to delete.
                            this.Logger.Information("    Not found the {0} timeout message {1} for job {2}, task {3}", queue.Name, msg.Id, jobId, id);
                        }
                        else if (ex.IsCancellation())
                        {
                            // Cancellation is not reported as a failure.
                            return;
                        }
                        else
                        {
                            this.Logger.Warning(ex, "    Unable to delete the {0} timeout message {1} for job {2}, task {3}", queue.Name, msg.Id, jobId, id);
                        }
                    }
                }

                if (!this.tasksDict.TryGetValue(t.Id, out Task tt))
                {
                    this.Logger.Information("    Cannot find task for job {0}, task {1}", job.Id, t.Id);
                    return;
                }

                var nodeCancelQueue = this.Utilities.GetNodeCancelQueue(tt.Node);
                await T.Task.WhenAll(DeleteTimeoutAsync(this.taskNodeTimeoutMessages, job.Id, t.Id, nodeCancelQueue),
                                     DeleteTimeoutAsync(this.taskTimeoutMessages, job.Id, t.Id, jobTaskCompletionQueue));
            }));

            this.Logger.Information("Updating tasks state in memory {0}", jobPartitionKey);
            foreach (var t in tasks)
            {
                var tt = this.tasksDict[t.Id];
                this.Logger.Information("    {0} Job {1} requeuecount {2}, task {3} on {4} completed, timeout {5}, currentState {6}, child ids {7}", job.Type, job.Id, job.RequeueCount, t.Id, tt.Node, t.Timeouted, tt.State, string.Join(",", this.tasksDict[t.Id].ChildIds));

                bool alreadyFinished = tt.State == TaskState.Finished || tt.State == TaskState.Canceled || tt.State == TaskState.Failed;

                tt.State = alreadyFinished ? tt.State : (t.Timeouted ? TaskState.Canceled : (t.ExitCode == 0 ? TaskState.Finished : TaskState.Failed));
            }

            this.Logger.Information("Updating tasks state for timeouted tasks in storage {0}", jobPartitionKey);
            await T.Task.WhenAll(tasks.Where(t => t.Timeouted).Select(async t =>
            {
                var key = this.Utilities.GetTaskKey(job.Id, t.Id, job.RequeueCount);

                TaskState state = TaskState.Canceled;
                await this.Utilities.UpdateTaskAsync(this.jobPartitionKey, key, task =>
                {
                    state = task.State = task.State != TaskState.Canceled && task.State != TaskState.Failed && task.State != TaskState.Finished ? TaskState.Canceled : task.State;
                },
                                                     token, this.Logger);

                this.Logger.Information("    Updated {0}, task {1} state to {2}", job.Id, t.Id, state);
            }));

            if (this.batchId % 10 == 0)
            {
                await this.UpdateJobProgress(token);
            }

            if (job?.State != JobState.Running)
            {
                this.shouldExit = true;
                this.Logger.Warning("Skip processing the task completion of {0}. Job state {1}.", jobPartitionKey, job?.State);
                return(true);
            }

            if (job.Type == JobType.Diagnostics)
            {
                this.Logger.Information("Processing task filters for job {0}", jobPartitionKey);
                string diagKey = job.DiagnosticTest.Category + job.DiagnosticTest.Name;
                if (!this.diagTests.TryGetValue(diagKey, out InternalDiagnosticsTest diagTest))
                {
                    diagTest = await this.jobsTable.RetrieveAsync <InternalDiagnosticsTest>(
                        this.Utilities.GetDiagPartitionKey(job.DiagnosticTest.Category),
                        job.DiagnosticTest.Name,
                        token);

                    this.diagTests.TryAdd(diagKey, diagTest);
                }

                if (diagTest?.TaskResultFilterScript?.Name != null && diagTest.RunTaskResultFilter)
                {
                    this.Logger.Information("Run task filters for job {0}", jobPartitionKey);
                    if (!this.taskFilterScript.TryGetValue(diagKey, out string script))
                    {
                        var scriptBlob = this.Utilities.GetBlob(diagTest.TaskResultFilterScript.ContainerName, diagTest.TaskResultFilterScript.Name);
                        using (var stream = new MemoryStream())
                        {
                            await scriptBlob.DownloadToStreamAsync(stream, null, null, null, token);

                            stream.Seek(0, SeekOrigin.Begin);
                            using (StreamReader sr = new StreamReader(stream, true))
                            {
                                script = await sr.ReadToEndAsync();
                            }
                        }

                        this.taskFilterScript.TryAdd(diagKey, script);
                    }

                    var path = Path.GetTempFileName();
                    try
                    {
                        await File.WriteAllTextAsync(path, script, token);

                        await T.Task.WhenAll(tasks.Select(tid => this.TaskResultHook(job, tid.Id, path, token)));
                    }
                    finally
                    {
                        File.Delete(path);
                    }
                }
            }

            this.Logger.Information("Check FailOnTaskFailure for job {0}", jobPartitionKey);
            if (job.FailJobOnTaskFailure && tasks.Any(t => t.ExitCode != 0))
            {
                this.Logger.Information("Fail the job because some tasks failed {0}", job.Id);
                await this.Utilities.UpdateJobAsync(job.Type, job.Id, j =>
                {
                    j.State = JobState.Failed;
                    (j.Events ?? (j.Events = new List <Event>())).Add(new Event()
                    {
                        Content = $"Fail the job because some tasks failed",
                        Source  = EventSource.Job,
                        Type    = EventType.Alert
                    });

                    this.job = j;
                }, token, this.Logger);

                return(true);
            }

            this.Logger.Information("Fetching finished tasks for job {0}", jobPartitionKey);
            var finishedTasks = tasks.Select(t => this.tasksDict[t.Id]);

            this.Logger.Information("Converting to child Ids view for job {0}", jobPartitionKey);
            var childIdGroups = finishedTasks
                                .SelectMany(ids => ids.ChildIds.Select(cid => new { ParentId = ids.Id, ChildId = cid }))
                                .GroupBy(idPair => idPair.ChildId)
                                .Select(g => new { ChildId = g.Key, Count = g.Count(), ParentIds = g.Select(idPair => idPair.ParentId).ToList() }).ToList();

            this.Logger.Information("Converted to child Ids view for job {0}, children count {1}", jobPartitionKey, childIdGroups.Count);
            await T.Task.WhenAll(childIdGroups.Select(async cid =>
            {
                this.Logger.Information("    {0} Job {1} requeuecount {2}, task {3} has {4} ancestor tasks completed {5}", job.Type, job.Id, job.RequeueCount, cid.ChildId, cid.Count, string.Join(",", cid.ParentIds));
                var toBeUnlocked = this.tasksDict[cid.ChildId];
                bool isEndTask   = toBeUnlocked.Id == int.MaxValue;
                if (!isEndTask && (toBeUnlocked.State != TaskState.Queued))
                {
                    this.Logger.Information("    {0} Job {1} requeuecount {2}, task {3} is in state {4}, skip dispatching.", job.Type, job.Id, job.RequeueCount, cid.ChildId, toBeUnlocked.State);
                    return;
                }

                var oldParentIdsCount = toBeUnlocked.RemainingParentIds.Count;
                var oldParents        = string.Join(',', toBeUnlocked.RemainingParentIds);
                cid.ParentIds.ForEach(pid => toBeUnlocked.RemainingParentIds.Remove(pid));
                this.Logger.Information("    Job {0}, requeueCount {1}, task {2} had {3} parents, remaining {4} parents, removed {5}",
                                        job.Id, job.Request, cid.ChildId, oldParentIdsCount, toBeUnlocked.RemainingParentIds.Count, cid.ParentIds.Count);
                if (cid.ParentIds.Count + toBeUnlocked.RemainingParentIds.Count != oldParentIdsCount)
                {
                    this.Logger.Warning("    Job {0}, requeueCount {1}, task {2} mismatch! old {3}, remaining {4}, removed {5}.",
                                        job.Id, job.Request, cid.ChildId, oldParents, string.Join(',', toBeUnlocked.RemainingParentIds), string.Join(',', cid.ParentIds));
                }

                if (toBeUnlocked.RemainingParentIds.Count == 0)
                {
                    // unlocked
                    var targetState  = isEndTask ? TaskState.Finished : TaskState.Dispatching;
                    var childTaskKey = this.Utilities.GetTaskKey(job.Id, cid.ChildId, job.RequeueCount);
                    Task childTask   = toBeUnlocked;

                    if (isEndTask)
                    {
                        await this.UpdateJobProgress(token);
                        await this.Utilities.UpdateJobAsync(job.Type, job.Id, j => j.State = j.State == JobState.Running ? JobState.Finishing : j.State, token, this.Logger);
                        var jobEventQueue = this.Utilities.GetJobEventQueue();
                        await jobEventQueue.AddMessageAsync(
                            // todo: event message generation.
                            new CloudQueueMessage(JsonConvert.SerializeObject(new JobEventMessage()
                        {
                            Id = job.Id, EventVerb = "finish", Type = job.Type
                        })),
                            null, null, null, null,
                            token);

                        this.shouldExit = true;
                    }
                    else
                    {
                        // Sends a "start" TaskEventMessage for childTask to the dispatch queue of the
                        // node the task is assigned to.
                        // NOTE(review): childTask.MaximumRuntimeSeconds is passed as the second
                        // AddMessageAsync argument, which is the message time-to-live per the Azure
                        // Storage queue API - confirm that is the intended slot.
                        async T.Task dispatch()
                        {
                            var nodeDispatchQueue = this.Utilities.GetNodeDispatchQueue(childTask.Node);

                            var startEvent = new TaskEventMessage()
                            {
                                EventVerb    = "start",
                                Id           = childTask.Id,
                                JobId        = childTask.JobId,
                                JobType      = childTask.JobType,
                                RequeueCount = job.RequeueCount
                            };
                            var startQueueMessage = new CloudQueueMessage(JsonConvert.SerializeObject(startEvent, Formatting.Indented));
                            var startMessageTtl   = TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds);

                            await nodeDispatchQueue.AddMessageAsync(startQueueMessage, startMessageTtl, null, null, null, token);
                        }

                        // Adds a "timeout" TaskEventMessage for childTask to the cancel queue of its node
                        // and remembers the queued message in taskNodeTimeoutMessages. When an entry for
                        // the task id is already tracked, only a warning is logged and nothing is queued.
                        async T.Task cancel()
                        {
                            if (this.taskNodeTimeoutMessages.ContainsKey(childTask.Id))
                            {
                                this.Logger.Warning("    Cannot add taskNodeTimeout for job {0} task {1}", job.Id, childTask.Id);
                                return;
                            }

                            var nodeTimeoutEvent = new TaskEventMessage()
                            {
                                EventVerb    = "timeout",
                                Id           = childTask.Id,
                                JobId        = childTask.JobId,
                                JobType      = childTask.JobType,
                                RequeueCount = job.RequeueCount
                            };
                            var nodeTimeoutMessage = new CloudQueueMessage(JsonConvert.SerializeObject(nodeTimeoutEvent, Formatting.Indented));

                            var nodeTimeoutQueue = this.Utilities.GetNodeCancelQueue(childTask.Node);

                            // Third AddMessageAsync argument: the delay derived from the task's
                            // MaximumRuntimeSeconds.
                            var nodeTimeoutDelay = TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds);
                            await nodeTimeoutQueue.AddMessageAsync(nodeTimeoutMessage, null, nodeTimeoutDelay, null, null, token);

                            // Keep the queued message so it can be looked up by task id later.
                            this.taskNodeTimeoutMessages.TryAdd(childTask.Id, nodeTimeoutMessage);
                        }

                        // Adds a TaskCompletionMessage marked Timeouted (ExitCode -1) for childTask to
                        // jobTaskCompletionQueue and remembers the queued message in taskTimeoutMessages.
                        // When an entry for the task id is already tracked, only a warning is logged and
                        // nothing is queued.
                        async T.Task complete()
                        {
                            if (this.taskTimeoutMessages.ContainsKey(childTask.Id))
                            {
                                this.Logger.Warning("    Cannot add taskTimeout for job {0} task {1}", job.Id, childTask.Id);
                                return;
                            }

                            // NOTE(review): RequeueCount is taken from childTask here, while the "start"
                            // and "timeout" events built next to this function use job.RequeueCount -
                            // confirm the two values always agree.
                            var timeoutCompletion = new TaskCompletionMessage()
                            {
                                ChildIds     = childTask.ChildIds,
                                ExitCode     = -1,
                                Id           = childTask.Id,
                                JobId        = childTask.JobId,
                                JobType      = childTask.JobType,
                                RequeueCount = childTask.RequeueCount,
                                Timeouted    = true
                            };
                            var completionTimeoutMessage = new CloudQueueMessage(JsonConvert.SerializeObject(timeoutCompletion, Formatting.Indented));

                            // Third AddMessageAsync argument: the delay derived from the task's
                            // MaximumRuntimeSeconds.
                            var completionTimeoutDelay = TimeSpan.FromSeconds(childTask.MaximumRuntimeSeconds);
                            await jobTaskCompletionQueue.AddMessageAsync(completionTimeoutMessage, null, completionTimeoutDelay, null, null, token);

                            // Keep the queued message so it can be looked up by task id later.
                            this.taskTimeoutMessages.TryAdd(childTask.Id, completionTimeoutMessage);
                        }

                        await T.Task.WhenAll(dispatch(), cancel(), complete());
                        this.Logger.Information("    Dispatched job {0} task {1} to node {2}", childTask.JobId, childTask.Id, childTask.Node);
                    }

                    toBeUnlocked.State = targetState;
                    this.Logger.Information("    Updated job {0} task {1} state to {2} in memory", childTask.JobId, childTask.Id, childTask.State);
                }
            }));

            this.Logger.Information("Finished to process the batch of tasks of job {0}", jobPartitionKey);

            return(true);
        }