Esempio n. 1
0
        // Verifies that RenewJobRequestAsync keeps renewing after transient renew failures.
        // The mocked server succeeds on renew calls 1-4, throws TimeoutException on calls 5-7,
        // and on call 8 cancels the renewal token before returning a successful response.
        // The test then expects exactly 8 renew calls, 3 connection refreshes and 1 timeout reset.
        // NOTE(review): declared `async void`; async tests normally return Task. Signature left unchanged here.
        public async void DispatcherRenewJobRequestRecoverFromExceptions()
        {
            //Arrange
            using (var hc = new TestHostContext(this))
            // Scope the token source so it is disposed deterministically when the test ends.
            using (var cancellationTokenSource = new CancellationTokenSource())
            {
                int poolId = 1;
                Int64 requestId = 1000;
                int count = 0;

                var trace = hc.GetTrace(nameof(DispatcherRenewJobRequestRecoverFromExceptions));
                TaskCompletionSource<int> firstJobRequestRenewed = new TaskCompletionSource<int>();

                // LockedUntil is set through reflection (non-public members are included in the lookup).
                TaskAgentJobRequest request = new TaskAgentJobRequest();
                PropertyInfo lockUntilProperty = request.GetType().GetProperty("LockedUntil", BindingFlags.Instance | BindingFlags.NonPublic | BindingFlags.Public);
                Assert.NotNull(lockUntilProperty);
                lockUntilProperty.SetValue(request, DateTime.UtcNow.AddMinutes(5));

                hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
                hc.SetSingleton<IConfigurationStore>(_configurationStore.Object);
                _configurationStore.Setup(x => x.GetSettings()).Returns(new RunnerSettings()
                {
                    PoolId = 1
                });
                _runnerServer.Setup(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>()))
                .Returns(() =>
                {
                    count++;
                    if (!firstJobRequestRenewed.Task.IsCompletedSuccessfully)
                    {
                        trace.Info("First renew happens.");
                    }

                    if (count < 5)
                    {
                        // calls 1-4: renew succeeds.
                        return Task.FromResult<TaskAgentJobRequest>(request);
                    }
                    else if (count == 5 || count == 6 || count == 7)
                    {
                        // calls 5-7: simulated transient failure.
                        throw new TimeoutException("");
                    }
                    else
                    {
                        // call 8: request cancellation of the renew token, then succeed.
                        cancellationTokenSource.Cancel();
                        return Task.FromResult<TaskAgentJobRequest>(request);
                    }
                });

                var jobDispatcher = new JobDispatcher();
                jobDispatcher.Initialize(hc);

                //Act
                await jobDispatcher.RenewJobRequestAsync(poolId, requestId, Guid.Empty, firstJobRequestRenewed, cancellationTokenSource.Token);

                //Assert
                Assert.True(firstJobRequestRenewed.Task.IsCompletedSuccessfully, "First renew should succeed.");
                Assert.True(cancellationTokenSource.IsCancellationRequested);
                _runnerServer.Verify(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>()), Times.Exactly(8));
                _runnerServer.Verify(x => x.RefreshConnectionAsync(RunnerConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Exactly(3));
                _runnerServer.Verify(x => x.SetConnectionTimeout(RunnerConnectionType.JobRequest, It.IsAny<TimeSpan>()), Times.Once);
            }
        }
Esempio n. 2
0
        // Makes sure the given (previous) worker dispatch has finished, then removes it from _jobInfos and disposes it.
        //
        // If the dispatch is still running, the job request is looked up on the server:
        //   - the lookup fails        -> cancel the worker, wait for it to exit, rethrow the lookup exception.
        //   - the request has a result -> cancel the worker and give it 45 seconds to finish; throw if it does not.
        //   - the request has no result -> throw, the server sent a new job while the previous one is unfinished.
        //
        // Throws InvalidOperationException for the two "should never happen" situations above,
        // or whatever GetAgentRequestAsync threw when the lookup itself failed.
        private async Task EnsureDispatchFinished(WorkerDispatcher jobDispatch)
        {
            if (!jobDispatch.WorkerDispatch.IsCompleted)
            {
                // Based on the current design, the server only sends one job to a given agent at a time.
                // Receiving a new job request while a previous one is still running typically indicates one of two situations:
                // 1. an agent bug caused server and agent to disagree on the state of the job request, e.g. the agent did not renew the
                //    job request properly but thinks it still owns it, while the server already abandoned it.
                // 2. a server bug or design change that lets the server send more than one job request to an agent that has not finished the previous one.
                var agentServer = HostContext.GetService<IAgentServer>();
                TaskAgentJobRequest request = null;
                try
                {
                    request = await agentServer.GetAgentRequestAsync(_poolId, jobDispatch.RequestId, CancellationToken.None);
                }
                catch (Exception ex)
                {
                    // The job request cannot even be queried; stop the worker before giving up.
                    Trace.Error($"Catch exception while checking jobrequest {jobDispatch.JobId} status. Cancel running worker right away.");
                    Trace.Error(ex);

                    jobDispatch.WorkerCancellationTokenSource.Cancel();
                    try
                    {
                        // Wait for the worker dispatch to end before rethrowing, so it is not left running behind us.
                        await jobDispatch.WorkerDispatch;
                    }
                    catch (Exception workerEx)
                    {
                        // A failure of the worker dispatch must not replace the lookup failure that is rethrown below.
                        Trace.Error(workerEx);
                    }

                    // rethrow the original lookup exception.
                    throw;
                }

                if (request.Result != null)
                {
                    // The job request has been finished, the server already has a result.
                    // This means the agent is in a bad state since it is still running that request.
                    // Cancel the zombie worker and run the next job request.
                    Trace.Error($"Received job request while previous job {jobDispatch.JobId} still running on worker. Cancel the previous job since the job request have been finished on server side with result: {request.Result.Value}.");
                    jobDispatch.WorkerCancellationTokenSource.Cancel();

                    // wait 45 sec for worker to finish.
                    Task completedTask = await Task.WhenAny(jobDispatch.WorkerDispatch, Task.Delay(TimeSpan.FromSeconds(45)));

                    if (completedTask != jobDispatch.WorkerDispatch)
                    {
                        // At this point the job execution might be deadlocked and cannot even be canceled.
                        // No need to localize the exception string, this should never happen.
                        throw new InvalidOperationException("Job dispatch process has encountered unexpected error, the dispatch task is not able to be canceled within 45 seconds.");
                    }
                }
                else
                {
                    // Something is seriously wrong on the server side. Stop the agent from continuing.
                    // No need to localize the exception string, this should never happen.
                    throw new InvalidOperationException("Server send a new job request while the previous job request haven't finished.");
                }
            }

            try
            {
                await jobDispatch.WorkerDispatch;
                Trace.Info($"Job request {jobDispatch.JobId} processed succeed.");
            }
            catch (Exception ex)
            {
                // A failed dispatch is logged only; it does not propagate to the caller.
                Trace.Error($"Worker Dispatch failed witn an exception for job request {jobDispatch.JobId}.");
                Trace.Error(ex);
            }
            finally
            {
                WorkerDispatcher workerDispatcher;
                if (_jobInfos.TryRemove(jobDispatch.JobId, out workerDispatcher))
                {
                    Trace.Verbose($"Remove WorkerDispather from {nameof(_jobInfos)} dictionary for job {jobDispatch.JobId}.");
                    workerDispatcher.Dispose();
                }
            }
        }
Esempio n. 3
0
        // Verifies that RenewJobRequestAsync gives up when the very first renew never succeeds.
        // The mocked server throws TimeoutException on renew calls 1-5; the sixth call arms a
        // 10 second delayed cancel and throws InvalidOperationException. The test expects exactly
        // 6 renew calls, an uncompleted "first renew" task, and a token that is not yet canceled.
        // NOTE(review): declared `async void`; async tests normally return Task. Signature left unchanged here.
        public async void DispatcherRenewJobRequestFirstRenewRetrySixTimes()
        {
            //Arrange
            using (var hc = new TestHostContext(this))
            // Disposing the source also releases the delayed cancel armed by CancelAfter on the sixth mocked call.
            using (var cancellationTokenSource = new CancellationTokenSource())
            {
                int poolId = 1;
                Int64 requestId = 1000;
                int count = 0;

                var trace = hc.GetTrace(nameof(DispatcherRenewJobRequestFirstRenewRetrySixTimes));
                TaskCompletionSource<int> firstJobRequestRenewed = new TaskCompletionSource<int>();

                // LockedUntil is set through reflection (non-public members are included in the lookup).
                TaskAgentJobRequest request = new TaskAgentJobRequest();
                PropertyInfo lockUntilProperty = request.GetType().GetProperty("LockedUntil", BindingFlags.Instance | BindingFlags.NonPublic | BindingFlags.Public);
                Assert.NotNull(lockUntilProperty);
                lockUntilProperty.SetValue(request, DateTime.UtcNow.AddMinutes(5));

                hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
                hc.SetSingleton<IConfigurationStore>(_configurationStore.Object);
                _configurationStore.Setup(x => x.GetSettings()).Returns(new RunnerSettings()
                {
                    PoolId = 1
                });
                _runnerServer.Setup(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<string>(), It.IsAny<CancellationToken>()))
                .Returns(() =>
                {
                    count++;
                    if (!firstJobRequestRenewed.Task.IsCompletedSuccessfully)
                    {
                        trace.Info("First renew happens.");
                    }

                    if (count <= 5)
                    {
                        // calls 1-5: simulated transient failure.
                        throw new TimeoutException("");
                    }
                    else
                    {
                        // call 6 and later: schedule a cancel 10 seconds out, then fail.
                        cancellationTokenSource.CancelAfter(10000);
                        throw new InvalidOperationException("Should not reach here.");
                    }
                });

                var jobDispatcher = new JobDispatcher();
                jobDispatcher.Initialize(hc);

                //Act
                await jobDispatcher.RenewJobRequestAsync(poolId, requestId, Guid.Empty, Guid.NewGuid().ToString(), firstJobRequestRenewed, cancellationTokenSource.Token);

                //Assert
                Assert.False(firstJobRequestRenewed.Task.IsCompletedSuccessfully, "First renew should failed.");
                Assert.False(cancellationTokenSource.IsCancellationRequested);
                _runnerServer.Verify(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.Exactly(6));
            }
        }
Esempio n. 4
0
        // Runs a single job request through JobDispatcher with the run-once flag set (second argument of Run)
        // and checks that RunOnceJobCompleted is completed with the value true afterwards.
        // The process channel, process invoker and runner server are all mocks.
        // NOTE(review): declared `async void`; async tests normally return Task. Signature left unchanged here.
        public async void DispatchesOneTimeJobRequest()
        {
            //Arrange
            using (var hc = new TestHostContext(this))
            {
                var jobDispatcher = new JobDispatcher();
                hc.SetSingleton<IConfigurationStore>(_configurationStore.Object);
                hc.SetSingleton<IRunnerServer>(_runnerServer.Object);

                hc.EnqueueInstance<IProcessChannel>(_processChannel.Object);
                hc.EnqueueInstance<IProcessInvoker>(_processInvoker.Object);

                _configurationStore.Setup(x => x.GetSettings()).Returns(new RunnerSettings()
                {
                    PoolId = 1
                });
                jobDispatcher.Initialize(hc);

                Pipelines.AgentJobRequestMessage message = CreateJobRequestMessage();
                string strMessage = JsonUtility.ToString(message);

                // The mocked worker process is expected to be started with the two handles
                // handed out by the mocked channel below ("1" and "2"); it reports 56 as its result.
                _processInvoker.Setup(x => x.ExecuteAsync(It.IsAny<String>(), It.IsAny<String>(), "spawnclient 1 2", null, It.IsAny<CancellationToken>()))
                .Returns(Task.FromResult<int>(56));

                _processChannel.Setup(x => x.StartServer(It.IsAny<StartProcessDelegate>()))
                .Callback((StartProcessDelegate startDel) => { startDel("1", "2"); });
                _processChannel.Setup(x => x.SendAsync(MessageType.NewJobRequest, It.Is<string>(s => s.Equals(strMessage)), It.IsAny<CancellationToken>()))
                .Returns(Task.CompletedTask);

                // LockedUntil is set through reflection (non-public members are included in the lookup).
                var request = new TaskAgentJobRequest();
                PropertyInfo lockedUntilProperty = request.GetType().GetProperty("LockedUntil", BindingFlags.Instance | BindingFlags.NonPublic | BindingFlags.Public);
                Assert.NotNull(lockedUntilProperty);
                lockedUntilProperty.SetValue(request, DateTime.UtcNow.AddMinutes(5));

                _runnerServer.Setup(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>())).Returns(Task.FromResult<TaskAgentJobRequest>(request));

                _runnerServer.Setup(x => x.FinishAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<DateTime>(), It.IsAny<TaskResult>(), It.IsAny<CancellationToken>())).Returns(Task.FromResult<TaskAgentJobRequest>(new TaskAgentJobRequest()));

                //Act
                jobDispatcher.Run(message, true);

                //Assert
                await jobDispatcher.WaitAsync(CancellationToken.None);

                Assert.True(jobDispatcher.RunOnceJobCompleted.Task.IsCompleted, "JobDispatcher should set task complete token for one time agent.");
                // Reading Result is only reached once the task is known to be completed (assert above).
                Assert.True(jobDispatcher.RunOnceJobCompleted.Task.Result, "JobDispatcher should set task complete token to 'TRUE' for one time agent.");
            }
        }
Esempio n. 5
0
        // Runs a single job request through JobDispatcher and waits for the dispatcher to finish.
        // The process channel, process invoker and agent server are all mocks; the test passes
        // when Run and WaitAsync complete without throwing.
        // NOTE(review): declared `async void`; async tests normally return Task. Signature left unchanged here.
        public async void DispatchesJobRequest()
        {
            //Arrange
            using (var hc = new TestHostContext(this))
            {
                var jobDispatcher = new JobDispatcher();
                hc.SetSingleton<IConfigurationStore>(_configurationStore.Object);
                hc.SetSingleton<IAgentServer>(_agentServer.Object);

                hc.EnqueueInstance<IProcessChannel>(_processChannel.Object);
                hc.EnqueueInstance<IProcessInvoker>(_processInvoker.Object);

                _configurationStore.Setup(x => x.GetSettings()).Returns(new AgentSettings() { PoolId = 1 });
                jobDispatcher.Initialize(hc);

                JobRequestMessage message = CreateJobRequestMessage();
                string strMessage = JsonUtility.ToString(message);

                // The mocked worker process is expected to be started with the two handles
                // handed out by the mocked channel below ("1" and "2"); it reports 56 as its result.
                _processInvoker.Setup(x => x.ExecuteAsync(It.IsAny<String>(), It.IsAny<String>(), "spawnclient 1 2", null, It.IsAny<CancellationToken>()))
                    .Returns(Task.FromResult<int>(56));

                _processChannel.Setup(x => x.StartServer(It.IsAny<StartProcessDelegate>()))
                    .Callback((StartProcessDelegate startDel) => { startDel("1","2"); });
                _processChannel.Setup(x => x.SendAsync(MessageType.NewJobRequest, It.Is<string>(s => s.Equals(strMessage)), It.IsAny<CancellationToken>()))
                    .Returns(Task.CompletedTask);

                // LockedUntil is set through reflection (non-public members are included in the lookup).
                var request = new TaskAgentJobRequest();
                PropertyInfo lockedUntilProperty = request.GetType().GetProperty("LockedUntil", BindingFlags.Instance | BindingFlags.NonPublic | BindingFlags.Public);
                Assert.NotNull(lockedUntilProperty);
                lockedUntilProperty.SetValue(request, DateTime.UtcNow.AddMinutes(5));

                _agentServer.Setup(x => x.RenewAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<CancellationToken>())).Returns(Task.FromResult<TaskAgentJobRequest>(request));

                _agentServer.Setup(x => x.FinishAgentRequestAsync(It.IsAny<int>(), It.IsAny<long>(), It.IsAny<Guid>(), It.IsAny<DateTime>(), It.IsAny<TaskResult>(), It.IsAny<CancellationToken>())).Returns(Task.FromResult<TaskAgentJobRequest>(new TaskAgentJobRequest()));

                //Act
                jobDispatcher.Run(message);

                //Assert
                await jobDispatcher.WaitAsync(CancellationToken.None);
            }
        }
Esempio n. 6
0
        // Keeps renewing the lock on a job request while the job is running.
        //
        //   poolId / requestId / lockToken: forwarded unchanged to IAgentServer.RenewAgentRequestAsync.
        //   firstJobRequestRenewed: completed with 0 after the first successful renew.
        //   token: stops the renew loop; also passed to the renew call and to every delay.
        //
        // The loop ends (without throwing) when the token is canceled, when the server reports the job as
        // not found or its token as expired, or when the retry budget is used up: 5 retries spaced 10 seconds
        // apart while the first renew has not succeeded yet, afterwards until the last known LockedUntil is reached.
        private async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToken, TaskCompletionSource<int> firstJobRequestRenewed, CancellationToken token)
        {
            var agentServer = HostContext.GetService<IAgentServer>();
            int firstRenewRetryLimit = 5;
            TaskAgentJobRequest request = null;

            // renew lock during job running.
            // stop renewing only when the cancellation token is signaled or the exception persists after retry.
            while (!token.IsCancellationRequested)
            {
                try
                {
                    request = await agentServer.RenewAgentRequestAsync(poolId, requestId, lockToken, token);

                    Trace.Info($"Successfully renew job request, job is valid till {request.LockedUntil.Value}");

                    if (!firstJobRequestRenewed.Task.IsCompleted)
                    {
                        // signal that the first renew succeeded.
                        firstJobRequestRenewed.TrySetResult(0);
                    }

                    // renew again after 60 sec delay
                    await Task.Delay(TimeSpan.FromSeconds(60), token);
                }
                catch (TaskAgentJobNotFoundException)
                {
                    // no need for retry. the job is not valid anymore.
                    Trace.Info("TaskAgentJobNotFoundException received, job is no longer valid, stop renew job request.");
                    return;
                }
                catch (TaskAgentJobTokenExpiredException)
                {
                    // no need for retry. the job is not valid anymore.
                    Trace.Info("TaskAgentJobTokenExpiredException received, job is no longer valid, stop renew job request.");
                    return;
                }
                catch (OperationCanceledException) when (token.IsCancellationRequested)
                {
                    // A cancellation exception may also come from an http timeout; only stop when our own token fired.
                    // OperationCanceledException is the base of TaskCanceledException, so both are covered.
                    Trace.Info("job renew has been canceled, stop renew job request.");
                    return;
                }
                catch (Exception ex)
                {
                    // Record what went wrong before retrying instead of dropping the exception silently.
                    Trace.Error(ex);

                    // retry
                    TimeSpan remainingTime = TimeSpan.Zero;
                    if (!firstJobRequestRenewed.Task.IsCompleted)
                    {
                        // retry 5 times every 10 sec for the first renew
                        if (firstRenewRetryLimit-- > 0)
                        {
                            remainingTime = TimeSpan.FromSeconds(10);
                        }
                    }
                    else
                    {
                        // retry until LockedUntil of the last successful renew is reached
                        remainingTime = request.LockedUntil.Value - DateTime.UtcNow;
                    }

                    if (remainingTime <= TimeSpan.Zero)
                    {
                        // out of retries.
                        return;
                    }

                    Trace.Verbose($"Retrying lock renewal. Job is still locked for: {remainingTime.TotalSeconds} seconds.");

                    // never wait longer than one minute between attempts.
                    TimeSpan delayTime = remainingTime.TotalSeconds > 60 ? TimeSpan.FromMinutes(1) : remainingTime;
                    try
                    {
                        await Task.Delay(delayTime, token);
                    }
                    catch (OperationCanceledException) when (token.IsCancellationRequested)
                    {
                        // Cancellation while backing off is a normal stop, not a failure of this method.
                        Trace.Info("job renew has been canceled, stop renew job request.");
                        return;
                    }
                }
            }
        }
Esempio n. 7
0
        // Periodically renews the lock on a job request until the token is canceled, the server reports
        // the job as gone, or the retry budget is exhausted.
        //
        //   poolId / requestId / lockToken: forwarded unchanged to IAgentServer.RenewAgentRequestAsync.
        //   firstJobRequestRenewed: completed with 0 after the first successful renew.
        //   token: stops the loop; also passed to the renew call and to every delay.
        //
        // Retry budget: 5 retries while the first renew has not succeeded; afterwards retries continue
        // until 5 minutes past the LockedUntil of the last successful renew.
        public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToken, TaskCompletionSource<int> firstJobRequestRenewed, CancellationToken token)
        {
            var agentServer = HostContext.GetService<IAgentServer>();
            TaskAgentJobRequest lastRenewed = null;
            int firstRenewRetriesLeft = 5;
            int consecutiveErrors = 0;

            // Keep renewing while the job runs; leave only on cancellation or when retries are used up.
            while (!token.IsCancellationRequested)
            {
                try
                {
                    lastRenewed = await agentServer.RenewAgentRequestAsync(poolId, requestId, lockToken, token);

                    Trace.Info($"Successfully renew job request {requestId}, job is valid till {lastRenewed.LockedUntil.Value}");

                    // Signal the first successful renew exactly once.
                    if (!firstJobRequestRenewed.Task.IsCompleted)
                    {
                        firstJobRequestRenewed.TrySetResult(0);
                    }

                    // Coming back from an error streak: reset the counter and restore the 60s connection timeout.
                    if (consecutiveErrors > 0)
                    {
                        consecutiveErrors = 0;
                        agentServer.SetConnectionTimeout(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(60));
                        HostContext.WritePerfCounter("JobRenewRecovered");
                    }

                    // Next renew is due in 60 seconds.
                    await HostContext.Delay(TimeSpan.FromSeconds(60), token);
                }
                catch (TaskAgentJobNotFoundException)
                {
                    // The job is not valid anymore, retrying is pointless.
                    Trace.Info($"TaskAgentJobNotFoundException received when renew job request {requestId}, job is no longer valid, stop renew job request.");
                    return;
                }
                catch (TaskAgentJobTokenExpiredException)
                {
                    // The job is not valid anymore, retrying is pointless.
                    Trace.Info($"TaskAgentJobTokenExpiredException received renew job request {requestId}, job is no longer valid, stop renew job request.");
                    return;
                }
                catch (OperationCanceledException) when (token.IsCancellationRequested)
                {
                    // A cancellation exception can also stem from an http timeout;
                    // stop only when our own token has fired.
                    Trace.Info($"job renew has been canceled, stop renew job request {requestId}.");
                    return;
                }
                catch (Exception ex)
                {
                    Trace.Error($"Catch exception during renew agent jobrequest {requestId}.");
                    Trace.Error(ex);
                    consecutiveErrors++;

                    // Work out how much longer retrying is allowed.
                    TimeSpan retryWindow = TimeSpan.Zero;
                    if (firstJobRequestRenewed.Task.IsCompleted)
                    {
                        // Retry until LockedUntil plus a 5 minute buffer.
                        retryWindow = lastRenewed.LockedUntil.Value + TimeSpan.FromMinutes(5) - DateTime.UtcNow;
                    }
                    else if (firstRenewRetriesLeft-- > 0)
                    {
                        // The first renew gets 5 retries.
                        retryWindow = TimeSpan.FromSeconds(10);
                    }

                    if (retryWindow <= TimeSpan.Zero)
                    {
                        Trace.Info($"Lock renewal has run out of retry, stop renew lock for jobrequest {requestId}.");
                        HostContext.WritePerfCounter("JobRenewReachLimit");
                        return;
                    }

                    // Choose the random back-off range for this attempt.
                    TimeSpan backoffMin;
                    TimeSpan backoffMax;
                    if (firstJobRequestRenewed.Task.IsCompleted)
                    {
                        Trace.Info($"Retrying lock renewal for jobrequest {requestId}. Job is valid until {lastRenewed.LockedUntil.Value}.");

                        // Back off harder once more than 5 errors happened in a row.
                        bool longErrorStreak = consecutiveErrors > 5;
                        backoffMin = TimeSpan.FromSeconds(longErrorStreak ? 15 : 5);
                        backoffMax = TimeSpan.FromSeconds(longErrorStreak ? 30 : 15);
                    }
                    else
                    {
                        Trace.Info($"Retrying lock renewal for jobrequest {requestId}. The first job renew request has failed.");
                        backoffMin = TimeSpan.FromSeconds(1);
                        backoffMax = TimeSpan.FromSeconds(10);
                    }

                    TimeSpan backoff = BackoffTimerHelper.GetRandomBackoff(backoffMin, backoffMax);

                    // Re-establish the connection to the server in order to avoid affinity with one server,
                    // using a reduced 30 second connection timeout.
                    HostContext.WritePerfCounter("ResetJobRenewConnection");
                    await agentServer.RefreshConnectionAsync(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(30));

                    try
                    {
                        // Wait out the back-off before the next attempt.
                        await HostContext.Delay(backoff, token);
                    }
                    catch (OperationCanceledException) when (token.IsCancellationRequested)
                    {
                        // The loop condition sees the canceled token and ends the method.
                        Trace.Info($"job renew has been canceled, stop renew job request {requestId}.");
                    }
                }
            }
        }
Esempio n. 8
0
        // Makes sure the given worker dispatch has finished, then removes it from _jobInfos and disposes it.
        //
        //   jobDispatch: the dispatch to wait for.
        //   cancelRunningJob: when true and the dispatch is still running, cancel it, wait for it and return
        //                     right away (used when the runner shuts down).
        //
        // If the dispatch is still running and cancelRunningJob is false, the job request is looked up on the server:
        //   - the lookup fails         -> cancel the worker, wait for it to exit, rethrow the lookup exception.
        //   - the request has a result  -> cancel the worker and give it 45 seconds to finish; throw if it does not.
        //   - the request has no result -> throw, the server sent a new job while the previous one is unfinished.
        //
        // Throws InvalidOperationException for the two "should never happen" situations above,
        // or whatever GetAgentRequestAsync threw when the lookup itself failed.
        private async Task EnsureDispatchFinished(WorkerDispatcher jobDispatch, bool cancelRunningJob = false)
        {
            if (!jobDispatch.WorkerDispatch.IsCompleted)
            {
                if (cancelRunningJob)
                {
                    // cancel running job when shutting down the runner.
                    // this will happen when the runner gets Ctrl+C or the message queue loop crashed.
                    jobDispatch.WorkerCancellationTokenSource.Cancel();
                    // wait for worker process exit then return.
                    await jobDispatch.WorkerDispatch;

                    return;
                }

                // Based on the current design, the server only sends one job to a given runner at a time.
                // Receiving a new job request while a previous one is still running typically indicates one of two situations:
                // 1. a runner bug caused server and runner to disagree on the state of the job request, e.g. the runner did not renew the
                //    job request properly but thinks it still owns it, while the server already abandoned it.
                // 2. a server bug or design change that lets the server send more than one job request to a runner that has not finished the previous one.
                var runnerServer = HostContext.GetService<IRunnerServer>();
                TaskAgentJobRequest request = null;
                try
                {
                    request = await runnerServer.GetAgentRequestAsync(_poolId, jobDispatch.RequestId, CancellationToken.None);
                }
                catch (Exception ex)
                {
                    // we can't even query for the jobrequest from server, something totally busted, stop runner/worker.
                    Trace.Error($"Catch exception while checking jobrequest {jobDispatch.JobId} status. Cancel running worker right away.");
                    Trace.Error(ex);

                    jobDispatch.WorkerCancellationTokenSource.Cancel();
                    try
                    {
                        // make sure worker process exit before we rethrow, otherwise we might leave orphan worker process behind.
                        await jobDispatch.WorkerDispatch;
                    }
                    catch (Exception workerEx)
                    {
                        // A failure of the worker dispatch must not replace the lookup failure that is rethrown below;
                        // without this guard the `throw;` would never be reached.
                        Trace.Error(workerEx);
                    }

                    // rethrow original exception
                    throw;
                }

                if (request.Result != null)
                {
                    // The job request has been finished, the server already has a result.
                    // This means the runner is in a bad state since it is still running that request.
                    // Cancel the zombie worker and run the next job request.
                    Trace.Error($"Received job request while previous job {jobDispatch.JobId} still running on worker. Cancel the previous job since the job request have been finished on server side with result: {request.Result.Value}.");
                    jobDispatch.WorkerCancellationTokenSource.Cancel();

                    // wait 45 sec for worker to finish.
                    Task completedTask = await Task.WhenAny(jobDispatch.WorkerDispatch, Task.Delay(TimeSpan.FromSeconds(45)));

                    if (completedTask != jobDispatch.WorkerDispatch)
                    {
                        // At this point the job execution might be deadlocked and cannot even be canceled.
                        // No need to localize the exception string, this should never happen.
                        throw new InvalidOperationException($"Job dispatch process for {jobDispatch.JobId} has encountered unexpected error, the dispatch task is not able to be canceled within 45 seconds.");
                    }
                }
                else
                {
                    // Something is seriously wrong on the server side. Stop the runner from continuing.
                    // No need to localize the exception string, this should never happen.
                    throw new InvalidOperationException($"Server send a new job request while the previous job request {jobDispatch.JobId} haven't finished.");
                }
            }

            try
            {
                await jobDispatch.WorkerDispatch;
                Trace.Info($"Job request {jobDispatch.JobId} processed succeed.");
            }
            catch (Exception ex)
            {
                // A failed dispatch is logged only; it does not propagate to the caller.
                Trace.Error($"Worker Dispatch failed with an exception for job request {jobDispatch.JobId}.");
                Trace.Error(ex);
            }
            finally
            {
                WorkerDispatcher workerDispatcher;
                if (_jobInfos.TryRemove(jobDispatch.JobId, out workerDispatcher))
                {
                    Trace.Verbose($"Remove WorkerDispather from {nameof(_jobInfos)} dictionary for job {jobDispatch.JobId}.");
                    workerDispatcher.Dispose();
                }
            }
        }