// Re-queues every action the action store still reports as running, oldest first
// (ordered by TimeReceived).
//
// For node commands the in-memory collaborators (node synchronizer and stopped-node
// table) are re-attached and the node name is registered with the synchronizer before
// the action is resumed.
//
// Note: neither 'fc' nor 'cancellationToken' is read by this method.
public async Task ResumePendingActionsAsync(FabricClient fc, CancellationToken cancellationToken)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Getting running actions");
    IEnumerable<ActionStateBase> runningActions = await this.actionStore.GetRunningActionsAsync();
    IEnumerable<ActionStateBase> oldestFirst = runningActions.OrderBy(pending => pending.TimeReceived);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Done getting running actions");

    foreach (ActionStateBase pendingState in oldestFirst)
    {
        NodeCommandState nodeCommand = pendingState as NodeCommandState;
        if (nodeCommand != null)
        {
            nodeCommand.NodeSync = this.entitySynch.NodeSynchronizer;
            nodeCommand.StoppedNodeTable = this.stoppedNodeTable;
            this.entitySynch.NodeSynchronizer.Add(nodeCommand.Info.NodeName);
        }

        // The constructed action itself is not used here. The call is kept because
        // ConstructActionAsync moves a state that is still at IntentSaved forward to
        // LookingUpState and persists it before the action is queued.
        await this.ConstructActionAsync(pendingState.ActionType, pendingState);

        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "{0} - Resuming action of type {1}",
            pendingState.OperationId,
            pendingState.ActionType);
        this.Enqueue(pendingState);
    }
}
// Drives one action through the state machine until the newest entry in its
// StateProgress is either CompletedSuccessfully or Failed.
//
// Each pass:
//   1. honours the cancellation token,
//   2. re-reads the rollback state; a forced rollback marks the action Failed, persists it and stops,
//   3. runs the state machine once,
//   4. if the state machine asked for a retry, clears the error, resets the rollback
//      state, persists, and pauses for a random interval before looping again.
//
// FabricNotPrimaryException and FabricObjectClosedException are traced and swallowed;
// every other exception is traced and rethrown.
public async Task RunAsync(FabricClient fc, FabricTestAction action, ActionStateBase actionState, ServiceInternalFaultInfo serviceInternalFaultInfo, CancellationToken cancellationToken)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Inside RunAsync of Engine, entering state machine", actionState.OperationId);

    try
    {
        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.OuterLoop).ConfigureAwait(false);

            // Only the force case stops here. For the non-force case we still need to
            // clean up, so execution deliberately continues into the state machine.
            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                actionState.StateProgress.Push(StepStateNames.Failed);
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                break;
            }

            await this.RunStateMachineAsync(fc, action, actionState, serviceInternalFaultInfo, cancellationToken).ConfigureAwait(false);

            if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
            {
                actionState.ErrorCausingRollback = 0;
                int pauseTime = this.random.Next(10, 60);
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Pausing for {1} seconds before retrying", actionState.OperationId, pauseTime);

                // Clear the rollback state so it will go forward when it resumes.
                actionState.RollbackState = RollbackState.NotRollingBack;
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                await Task.Delay(TimeSpan.FromSeconds(pauseTime), cancellationToken).ConfigureAwait(false);
            }

            // Terminal states end the loop; anything else gets another pass.
            StepStateNames newestStep = actionState.StateProgress.Peek();
            if (newestStep == StepStateNames.CompletedSuccessfully || newestStep == StepStateNames.Failed)
            {
                break;
            }
        }
    }
    catch (FabricNotPrimaryException notPrimary)
    {
        FaultAnalysisServiceUtility.TraceFabricNotPrimary(actionState.OperationId, notPrimary);
    }
    catch (FabricObjectClosedException objectClosed)
    {
        FaultAnalysisServiceUtility.TraceFabricObjectClosed(actionState.OperationId, objectClosed);
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} caught exception - {1}", actionState.OperationId, e);
        throw;
    }

    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Exiting state machine", actionState.OperationId);
}
// Runs the cleanup of the step that is currently on top of actionState.StateProgress
// and pops that step once cleanup succeeds.
//
// Cleanup is retried indefinitely: a failing CleanupAsync is traced and attempted again
// after commandStepRetryBackoffInSeconds. The loop ends without popping when a forced
// rollback is observed, and with an exception when the cancellation token fires.
// Any exception escaping the loop is traced and rethrown.
private async Task CleanupStepAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    CancellationToken cancellationToken,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    StepStateNames stepToUndo = actionState.StateProgress.Peek();
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Cleaning up state={0}, name={1}, key={2}", stepToUndo, actionState.ActionType, actionState.OperationId);

    StepBase step = action.GetStep(fabricClient, actionState, stepToUndo, cancellationToken);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Cleaning up {1}", actionState.OperationId, step.StepName);

    try
    {
        for (;;)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.InnerCleanupLoop).ConfigureAwait(false);

            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                // A forced rollback abandons cleanup immediately.
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackForce", actionState.OperationId);
                return;
            }

            if (observedRollbackState == RollbackState.RollingBackDueToUserCancel)
            {
                // Nothing to do, a rollback is already in progress.
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackDueToUserCancel", actionState.OperationId);
            }

            try
            {
                await step.CleanupAsync(cancellationToken).ConfigureAwait(false);
                actionState.StateProgress.Pop();
                return;
            }
            catch (Exception cleanupException)
            {
                TestabilityTrace.TraceSource.WriteWarning(
                    TraceType,
                    "{0} - Cleanup of action type={1}, failed with {2}, retrying",
                    actionState.OperationId,
                    actionState.ActionType,
                    cleanupException);
            }

            // Only reached after a failed attempt: back off before trying again.
            await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
        }
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - CleanupStepAsync, error: {1}", actionState.OperationId, e.ToString());
        throw;
    }
}
// Executes the step that is currently on top of actionState.StateProgress.
//
// The step is attempted in a loop:
//   - a graceful user cancel ends the loop, unless RetryStepWithoutRollingBackOnFailure is set;
//   - a forced rollback always ends the loop;
//   - a successful RunAsync ends the loop;
//   - a failing RunAsync is rethrown, unless RetryStepWithoutRollingBackOnFailure is set,
//     in which case the exception is handed to ProcessRetryStepExceptions and the step is
//     retried after commandStepRetryBackoffInSeconds.
// Any exception escaping the loop is traced and rethrown.
private async Task RunStepAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    CancellationToken cancellationToken,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    StepStateNames stepToRun = actionState.StateProgress.Peek();
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Running state={0}, name={1}, key={2}", stepToRun, actionState.ActionType, actionState.OperationId);

    StepBase step = action.GetStep(fabricClient, actionState, stepToRun, cancellationToken);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Running {1}", actionState.OperationId, step.StepName);

    try
    {
        for (;;)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.InnerForwardLoop).ConfigureAwait(false);
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - readRollbackState={1}", actionState.OperationId, observedRollbackState);

            // If RetryStepWithoutRollingbackOnFailure == true, then don't allow graceful user cancel
            if (!actionState.RetryStepWithoutRollingBackOnFailure && (observedRollbackState == RollbackState.RollingBackDueToUserCancel))
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - read RollingBackDueToUserCancel breaking from Run loop inside RunStepAsync()", actionState.OperationId);
                return;
            }

            // RollingBackForce always stops execution
            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                return;
            }

            try
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling Step.Run()", actionState.OperationId, actionState.ActionType);
                await step.RunAsync(cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling break after run", actionState.OperationId, actionState.ActionType);
                return;
            }
            catch (Exception runException)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0}, {1} - runException {2}", actionState.OperationId, actionState.ActionType, runException);

                if (!actionState.RetryStepWithoutRollingBackOnFailure)
                {
                    throw;
                }

                TestabilityTrace.TraceSource.WriteWarning(
                    TraceType,
                    "{0}, {1} has RetryStepWithoutRollingbackOnFailure set to true, retrying step name='{2}'.  Caught exception: {3}",
                    actionState.OperationId,
                    actionState.ActionType,
                    step.StepName,
                    runException);

                this.ProcessRetryStepExceptions(actionState.OperationId, runException);
            }

            // Only reached when the attempt failed and is being retried: back off first.
            await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
        }
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunState, error: {1}", actionState.OperationId, e.ToString());
        throw;
    }
}
// Runs one pass of the action state machine.
//
// Phase 1 (forward): while the action is not rolling back (or is configured to retry
// steps without rolling back and is not force-cancelled), steps are run one at a time via
// RunStepAsync and the state is persisted after each step, until the newest entry in
// StateProgress is CompletedSuccessfully. A step failure either ends the forward phase
// with a rollback state chosen by HandleRollback, or, for RetrySameStepException, simply
// loops and runs the same step again.
//
// Phase 2 (rollback / finalisation): depending on the resulting RollbackState the completed
// steps are cleaned up in reverse via CleanupStepAsync until IntentSaved (or Failed) is
// reached, and the final state (LookingUpState for a retry, Failed otherwise) is pushed
// and persisted.
//
// FabricNotPrimaryException, FabricObjectClosedException and (from the outer rollback
// handler) OperationCanceledException are propagated to the caller; any other exception
// in the rollback phase fail-fasts the process.
private async Task RunStateMachineAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    CancellationToken cancellationToken)
{
    if (actionState.StateProgress == null || actionState.StateProgress.Count == 0)
    {
        ReleaseAssert.AssertIf(actionState.StateProgress == null || actionState.StateProgress.Count == 0, "ActionProgress should not be null or empty");
    }

    Exception actionError = null;
    if (actionState.RollbackState == RollbackState.NotRollingBack ||
        (actionState.RollbackState != RollbackState.RollingBackForce && actionState.RetryStepWithoutRollingBackOnFailure))
    {
        // TODO: should also include Error
        while (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoop).ConfigureAwait(false);
            if ((readRollbackState == RollbackState.RollingBackForce) ||
                ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
            {
                break;
            }

            // The failure is captured here and handled after the try/catch so that the
            // follow-up cancellation check can be awaited instead of blocking the thread.
            Exception stepError = null;
            try
            {
                await this.RunStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(actionState.OperationId, serviceInternalFaultInfo, actionState, cancellationToken);

                if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
                {
                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - completed successfully, clearing ErrorCausingRollback", actionState.OperationId);
                    actionState.ErrorCausingRollback = 0;
                }

                actionState.TimeStopped = DateTime.UtcNow;
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
            }
            catch (RetrySameStepException)
            {
                // Retry the command in the same step - do not rollback or go forward, and do not call ActionStore.UpdateActionStateAsync().
                TestabilityTrace.TraceSource.WriteWarning(
                    TraceType,
                    "{0} - threw RetrySameStepException, retrying state {1} ",
                    actionState.OperationId,
                    actionState.StateProgress.Peek());
            }
            catch (FabricNotPrimaryException)
            {
                throw;
            }
            catch (FabricObjectClosedException)
            {
                throw;
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunStateMachineAsync caught: {1}", actionState.OperationId, e.ToString());
                stepError = e;
            }

            if (stepError == null)
            {
                continue;
            }

            readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoopExceptionBlock).ConfigureAwait(false);

            // 1st line: if this is a force rollback (RollingBackForce), just exit
            // 2nd line: if !RetryStepWithoutRollingBackOnFailure and there was a graceful cancel then exit this block and proceed to the rollback code block below.
            // If RetryStepWithoutRollingBackOnFailure is true, which it is only for the node steps today, then first call HandleRollback to translate the exception.
            if ((readRollbackState == RollbackState.RollingBackForce) ||
                ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
            {
                break;
            }

            bool isRetryable = this.HandleRollback(actionState.OperationId, stepError);
            if (isRetryable)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed retryable exception, will retry action. Exception: {1}", actionState.OperationId, stepError.ToString());
                actionState.RollbackState = RollbackState.RollingBackAndWillRetryAction;
            }
            else
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed non-retryable exception. Exception: {1}", actionState.OperationId, stepError.ToString());
                actionState.RollbackState = RollbackState.RollingBackAndWillFailAction;
            }

            actionError = stepError;
            break;
        }
    }

    if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ||
        actionState.RollbackState == RollbackState.RollingBackAndWillFailAction ||
        (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel && (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)))
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Rollingback type={1}", actionState.OperationId, actionState.ActionType);
        if (!this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
        {
            string error = string.Format(CultureInfo.InvariantCulture, "{0} - state should not be CompletedSuccessfully", actionState.OperationId);
            TestabilityTrace.TraceSource.WriteError(TraceType, error);
            ReleaseAssert.Failfast(error);
        }

        // actionError is only set when the forward phase above observed the failure in this call.
        // When it is null (presumably a resumed rollback) ErrorCausingRollback is left untouched,
        // as it must already have been set.
        if (actionError != null)
        {
            actionState.ErrorCausingRollback = TranslateRollbackError(actionError.HResult);
            TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Translated ErrorCausingRollback ={1}", actionState.OperationId, actionState.ErrorCausingRollback);
        }

        if (this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
        {
            // In test mode it's intentionally possible to fault an action after it's completed its work, but before the state name has been updated.
            actionState.StateProgress.Pop();
        }

        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

        try
        {
            while (actionState.StateProgress.Peek() != StepStateNames.IntentSaved &&
                   actionState.StateProgress.Peek() != StepStateNames.Failed)
            {
                cancellationToken.ThrowIfCancellationRequested();

                RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.OuterCleanupLoop).ConfigureAwait(false);
                if (readRollbackState == RollbackState.RollingBackDueToUserCancel)
                {
                    // Do nothing, already rolling back - debug only
                    TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackDueToUserCancel in outer rollback loop", actionState.OperationId);
                }
                else if (readRollbackState == RollbackState.RollingBackForce)
                {
                    TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackForce in outer rollback loop", actionState.OperationId);
                    break;
                }

                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - DEBUG - Rollback path loop, current state {1}", actionState.OperationId, actionState.StateProgress.Peek());

                try
                {
                    await this.CleanupStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);
                    await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                }
                catch (FabricNotPrimaryException)
                {
                    throw;
                }
                catch (FabricObjectClosedException)
                {
                    throw;
                }
                catch (Exception e)
                {
                    // NOTE(review): CleanupStepAsync rethrows OperationCanceledException when the token fires,
                    // so cancellation during a cleanup step ends up here and fail-fasts, whereas the outer
                    // handler below rethrows it. Confirm that this difference is intended.
                    ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
                }
            }

            // If this is true rollback is finished. If it is retryable set the state to LookingUpState
            if (actionState.StateProgress.Peek() == StepStateNames.IntentSaved)
            {
                if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    actionState.ClearInfo();
                }
                else if (actionState.RollbackState == RollbackState.RollingBackAndWillFailAction)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.RollbackState = RollbackState.NotRollingBack;
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else if (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else if (actionState.RollbackState == RollbackState.RollingBackForce)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - RollbackState == NotRollingBack not expected", actionState.OperationId);
                    ReleaseAssert.Failfast(error);
                }
            }
            else if (actionState.RollbackState == RollbackState.RollingBackForce)
            {
                actionState.StateProgress.Push(StepStateNames.Failed);
                actionState.TimeStopped = DateTime.UtcNow;
            }
        }
        catch (OperationCanceledException)
        {
            // This means the cancellation token is set, not that an api call observed an E_ABORT
            throw;
        }
        catch (FabricNotPrimaryException)
        {
            throw;
        }
        catch (FabricObjectClosedException)
        {
            throw;
        }
        catch (Exception e)
        {
            ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
        }

        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "{0} - Action failed, type='{1}', will retry={2}, RollbackState={3}",
            actionState.OperationId,
            actionState.ActionType,
            actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ? "true" : "false",
            actionState.RollbackState);

        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
    {
        // user cancelled, but action/command completed anyways before cancellation was checked.
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} - Action type '{1}' completed successfully, not updating again ", actionState.OperationId, actionState.ActionType);
    }
    else if ((actionState.StateProgress.Peek() == StepStateNames.IntentSaved) &&
             (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel))
    {
        actionState.StateProgress.Push(StepStateNames.Failed);
        actionState.TimeStopped = DateTime.UtcNow;
        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else if (actionState.RollbackState == RollbackState.RollingBackForce)
    {
        // Note: unlike the case above this does not have a state of IntentSaved as a requirement since a force rollback is an abort and does run the steps in reverse.
        // It is possible for the StateProgress to be CompletedSuccessfully here, since we want to exit as quickly as possible. In that case, the block 2 blocks above handles it -
        // we do nothing extra, and the command finishes executing. If the user calls an api for information on this command, we translate the state to ForceCancelled if state is a terminal state
        // and RollbackState is RollingBackForce. See ActionStore.MatchesStateFilter().
        actionState.TimeStopped = DateTime.UtcNow;
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "Bottom of Engine.RunAsync() - state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
        actionState.StateProgress.Push(StepStateNames.Failed);
        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else
    {
        string unexpectedError = string.Format(CultureInfo.InvariantCulture, "Unexpected case reached, state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
        TestabilityTrace.TraceSource.WriteError(TraceType, "{0}", unexpectedError);
        ReleaseAssert.Failfast(unexpectedError);
    }
}
// Builds the FabricTestAction that matches 'actionType' from a persisted action state.
//
// Before the action object is created, a state whose newest step is still IntentSaved is
// advanced to LookingUpState and persisted (see PromoteIntentSavedToLookingUpStateAsync).
// For StartNode and StopNode the shared stopped-node table is attached to the state first.
//
// Returns null, after tracing "Unknown actionType", when the action type is not one of the
// types handled below.
//
// 'actionStateBase' must be of the state type that belongs to 'actionType'; a mismatch
// fails at the cast with InvalidCastException.
private async Task<FabricTestAction> ConstructActionAsync(ActionType actionType, ActionStateBase actionStateBase)
{
    FabricTestAction action = null;

    if (actionType == ActionType.InvokeDataLoss)
    {
        InvokeDataLossState actionState = (InvokeDataLossState)actionStateBase;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        action = new InvokeDataLossAction(
            this.stateManager,
            this.Partition,
            actionState,
            actionState.Info.PartitionSelector,
            actionState.Info.DataLossMode,
            this.dataLossCheckWaitDurationInSeconds,
            this.dataLossCheckPollIntervalInSeconds,
            this.replicaDropWaitDurationInSeconds,
            this.requestTimeout,
            this.operationTimeout);
    }
    else if (actionType == ActionType.InvokeQuorumLoss)
    {
        InvokeQuorumLossState actionState = (InvokeQuorumLossState)actionStateBase;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        // This is the case for resuming an action after a failover
        action = new InvokeQuorumLossAction(
            this.stateManager,
            this.Partition,
            actionState,
            actionState.Info.PartitionSelector,
            actionState.Info.QuorumLossMode,
            actionState.Info.QuorumLossDuration,
            this.requestTimeout,
            this.operationTimeout);
    }
    else if (actionType == ActionType.RestartPartition)
    {
        RestartPartitionState actionState = (RestartPartitionState)actionStateBase;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        // This is the case for resuming an action after a failover
        action = new RestartPartitionAction(
            this.stateManager,
            this.Partition,
            actionState,
            actionState.Info.PartitionSelector,
            actionState.Info.RestartPartitionMode,
            this.requestTimeout,
            this.operationTimeout);
    }
    else if (actionType == ActionType.TestStuck)
    {
        StuckState actionState = (StuckState)actionStateBase;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        action = new StuckAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.TestRetryStep)
    {
        TestRetryStepState actionState = (TestRetryStepState)actionStateBase;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        action = new TestRetryStepAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.StartNode)
    {
        NodeCommandState actionState = (NodeCommandState)actionStateBase;
        actionState.StoppedNodeTable = this.stoppedNodeTable;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        action = new StartNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.StopNode)
    {
        NodeCommandState actionState = (NodeCommandState)actionStateBase;
        actionState.StoppedNodeTable = this.stoppedNodeTable;
        await this.PromoteIntentSavedToLookingUpStateAsync(actionState);

        action = new StopNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
    }
    else
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "Unknown actionType");
    }

    return action;
}

// If the newest recorded step of 'actionState' is IntentSaved, pushes LookingUpState,
// persists the state through the action store and traces the update. Otherwise does nothing.
private async Task PromoteIntentSavedToLookingUpStateAsync(ActionStateBase actionState)
{
    if (actionState.StateProgress.Peek() != StepStateNames.IntentSaved)
    {
        return;
    }

    actionState.StateProgress.Push(StepStateNames.LookingUpState);
    await this.actionStore.UpdateActionStateAsync(actionState);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
}