/// <summary>
/// Creates the step object that corresponds to <paramref name="stateName"/> for an invoke-data-loss command.
/// </summary>
/// <param name="stateName">State whose step is requested. Only LookingUpState and PerformingActions produce a step.</param>
/// <param name="fabricClient">Client handed to the created step.</param>
/// <param name="actionState">Command state; it is converted with Convert before any step is created.</param>
/// <param name="action">Supplies the partition selector and the wait/poll durations passed to the steps.</param>
/// <param name="requestTimeout">Request timeout handed to the created step.</param>
/// <param name="operationTimeout">Operation timeout handed to the created step.</param>
/// <param name="cancellationToken">Not used by this method.</param>
/// <returns>The created step. Null is only returned if ReleaseAssert.Failfast returns to the caller.</returns>
public static StepBase GetStep(
    StepStateNames stateName,
    FabricClient fabricClient,
    ActionStateBase actionState,
    InvokeDataLossAction action,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    CancellationToken cancellationToken)
{
    InvokeDataLossState invokeDataLossState = Convert(actionState);

    switch (stateName)
    {
        case StepStateNames.LookingUpState:
            return new DataLossStepsFactory.LookingUpState(
                fabricClient,
                invokeDataLossState,
                requestTimeout,
                operationTimeout,
                action.PartitionSelector);

        case StepStateNames.PerformingActions:
            return new DataLossStepsFactory.PerformingActions(
                fabricClient,
                invokeDataLossState,
                requestTimeout,
                operationTimeout,
                action.PartitionSelector,
                action.DataLossCheckWaitDurationInSeconds,
                action.DataLossCheckPollIntervalInSeconds,
                action.ReplicaDropWaitDurationInSeconds);

        case StepStateNames.CompletedSuccessfully:
            // done - but then this method should not have been called.
            // The operation id must be an argument of string.Format: the format string contains {0},
            // so formatting it with no arguments throws FormatException before Failfast is reached.
            ReleaseAssert.Failfast(
                string.Format(
                    CultureInfo.InvariantCulture,
                    "{0} - GetStep() should not have been called when the state name is CompletedSuccessfully",
                    actionState.OperationId));
            return null;

        default:
            ReleaseAssert.Failfast(
                string.Format(
                    CultureInfo.InvariantCulture,
                    "{0} - Unexpected state name={1}",
                    actionState.OperationId,
                    stateName));
            return null;
    }
}
/// <summary>
/// Initializes the step: forwards the client, state and timeouts to the base class and stores
/// the partition selector and the three duration settings in fields.
/// </summary>
/// <param name="fabricClient">Forwarded to the base constructor.</param>
/// <param name="state">Forwarded to the base constructor.</param>
/// <param name="requestTimeout">Forwarded to the base constructor.</param>
/// <param name="operationTimeout">Forwarded to the base constructor.</param>
/// <param name="partitionSelector">Stored in the partitionSelector field.</param>
/// <param name="dataLossCheckWaitDurationInSeconds">Stored in the field of the same name.</param>
/// <param name="dataLossCheckPollIntervalInSeconds">Stored in the field of the same name.</param>
/// <param name="replicaDropWaitDurationInSeconds">Stored in the field of the same name.</param>
public PerformingActions(
    FabricClient fabricClient,
    InvokeDataLossState state,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    PartitionSelector partitionSelector,
    int dataLossCheckWaitDurationInSeconds,
    int dataLossCheckPollIntervalInSeconds,
    int replicaDropWaitDurationInSeconds)
    : base(fabricClient, state, requestTimeout, operationTimeout)
{
    // Timing settings.
    this.replicaDropWaitDurationInSeconds = replicaDropWaitDurationInSeconds;
    this.dataLossCheckPollIntervalInSeconds = dataLossCheckPollIntervalInSeconds;
    this.dataLossCheckWaitDurationInSeconds = dataLossCheckWaitDurationInSeconds;

    // Target of the command.
    this.partitionSelector = partitionSelector;
}
/// <summary>
/// Removes the unreliable transport behavior recorded in the command state: the behavior name is
/// taken from the first entry of state.Info.UnreliableTransportInfo and the node from state.Info.NodeName.
/// </summary>
/// <param name="cancellationToken">Forwarded to RemoveUnreliableTransportAsync.</param>
/// <returns>The task returned by RemoveUnreliableTransportAsync.</returns>
public override Task CleanupAsync(CancellationToken cancellationToken)
{
    // debug - remove later
    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - Enter Cleanup for PerformingAction",
        this.State.OperationId);

    InvokeDataLossState dataLossState = Convert(this.State);

    // Item2 of the recorded tuple is the behavior name.
    var recordedBehavior = dataLossState.Info.UnreliableTransportInfo.First();
    string nameOfBehaviorToRemove = recordedBehavior.Item2;

    return RemoveUnreliableTransportAsync(
        this.FabricClient,
        dataLossState.Info.NodeName,
        nameOfBehaviorToRemove,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken);
}
/// <summary>
/// Returns <paramref name="actionState"/> typed as <c>InvokeDataLossState</c>.
/// </summary>
/// <param name="actionState">State object to convert.</param>
/// <returns>The same object, typed as InvokeDataLossState.</returns>
/// <exception cref="InvalidCastException">
/// Thrown when <paramref name="actionState"/> is null or is not an InvokeDataLossState.
/// </exception>
public static InvokeDataLossState Convert(ActionStateBase actionState)
{
    InvokeDataLossState converted = actionState as InvokeDataLossState;
    if (converted != null)
    {
        return converted;
    }

    throw new InvalidCastException("State object could not be converted");
}
/// <summary>
/// Looks up the state of an invoke-data-loss command and turns it into a progress object.
/// </summary>
/// <param name="operationId">Id of the command whose progress is requested.</param>
/// <param name="timeout">Forwarded to MessageProcessor.ProcessGetProgressAsync.</param>
/// <param name="cancellationToken">Forwarded to MessageProcessor.ProcessGetProgressAsync.</param>
/// <returns>
/// Progress built from the converted command state, the selected partition (service name and partition id
/// from the state's Info) and the state's ErrorCausingRollback.
/// </returns>
/// <exception cref="InvalidCastException">The stored state is not an InvokeDataLossState.</exception>
/// <remarks>
/// Any exception is traced as a warning, passed to
/// FaultAnalysisServiceUtility.ThrowTransientExceptionIfRetryable, and rethrown if that call returns.
/// </remarks>
public async Task<PartitionDataLossProgress> GetInvokeDataLossProgressAsync(
    Guid operationId,
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    this.ThrowIfNotReady();

    PartitionDataLossProgress progress = null;

    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Inside GetInvokeDataLossProgressAsync, operationId = {0}", operationId);

    try
    {
        ActionStateBase actionState = await this.MessageProcessor.ProcessGetProgressAsync(operationId, timeout, cancellationToken);

        TestCommandProgressState state = FaultAnalysisServiceUtility.ConvertState(actionState, TraceType);

        InvokeDataLossState invokeDataLossState = actionState as InvokeDataLossState;
        if (invokeDataLossState == null)
        {
            throw new InvalidCastException("State object could not be converted");
        }

        var selectedPartition = new SelectedPartition
        {
            ServiceName = invokeDataLossState.Info.PartitionSelector.ServiceName,
            PartitionId = invokeDataLossState.Info.PartitionId
        };

        PartitionDataLossResult result = new PartitionDataLossResult(selectedPartition, actionState.ErrorCausingRollback);

        progress = new PartitionDataLossProgress(state, result);

        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "{0} - {1} progress - {2}, Exception - {3}",
            operationId,
            ActionType.InvokeDataLoss,
            progress.Result != null ? progress.Result.SelectedPartition.ToString() : FASConstants.UnavailableMessage,
            (progress.Result != null && progress.Result.Exception != null) ? progress.Result.Exception.ToString() : FASConstants.UnavailableMessage);
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Caught {1}", operationId, e.ToString());
        FaultAnalysisServiceUtility.ThrowTransientExceptionIfRetryable(e);

        throw;
    }

    return progress;
}
// Use this method signature for now until the actual client interface is decided

/// <summary>
/// Validates the data loss mode, creates a new InvokeDataLossState for the command,
/// initializes it in the action store and then enqueues it.
/// </summary>
/// <param name="operationId">Id given to the new command state.</param>
/// <param name="partitionSelector">Partition selector given to the new command state.</param>
/// <param name="dataLossMode">Checked with ThrowIfDataLossModeInvalid, then given to the new command state.</param>
/// <param name="timeout">Forwarded to actionStore.InitializeNewActionAsync.</param>
/// <param name="serviceInternalFaultInfo">Given to the new command state.</param>
/// <remarks>Exceptions from initializing or enqueueing are traced as a warning and rethrown.</remarks>
public async Task ProcessDataLossCommandAsync(
    Guid operationId,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode,
    TimeSpan timeout,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    ThrowIfDataLossModeInvalid(dataLossMode);

    ActionStateBase newCommandState = new InvokeDataLossState(
        operationId,
        serviceInternalFaultInfo,
        partitionSelector,
        dataLossMode);

    try
    {
        // After this call finishes the intent has been persisted
        await this.actionStore.InitializeNewActionAsync(newCommandState, timeout);

        this.Enqueue(newCommandState);
    }
    catch (Exception failure)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, failure);
        throw;
    }
}
/// <summary>
/// Reads a command state from <paramref name="bytes"/>: the command type is read first and selects
/// which state class's FromBytes reads the rest.
/// </summary>
/// <param name="bytes">Serialized command state.</param>
/// <returns>The state that was read, or null when the command type is not one handled here.</returns>
private ActionStateBase ReadData(byte[] bytes)
{
    using (BinaryReader reader = new BinaryReader(new MemoryStream(bytes)))
    {
        // The first 4 bytes are the command type
        ActionType commandType = ActionStateBase.ReadCommandType(reader);

        switch (commandType)
        {
            case ActionType.InvokeDataLoss:
                return InvokeDataLossState.FromBytes(reader);

            case ActionType.InvokeQuorumLoss:
                return InvokeQuorumLossState.FromBytes(reader);

            case ActionType.RestartPartition:
                return RestartPartitionState.FromBytes(reader);

            case ActionType.TestStuck:
                return StuckState.FromBytes(reader);

            case ActionType.TestRetryStep:
                return TestRetryStepState.FromBytes(reader);

            // Both node commands share one state class, which is told the command type.
            case ActionType.StartNode:
            case ActionType.StopNode:
                return NodeCommandState.FromBytes(reader, commandType);

            default:
                return null;
        }
    }
}
/// <summary>
/// Lookup step of the invoke-data-loss command. Verifies the service is stateful, resolves the target
/// partition, reads the partition's current data loss number, finds the node of the ready primary replica
/// of the partition identified by FASConstants.FmPartitionId, and records all of it in the command state.
/// </summary>
/// <param name="cancellationToken">Forwarded to every fabric call made by this step.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The command state with PerformingActions pushed onto its StateProgress and its Info filled in.</returns>
/// <exception cref="FabricInvalidForStatelessServicesException">The service description is not stateful.</exception>
/// <exception cref="FabricException">
/// The selected partition is not in the partition list, or no ready primary replica was found
/// for the FailoverManager partition.
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    // 1. The command is only valid for stateful services.
    ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    if (serviceDescription.Kind != ServiceDescriptionKind.Stateful)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    int targetReplicaSetSize = (serviceDescription as StatefulServiceDescription).TargetReplicaSetSize;

    // 2. Resolve the partition the selector points at.
    SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = selectedPartition.PartitionId;

    // 3. Read the partition's data loss number as it is before the command acts.
    ServicePartitionList servicePartitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetPartitionListAsync(
            this.partitionSelector.ServiceName,
            null,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    long preDataLossNumber = 0;
    bool partitionFound = false;
    foreach (StatefulServicePartition candidate in servicePartitions)
    {
        if (candidate.PartitionInformation.Id != partitionId)
        {
            continue;
        }

        preDataLossNumber = candidate.PrimaryEpoch.DataLossNumber;
        partitionFound = true;
        break;
    }

    if (!partitionFound)
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
    }

    // 4. Find the node hosting the ready primary replica of the FM partition.
    ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    var readyFmReplicas = fmReplicas.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

    string failoverManagerPrimaryNodeName = string.Empty;
    foreach (var readyReplica in readyFmReplicas)
    {
        StatefulServiceReplica statefulFmReplica = readyReplica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");

        if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
        {
            failoverManagerPrimaryNodeName = readyReplica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(failoverManagerPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary location={1}", this.State.OperationId, failoverManagerPrimaryNodeName);

    // 5. Record (node name, behavior name) so later code can read it back from the state.
    string behaviorName = "BlockDoReconfiguration_" + this.State.OperationId;
    List<Tuple<string, string>> unreliableTransportInfo = new List<Tuple<string, string>>
    {
        new Tuple<string, string>(failoverManagerPrimaryNodeName, behaviorName)
    };

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.DataLossNumber = preDataLossNumber;
    state.Info.NodeName = failoverManagerPrimaryNodeName;
    state.Info.PartitionId = partitionId;
    state.Info.UnreliableTransportInfo = unreliableTransportInfo;
    state.Info.TargetReplicaSetSize = targetReplicaSetSize;

    return state;
}
/// <summary>
/// Initializes the step: forwards the client, state and timeouts to the base class and stores
/// the partition selector in a field.
/// </summary>
/// <param name="fabricClient">Forwarded to the base constructor.</param>
/// <param name="state">Forwarded to the base constructor.</param>
/// <param name="requestTimeout">Forwarded to the base constructor.</param>
/// <param name="operationTimeout">Forwarded to the base constructor.</param>
/// <param name="partitionSelector">Stored in the partitionSelector field.</param>
public LookingUpState(
    FabricClient fabricClient,
    InvokeDataLossState state,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    PartitionSelector partitionSelector)
    : base(fabricClient, state, requestTimeout, operationTimeout)
{
    this.partitionSelector = partitionSelector;
}
/// <summary>
/// Action step of the invoke-data-loss command. Using the values recorded in the command state, it
/// adds an unreliable transport behavior for "DoReconfiguration" (filtered to the partition) on the
/// recorded node, removes the replicas selected for the data loss mode, waits for them to be dropped,
/// removes the transport behavior, and then polls the partition's data loss number until it differs
/// from the recorded one.
/// </summary>
/// <param name="cancellationToken">Forwarded to every fabric call and delay made by this step.</param>
/// <param name="serviceInternalFaultInfo">Passed to ActionTest.PerformInternalServiceFaultIfRequested.</param>
/// <returns>The command state with CompletedSuccessfully pushed onto its StateProgress.</returns>
/// <exception cref="FabricTransientException">
/// No target replicas could be selected, or the data loss number did not change within the wait duration.
/// </exception>
/// <exception cref="FabricException">The partition is no longer in the service's partition list.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    PartitionSelector partitionSelector = state.Info.PartitionSelector;
    DataLossMode dataLossMode = state.Info.DataLossMode;
    long preDataLossNumber = state.Info.DataLossNumber;
    string failoverManagerPrimaryNodeName = state.Info.NodeName;
    Guid partitionId = state.Info.PartitionId;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);

    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            behavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    List<StatefulServiceReplica> replicaList = new List<StatefulServiceReplica>();
    foreach (var replica in replicasResult)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
        replicaList.Add(statefulReplica);
    }

    // Select target replicas based on the DataLossMode
    List<StatefulServiceReplica> targets = null;
    if (dataLossMode == DataLossMode.FullDataLoss)
    {
        targets = GetReplicasForFullDataLoss(replicaList);
    }
    else if (dataLossMode == DataLossMode.PartialDataLoss)
    {
        targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, replicaList);
    }
    else
    {
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
    }

    if (targets == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    foreach (var replica in targets)
    {
        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
            this.State.OperationId,
            replica.Id,
            partitionId,
            replica.ReplicaRole,
            replica.ReplicaStatus);

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                replica.NodeName,
                partitionId,
                replica.Id,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveReplicaErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

    await RemoveUnreliableTransportAsync(this.FabricClient, failoverManagerPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    bool dataLossWasSuccessful = false;

    // Bound the polling loop by the configured wait duration. It was previously a fixed 30 seconds,
    // while the failure message below reports dataLossCheckWaitDurationInSeconds as the time waited.
    // NOTE(review): confirm the configured duration is the intended budget for this loop.
    TimeoutHelper timeoutHelper = new TimeoutHelper(TimeSpan.FromSeconds(this.dataLossCheckWaitDurationInSeconds));

    do
    {
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        bool partitionFound = false;
        long postDataLossNumber = 0;
        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                partitionFound = true;
                break;
            }
        }

        if (!partitionFound)
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
        }

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
            this.State.OperationId,
            partitionId,
            timeoutHelper.GetRemainingTime(),
            preDataLossNumber,
            postDataLossNumber);

        // A changed data loss number is this step's success criterion.
        if (postDataLossNumber != preDataLossNumber)
        {
            dataLossWasSuccessful = true;
            break;
        }

        await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
    }
    while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

    if (!dataLossWasSuccessful)
    {
        // This is only viewable internally for debug. This will cause a retry of the whole flow.
        string error = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
            this.State.OperationId,
            partitionSelector.ServiceName,
            partitionId,
            this.dataLossCheckWaitDurationInSeconds);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}
/// <summary>
/// Builds the action object for a command state. For every known action type the state is first moved
/// from IntentSaved to LookingUpState (and persisted) if it is still at IntentSaved, then the matching
/// action is constructed from the state.
/// </summary>
/// <param name="actionType">Selects which action class is constructed.</param>
/// <param name="actionStateBase">Command state; expected to be the state class that matches <paramref name="actionType"/>.</param>
/// <returns>The constructed action, or null when <paramref name="actionType"/> is not one handled here.</returns>
private async Task<FabricTestAction> ConstructActionAsync(ActionType actionType, ActionStateBase actionStateBase)
{
    // The "as" casts are kept deliberately: a state of the wrong type yields null and fails on first use.
    switch (actionType)
    {
        case ActionType.InvokeDataLoss:
        {
            InvokeDataLossState dataLossState = actionStateBase as InvokeDataLossState;
            await this.AdvanceFromIntentSavedAsync(dataLossState);

            return new InvokeDataLossAction(
                this.stateManager,
                this.Partition,
                dataLossState,
                dataLossState.Info.PartitionSelector,
                dataLossState.Info.DataLossMode,
                this.dataLossCheckWaitDurationInSeconds,
                this.dataLossCheckPollIntervalInSeconds,
                this.replicaDropWaitDurationInSeconds,
                this.requestTimeout,
                this.operationTimeout);
        }

        case ActionType.InvokeQuorumLoss:
        {
            InvokeQuorumLossState quorumLossState = actionStateBase as InvokeQuorumLossState;
            await this.AdvanceFromIntentSavedAsync(quorumLossState);

            // This is the case for resuming an action after a failover
            return new InvokeQuorumLossAction(
                this.stateManager,
                this.Partition,
                quorumLossState,
                quorumLossState.Info.PartitionSelector,
                quorumLossState.Info.QuorumLossMode,
                quorumLossState.Info.QuorumLossDuration,
                this.requestTimeout,
                this.operationTimeout);
        }

        case ActionType.RestartPartition:
        {
            RestartPartitionState restartPartitionState = actionStateBase as RestartPartitionState;
            await this.AdvanceFromIntentSavedAsync(restartPartitionState);

            // This is the case for resuming an action after a failover
            return new RestartPartitionAction(
                this.stateManager,
                this.Partition,
                restartPartitionState,
                restartPartitionState.Info.PartitionSelector,
                restartPartitionState.Info.RestartPartitionMode,
                this.requestTimeout,
                this.operationTimeout);
        }

        case ActionType.TestStuck:
        {
            StuckState stuckState = actionStateBase as StuckState;
            await this.AdvanceFromIntentSavedAsync(stuckState);

            return new StuckAction(this.stateManager, this.Partition, stuckState, this.requestTimeout, this.operationTimeout);
        }

        case ActionType.TestRetryStep:
        {
            TestRetryStepState retryStepState = actionStateBase as TestRetryStepState;
            await this.AdvanceFromIntentSavedAsync(retryStepState);

            return new TestRetryStepAction(this.stateManager, this.Partition, retryStepState, this.requestTimeout, this.operationTimeout);
        }

        case ActionType.StartNode:
        {
            NodeCommandState startNodeState = actionStateBase as NodeCommandState;
            startNodeState.StoppedNodeTable = this.stoppedNodeTable;
            await this.AdvanceFromIntentSavedAsync(startNodeState);

            return new StartNodeFromFASAction(this.stateManager, this.Partition, startNodeState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
        }

        case ActionType.StopNode:
        {
            NodeCommandState stopNodeState = actionStateBase as NodeCommandState;
            stopNodeState.StoppedNodeTable = this.stoppedNodeTable;
            await this.AdvanceFromIntentSavedAsync(stopNodeState);

            return new StopNodeFromFASAction(this.stateManager, this.Partition, stopNodeState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
        }

        default:
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Unknown actionType");
            return null;
    }
}

/// <summary>
/// If the top of the state's StateProgress is IntentSaved, pushes LookingUpState, persists the state
/// through the action store and traces the update; otherwise does nothing.
/// </summary>
/// <param name="actionState">Command state to advance.</param>
private async Task AdvanceFromIntentSavedAsync(ActionStateBase actionState)
{
    StepStateNames currentState = actionState.StateProgress.Peek();
    if (currentState != StepStateNames.IntentSaved)
    {
        return;
    }

    actionState.StateProgress.Push(StepStateNames.LookingUpState);
    await this.actionStore.UpdateActionStateAsync(actionState);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
}