/// <summary>
/// Removes the unreliable transport behavior recorded in the restart-partition state, if one was recorded.
/// The behavior name is taken from the first entry of <c>Info.UnreliableTransportInfo</c> and the node name
/// from <c>Info.NodeName</c>. After a successful removal the method waits 5 seconds before returning.
/// </summary>
/// <param name="state">Action state; <c>Convert</c> throws InvalidCastException if it is not a RestartPartitionState.</param>
/// <param name="fabricClient">Client used to issue the remove-behavior call.</param>
/// <param name="requestTimeout">Timeout passed to the individual remove request.</param>
/// <param name="operationTimeout">Timeout passed to the retry helper.</param>
/// <param name="cancellationToken">Token observed by the request, the retry helper and the trailing delay.</param>
/// <returns>A task that completes when the cleanup (and the trailing delay) has finished.</returns>
public static async Task RemoveUnreliableTransportAsync(ActionStateBase state, FabricClient fabricClient, TimeSpan requestTimeout, TimeSpan operationTimeout, CancellationToken cancellationToken)
{
    RestartPartitionState restartPartitionState = Convert(state);

    if (restartPartitionState.Info.UnreliableTransportInfo == null)
    {
        // Nothing was recorded, so there is nothing to clean up.
        return;
    }

    // FirstOrDefault rather than First: an empty list must not make the cleanup path throw
    // InvalidOperationException; it is treated the same as "no behavior recorded".
    var recordedBehavior = restartPartitionState.Info.UnreliableTransportInfo.FirstOrDefault();
    if (recordedBehavior == null)
    {
        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - No unreliable transport behavior recorded, skipping cleanup", state.OperationId);
        return;
    }

    string behaviorName = recordedBehavior.Item2;
    string failoverManagerPrimaryNodeName = restartPartitionState.Info.NodeName;

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - Enter Cleanup for '{1}', fmPrimaryNodeName={2}, behavior name={3}", state.OperationId, behaviorName, failoverManagerPrimaryNodeName, behaviorName);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => fabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            requestTimeout,
            cancellationToken),
        FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
}
/// <summary>
/// Builds the step object that corresponds to <paramref name="stateName"/>.
/// </summary>
/// <param name="stateName">State whose step is requested.</param>
/// <param name="fabricClient">Client handed to the created step.</param>
/// <param name="actionState">Action state; must be convertible to RestartPartitionState.</param>
/// <param name="action">Action supplying the partition selector and restart mode.</param>
/// <param name="requestTimeout">Request timeout handed to the created step.</param>
/// <param name="operationTimeout">Operation timeout handed to the created step.</param>
/// <param name="cancellationToken">Not used by this method.</param>
/// <returns>
/// A LookingUpStep or RestartingSelectedReplicas instance; null if one of the Failfast calls returns.
/// </returns>
public static StepBase GetStep(
    StepStateNames stateName,
    FabricClient fabricClient,
    ActionStateBase actionState,
    RestartPartitionAction action,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    CancellationToken cancellationToken)
{
    // Converted before branching, so an incompatible state object throws regardless of stateName.
    RestartPartitionState typedState = Convert(actionState);

    switch (stateName)
    {
        case StepStateNames.LookingUpState:
            return new LookingUpStep(fabricClient, typedState, requestTimeout, operationTimeout, action.PartitionSelector, action.RestartPartitionMode);

        case StepStateNames.PerformingActions:
            return new RestartingSelectedReplicas(fabricClient, typedState, requestTimeout, operationTimeout, action.RestartPartitionMode);

        case StepStateNames.CompletedSuccessfully:
            // done - but then this method should not have been called
            ReleaseAssert.Failfast("GetStep() should not have been called when the state nme is CompletedSuccessfully");
            break;

        default:
            ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "Unexpected state name={0}", stateName));
            break;
    }

    // Only reached if Failfast returned control.
    return null;
}
/// <summary>
/// Returns <paramref name="actionState"/> typed as RestartPartitionState.
/// </summary>
/// <param name="actionState">State object to convert.</param>
/// <returns>The same instance, typed as RestartPartitionState.</returns>
/// <exception cref="InvalidCastException">
/// The argument is null or is not a RestartPartitionState.
/// </exception>
public static RestartPartitionState Convert(ActionStateBase actionState)
{
    RestartPartitionState converted = actionState as RestartPartitionState;
    if (converted != null)
    {
        return converted;
    }

    throw new InvalidCastException("State object could not be converted");
}
/// <summary>
/// Asks the message processor for the state of the given operation and packages it as a
/// PartitionRestartProgress (converted progress state plus the selected partition and the
/// state's ErrorCausingRollback).
/// </summary>
/// <param name="operationId">Id of the restart-partition operation to query.</param>
/// <param name="timeout">Timeout passed to the message processor.</param>
/// <param name="cancellationToken">Token passed to the message processor.</param>
/// <returns>The progress of the operation.</returns>
/// <exception cref="InvalidCastException">
/// The state returned for <paramref name="operationId"/> is not a RestartPartitionState.
/// </exception>
/// <remarks>
/// Any exception is traced as a warning, offered to
/// FaultAnalysisServiceUtility.ThrowTransientExceptionIfRetryable, and otherwise rethrown unchanged.
/// </remarks>
public async Task<PartitionRestartProgress> GetRestartPartitionProgressAsync(
    Guid operationId,
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    this.ThrowIfNotReady();

    PartitionRestartProgress progress = null;

    try
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "GetRestartPartitionProgressAsync calling message processor");
        ActionStateBase actionState = await this.MessageProcessor.ProcessGetProgressAsync(operationId, timeout, cancellationToken);

        // The peeked value is not used below; the call is kept so the method's behavior is unchanged.
        StepStateNames stateName = actionState.StateProgress.Peek();

        TestCommandProgressState state = FaultAnalysisServiceUtility.ConvertState(actionState, TraceType);

        RestartPartitionState restartPartitionState = actionState as RestartPartitionState;
        if (restartPartitionState == null)
        {
            // Previously this fell through to a NullReferenceException on the next line; fail with a
            // message that names the actual problem instead.
            throw new InvalidCastException("State object for operation " + operationId + " could not be converted to RestartPartitionState");
        }

        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "RestartPartition - serviceName={0}, partitionId={1}",
            restartPartitionState.Info.PartitionSelector.ServiceName.ToString(),
            restartPartitionState.Info.PartitionId);

        var selectedPartition = new SelectedPartition
        {
            ServiceName = restartPartitionState.Info.PartitionSelector.ServiceName,
            PartitionId = restartPartitionState.Info.PartitionId
        };

        PartitionRestartResult result = new PartitionRestartResult(selectedPartition, actionState.ErrorCausingRollback);

        progress = new PartitionRestartProgress(state, result);
        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "{0} - {1} progress - {2}, Exception - {3}",
            operationId,
            ActionType.RestartPartition,
            progress.Result != null ? progress.Result.SelectedPartition.ToString() : FASConstants.UnavailableMessage,
            (progress.Result != null && progress.Result.Exception != null) ? progress.Result.Exception.ToString() : FASConstants.UnavailableMessage);
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Caught {1}", operationId, e.ToString());
        FaultAnalysisServiceUtility.ThrowTransientExceptionIfRetryable(e);

        throw;
    }

    return progress;
}
// Use this method signature for now until the actual client interface is decided

/// <summary>
/// Validates the restart mode, creates a RestartPartitionState for the request, stores it through
/// the action store and then enqueues it.
/// </summary>
/// <param name="operationId">Id of the new operation.</param>
/// <param name="partitionSelector">Selector stored in the new state.</param>
/// <param name="restartPartitionMode">Restart mode; checked by ThrowIfRestartPartitionModeInvalid first.</param>
/// <param name="timeout">Timeout passed to the action store.</param>
/// <param name="serviceInternalFaultInfo">Fault-injection info stored in the new state.</param>
/// <returns>A task that completes once the state has been stored and enqueued.</returns>
public async Task ProcessRestartPartitionCommandAsync(Guid operationId, PartitionSelector partitionSelector, RestartPartitionMode restartPartitionMode, TimeSpan timeout, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    ThrowIfRestartPartitionModeInvalid(restartPartitionMode);

    RestartPartitionState newState = new RestartPartitionState(
        operationId,
        serviceInternalFaultInfo,
        partitionSelector,
        restartPartitionMode);

    try
    {
        // After this call finishes the intent has been persisted
        await this.actionStore.InitializeNewActionAsync(newState, timeout);

        this.Enqueue(newState);
    }
    catch (Exception ex)
    {
        // Trace and let the caller see the original exception.
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, ex);
        throw;
    }
}
/// <summary>
/// Deserializes an action state from its byte representation. The command type is read first and
/// selects which state type's FromBytes is used for the remainder of the data.
/// </summary>
/// <param name="bytes">Serialized action state.</param>
/// <returns>The deserialized state, or null when the command type is not one handled here.</returns>
private ActionStateBase ReadData(byte[] bytes)
{
    using (BinaryReader reader = new BinaryReader(new MemoryStream(bytes)))
    {
        // The first 4 bytes are the command type
        ActionType commandType = ActionStateBase.ReadCommandType(reader);

        switch (commandType)
        {
            case ActionType.InvokeDataLoss:
                return InvokeDataLossState.FromBytes(reader);

            case ActionType.InvokeQuorumLoss:
                return InvokeQuorumLossState.FromBytes(reader);

            case ActionType.RestartPartition:
                return RestartPartitionState.FromBytes(reader);

            case ActionType.TestStuck:
                return StuckState.FromBytes(reader);

            case ActionType.TestRetryStep:
                return TestRetryStepState.FromBytes(reader);

            // Start and stop share one state type; the command type tells them apart.
            case ActionType.StartNode:
            case ActionType.StopNode:
                return NodeCommandState.FromBytes(reader, commandType);

            default:
                return null;
        }
    }
}
/// <summary>
/// Looks up everything the next step needs and records it in the state: the target partition id,
/// whether the service has persisted state, the node that hosts the Ready primary replica of the
/// FM partition, and the unreliable transport behavior name. It then pushes PerformingActions
/// onto the state's progress stack.
/// </summary>
/// <param name="cancellationToken">Token passed to every fabric call.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The updated state.</returns>
/// <exception cref="FabricException">No Ready primary replica was found for the FM partition.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "Inside CollectingState, service={0}", this.partitionSelector.ServiceName);

    RestartPartitionState state = Convert(this.State);

    // Get service info and validate if the parameters are valid
    ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    bool isStateful = serviceDescription.Kind == ServiceDescriptionKind.Stateful;

    if (!isStateful && this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        string debugText = string.Format(CultureInfo.InvariantCulture, "RestartPartition: for stateless services only RestartPartitionMode.AllReplicasOrInstances is valid");
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, debugText);
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, debugText);
    }

    bool hasPersistedState = false;
    if (isStateful)
    {
        StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = targetPartition.PartitionId;

    // get replicas for target
    // NOTE(review): the result is not read afterwards; the query is kept so failures surface as before.
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // get replicas for fm in order to get the primary
    ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    string failoverManagerPrimaryNodeName = string.Empty;
    foreach (var fmReplica in fmReplicas)
    {
        // Only Ready replicas are considered.
        if (fmReplica.ReplicaStatus != ServiceReplicaStatus.Ready)
        {
            continue;
        }

        StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");

        // No early exit: if several Ready replicas report Primary, the last one wins.
        if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
        {
            failoverManagerPrimaryNodeName = fmReplica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(failoverManagerPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary is at node={1}", this.State.OperationId, failoverManagerPrimaryNodeName);

    string behaviorName = RestartingSelectedReplicas.UTBehaviorPrefixName + "_" + this.State.OperationId;
    List<Tuple<string, string>> unreliableTransportInfo = new List<Tuple<string, string>>
    {
        new Tuple<string, string>(failoverManagerPrimaryNodeName, behaviorName)
    };

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.PartitionId = partitionId;
    state.Info.NodeName = failoverManagerPrimaryNodeName;
    state.Info.HasPersistedState = hasPersistedState;
    state.Info.UnreliableTransportInfo = unreliableTransportInfo;

    return state;
}
/// <summary>
/// Creates the look-up step, keeping the partition selector and restart mode for use by the step.
/// </summary>
/// <param name="fabricClient">Client forwarded to the base step.</param>
/// <param name="state">State forwarded to the base step.</param>
/// <param name="requestTimeout">Request timeout forwarded to the base step.</param>
/// <param name="operationTimeout">Operation timeout forwarded to the base step.</param>
/// <param name="partitionSelector">Selector identifying the service/partition to look up.</param>
/// <param name="restartPartitionMode">Restart mode requested for the partition.</param>
public LookingUpStep(
    FabricClient fabricClient,
    RestartPartitionState state,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    PartitionSelector partitionSelector,
    RestartPartitionMode restartPartitionMode)
    : base(fabricClient, state, requestTimeout, operationTimeout)
{
    this.restartPartitionMode = restartPartitionMode;
    this.partitionSelector = partitionSelector;
}
/// <summary>
/// Performs the restart. It adds an unreliable transport behavior ("*", "DoReconfiguration"),
/// filtered to the target partition, on the node stored in <c>Info.NodeName</c>, and waits 5 seconds.
/// Each Ready replica of the partition is then restarted when the service has persisted state, or
/// removed when it does not; the primary is skipped in OnlyActiveSecondaries mode. Finally the
/// transport behavior is removed and CompletedSuccessfully is pushed onto the progress stack.
/// </summary>
/// <param name="cancellationToken">Token passed to every fabric call and delay.</param>
/// <param name="serviceInternalFaultInfo">Passed to ActionTest.PerformInternalServiceFaultIfRequested.</param>
/// <returns>The updated state.</returns>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    RestartPartitionState state = Convert(this.State);

    // Values recorded in the state before this step runs.
    Guid partitionId = state.Info.PartitionId;
    bool hasPersistedState = state.Info.HasPersistedState;
    string failoverManagerPrimaryNodeName = state.Info.NodeName;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            behavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applied UT on partitionId {1}, node={2}", this.State.OperationId, partitionId, failoverManagerPrimaryNodeName);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Snapshot the Ready replicas once; only these are restarted or removed.
    var stableReplicasToRestart = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

    foreach (var replica in stableReplicasToRestart)
    {
        if (this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");

            // In this mode the primary is left alone.
            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                continue;
            }
        }

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - restarting replica partition={1}, node={2}, replica id={3}", this.State.OperationId, partitionId, replica.NodeName, replica.Id);

        if (hasPersistedState)
        {
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RestartReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RestartReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
        else
        {
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    // ConfigureAwait(false) added so this await matches every other await in the method.
    await RemoveUnreliableTransportAsync(this.State, this.FabricClient, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}
/// <summary>
/// Creates the restart step, keeping the restart mode for use when the step runs.
/// </summary>
/// <param name="fabricClient">Client forwarded to the base step.</param>
/// <param name="state">State forwarded to the base step.</param>
/// <param name="requestTimeout">Request timeout forwarded to the base step.</param>
/// <param name="operationTimeout">Operation timeout forwarded to the base step.</param>
/// <param name="restartPartitionMode">Restart mode requested for the partition.</param>
public RestartingSelectedReplicas(
    FabricClient fabricClient,
    RestartPartitionState state,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    RestartPartitionMode restartPartitionMode)
    : base(fabricClient, state, requestTimeout, operationTimeout)
{
    this.restartPartitionMode = restartPartitionMode;
}
/// <summary>
/// Builds the FabricTestAction that matches <paramref name="actionType"/> from a stored action state.
/// If the state is still at IntentSaved it is first advanced to LookingUpState and saved.
/// </summary>
/// <param name="actionType">Kind of action to construct.</param>
/// <param name="actionStateBase">Stored state; must be the state type that belongs to <paramref name="actionType"/>.</param>
/// <returns>The constructed action, or null when <paramref name="actionType"/> is not handled here.</returns>
/// <exception cref="InvalidCastException">
/// <paramref name="actionStateBase"/> is not of the state type expected for <paramref name="actionType"/>.
/// </exception>
private async Task<FabricTestAction> ConstructActionAsync(ActionType actionType, ActionStateBase actionStateBase)
{
    FabricTestAction action = null;

    // Direct casts are used throughout so that a state/type mismatch fails with InvalidCastException
    // at the cast instead of a NullReferenceException on first use.
    if (actionType == ActionType.InvokeDataLoss)
    {
        InvokeDataLossState actionState = (InvokeDataLossState)actionStateBase;
        await this.AdvanceFromIntentSavedAsync(actionState);

        action = new InvokeDataLossAction(
            this.stateManager,
            this.Partition,
            actionState,
            actionState.Info.PartitionSelector,
            actionState.Info.DataLossMode,
            this.dataLossCheckWaitDurationInSeconds,
            this.dataLossCheckPollIntervalInSeconds,
            this.replicaDropWaitDurationInSeconds,
            this.requestTimeout,
            this.operationTimeout);
    }
    else if (actionType == ActionType.InvokeQuorumLoss)
    {
        InvokeQuorumLossState actionState = (InvokeQuorumLossState)actionStateBase;
        await this.AdvanceFromIntentSavedAsync(actionState);

        // This is the case for resuming an action after a failover
        action = new InvokeQuorumLossAction(this.stateManager, this.Partition, actionState, actionState.Info.PartitionSelector, actionState.Info.QuorumLossMode, actionState.Info.QuorumLossDuration, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.RestartPartition)
    {
        RestartPartitionState actionState = (RestartPartitionState)actionStateBase;
        await this.AdvanceFromIntentSavedAsync(actionState);

        // This is the case for resuming an action after a failover
        action = new RestartPartitionAction(this.stateManager, this.Partition, actionState, actionState.Info.PartitionSelector, actionState.Info.RestartPartitionMode, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.TestStuck)
    {
        StuckState actionState = (StuckState)actionStateBase;
        await this.AdvanceFromIntentSavedAsync(actionState);

        action = new StuckAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.TestRetryStep)
    {
        TestRetryStepState actionState = (TestRetryStepState)actionStateBase;
        await this.AdvanceFromIntentSavedAsync(actionState);

        action = new TestRetryStepAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.StartNode)
    {
        NodeCommandState actionState = (NodeCommandState)actionStateBase;

        // The table is attached before the state is (possibly) saved, as in the original ordering.
        actionState.StoppedNodeTable = this.stoppedNodeTable;
        await this.AdvanceFromIntentSavedAsync(actionState);

        action = new StartNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
    }
    else if (actionType == ActionType.StopNode)
    {
        NodeCommandState actionState = (NodeCommandState)actionStateBase;

        // The table is attached before the state is (possibly) saved, as in the original ordering.
        actionState.StoppedNodeTable = this.stoppedNodeTable;
        await this.AdvanceFromIntentSavedAsync(actionState);

        action = new StopNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
    }
    else
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "Unknown actionType");
    }

    return action;
}

/// <summary>
/// If the top of the state's progress stack is IntentSaved, pushes LookingUpState, saves the state
/// through the action store and traces the update. Otherwise does nothing.
/// </summary>
/// <param name="actionState">State to advance.</param>
/// <returns>A task that completes once the state has been saved, if a save was needed.</returns>
private async Task AdvanceFromIntentSavedAsync(ActionStateBase actionState)
{
    StepStateNames currentState = actionState.StateProgress.Peek();
    if (currentState != StepStateNames.IntentSaved)
    {
        return;
    }

    actionState.StateProgress.Push(StepStateNames.LookingUpState);
    await this.actionStore.UpdateActionStateAsync(actionState);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
}