/// <summary>
/// Selects the replicas to target for a partial loss: the primary plus half (integer division)
/// of the eligible replicas. A replica is eligible when it is Primary or ActiveSecondary and
/// <c>FaultAnalysisServiceUtility.IsReplicaUp</c> reports it as up.
/// </summary>
/// <param name="operationId">Operation id, used only in the trace output.</param>
/// <param name="replicaList">All replicas of the partition.</param>
/// <returns>
/// The primary followed by the first N remaining eligible replicas in list order, where N is half
/// the eligible count (primary included); <c>null</c> when no eligible primary exists.
/// </returns>
internal static List<StatefulServiceReplica> GetReplicasForPartialLoss(Guid operationId, List<StatefulServiceReplica> replicaList)
{
    List<StatefulServiceReplica> candidates = new List<StatefulServiceReplica>();
    foreach (StatefulServiceReplica candidate in replicaList)
    {
        if (!FaultAnalysisServiceUtility.IsPrimaryOrSecondary(candidate) || !FaultAnalysisServiceUtility.IsReplicaUp(candidate))
        {
            continue;
        }

        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} temp adding {1},{2},{3}", operationId, candidate.Id, candidate.ReplicaRole, candidate.ReplicaStatus);
        candidates.Add(candidate);
    }

    // Must be computed while the primary is still part of the candidate list.
    int secondaryCount = candidates.Count / 2;

    StatefulServiceReplica primary = candidates.FirstOrDefault(r => r.ReplicaRole == ReplicaRole.Primary);
    if (primary == null)
    {
        return null;
    }

    List<StatefulServiceReplica> selected = new List<StatefulServiceReplica>(secondaryCount + 1);

    TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} target adding primary {1},{2},{3}", operationId, primary.Id, primary.ReplicaRole, primary.ReplicaStatus);
    selected.Add(primary);
    candidates.Remove(primary);

    // secondaryCount never exceeds the number of candidates left after removing the primary.
    foreach (StatefulServiceReplica secondary in candidates.Take(secondaryCount))
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} target adding {1},{2},{3}", operationId, secondary.Id, secondary.ReplicaRole, secondary.ReplicaStatus);
        selected.Add(secondary);
    }

    return selected;
}
// Replica has no default constructor, so random instances are built by this factory method.
// An even draw from this.random yields a StatefulServiceReplica (always IdleSecondary);
// an odd draw yields a StatelessServiceInstance.
private Replica CreateReplica()
{
    bool createStateless = this.random.Next() % 2 != 0;
    if (createStateless)
    {
        return new StatelessServiceInstance(
            this.random.CreateRandom<ServiceReplicaStatus>(),
            this.random.CreateRandom<HealthState>(),
            this.random.CreateRandom<Uri>().ToString(),
            this.random.CreateRandom<string>(),
            this.random.CreateRandom<long>(),
            TimeSpan.FromSeconds(4));
    }

    return new StatefulServiceReplica(
        this.random.CreateRandom<ServiceReplicaStatus>(),
        this.random.CreateRandom<HealthState>(),
        ReplicaRole.IdleSecondary,
        this.random.CreateRandom<Uri>().ToString(),
        this.random.CreateRandom<string>(),
        this.random.CreateRandom<long>(),
        TimeSpan.FromSeconds(2));
}
/// <summary>
/// Finds the primary replica of the single partition of the single service that belongs to the
/// application of type <c>DefaultApplicationTypeName</c> and issues a replica removal for it.
/// </summary>
/// <remarks>
/// The method blocks on the query and fault calls. Failures of those calls surface as
/// <see cref="AggregateException"/>; <c>Single</c> throws <see cref="InvalidOperationException"/>
/// when the application, service, partition or primary is not unique.
/// </remarks>
private void KillPrimaryReplica()
{
    // Kill the primary
    Application application = _fabricClient.QueryManager.GetApplicationListAsync()
        .Result.Single(a => a.ApplicationTypeName == DefaultApplicationTypeName);
    Service service = _fabricClient.QueryManager.GetServiceListAsync(application.ApplicationName).Result.Single();
    Partition partition = _fabricClient.QueryManager.GetPartitionListAsync(service.ServiceName).Result.Single();

    // OfType skips anything that is not a stateful replica instead of handing a null
    // (failed 'as' cast) to the predicate, which would throw NullReferenceException.
    StatefulServiceReplica primaryReplica = _fabricClient.QueryManager.GetReplicaListAsync(partition.PartitionInformation.Id)
        .Result.OfType<StatefulServiceReplica>()
        .Single(statefulServiceReplica => statefulServiceReplica.ReplicaRole == ReplicaRole.Primary);

    LogHelper.Log("Killing the primary replica at node {0}", primaryReplica.NodeName);

    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(
        PartitionSelector.PartitionIdOf(service.ServiceName, partition.PartitionInformation.Id),
        primaryReplica.Id);

    // Wait for the request so a failed removal is observed here rather than being lost
    // in a discarded task.
    _fabricClient.FaultManager.RemoveReplicaAsync(replicaSelector, CompletionMode.DoNotVerify, false).Wait();
}
/// <summary>
/// Queries the partition-to-replica map through <c>QueryLocationsAsync</c> and returns it with
/// every replica typed as <see cref="StatefulServiceReplica"/>.
/// </summary>
/// <param name="ct">Cancellation token forwarded to the location query.</param>
/// <returns>One entry per partition, holding that partition's replicas as an array.</returns>
/// <remarks>A release assert fires for any replica that is not a stateful replica.</remarks>
public async Task<Dictionary<Partition, StatefulServiceReplica[]>> QueryPartitionAndReplicaResultAsyncStateful(CancellationToken ct)
{
    var locations = await this.QueryLocationsAsync(ct).ConfigureAwait(false);

    var replicasByPartition = new Dictionary<Partition, StatefulServiceReplica[]>();
    foreach (var entry in locations)
    {
        var typedReplicas = new List<StatefulServiceReplica>();
        foreach (Replica candidate in entry.Value)
        {
            StatefulServiceReplica typed = candidate as StatefulServiceReplica;
            ReleaseAssert.AssertIf(typed == null, "Replica {0} should be of type stateful for Partition {1}", candidate.Id, entry.Key.PartitionId());
            typedReplicas.Add(typed);
        }

        replicasByPartition.Add(entry.Key, typedReplicas.ToArray());
    }

    return replicasByPartition;
}
/// <summary>
/// Restarts the Ready replicas of the selected partition (removes them when the service has no
/// persisted state) while a "DoReconfiguration" unreliable transport behavior, filtered to that
/// partition, is applied on the node hosting the Failover Manager primary.
/// </summary>
/// <param name="testContext">Provides the fabric client and the action executor.</param>
/// <param name="action">The restart request; its <c>Result</c> is set on success.</param>
/// <param name="cancellationToken">Token forwarded to all fabric calls and delays on the normal path.</param>
/// <exception cref="InvalidOperationException">
/// <c>OnlyActiveSecondaries</c> was requested for a service that is not stateful.
/// </exception>
/// <exception cref="FabricException">No Ready Failover Manager primary replica was found (NotReady).</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartPartitionAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "partitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    // get service info so we can validate if the operation is valid
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful && action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "RestartPartitionMode.OnlyActiveSecondaries", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
    }

    bool hasPersistedState = false;
    if (result.Kind == ServiceDescriptionKind.Stateful)
    {
        StatefulServiceDescription statefulDescription = result as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    // now actually select a partition
    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = this.helper.GetRemainingTime()
    };

    // ConfigureAwait(false) added for consistency with every other await in this method.
    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // get replicas for target
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // get replicas for fm in order to get the primary
    ServiceReplicaList fmReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            Constants.FmPartitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    string fmPrimaryNodeName = string.Empty;
    var readyFMReplicas = fmReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
    foreach (var replica in readyFMReplicas)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
        if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
        {
            fmPrimaryNodeName = replica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(fmPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    ////------------------------------------------------------
    // target ut at the fm primary only
    UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);
    string behaviorName = "BlockDoReconfiguration";

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            fmPrimaryNodeName,
            behaviorName,
            behavior,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    // Set just before the normal-path removal is attempted; the finally block only removes
    // the behavior itself when an exception prevented that attempt from being reached.
    bool triedToRemovedBehavior = false;

    // inspect the actual replicas to restart, only operate on stable ones
    try
    {
        var stableReplicasToRestart = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

        foreach (var replica in stableReplicasToRestart)
        {
            // Copy of the loop variable for the lambdas below.
            var currentReplica = replica;
            if (action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
            {
                StatefulServiceReplica statefulReplica = currentReplica as StatefulServiceReplica;
                ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");
                if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                {
                    continue;
                }
            }

            if (hasPersistedState)
            {
                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.FaultManager.RestartReplicaAsync(
                        currentReplica.NodeName,
                        partitionId,
                        currentReplica.Id,
                        CompletionMode.DoNotVerify,
                        action.RequestTimeout.TotalSeconds,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);
            }
            else
            {
                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                        currentReplica.NodeName,
                        partitionId,
                        currentReplica.Id,
                        CompletionMode.DoNotVerify,
                        false, /*force remove*/
                        action.RequestTimeout.TotalSeconds,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);
            }
        }

        triedToRemovedBehavior = true;
        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                fmPrimaryNodeName,
                behaviorName,
                action.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        // The token is now honored here, matching the delay after the behavior is added.
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // TODO: Provide a way to clear all behaviors just in case.
        if (!triedToRemovedBehavior)
        {
            ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");
            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    fmPrimaryNodeName,
                    behaviorName,
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                this.helper.GetRemainingTime(),
                cancellationToken).GetAwaiter().GetResult();

            // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
            Task.Delay(TimeSpan.FromSeconds(5.0)).GetAwaiter().GetResult();
        }
    }

    // -- note there's no explicit validation

    // action result
    action.Result = new RestartPartitionResult(getPartitionStateAction.Result);
    ResultTraceString = StringHelper.Format("RestartPartitionAction succeeded for {0} with RestartPartitionMode = {1}", partitionId, action.RestartPartitionMode);
}
/// <summary>
/// Moves a secondary replica of the selected partition from one node to another.
/// </summary>
/// <param name="testContext">Provides the fabric client, the action executor and the random source.</param>
/// <param name="action">The move request; its <c>Result</c> is set on success.</param>
/// <param name="cancellationToken">Token forwarded to all fabric calls.</param>
/// <remarks>
/// Unless <c>action.IgnoreConstraints</c> is set, the current replica set is queried to validate
/// the request: the new node must not already host the primary or an active secondary, and the
/// current node must host an active secondary. When no current node is supplied, one is picked
/// at random from the nodes hosting active secondaries.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// The partition contains a non-stateful replica, or it has no active secondaries.
/// </exception>
/// <exception cref="FabricException">One of the node-name validations described above failed.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, MoveSecondaryAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    string newSecondaryNode = action.NewSecondaryNodeName;
    string currentSecondaryNode = action.CurrentSecondaryNodeName;

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = this.helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    if (!action.IgnoreConstraints)
    {
        // Query the replica set so the requested node names can be validated.
        ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        List<string> currentSecReplicaNodes = new List<string>();
        foreach (var replica in replicasResult)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            if (statefulReplica == null)
            {
                throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "MoveSecondary", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
            }

            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                if (!string.IsNullOrEmpty(newSecondaryNode) && newSecondaryNode == statefulReplica.NodeName)
                {
                    throw new FabricException(
                        StringHelper.Format(StringResources.Error_InvalidNodeNameProvided, newSecondaryNode, "MoveSecondary", "Primary exists on node"),
                        FabricErrorCode.AlreadyPrimaryReplica);
                }
            }
            else if (statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary)
            {
                currentSecReplicaNodes.Add(statefulReplica.NodeName);
                if (!string.IsNullOrEmpty(newSecondaryNode) && newSecondaryNode == statefulReplica.NodeName)
                {
                    throw new FabricException(
                        StringHelper.Format(StringResources.Error_InvalidNodeNameProvided, newSecondaryNode, "MoveSecondary", "Secondary exists on node"),
                        FabricErrorCode.AlreadySecondaryReplica);
                }
            }
        }

        if (currentSecReplicaNodes.Count == 0)
        {
            throw new InvalidOperationException(StringResources.Error_NoSecondariesInReplicaSet);
        }

        if (string.IsNullOrEmpty(currentSecondaryNode))
        {
            int num = testContext.Random.Next(currentSecReplicaNodes.Count);
            currentSecondaryNode = currentSecReplicaNodes[num];
        }

        if (!currentSecReplicaNodes.Contains(currentSecondaryNode))
        {
            // Report the node that actually failed validation (the current node, not the new one).
            throw new FabricException(
                StringHelper.Format(StringResources.Error_InvalidNodeNameProvided, currentSecondaryNode, "MoveSecondary", "Current node does not have a secondary replica"),
                FabricErrorCode.InvalidReplicaStateForReplicaOperation);
        }
    }

    ReleaseAssert.AssertIf(string.IsNullOrEmpty(currentSecondaryNode), "Current node name cannot be null or empty.");
    ReleaseAssert.AssertIf(newSecondaryNode == currentSecondaryNode, "Current and New node names are same.");

    ActionTraceSource.WriteInfo(
        TraceSource,
        "Calling move secondary with current node {0}, new node {1}, partition {2}",
        currentSecondaryNode,
        string.IsNullOrEmpty(newSecondaryNode) ? "Random" : newSecondaryNode,
        partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.MoveSecondaryUsingNodeNameAsync(
            currentSecondaryNode,
            newSecondaryNode,
            getPartitionStateAction.Result.ServiceName,
            partitionId,
            action.IgnoreConstraints,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.MoveSecondaryFabricErrors.Value,
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    action.Result = new MoveSecondaryResult(currentSecondaryNode, newSecondaryNode, getPartitionStateAction.Result);

    // The trace previously said "moving Primary", which does not match this action.
    this.ResultTraceString = StringHelper.Format(
        "MoveSecondaryAction succeeded for moving Secondary for {0} from {1} to {2}.",
        partitionId,
        currentSecondaryNode,
        newSecondaryNode);
}
/// <summary>
/// Collects the information needed to invoke data loss and records it on the command state:
/// the partition id, its current data loss number, the target replica set size, the node hosting
/// the Failover Manager primary, and the name of the unreliable transport behavior to use there.
/// </summary>
/// <param name="cancellationToken">Token forwarded to all fabric calls.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The state, advanced to <c>StepStateNames.PerformingActions</c>.</returns>
/// <exception cref="FabricInvalidForStatelessServicesException">The target service is not stateful.</exception>
/// <exception cref="FabricException">
/// The selected partition is missing from the partition list (PartitionNotFound), or no Ready
/// Failover Manager primary replica exists (NotReady).
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    ServiceDescription description = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    if (description.Kind != ServiceDescriptionKind.Stateful)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    int replicaSetSize = (description as StatefulServiceDescription).TargetReplicaSetSize;

    SelectedPartition selected = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = selected.PartitionId;

    ServicePartitionList partitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetPartitionListAsync(
            this.partitionSelector.ServiceName,
            null,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Elements are cast lazily, one at a time, until the selected partition is found.
    StatefulServicePartition targetPartition = partitions
        .Cast<StatefulServicePartition>()
        .FirstOrDefault(p => p.PartitionInformation.Id == partitionId);

    if (targetPartition == null)
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
    }

    long dataLossNumberBefore = targetPartition.PrimaryEpoch.DataLossNumber;

    ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Every Ready FM replica is inspected; the last one reporting the Primary role wins.
    string fmPrimaryNode = string.Empty;
    foreach (var fmReplica in fmReplicas.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray())
    {
        StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");
        if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
        {
            fmPrimaryNode = fmReplica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(fmPrimaryNode))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary location={1}", this.State.OperationId, fmPrimaryNode);

    string behaviorName = "BlockDoReconfiguration_" + this.State.OperationId;
    List<Tuple<string, string>> transportInfo = new List<Tuple<string, string>>
    {
        new Tuple<string, string>(fmPrimaryNode, behaviorName)
    };

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.DataLossNumber = dataLossNumberBefore;
    state.Info.NodeName = fmPrimaryNode;
    state.Info.PartitionId = partitionId;
    state.Info.UnreliableTransportInfo = transportInfo;
    state.Info.TargetReplicaSetSize = replicaSetSize;

    return state;
}
/// <summary>
/// Performs the data loss: applies the "DoReconfiguration" unreliable transport behavior on the
/// Failover Manager primary's node, removes the target replicas chosen for the data loss mode,
/// waits for them to be dropped, removes the behavior, and then polls the partition until its
/// data loss number differs from the value recorded on the state.
/// </summary>
/// <param name="cancellationToken">Token forwarded to all fabric calls and waits.</param>
/// <param name="serviceInternalFaultInfo">Passed to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>The state, advanced to <c>StepStateNames.CompletedSuccessfully</c>.</returns>
/// <exception cref="FabricTransientException">
/// No target replicas could be selected, or the data loss number did not change within the
/// polling window (NotReady).
/// </exception>
/// <exception cref="FabricException">The partition is missing from the partition list (PartitionNotFound).</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    PartitionSelector partitionSelector = state.Info.PartitionSelector;
    DataLossMode mode = state.Info.DataLossMode;
    long preDataLossNumber = state.Info.DataLossNumber;
    string fmPrimaryNode = state.Info.NodeName;
    Guid partitionId = state.Info.PartitionId;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;
    int targetReplicaSetSize = state.Info.TargetReplicaSetSize;

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);
    System.Fabric.Common.UnreliableTransportBehavior blockReconfiguration = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    blockReconfiguration.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            fmPrimaryNode,
            behaviorName,
            blockReconfiguration,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    ServiceReplicaList queriedReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    List<StatefulServiceReplica> statefulReplicas = new List<StatefulServiceReplica>();
    foreach (var queried in queriedReplicas)
    {
        StatefulServiceReplica stateful = queried as StatefulServiceReplica;
        ReleaseAssert.AssertIf(stateful == null, "Service Replica is not of stateful type even though service is stateful");
        statefulReplicas.Add(stateful);
    }

    // Select target replicas based on the DataLossMode
    List<StatefulServiceReplica> targets;
    switch (mode)
    {
        case DataLossMode.FullDataLoss:
            targets = GetReplicasForFullDataLoss(statefulReplicas);
            break;

        case DataLossMode.PartialDataLoss:
            targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, statefulReplicas);
            break;

        default:
            throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
    }

    if (targets == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    foreach (var replica in targets)
    {
        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
            this.State.OperationId,
            replica.Id,
            partitionId,
            replica.ReplicaRole,
            replica.ReplicaStatus);

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                replica.NodeName,
                partitionId,
                replica.Id,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveReplicaErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

    await RemoveUnreliableTransportAsync(this.FabricClient, fmPrimaryNode, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    // Poll until the partition's data loss number changes or the polling window expires.
    bool dataLossObserved = false;
    TimeoutHelper pollWindow = new TimeoutHelper(TimeSpan.FromSeconds(30));

    do
    {
        ServicePartitionList partitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        StatefulServicePartition current = partitions
            .Cast<StatefulServicePartition>()
            .FirstOrDefault(p => p.PartitionInformation.Id == partitionId);

        if (current == null)
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
        }

        long postDataLossNumber = current.PrimaryEpoch.DataLossNumber;

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
            this.State.OperationId,
            partitionId,
            pollWindow.GetRemainingTime(),
            preDataLossNumber,
            postDataLossNumber);

        if (postDataLossNumber != preDataLossNumber)
        {
            dataLossObserved = true;
            break;
        }

        await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
    }
    while (pollWindow.GetRemainingTime() > TimeSpan.Zero);

    if (!dataLossObserved)
    {
        // This is only viewable internally for debug. This will cause a retry of the whole flow.
        string error = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
            this.State.OperationId,
            partitionSelector.ServiceName,
            partitionId,
            this.dataLossCheckWaitDurationInSeconds);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}
/// <summary>
/// Induces data loss on the selected partition of a stateful service. Each attempt applies a
/// "DoReconfiguration" unreliable transport behavior on the Failover Manager primary's node,
/// removes the primary and a number of Ready secondaries that depends on the data loss mode,
/// removes the behavior, and then checks whether the partition's data loss number changed.
/// The whole attempt is repeated until the number changes or the action timeout elapses.
/// </summary>
/// <param name="testContext">Provides the fabric client and the action executor.</param>
/// <param name="action">The data loss request; its <c>Result</c> is set on success.</param>
/// <param name="cancellationToken">Token forwarded to all fabric calls and waits.</param>
/// <exception cref="InvalidOperationException">The target service is not stateful.</exception>
/// <exception cref="FabricException">
/// The partition is missing from the partition list (PartitionNotFound), or no Ready Failover
/// Manager primary replica exists (NotReady).
/// </exception>
/// <exception cref="TimeoutException">The action timeout elapsed before data loss was observed.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeDataLossAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    var helper = new TimeoutHelper(action.ActionTimeout);

    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "DataLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
    }

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    long preDataLossNumber = 0;

    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
            action.PartitionSelector.ServiceName,
            null,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    bool partitionFound = false;
    foreach (StatefulServicePartition partition in partitionsResult)
    {
        if (partition.PartitionInformation.Id == partitionId)
        {
            preDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
            partitionFound = true;
            break;
        }
    }

    if (!partitionFound)
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
    }

    long postDataLossNumber = preDataLossNumber;

    do
    {
        ActionTraceSource.WriteInfo(
            TraceType,
            "InvokeDataLossAction action pending time:{0}",
            helper.GetRemainingTime());

        if (helper.GetRemainingTime() <= TimeSpan.Zero)
        {
            throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "InvokeDataLoss", partitionId));
        }

        ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        ServiceReplicaList fmReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                Constants.FmPartitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        string fmPrimaryNodeName = string.Empty;
        var readyFMReplicas = fmReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
        foreach (var replica in readyFMReplicas)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                fmPrimaryNodeName = replica.NodeName;
            }
        }

        if (string.IsNullOrEmpty(fmPrimaryNodeName))
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
        }

        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "DoReconfiguration");
        behavior.AddFilterForPartitionId(partitionId);
        string behaviorName = "BlockDoReconfiguration";

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                fmPrimaryNodeName,
                behaviorName,
                behavior,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

        // Set just before the normal-path removal is attempted; the finally block only removes
        // the behavior itself when an exception prevented that attempt from being reached.
        bool triedToRemovedBehavior = false;

        try
        {
            var stableReplicasToRemove = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

            ActionTraceSource.WriteInfo(TraceType, "Total number of replicas found {0}:{1}", replicasResult.Count(), stableReplicasToRemove.Length);

            // Budget of non-primary replicas to remove: all of them for full data loss,
            // otherwise (n + 1) / 2 - 1 of the n Ready replicas. The primary is always removed.
            int replicasToRestartWithoutPrimary =
                action.DataLossMode == DataLossMode.FullDataLoss
                    ? stableReplicasToRemove.Length - 1
                    : (stableReplicasToRemove.Length + 1) / 2 - 1;

            foreach (var replica in stableReplicasToRemove)
            {
                // Copy of the loop variable for the lambda below.
                var currentReplica = replica;
                StatefulServiceReplica statefulReplica = currentReplica as StatefulServiceReplica;
                ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                ActionTraceSource.WriteInfo(
                    TraceType,
                    "Inspecting replica {0}:{1} with role {2} and status {3} to induce data loss",
                    currentReplica.Id,
                    partitionId,
                    statefulReplica.ReplicaRole,
                    statefulReplica.ReplicaStatus);

                if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                {
                    replicasToRestartWithoutPrimary--;
                }

                if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                {
                    ActionTraceSource.WriteInfo(TraceType, "Removing replica {0}:{1} to induce data loss", currentReplica.Id, partitionId);

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                            currentReplica.NodeName,
                            partitionId,
                            currentReplica.Id,
                            CompletionMode.DoNotVerify,
                            false, /*force remove*/
                            action.RequestTimeout.TotalSeconds,
                            cancellationToken),
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);
                }
            }

            triedToRemovedBehavior = true;
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    fmPrimaryNodeName,
                    behaviorName,
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                helper.GetRemainingTime(),
                cancellationToken).ConfigureAwait(false);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
            await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

            // retry check for whether data loss number has increased 5 times else do the entire process again
            const int maxRetryCount = 5;
            int retryCount = 0;
            do
            {
                partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                        action.PartitionSelector.ServiceName,
                        null,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                partitionFound = false;
                foreach (StatefulServicePartition partition in partitionsResult)
                {
                    if (partition.PartitionInformation.Id == partitionId)
                    {
                        postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                        partitionFound = true;
                        break;
                    }
                }

                if (!partitionFound)
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                }

                ActionTraceSource.WriteInfo(
                    TraceType,
                    "Checking data loss numbers for partition {0} with retryCount {1}. Current numbers {2}:{3}",
                    partitionId,
                    retryCount,
                    preDataLossNumber,
                    postDataLossNumber);

                if (postDataLossNumber != preDataLossNumber)
                {
                    break;
                }

                // ConfigureAwait(false) added for consistency with every other await in this method.
                await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                ++retryCount;
            }
            while (retryCount < maxRetryCount);
        }
        finally
        {
            if (!triedToRemovedBehavior)
            {
                ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");
                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                        fmPrimaryNodeName,
                        behaviorName,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).GetAwaiter().GetResult();

                // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
            }
        }
    }
    while (postDataLossNumber == preDataLossNumber);

    ActionTraceSource.WriteInfo(
        TraceType,
        "InvokeDataLossAction action completed postDataLossNumber:{0}, preDataLossNumber:{1}",
        postDataLossNumber,
        preDataLossNumber);

    action.Result = new InvokeDataLossResult(getPartitionStateAction.Result);
    this.ResultTraceString = StringHelper.Format("InvokeDataLossAction succeeded for {0} with DatalossMode = {1}", partitionId, action.DataLossMode);
}
/// <summary>
/// Tells whether the replica's role is Primary or ActiveSecondary.
/// </summary>
/// <param name="replica">The replica to inspect; must not be null.</param>
/// <returns><c>true</c> for Primary and ActiveSecondary, <c>false</c> for any other role.</returns>
internal static bool IsPrimaryOrSecondary(StatefulServiceReplica replica)
{
    switch (replica.ReplicaRole)
    {
        case ReplicaRole.Primary:
        case ReplicaRole.ActiveSecondary:
            return true;

        default:
            return false;
    }
}
// Restart step for a partition:
//   1. adds a "DoReconfiguration" unreliable transport behavior, filtered to the partition, on the node named in state.Info.NodeName;
//   2. restarts (persisted services) or removes (non-persisted services) every replica that is currently Ready;
//   3. removes the unreliable transport behavior again and marks the step as completed.
// When restartPartitionMode is OnlyActiveSecondaries the primary replica is left alone.
// Returns the updated RestartPartitionState.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    RestartPartitionState state = Convert(this.State);

    // Inputs for this step are read from state.Info.
    var info = state.Info;
    Guid partitionId = info.PartitionId;
    bool hasPersistedState = info.HasPersistedState;
    string failoverManagerPrimaryNodeName = info.NodeName;
    string behaviorName = info.UnreliableTransportInfo.First().Item2;

    var doReconfigurationBehavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    doReconfigurationBehavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            doReconfigurationBehavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    TimeSpan settleTime = TimeSpan.FromSeconds(5.0);
    await Task.Delay(settleTime, cancellationToken).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - applied UT on partitionId {1}, node={2}",
        this.State.OperationId,
        partitionId,
        failoverManagerPrimaryNodeName);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Snapshot the Ready replicas before any of them is touched.
    var readyReplicas = replicasResult
        .Where(candidate => candidate.ReplicaStatus == ServiceReplicaStatus.Ready)
        .ToArray();

    bool secondariesOnly = this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries;

    foreach (var replica in readyReplicas)
    {
        if (secondariesOnly)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");

            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                // Primary is excluded in this mode.
                continue;
            }
        }

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - restarting replica partition={1}, node={2}, replica id={3}",
            this.State.OperationId,
            partitionId,
            replica.NodeName,
            replica.Id);

        if (!hasPersistedState)
        {
            // Non-persisted services: the replica is removed rather than restarted.
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
        else
        {
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RestartReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RestartReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await RemoveUnreliableTransportAsync(this.State, this.FabricClient, this.RequestTimeout, this.OperationTimeout, cancellationToken);

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);
    return state;
}
// Builds a StatefulServiceReplica from fixed values and hands it to TestUsingSerializer
// together with this.Serializer.
public void ServiceReplicaSerializationTest()
{
    ServiceReplicaStatus status = ServiceReplicaStatus.Ready;
    HealthState health = HealthState.Ok;
    ReplicaRole role = ReplicaRole.ActiveSecondary;
    TimeSpan duration = TimeSpan.FromSeconds(5345);

    StatefulServiceReplica svcReplica = new StatefulServiceReplica(
        status,
        health,
        role,
        "fabric:/testsvc/testreplica",
        "nodeA",
        890,
        duration);

    TestUsingSerializer(this.Serializer, svcReplica);
}
// Moves the primary replica of the selected partition to action.NodeName.
// An empty node name is passed through to the move call unchanged and is traced as "Random".
//
// Unless action.IgnoreConstraints is set, the method first checks that:
//   - the cluster reports at least one node (otherwise InvalidOperationException);
//   - every replica of the partition is a StatefulServiceReplica (otherwise InvalidOperationException);
//   - the current primary sits on a node known to the cluster (otherwise FabricException / NotReady);
//   - the requested node is not already hosting the primary (otherwise FabricException / AlreadyPrimaryReplica).
// On success action.Result and ResultTraceString are populated.
protected override async Task ExecuteActionAsync(FabricTestContext testContext, MovePrimaryAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    string newPrimaryNodeName = action.NodeName;

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector);
    getPartitionStateAction.RequestTimeout = action.RequestTimeout;
    getPartitionStateAction.ActionTimeout = this.helper.GetRemainingTime();

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    if (!action.IgnoreConstraints)
    {
        var nodesInfo = await testContext.FabricCluster.GetLatestNodeInfoAsync(action.RequestTimeout, this.helper.GetRemainingTime(), cancellationToken);

        bool clusterHasNodes = nodesInfo != null && nodesInfo.Any();
        if (!clusterHasNodes)
        {
            throw new InvalidOperationException(StringHelper.Format(StringResources.Error_NotEnoughNodesForTestabilityAction, "MovePrimary"));
        }

        ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // Walk the replicas until the primary is found; replicas after it are not inspected.
        NodeInfo currentPrimaryNodeInfo = null;
        foreach (var replica in replicasResult)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            if (statefulReplica == null)
            {
                throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "MovePrimary", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
            }

            if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
            {
                continue;
            }

            currentPrimaryNodeInfo = nodesInfo.FirstOrDefault(n => n.NodeName == statefulReplica.NodeName);

            bool targetGiven = !string.IsNullOrEmpty(newPrimaryNodeName);
            if (targetGiven && newPrimaryNodeName == statefulReplica.NodeName)
            {
                throw new FabricException(
                    StringHelper.Format(StringResources.Error_InvalidNodeNameProvided, newPrimaryNodeName, "MovePrimary", "Primary already exists on node"),
                    FabricErrorCode.AlreadyPrimaryReplica);
            }

            break;
        }

        if (currentPrimaryNodeInfo == null)
        {
            // Either no primary was listed, or its node is missing from the node info.
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, action.PartitionSelector + ":" + partitionId), FabricErrorCode.NotReady);
        }

        string currentPrimaryNodeName = currentPrimaryNodeInfo.NodeName;
        if (newPrimaryNodeName == currentPrimaryNodeName)
        {
            throw new FabricException(
                StringHelper.Format(StringResources.Error_InvalidNodeNameProvided, newPrimaryNodeName, "MovePrimary", "Primary already exists on node"),
                FabricErrorCode.AlreadyPrimaryReplica);
        }
    }

    string tracedTarget;
    if (string.IsNullOrEmpty(newPrimaryNodeName))
    {
        tracedTarget = "Random";
    }
    else
    {
        tracedTarget = newPrimaryNodeName;
    }

    ActionTraceSource.WriteInfo(TraceSource, "Calling move primary with node {0}, partition {1}", tracedTarget, partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.MovePrimaryUsingNodeNameAsync(
            newPrimaryNodeName,
            getPartitionStateAction.Result.ServiceName,
            partitionId,
            action.IgnoreConstraints,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.MovePrimaryFabricErrors.Value,
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    action.Result = new MovePrimaryResult(newPrimaryNodeName, getPartitionStateAction.Result);

    ResultTraceString = StringHelper.Format("MovePrimaryAction succeeded for moving Primary for {0} to node {1}.", partitionId, newPrimaryNodeName);
}
// Returns true when the replica's status is InBuild, Ready or Standby; any other status yields false.
internal static bool IsReplicaUp(StatefulServiceReplica replica)
{
    ServiceReplicaStatus status = replica.ReplicaStatus;

    bool isUp = status == ServiceReplicaStatus.InBuild
        || status == ServiceReplicaStatus.Ready
        || status == ServiceReplicaStatus.Standby;

    return isUp;
}
// Puts the selected partition of a stateful, persisted service into quorum loss for
// action.QuorumLossDuration and then lets it recover.
//
// Steps:
//   1. Validate that the service is stateful and has persisted state (InvalidOperationException otherwise).
//   2. Resolve the target partition and list its replicas.
//   3. Pick the Ready replicas to take down: the primary plus either all other Ready replicas
//      (QuorumLossMode.AllReplicas) or FabricCluster.GetWriteQuorumSize(replica count) of them.
//   4. On each selected replica's node add a "StatefulServiceReopen" unreliable transport behavior
//      filtered to the partition, then restart the replicas.
//   5. Wait for the requested duration, assert the partition reports InQuorumLoss, and remove the behaviors.
//
// Every behavior registered in step 4 is tracked in removeUTRequestList; the finally block removes any
// of them that the try block did not remove successfully, including when the try block fails early.
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    var helper = new TimeoutHelper(action.ActionTimeout);

    // get info about the service so we can check its kind and whether it has persisted state
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
    }

    StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;
    ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

    if (!statefulServiceDescription.HasPersistedState)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only"));
    }

    // figure out /which/ partition to select
    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // get data about replicas in that partition
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // (nodeName, behaviorName) for every behavior this method registers; this is the cleanup list.
    var removeUTRequestList = new List<Tuple<string, string>>();

    // Removal task started by the try block for each request, once it gets that far.
    Dictionary<Tuple<string, string>, Task> removeUTTaskDictionary = new Dictionary<Tuple<string, string>, Task>();

    try
    {
        var stableReplicas = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
        var stableReplicasToRemove = new List<StatefulServiceReplica>();

        long replicasToRestartWithoutPrimary =
            action.QuorumLossMode == QuorumLossMode.AllReplicas
                ? stableReplicas.Length - 1
                : FabricCluster.GetWriteQuorumSize(replicasResult.Count);

        foreach (var replica in stableReplicas)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");

            // Each non-primary consumes one slot of the budget; the primary is always selected.
            if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
            {
                replicasToRestartWithoutPrimary--;
            }

            if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                stableReplicasToRemove.Add(statefulReplica);
            }
        }

        // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
        var utTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            string nodeName = statefulReplica.NodeName;
            UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
            behavior.AddFilterForPartitionId(partitionId);
            string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

            // Record the request before issuing the add, so a failed or partial add is still cleaned up.
            removeUTRequestList.Add(new Tuple<string, string>(nodeName, behaviorName));
            utTaskList.Add(
                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        nodeName,
                        behaviorName,
                        behavior,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken));
        }

        await Task.WhenAll(utTaskList).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

        var restartReplicaTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id);

            var restartReplicaAction = new RestartReplicaAction(replicaSelector)
            {
                CompletionMode = CompletionMode.DoNotVerify,
                RequestTimeout = action.RequestTimeout,
                ActionTimeout = helper.GetRemainingTime()
            };

            restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
        }

        await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

        await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

        // validate
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                action.PartitionSelector.ServiceName,
                null,
                action.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                break;
            }
        }

        foreach (var removeUTParams in removeUTRequestList)
        {
            var currentParams = removeUTParams;
            Task task = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    currentParams.Item1,  /*nodeName*/
                    currentParams.Item2,  /*behaviorName*/
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                helper.GetRemainingTime(),
                cancellationToken);

            removeUTTaskDictionary[currentParams] = task;
        }

        await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // Walk the request list, not the task dictionary: the dictionary is only filled when the try
        // block reaches its own removal loop, so a failure before that point would otherwise leave
        // every registered behavior in place on the nodes.
        var removeUTTaskList = new List<Task>();
        foreach (var removeUTRequest in removeUTRequestList)
        {
            var currentRemoveUTRequest = removeUTRequest;

            Task earlierAttempt;
            bool attempted = removeUTTaskDictionary.TryGetValue(currentRemoveUTRequest, out earlierAttempt);

            if (!attempted || earlierAttempt == null || earlierAttempt.IsFaulted)
            {
                removeUTTaskList.Add(
                    FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                            currentRemoveUTRequest.Item1, /*nodeName*/
                            currentRemoveUTRequest.Item2, /*behaviorName*/
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken));
            }
        }

        if (removeUTTaskList.Count > 0)
        {
            // Blocking waits are kept here, as in the original cleanup path.
            // NOTE(review): if the project compiles with C# 6 or later these can become awaits.
            // NOTE(review): cleanup shares cancellationToken with the main path, so it is skipped
            // when the token is already cancelled - confirm that is acceptable.
            Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
            // Only needed when a removal was issued above; the success path already waited inside the try block.
            Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
        }
    }

    action.Result = new InvokeQuorumLossResult(getPartitionStateAction.Result);
    this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
}
// State-collection step for a quorum loss command.
// Validates that the target service is stateful with persisted state, resolves the partition,
// chooses the replicas to act on according to quorumLossMode, and records in state.Info the
// partition id, the chosen replica ids and one (nodeName, behaviorName) pair per chosen replica.
//
// Throws:
//   FabricInvalidForStatelessServicesException            - the service is not stateful.
//   FabricOnlyValidForStatefulPersistentServicesException - the service has no persisted state.
//   FabricTransientException (NotReady)                   - GetReplicasForPartialLoss returned null.
//   the exception built by CreateException (E_INVALIDARG) - quorumLossMode is neither AllReplicas nor QuorumReplicas.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeQuorumLossState state = Convert(this.State);

    // Fetch the service description so its kind and persistence can be checked.
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    bool isStateful = result.Kind == ServiceDescriptionKind.Stateful;
    if (!isStateful)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;
    ReleaseAssert.AssertIf(
        statefulServiceDescription == null,
        string.Format(CultureInfo.InvariantCulture, "{0} - Service is not a stateful service", this.State.OperationId));

    if (!statefulServiceDescription.HasPersistedState)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricOnlyValidForStatefulPersistentServicesException("This is only valid for stateful persistent services", FabricErrorCode.OnlyValidForStatefulPersistentServices);
    }

    SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = targetPartition.PartitionId;

    // Replicas currently reported for that partition.
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Down-cast every replica; a non-stateful entry trips the assert.
    var statefulReplicas = new List<StatefulServiceReplica>();
    foreach (var candidate in replicasResult)
    {
        StatefulServiceReplica asStateful = candidate as StatefulServiceReplica;
        ReleaseAssert.AssertIf(asStateful == null, "Expected stateful replica");
        statefulReplicas.Add(asStateful);
    }

    List<StatefulServiceReplica> targetReplicas;
    switch (this.quorumLossMode)
    {
        case QuorumLossMode.AllReplicas:
            targetReplicas = statefulReplicas.FindAll(
                r => r.ReplicaRole == ReplicaRole.Primary || r.ReplicaRole == ReplicaRole.ActiveSecondary);
            break;

        case QuorumLossMode.QuorumReplicas:
            targetReplicas = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, statefulReplicas);
            break;

        default:
            throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedQuorumLossMode);
    }

    if (targetReplicas == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    var unreliableTransportInfoList = new List<Tuple<string, string>>();
    foreach (var target in targetReplicas)
    {
        string nodeName = target.NodeName;

        // NOTE(review): this behavior object is built but not used again in this method;
        // only the (nodeName, behaviorName) pair is recorded - confirm whether it can be dropped.
        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
        behavior.AddFilterForPartitionId(partitionId);

        // ApplyingUnreliableTransport.BehaviorNamePrefix + nodeName;
        string behaviorName = this.CreateBehaviorName(nodeName);

        unreliableTransportInfoList.Add(Tuple.Create(nodeName, behaviorName));
    }

    state.StateProgress.Push(StepStateNames.PerformingActions);

    state.Info.PartitionId = partitionId;
    state.Info.ReplicaIds = targetReplicas.Select(r => r.Id).ToList();
    state.Info.UnreliableTransportInfo = unreliableTransportInfoList;

    return state;
}
// State-collection step for a restart partition command.
// Validates the service kind against restartPartitionMode, resolves the target partition,
// locates the node hosting the Ready primary replica of the FM partition (FASConstants.FmPartitionId),
// and records in state.Info the partition id, that node name, whether the service has persisted
// state, and a single (nodeName, behaviorName) pair for the unreliable transport behavior.
//
// Throws:
//   the exception built by CreateException (E_INVALIDARG) - OnlyActiveSecondaries requested for a non-stateful service.
//   FabricException (NotReady)                            - no Ready primary replica was found for the FM partition.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "Inside CollectingState, service={0}", this.partitionSelector.ServiceName);

    RestartPartitionState state = Convert(this.State);

    // Get service info and validate if the parameters are valid
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    bool secondariesOnly = this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries;
    if (result.Kind != ServiceDescriptionKind.Stateful && secondariesOnly)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        string debugText = string.Format(CultureInfo.InvariantCulture, "RestartPartition: for stateless services only RestartPartitionMode.AllReplicasOrInstances is valid");
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, debugText);
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, debugText);
    }

    // Stays false for non-stateful services.
    bool hasPersistedState = false;
    if (result.Kind == ServiceDescriptionKind.Stateful)
    {
        StatefulServiceDescription statefulDescription = result as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = targetPartition.PartitionId;

    // get replicas for target
    // NOTE(review): the returned list is not read in this method; the query is kept because it is
    // a remote call whose failure would abort this step.
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // get replicas for fm in order to get the primary
    ServiceReplicaList failoverManagersReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Scan the Ready FM replicas; if several report Primary, the last one listed wins.
    string failoverManagerPrimaryNodeName = string.Empty;
    foreach (var fmReplica in failoverManagersReplicasResult)
    {
        if (fmReplica.ReplicaStatus != ServiceReplicaStatus.Ready)
        {
            continue;
        }

        StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");

        if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
        {
            failoverManagerPrimaryNodeName = fmReplica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(failoverManagerPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary is at node={1}", this.State.OperationId, failoverManagerPrimaryNodeName);

    string behaviorName = RestartingSelectedReplicas.UTBehaviorPrefixName + "_" + this.State.OperationId;

    var unreliableTransportInfo = new List<Tuple<string, string>>
    {
        Tuple.Create(failoverManagerPrimaryNodeName, behaviorName)
    };

    state.StateProgress.Push(StepStateNames.PerformingActions);

    state.Info.PartitionId = partitionId;
    state.Info.NodeName = failoverManagerPrimaryNodeName;
    state.Info.HasPersistedState = hasPersistedState;
    state.Info.UnreliableTransportInfo = unreliableTransportInfo;

    return state;
}