private FabricTestAction GetRemoveReplicaAction(ReplicaStateTransitionAction ragAction) { Uri serviceUri = ragAction.ServiceUri; Guid guid = ragAction.PartitionId; long replicaId = ragAction.ReplicaId; string report = StringHelper.Format("Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tReplicaId: {3}", ragAction.ActionType, ragAction.ServiceUri, ragAction.PartitionId, ragAction.ReplicaId); // Select specific replica using ReplicaSelector. ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(serviceUri, guid), replicaId); RemoveReplicaAction restartReplicaAction = new RemoveReplicaAction(replicaSelector); return(restartReplicaAction); }
private void KillPrimaryReplica() { // Kill the primary Application application = _fabricClient.QueryManager.GetApplicationListAsync() .Result.Single(a => a.ApplicationTypeName == DefaultApplicationTypeName); Service service = _fabricClient.QueryManager.GetServiceListAsync(application.ApplicationName).Result.Single(); Partition partition = _fabricClient.QueryManager.GetPartitionListAsync(service.ServiceName).Result.Single(); StatefulServiceReplica primaryReplica = _fabricClient.QueryManager.GetReplicaListAsync(partition.PartitionInformation.Id) .Result.Select(replica => replica as StatefulServiceReplica) .Single(statefulServiceReplica => statefulServiceReplica.ReplicaRole == ReplicaRole.Primary); LogHelper.Log("Killing the primary replica at node {0}", primaryReplica.NodeName); ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(service.ServiceName, partition.PartitionInformation.Id), primaryReplica.Id); _fabricClient.FaultManager.RemoveReplicaAsync(replicaSelector, CompletionMode.DoNotVerify, false); }
public static ReplicaSelector GetExpectedReplicaSelector(ParitionSelectorTestHelper.PartitionCase partitionCase, ReplicaCase replicaCase) { ReplicaSelector result = null; PartitionSelector partitionSelector = ParitionSelectorTestHelper.GetExpectedPartitionSelector(partitionCase); switch (replicaCase) { case ReplicaCase.ReplicaPrimary: { result = ReplicaSelector.PrimaryOf(partitionSelector); break; } case ReplicaCase.ReplicaRandomSecondary: { result = ReplicaSelector.RandomSecondaryOf(partitionSelector); break; } case ReplicaCase.ReplicaId: { result = ReplicaSelector.ReplicaIdOf(partitionSelector, replicaInstance.Value); break; } case ReplicaCase.ReplicaId_NoValue: { result = ReplicaSelector.ReplicaIdOf(partitionSelector, 0); break; } case ReplicaCase.ReplicaRandom: { result = ReplicaSelector.RandomOf(partitionSelector); break; } } return(result); }
public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo) { InvokeQuorumLossState state = Convert(this.State); Guid partitionId = state.Info.PartitionId; List <Tuple <string, string> > unreliableTransportInfo = state.Info.UnreliableTransportInfo; List <long> targetReplicas = state.Info.ReplicaIds; var unreliableTransportTaskList = new List <Task>(); List <Tuple <string, string> > unreliableTransportInfoList = new List <Tuple <string, string> >(); foreach (Tuple <string, string> ut in unreliableTransportInfo) { string nodeName = ut.Item1; string behaviorName = ut.Item2; System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen"); behavior.AddFilterForPartitionId(partitionId); TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName); unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync( nodeName, behaviorName, behavior, this.RequestTimeout, cancellationToken), this.OperationTimeout, cancellationToken)); } await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false); // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false); List <Task> tasks = new List <Task>(); foreach (long replicaId in targetReplicas) { ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId); TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId); Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken); tasks.Add(task); } await Task.WhenAll(tasks).ConfigureAwait(false); ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true); TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration); await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false); TimeoutHelper timeoutHelper = new TimeoutHelper(this.OperationTimeout); bool conditionSatisfied = false; int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount; do { TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId); ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => this.FabricClient.QueryManager.GetPartitionListAsync( this.partitionSelector.ServiceName, null, this.RequestTimeout, cancellationToken), this.OperationTimeout, cancellationToken).ConfigureAwait(false); foreach (StatefulServicePartition partition in partitionsResult) { if (partition.PartitionInformation.Id == partitionId) { if (partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss) { conditionSatisfied = true; break; } } } await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false); }while (!conditionSatisfied && quorumLossCheckRetries-- > 0); if (!conditionSatisfied) { string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId); TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error); throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady); } await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken); state.StateProgress.Push(StepStateNames.CompletedSuccessfully); return(state); }
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken) { ThrowIf.Null(action.PartitionSelector, "PartitionSelector"); var helper = new TimeoutHelper(action.ActionTimeout); // get info about the service so we can check type and trss ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync( action.PartitionSelector.ServiceName, action.RequestTimeout, cancellationToken), helper.GetRemainingTime(), cancellationToken).ConfigureAwait(false); if (result.Kind != ServiceDescriptionKind.Stateful) { throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless")); } StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription; ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service"); if (!statefulServiceDescription.HasPersistedState) { throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only")); } // figure out /which/ partition to select var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector) { RequestTimeout = action.RequestTimeout, ActionTimeout = helper.GetRemainingTime() }; await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken); Guid partitionId = getPartitionStateAction.Result.PartitionId; // get data about replicas in that partition ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.QueryManager.GetReplicaListAsync( partitionId, 0, action.RequestTimeout, cancellationToken), helper.GetRemainingTime(), cancellationToken).ConfigureAwait(false); var removeUTRequestList = new List <Tuple <string, string> >(); Dictionary <Tuple <string, string>, Task> removeUTTaskDictionary = new Dictionary <Tuple <string, string>, Task>(); try { var stableReplicas = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray(); var stableReplicasToRemove = new List <StatefulServiceReplica>(); long replicasToRestartWithoutPrimary = action.QuorumLossMode == QuorumLossMode.AllReplicas ? stableReplicas.Length - 1 : FabricCluster.GetWriteQuorumSize(replicasResult.Count); foreach (var replica in stableReplicas) { StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica; ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful"); if (statefulReplica.ReplicaRole != ReplicaRole.Primary) { replicasToRestartWithoutPrimary--; } if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary) { stableReplicasToRemove.Add(statefulReplica); } } // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up var utTaskList = new List <Task>(); foreach (var statefulReplica in stableReplicasToRemove) { string nodeName = statefulReplica.NodeName; UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen"); behavior.AddFilterForPartitionId(partitionId); string behaviorName = "BlockStatefulServiceReopen_" + nodeName; removeUTRequestList.Add(new Tuple <string, string>(nodeName, behaviorName)); utTaskList.Add( FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync( nodeName, behaviorName, behavior, action.RequestTimeout, cancellationToken), helper.GetRemainingTime(), cancellationToken)); } await Task.WhenAll(utTaskList).ConfigureAwait(false); // TODO: Wait for some time so that the unreliable transport behavior can be read from the files. // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken); var restartReplicaTaskList = new List <Task>(); foreach (var statefulReplica in stableReplicasToRemove) { ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id); var restartReplicaAction = new RestartReplicaAction(replicaSelector) { CompletionMode = CompletionMode.DoNotVerify, RequestTimeout = action.RequestTimeout, ActionTimeout = helper.GetRemainingTime() }; restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken)); } await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false); await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false); // validate ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.QueryManager.GetPartitionListAsync( action.PartitionSelector.ServiceName, null, action.RequestTimeout, cancellationToken), FabricClientRetryErrors.GetPartitionListFabricErrors.Value, helper.GetRemainingTime(), cancellationToken).ConfigureAwait(false); foreach (StatefulServicePartition partition in partitionsResult) { if (partition.PartitionInformation.Id == partitionId) { ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss."); break; } } foreach (var removeUTParams in removeUTRequestList) { var currentParams = removeUTParams; Task task = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync( currentParams.Item1, /*nodeName*/ currentParams.Item2, /*behaviorName*/ action.RequestTimeout, cancellationToken), FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value, helper.GetRemainingTime(), cancellationToken); removeUTTaskDictionary[currentParams] = task; } await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false); // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files. // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken); } finally { var removeUTTaskList = new List <Task>(); foreach (var removeUTRequest in removeUTTaskDictionary) { var currentRemoveUTRequest = removeUTRequest; if (currentRemoveUTRequest.Value == null || currentRemoveUTRequest.Value.IsFaulted) { removeUTTaskList.Add( FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync( currentRemoveUTRequest.Key.Item1, /*nodeName*/ currentRemoveUTRequest.Key.Item2, /*behaviorName*/ action.RequestTimeout, cancellationToken), FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value, helper.GetRemainingTime(), cancellationToken)); } } Task.WhenAll(removeUTTaskList).Wait(cancellationToken); // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files. // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult(); } action.Result = new InvokeQuorumLossResult(getPartitionStateAction.Result); this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode); }
internal static ReplicaSelector GetReplicaSelector(string partitionSetName, Guid partitionId, Uri serviceName, string partitionKey, long?replicaOrInstanceId) { ReplicaSelector replicaSelector = null; PartitionSelector partitionSelector = null; if (partitionSetName.Contains("PartitionId")) { partitionSelector = PartitionSelector.PartitionIdOf(serviceName, partitionId); } else { if (partitionSetName.Contains("PartitionSingleton")) { partitionSelector = PartitionSelector.SingletonOf(serviceName); } else if (partitionSetName.Contains("PartitionNamed")) { partitionSelector = PartitionSelector.PartitionKeyOf(serviceName, partitionKey); } else if (partitionSetName.Contains("PartitionUniformedInt")) { long partitionKeyLong; if (!long.TryParse(partitionKey, out partitionKeyLong)) { throw new ArgumentException(StringResources.Error_InvalidPartitionKey); } partitionSelector = PartitionSelector.PartitionKeyOf(serviceName, partitionKeyLong); } else if (!partitionSetName.Contains("Partition")) { partitionSelector = PartitionSelector.RandomOf(serviceName); } } if (partitionSelector == null) { throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector); } if (partitionSetName.Contains("ReplicaPrimary")) { replicaSelector = ReplicaSelector.PrimaryOf(partitionSelector); } else if (partitionSetName.Contains("ReplicaRandomSecondary")) { replicaSelector = ReplicaSelector.RandomSecondaryOf(partitionSelector); } else if (partitionSetName.Contains("ReplicaId")) { replicaSelector = ReplicaSelector.ReplicaIdOf(partitionSelector, replicaOrInstanceId ?? 0); } else if (!partitionSetName.Contains("Replica")) { replicaSelector = ReplicaSelector.RandomOf(partitionSelector); } if (replicaSelector == null) { throw new ArgumentException(StringResources.Error_CouldNotParseReplicaSelector); } return(replicaSelector); }