Esempio n. 1
0
        private FabricTestAction GetRemoveReplicaAction(ReplicaStateTransitionAction ragAction)
        {
            Uri  serviceUri = ragAction.ServiceUri;
            Guid guid       = ragAction.PartitionId;
            long replicaId  = ragAction.ReplicaId;

            string report = StringHelper.Format("Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tReplicaId: {3}", ragAction.ActionType, ragAction.ServiceUri, ragAction.PartitionId, ragAction.ReplicaId);

            // Select specific replica using ReplicaSelector.
            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(serviceUri, guid), replicaId);

            RemoveReplicaAction restartReplicaAction = new RemoveReplicaAction(replicaSelector);

            return(restartReplicaAction);
        }
Esempio n. 2
0
        private void KillPrimaryReplica()
        {
            // Kill the primary
            Application application =
                _fabricClient.QueryManager.GetApplicationListAsync()
                .Result.Single(a => a.ApplicationTypeName == DefaultApplicationTypeName);
            Service service =
                _fabricClient.QueryManager.GetServiceListAsync(application.ApplicationName).Result.Single();
            Partition partition =
                _fabricClient.QueryManager.GetPartitionListAsync(service.ServiceName).Result.Single();
            StatefulServiceReplica primaryReplica =
                _fabricClient.QueryManager.GetReplicaListAsync(partition.PartitionInformation.Id)
                .Result.Select(replica => replica as StatefulServiceReplica)
                .Single(statefulServiceReplica => statefulServiceReplica.ReplicaRole == ReplicaRole.Primary);

            LogHelper.Log("Killing the primary replica at node {0}", primaryReplica.NodeName);

            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(service.ServiceName, partition.PartitionInformation.Id), primaryReplica.Id);

            _fabricClient.FaultManager.RemoveReplicaAsync(replicaSelector, CompletionMode.DoNotVerify, false);
        }
        public static ReplicaSelector GetExpectedReplicaSelector(ParitionSelectorTestHelper.PartitionCase partitionCase, ReplicaCase replicaCase)
        {
            ReplicaSelector   result            = null;
            PartitionSelector partitionSelector = ParitionSelectorTestHelper.GetExpectedPartitionSelector(partitionCase);

            switch (replicaCase)
            {
            case ReplicaCase.ReplicaPrimary:
            {
                result = ReplicaSelector.PrimaryOf(partitionSelector);
                break;
            }

            case ReplicaCase.ReplicaRandomSecondary:
            {
                result = ReplicaSelector.RandomSecondaryOf(partitionSelector);
                break;
            }

            case ReplicaCase.ReplicaId:
            {
                result = ReplicaSelector.ReplicaIdOf(partitionSelector, replicaInstance.Value);
                break;
            }

            case ReplicaCase.ReplicaId_NoValue:
            {
                result = ReplicaSelector.ReplicaIdOf(partitionSelector, 0);
                break;
            }

            case ReplicaCase.ReplicaRandom:
            {
                result = ReplicaSelector.RandomOf(partitionSelector);
                break;
            }
            }

            return(result);
        }
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeQuorumLossState state = Convert(this.State);

                Guid partitionId = state.Info.PartitionId;
                List <Tuple <string, string> > unreliableTransportInfo = state.Info.UnreliableTransportInfo;
                List <long> targetReplicas = state.Info.ReplicaIds;

                var unreliableTransportTaskList = new List <Task>();
                List <Tuple <string, string> > unreliableTransportInfoList = new List <Tuple <string, string> >();

                foreach (Tuple <string, string> ut in unreliableTransportInfo)
                {
                    string nodeName     = ut.Item1;
                    string behaviorName = ut.Item2;

                    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
                    behavior.AddFilterForPartitionId(partitionId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

                    unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                                        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                                            nodeName,
                                                            behaviorName,
                                                            behavior,
                                                            this.RequestTimeout,
                                                            cancellationToken),
                                                        this.OperationTimeout,
                                                        cancellationToken));
                }

                await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false);

                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                List <Task> tasks = new List <Task>();

                foreach (long replicaId in targetReplicas)
                {
                    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);
                    Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken);
                    tasks.Add(task);
                }

                await Task.WhenAll(tasks).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration);
                await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                TimeoutHelper timeoutHelper = new TimeoutHelper(this.OperationTimeout);

                bool conditionSatisfied = false;

                int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount;

                do
                {
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            if (partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss)
                            {
                                conditionSatisfied = true;
                                break;
                            }
                        }
                    }

                    await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                }while (!conditionSatisfied && quorumLossCheckRetries-- > 0);

                if (!conditionSatisfied)
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

                var helper = new TimeoutHelper(action.ActionTimeout);

                // get info about the service so we can check type and trss
                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        action.PartitionSelector.ServiceName,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
                }

                StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;

                ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

                if (!statefulServiceDescription.HasPersistedState)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only"));
                }

                // figure out /which/ partition to select
                var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
                {
                    RequestTimeout = action.RequestTimeout,
                    ActionTimeout  = helper.GetRemainingTime()
                };

                await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken);

                Guid partitionId = getPartitionStateAction.Result.PartitionId;

                // get data about replicas in that partition
                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                var removeUTRequestList = new List <Tuple <string, string> >();
                Dictionary <Tuple <string, string>, Task> removeUTTaskDictionary = new Dictionary <Tuple <string, string>, Task>();

                try
                {
                    var  stableReplicas                  = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
                    var  stableReplicasToRemove          = new List <StatefulServiceReplica>();
                    long replicasToRestartWithoutPrimary =
                        action.QuorumLossMode == QuorumLossMode.AllReplicas
                            ? stableReplicas.Length - 1
                            : FabricCluster.GetWriteQuorumSize(replicasResult.Count);
                    foreach (var replica in stableReplicas)
                    {
                        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                        if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                        {
                            replicasToRestartWithoutPrimary--;
                        }

                        if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                        {
                            stableReplicasToRemove.Add(statefulReplica);
                        }
                    }

                    // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
                    var utTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        string nodeName = statefulReplica.NodeName;
                        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
                        behavior.AddFilterForPartitionId(partitionId);
                        string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

                        removeUTRequestList.Add(new Tuple <string, string>(nodeName, behaviorName));
                        utTaskList.Add(
                            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () =>
                                testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                    nodeName,
                                    behaviorName,
                                    behavior,
                                    action.RequestTimeout,
                                    cancellationToken),
                                helper.GetRemainingTime(),
                                cancellationToken));
                    }

                    await Task.WhenAll(utTaskList).ConfigureAwait(false);

                    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);

                    var restartReplicaTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id);

                        var restartReplicaAction = new RestartReplicaAction(replicaSelector)
                        {
                            CompletionMode = CompletionMode.DoNotVerify,
                            RequestTimeout = action.RequestTimeout,
                            ActionTimeout  = helper.GetRemainingTime()
                        };

                        restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
                    }

                    await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

                    await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                    // validate
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                            action.PartitionSelector.ServiceName,
                            null,
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                            break;
                        }
                    }

                    foreach (var removeUTParams in removeUTRequestList)
                    {
                        var  currentParams = removeUTParams;
                        Task task          = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                currentParams.Item1,  /*nodeName*/
                                currentParams.Item2,  /*behaviorName*/
                                action.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                            helper.GetRemainingTime(),
                            cancellationToken);

                        removeUTTaskDictionary[currentParams] = task;
                    }

                    await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

                    // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);
                }
                finally
                {
                    var removeUTTaskList = new List <Task>();

                    foreach (var removeUTRequest in removeUTTaskDictionary)
                    {
                        var currentRemoveUTRequest = removeUTRequest;
                        if (currentRemoveUTRequest.Value == null || currentRemoveUTRequest.Value.IsFaulted)
                        {
                            removeUTTaskList.Add(
                                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                    () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                        currentRemoveUTRequest.Key.Item1, /*nodeName*/
                                        currentRemoveUTRequest.Key.Item2, /*behaviorName*/
                                        action.RequestTimeout,
                                        cancellationToken),
                                    FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                                    helper.GetRemainingTime(),
                                    cancellationToken));
                        }
                    }

                    Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

                    // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                    Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
                }

                action.Result          = new InvokeQuorumLossResult(getPartitionStateAction.Result);
                this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
            }
        internal static ReplicaSelector GetReplicaSelector(string partitionSetName, Guid partitionId, Uri serviceName, string partitionKey, long?replicaOrInstanceId)
        {
            ReplicaSelector   replicaSelector   = null;
            PartitionSelector partitionSelector = null;

            if (partitionSetName.Contains("PartitionId"))
            {
                partitionSelector = PartitionSelector.PartitionIdOf(serviceName, partitionId);
            }
            else
            {
                if (partitionSetName.Contains("PartitionSingleton"))
                {
                    partitionSelector = PartitionSelector.SingletonOf(serviceName);
                }
                else if (partitionSetName.Contains("PartitionNamed"))
                {
                    partitionSelector = PartitionSelector.PartitionKeyOf(serviceName, partitionKey);
                }
                else if (partitionSetName.Contains("PartitionUniformedInt"))
                {
                    long partitionKeyLong;
                    if (!long.TryParse(partitionKey, out partitionKeyLong))
                    {
                        throw new ArgumentException(StringResources.Error_InvalidPartitionKey);
                    }

                    partitionSelector = PartitionSelector.PartitionKeyOf(serviceName, partitionKeyLong);
                }
                else if (!partitionSetName.Contains("Partition"))
                {
                    partitionSelector = PartitionSelector.RandomOf(serviceName);
                }
            }

            if (partitionSelector == null)
            {
                throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector);
            }

            if (partitionSetName.Contains("ReplicaPrimary"))
            {
                replicaSelector = ReplicaSelector.PrimaryOf(partitionSelector);
            }
            else if (partitionSetName.Contains("ReplicaRandomSecondary"))
            {
                replicaSelector = ReplicaSelector.RandomSecondaryOf(partitionSelector);
            }
            else if (partitionSetName.Contains("ReplicaId"))
            {
                replicaSelector = ReplicaSelector.ReplicaIdOf(partitionSelector, replicaOrInstanceId ?? 0);
            }
            else if (!partitionSetName.Contains("Replica"))
            {
                replicaSelector = ReplicaSelector.RandomOf(partitionSelector);
            }

            if (replicaSelector == null)
            {
                throw new ArgumentException(StringResources.Error_CouldNotParseReplicaSelector);
            }

            return(replicaSelector);
        }