internal System.Fabric.Common.UnreliableTransportBehavior Convert()
        {
            var utb = new System.Fabric.Common.UnreliableTransportBehavior(
                this.Destination,
                this.ActionName,
                this.ProbabilityToApply,
                this.Delay,
                this.DelaySpan,
                this.Priority,
                this.ApplyCount);

            utb.Filters = this.Filters;

            return(utb);
        }
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeQuorumLossState state = Convert(this.State);

                Guid partitionId = state.Info.PartitionId;
                List <Tuple <string, string> > unreliableTransportInfo = state.Info.UnreliableTransportInfo;
                List <long> targetReplicas = state.Info.ReplicaIds;

                var unreliableTransportTaskList = new List <Task>();
                List <Tuple <string, string> > unreliableTransportInfoList = new List <Tuple <string, string> >();

                foreach (Tuple <string, string> ut in unreliableTransportInfo)
                {
                    string nodeName     = ut.Item1;
                    string behaviorName = ut.Item2;

                    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
                    behavior.AddFilterForPartitionId(partitionId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

                    unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                                        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                                            nodeName,
                                                            behaviorName,
                                                            behavior,
                                                            this.RequestTimeout,
                                                            cancellationToken),
                                                        this.OperationTimeout,
                                                        cancellationToken));
                }

                await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false);

                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                List <Task> tasks = new List <Task>();

                foreach (long replicaId in targetReplicas)
                {
                    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);
                    Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken);
                    tasks.Add(task);
                }

                await Task.WhenAll(tasks).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration);
                await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                TimeoutHelper timeoutHelper = new TimeoutHelper(this.OperationTimeout);

                bool conditionSatisfied = false;

                int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount;

                do
                {
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            if (partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss)
                            {
                                conditionSatisfied = true;
                                break;
                            }
                        }
                    }

                    await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                }while (!conditionSatisfied && quorumLossCheckRetries-- > 0);

                if (!conditionSatisfied)
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }
Exemple #3
0
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeDataLossState state = Convert(this.State);

                PartitionSelector partitionSelector   = state.Info.PartitionSelector;
                DataLossMode      dataLossMode        = state.Info.DataLossMode;
                long   preDataLossNumber              = state.Info.DataLossNumber;
                string failoverManagerPrimaryNodeName = state.Info.NodeName;
                Guid   partitionId          = state.Info.PartitionId;
                string behaviorName         = state.Info.UnreliableTransportInfo.First().Item2;
                int    targetReplicaSetSize = state.Info.TargetReplicaSetSize;

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);
                System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
                behavior.AddFilterForPartitionId(partitionId);

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        failoverManagerPrimaryNodeName,
                        behaviorName,
                        behavior,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                List <StatefulServiceReplica> replicaList = new List <StatefulServiceReplica>();

                foreach (var replica in replicasResult)
                {
                    StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                    replicaList.Add(statefulReplica);
                }

                // Select target replicas based on the DataLosMode
                List <StatefulServiceReplica> targets = null;

                if (dataLossMode == DataLossMode.FullDataLoss)
                {
                    targets = GetReplicasForFullDataLoss(replicaList);
                }
                else if (dataLossMode == DataLossMode.PartialDataLoss)
                {
                    targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, replicaList);
                }
                else
                {
                    throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
                }

                if (targets == null)
                {
                    // This will cause the command to rollback and retry
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                foreach (var replica in targets)
                {
                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
                        this.State.OperationId,
                        replica.Id,
                        partitionId,
                        replica.ReplicaRole,
                        replica.ReplicaStatus);

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                            replica.NodeName,
                            partitionId,
                            replica.Id,
                            this.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveReplicaErrors.Value,
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);
                }

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

                await RemoveUnreliableTransportAsync(this.FabricClient, failoverManagerPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                bool          dataLossWasSuccessful = false;
                TimeoutHelper timeoutHelper         = new TimeoutHelper(TimeSpan.FromSeconds(30));

                do
                {
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    bool partitionFound     = false;
                    long postDataLossNumber = 0;
                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                            partitionFound     = true;
                            break;
                        }
                    }

                    if (!partitionFound)
                    {
                        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                    }

                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
                        this.State.OperationId,
                        partitionId,
                        timeoutHelper.GetRemainingTime(),
                        preDataLossNumber,
                        postDataLossNumber);

                    if (postDataLossNumber != preDataLossNumber)
                    {
                        dataLossWasSuccessful = true;
                        break;
                    }

                    await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
                }while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

                if (!dataLossWasSuccessful)
                {
                    // This is only viewable internally for debug.  This will cause a retry of the whole flow.
                    string error = string.Format(
                        CultureInfo.InvariantCulture,
                        "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
                        this.State.OperationId,
                        partitionSelector.ServiceName,
                        partitionId,
                        this.dataLossCheckWaitDurationInSeconds);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }