/// <summary>
/// Prepares a quorum loss command: verifies that the selected service is stateful and persisted,
/// resolves the target partition, chooses the replicas to act on according to the configured
/// quorum loss mode, and stores the partition id, replica ids and per-node behavior names on the state.
/// </summary>
/// <param name="cancellationToken">Token passed to every fabric client call made by this step.</param>
/// <param name="serviceInternalFaultInfo">Not read by this step.</param>
/// <returns>The command state after <c>StepStateNames.PerformingActions</c> has been pushed onto its progress.</returns>
/// <exception cref="FabricInvalidForStatelessServicesException">The service description is not of kind Stateful.</exception>
/// <exception cref="FabricOnlyValidForStatefulPersistentServicesException">The stateful service has no persisted state.</exception>
/// <exception cref="FabricTransientException">Replica selection returned null.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeQuorumLossState quorumLossState = Convert(this.State);

    // Look up the service description so its kind and persistence can be validated.
    ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    if (serviceDescription.Kind != ServiceDescriptionKind.Stateful)
    {
        // The first argument is a debug-only message; it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
    ReleaseAssert.AssertIf(
        statefulDescription == null,
        string.Format(CultureInfo.InvariantCulture, "{0} - Service is not a stateful service", this.State.OperationId));

    if (!statefulDescription.HasPersistedState)
    {
        // The first argument is a debug-only message; it is not returned to the user.
        throw new FabricOnlyValidForStatefulPersistentServicesException("This is only valid for stateful persistent services", FabricErrorCode.OnlyValidForStatefulPersistentServices);
    }

    // Resolve the partition selector to a concrete partition.
    SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);
    Guid partitionId = selectedPartition.PartitionId;

    // Query the replicas that currently belong to that partition.
    ServiceReplicaList queriedReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Every replica is expected to be a stateful replica; assert on anything else.
    List<StatefulServiceReplica> statefulReplicas = new List<StatefulServiceReplica>();
    foreach (var queried in queriedReplicas)
    {
        StatefulServiceReplica asStateful = queried as StatefulServiceReplica;
        ReleaseAssert.AssertIf(asStateful == null, "Expected stateful replica");
        statefulReplicas.Add(asStateful);
    }

    // Choose the replicas to act on according to the requested mode.
    List<StatefulServiceReplica> targetReplicas;
    switch (this.quorumLossMode)
    {
        case QuorumLossMode.AllReplicas:
            targetReplicas = statefulReplicas.FindAll(
                candidate => candidate.ReplicaRole == ReplicaRole.Primary || candidate.ReplicaRole == ReplicaRole.ActiveSecondary);
            break;

        case QuorumLossMode.QuorumReplicas:
            targetReplicas = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(quorumLossState.OperationId, statefulReplicas);
            break;

        default:
            throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedQuorumLossMode);
    }

    if (targetReplicas == null)
    {
        // Throwing here causes the command to roll back and retry.
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    // Collect the node of every target replica, then pair each node with its behavior name.
    List<string> targetNodeNames = targetReplicas.ConvertAll(target => target.NodeName);

    List<Tuple<string, string>> nodeBehaviorPairs = new List<Tuple<string, string>>();
    foreach (string targetNodeName in targetNodeNames)
    {
        // NOTE(review): this behavior object is built and filtered but not consumed in this method;
        // only the generated behavior name is recorded on the state.
        UnreliableTransportBehavior reopenBehavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
        reopenBehavior.AddFilterForPartitionId(partitionId);

        string generatedBehaviorName = this.CreateBehaviorName(targetNodeName);
        nodeBehaviorPairs.Add(Tuple.Create(targetNodeName, generatedBehaviorName));
    }

    // Record progress and everything later steps need on the state.
    quorumLossState.StateProgress.Push(StepStateNames.PerformingActions);
    quorumLossState.Info.PartitionId = partitionId;
    quorumLossState.Info.ReplicaIds = targetReplicas.ConvertAll(target => target.Id);
    quorumLossState.Info.UnreliableTransportInfo = nodeBehaviorPairs;

    return quorumLossState;
}
/// <summary>
/// Performs the data loss step for the partition recorded in the command state.
/// It applies an unreliable transport behavior on the node stored in the state, removes the
/// replicas selected by the data loss mode, waits for them to be dropped, removes the unreliable
/// transport behavior again, and then polls the partition until its data loss number differs from
/// the value recorded before the operation.
/// </summary>
/// <param name="cancellationToken">Token passed to every fabric client call and delay made by this step.</param>
/// <param name="serviceInternalFaultInfo">Forwarded to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>The command state after <c>StepStateNames.CompletedSuccessfully</c> has been pushed onto its progress.</returns>
/// <exception cref="FabricTransientException">
/// Replica selection returned null, or the data loss number did not change within the configured wait duration.
/// </exception>
/// <exception cref="FabricException">The partition was not present in the partition query result.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    // Inputs recorded on the state by earlier steps.
    PartitionSelector partitionSelector = state.Info.PartitionSelector;
    DataLossMode dataLossMode = state.Info.DataLossMode;
    long preDataLossNumber = state.Info.DataLossNumber;
    string failoverManagerPrimaryNodeName = state.Info.NodeName;
    Guid partitionId = state.Info.PartitionId;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);

    // Apply an unreliable transport behavior for "DoReconfiguration", filtered to this partition,
    // on the node recorded in the state.
    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            behavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Every replica of a stateful service is expected to be a stateful replica; assert otherwise.
    List<StatefulServiceReplica> replicaList = new List<StatefulServiceReplica>();
    foreach (var replica in replicasResult)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
        replicaList.Add(statefulReplica);
    }

    // Select target replicas based on the DataLossMode.
    List<StatefulServiceReplica> targets = null;
    if (dataLossMode == DataLossMode.FullDataLoss)
    {
        targets = GetReplicasForFullDataLoss(replicaList);
    }
    else if (dataLossMode == DataLossMode.PartialDataLoss)
    {
        targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, replicaList);
    }
    else
    {
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
    }

    if (targets == null)
    {
        // This will cause the command to rollback and retry.
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    // Remove every selected replica to induce the data loss.
    foreach (var replica in targets)
    {
        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
            this.State.OperationId,
            replica.Id,
            partitionId,
            replica.ReplicaRole,
            replica.ReplicaStatus);

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                replica.NodeName,
                partitionId,
                replica.Id,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveReplicaErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

    await RemoveUnreliableTransportAsync(this.FabricClient, failoverManagerPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    // Poll until the partition's data loss number changes. The wait is bounded by the configured
    // duration, which is also the value reported in the failure trace below; previously a
    // hard-coded 30 seconds was used here, so the reported and actual durations could disagree.
    bool dataLossWasSuccessful = false;
    TimeoutHelper timeoutHelper = new TimeoutHelper(TimeSpan.FromSeconds(this.dataLossCheckWaitDurationInSeconds));

    do
    {
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        bool partitionFound = false;
        long postDataLossNumber = 0;
        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                partitionFound = true;
                break;
            }
        }

        if (!partitionFound)
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
        }

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
            this.State.OperationId,
            partitionId,
            timeoutHelper.GetRemainingTime(),
            preDataLossNumber,
            postDataLossNumber);

        if (postDataLossNumber != preDataLossNumber)
        {
            dataLossWasSuccessful = true;
            break;
        }

        await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
    }
    while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

    if (!dataLossWasSuccessful)
    {
        // This is only viewable internally for debug. This will cause a retry of the whole flow.
        string error = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
            this.State.OperationId,
            partitionSelector.ServiceName,
            partitionId,
            this.dataLossCheckWaitDurationInSeconds);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}