/// <summary>
/// Resolves the partition targeted by the action's replica selector, queries the replicas of that
/// partition and stores the selected replica together with its partition in <c>action.Result</c>.
/// </summary>
/// <param name="testContext">Supplies the fabric client, the action executor and the random source used for selection.</param>
/// <param name="action">Action carrying the replica selector and timeouts; receives the result tuple.</param>
/// <param name="cancellationToken">Token forwarded to every nested call.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, GetSelectedReplicaStateAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

    // A single budget (action.ActionTimeout) is shared by the partition lookup and the replica query.
    TimeoutHelper helper = new TimeoutHelper(action.ActionTimeout);

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.ReplicaSelector.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    // ConfigureAwait(false) keeps this await consistent with the other awaits in this handler,
    // none of which need to resume on a captured synchronization context.
    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // TODO: make these actions which store state locally as well.
    // NOTE(review): this overload takes no continuation token, so presumably only the first page
    // of replicas is considered - confirm whether paging is needed here.
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync<ServiceReplicaList>(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    Replica replicaResult = action.ReplicaSelector.GetSelectedReplica(replicasResult.ToArray(), testContext.Random, true /*skip invalid replicas*/);
    var replicaSelectorResult = new SelectedReplica(replicaResult.Id, getPartitionStateAction.Result);

    action.Result = new Tuple<SelectedReplica, Replica>(
        replicaSelectorResult,
        replicaResult);

    ResultTraceString = StringHelper.Format("ReplicaSelector Selected Replica {0}", replicaResult.Id);
}
/// <summary>
/// Polls the aggregated cluster health until it is acceptable for the requested check flags or
/// <c>action.MaximumStabilizationTimeout</c> has elapsed.
/// </summary>
/// <param name="testContext">Supplies the fabric client used for the health query.</param>
/// <param name="action">Carries the check flags, the request timeout and the stabilization timeout.</param>
/// <param name="token">Token forwarded to the health query and to the wait between polls.</param>
/// <exception cref="FabricValidationException">
/// The cluster did not reach an acceptable health state before the stabilization timeout expired.
/// </exception>
private async Task ValidateClusterHealth(FabricTestContext testContext, ValidateClusterAction action, CancellationToken token)
{
    TimeoutHelper timer = new TimeoutHelper(action.MaximumStabilizationTimeout);
    bool success = false;

    // Holds the health events of the most recent failed poll; reported in the final exception.
    StringBuilder healthinfo = new StringBuilder();

    while (!success && timer.GetRemainingTime() > TimeSpan.Zero)
    {
        healthinfo.Clear();

        ClusterHealthPolicy healthPolicy = new ClusterHealthPolicy();
        var clusterHealthResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.HealthManager.GetClusterHealthAsync(
                healthPolicy,
                action.RequestTimeout,
                token),
            FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
            timer.GetRemainingTime(),
            token).ConfigureAwait(false);

        bool checkError = (action.CheckFlag & ValidationCheckFlag.CheckError) != 0;
        bool checkWarning = (action.CheckFlag & ValidationCheckFlag.CheckWarning) != 0;

        // Invalid and Unknown are always treated as "not healthy yet"; Error and Warning only when requested.
        if ((checkError && clusterHealthResult.AggregatedHealthState == HealthState.Error) ||
            (checkWarning && clusterHealthResult.AggregatedHealthState == HealthState.Warning) ||
            clusterHealthResult.AggregatedHealthState == HealthState.Invalid ||
            clusterHealthResult.AggregatedHealthState == HealthState.Unknown)
        {
            AppTrace.TraceSource.WriteInfo(TraceSource, "Cluster health state is {0}. Will Retry check", clusterHealthResult.AggregatedHealthState);

            foreach (HealthEvent healthEvent in clusterHealthResult.HealthEvents)
            {
                healthinfo.AppendLine(string.Format(
                    "Cluster health state is '{0}' with property '{1}', sourceId '{2}' and description '{3}'",
                    healthEvent.HealthInformation.HealthState,
                    healthEvent.HealthInformation.Property,
                    healthEvent.HealthInformation.SourceId,
                    healthEvent.HealthInformation.Description));
            }

            // Health descriptions are free-form text and may contain '{' or '}'. Pass the report as an
            // argument instead of as the format string so it can never be misread as a composite format.
            AppTrace.TraceSource.WriteInfo(TraceSource, "{0}", healthinfo.ToString());
        }
        else
        {
            success = true;
        }

        if (!success)
        {
            // Delay before querying again so we allow some time for state to change - don't spam the node
            await AsyncWaiter.WaitAsync(RetryWaitTimeout, token).ConfigureAwait(false);
        }
    }

    if (!success)
    {
        throw new FabricValidationException(StringHelper.Format(StringResources.Error_ServiceNotHealthy, "Cluster", action.MaximumStabilizationTimeout, healthinfo));
    }
}
/// <summary>
/// Lists the services of <c>action.ApplicationName</c> and runs a <c>ValidateServiceAction</c> for each
/// of them concurrently, completing once all of the validations have completed.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the application name, timeouts and check flags.</param>
/// <param name="cancellationToken">Token forwarded to the query and to every validation.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, ValidateApplicationServicesAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.ApplicationName, "ApplicationName");

    // The stabilization timeout is the shared budget for the listing and all per-service validations.
    TimeoutHelper stabilizationBudget = new TimeoutHelper(action.MaximumStabilizationTimeout);

    // TODO: make these actions which store state locally as well.
    var applicationServices = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetServiceListAsync(
            action.ApplicationName,
            null,
            action.RequestTimeout,
            cancellationToken),
        stabilizationBudget.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    var pendingValidations = new List<Task>();
    foreach (Service service in applicationServices)
    {
        var serviceValidation = new ValidateServiceAction(service.ServiceName, stabilizationBudget.GetRemainingTime())
        {
            ActionTimeout = action.ActionTimeout,
            RequestTimeout = action.RequestTimeout,
            CheckFlag = action.CheckFlag
        };

        // Start the validation without awaiting it so that all services are validated in parallel.
        Task running = testContext.ActionExecutor.RunAsync(serviceValidation, cancellationToken);
        pendingValidations.Add(running);
    }

    await Task.WhenAll(pendingValidations).ConfigureAwait(false);

    ResultTraceString = StringHelper.Format("ValidateApplicationServicesAction succeeded for {0}", action.ApplicationName);
}
/// <summary>
/// Queries the full node list, following continuation tokens until every page has been read, and
/// converts each node into a <c>NodeInfo</c>.
/// </summary>
/// <param name="requestTimeout">Timeout applied to each individual node-list request.</param>
/// <param name="operationTimeout">Timeout handed to the retry helper for each page.</param>
/// <param name="cancellationToken">Token forwarded to the retry helper and to each request.</param>
/// <returns>One <c>NodeInfo</c> per node returned by the query.</returns>
/// <exception cref="InvalidOperationException">The query returned no nodes.</exception>
public async Task<IEnumerable<NodeInfo>> GetLatestNodeInfoAsync(TimeSpan requestTimeout, TimeSpan operationTimeout, CancellationToken cancellationToken)
{
    // Get all current known nodes
    NodeList nodeList = new NodeList();
    string continuationToken = null;

    do
    {
        // requestTimeout and cancellationToken are forwarded to the query itself; previously the
        // requestTimeout parameter was ignored and an in-flight request could not be cancelled.
        NodeList queryResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.fabricClient.QueryManager.GetNodeListAsync(
                null,
                continuationToken,
                requestTimeout,
                cancellationToken),
            operationTimeout,
            cancellationToken).ConfigureAwait(false);

        nodeList.AddRangeNullSafe(queryResult);

        continuationToken = queryResult.ContinuationToken;
    }
    while (!string.IsNullOrEmpty(continuationToken));

    if (nodeList.Count == 0)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_NotEnoughNodesForTestabilityAction, "GetNodes"));
    }

    var nodes = new List<NodeInfo>(nodeList.Count);
    foreach (Node node in nodeList)
    {
        nodes.Add(NodeInfo.CreateNodeInfo(node));
    }

    return nodes;
}
/// <summary>
/// Reads every page of the partition list of <c>this.serviceName</c>, retrying on the standard
/// partition-list errors and additionally on <c>FabricErrorCode.PartitionNotFound</c>.
/// </summary>
/// <param name="ct">Token forwarded to the retry helper and to each request.</param>
/// <returns>All partitions gathered across the pages of the query.</returns>
public async Task<ServicePartitionList> GetPartitionsAsync(CancellationToken ct)
{
    var defaultErrors = FabricClientRetryErrors.GetPartitionListFabricErrors.Value;
    ReleaseAssert.AssertIfNull(defaultErrors, "partition list error code");

    // Build a private copy of the default retry set so that the extra error code added below
    // does not leak into the shared defaults.
    var errorsToRetry = new FabricClientRetryErrors();
    errorsToRetry.RetryableFabricErrorCodes.AddRangeNullSafe(defaultErrors.RetryableFabricErrorCodes);
    errorsToRetry.RetryableExceptions.AddRangeNullSafe(defaultErrors.RetryableExceptions);
    errorsToRetry.RetryableFabricErrorCodes.Add(FabricErrorCode.PartitionNotFound);

    var allPartitions = new ServicePartitionList();
    string nextPageToken = null;

    while (true)
    {
        ServicePartitionList page = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.TestContext.FabricClient.QueryManager.GetPartitionListAsync(
                this.serviceName,
                null,
                nextPageToken,
                this.requestTimeout,
                ct),
            errorsToRetry,
            this.operationTimeout,
            ct).ConfigureAwait(false);

        allPartitions.AddRangeNullSafe(page);
        nextPageToken = page.ContinuationToken;

        // An empty continuation token marks the last page.
        if (string.IsNullOrEmpty(nextPageToken))
        {
            break;
        }
    }

    return allPartitions;
}
/// <summary>
/// Lists the services under <c>SystemApplicationName</c> and validates all of them in parallel.
/// An exception propagates to the caller if the listing or any validation fails.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the timeouts and check flags applied to every validation.</param>
/// <param name="token">Token forwarded to the query and to every validation.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, ValidateSystemServicesAction action, CancellationToken token)
{
    this.timer = new TimeoutHelper(action.MaximumStabilizationTimeout);

    // Get all system services
    var systemServices = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetServiceListAsync(
            SystemApplicationName,
            null,
            action.RequestTimeout,
            token),
        this.timer.GetRemainingTime(),
        token).ConfigureAwait(false);

    var pendingValidations = new List<Task>();
    foreach (Service systemService in systemServices)
    {
        var serviceValidation = new ValidateServiceAction(systemService.ServiceName, this.timer.GetRemainingTime())
        {
            ActionTimeout = action.ActionTimeout,
            RequestTimeout = action.RequestTimeout,
            CheckFlag = action.CheckFlag
        };

        // Started but not awaited here: all validations run concurrently and are joined below.
        pendingValidations.Add(testContext.ActionExecutor.RunAsync(serviceValidation, token));
        Trace.WriteNoise("ValidateSystemServicesActionHandler: Validation task added for service: {0}", systemService.ServiceName.OriginalString);
    }

    await Task.WhenAll(pendingValidations).ConfigureAwait(false);

    ResultTraceString = "ValidateSystemServicesActionHandler completed successfully";
}
/// <summary>
/// Reads every page of the replica list of the given partition, using the default replica
/// status filter, and returns the combined list.
/// </summary>
/// <param name="partitionId">Partition whose replicas are queried.</param>
/// <param name="ct">Token forwarded to the retry helper and to each request.</param>
/// <returns>All replicas gathered across the pages of the query.</returns>
public async Task<ServiceReplicaList> GetReplicasAsync(Guid partitionId, CancellationToken ct)
{
    var allReplicas = new ServiceReplicaList();
    string nextPageToken = null;

    while (true)
    {
        ServiceReplicaList page = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.TestContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                ServiceReplicaStatusFilter.Default,
                nextPageToken,
                this.requestTimeout,
                ct),
            this.operationTimeout,
            ct).ConfigureAwait(false);

        allReplicas.AddRangeNullSafe(page);
        nextPageToken = page.ContinuationToken;

        // An empty continuation token marks the last page.
        if (string.IsNullOrEmpty(nextPageToken))
        {
            break;
        }
    }

    return allReplicas;
}
/// <summary>
/// Queries the code packages deployed on a node for the given application, service manifest and
/// code package name, and returns the entry that also matches the service package activation id.
/// </summary>
/// <param name="testContext">Supplies the fabric client used for the query.</param>
/// <param name="nodeName">Node to query.</param>
/// <param name="applicationName">Application owning the code package.</param>
/// <param name="serviceManifestName">Service manifest filter passed to the query.</param>
/// <param name="servicePackageActivationId">Activation id the returned entry must match.</param>
/// <param name="codePackageName">Code package filter passed to the query and matched on the result.</param>
/// <param name="action">Provides the request timeout.</param>
/// <param name="cancellationToken">Token forwarded to the retry helper and to the query.</param>
/// <returns>
/// The first matching deployed code package, or the default value when the query returned
/// entries but none of them matches.
/// </returns>
/// <exception cref="FabricException">
/// The query returned null or an empty list (error code <c>CodePackageNotFound</c>).
/// </exception>
private async Task<DeployedCodePackage> GetCodePackageInfoAsync(
    FabricTestContext testContext,
    string nodeName,
    Uri applicationName,
    string serviceManifestName,
    string servicePackageActivationId,
    string codePackageName,
    RestartDeployedCodePackageAction action,
    CancellationToken cancellationToken)
{
    var deployedPackages = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync<DeployedCodePackageList>(
        () => testContext.FabricClient.QueryManager.GetDeployedCodePackageListAsync(
            nodeName,
            applicationName,
            serviceManifestName,
            codePackageName,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    bool nothingDeployed = deployedPackages == null || deployedPackages.Count == 0;
    if (nothingDeployed)
    {
        string message = StringHelper.Format(StringResources.Error_CodePackageNotDeployedOnNode, applicationName, codePackageName, nodeName);
        throw new FabricException(message, FabricErrorCode.CodePackageNotFound);
    }

    // The query has no activation-id filter, so that part of the match is done on the client side.
    return deployedPackages.FirstOrDefault(
        candidate => candidate.CodePackageName.Equals(codePackageName) &&
                     candidate.ServicePackageActivationId == servicePackageActivationId);
}
/// <summary>
/// Starts the node named by the action. When the action carries no explicit node instance
/// (<c>BigInteger.MinusOne</c>), the current instance id is looked up first. In
/// <c>CompletionMode.Verify</c> the handler then polls until the node reports a higher instance id
/// and is up, or the action timeout expires.
/// </summary>
/// <param name="testContext">Supplies the fabric client.</param>
/// <param name="action">Carries the node name, instance, endpoint, timeouts and completion mode; receives the result.</param>
/// <param name="cancellationToken">Token forwarded to every nested call.</param>
/// <exception cref="FabricException">The node instance had to be looked up but the node was not found.</exception>
/// <exception cref="TimeoutException">Verification did not succeed within the action timeout.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, StartNodeAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.NodeName, "NodeName");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    string nodeName = action.NodeName;
    BigInteger nodeInstance = action.NodeInstance;

    // MinusOne means "use whatever instance the node currently has".
    if (nodeInstance == BigInteger.MinusOne)
    {
        // ConfigureAwait(false) added for consistency with the other awaits in this handler.
        var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, action, cancellationToken).ConfigureAwait(false);
        if (nodeInfo == null)
        {
            throw new FabricException(StringResources.Error_NodeNotFound, FabricErrorCode.NodeNotFound);
        }

        nodeInstance = nodeInfo.NodeInstanceId;
    }

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.StartNodeUsingNodeNameAsync(
            nodeName,
            nodeInstance,
            action.IPAddressOrFQDN,
            action.ClusterConnectionPort,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (action.CompletionMode == CompletionMode.Verify)
    {
        bool success = false;
        while (this.helper.GetRemainingTime() > TimeSpan.Zero)
        {
            var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, action, cancellationToken).ConfigureAwait(false);

            // The start is confirmed once the node is up with an instance id above the one that was started.
            if (nodeInfo != null && nodeInfo.NodeInstanceId > nodeInstance && nodeInfo.IsNodeUp)
            {
                success = true;
                break;
            }

            ActionTraceSource.WriteInfo(TraceSource, "NodeName = {0} not yet Started. Retrying...", action.NodeName);
            await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
        }

        if (!success)
        {
            // Report the resolved instance rather than action.NodeInstance, which may still be the
            // MinusOne placeholder; this matches what the success trace below reports.
            throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "StartNode", action.NodeName + ":" + nodeInstance));
        }
    }

    action.Result = new StartNodeResult(action.NodeName, nodeInstance);
    this.ResultTraceString = StringHelper.Format("StartNodeAction succeeded for {0}:{1} with CompletionMode = {2}", action.NodeName, nodeInstance, action.CompletionMode);
}
/// <summary>
/// Determines the partition addressed by the action's partition selector and stores it, together
/// with the service name, in <c>action.Result</c>. When the selector already names a partition id,
/// that id is checked against the partitions returned for the service; otherwise the selector picks
/// one of the returned partitions.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the random source used for selection.</param>
/// <param name="action">Carries the partition selector and timeouts; receives the result.</param>
/// <param name="cancellationToken">Token forwarded to the retry helper and to the query.</param>
/// <exception cref="FabricException">
/// The selector names a partition id that is not among the partitions returned for the service
/// (error code <c>PartitionNotFound</c>).
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, GetSelectedPartitionStateAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    Guid partitionId;
    Uri serviceName;
    bool partitionIdKnown = action.PartitionSelector.TryGetPartitionIdIfNotGetServiceName(out partitionId, out serviceName);

    // Both paths need the partition list of the service, so it is queried once up front.
    // Intentionally do not use FabricClientRetryErrors.GetPartitionListFabricErrors. We do not want to retry "service not found".
    // TODO: make these actions which store state locally as well.
    ServicePartitionList servicePartitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync<ServicePartitionList>(
        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
            serviceName,
            null,
            default(string),
            action.RequestTimeout,
            cancellationToken),
        action.ActionTimeout,
        cancellationToken).ConfigureAwait(false);

    if (partitionIdKnown)
    {
        // Validate the partition specified is actually from the service specified.
        var knownIds = servicePartitions.Select(p => p.PartitionId()).ToList();
        if (!knownIds.Contains(partitionId))
        {
            // The message in the first arg is only for debugging, it is not returned to the user.
            throw new FabricException("Partition not found", FabricErrorCode.PartitionNotFound);
        }
    }
    else
    {
        Partition chosen = action.PartitionSelector.GetSelectedPartition(servicePartitions.ToArray(), testContext.Random);
        partitionId = chosen.PartitionInformation.Id;
    }

    action.Result = new SelectedPartition(serviceName, partitionId);

    ResultTraceString = StringHelper.Format("PartitionSelector Selected Partition with ID {0}", action.Result);
}
/// <summary>
/// Validates the system services first and then, in parallel, the services of every application
/// returned by the application query. An exception propagates to the caller if any step fails.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the timeouts and check flags applied to every nested validation.</param>
/// <param name="token">Token forwarded to every nested call.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, ValidateAllServicesAction action, CancellationToken token)
{
    // The stabilization timeout is the shared budget for every step below.
    var stabilizationBudget = new TimeoutHelper(action.MaximumStabilizationTimeout);

    //// Validate system services first.
    var systemValidation = new ValidateSystemServicesAction(stabilizationBudget.GetRemainingTime())
    {
        ActionTimeout = action.ActionTimeout,
        RequestTimeout = action.RequestTimeout,
        CheckFlag = action.CheckFlag
    };

    await testContext.ActionExecutor.RunAsync(systemValidation, token).ConfigureAwait(false);

    var applications = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetApplicationListAsync(
            null,
            string.Empty,
            action.RequestTimeout,
            token),
        stabilizationBudget.GetRemainingTime(),
        token).ConfigureAwait(false);

    var pendingValidations = new List<Task>();
    foreach (Application application in applications)
    {
        var applicationValidation = new ValidateApplicationServicesAction(application.ApplicationName, stabilizationBudget.GetRemainingTime())
        {
            ActionTimeout = action.ActionTimeout,
            RequestTimeout = action.RequestTimeout,
            CheckFlag = action.CheckFlag
        };

        // Started but not awaited here: the applications are validated concurrently and joined below.
        pendingValidations.Add(testContext.ActionExecutor.RunAsync(applicationValidation, token));
        this.ActionTraceSource.WriteNoise(TraceType, "ValidateAllServicesActionHandler: Validation task added for application: {0}", application.ApplicationName.OriginalString);
    }

    await Task.WhenAll(pendingValidations).ConfigureAwait(false);

    this.ResultTraceString = "ValidateAllServicesActionHandler completed for all services";
}
/// <summary>
/// Removes every unreliable transport behavior (name filter "*") from each node that is currently
/// up, pausing five seconds after each removal. Nodes that are not up are only traced.
/// </summary>
/// <param name="testContext">Supplies the fabric cluster and the fabric client.</param>
/// <param name="action">Carries the action and request timeouts.</param>
/// <param name="cancellationToken">Token forwarded to every nested call and to the delays.</param>
protected override async Task ExecuteActionAsync(
    FabricTestContext testContext,
    CleanTestStateAction action,
    CancellationToken cancellationToken)
{
    this.helper = new TimeoutHelper(action.ActionTimeout);

    var nodes = await testContext.FabricCluster.GetLatestNodeInfoAsync(action.RequestTimeout, this.helper.GetRemainingTime(), cancellationToken).ConfigureAwait(false);

    foreach (var nodeInfo in nodes)
    {
        if (nodeInfo.IsNodeUp)
        {
            // Copy the loop variable into a local so the lambda below captures this iteration's node.
            var info = nodeInfo;
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    info.NodeName,
                    "*",
                    action.RequestTimeout,
                    cancellationToken),
                this.helper.GetRemainingTime(),
                cancellationToken).ConfigureAwait(false);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
            // ConfigureAwait(false) added for consistency with the other awaits in this handler.
            await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

            ActionTraceSource.WriteInfo(TraceType, "Test state cleaned for node:{0}", nodeInfo.NodeName);
        }
        else
        {
            // The node is not up, so there is nothing to remove from it; it is skipped and only traced.
            ActionTraceSource.WriteInfo(TraceType, "Test clean failed to start node {0}", nodeInfo.NodeName);
        }
    }

    ResultTraceString = StringHelper.Format("CleanTestStateAction succeeded");
}
/// <summary>
/// Restarts a replica. The target is taken from the action's node name, partition id and replica id
/// when all three are supplied; otherwise it is resolved through the action's replica selector.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the target description, timeouts and completion mode; receives the result.</param>
/// <param name="cancellationToken">Token forwarded to every nested call.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartReplicaAction action, CancellationToken cancellationToken)
{
    TimeoutHelper helper = new TimeoutHelper(action.ActionTimeout);

    string nodeName = action.NodeName;
    Guid? partitionId = action.PartitionId;
    long? replicaId = action.ReplicaId;
    SelectedReplica replicaSelectorResult = SelectedReplica.None;

    // Any missing piece of the explicit target means the replica selector has to resolve all of it.
    if (string.IsNullOrEmpty(nodeName) || !partitionId.HasValue || !replicaId.HasValue)
    {
        ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

        var getReplicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
        {
            RequestTimeout = action.RequestTimeout,
            ActionTimeout = helper.GetRemainingTime()
        };

        await testContext.ActionExecutor.RunAsync(getReplicaStateAction, cancellationToken).ConfigureAwait(false);
        var replicaStateActionResult = getReplicaStateAction.Result;

        // Assert on the tuple itself before reading its items, as the sibling handlers do; otherwise
        // a missing result surfaces as a NullReferenceException instead of an assert.
        ReleaseAssert.AssertIf(replicaStateActionResult == null, "replicaStateActionResult cannot be null");

        replicaSelectorResult = replicaStateActionResult.Item1;
        ReleaseAssert.AssertIf(replicaSelectorResult == null, "replicaSelectorResult cannot be null");

        partitionId = replicaStateActionResult.Item1.SelectedPartition.PartitionId;

        Replica replicaStateResult = replicaStateActionResult.Item2;
        ReleaseAssert.AssertIf(replicaStateResult == null, "replicaStateResult cannot be null");

        nodeName = replicaStateResult.NodeName;
        replicaId = replicaStateResult.Id;
    }

    ThrowIf.IsFalse(partitionId.HasValue, "PartitionID");
    ThrowIf.IsFalse(replicaId.HasValue, "ReplicaID");

    // ConfigureAwait(false) added for consistency with the other awaits in this handler.
    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.RestartReplicaAsync(
            nodeName,
            partitionId.Value,
            replicaId.Value,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.RestartReplicaErrors.Value,
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (action.CompletionMode == CompletionMode.Verify)
    {
        // TODO: Check with failover team to see how to confirm that the replica actually restarted.
        // We do not expose instance id for persisted replicas
    }

    action.Result = new RestartReplicaResult(replicaSelectorResult);
    this.ResultTraceString = StringHelper.Format(
        "RestartReplicaOrInstance succeeded by restarting replica {0}:{1} node {2} with CompletionMode {3}",
        partitionId.Value,
        replicaId.Value,
        nodeName,
        action.CompletionMode);
}
/// <summary>
/// Restarts a deployed code package. The target comes from the action's node, service manifest and
/// code package names when all are supplied; otherwise it is derived from the replica chosen by the
/// action's replica selector. In <c>CompletionMode.Verify</c> the handler polls until the restarted
/// entry point reports a higher instance id and the Started status, or the action timeout expires.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the target description, timeouts and completion mode; receives the result.</param>
/// <param name="cancellationToken">Token forwarded to every nested call.</param>
/// <exception cref="FabricException">The selected replica was not found among the replicas deployed on its node.</exception>
/// <exception cref="TimeoutException">Verification did not succeed within the action timeout.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartDeployedCodePackageAction action, CancellationToken cancellationToken)
{
    this.helper = new TimeoutHelper(action.ActionTimeout);

    string nodeName = action.NodeName;
    Uri applicationName = action.ApplicationName;
    string serviceManifestName = action.ServiceManifestName;
    string servicePackageActivationId = action.ServicePackageActivationId;
    string codePackageName = action.CodePackageName;
    SelectedReplica selectedReplicaInfo = SelectedReplica.None;

    ThrowIf.Null(applicationName, "ApplicationName");

    bool targetIncomplete =
        string.IsNullOrEmpty(nodeName) ||
        string.IsNullOrEmpty(serviceManifestName) ||
        string.IsNullOrEmpty(codePackageName);

    if (targetIncomplete)
    {
        // Resolve the target through the replica selector: pick a replica, then look up which
        // code package hosts it on its node.
        ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

        var replicaLookup = new GetSelectedReplicaStateAction(action.ReplicaSelector)
        {
            RequestTimeout = action.RequestTimeout,
            ActionTimeout = this.helper.GetRemainingTime()
        };

        await testContext.ActionExecutor.RunAsync(replicaLookup, cancellationToken).ConfigureAwait(false);

        var lookupResult = replicaLookup.Result;
        ReleaseAssert.AssertIf(lookupResult == null, "replicaStateActionResult cannot be null");

        selectedReplicaInfo = lookupResult.Item1;
        ReleaseAssert.AssertIf(
            selectedReplicaInfo == null || selectedReplicaInfo.SelectedPartition == null,
            "replicaSelectorResult cannot be null or for a non-null replicaSelectorResult, the selected partition must be non-null");

        Guid partitionId = lookupResult.Item1.SelectedPartition.PartitionId;

        Replica chosenReplica = lookupResult.Item2;
        ReleaseAssert.AssertIf(chosenReplica == null, "replicaStateResult cannot be null");

        nodeName = chosenReplica.NodeName;

        var replicasOnNode = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync<DeployedServiceReplicaList>(
            () => testContext.FabricClient.QueryManager.GetDeployedReplicaListAsync(
                nodeName,
                applicationName,
                null,
                partitionId,
                action.RequestTimeout,
                cancellationToken),
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        DeployedServiceReplica deployedReplica = replicasOnNode.FirstOrDefault(candidate => candidate.Partitionid == partitionId);
        if (deployedReplica == null)
        {
            throw new FabricException(
                StringHelper.Format(StringResources.Error_DidNotFindDeployedReplicaOnNode, partitionId, nodeName),
                FabricErrorCode.ReplicaDoesNotExist);
        }

        serviceManifestName = deployedReplica.ServiceManifestName;
        servicePackageActivationId = deployedReplica.ServicePackageActivationId;
        codePackageName = deployedReplica.CodePackageName;
    }

    ActionTraceSource.WriteInfo(
        TraceSource,
        "SelectedReplica: serviceManifestName: {0}, servicePackageActivationId: {1}, codePackageName: {2}",
        serviceManifestName,
        servicePackageActivationId,
        codePackageName);

    DeployedCodePackage packageBeforeRestart = await this.GetCodePackageInfoAsync(
        testContext,
        nodeName,
        applicationName,
        serviceManifestName,
        servicePackageActivationId,
        codePackageName,
        action,
        cancellationToken).ConfigureAwait(false);

    var codepackageEntrypointToRestart = GetCodepackageEntrypointToRestart(action, packageBeforeRestart);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.RestartDeployedCodePackageUsingNodeNameAsync(
            nodeName,
            applicationName,
            serviceManifestName,
            servicePackageActivationId,
            codePackageName,
            codepackageEntrypointToRestart.EntryPoint.CodePackageInstanceId,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (action.CompletionMode == CompletionMode.Verify)
    {
        bool restartObserved = false;
        while (!restartObserved && this.helper.GetRemainingTime() > TimeSpan.Zero)
        {
            var packageNow = await this.GetCodePackageInfoAsync(
                testContext,
                nodeName,
                applicationName,
                serviceManifestName,
                servicePackageActivationId,
                codePackageName,
                action,
                cancellationToken).ConfigureAwait(false);

            if (packageNow != null)
            {
                // Compare against the same kind of entry point (main or setup) that was restarted.
                var entryPointNow = codepackageEntrypointToRestart.EntryPointType == EntryPointType.Main
                    ? packageNow.EntryPoint
                    : packageNow.SetupEntryPoint;

                restartObserved =
                    entryPointNow != null &&
                    entryPointNow.CodePackageInstanceId > codepackageEntrypointToRestart.EntryPoint.CodePackageInstanceId &&
                    entryPointNow.EntryPointStatus == EntryPointStatus.Started;
            }

            if (!restartObserved)
            {
                ActionTraceSource.WriteInfo(TraceSource, "CodePackage = {0}:{1}:{2} not yet restarted. Retrying...", nodeName, applicationName, codePackageName);
                await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
            }
        }

        if (!restartObserved)
        {
            throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "RestartDeployedCodePackage", applicationName));
        }
    }

    action.Result = new RestartDeployedCodePackageResult(
        nodeName,
        applicationName,
        serviceManifestName,
        servicePackageActivationId,
        codePackageName,
        codepackageEntrypointToRestart.EntryPoint.CodePackageInstanceId,
        selectedReplicaInfo);

    ResultTraceString = StringHelper.Format(
        "RestartCodePackageAction succeeded for {0}:{1}:{2} with CompletionMode = {3}",
        nodeName,
        applicationName,
        codePackageName,
        action.CompletionMode);
}
/// <summary>
/// Induces data loss on the partition chosen by the action's partition selector. Each round adds an
/// unreliable transport behavior for "DoReconfiguration" on the FailoverManager primary node, removes
/// ready replicas of the partition, removes the behavior again and then checks whether the
/// partition's data loss number has changed. Rounds repeat until the number changes or the action
/// timeout expires.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Carries the partition selector, data loss mode and timeouts; receives the result.</param>
/// <param name="cancellationToken">Token forwarded to every nested call and to the delays.</param>
/// <exception cref="InvalidOperationException">The target service is not stateful.</exception>
/// <exception cref="FabricException">The partition is not found, or no ready FailoverManager primary exists.</exception>
/// <exception cref="TimeoutException">The action timeout expired before the data loss number changed.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeDataLossAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    var helper = new TimeoutHelper(action.ActionTimeout);

    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "DataLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
    }

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // Record the data loss number before any fault is induced; a change in it is the success signal.
    long preDataLossNumber = 0;

    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
            action.PartitionSelector.ServiceName,
            null,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    bool partitionFound = false;
    foreach (StatefulServicePartition partition in partitionsResult)
    {
        if (partition.PartitionInformation.Id == partitionId)
        {
            preDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
            partitionFound = true;
            break;
        }
    }

    if (!partitionFound)
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
    }

    long postDataLossNumber = preDataLossNumber;

    do
    {
        ActionTraceSource.WriteInfo(
            TraceType,
            "InvokeDataLossAction action pending time:{0}",
            helper.GetRemainingTime());

        if (helper.GetRemainingTime() <= TimeSpan.Zero)
        {
            throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "InvokeDataLoss", partitionId));
        }

        ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        ServiceReplicaList fmReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                Constants.FmPartitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // Locate the node hosting the ready FailoverManager primary; the blocking behavior is applied there.
        string fmPrimaryNodeName = string.Empty;
        var readyFMReplicas = fmReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
        foreach (var replica in readyFMReplicas)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                fmPrimaryNodeName = replica.NodeName;
            }
        }

        if (string.IsNullOrEmpty(fmPrimaryNodeName))
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
        }

        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "DoReconfiguration");
        behavior.AddFilterForPartitionId(partitionId);
        string behaviorName = "BlockDoReconfiguration";

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                fmPrimaryNodeName,
                behaviorName,
                behavior,
                action.RequestTimeout,
                cancellationToken),
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

        // Set just before the regular removal is attempted, so the finally block only removes the
        // behavior when an exception prevented that attempt from being reached.
        bool triedToRemovedBehavior = false;

        try
        {
            var stableReplicasToRemove = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

            // Count/Length are read directly instead of going through LINQ Count().
            ActionTraceSource.WriteInfo(TraceType, "Total number of replicas found {0}:{1}", replicasResult.Count, stableReplicasToRemove.Length);

            // Number of non-primary replicas to remove in addition to the primary: all of them for
            // full data loss, otherwise (n + 1) / 2 - 1 of the n ready replicas.
            int replicasToRestartWithoutPrimary =
                action.DataLossMode == DataLossMode.FullDataLoss
                    ? stableReplicasToRemove.Length - 1
                    : (stableReplicasToRemove.Length + 1) / 2 - 1;

            foreach (var replica in stableReplicasToRemove)
            {
                var currentReplica = replica;
                StatefulServiceReplica statefulReplica = currentReplica as StatefulServiceReplica;
                ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");

                ActionTraceSource.WriteInfo(
                    TraceType,
                    "Inspecting replica {0}:{1} with role {2} and status {3} to induce data loss",
                    currentReplica.Id,
                    partitionId,
                    statefulReplica.ReplicaRole,
                    statefulReplica.ReplicaStatus);

                if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                {
                    replicasToRestartWithoutPrimary--;
                }

                // The primary is always removed; non-primaries only while the quota is not exhausted.
                if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                {
                    ActionTraceSource.WriteInfo(TraceType, "Removing replica {0}:{1} to induce data loss", currentReplica.Id, partitionId);

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                            currentReplica.NodeName,
                            partitionId,
                            currentReplica.Id,
                            CompletionMode.DoNotVerify,
                            false, /*force remove*/
                            action.RequestTimeout.TotalSeconds,
                            cancellationToken),
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);
                }
            }

            triedToRemovedBehavior = true;
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    fmPrimaryNodeName,
                    behaviorName,
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                helper.GetRemainingTime(),
                cancellationToken).ConfigureAwait(false);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
            await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

            // retry check for whether data loss number has increased 5 times else do the entire process again
            const int maxRetryCount = 5;
            int retryCount = 0;
            do
            {
                partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                        action.PartitionSelector.ServiceName,
                        null,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                partitionFound = false;
                foreach (StatefulServicePartition partition in partitionsResult)
                {
                    if (partition.PartitionInformation.Id == partitionId)
                    {
                        postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                        partitionFound = true;
                        break;
                    }
                }

                if (!partitionFound)
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                }

                ActionTraceSource.WriteInfo(
                    TraceType,
                    "Checking data loss numbers for partition {0} with retryCount {1}. Current numbers {2}:{3}",
                    partitionId,
                    retryCount,
                    preDataLossNumber,
                    postDataLossNumber);

                if (postDataLossNumber != preDataLossNumber)
                {
                    break;
                }

                // ConfigureAwait(false) added for consistency with the other awaits in this handler.
                await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                ++retryCount;
            }
            while (retryCount < maxRetryCount);
        }
        finally
        {
            if (!triedToRemovedBehavior)
            {
                ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");

                // NOTE(review): this cleanup blocks the calling thread and reuses the caller's
                // cancellation token and remaining time. If the failure was itself a cancellation or
                // timeout, the removal presumably fails immediately and the behavior stays on the
                // node - confirm whether the cleanup should use its own token and timeout.
                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                        fmPrimaryNodeName,
                        behaviorName,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).GetAwaiter().GetResult();

                // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
            }
        }
    }
    while (postDataLossNumber == preDataLossNumber);

    ActionTraceSource.WriteInfo(
        TraceType,
        "InvokeDataLossAction action completed postDataLossNumber:{0}, preDataLossNumber:{1}",
        postDataLossNumber,
        preDataLossNumber);

    action.Result = new InvokeDataLossResult(getPartitionStateAction.Result);
    this.ResultTraceString = StringHelper.Format("InvokeDataLossAction succeeded for {0} with DatalossMode = {1}", partitionId, action.DataLossMode);
}
/// <summary>
/// Removes one replica (or instance) of a partition. The target is read from the node name,
/// partition id and replica id on the action when all three are supplied; otherwise it is
/// resolved by running a <c>GetSelectedReplicaStateAction</c> with the action's replica selector.
/// When the completion mode is <c>CompletionMode.Verify</c>, the replica list is polled until the
/// replica is no longer returned or is reported as dropped.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Describes the replica to remove and receives the result.</param>
/// <param name="cancellationToken">Token passed to every query, fault call and wait.</param>
/// <exception cref="TimeoutException">
/// Verification was requested and the replica was still present when the action timeout ran out.
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RemoveReplicaAction action, CancellationToken cancellationToken)
{
    TimeoutHelper timeoutHelper = new TimeoutHelper(action.ActionTimeout);

    string targetNode = action.NodeName;
    Guid? targetPartitionId = action.PartitionId;
    long? targetReplicaId = action.ReplicaId;
    SelectedReplica selectedReplica = SelectedReplica.None;

    bool targetFullySpecified =
        !string.IsNullOrEmpty(targetNode) && targetPartitionId.HasValue && targetReplicaId.HasValue;

    if (!targetFullySpecified)
    {
        // At least one coordinate is missing, so the selector decides which replica is removed.
        ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

        var replicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
        {
            RequestTimeout = action.RequestTimeout,
            ActionTimeout = timeoutHelper.GetRemainingTime()
        };

        await testContext.ActionExecutor.RunAsync(replicaStateAction, cancellationToken).ConfigureAwait(false);

        var selection = replicaStateAction.Result;
        ReleaseAssert.AssertIf(selection == null, "replicaStateActionResult cannot be null");

        selectedReplica = selection.Item1;
        targetPartitionId = selection.Item1.SelectedPartition.PartitionId;

        Replica resolvedReplica = selection.Item2;
        ReleaseAssert.AssertIf(resolvedReplica == null, "replicaStateResult cannot be null");

        targetNode = resolvedReplica.NodeName;
        targetReplicaId = resolvedReplica.Id;
    }

    ThrowIf.IsFalse(targetPartitionId.HasValue, "PartitionID");
    ThrowIf.IsFalse(targetReplicaId.HasValue, "ReplicaID");

    bool forceRemove = action.ForceRemove;

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.RemoveReplicaAsync(
            targetNode,
            targetPartitionId.Value,
            targetReplicaId.Value,
            forceRemove,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.RemoveReplicaErrors.Value,
        timeoutHelper.GetRemainingTime(),
        cancellationToken);

    if (action.CompletionMode == CompletionMode.Verify)
    {
        // Poll with the replica id as a filter until the replica disappears or shows up as Dropped.
        bool removalConfirmed = false;

        while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero)
        {
            var matchingReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                    targetPartitionId.Value,
                    targetReplicaId.Value,
                    action.RequestTimeout,
                    cancellationToken),
                timeoutHelper.GetRemainingTime(),
                cancellationToken).ConfigureAwait(false);

            if (matchingReplicas.Count != 0)
            {
                // The query was filtered by replica id, so anything but exactly that replica is unexpected.
                ReleaseAssert.AssertIf(
                    matchingReplicas.Count > 1,
                    "More than 1 replica returned with replica filter {0}:{1}",
                    targetPartitionId.Value,
                    targetReplicaId.Value);
                ReleaseAssert.AssertIf(
                    matchingReplicas[0].Id != targetReplicaId,
                    "Incorrect replica Id {0} returned by query instead of {1}",
                    matchingReplicas[0].Id,
                    targetReplicaId);
            }

            if (matchingReplicas.Count == 0 || matchingReplicas[0].ReplicaStatus == ServiceReplicaStatus.Dropped)
            {
                removalConfirmed = true;
                break;
            }

            ActionTraceSource.WriteInfo(
                TraceSource,
                "Replica = {0}:{1} not yet completely removed. Retrying...",
                targetPartitionId.Value,
                targetReplicaId.Value);

            await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken);
        }

        if (!removalConfirmed)
        {
            throw new TimeoutException(
                StringHelper.Format(
                    StringResources.Error_TestabilityActionTimeout,
                    "RemoveReplica",
                    StringHelper.Format("{0}:{1}", targetPartitionId.Value, targetReplicaId.Value)));
        }
    }

    action.Result = new RemoveReplicaResult(selectedReplica);
    ResultTraceString = StringHelper.Format(
        "RemoveReplicaOrInstance succeeded by removing replica {0}:{1} on node {2} with CompletionMode {3}",
        targetPartitionId.Value,
        targetReplicaId.Value,
        targetNode,
        action.CompletionMode);
}
/// <summary>
/// Moves a secondary replica of the selected partition from one node to another.
/// Unless <c>action.IgnoreConstraints</c> is set, the current replica layout is queried first so
/// that the requested nodes can be validated and, when no current secondary node was supplied,
/// one can be picked at random among the nodes hosting an active secondary.
/// </summary>
/// <param name="testContext">Supplies the fabric client, the action executor and the random source.</param>
/// <param name="action">Describes the partition and the node names; receives the result.</param>
/// <param name="cancellationToken">Token passed to every query and to the move call.</param>
/// <exception cref="InvalidOperationException">
/// A replica of the partition is not a stateful replica, or the partition has no active secondary.
/// </exception>
/// <exception cref="FabricException">
/// The requested new node already hosts the primary or an active secondary, or the requested
/// current node does not host an active secondary.
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, MoveSecondaryAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    string newSecondaryNode = action.NewSecondaryNodeName;
    string currentSecondaryNode = action.CurrentSecondaryNodeName;

    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = this.helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    if (!action.IgnoreConstraints)
    {
        // Inspect the current replica set to validate the requested source and destination nodes.
        ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        List<string> currentSecReplicaNodes = new List<string>();
        foreach (var replica in replicasResult)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            if (statefulReplica == null)
            {
                throw new InvalidOperationException(
                    StringHelper.Format(
                        StringResources.Error_InvalidServiceTypeTestability,
                        "MoveSecondary",
                        "Stateful",
                        action.PartitionSelector.ServiceName,
                        "Stateless"));
            }

            // True when the caller asked to move the secondary onto the node hosting this replica.
            bool isRequestedTarget =
                !string.IsNullOrEmpty(newSecondaryNode) && newSecondaryNode == statefulReplica.NodeName;

            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                if (isRequestedTarget)
                {
                    throw new FabricException(
                        StringHelper.Format(
                            StringResources.Error_InvalidNodeNameProvided,
                            newSecondaryNode,
                            "MoveSecondary",
                            "Primary exists on node"),
                        FabricErrorCode.AlreadyPrimaryReplica);
                }
            }
            else if (statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary)
            {
                currentSecReplicaNodes.Add(statefulReplica.NodeName);
                if (isRequestedTarget)
                {
                    throw new FabricException(
                        StringHelper.Format(
                            StringResources.Error_InvalidNodeNameProvided,
                            newSecondaryNode,
                            "MoveSecondary",
                            "Secondary exists on node"),
                        FabricErrorCode.AlreadySecondaryReplica);
                }
            }
        }

        if (currentSecReplicaNodes.Count == 0)
        {
            throw new InvalidOperationException(StringResources.Error_NoSecondariesInReplicaSet);
        }

        if (string.IsNullOrEmpty(currentSecondaryNode))
        {
            // No source node requested: pick one of the nodes hosting an active secondary.
            currentSecondaryNode = currentSecReplicaNodes[testContext.Random.Next(currentSecReplicaNodes.Count)];
        }

        if (!currentSecReplicaNodes.Contains(currentSecondaryNode))
        {
            // Report the node that actually failed validation (the current/source node),
            // not the destination node.
            throw new FabricException(
                StringHelper.Format(
                    StringResources.Error_InvalidNodeNameProvided,
                    currentSecondaryNode,
                    "MoveSecondary",
                    "Current node does not have a secondary replica"),
                FabricErrorCode.InvalidReplicaStateForReplicaOperation);
        }
    }

    ReleaseAssert.AssertIf(string.IsNullOrEmpty(currentSecondaryNode), "Current node name cannot be null or empty.");
    ReleaseAssert.AssertIf(newSecondaryNode == currentSecondaryNode, "Current and New node names are same.");

    ActionTraceSource.WriteInfo(
        TraceSource,
        "Calling move secondary with current node {0}, new node {1}, partition {2}",
        currentSecondaryNode,
        string.IsNullOrEmpty(newSecondaryNode) ? "Random" : newSecondaryNode,
        partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.MoveSecondaryUsingNodeNameAsync(
            currentSecondaryNode,
            newSecondaryNode,
            getPartitionStateAction.Result.ServiceName,
            partitionId,
            action.IgnoreConstraints,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.MoveSecondaryFabricErrors.Value,
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    action.Result = new MoveSecondaryResult(currentSecondaryNode, newSecondaryNode, getPartitionStateAction.Result);

    this.ResultTraceString = StringHelper.Format(
        "MoveSecondaryAction succeeded for moving Secondary for {0} from {1} to {2}.",
        partitionId,
        currentSecondaryNode,
        newSecondaryNode);
}
/// <summary>
/// Polls the aggregated health of the service until it is acceptable or
/// <paramref name="maximumStabilizationTimeout"/> elapses.
/// A health state of Invalid or Unknown is always treated as unhealthy; Error and Warning are
/// treated as unhealthy only when the corresponding check flag is set on this instance.
/// </summary>
/// <param name="maximumStabilizationTimeout">Upper bound on the total time spent polling.</param>
/// <param name="retryWait">Pause between two consecutive health queries.</param>
/// <param name="ct">Token passed to the health queries and to the pause between retries.</param>
/// <returns>
/// <c>ValidationReport.Default</c> when the service became healthy in time; otherwise a report
/// built from <c>Error_ServiceNotHealthy</c> that includes the last collected health details.
/// </returns>
public async Task<ValidationReport> ValidateHealthWithReportAsync(TimeSpan maximumStabilizationTimeout, TimeSpan retryWait, CancellationToken ct)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Validating that '{0}' is healthy with timeout '{1}'.", this.serviceName, maximumStabilizationTimeout);

    TimeoutHelper timer = new TimeoutHelper(maximumStabilizationTimeout);
    bool success = false;
    string healthinfo = string.Empty;
    int retryCount = 1;

    while (!success && timer.GetRemainingTime() > TimeSpan.Zero)
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceSource, "ValidateHealthWithReportAsync(): retryCount='{0}', timer.GetRemainingTime()='{1}'", retryCount, timer.GetRemainingTime());

        healthinfo = string.Empty;

        if (this.TestContext == null)
        {
            Console.WriteLine("testcontext is null");
        }

        ReleaseAssert.AssertIfNull(this.TestContext, "test context");
        ReleaseAssert.AssertIfNull(this.serviceName, "serviceName");
        ReleaseAssert.AssertIfNull(FabricClientRetryErrors.GetEntityHealthFabricErrors.Value, "health error code");

        ApplicationHealthPolicy healthPolicy = new ApplicationHealthPolicy();
        var serviceHealthResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.TestContext.FabricClient.HealthManager.GetServiceHealthAsync(
                this.serviceName,
                healthPolicy,
                this.requestTimeout,
                ct),
            FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
            timer.GetRemainingTime(),
            ct).ConfigureAwait(false);

        bool checkError = (this.checkFlags & ValidationCheckFlag.CheckError) != 0;
        bool checkWarning = (this.checkFlags & ValidationCheckFlag.CheckWarning) != 0;

        HealthState aggregatedState = serviceHealthResult.AggregatedHealthState;
        bool unhealthy =
            (checkError && aggregatedState == HealthState.Error) ||
            (checkWarning && aggregatedState == HealthState.Warning) ||
            aggregatedState == HealthState.Invalid ||
            aggregatedState == HealthState.Unknown;

        if (unhealthy)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0} is health state is {1}. Will Retry check", this.serviceName, aggregatedState);
            healthinfo = await this.GetUnhealthyItemsAsync(serviceHealthResult, timer, ct).ConfigureAwait(false);

            // healthinfo is free-form text; pass it as an argument so that any braces it
            // contains are never interpreted as format placeholders.
            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", healthinfo);
        }
        else
        {
            success = true;
        }

        if (!success)
        {
            if (retryCount % 10 == 0)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceSource, "Service {0} health validation failed due to issues below, will retry: \n{1}", this.serviceName, healthinfo);
            }

            // Delay before querying again so we allow some time for state to change - don't spam the node.
            // The token is passed so that a cancellation request interrupts the pause.
            await AsyncWaiter.WaitAsync(retryWait, ct).ConfigureAwait(false);
        }

        retryCount++;
    }

    if (!success)
    {
        return new ValidationReport(
            true,
            StringHelper.Format(StringResources.Error_ServiceNotHealthy, serviceName, maximumStabilizationTimeout, healthinfo));
    }

    return ValidationReport.Default;
}
/// <summary>
/// Restarts (persisted services) or removes (non-persisted services and instances) the ready
/// replicas of the selected partition. While the replicas are being restarted, an unreliable
/// transport behavior named "BlockDoReconfiguration" is installed on the node hosting the
/// Failover Manager primary; it is removed again afterwards, also when the restart fails.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Describes the partition and the restart mode; receives the result.</param>
/// <param name="cancellationToken">Token passed to every query, fault call and asynchronous wait.</param>
/// <exception cref="InvalidOperationException">
/// <c>RestartPartitionMode.OnlyActiveSecondaries</c> was requested for a service that is not stateful.
/// </exception>
/// <exception cref="FabricException">No ready Failover Manager primary replica was found.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartPartitionAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "partitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    // get service info so we can validate if the operation is valid
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful && action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
    {
        throw new InvalidOperationException(
            StringHelper.Format(
                StringResources.Error_InvalidServiceTypeTestability,
                "RestartPartitionMode.OnlyActiveSecondaries",
                "Stateful",
                action.PartitionSelector.ServiceName,
                "Stateless"));
    }

    bool hasPersistedState = false;
    if (result.Kind == ServiceDescriptionKind.Stateful)
    {
        StatefulServiceDescription statefulDescription = result as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    // now actually select a partition
    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = this.helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // get replicas for target
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // get replicas for fm in order to get the primary
    ServiceReplicaList fmReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            Constants.FmPartitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    string fmPrimaryNodeName = string.Empty;
    var readyFMReplicas = fmReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
    foreach (var replica in readyFMReplicas)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
        if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
        {
            fmPrimaryNodeName = replica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(fmPrimaryNodeName))
    {
        throw new FabricException(
            StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"),
            FabricErrorCode.NotReady);
    }

    ////------------------------------------------------------
    // target ut at the fm primary only
    UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);
    string behaviorName = "BlockDoReconfiguration";

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            fmPrimaryNodeName,
            behaviorName,
            behavior,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    // Set just before the normal-path removal is attempted, so the finally block only
    // performs the removal when the normal path never got that far.
    bool triedToRemovedBehavior = false;

    // inspect the actual replicas to restart, only operate on stable ones
    try
    {
        var stableReplicasToRestart = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

        foreach (var replica in stableReplicasToRestart)
        {
            var currentReplica = replica;
            if (action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
            {
                StatefulServiceReplica statefulReplica = currentReplica as StatefulServiceReplica;
                ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");
                if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                {
                    continue;
                }
            }

            if (hasPersistedState)
            {
                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.FaultManager.RestartReplicaAsync(
                        currentReplica.NodeName,
                        partitionId,
                        currentReplica.Id,
                        CompletionMode.DoNotVerify,
                        action.RequestTimeout.TotalSeconds,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);
            }
            else
            {
                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                        currentReplica.NodeName,
                        partitionId,
                        currentReplica.Id,
                        CompletionMode.DoNotVerify,
                        false, /*force remove*/
                        action.RequestTimeout.TotalSeconds,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);
            }
        }

        triedToRemovedBehavior = true;
        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                fmPrimaryNodeName,
                behaviorName,
                action.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        // The token is honored here exactly like in the wait that follows the add above.
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // TODO: Provide a way to clear all behaviors just in case.
        if (!triedToRemovedBehavior)
        {
            ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");

            // NOTE(review): this cleanup blocks the calling thread and reuses the caller's token;
            // if the failure was a cancellation the removal request may be cancelled as well - confirm
            // whether a non-cancellable cleanup is wanted here.
            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    fmPrimaryNodeName,
                    behaviorName,
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                this.helper.GetRemainingTime(),
                cancellationToken).GetAwaiter().GetResult();

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
            Task.Delay(TimeSpan.FromSeconds(5.0)).GetAwaiter().GetResult();
        }
    }

    // -- note there's no explicit validation

    // action result
    action.Result = new RestartPartitionResult(getPartitionStateAction.Result);
    ResultTraceString = StringHelper.Format("RestartPartitionAction succeeded for {0} with RestartPartitionMode = {1}", partitionId, action.RestartPartitionMode);
}
/// <summary>
/// Moves the primary replica of the selected partition to the node named on the action.
/// Unless <c>action.IgnoreConstraints</c> is set, the node list and the replica list are queried
/// first to locate the current primary and to reject a move onto the node that already hosts it.
/// </summary>
/// <param name="testContext">Supplies the fabric client, the cluster view and the action executor.</param>
/// <param name="action">Describes the partition and the destination node; receives the result.</param>
/// <param name="cancellationToken">Token passed to every query and to the move call.</param>
/// <exception cref="InvalidOperationException">
/// No node information is available, or a replica of the partition is not a stateful replica.
/// </exception>
/// <exception cref="FabricException">
/// The destination node already hosts the primary, or the current primary's node could not be determined.
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, MovePrimaryAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    this.helper = new TimeoutHelper(action.ActionTimeout);

    string targetNodeName = action.NodeName;

    var partitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = this.helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(partitionStateAction, cancellationToken);
    Guid partitionId = partitionStateAction.Result.PartitionId;

    if (!action.IgnoreConstraints)
    {
        // Constraint checks need the cluster's nodes and the partition's current replicas.
        var clusterNodes = await testContext.FabricCluster.GetLatestNodeInfoAsync(
            action.RequestTimeout,
            this.helper.GetRemainingTime(),
            cancellationToken);

        if (clusterNodes == null || clusterNodes.Count() == 0)
        {
            throw new InvalidOperationException(
                StringHelper.Format(StringResources.Error_NotEnoughNodesForTestabilityAction, "MovePrimary"));
        }

        ServiceReplicaList partitionReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                partitionId,
                0,
                action.RequestTimeout,
                cancellationToken),
            this.helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        NodeInfo primaryNode = null;
        foreach (var candidate in partitionReplicas)
        {
            StatefulServiceReplica statefulCandidate = candidate as StatefulServiceReplica;
            if (statefulCandidate == null)
            {
                throw new InvalidOperationException(
                    StringHelper.Format(
                        StringResources.Error_InvalidServiceTypeTestability,
                        "MovePrimary",
                        "Stateful",
                        action.PartitionSelector.ServiceName,
                        "Stateless"));
            }

            if (statefulCandidate.ReplicaRole != ReplicaRole.Primary)
            {
                continue;
            }

            // Found the primary: map it to its node and stop scanning.
            primaryNode = clusterNodes.FirstOrDefault(n => n.NodeName == statefulCandidate.NodeName);

            bool targetIsCurrentPrimary =
                !string.IsNullOrEmpty(targetNodeName) && targetNodeName == statefulCandidate.NodeName;
            if (targetIsCurrentPrimary)
            {
                throw new FabricException(
                    StringHelper.Format(
                        StringResources.Error_InvalidNodeNameProvided,
                        targetNodeName,
                        "MovePrimary",
                        "Primary already exists on node"),
                    FabricErrorCode.AlreadyPrimaryReplica);
            }

            break;
        }

        if (primaryNode == null)
        {
            throw new FabricException(
                StringHelper.Format(
                    StringResources.Error_PartitionPrimaryNotReady,
                    action.PartitionSelector + ":" + partitionId),
                FabricErrorCode.NotReady);
        }

        if (targetNodeName == primaryNode.NodeName)
        {
            throw new FabricException(
                StringHelper.Format(
                    StringResources.Error_InvalidNodeNameProvided,
                    targetNodeName,
                    "MovePrimary",
                    "Primary already exists on node"),
                FabricErrorCode.AlreadyPrimaryReplica);
        }
    }

    string tracedTarget = string.IsNullOrEmpty(targetNodeName) ? "Random" : targetNodeName;
    ActionTraceSource.WriteInfo(TraceSource, "Calling move primary with node {0}, partition {1}", tracedTarget, partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.MovePrimaryUsingNodeNameAsync(
            targetNodeName,
            partitionStateAction.Result.ServiceName,
            partitionId,
            action.IgnoreConstraints,
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.MovePrimaryFabricErrors.Value,
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    action.Result = new MovePrimaryResult(targetNodeName, partitionStateAction.Result);
    ResultTraceString = StringHelper.Format("MovePrimaryAction succeeded for moving Primary for {0} to node {1}.", partitionId, targetNodeName);
}
/// <summary>
/// Puts the selected partition of a stateful, persisted service into quorum loss for
/// <c>action.QuorumLossDuration</c>. The chosen ready replicas are restarted while an unreliable
/// transport behavior blocking "StatefulServiceReopen" is installed on their nodes, so that the
/// restarted replicas do not come back up; the behaviors are removed again afterwards.
/// </summary>
/// <param name="testContext">Supplies the fabric client and the action executor.</param>
/// <param name="action">Describes the partition, the quorum loss mode and duration; receives the result.</param>
/// <param name="cancellationToken">Token passed to every query, fault call and wait.</param>
/// <exception cref="InvalidOperationException">
/// The service is not stateful, or it is stateful but has no persisted state.
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    var helper = new TimeoutHelper(action.ActionTimeout);

    // get info about the service so we can check type and trss
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        throw new InvalidOperationException(
            StringHelper.Format(
                StringResources.Error_InvalidServiceTypeTestability,
                "QuorumLoss",
                "Stateful",
                action.PartitionSelector.ServiceName,
                "Stateless"));
    }

    StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;
    ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

    if (!statefulServiceDescription.HasPersistedState)
    {
        throw new InvalidOperationException(
            StringHelper.Format(
                StringResources.Error_InvalidServiceTypeTestability,
                "QuorumLoss",
                "Stateful Persistent",
                action.PartitionSelector.ServiceName,
                "Stateful In-Memory Only"));
    }

    // figure out /which/ partition to select
    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // get data about replicas in that partition
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // Every (nodeName, behaviorName) pair for which a blocking behavior is requested.
    var removeUTRequestList = new List<Tuple<string, string>>();

    // Removal tasks started on the normal path, keyed by the request they serve.
    var removeUTTaskDictionary = new Dictionary<Tuple<string, string>, Task>();

    try
    {
        var stableReplicas = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
        var stableReplicasToRemove = new List<StatefulServiceReplica>();

        long replicasToRestartWithoutPrimary =
            action.QuorumLossMode == QuorumLossMode.AllReplicas
                ? stableReplicas.Length - 1
                : FabricCluster.GetWriteQuorumSize(replicasResult.Count);

        foreach (var replica in stableReplicas)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");

            if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
            {
                replicasToRestartWithoutPrimary--;
            }

            if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                stableReplicasToRemove.Add(statefulReplica);
            }
        }

        // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
        var utTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            string nodeName = statefulReplica.NodeName;
            UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
            behavior.AddFilterForPartitionId(partitionId);
            string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

            // Recorded before the add is issued so that cleanup covers it even if a later step fails.
            removeUTRequestList.Add(new Tuple<string, string>(nodeName, behaviorName));
            utTaskList.Add(
                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        nodeName,
                        behaviorName,
                        behavior,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken));
        }

        await Task.WhenAll(utTaskList).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

        var restartReplicaTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(
                PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId),
                statefulReplica.Id);

            var restartReplicaAction = new RestartReplicaAction(replicaSelector)
            {
                CompletionMode = CompletionMode.DoNotVerify,
                RequestTimeout = action.RequestTimeout,
                ActionTimeout = helper.GetRemainingTime()
            };

            restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
        }

        await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

        await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

        // validate
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                action.PartitionSelector.ServiceName,
                null,
                action.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                break;
            }
        }

        foreach (var removeUTParams in removeUTRequestList)
        {
            var currentParams = removeUTParams;
            Task task = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    currentParams.Item1,  /*nodeName*/
                    currentParams.Item2,  /*behaviorName*/
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                helper.GetRemainingTime(),
                cancellationToken);

            removeUTTaskDictionary[currentParams] = task;
        }

        await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // Walk the REQUEST list, not the task dictionary: the dictionary is only filled once the
        // normal path reaches the removal stage, so a failure before that point would otherwise
        // leave every blocking behavior installed.
        var removeUTTaskList = new List<Task>();

        foreach (var removeUTRequest in removeUTRequestList)
        {
            var currentRemoveUTRequest = removeUTRequest;

            Task earlierAttempt;
            bool removalAttempted = removeUTTaskDictionary.TryGetValue(currentRemoveUTRequest, out earlierAttempt);

            // Re-issue the removal when it was never started or when the earlier attempt faulted.
            if (!removalAttempted || earlierAttempt == null || earlierAttempt.IsFaulted)
            {
                removeUTTaskList.Add(
                    FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                            currentRemoveUTRequest.Item1, /*nodeName*/
                            currentRemoveUTRequest.Item2, /*behaviorName*/
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken));
            }
        }

        // Only block when a cleanup removal was actually issued; on the normal path the removal
        // and its settle delay have already happened inside the try block.
        if (removeUTTaskList.Count > 0)
        {
            Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
            Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
        }
    }

    action.Result = new InvokeQuorumLossResult(getPartitionStateAction.Result);
    this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
}
/// <summary>
/// Repeatedly queries the partitions and replicas (or instances) of this service until one pass
/// satisfies every enabled validation check or <paramref name="maximumStabilizationTimeout"/> runs out.
/// </summary>
/// <param name="maximumStabilizationTimeout">Total time budget for the polling loop.</param>
/// <param name="retryWait">Delay inserted after each unsuccessful pass before querying again.</param>
/// <param name="ct">Cancellation token passed to every query and wait.</param>
/// <returns>
/// <see cref="ValidationReport.Default"/> when a pass succeeded; otherwise a report carrying the
/// Error_ServiceNotStable message together with the issues recorded during the last pass.
/// </returns>
/// <exception cref="FabricValidationException">
/// Partitions were in quorum loss on the last pass and the CheckQuorumLoss flag is set.
/// </exception>
public async Task<ValidationReport> EnsureStabilityWithReportAsync(TimeSpan maximumStabilizationTimeout, TimeSpan retryWait, CancellationToken ct)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Ensuring that '{0}' is online with timeout '{1}'.", this.serviceName, maximumStabilizationTimeout);

    bool checkQuorumLoss = (this.checkFlags & ValidationCheckFlag.CheckQuorumLoss) != 0;

    // Load basic information about this service.
    TestabilityTrace.TraceSource.WriteNoise(TraceSource, "Querying basic information for {0}.", this.serviceName);
    await this.LoadPartitionAndReplicaCountAsync(ct).ConfigureAwait(false);

    // Captured once and handed to CheckReplicaSetSize on every pass.
    DateTime startTime = DateTime.Now;
    TimeoutHelper timer = new TimeoutHelper(maximumStabilizationTimeout);
    bool success = false;

    List<Guid> partitionsInQuorumLoss = new List<Guid>();
    StringBuilder errorString = new StringBuilder();
    int retryCount = 1;

    while (!success && timer.GetRemainingTime() > TimeSpan.Zero)
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceSource, "EnsureStabilityWithReportAsync(): retryCount='{0}', timer.GetRemainingTime()='{1}'", retryCount, timer.GetRemainingTime());

        var nodes = await this.TestContext.FabricCluster.GetLatestNodeInfoAsync(this.requestTimeout, this.operationTimeout, ct).ConfigureAwait(false);

        // Empty error string and list of partitions in quorum loss
        partitionsInQuorumLoss.Clear();
        errorString.Clear();

        // Each pass starts optimistic; any failed check below flips this back to false.
        success = true;
        int totalPartitionsFound = 0;

        bool stateful;
        ReleaseAssert.AssertIfNot(this.isStateful.TryGetValue(out stateful), "isStateful flag is not available");

        bool checkTarget = (this.checkFlags & ValidationCheckFlag.CheckTargetReplicaSetSize) != 0;
        bool checkInBuild = (this.checkFlags & ValidationCheckFlag.CheckInBuildReplica) != 0;

        if (stateful)
        {
            var partitionDictionary = await this.QueryPartitionAndReplicaResultAsyncStateful(ct).ConfigureAwait(false);
            totalPartitionsFound = partitionDictionary.Count();

            foreach (KeyValuePair<Partition, StatefulServiceReplica[]> partition in partitionDictionary)
            {
                bool partitionIsReady = partition.Key.PartitionStatus == ServicePartitionStatus.Ready;
                if (!partitionIsReady)
                {
                    var message = StringHelper.Format("Partition '{0}' is not Ready", partition.Key.PartitionId());
                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                    errorString.AppendLine(message);
                }

                if (partition.Key.PartitionStatus != ServicePartitionStatus.InQuorumLoss)
                {
                    int validCount = 0;
                    int inBuildReplicas = 0;
                    foreach (StatefulServiceReplica replica in partition.Value)
                    {
                        // Only Ready primaries and active secondaries count toward the replica set size.
                        if (replica.ReplicaStatus == ServiceReplicaStatus.Ready
                            && (replica.ReplicaRole == ReplicaRole.Primary || replica.ReplicaRole == ReplicaRole.ActiveSecondary))
                        {
                            ++validCount;
                        }

                        if (replica.ReplicaStatus == ServiceReplicaStatus.InBuild)
                        {
                            ++inBuildReplicas;
                            var message = StringHelper.Format("Replica {0} for partition '{1}' is InBuild", replica.Id, partition.Key.PartitionId());
                            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                            errorString.AppendLine(message);
                        }
                    }

                    bool targetAchieved = this.CheckReplicaSetSize(partition.Key.PartitionInformation.Id, validCount, startTime, nodes, errorString);
                    if (!partitionIsReady
                        || (checkInBuild && inBuildReplicas > 0)
                        || (checkTarget && !targetAchieved))
                    {
                        success = false;
                    }
                }
                else
                {
                    // Quorum-lost partitions are only collected here; whether they fail the pass
                    // is decided after all partitions are examined, based on checkQuorumLoss.
                    partitionsInQuorumLoss.Add(partition.Key.PartitionInformation.Id);
                }
            }
        }
        else
        {
            int targetInstanceCount = 0;
            ReleaseAssert.AssertIf(!this.targetReplicaSetSize.TryGetValue(out targetInstanceCount), "targetReplicaSetSize for service: {0} should have been populated at this point.", this.serviceName);

            bool placementConstraintsDefined = false;
            try
            {
                // Get the service description to find out if there are placement constraints on the service
                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.TestContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        this.serviceName,
                        this.requestTimeout,
                        ct),
                    this.operationTimeout,
                    ct).ConfigureAwait(false);

                ThrowIf.IsTrue(result == null, "A description must be associated with the service: {0}", this.serviceName);

                placementConstraintsDefined = !string.IsNullOrEmpty(result.PlacementConstraints);
            }
            catch (UnauthorizedAccessException)
            {
                // Fall back to the service group description when the plain service query is rejected.
                ServiceGroupDescription groupDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.TestContext.FabricClient.ServiceGroupManager.GetServiceGroupDescriptionAsync(
                        this.serviceName,
                        this.requestTimeout,
                        ct),
                    this.operationTimeout,
                    ct).ConfigureAwait(false);

                ThrowIf.IsTrue(groupDescription == null, "A description must be associated with the service group: {0}", this.serviceName);

                placementConstraintsDefined = !string.IsNullOrEmpty(groupDescription.ServiceDescription.PlacementConstraints);
            }

            // If a stateless service has instance count == -1 and it has placement constraints such
            // that the possible number of instances cannot match the total number of nodes,
            // we need to find out the number of eligible nodes for the service which is tracked by RDBug 8993319.
            // Until RDBug 8993319 is fixed, we take the presence of placement constraints into consideration to make the
            // validation more accurate.
            if (targetInstanceCount == -1 && placementConstraintsDefined)
            {
                checkTarget = false;
            }

            var partitionDictionary = await this.QueryPartitionAndReplicaResultAsyncStateless(timer.GetRemainingTime(), ct).ConfigureAwait(false);
            totalPartitionsFound = partitionDictionary.Count();

            foreach (KeyValuePair<Partition, StatelessServiceInstance[]> partition in partitionDictionary)
            {
                bool partitionIsReady = partition.Key.PartitionStatus == ServicePartitionStatus.Ready;
                if (!partitionIsReady)
                {
                    var message = StringHelper.Format("Partition '{0}' is not Ready", partition.Key.PartitionId());
                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                    errorString.AppendLine(message);
                }

                int validCount = 0;
                foreach (StatelessServiceInstance instance in partition.Value)
                {
                    if (instance.ReplicaStatus == ServiceReplicaStatus.Ready)
                    {
                        ++validCount;
                    }
                }

                bool targetAchieved = this.CheckReplicaSetSize(partition.Key.PartitionInformation.Id, validCount, startTime, nodes, errorString);
                if (!partitionIsReady || (checkTarget && !targetAchieved))
                {
                    success = false;
                }
            }
        }

        if (!this.ValidatePartitionCount(totalPartitionsFound))
        {
            success = false;
        }

        if (partitionsInQuorumLoss.Count > 0 && checkQuorumLoss)
        {
            string quorumLossPartitionIds = string.Join(",", partitionsInQuorumLoss.ToArray());
            var message = StringHelper.Format("Partitions '{0}' in quorum loss for service {1}", quorumLossPartitionIds, this.serviceName);
            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
            errorString.AppendLine(message);
            success = false;
        }

        if (!success)
        {
            // Only every tenth failed pass is escalated to a warning to keep the trace readable.
            if (retryCount % 10 == 0)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceSource, "Service {0} validation failed due to issues below, will retry: \n{1}", this.serviceName, errorString);
            }

            // Delay before querying again so we allow some time for state to change - don't spam the node
            await AsyncWaiter.WaitAsync(retryWait, ct).ConfigureAwait(false);
        }

        retryCount++;
    }

    // The list reflects the last pass only, because it is cleared at the start of every pass.
    if (partitionsInQuorumLoss.Count > 0)
    {
        string partitionIds = string.Join(",", partitionsInQuorumLoss.ToArray());
        TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Partitions in quorum loss for service {0} are '{1}'", this.serviceName, partitionIds);

        if (checkQuorumLoss)
        {
            throw new FabricValidationException(StringHelper.Format(StringResources.Error_PartitionsInQuorumLoss, partitionIds, this.serviceName));
        }
    }

    if (!success)
    {
        return new ValidationReport(
            true,
            StringHelper.Format(StringResources.Error_ServiceNotStable, this.serviceName, maximumStabilizationTimeout, errorString));
    }

    return ValidationReport.Default;
}
/// <summary>
/// Builds a text listing of every health event whose state is not Ok, for the service itself,
/// for each of its partitions whose aggregated state is not Ok, and for each replica of those
/// partitions whose aggregated state is not Ok.
/// </summary>
/// <param name="serviceHealth">Service health snapshot whose events and partition states are walked.</param>
/// <param name="timer">Supplies the remaining time used as the retry budget for each health query.</param>
/// <param name="ct">Cancellation token passed to every health query.</param>
/// <returns>One line per non-Ok health event; an empty string when there are none.</returns>
private async Task<string> GetUnhealthyItemsAsync(ServiceHealth serviceHealth, TimeoutHelper timer, CancellationToken ct)
{
    StringBuilder report = new StringBuilder();

    // Service-level events.
    foreach (var serviceEvent in serviceHealth.HealthEvents)
    {
        var info = serviceEvent.HealthInformation;
        if (info.HealthState != HealthState.Ok)
        {
            string line = StringHelper.Format(
                "Service {0} health state is '{1}' with property '{2}', sourceId '{3}' and description '{4}'",
                this.serviceName,
                info.HealthState,
                info.Property,
                info.SourceId,
                info.Description);
            report.AppendLine(line);
        }
    }

    // Partition-level events, then replica-level events for each unhealthy partition.
    foreach (var partitionState in serviceHealth.PartitionHealthStates)
    {
        if (partitionState.AggregatedHealthState != HealthState.Ok)
        {
            var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.TestContext.FabricClient.HealthManager.GetPartitionHealthAsync(
                    partitionState.PartitionId,
                    this.requestTimeout,
                    ct),
                FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
                timer.GetRemainingTime(),
                ct).ConfigureAwait(false);

            foreach (var partitionEvent in partitionHealth.HealthEvents)
            {
                var info = partitionEvent.HealthInformation;
                if (info.HealthState != HealthState.Ok)
                {
                    string line = StringHelper.Format(
                        "Service {0}:{1} health state is '{2}' with property '{3}', sourceId '{4}' and description '{5}'",
                        this.serviceName,
                        partitionHealth.PartitionId,
                        info.HealthState,
                        info.Property,
                        info.SourceId,
                        info.Description);
                    report.AppendLine(line);
                }
            }

            foreach (var replicaState in partitionHealth.ReplicaHealthStates)
            {
                if (replicaState.AggregatedHealthState != HealthState.Ok)
                {
                    var replicaHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.TestContext.FabricClient.HealthManager.GetReplicaHealthAsync(
                            replicaState.PartitionId,
                            replicaState.Id,
                            this.requestTimeout,
                            ct),
                        FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
                        timer.GetRemainingTime(),
                        ct).ConfigureAwait(false);

                    foreach (var replicaEvent in replicaHealth.HealthEvents)
                    {
                        var info = replicaEvent.HealthInformation;
                        if (info.HealthState != HealthState.Ok)
                        {
                            string line = StringHelper.Format(
                                "Service {0}:{1}:{2} health state is '{3}' with property '{4}', sourceId '{5}' and description '{6}'",
                                this.serviceName,
                                replicaHealth.PartitionId,
                                replicaHealth.Id,
                                info.HealthState,
                                info.Property,
                                info.SourceId,
                                info.Description);
                            report.AppendLine(line);
                        }
                    }
                }
            }
        }
    }

    return report.ToString();
}
/// <summary>
/// Restarts a node and, when the action's CompletionMode is Verify, polls until the node reports
/// a higher instance id and is up. The node is taken from <c>action.NodeName</c>; when that is
/// empty, the node hosting the replica chosen by <c>action.ReplicaSelector</c> is used instead.
/// </summary>
/// <param name="testContext">Provides the action executor and the fabric client used for the restart.</param>
/// <param name="action">Restart request; its Result is populated before this method returns.</param>
/// <param name="cancellationToken">Cancellation token passed to every nested action, query and wait.</param>
/// <exception cref="TimeoutException">
/// CompletionMode is Verify and the restart was not observed before the action timeout elapsed.
/// </exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartNodeAction action, CancellationToken cancellationToken)
{
    ActionTraceSource.WriteInfo(TraceSource, "Enter RestartNodeAction/ExecuteActionAsync: operationTimeout='{0}', requestTimeout='{1}'", action.ActionTimeout, action.RequestTimeout);

    // One timeout budget is shared by every step below.
    this.helper = new TimeoutHelper(action.ActionTimeout);
    SelectedReplica selectedReplica = SelectedReplica.None;
    string nodeName = action.NodeName;
    BigInteger nodeInstance = action.NodeInstance;
    bool createFabricDump = action.CreateFabricDump;

    if (string.IsNullOrEmpty(nodeName))
    {
        // No explicit node was given: resolve it from the selected replica.
        ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

        var getReplicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
        {
            RequestTimeout = action.RequestTimeout,
            ActionTimeout = this.helper.GetRemainingTime()
        };

        await testContext.ActionExecutor.RunAsync(getReplicaStateAction, cancellationToken).ConfigureAwait(false);
        var replicaStateActionResult = getReplicaStateAction.Result;
        ReleaseAssert.AssertIf(replicaStateActionResult == null, "replicaStateActionResult cannot be null");
        selectedReplica = replicaStateActionResult.Item1;
        Replica replicaStateResult = replicaStateActionResult.Item2;
        ReleaseAssert.AssertIf(replicaStateResult == null, "replicaStateResult cannot be null");

        nodeName = replicaStateResult.NodeName;

        // Force the current instance id to be looked up below.
        nodeInstance = BigInteger.MinusOne;
    }

    if (nodeInstance == BigInteger.MinusOne)
    {
        var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, nodeName, action, cancellationToken).ConfigureAwait(false);
        nodeInstance = nodeInfo.NodeInstanceId;
    }

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.FaultManager.RestartNodeUsingNodeNameAsync(
            nodeName,
            nodeInstance,
            createFabricDump,
            action.RequestTimeout,
            cancellationToken),
        this.helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (action.CompletionMode == CompletionMode.Verify)
    {
        bool success = false;
        while (this.helper.GetRemainingTime() > TimeSpan.Zero)
        {
            var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, nodeName, action, cancellationToken).ConfigureAwait(false);

            // The restart counts as done once the node shows a newer instance id and is up again.
            if (nodeInfo.NodeInstanceId > nodeInstance && nodeInfo.IsNodeUp)
            {
                success = true;
                break;
            }

            ActionTraceSource.WriteInfo(TraceSource, "NodeName = {0} not yet restarted. '{1}' seconds remain. Retrying...", nodeName, this.helper.GetRemainingTime().TotalSeconds);
            await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
        }

        if (!success)
        {
            throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "RestartNode", nodeName));
        }
    }

    // create result
    // nodeInstance is the instance id that was passed to the restart call, not the post-restart id.
    action.Result = new RestartNodeResult(selectedReplica, new NodeResult(nodeName, nodeInstance));
    this.ResultTraceString = StringHelper.Format("RestartNodeAction succeeded for {0}:{1} with CompletionMode = {2}", nodeName, nodeInstance, action.CompletionMode);
}