/// <summary>
/// Reads API fault information from the event store for the window
/// <c>action.StartTime</c> .. <c>action.EndTime</c> and stores it in <c>action.Result</c>.
/// </summary>
/// <param name="testContext">Test context whose <c>ExtensionProperties</c> must contain an <c>EventStoreConnection</c> under the connection parameter name.</param>
/// <param name="action">Action that supplies the time window and receives the result.</param>
/// <param name="cancellationToken">Cancellation token handed to the task that runs the query.</param>
/// <exception cref="InvalidOperationException">The extension properties hold no <c>EventStoreConnection</c> under the connection parameter name.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, ApiFaultInformationAction action, CancellationToken cancellationToken)
{
    // A missing key and a value of the wrong type are treated the same way: no usable connection.
    object value;
    var storeConnection = testContext.ExtensionProperties.TryGetValue(connectionPrameterName, out value)
        ? value as EventStoreConnection
        : null;

    if (storeConnection == null)
    {
        throw new InvalidOperationException(StringResources.EventStoreError_ConnectionRequired);
    }

    // Task.Run always schedules on the default (thread-pool) scheduler, whereas
    // Task.Factory.StartNew without an explicit scheduler uses TaskScheduler.Current,
    // which depends on whatever task happens to be calling us.
    action.Result = await Task.Run<ApiFaultInformation>(
        () => EventStoreHelper.GetApiFaultInformation(
            storeConnection,
            action.StartTime,
            action.EndTime),
        cancellationToken).ConfigureAwait(false);

    ResultTraceString = StringHelper.Format("ApiFaultInformationAction succeeded.");
}
/// <summary>
/// Queries the cluster upgrade progress (with retries) and sets <c>action.Result</c> to
/// <c>true</c> unless the upgrade state is <c>RollingBackCompleted</c> or <c>RollingForwardCompleted</c>.
/// </summary>
/// <param name="testContext">Test context providing the fabric client used for the query.</param>
/// <param name="action">Action supplying the request/action timeouts and receiving the result.</param>
/// <param name="cancellationToken">Token forwarded to the query and to the retry helper.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, IsClusterUpgradingAction action, CancellationToken cancellationToken)
{
    var currentProgress = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ClusterManager.GetFabricUpgradeProgressAsync(
            action.RequestTimeout,
            cancellationToken),
        FabricClientRetryErrors.UpgradeFabricErrors.Value,
        action.ActionTimeout,
        cancellationToken).ConfigureAwait(false);

    ReleaseAssert.AssertIfNull(currentProgress, "currentProgress");

    // The cluster counts as "upgrading" in every state except the two terminal ones.
    var upgradeState = currentProgress.UpgradeState;
    bool upgradeFinished =
        upgradeState == FabricUpgradeState.RollingBackCompleted
        || upgradeState == FabricUpgradeState.RollingForwardCompleted;

    action.Result = !upgradeFinished;
    ResultTraceString = "IsClusterUpgradingAction succeeded";
}
/// <summary>
/// Fetches the infrastructure service status and sets <c>action.Result</c> to <c>true</c> when any
/// job has a detailed status contained in <c>ChaosConstants.UpgradingInfrastructureJobStatus</c>;
/// otherwise sets it to <c>false</c>.
/// </summary>
/// <param name="testContext">Test context providing the fabric client used for the status query.</param>
/// <param name="action">Action supplying the timeouts and receiving the result.</param>
/// <param name="cancellationToken">Token forwarded to the status query.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, IsInfrastructureUpgradingAction action, CancellationToken cancellationToken)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Checking if InfrastructureUpgrade is going on.");

    var status = await this.GetInfrastructureServiceStatus(
        testContext.FabricClient,
        action.ActionTimeout,
        action.RequestTimeout,
        cancellationToken).ConfigureAwait(false);

    bool upgradeInProgress = false;

    if (status.Any())
    {
        foreach (var entry in status)
        {
            // Entries without a job collection, or without a job array inside it, are skipped.
            var jobs = entry.Value.Jobs?.Jobs;
            if (jobs == null)
            {
                continue;
            }

            if (jobs.Any(job => ChaosConstants.UpgradingInfrastructureJobStatus.Contains(job.DetailedStatus)))
            {
                // One active job is enough; no need to inspect the remaining entries.
                upgradeInProgress = true;
                break;
            }
        }
    }

    action.Result = upgradeInProgress;
}
/// <summary>
/// Captures one cluster state snapshot: nodes, applications (plus the system application) and,
/// for every node that is up, the deployed applications with their replicas/code packages.
/// </summary>
/// <param name="testContext">Test context used for all cluster queries; stored in <c>this.testContext</c>.</param>
/// <param name="action">Action supplying <c>RequestTimeout</c>, <c>ActionTimeout</c> and <c>ShouldFaultSystem</c>.</param>
/// <param name="cancellationToken">Token forwarded to every query.</param>
/// <returns>
/// The populated snapshot, or <c>null</c> when
/// <c>TryAssociateDeployedReplicaWithDeployedCodepackageAsync</c> reports failure for some application.
/// </returns>
/// <exception cref="ChaosInconsistentClusterSnapshotException">
/// The partition set recorded in <c>PartitionMapFromFM</c> differs from the one in <c>PartitionMapFromNodes</c>.
/// </exception>
private async Task<ClusterStateSnapshot> CaptureClusterStateSnapshotAndPopulateEntitiesAsync(
    FabricTestContext testContext,
    GetClusterStateSnapshotAction action,
    CancellationToken cancellationToken)
{
    // Reset all per-capture state so that a re-capture never sees leftovers from a previous attempt.
    this.PartitionMapFromFM = new HashSet<string>(StringComparer.InvariantCulture);
    this.PartitionMapFromNodes = new HashSet<string>(StringComparer.InvariantCulture);

    this.requestTimeOut = action.RequestTimeout;
    this.timer = new TimeoutHelper(action.ActionTimeout);
    this.testContext = testContext;
    this.deployedSystemReplicaMap = new Dictionary<NodeInfo, DeployedServiceReplicaList>();

    var nodes = await this.testContext.FabricCluster.GetLatestNodeInfoAsync(
        this.requestTimeOut,
        this.timer.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    var clusterSnapshot = new ClusterStateSnapshot(false, action.ShouldFaultSystem);
    var nodeInfos = nodes as IList<NodeInfo> ?? nodes.ToList();
    clusterSnapshot.Nodes.AddNodes(nodeInfos);
    clusterSnapshot.PopulateNodeMaps(nodes);

    // Get all current active applications
    var appListResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.testContext.FabricClient.QueryManager.GetApplicationListAsync(
            null,
            string.Empty,
            this.requestTimeOut,
            cancellationToken),
        this.timer.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (appListResult != null)
    {
        foreach (var appResultItem in appListResult)
        {
            var applicationEntity = clusterSnapshot.Applications.AddApplication(appResultItem);
            await this.PopulateApplicationEntityAsync(applicationEntity, cancellationToken).ConfigureAwait(false);
        }

        var systemApplicationEntity = clusterSnapshot.Applications.AddApplication(SystemApplication);
        await this.PopulateApplicationEntityAsync(systemApplicationEntity, cancellationToken).ConfigureAwait(false);
    }

    foreach (var node in nodeInfos)
    {
        var node1 = node.Clone();

        if (!node1.IsNodeUp)
        {
            continue;
        }

        var retryableErrorsForGetDeployedApplicationList = new FabricClientRetryErrors();
        retryableErrorsForGetDeployedApplicationList.RetryableFabricErrorCodes.Add(FabricErrorCode.InvalidAddress);

        // Fall back to an empty list when the query yields null. Previously the null case was only
        // handled when the node hosted a system service, so the loops below could dereference null.
        var deployedApplicationList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.testContext.FabricClient.QueryManager.GetDeployedApplicationListAsync(
                node1.NodeName,
                null,
                this.requestTimeOut,
                cancellationToken),
            retryableErrorsForGetDeployedApplicationList,
            this.timer.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false) ?? new DeployedApplicationList();

        // Add system app entity in the deployed application list
        // so that we get the deployed replica list for the node
        if (await this.HasDeployedSystemServiceAsync(node1, cancellationToken).ConfigureAwait(false))
        {
            deployedApplicationList.Add(DeployedSystemApplication);
        }

        TestabilityTrace.TraceSource.WriteInfo(TraceType, "Node: {0} has the following apps deployed...", node1);

        foreach (var app in deployedApplicationList)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Deployed app = {0}", app.ApplicationName.OriginalString);
        }

        foreach (var app in deployedApplicationList)
        {
            var application = app;
            var applicationEntity = clusterSnapshot.Applications.FirstOrDefault(
                a => a.Application.ApplicationName == application.ApplicationName);

            // Deployed applications that are not part of the snapshot's application list are ignored.
            if (applicationEntity == null)
            {
                continue;
            }

            bool associated = await this.TryAssociateDeployedReplicaWithDeployedCodepackageAsync(
                node1,
                applicationEntity,
                cancellationToken).ConfigureAwait(false);

            if (!associated)
            {
                // A null snapshot tells the caller that this capture attempt did not succeed.
                return null;
            }
        } // iterate through the deployed apps
    } // iterate through the nodes

    // Information acquired through queries could go stale due to the cluster dynamism.
    // This happened while the cluster snapshot was being taken -- making the snapshot internally inconsistent.
    // The fix is to ignore the inconsistent snapshot and capture it again.
    //
    // If FailoverManager's point of view coincides with that of the Nodes, return the snapshot;
    // otherwise, throw ChaosInconsistentClusterSnapshotException to indicate that the snapshot
    // should be captured afresh.
    //
    if (!this.PartitionMapFromFM.SetEquals(this.PartitionMapFromNodes))
    {
        var exceptionMessageBuilder = new StringBuilder();

        // Keep the full FM view: PartitionMapFromFM is reduced in place to "FM only" below,
        // but the original set is still needed to compute "nodes only".
        var copyOfFmInfo = new HashSet<string>(this.PartitionMapFromFM);

        this.PartitionMapFromFM.ExceptWith(this.PartitionMapFromNodes);
        if (this.PartitionMapFromFM.Any())
        {
            exceptionMessageBuilder.AppendLine("FM has the following extra information:");

            foreach (var pinfo in this.PartitionMapFromFM)
            {
                exceptionMessageBuilder.AppendLine(
                    string.Format(CultureInfo.InvariantCulture, ReplicaViewPrintFormat, Tab, pinfo));
            }
        }

        this.PartitionMapFromNodes.ExceptWith(copyOfFmInfo);
        if (this.PartitionMapFromNodes.Any())
        {
            exceptionMessageBuilder.AppendLine("Nodes has the following partitions deployed, which FM does not know about:");

            foreach (var pinfo in this.PartitionMapFromNodes)
            {
                exceptionMessageBuilder.AppendLine(
                    string.Format(CultureInfo.InvariantCulture, ReplicaViewPrintFormat, Tab, pinfo));
            }
        }

        string inconsistencyMessage = exceptionMessageBuilder.ToString();

        // Pass the message as an argument, not as the format string, so that braces inside
        // partition information cannot be misread as format placeholders.
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0}", inconsistencyMessage);

        throw new ChaosInconsistentClusterSnapshotException(inconsistencyMessage);
    }

    return clusterSnapshot;
}
/// <summary>
/// Captures a consistent cluster state snapshot, retrying after a back-off delay while the capture
/// returns <c>null</c> or fails with <c>FabricException</c> / <c>ChaosInconsistentClusterSnapshotException</c>.
/// On success the snapshot is filtered, its unsafe entities are marked, and it is stored in
/// <c>action.Result</c>.
/// </summary>
/// <param name="testContext">Test context forwarded to the capture routine.</param>
/// <param name="action">
/// Action supplying <c>MaximumNumberOfRetries</c>, <c>ChaosTargetFilter</c> and
/// <c>TelemetrySamplingProbability</c>, and receiving the result.
/// </param>
/// <param name="cancellationToken">Token forwarded to the back-off delay and to the capture routine.</param>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, GetClusterStateSnapshotAction action, CancellationToken cancellationToken)
{
    // Message -> number of times it was seen; reported if the retry budget runs out.
    var exceptionHistory = new Dictionary<string, int>();
    int retries = 0;

    // NOTE(review): these static counters are reset once per action, not once per attempt.
    // If they are incremented during capture, failed attempts contribute to the totals — confirm.
    GetClusterStateSnapshotAction.ServiceCount = 0;
    GetClusterStateSnapshotAction.PartitionCount = 0;
    GetClusterStateSnapshotAction.ReplicaCount = 0;

    Stopwatch stopWatch = Stopwatch.StartNew();

    ClusterStateSnapshot clusterSnapshot = null;

    do
    {
        ++retries;

        await Task.Delay(Constants.DefaultChaosSnapshotRecaptureBackoffInterval, cancellationToken).ConfigureAwait(false);

        try
        {
            clusterSnapshot = await this.CaptureClusterStateSnapshotAndPopulateEntitiesAsync(
                testContext,
                action,
                cancellationToken).ConfigureAwait(false);
        }
        catch (Exception exception) when (exception is FabricException || exception is ChaosInconsistentClusterSnapshotException)
        {
            // TryGetValue leaves the counter at 0 for a message that was not seen before.
            string exceptionString = exception.Message;
            int occurrences;
            exceptionHistory.TryGetValue(exceptionString, out occurrences);
            exceptionHistory[exceptionString] = occurrences + 1;
        }

        // Only a FAILED attempt can exhaust the retry budget. Without the null check a snapshot
        // captured on the last permitted attempt would still be reported as a failure.
        bool retriesExhausted = clusterSnapshot == null && retries >= action.MaximumNumberOfRetries;

        string allExceptions = string.Join(ExceptionDelimeter, exceptionHistory);

        if (retriesExhausted)
        {
            TestabilityTrace.TraceSource.WriteWarning(TraceType, "While taking a consistent cluster snapshot, following exceptions occurred: {0}", allExceptions);
        }

        ChaosUtility.ThrowOrAssertIfTrue(
            ChaosConstants.GetClusterSnapshotAction_MaximumNumberOfRetriesAchieved_TelemetryId,
            retriesExhausted,
            string.Format(StringResources.ChaosEngineError_GetClusterSnapshotAction_MaximumNumberOfRetriesAchieved, action.MaximumNumberOfRetries, allExceptions));
    }
    while (clusterSnapshot == null);

    stopWatch.Stop();
    var elapsedInGatherSnapshot = stopWatch.Elapsed;

    // Time the post-processing separately from the capture.
    stopWatch.Restart();

    clusterSnapshot.ApplyChaosTargetFilter(action.ChaosTargetFilter);
    clusterSnapshot.MarkAllUnsafeEntities();

    stopWatch.Stop();
    var elapsedInMarkAllUnsafe = stopWatch.Elapsed;

    // Telemetry is sampled: the event is emitted only for a random fraction of snapshots.
    if (UniformRandomNumberGenerator.NextDouble() < action.TelemetrySamplingProbability)
    {
        FabricEvents.Events.ChaosSnapshot(
            Guid.NewGuid().ToString(),
            clusterSnapshot.Nodes.Count,
            clusterSnapshot.Applications.Count,
            GetClusterStateSnapshotAction.ServiceCount,
            GetClusterStateSnapshotAction.PartitionCount,
            GetClusterStateSnapshotAction.ReplicaCount,
            elapsedInGatherSnapshot.TotalSeconds,
            elapsedInMarkAllUnsafe.TotalSeconds,
            retries);
    }

    TestabilityTrace.TraceSource.WriteInfo(
        TraceType,
        "For '{0}' nodes, '{1}' apps, '{2}' services, '{3}' partitions, '{4}' replicas, snapshot took '{5}', mark unsafe took '{6}', took '{7}' retries.",
        clusterSnapshot.Nodes.Count,
        clusterSnapshot.Applications.Count,
        GetClusterStateSnapshotAction.ServiceCount,
        GetClusterStateSnapshotAction.PartitionCount,
        GetClusterStateSnapshotAction.ReplicaCount,
        elapsedInGatherSnapshot,
        elapsedInMarkAllUnsafe,
        retries);

    action.Result = clusterSnapshot;
    ResultTraceString = "GetClusterStateSnapshotAction succeeded";
}