/// <summary>
/// Data-loss callback. Marks the data-loss health state as unhealthy through
/// <c>DatalossHelper.UpdateHeathStateAsync</c> and then traces a warning telling the
/// operator that a manual cluster configuration upgrade is required.
/// </summary>
/// <param name="restoreContext">Restore context supplied by the caller; not used by this override.</param>
/// <param name="cancellationToken">Cancellation token supplied by the caller; not used by this override.</param>
/// <returns>Always <c>false</c>.</returns>
protected override async Task<bool> OnDataLossAsync(RestoreContext restoreContext, CancellationToken cancellationToken)
{
    Task healthUpdate = DatalossHelper.UpdateHeathStateAsync(isHealthy: false);
    await healthUpdate.ConfigureAwait(false);

    const string WarningText = "Data loss happens. Manual cluster config upgrade is required to fix.";
    UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, WarningText);

    return false;
}
/// <summary>
/// Sends a cluster-level health report for the data-loss property of the Fabric Upgrade
/// Orchestration Service and waits until <c>DatalossHelper.IsInDatalossStateAsync</c>
/// reflects the reported state.
/// </summary>
/// <param name="isHealthy">
/// <c>false</c> reports <c>HealthState.Error</c> with a recovery hint and a TTL of
/// <c>TimeSpan.MaxValue</c>; <c>true</c> reports <c>HealthState.Ok</c> with a 1 ms TTL.
/// Both reports set <c>RemoveWhenExpired</c>.
/// </param>
/// <returns>A task that completes once the state has been observed, or once a failure has been traced.</returns>
/// <remarks>
/// Best effort: every exception is caught and written to the trace as an error; nothing is rethrown.
/// The wait loop polls every 5 seconds and has no timeout.
/// </remarks>
public static async Task UpdateHeathStateAsync(bool isHealthy)
{
    try
    {
        // FabricClient is IDisposable; dispose it deterministically, but only after the wait loop
        // below has finished so the pending report is not lost with the client.
        using (FabricClient fc = new FabricClient())
        {
            HealthInformation healthInfo;
            if (!isHealthy)
            {
                healthInfo = new HealthInformation(
                    Constants.UpgradeOrchestrationHealthSourceId,
                    Constants.DatalossHealthProperty,
                    HealthState.Error)
                {
                    Description = "Data loss has occurred to Fabric Upgrade Orchestration Service. Run Start-ServiceFabricClusterConfigurationUpgrade to recover the service state.",
                    TimeToLive = TimeSpan.MaxValue,
                    RemoveWhenExpired = true
                };
            }
            else
            {
                healthInfo = new HealthInformation(
                    Constants.UpgradeOrchestrationHealthSourceId,
                    Constants.DatalossHealthProperty,
                    HealthState.Ok)
                {
                    TimeToLive = TimeSpan.FromMilliseconds(1),
                    RemoveWhenExpired = true
                };
            }

            fc.HealthManager.ReportHealth(new ClusterHealthReport(healthInfo), new HealthReportSendOptions() { Immediate = true });

            // HM takes a bit to process the health report. Wait for it to complete before returning
            // or else the report will not be processed since fabricClient will be killed beforehand.
            // The loop runs while the observed data-loss state has not yet flipped to match the report.
            while (isHealthy == await DatalossHelper.IsInDatalossStateAsync(CancellationToken.None).ConfigureAwait(false))
            {
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(FabricUpgradeOrchestrationService.TraceType, "Waiting for HM to process health report..");
                await Task.Delay(TimeSpan.FromSeconds(5), CancellationToken.None).ConfigureAwait(false);
            }
        }
    }
    catch (Exception ex)
    {
        UpgradeOrchestrationTrace.TraceSource.WriteError(FabricUpgradeOrchestrationService.TraceType, "Fail to report health warning for data loss due to error: {0}", ex);
    }
}
/// <summary>
/// Processes a request to start a cluster configuration upgrade from a JSON cluster configuration.
/// </summary>
/// <remarks>
/// Normal path (service not in data-loss state): validates the new configuration against the persisted
/// cluster resource, computes added/removed nodes, updates the target node/CSM configuration, persists
/// the cluster resource and invokes the orchestrator. Orchestrator failures are traced by the
/// continuation and are not rethrown from this method.
/// Data-loss path: validates the configuration, rebuilds the cluster resource from the JSON, performs a
/// dry run, persists the rebuilt resource and reports the data-loss health state as healthy.
/// </remarks>
/// <param name="configUpgradeDesc">Upgrade description; its <c>ClusterConfiguration</c> holds the JSON config string.</param>
/// <param name="timeout">Not used by this method.</param>
/// <param name="cancellationToken">Client-side token; intentionally not used (see comment in the body).</param>
/// <returns>A task that completes when the request has been processed.</returns>
/// <exception cref="Exception">
/// Any failure is traced and rethrown as the exception produced by
/// <c>UpgradeOrchestrationMessageProcessor.ConvertToComException</c>.
/// </exception>
public async Task ProcessStartClusterConfigurationUpgradeAsync(ConfigurationUpgradeDescription configUpgradeDesc, TimeSpan timeout, CancellationToken cancellationToken)
{
    /* The cancellation token passed in this API call is not used (by design). This token corresponds to the client side call.
     * The global this.cancellationToken is initialised in RunAsync() and is honored in every API call. */
    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Entering ProcessStartUpgradeAsync.");
    try
    {
        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Deserializing input json config string.");
        StandAloneInstallerJsonModelBase targetJsonConfig = StandAloneInstallerJsonModelBase.GetJsonConfigFromString(configUpgradeDesc.ClusterConfiguration);
        if (targetJsonConfig == null)
        {
            throw new ArgumentException("The input cluster configuration is not in a valid json format or supported apiVersion.");
        }

        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Retrieve current cluster resource from StoreManager.");
        StandAloneCluster cluster = await this.storeManager.GetClusterResourceAsync(
            Constants.ClusterReliableDictionaryKey,
            this.cancellationToken).ConfigureAwait(false);

        bool isInDataLossState = await DatalossHelper.IsInDatalossStateAsync(this.cancellationToken).ConfigureAwait(false);
        if (!isInDataLossState)
        {
            if (cluster == null || cluster.Current == null)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "Persisted cluster resource is not ready: {0}", cluster == null ? "null" : "current = null");
                throw new FabricException("UpgradeOrchestrationService is not ready.");
            }

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting target config and topology based on new input json config.");

            // Awaited instead of blocking on the task, so no thread is tied up and a failure
            // surfaces as the original exception rather than wrapped in an AggregateException.
            if (cluster.Pending != null && !await this.IsInterruptibleAsync(cluster.Pending).ConfigureAwait(false))
            {
                throw new FabricException(string.Format("Cluster configuration upgrade of type {0} is already in progress and cannot be interrupted.", cluster.Pending.GetType().Name));
            }

            StandaloneSettingsValidator validator = new StandaloneSettingsValidator(targetJsonConfig);

            await UpgradeOrchestrationMessageProcessor.ValidateModel(targetJsonConfig, validator, cluster, true).ConfigureAwait(false);

            var removedNodes = validator.GetRemovedNodes(cluster.Topology);
            var addedNodes = validator.GetAddedNodes(cluster.Topology);

            if (addedNodes.Any() && StandaloneUtility.CheckFabricRunningAsGMSA(cluster.Current.CSMConfig))
            {
                /* Need to resolve assembly so that FabricDeployer can load the right binaries from FabricCodePath since not all binaries required by FabricDeployer are present in UOS.Current folder.*/
                AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(this.LoadFromFabricCodePath);
                try
                {
                    await this.PerformAddNodeOperationGMSAAsync(addedNodes, this.fabricClient, validator.ClusterProperties.NodeTypes).ConfigureAwait(false);
                }
                catch (AggregateException ex)
                {
                    UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, "Adding nodes for GMSA scenario failed with exception: {0}", ex);
                    throw;
                }
                finally
                {
                    // Always detach the resolver, whether the add-node operation succeeded or not.
                    AppDomain.CurrentDomain.AssemblyResolve -= new ResolveEventHandler(this.LoadFromFabricCodePath);
                }
            }

            if (addedNodes.Any())
            {
                cluster.TargetNodeConfig = GetTargetNodeConfigAddNode(validator.Topology, cluster.Current.NodeConfig.Version);
            }

            // When nodes are removed, the remove-node target config overrides the one set above and the
            // persisted topology is left untouched; otherwise the topology is taken from the new config.
            if (removedNodes.Any())
            {
                cluster.TargetNodeConfig = GetTargetNodeConfigRemoveNode(cluster.Topology, removedNodes, cluster.Current.NodeConfig.Version);
            }
            else
            {
                cluster.Topology = validator.Topology;
            }

            cluster.TargetCsmConfig = validator.ClusterProperties;

            // Cluster is updated above so persist it.
            // Uses the service-wide token, not the client-side one, in line with the note at the top of this method.
            await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, this.cancellationToken).ConfigureAwait(false);

            await this.UpdatePersistedCodeUpgradePackage(validator).ConfigureAwait(false);

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Invoking Orchestrator");

            // The continuation observes and traces the orchestrator outcome; a faulted orchestrator task
            // therefore does not fault this method.
            await this.orchestrator.StartUpgradeAsync(cluster, this.cancellationToken, configUpgradeDesc).ContinueWith(t =>
            {
                if (t.Exception != null)
                {
                    UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "Orchestrator completed with status: {0} exception: {1}", t.Status, t.Exception);
                }
                else
                {
                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Orchestrator completed with status: {0}", t.Status);
                }
            }).ConfigureAwait(false);
        }
        else
        {
            // Data-loss recovery: rebuild the cluster resource from the supplied JSON and clear the health error.
            StandaloneSettingsValidator validator = new StandaloneSettingsValidator(targetJsonConfig);

            await UpgradeOrchestrationMessageProcessor.ValidateModel(targetJsonConfig, validator, cluster, false).ConfigureAwait(false);

            cluster = FabricUpgradeOrchestrationService.ConstructClusterFromJson(targetJsonConfig, FabricNativeConfigStore.FabricGetConfigStore());

            DatalossHelper.DryRunConfigUpgrade(cluster);
            await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, this.cancellationToken).ConfigureAwait(false);
            await DatalossHelper.UpdateHeathStateAsync(isHealthy: true).ConfigureAwait(false);
        }
    }
    catch (Exception e)
    {
        UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "ProcessStartUpgradeAsync exception: {0}", e);
        throw UpgradeOrchestrationMessageProcessor.ConvertToComException(e);
    }

    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Exiting ProcessStartUpgradeAsync.");
}
/// <summary>
/// Long-running entry point of the service.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Initializes the config store and loads the persisted cluster resource.
/// 2. Unless the service is in data-loss state: creates and persists the baseline cluster resource when
///    none exists, hands the token to the message processor, reconciles the persisted ReliabilityLevel
///    with the actual one, optionally schedules an admin config upgrade, and (for non-baseline runs)
///    starts the manifest version check and the orchestrator without awaiting them.
/// 3. Sets up goal-state polling, or emits "success" goal-state health when auto-upgrade is disabled.
/// 4. Waits indefinitely until <paramref name="cancellationToken"/> is signalled.
/// <c>FabricNotPrimaryException</c> and <c>FabricObjectClosedException</c> are traced and swallowed;
/// every other exception (including the cancellation raised by the final wait) propagates to the caller.
/// </remarks>
/// <param name="cancellationToken">Token that is passed to every awaited operation and ends the final wait.</param>
/// <returns>A task representing the lifetime of the run loop.</returns>
protected override async Task RunAsync(CancellationToken cancellationToken)
{
    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Enter RunAsync");

    try
    {
        var configStore = FabricNativeConfigStore.FabricGetConfigStore();
        StandAloneFabricSettingsActivator.InitializeConfigStore(configStore);
        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "After InitializeAsync");

        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Getting cluster resource from store manager.");
        StandAloneCluster cluster = await this.storeManager.GetClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cancellationToken).ConfigureAwait(false);

        bool isInDatalossState = await DatalossHelper.IsInDatalossStateAsync(cancellationToken).ConfigureAwait(false);
        if (!isInDatalossState)
        {
            bool isBaselineUpgrade = false;
            if (cluster == null)
            {
                // Cluster resource does not exist, e.g. first time baseline upgrade.
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Cluster resource does not exist in store manager. Initiating new clsuter resource.");

                cluster = FabricUpgradeOrchestrationService.InitClusterResource(configStore);
                ReleaseAssert.AssertIf(cluster == null, "Cluster Resource cannot be initialized.");

                isBaselineUpgrade = true;
                await this.CompleteAndPersistBaselineStateAsync(cluster, cancellationToken).ConfigureAwait(false);
            }

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting a valid cancellation token for UpgradeOrchestrationMessageProcessor");
            this.messageProcessor.CancellationToken = cancellationToken;

            /* In version 5.7, we added Iron as a new entry to ReliabilityLevel enum. This entry was added as ReliabilityLevel = 1 and hence, it moved all the existing levels one level down.
             * For clusters created in <=5.6, when upgraded to 5.7+ the ReliabilityLevel field would be interpreted differently. The below code fixes this issue by reading the
             * UOS state and comparing it against the actual cluster state. If there is a mismatch it sets the UOS to the correct ReliabilityLevel.
             */
            if (!isBaselineUpgrade && cluster.Current != null && cluster.Current.CSMConfig != null)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Current ReliabilityLevel set in UOS state {0}", cluster.Current.CSMConfig.ReliabilityLevel);

                var actualReliabilityLevelForCluster = await this.GetActualReliabilityLevelForCluster(cancellationToken).ConfigureAwait(false);
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Actual ReliabilityLevel set for the cluster {0}", actualReliabilityLevelForCluster);

                if (actualReliabilityLevelForCluster != cluster.Current.CSMConfig.ReliabilityLevel)
                {
                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "UOS ReliabilityLevel is inconsistent with actual reliability level for the cluster.. Setting UOS state to {0}", actualReliabilityLevelForCluster);
                    cluster.Current.CSMConfig.ReliabilityLevel = actualReliabilityLevelForCluster;
                    await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, cancellationToken).ConfigureAwait(false);

                    // Re-read the resource so the traced value below comes from the store, not from the in-memory copy.
                    cluster = await this.storeManager.GetClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cancellationToken).ConfigureAwait(false);
                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "UOS ReliabilityLevel set to {0}", cluster.Current.CSMConfig.ReliabilityLevel);
                }
            }

            /* This is a workaround till actual admin config upgrades are implemented. In 5.7 we changed some properties in ClusterSettings
             * but those will not take effect for clusters created with version < 5.7 since no upgrade reads those settings.
             * This workaround initiates a WRP config upgrade in the form of SimpleClusterUpgradeState if the settings found are old (after the code upgrade completes to this version).
             */
            string isAdminConfigUpgradeAttempted = await this.storeManager.GetStorageObjectAsync(Constants.AdminConfigUpgradeAttemptedDictionaryKey, cancellationToken).ConfigureAwait(false);
            if (!isBaselineUpgrade && string.IsNullOrEmpty(isAdminConfigUpgradeAttempted) && this.IsFirewallRuleDisabled())
            {
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Old admin configuration settings detected.. Will initiate admin config upgrade after code upgrade completes..");

                // Poll until the code upgrade is reported complete. The delay is applied only between
                // unsuccessful polls, so there is no extra 10 second wait once completion is observed.
                while (!await this.CheckCodeUpgradeCompletedAsync(cancellationToken).ConfigureAwait(false))
                {
                    await Task.Delay(TimeSpan.FromSeconds(10), cancellationToken).ConfigureAwait(false);
                }

                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting targetWRPConfig to initiate admin config upgrade.");
                var adminConfig = new StandaloneAdminConfig();
                adminConfig.Version.ClusterSettingsVersion = "2.1";
                cluster.TargetWrpConfig = adminConfig;

                // Record the attempt so this workaround is not triggered again on a later run.
                await this.storeManager.SetStorageObjectAsync(Constants.AdminConfigUpgradeAttemptedDictionaryKey, "true", cancellationToken).ConfigureAwait(false);
            }

            if (!isBaselineUpgrade)
            {
                // If the cluster manifest versions don't match, send a health warning for users to know.
                // Both tasks below are started and deliberately not awaited; they run in the background.
                Task manifestCheck = this.SetupClusterManifestVersionCheck(cancellationToken, cluster.Current.ExternalState.ClusterManifest.Version);

                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Invoking Orchestrator.StartUpgradeAsync");
                Task t = this.Orchestrator.StartUpgradeAsync(cluster, cancellationToken, new ConfigurationUpgradeDescription());
            }

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Done with Orchestrator.StartUpgradeAsync");
        }

        var goalStateProvisioner = new StandaloneGoalStateProvisioner(this.storeManager, this.Orchestrator, cancellationToken);

        // Either start goal-state polling, or (auto-upgrade disabled) emit "success" health once.
        // The resulting task is not awaited.
        Task goalStatePollTask = (goalStateProvisioner.IsAutoupgradeEnabled() || goalStateProvisioner.IsAutoupgradeInstallEnabled())
            ? goalStateProvisioner.SetUpGoalStatePoll(cancellationToken, this.IsSkipInitialGoalStateCheck())
            : Task.Run(
                () =>
                {
                    goalStateProvisioner.EmitGoalStateReachableHealth(this.fabricClient, true /*success*/);
                    goalStateProvisioner.EmitClusterVersionSupportedHealth(this.fabricClient, true /*success*/);
                },
                cancellationToken);

        // Keep RunAsync alive until the token is cancelled.
        await Task.Delay(Timeout.Infinite, cancellationToken).ConfigureAwait(false);
    }
    catch (FabricNotPrimaryException ex)
    {
        UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, ex.ToString());
    }
    catch (FabricObjectClosedException ocex)
    {
        UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, ocex.ToString());
    }

    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Exit RunAsync");
}