protected override async Task<bool> OnDataLossAsync(RestoreContext restoreContext, CancellationToken cancellationToken)
        {
            await DatalossHelper.UpdateHealthStateAsync(isHealthy: false).ConfigureAwait(false);

            UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "Data loss happens. Manual cluster config upgrade is required to fix.");

            // Returning false indicates that no state was restored; the replica continues with
            // its current (post-dataloss) state and manual recovery is expected.
            return false;
        }
        public static async Task UpdateHealthStateAsync(bool isHealthy)
        {
            try
            {
                FabricClient      fc = new FabricClient();
                HealthInformation healthInfo;

                if (!isHealthy)
                {
                    healthInfo = new HealthInformation(
                        Constants.UpgradeOrchestrationHealthSourceId,
                        Constants.DatalossHealthProperty,
                        HealthState.Error)
                    {
                        Description       = "Data loss has occurred to Fabric Upgrade Orchestration Service. Run Start-ServiceFabricClusterConfigurationUpgrade to recover the service state.",
                        TimeToLive        = TimeSpan.MaxValue,
                        RemoveWhenExpired = true
                    };
                }
                else
                {
                    // Report Ok with a near-zero TTL and RemoveWhenExpired = true so that the report
                    // expires and is removed almost immediately, clearing the earlier Error report.
                    healthInfo = new HealthInformation(
                        Constants.UpgradeOrchestrationHealthSourceId,
                        Constants.DatalossHealthProperty,
                        HealthState.Ok)
                    {
                        TimeToLive        = TimeSpan.FromMilliseconds(1),
                        RemoveWhenExpired = true
                    };
                }

                fc.HealthManager.ReportHealth(new ClusterHealthReport(healthInfo), new HealthReportSendOptions()
                {
                    Immediate = true
                });

                // HM takes a while to process the health report. Wait for it to be processed before returning; otherwise the report may never take effect, because the FabricClient is torn down as soon as this method returns.
                while (isHealthy == await DatalossHelper.IsInDatalossStateAsync(CancellationToken.None))
                {
                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(FabricUpgradeOrchestrationService.TraceType, "Waiting for HM to process health report..");
                    await Task.Delay(TimeSpan.FromSeconds(5), CancellationToken.None).ConfigureAwait(false);
                }
            }
            catch (Exception ex)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteError(FabricUpgradeOrchestrationService.TraceType, "Fail to report health warning for data loss due to error: {0}", ex);
            }
        }
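        // A minimal sketch of what DatalossHelper.IsInDatalossStateAsync could look like
        // (hypothetical; the real implementation is not shown in this listing). Per the wait
        // loop above, the idea is to read cluster health back and check whether the dataloss
        // report written by UpdateHealthStateAsync is still present at Error severity.
        public static async Task<bool> IsInDatalossStateAsync(CancellationToken cancellationToken)
        {
            FabricClient  fc            = new FabricClient();
            ClusterHealth clusterHealth = await fc.HealthManager.GetClusterHealthAsync(TimeSpan.FromMinutes(1), cancellationToken).ConfigureAwait(false);

            return clusterHealth.HealthEvents.Any(
                e => e.HealthInformation.SourceId == Constants.UpgradeOrchestrationHealthSourceId &&
                     e.HealthInformation.Property == Constants.DatalossHealthProperty &&
                     e.HealthInformation.HealthState == HealthState.Error);
        }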
        public async Task ProcessStartClusterConfigurationUpgradeAsync(ConfigurationUpgradeDescription configUpgradeDesc, TimeSpan timeout, CancellationToken cancellationToken)
        {
            /* The cancellation token passed into this API call is not used (by design); it corresponds to the client-side call.
             * The global this.cancellationToken is initialized in RunAsync() and is honored in every API call. */

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Entering ProcessStartUpgradeAsync.");
            try
            {
                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Deserializing input json config string.");
                StandAloneInstallerJsonModelBase targetJsonConfig = StandAloneInstallerJsonModelBase.GetJsonConfigFromString(configUpgradeDesc.ClusterConfiguration);
                if (targetJsonConfig == null)
                {
                    throw new ArgumentException("The input cluster configuration is not in a valid json format or supported apiVersion.");
                }

                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Retrieve current cluster resource from StoreManager.");
                StandAloneCluster cluster = await this.storeManager.GetClusterResourceAsync(
                    Constants.ClusterReliableDictionaryKey, this.cancellationToken).ConfigureAwait(false);

                bool isInDataLossState = await DatalossHelper.IsInDatalossStateAsync(this.cancellationToken).ConfigureAwait(false);

                if (!isInDataLossState)
                {
                    if (cluster == null || cluster.Current == null)
                    {
                        UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "Persisted cluster resource is not ready: {0}", cluster == null ? "null" : "current = null");
                        throw new FabricException("UpgradeOrchestrationService is not ready.");
                    }

                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting target config and topology based on new input json config.");

                    if (cluster.Pending != null && !await this.IsInterruptibleAsync(cluster.Pending).ConfigureAwait(false))
                    {
                        throw new FabricException(string.Format("Cluster configuration upgrade of type {0} is already in progress and cannot be interrupted.", cluster.Pending.GetType().Name));
                    }

                    StandaloneSettingsValidator validator = new StandaloneSettingsValidator(targetJsonConfig);

                    await UpgradeOrchestrationMessageProcessor.ValidateModel(targetJsonConfig, validator, cluster, true).ConfigureAwait(false);

                    var removedNodes = validator.GetRemovedNodes(cluster.Topology);
                    var addedNodes   = validator.GetAddedNodes(cluster.Topology);

                    if (addedNodes.Any() && StandaloneUtility.CheckFabricRunningAsGMSA(cluster.Current.CSMConfig))
                    {
                        /* Need to resolve assemblies so that FabricDeployer can load the right binaries from FabricCodePath, since not all binaries required by FabricDeployer are present in the UOS.Current folder. */
                        AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(this.LoadFromFabricCodePath);
                        try
                        {
                            await this.PerformAddNodeOperationGMSAAsync(addedNodes, this.fabricClient, validator.ClusterProperties.NodeTypes).ConfigureAwait(false);
                        }
                        catch (AggregateException ex)
                        {
                            UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, "Adding nodes for GMSA scenario failed with exception: {0}", ex);
                            throw;
                        }
                        finally
                        {
                            AppDomain.CurrentDomain.AssemblyResolve -= new ResolveEventHandler(this.LoadFromFabricCodePath);
                        }
                    }

                    if (addedNodes.Any())
                    {
                        cluster.TargetNodeConfig = GetTargetNodeConfigAddNode(validator.Topology, cluster.Current.NodeConfig.Version);
                    }

                    if (removedNodes.Any())
                    {
                        cluster.TargetNodeConfig = GetTargetNodeConfigRemoveNode(cluster.Topology, removedNodes, cluster.Current.NodeConfig.Version);
                    }
                    else
                    {
                        cluster.Topology = validator.Topology;
                    }

                    cluster.TargetCsmConfig = validator.ClusterProperties;

                    // Cluster is updated above so persist it.
                    await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, this.cancellationToken).ConfigureAwait(false);

                    await this.UpdatePersistedCodeUpgradePackage(validator).ConfigureAwait(false);

                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Invoking Orchestrator");

                    await this.orchestrator.StartUpgradeAsync(cluster, this.cancellationToken, configUpgradeDesc).ContinueWith(t =>
                    {
                        if (t.Exception != null)
                        {
                            UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "Orchestrator completed with status: {0} exception: {1}", t.Status, t.Exception);
                        }
                        else
                        {
                            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Orchestrator completed with status: {0}", t.Status);
                        }
                    });
                }
                else
                {
                    StandaloneSettingsValidator validator = new StandaloneSettingsValidator(targetJsonConfig);
                    await UpgradeOrchestrationMessageProcessor.ValidateModel(targetJsonConfig, validator, cluster, false).ConfigureAwait(false);

                    cluster = FabricUpgradeOrchestrationService.ConstructClusterFromJson(targetJsonConfig, FabricNativeConfigStore.FabricGetConfigStore());

                    DatalossHelper.DryRunConfigUpgrade(cluster);
                    await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, this.cancellationToken).ConfigureAwait(false);

                    await DatalossHelper.UpdateHealthStateAsync(isHealthy: true).ConfigureAwait(false);
                }
            }
            catch (Exception e)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteWarning(TraceType, "ProcessStartUpgradeAsync exception: {0}", e);
                throw UpgradeOrchestrationMessageProcessor.ConvertToComException(e);
            }

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Exiting ProcessStartUpgradeAsync.");
        }
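        // A minimal client-side sketch (hypothetical helper, not part of this service) of how the
        // upgrade above is triggered: Start-ServiceFabricClusterConfigurationUpgrade maps to
        // FabricClient.ClusterManager.StartClusterConfigurationUpgradeAsync, which eventually lands
        // in ProcessStartClusterConfigurationUpgradeAsync. Assumes a JSON cluster configuration file
        // on disk and a FabricClient that can connect to the target cluster.
        public static async Task StartConfigUpgradeFromFileAsync(string jsonConfigPath)
        {
            FabricClient fc = new FabricClient();

            ConfigurationUpgradeDescription description = new ConfigurationUpgradeDescription
            {
                ClusterConfiguration = System.IO.File.ReadAllText(jsonConfigPath)
            };

            await fc.ClusterManager.StartClusterConfigurationUpgradeAsync(description).ConfigureAwait(false);
        }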
        protected override async Task RunAsync(CancellationToken cancellationToken)
        {
            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Enter RunAsync");

            try
            {
                var configStore = FabricNativeConfigStore.FabricGetConfigStore();

                StandAloneFabricSettingsActivator.InitializeConfigStore(configStore);

                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "After InitializeAsync");

                UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Getting cluster resource from store manager.");
                StandAloneCluster cluster = await this.storeManager.GetClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cancellationToken).ConfigureAwait(false);

                bool isInDatalossState = await DatalossHelper.IsInDatalossStateAsync(cancellationToken).ConfigureAwait(false);

                if (!isInDatalossState)
                {
                    bool isBaselineUpgrade = false;
                    if (cluster == null)
                    {
                        // Cluster resource does not exist, e.g. first time baseline upgrade.
                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Cluster resource does not exist in store manager. Initiating new clsuter resource.");

                        cluster = FabricUpgradeOrchestrationService.InitClusterResource(configStore);
                        ReleaseAssert.AssertIf(cluster == null, "Cluster Resource cannot be initialized.");
                        isBaselineUpgrade = true;

                        await this.CompleteAndPersistBaselineStateAsync(cluster, cancellationToken).ConfigureAwait(false);
                    }

                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting a valid cancellation token for UpgradeOrchestrationMessageProcessor");
                    this.messageProcessor.CancellationToken = cancellationToken;

                    /* In version 5.7, Iron was added as a new entry to the ReliabilityLevel enum. It was added as ReliabilityLevel = 1, which shifted all the existing levels one value down.
                     * For clusters created on <=5.6 and then upgraded to 5.7+, the persisted ReliabilityLevel field is therefore interpreted differently. The code below fixes this by reading the
                     * UOS state and comparing it against the actual cluster state; if there is a mismatch, it sets the UOS state to the correct ReliabilityLevel.
                     */
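                    /* Illustrative sketch of the shift (hypothetical numeric values, not the actual enum definition):
                     *   <=5.6: Bronze = 1, Silver = 2, Gold = 3, Platinum = 4
                     *   5.7+:  Iron = 1, Bronze = 2, Silver = 3, Gold = 4, Platinum = 5
                     * A value of 1 persisted by 5.6 (Bronze) therefore deserializes as Iron under 5.7+,
                     * which is why the persisted UOS state must be reconciled against the live cluster. */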
                    if (!isBaselineUpgrade &&
                        cluster.Current != null &&
                        cluster.Current.CSMConfig != null)
                    {
                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Current ReliabilityLevel set in UOS state {0}", cluster.Current.CSMConfig.ReliabilityLevel);
                        var actualReliabilityLevelForCluster = await this.GetActualReliabilityLevelForCluster(cancellationToken).ConfigureAwait(false);

                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Actual ReliabilityLevel set for the cluster {0}", actualReliabilityLevelForCluster);
                        if (actualReliabilityLevelForCluster != cluster.Current.CSMConfig.ReliabilityLevel)
                        {
                            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "UOS ReliabilityLevel is inconsistent with actual reliability level for the cluster.. Setting UOS state to {0}", actualReliabilityLevelForCluster);
                            cluster.Current.CSMConfig.ReliabilityLevel = actualReliabilityLevelForCluster;
                            await this.storeManager.PersistClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cluster, cancellationToken).ConfigureAwait(false);

                            cluster = await this.storeManager.GetClusterResourceAsync(Constants.ClusterReliableDictionaryKey, cancellationToken).ConfigureAwait(false);

                            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "UOS ReliabilityLevel set to {0}", cluster.Current.CSMConfig.ReliabilityLevel);
                        }
                    }

                    /* This is a workaround until actual admin config upgrades are implemented. In 5.7 we changed some properties in ClusterSettings,
                     * but those will not take effect for clusters created with version < 5.7, since no upgrade reads those settings.
                     * This workaround initiates a WRP config upgrade in the form of SimpleClusterUpgradeState if the settings found are old (after the code upgrade to this version completes). */
                    string isAdminConfigUpgradeAttempted = await this.storeManager.GetStorageObjectAsync(Constants.AdminConfigUpgradeAttemptedDictionaryKey, cancellationToken).ConfigureAwait(false);

                    if (!isBaselineUpgrade && string.IsNullOrEmpty(isAdminConfigUpgradeAttempted) && this.IsFirewallRuleDisabled())
                    {
                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Old admin configuration settings detected.. Will initiate admin config upgrade after code upgrade completes..");
                        while (!await this.CheckCodeUpgradeCompletedAsync(cancellationToken).ConfigureAwait(false))
                        {
                            await Task.Delay(TimeSpan.FromSeconds(10), cancellationToken).ConfigureAwait(false);
                        }

                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Setting targetWRPConfig to initiate admin config upgrade.");
                        var adminConfig = new StandaloneAdminConfig();
                        adminConfig.Version.ClusterSettingsVersion = "2.1";
                        cluster.TargetWrpConfig = adminConfig;
                        await this.storeManager.SetStorageObjectAsync(Constants.AdminConfigUpgradeAttemptedDictionaryKey, "true", cancellationToken).ConfigureAwait(false);
                    }

                    if (!isBaselineUpgrade)
                    {
                        // If the cluster manifest versions don't match, send a health warning for users to know.
                        Task manifestCheck = this.SetupClusterManifestVersionCheck(cancellationToken, cluster.Current.ExternalState.ClusterManifest.Version);

                        UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Invoking Orchestrator.StartUpgradeAsync");
                        Task startUpgradeTask = this.Orchestrator.StartUpgradeAsync(cluster, cancellationToken, new ConfigurationUpgradeDescription());
                    }

                    UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Done with Orchestrator.StartUpgradeAsync");
                }

                var goalStateProvisioner = new StandaloneGoalStateProvisioner(this.storeManager, this.Orchestrator, cancellationToken);
                Task goalStatePollTask = (goalStateProvisioner.IsAutoupgradeEnabled() || goalStateProvisioner.IsAutoupgradeInstallEnabled())
                    ? goalStateProvisioner.SetUpGoalStatePoll(cancellationToken, this.IsSkipInitialGoalStateCheck())
                    : Task.Run(
                        () =>
                        {
                            goalStateProvisioner.EmitGoalStateReachableHealth(this.fabricClient, true /*success*/);
                            goalStateProvisioner.EmitClusterVersionSupportedHealth(this.fabricClient, true /*success*/);
                        },
                        cancellationToken);

                await Task.Delay(Timeout.Infinite, cancellationToken).ConfigureAwait(false);
            }
            catch (FabricNotPrimaryException ex)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, ex.ToString());
            }
            catch (FabricObjectClosedException ocex)
            {
                UpgradeOrchestrationTrace.TraceSource.WriteError(TraceType, ocex.ToString());
            }

            UpgradeOrchestrationTrace.TraceSource.WriteInfo(TraceType, "Exit RunAsync");
        }
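        // A minimal sketch in the spirit of CheckCodeUpgradeCompletedAsync, which is referenced
        // above but not shown in this listing (hypothetical helper; the real signature may differ):
        // poll the fabric upgrade progress and treat a completed roll-forward or roll-back as done.
        private static async Task<bool> IsCodeUpgradeCompletedAsync(FabricClient fc)
        {
            FabricUpgradeProgress progress = await fc.ClusterManager.GetFabricUpgradeProgressAsync().ConfigureAwait(false);

            return progress.UpgradeState == FabricUpgradeState.RollingForwardCompleted ||
                   progress.UpgradeState == FabricUpgradeState.RollingBackCompleted;
        }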