Exemplo n.º 1
0
        /// <summary>
        /// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
        /// The restore is first scheduled in the central backup store, after which a partition data loss
        /// operation is started for the service's own partition.
        /// This method completes and returns before the backup restore process is completely done.
        /// </summary>
        /// <param name="service">Service whose partition is to be restored.</param>
        /// <param name="backupMetadata">Metadata identifying the backup to restore. Must not be null.</param>
        /// <param name="dataLossMode">Data loss mode passed to <c>StartPartitionDataLossAsync</c>.</param>
        /// <returns>A task that completes once the data loss operation has been started.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="backupMetadata"/> is null.</exception>
        /// <exception cref="Exception">
        /// Scheduling the restore or starting the data loss operation failed; the original error is the inner exception.
        /// </exception>
        public static async Task BeginRestoreBackup(this IBackupRestoreServiceOperations service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
        {
            // Validate before the first log line, because that line dereferences backupMetadata.
            if (backupMetadata == null)
            {
                throw new ArgumentNullException(nameof(backupMetadata));
            }

            service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");

            try
            {
                await service.CentralBackupStore.ScheduleBackupRestoreAsync(service.Context.PartitionId, backupMetadata.BackupId);

                var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);

                var operationId = Guid.NewGuid();

                // The client is only needed to start the operation, so release it as soon as the call returns.
                using (var fabricClient = new FabricClient(FabricClientRole.Admin))
                {
                    await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
                }
                //Causes OnDataLossAsync to be called later on.
            }
            catch (Exception ex)
            {
                string message = $"Failed to restore backup for partition {service.Context.PartitionId}";
                service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} failed for partition: {service.Context.PartitionId}. Message:{message} - Error: {ex.Message}");
                throw new Exception(message, ex);
            }
            service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} succeeded {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a <c>MoveSecondaryAction</c> from the given state transition, after tracing a
        /// description of the action and forwarding it to the report callback when one is set.
        /// </summary>
        /// <param name="ragAction">Transition describing the service, partition, source node and target node.</param>
        /// <returns>The action that moves the secondary replica between the two nodes.</returns>
        private FabricTestAction GetMoveSecondaryReplicaAction(MoveSecondaryReplicaStateTransitionAction ragAction)
        {
            Requires.Argument("moveSecondaryReplicaStateTransitionAction", ragAction).NotNull();

            Uri targetService = ragAction.ServiceUri;
            Guid partitionId = ragAction.PartitionId;
            string sourceNode = ragAction.NodeFrom;
            string destinationNode = ragAction.NodeTo;

            string description = StringHelper.Format(
                "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tFrom: {3} To: {4}",
                ragAction.ActionType,
                targetService,
                partitionId,
                sourceNode,
                destinationNode);

            Log.WriteInfo(TraceType, description);

            // The report callback is optional.
            this.reportFunction?.Invoke(description);

            return new MoveSecondaryAction(
                sourceNode,
                destinationNode,
                PartitionSelector.PartitionIdOf(targetService, partitionId),
                ragAction.ForceMove);
        }
        /// <summary>
        /// Creates the <c>PartitionSelector</c> that corresponds to the active parameter set.
        /// </summary>
        /// <returns>A selector built from the service name and, where applicable, the partition id or key.</returns>
        /// <exception cref="ArgumentException">
        /// The parameter set name is not recognised, or the partition key of the
        /// "ServiceNamePartitionUniformedInt" set is not a valid 64-bit integer.
        /// </exception>
        protected PartitionSelector GetPartitionSelector()
        {
            switch (this.ParameterSetName)
            {
            case "PartitionId":
                return PartitionSelector.PartitionIdOf(this.ServiceName, this.PartitionId);

            case "ServiceNameRandomPartition":
                return PartitionSelector.RandomOf(this.ServiceName);

            case "ServiceNamePartitionSingleton":
                return PartitionSelector.SingletonOf(this.ServiceName);

            case "ServiceNamePartitionNamed":
                return PartitionSelector.PartitionKeyOf(this.ServiceName, this.PartitionKey);

            case "ServiceNamePartitionUniformedInt":
                long numericKey;
                if (long.TryParse(this.PartitionKey, out numericKey))
                {
                    return PartitionSelector.PartitionKeyOf(this.ServiceName, numericKey);
                }

                throw new ArgumentException(StringResources.Error_InvalidPartitionKey);

            default:
                throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector);
            }
        }
        /// <summary>
        /// Records which backup set should be restored and then starts a full data loss
        /// operation on this service's own partition.
        /// </summary>
        /// <param name="nameOfBackupSet">Name of the backup set, written to the file store as restore information.</param>
        /// <returns>A task that completes once the data loss operation has been started.</returns>
        public async Task RestoreServiceAsync(string nameOfBackupSet)
        {
            this._fileStore.WriteRestoreInformation(nameOfBackupSet);
            var partitionSelector = PartitionSelector.PartitionIdOf(this.Context.ServiceName, this.Context.PartitionId);

            var operationId = Guid.NewGuid();

            // The client is only needed to start the operation, so release it as soon as the call returns.
            using (var fabricClient = new FabricClient(FabricClientRole.Admin))
            {
                await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, DataLossMode.FullDataLoss);
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Starts a partial data loss operation for the given partition, going through
 /// <c>InvokeWithRetryAsync</c>.
 /// </summary>
 /// <param name="dataLossGuid">Operation id passed to <c>StartPartitionDataLossAsync</c>.</param>
 /// <param name="serviceNameUri">Custom service URI, converted with <c>UtilityHelper.GetUriFromCustomUri</c>.</param>
 /// <param name="partitionId">Partition id in a format accepted by <c>Guid.Parse</c>.</param>
 /// <param name="timeout">Not used by this method.</param>
 /// <returns>A task that completes when the retried invocation completes.</returns>
 internal static async Task InitiatePartitionDataLoss(Guid dataLossGuid, string serviceNameUri, string partitionId, TimeSpan timeout)
 {
     await InvokeWithRetryAsync(() =>
     {
         // Everything is resolved inside the delegate, so it is re-evaluated on each invocation.
         var testManager = FabricClient.TestManager;
         var serviceUri  = new Uri(UtilityHelper.GetUriFromCustomUri(serviceNameUri));
         var selector    = PartitionSelector.PartitionIdOf(serviceUri, Guid.Parse(partitionId));

         return testManager.StartPartitionDataLossAsync(dataLossGuid, selector, DataLossMode.PartialDataLoss);
     });
 }
Exemplo n.º 6
0
        /// <summary>
        /// Backup callback: bumps the persisted backup counter, periodically prunes old backups,
        /// occasionally triggers a simulated data loss, and finally uploads the backup folder
        /// through <c>backupStore</c>.
        /// </summary>
        /// <param name="backupInfo">Describes the backup; its <c>Directory</c> is the folder that gets uploaded.</param>
        /// <returns>Always <c>true</c>; an upload failure is logged and not rethrown.</returns>
        private async Task <bool> BackupCallbackAzureAsync(BackupInfo backupInfo)
        {
            string            backupId          = Guid.NewGuid().ToString();
            CancellationToken cancellationToken = default(CancellationToken);

            long totalBackupCount;


            IReliableDictionary <int, long> countDictionary = await this.StateManager.GetOrAddAsync <IReliableDictionary <int, long> >(this.countDictionaryName);

            // The counter lives under key 0: it is created as 0 and incremented on every later backup.
            using (ITransaction txn = this.StateManager.CreateTransaction())
            {
                totalBackupCount = await countDictionary.AddOrUpdateAsync(txn, 0, 0, (key, oldValue) => { return(oldValue + 1); });
                await txn.CommitAsync();
            }


            ServiceEventSource.Current.Message("Backup count dictionary updated: " + totalBackupCount);


            if ((totalBackupCount % 10) == 0)
            {
                //Store no more than 10 backups at a time - the actual max might be a bit more than 10 since more backups could have been created when deletion was taking place. Keeps behind 5 backups.
                await this.backupStore.DeleteBackupsAzureAsync(cancellationToken);
            }

            if ((totalBackupCount > 10) && (DateTime.Now.Second % 20) == 0)
            {
                //Let's simulate a data loss every time the time is a multiple of 20 seconds, and a backup just completed.
                ServiceEventSource.Current.ServiceMessage(this, "Restore Started");

                using (FabricClient fabricClient = new FabricClient())
                {
                    PartitionSelector partitionSelector = PartitionSelector.PartitionIdOf(
                        this.ServiceInitializationParameters.ServiceName,
                        this.ServiceInitializationParameters.PartitionId);

                    await fabricClient.ServiceManager.InvokeDataLossAsync(partitionSelector, DataLossMode.PartialDataLoss, cancellationToken);
                }
            }

            ServiceEventSource.Current.Message("Backing up from directory, ID  : " + backupInfo.Directory + " *** " + backupId);
            try
            {
                await this.backupStore.UploadBackupFolderAsync(backupInfo.Directory, backupId, CancellationToken.None);
            }
            catch (Exception e)
            {
                // The message is assembled by concatenation, so it must not carry "{0} {1}" placeholders:
                // they would be emitted literally, or break a logger that formats its message argument.
                // NOTE(review): presumably ServiceMessage formats its message - confirm against ServiceEventSource.
                ServiceEventSource.Current.ServiceMessage(this, "Uploading to backup folder failed: " + e.GetType() + " " + e.Message);
            }

            return(true);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Backup callback: bumps the persisted backup counter, periodically prunes the local
        /// backup folder of this partition, occasionally triggers a simulated data loss, and
        /// finally copies the new backup folder via <c>CopyBackupFolderAsync</c>.
        /// </summary>
        /// <param name="backupInfo">Describes the backup; its <c>Directory</c> is the folder that gets copied.</param>
        /// <returns>Always <c>true</c>.</returns>
        private async Task <bool> BackupCallbackAsync(BackupInfo backupInfo)
        {
            string newBackupId = Guid.NewGuid().ToString();

            IReliableDictionary <int, long> backupCounter =
                await this.StateManager.GetOrAddAsync <IReliableDictionary <int, long> >(this.countDictionaryName);

            // The counter lives under key 0: it is created as 0 and incremented on every later backup.
            long backupsSoFar;
            using (ITransaction tx = this.StateManager.CreateTransaction())
            {
                backupsSoFar = await backupCounter.AddOrUpdateAsync(tx, 0, 0, (key, previous) => previous + 1);
                await tx.CommitAsync();
            }

            // The same text goes to both the service-scoped and the general event.
            string countMessage = "Backup count dictionary updated: " + backupsSoFar;
            ServiceEventSource.Current.ServiceMessage(this, countMessage);
            ServiceEventSource.Current.Message(countMessage);

            // Every 20th backup, prune this partition's backup folder (DeleteBackups is called with 5).
            // More backups may exist at any moment, since new ones can be created while deletion runs,
            // and the count can be much larger depending on which backup was restored.
            bool pruneDue = (backupsSoFar % 20) == 0;
            if (pruneDue)
            {
                string partitionFolder = Path.Combine(this.localBackupStore, this.ServicePartition.PartitionInfo.Id.ToString());
                this.DeleteBackups(partitionFolder, 5);
            }

            // Simulate a restore/data loss event pseudo-randomly, based on the current clock second.
            // This assumes that all partitions have some state at this point;
            // five inventory items must be added for all five partitions to have state.
            if (backupsSoFar > 19 && (DateTime.Now.Second % 20) == 0)
            {
                ServiceEventSource.Current.ServiceMessage(this, "Restore Started");

                using (FabricClient client = new FabricClient())
                {
                    PartitionSelector ownPartition = PartitionSelector.PartitionIdOf(
                        this.ServiceInitializationParameters.ServiceName,
                        this.ServiceInitializationParameters.PartitionId);

                    await client.ServiceManager.InvokeDataLossAsync(ownPartition, DataLossMode.PartialDataLoss, CancellationToken.None);
                }
            }

            await this.CopyBackupFolderAsync(
                backupInfo.Directory,
                this.ServicePartition.PartitionInfo.Id.ToString(),
                newBackupId,
                CancellationToken.None);

            return true;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Builds a <c>RemoveReplicaAction</c> targeting the replica named by the given state
        /// transition, after tracing a description of the action and forwarding it to the
        /// report callback when one is set.
        /// </summary>
        /// <param name="ragAction">Transition describing the service, partition and replica to remove.</param>
        /// <returns>The action that removes the selected replica.</returns>
        private FabricTestAction GetRemoveReplicaAction(ReplicaStateTransitionAction ragAction)
        {
            Uri  serviceUri = ragAction.ServiceUri;
            Guid guid       = ragAction.PartitionId;
            long replicaId  = ragAction.ReplicaId;

            string report = StringHelper.Format(
                "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tReplicaId: {3}",
                ragAction.ActionType,
                serviceUri,
                guid,
                replicaId);

            // The report is traced and also handed to the optional report callback.
            Log.WriteInfo(TraceType, report);
            if (this.reportFunction != null)
            {
                this.reportFunction(report);
            }

            // Select specific replica using ReplicaSelector.
            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(serviceUri, guid), replicaId);

            return new RemoveReplicaAction(replicaSelector);
        }
        /// <summary>
        /// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
        /// The backup is first scheduled in the central backup store, after which a partition data loss
        /// operation is started for the service's own partition.
        /// </summary>
        /// <param name="service">Service whose partition is to be restored.</param>
        /// <param name="backupMetadata">Metadata identifying the backup to restore. Must not be null.</param>
        /// <param name="dataLossMode">Data loss mode passed to <c>StartPartitionDataLossAsync</c>.</param>
        /// <returns>A task that completes once the data loss operation has been started.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="backupMetadata"/> is null.</exception>
        public static async Task BeginRestoreBackup(this IBackupRestoreServiceInternal service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
        {
            // Validate before the first log line, because that line dereferences backupMetadata.
            if (backupMetadata == null)
            {
                throw new ArgumentNullException(nameof(backupMetadata));
            }

            service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");

            await service.CentralBackupStore.ScheduleBackupAsync(service.Context.PartitionId, backupMetadata.BackupId);

            var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);

            var operationId = Guid.NewGuid();

            // The client is only needed to start the operation, so release it as soon as the call returns.
            using (var fabricClient = new FabricClient(FabricClientRole.Admin))
            {
                await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
            }
            //Causes OnDataLossAsync to be called.

            service.LogCallback?.Invoke($"BackupRestoreService - Begun restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
        }
Exemplo n.º 10
0
        /// <summary>
        /// Locates the primary replica of the single partition of the single service belonging to
        /// the application of type <c>DefaultApplicationTypeName</c>, and asks the fault manager
        /// to remove it without verifying completion.
        /// </summary>
        /// <remarks>
        /// Every lookup uses <c>Single</c>, so the method throws when there is not exactly one
        /// matching application, service, partition or primary replica.
        /// </remarks>
        private void KillPrimaryReplica()
        {
            // Kill the primary
            Application application =
                _fabricClient.QueryManager.GetApplicationListAsync()
                .Result.Single(a => a.ApplicationTypeName == DefaultApplicationTypeName);
            Service service =
                _fabricClient.QueryManager.GetServiceListAsync(application.ApplicationName).Result.Single();
            Partition partition =
                _fabricClient.QueryManager.GetPartitionListAsync(service.ServiceName).Result.Single();

            // OfType skips anything that is not a stateful replica instead of yielding a null
            // that the role predicate would then dereference.
            StatefulServiceReplica primaryReplica =
                _fabricClient.QueryManager.GetReplicaListAsync(partition.PartitionInformation.Id)
                .Result.OfType<StatefulServiceReplica>()
                .Single(statefulServiceReplica => statefulServiceReplica.ReplicaRole == ReplicaRole.Primary);

            LogHelper.Log("Killing the primary replica at node {0}", primaryReplica.NodeName);

            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(service.ServiceName, partition.PartitionInformation.Id), primaryReplica.Id);

            // Block on the request like the queries above, so a failure to submit the removal
            // surfaces here instead of being lost with a discarded task.
            _fabricClient.FaultManager.RemoveReplicaAsync(replicaSelector, CompletionMode.DoNotVerify, false).Wait();
        }
Exemplo n.º 11
0
        /// <summary>
        /// Returns the <c>PartitionSelector</c> expected for the given partition case, built from
        /// the shared <c>ServiceName</c>, <c>PartitionID</c>, <c>PartitionKey</c> and
        /// <c>PartitionKeyLong</c> values.
        /// </summary>
        /// <param name="partitionCase">The partition case to build a selector for.</param>
        /// <returns>The matching selector, or <c>null</c> when the case is not one of the handled values.</returns>
        public static PartitionSelector GetExpectedPartitionSelector(PartitionCase partitionCase)
        {
            switch (partitionCase)
            {
            case PartitionCase.PartitionId:
                return PartitionSelector.PartitionIdOf(ServiceName, PartitionID);

            case PartitionCase.ServiceNameRandomPartition:
                return PartitionSelector.RandomOf(ServiceName);

            case PartitionCase.ServiceNamePartitionSingleton:
                return PartitionSelector.SingletonOf(ServiceName);

            case PartitionCase.ServiceNamePartitionNamed:
                return PartitionSelector.PartitionKeyOf(ServiceName, PartitionKey);

            case PartitionCase.ServiceNamePartitionUniformedInt:
                return PartitionSelector.PartitionKeyOf(ServiceName, PartitionKeyLong);

            default:
                // Unhandled cases yield no selector.
                return null;
            }
        }
Exemplo n.º 12
0
        /// <summary>
        /// Builds a <c>MovePrimaryAction</c> from the given state transition, after tracing a
        /// description of the action and forwarding it to the report callback when one is set.
        /// </summary>
        /// <param name="ragAction">Transition describing the service, partition and target node.</param>
        /// <returns>The action that moves the primary replica to the target node.</returns>
        private FabricTestAction GetMovePrimaryReplicaAction(MovePrimaryReplicaStateTransitionAction ragAction)
        {
            Uri targetService = ragAction.ServiceUri;
            Guid partitionId = ragAction.PartitionId;
            string destinationNode = ragAction.NodeTo;

            string description = StringHelper.Format(
                "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tTo: {3}",
                ragAction.ActionType,
                targetService,
                partitionId,
                destinationNode);

            Log.WriteInfo(TraceType, description);

            // The report callback is optional.
            this.reportFunction?.Invoke(description);

            return new MovePrimaryAction(
                destinationNode,
                PartitionSelector.PartitionIdOf(targetService, partitionId),
                ragAction.ForceMove);
        }
        /// <summary>
        /// Restores ServiceFabric backup data.
        /// When called with <see cref="EventArgs.Empty"/>, a new restore is initiated by starting a
        /// partial data loss on this partition and waiting on the semaphore.
        /// When called with a <c>RestoreEventArgs</c>, the backup is downloaded, extracted and
        /// restored through the restore context carried by the event arguments.
        /// </summary>
        /// <param name="downloader">Source from which the backup metadata blob is downloaded.</param>
        /// <param name="eventArgs"><see cref="EventArgs.Empty"/> to initiate a restore, or a <c>RestoreEventArgs</c> to perform one.</param>
        /// <returns>A task that completes when the selected path has finished.</returns>
        /// <exception cref="NotSupportedException"><paramref name="eventArgs"/> is neither of the two supported kinds.</exception>
        public async Task Restore(IPersistentDownloader downloader, EventArgs eventArgs)
        {
            await m_init;

            if (eventArgs == EventArgs.Empty)
            {
                // !Note, we disable the event handler before we trigger an active restore, otherwise
                // the active restore event will notify the backup controller, while in fact the restore
                // command is issued from the controller, and SF backup manager should do it passively.
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: initiating a new restore operation.");
                m_svc.RequestRestore -= OnServiceFabricRequestRestore;
                try
                {
                    await m_svc.FabricClient.TestManager.StartPartitionDataLossAsync(Guid.NewGuid(), PartitionSelector.PartitionIdOf(m_svc.Context.ServiceName, m_svc.Context.PartitionId), DataLossMode.PartialDataLoss);

                    await m_sem.WaitAsync();
                }
                finally
                {
                    // Re-subscribe even when starting the data loss or waiting fails; otherwise the
                    // handler would stay detached for the rest of the manager's lifetime.
                    m_svc.RequestRestore += OnServiceFabricRequestRestore;
                }
                return;
            }
            if (!(eventArgs is RestoreEventArgs rstArgs))
            {
                throw new NotSupportedException();
            }

            // Declared outside the try block so the failure path can clean them up.
            string dir   = null;
            string fname = null;
            try
            {
                var ctx = rstArgs.m_rctx;
                dir     = Path.Combine(TrinityConfig.StorageRoot, Path.GetRandomFileName());
                var dsc = new RestoreDescription(dir);
                fname   = Path.Combine(TrinityConfig.StorageRoot, Path.GetRandomFileName());
                using (var file = File.OpenWrite(fname))
                {
                    Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Downloading ServiceFabric backup data.");
                    await downloader.DownloadMetadataAsync(MetadataKey, file);
                }
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Decompressing ServiceFabric backup data.");
                FileUtility.CompletePath(dir, create_nonexistent: true);
                ZipFile.ExtractToDirectory(fname, dir);
                File.Delete(fname);
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Restoring ServiceFabric backup data.");
                await ctx.RestoreAsync(dsc);

                Directory.Delete(dir, recursive: true);
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Restored ServiceFabric backup data.");
                rstArgs.Complete();
            }
            catch (Exception ex)
            {
                // Both paths have random names and are never reused, so anything left behind
                // by a failed attempt would accumulate under the storage root.
                DeleteRestoreScratch(fname, dir);
                rstArgs.Complete(ex);
                throw;
            }
            m_sem.Release();
        }

        /// <summary>
        /// Best-effort removal of the temporary archive and extraction directory of a failed restore.
        /// Cleanup errors are logged and swallowed so they cannot mask the original failure.
        /// </summary>
        private static void DeleteRestoreScratch(string archivePath, string extractionDirectory)
        {
            try
            {
                if (archivePath != null && File.Exists(archivePath))
                {
                    File.Delete(archivePath);
                }
            }
            catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
            {
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: failed to delete temporary file: {e.Message}");
            }

            try
            {
                if (extractionDirectory != null && Directory.Exists(extractionDirectory))
                {
                    Directory.Delete(extractionDirectory, recursive: true);
                }
            }
            catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
            {
                Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: failed to delete temporary directory: {e.Message}");
            }
        }
Exemplo n.º 14
0
            /// <summary>
            /// Runs the quorum loss step: applies the unreliable transport behaviors recorded in the
            /// state, restarts the target replicas, keeps the partition in that condition for the
            /// configured duration, verifies that the partition reports quorum loss, and then removes
            /// the unreliable transport behaviors.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every delay and fabric call.</param>
            /// <param name="serviceInternalFaultInfo">Passed to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
            /// <returns>The step state, with <c>CompletedSuccessfully</c> pushed onto its progress stack.</returns>
            /// <exception cref="FabricTransientException">The partition was never observed in quorum loss within the retry budget.</exception>
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeQuorumLossState state = Convert(this.State);

                Guid partitionId = state.Info.PartitionId;
                List <Tuple <string, string> > unreliableTransportInfo = state.Info.UnreliableTransportInfo;
                List <long> targetReplicas = state.Info.ReplicaIds;

                var unreliableTransportTaskList = new List <Task>();

                // Each entry is a (node name, behavior name) pair; the behavior is limited to this partition.
                foreach (Tuple <string, string> ut in unreliableTransportInfo)
                {
                    string nodeName     = ut.Item1;
                    string behaviorName = ut.Item2;

                    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
                    behavior.AddFilterForPartitionId(partitionId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

                    unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                                        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                                            nodeName,
                                                            behaviorName,
                                                            behavior,
                                                            this.RequestTimeout,
                                                            cancellationToken),
                                                        this.OperationTimeout,
                                                        cancellationToken));
                }

                await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false);

                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                List <Task> tasks = new List <Task>();

                foreach (long replicaId in targetReplicas)
                {
                    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);
                    Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken);
                    tasks.Add(task);
                }

                await Task.WhenAll(tasks).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration);
                await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                bool conditionSatisfied = false;

                int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount;

                // Poll the partition status; the body runs once plus up to quorumLossCheckRetries more times.
                do
                {
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            if (partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss)
                            {
                                conditionSatisfied = true;
                                break;
                            }
                        }
                    }

                    await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                }while (!conditionSatisfied && quorumLossCheckRetries-- > 0);

                if (!conditionSatisfied)
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }
Exemplo n.º 15
0
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

                var helper = new TimeoutHelper(action.ActionTimeout);

                // get info about the service so we can check type and trss
                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        action.PartitionSelector.ServiceName,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
                }

                StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;

                ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

                if (!statefulServiceDescription.HasPersistedState)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only"));
                }

                // figure out /which/ partition to select
                var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
                {
                    RequestTimeout = action.RequestTimeout,
                    ActionTimeout  = helper.GetRemainingTime()
                };

                await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken);

                Guid partitionId = getPartitionStateAction.Result.PartitionId;

                // get data about replicas in that partition
                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                var removeUTRequestList = new List <Tuple <string, string> >();
                Dictionary <Tuple <string, string>, Task> removeUTTaskDictionary = new Dictionary <Tuple <string, string>, Task>();

                try
                {
                    var  stableReplicas                  = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
                    var  stableReplicasToRemove          = new List <StatefulServiceReplica>();
                    long replicasToRestartWithoutPrimary =
                        action.QuorumLossMode == QuorumLossMode.AllReplicas
                            ? stableReplicas.Length - 1
                            : FabricCluster.GetWriteQuorumSize(replicasResult.Count);
                    foreach (var replica in stableReplicas)
                    {
                        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                        if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                        {
                            replicasToRestartWithoutPrimary--;
                        }

                        if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                        {
                            stableReplicasToRemove.Add(statefulReplica);
                        }
                    }

                    // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
                    var utTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        string nodeName = statefulReplica.NodeName;
                        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
                        behavior.AddFilterForPartitionId(partitionId);
                        string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

                        removeUTRequestList.Add(new Tuple <string, string>(nodeName, behaviorName));
                        utTaskList.Add(
                            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () =>
                                testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                    nodeName,
                                    behaviorName,
                                    behavior,
                                    action.RequestTimeout,
                                    cancellationToken),
                                helper.GetRemainingTime(),
                                cancellationToken));
                    }

                    await Task.WhenAll(utTaskList).ConfigureAwait(false);

                    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);

                    var restartReplicaTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id);

                        var restartReplicaAction = new RestartReplicaAction(replicaSelector)
                        {
                            CompletionMode = CompletionMode.DoNotVerify,
                            RequestTimeout = action.RequestTimeout,
                            ActionTimeout  = helper.GetRemainingTime()
                        };

                        restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
                    }

                    await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

                    await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                    // validate
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                            action.PartitionSelector.ServiceName,
                            null,
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                            break;
                        }
                    }

                    foreach (var removeUTParams in removeUTRequestList)
                    {
                        var  currentParams = removeUTParams;
                        Task task          = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                currentParams.Item1,  /*nodeName*/
                                currentParams.Item2,  /*behaviorName*/
                                action.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                            helper.GetRemainingTime(),
                            cancellationToken);

                        removeUTTaskDictionary[currentParams] = task;
                    }

                    await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

                    // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);
                }
                finally
                {
                    var removeUTTaskList = new List <Task>();

                    foreach (var removeUTRequest in removeUTTaskDictionary)
                    {
                        var currentRemoveUTRequest = removeUTRequest;
                        if (currentRemoveUTRequest.Value == null || currentRemoveUTRequest.Value.IsFaulted)
                        {
                            removeUTTaskList.Add(
                                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                    () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                        currentRemoveUTRequest.Key.Item1, /*nodeName*/
                                        currentRemoveUTRequest.Key.Item2, /*behaviorName*/
                                        action.RequestTimeout,
                                        cancellationToken),
                                    FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                                    helper.GetRemainingTime(),
                                    cancellationToken));
                        }
                    }

                    Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

                    // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                    Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
                }

                action.Result          = new InvokeQuorumLossResult(getPartitionStateAction.Result);
                this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
            }
        /// <summary>
        /// Builds a <see cref="ReplicaSelector"/> by looking for well-known keywords inside
        /// <paramref name="partitionSetName"/>: first a partition keyword, then a replica keyword.
        /// </summary>
        /// <param name="partitionSetName">
        /// Name that is searched for the partition keywords "PartitionId", "PartitionSingleton", "PartitionNamed"
        /// and "PartitionUniformedInt", and for the replica keywords "ReplicaPrimary", "ReplicaRandomSecondary"
        /// and "ReplicaId". Keywords are checked in that order and the first match wins.
        /// </param>
        /// <param name="partitionId">Partition id used when the name contains "PartitionId".</param>
        /// <param name="serviceName">Service name passed to every partition selector factory.</param>
        /// <param name="partitionKey">
        /// Partition key used for "PartitionNamed" (as a string) and "PartitionUniformedInt" (parsed as a long).
        /// </param>
        /// <param name="replicaOrInstanceId">Id used when the name contains "ReplicaId"; 0 is used when it is null.</param>
        /// <returns>The replica selector that matches the keywords found in the name.</returns>
        /// <exception cref="ArgumentException">
        /// The partition key cannot be parsed as a long for "PartitionUniformedInt", or no partition selector
        /// or no replica selector could be determined from the name.
        /// </exception>
        internal static ReplicaSelector GetReplicaSelector(string partitionSetName, Guid partitionId, Uri serviceName, string partitionKey, long? replicaOrInstanceId)
        {
            // Step 1: resolve the partition part of the name.
            PartitionSelector targetPartition = null;

            if (partitionSetName.Contains("PartitionId"))
            {
                targetPartition = PartitionSelector.PartitionIdOf(serviceName, partitionId);
            }
            else if (partitionSetName.Contains("PartitionSingleton"))
            {
                targetPartition = PartitionSelector.SingletonOf(serviceName);
            }
            else if (partitionSetName.Contains("PartitionNamed"))
            {
                targetPartition = PartitionSelector.PartitionKeyOf(serviceName, partitionKey);
            }
            else if (partitionSetName.Contains("PartitionUniformedInt"))
            {
                long numericKey;
                bool parsed = long.TryParse(partitionKey, out numericKey);
                if (!parsed)
                {
                    throw new ArgumentException(StringResources.Error_InvalidPartitionKey);
                }

                targetPartition = PartitionSelector.PartitionKeyOf(serviceName, numericKey);
            }
            else if (!partitionSetName.Contains("Partition"))
            {
                // No partition keyword at all: fall back to a random partition of the service.
                targetPartition = PartitionSelector.RandomOf(serviceName);
            }

            // A name that mentions "Partition" without a recognized keyword leaves the selector unset.
            if (targetPartition == null)
            {
                throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector);
            }

            // Step 2: resolve the replica part of the name on top of the chosen partition.
            ReplicaSelector targetReplica = null;

            if (partitionSetName.Contains("ReplicaPrimary"))
            {
                targetReplica = ReplicaSelector.PrimaryOf(targetPartition);
            }
            else if (partitionSetName.Contains("ReplicaRandomSecondary"))
            {
                targetReplica = ReplicaSelector.RandomSecondaryOf(targetPartition);
            }
            else if (partitionSetName.Contains("ReplicaId"))
            {
                long replicaId = replicaOrInstanceId.GetValueOrDefault();
                targetReplica = ReplicaSelector.ReplicaIdOf(targetPartition, replicaId);
            }
            else if (!partitionSetName.Contains("Replica"))
            {
                // No replica keyword at all: fall back to a random replica of the partition.
                targetReplica = ReplicaSelector.RandomOf(targetPartition);
            }

            if (targetReplica != null)
            {
                return targetReplica;
            }

            throw new ArgumentException(StringResources.Error_CouldNotParseReplicaSelector);
        }
Exemplo n.º 17
0
        /// <summary>
        /// This API supports the Service Fabric platform and is not meant to be called from your code.
        /// Retrieves the description of the target service, validates the service, selects one partition and
        /// then repeatedly induces faults on that partition until the configured time to run has elapsed or
        /// cancellation has been requested on <paramref name="token"/>.
        /// </summary>
        /// <param name="token">
        /// This API supports the Service Fabric platform and is not meant to be called from your code.
        /// The token is passed to every asynchronous call and is also checked before each loop iteration.
        /// </param>
        /// <returns>A task that completes once the fault loop has ended.</returns>
        protected override async Task OnExecuteAsync(CancellationToken token)
        {
            this.serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                    this.failoverTestScenarioParameters.PartitionSelector.ServiceName,
                    this.failoverTestScenarioParameters.RequestTimeout,
                    token),
                this.failoverTestScenarioParameters.OperationTimeout,
                token).ConfigureAwait(false);

            bool hasPersistedState = false;

            if (this.serviceDescription.IsStateful())
            {
                StatefulServiceDescription statefulDescription = this.serviceDescription as StatefulServiceDescription;
                ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
                hasPersistedState = statefulDescription.HasPersistedState;
            }

            Log.WriteInfo(TraceType, "Validating Service health and availability");
            await this.FabricClient.TestManager.ValidateServiceAsync(
                this.failoverTestScenarioParameters.PartitionSelector.ServiceName,
                this.failoverTestScenarioParameters.MaxServiceStabilizationTimeout,
                token).ConfigureAwait(false);

            Log.WriteInfo(TraceType, "Getting Selected Partition");
            var getPartitionStateAction = new GetSelectedPartitionStateAction(this.failoverTestScenarioParameters.PartitionSelector)
            {
                RequestTimeout = this.failoverTestScenarioParameters.RequestTimeout,
                ActionTimeout  = this.failoverTestScenarioParameters.OperationTimeout
            };

            await this.TestContext.ActionExecutor.RunAsync(getPartitionStateAction, token).ConfigureAwait(false);

            Guid selectedPartitionId = getPartitionStateAction.Result.PartitionId;

            Log.WriteInfo(TraceType, "Running test for partition {0}", selectedPartitionId);

            this.ReportProgress("Selected partition {0} for testing failover", selectedPartitionId);

            // Pin every fault to the concrete partition that was selected above.
            PartitionSelector selectedPartition = PartitionSelector.PartitionIdOf(this.failoverTestScenarioParameters.PartitionSelector.ServiceName, selectedPartitionId);

            // Each iteration runs the complete fault sequence once; the loop repeats it for the configured duration.
            while (this.failoverTestScenarioParameters.TimeToRun - this.GetElapsedTime() > TimeSpan.Zero && !token.IsCancellationRequested)
            {
                if (this.serviceDescription.IsStateful())
                {
                    ReplicaSelector primaryReplicaSelector   = ReplicaSelector.PrimaryOf(selectedPartition);
                    ReplicaSelector secondaryReplicaSelector = ReplicaSelector.RandomSecondaryOf(selectedPartition);

                    // Make Primary go through RemoveReplica, RestartReplica and RestartCodePackage

                    await this.TestReplicaFaultsAsync(primaryReplicaSelector, "Primary", hasPersistedState, token).ConfigureAwait(false);

                    // Make Secondary go through RemoveReplica, RestartReplica and RestartCodePackage

                    await this.TestReplicaFaultsAsync(secondaryReplicaSelector, "Secondary", hasPersistedState, token).ConfigureAwait(false);

                    // Restart all secondary replicas and make sure the replica set recovers

                    await this.InvokeAndValidateFaultAsync(
                        "Restarting all the secondary replicas",
                        () =>
                    {
#pragma warning disable 618
                        return(this.FabricClient.TestManager.RestartPartitionAsync(
                                   selectedPartition,
                                   RestartPartitionMode.OnlyActiveSecondaries,
                                   this.failoverTestScenarioParameters.OperationTimeout,
                                   token));

#pragma warning restore 618
                    }, token).ConfigureAwait(false);

                    // Restart all replicas if service is persisted

                    if (hasPersistedState)
                    {
                        await this.InvokeAndValidateFaultAsync(
                            "Restarting all replicas including Primary",
                            () =>
                        {
#pragma warning disable 618
                            return(this.FabricClient.TestManager.RestartPartitionAsync(
                                       selectedPartition,
                                       RestartPartitionMode.AllReplicasOrInstances,
                                       this.failoverTestScenarioParameters.OperationTimeout,
                                       token));

#pragma warning restore 618
                        }, token).ConfigureAwait(false);
                    }

                    // Move the primary once per iteration

                    await this.InvokeAndValidateFaultAsync(
                        "Move Primary to a different node",
                        () =>
                    {
                        return(this.FabricClient.FaultManager.MovePrimaryAsync(
                                   string.Empty,
                                   selectedPartition,
                                   true,
                                   this.failoverTestScenarioParameters.OperationTimeout,
                                   token));
                    }, token).ConfigureAwait(false);

                    // Move a secondary once per iteration

                    await this.InvokeAndValidateFaultAsync(
                        "Move Secondary to a different node",
                        () =>
                    {
                        return(this.FabricClient.FaultManager.MoveSecondaryAsync(
                                   string.Empty,
                                   string.Empty,
                                   selectedPartition,
                                   true,
                                   this.failoverTestScenarioParameters.OperationTimeout,
                                   token));
                    }, token).ConfigureAwait(false);
                }
                else
                {
                    ReplicaSelector randomInstanceSelector = ReplicaSelector.RandomOf(selectedPartition);

                    // Make Stateless Instance go through RemoveReplica, RestartReplica and RestartCodePackage

                    await this.TestReplicaFaultsAsync(randomInstanceSelector, "Stateless Instance", hasPersistedState, token).ConfigureAwait(false);

                    // Restart all stateless instances

                    await this.InvokeAndValidateFaultAsync(
                        "Restarting all stateless instances for partition",
                        () =>
                    {
#pragma warning disable 618
                        return(this.FabricClient.TestManager.RestartPartitionAsync(
                                   selectedPartition,
                                   RestartPartitionMode.AllReplicasOrInstances,
                                   this.failoverTestScenarioParameters.OperationTimeout,
                                   token));

#pragma warning restore 618
                    }, token).ConfigureAwait(false);
                }
            }
        }
        /// <summary>
        /// Interactive console loop of the demo client. Creates one service proxy for partition key -1 and one
        /// for partition key 1, then repeatedly shows a menu and runs the selected operation against the
        /// currently selected proxy. Any key that is not a menu option ends the loop.
        /// </summary>
        /// <returns>A task that completes when the user leaves the menu.</returns>
        private static async Task MainAsync()
        {
            Console.WriteLine("Waiting for services....");

            var proxyPartitionOne = await CreateProxyAsync(-1L);

            var proxyPartitionTwo = await CreateProxyAsync(1L);

            // Partition one is the default target until the user picks another one (menu option 0).
            var proxy = proxyPartitionOne;

            Console.WriteLine("Waited for services..");


            while (true)
            {
                Console.WriteLine($"Press any key to continue");
                Console.ReadKey(true);
                Console.Clear();

                Console.WriteLine("Press 0 to select target partition");
                Console.WriteLine("Press 1 to get state");
                Console.WriteLine("Press 2 to set state");
                Console.WriteLine("Press 3 to create a backup");
                Console.WriteLine("Press 4 to restore a backup");
                Console.WriteLine("Press 5 to list all central backups");
                Console.WriteLine("Press 6 to list the current Service Partition Ids");
                Console.WriteLine("Press 7 to invoke full dataloss on one of the current Service's Partitions");
                Console.WriteLine("Other key to exit");

                var    key = Console.ReadKey(true);
                string input;

                switch (key.Key)
                {
                case ConsoleKey.D0:
                    Console.WriteLine("Type 1 for partition one, or 2 for partition two");
                    key = Console.ReadKey(true);
                    if (ConsoleKey.D2 == key.Key)
                    {
                        proxy = proxyPartitionTwo;
                        Console.WriteLine("Using partition two.");
                    }
                    else
                    {
                        proxy = proxyPartitionOne;
                        Console.WriteLine("Using partition one.");
                    }
                    break;

                case ConsoleKey.D1:
                    string state = await proxy.GetState();

                    Console.WriteLine($"State: '{state}'");
                    break;

                case ConsoleKey.D2:
                    Console.WriteLine("Enter string to store as state:");
                    input = Console.ReadLine();
                    await proxy.SetState(input ?? "");

                    Console.WriteLine($"State saved: '{input}'");
                    break;

                case ConsoleKey.D3:
                    Console.WriteLine("Type 1 for full backup or 2 for incremental backup (incremental requires full backup to exist)");
                    key = Console.ReadKey(true);
                    if (ConsoleKey.D1 == key.Key)
                    {
                        Console.WriteLine("Creating a full backup asynchronously...");
                        await proxy.BeginCreateBackup(BackupOption.Full);
                    }
                    else
                    {
                        Console.WriteLine("Creating an incremental backup asynchronously...");
                        await proxy.BeginCreateBackup(BackupOption.Incremental);
                    }

                    break;

                case ConsoleKey.D4:
                    Console.WriteLine($"Starting the restore of a backup");
                    Console.WriteLine($"Enter central backup id (guid):");
                    input = Console.ReadLine();

                    // Validate the input before calling the service, and tell the user when it is unusable.
                    Guid index;
                    if (!Guid.TryParse(input, out index))
                    {
                        Console.WriteLine($"'{input}' is not a valid backup id (guid).");
                        break;
                    }

                    // SingleOrDefault instead of Single: an unknown id must not throw and end the menu loop.
                    var backup = (await proxy.ListAllBackups()).SingleOrDefault(b => b.BackupId == index);
                    if (backup == null)
                    {
                        Console.WriteLine($"No central backup found with id {index}.");
                        break;
                    }

                    DataLossMode lossMode = DataLossMode.FullDataLoss;
                    Console.WriteLine("Type 1 for full data loss or 2 for partial data loss.");

                    // Any key other than 1 selects partial data loss.
                    key = Console.ReadKey(true);
                    if (ConsoleKey.D1 == key.Key)
                    {
                        Console.WriteLine("Restoring backup with full data loss asynchronously...");
                    }
                    else
                    {
                        Console.WriteLine("Restoring backup with partial data loss asynchronously...");
                        lossMode = DataLossMode.PartialDataLoss;
                    }

                    await proxy.BeginRestoreBackup(backup, lossMode);

                    Console.WriteLine($"Restore is active. This will take some time. Check progress in SF explorer.");
                    break;

                case ConsoleKey.D5:
                    Console.WriteLine($"List all central backups");
                    var list = await proxy.ListAllBackups();

                    Console.WriteLine($"Original partition\t\t\tBackup Id\t\t\t\tBackup Type\tTimestamp UTC");
                    Console.WriteLine(string.Join(Environment.NewLine, list.Select(data => $"{data.OriginalServicePartitionId}\t{data.BackupId}\t{data.BackupOption}\t\t{data.TimeStampUtc}")));
                    break;

                case ConsoleKey.D6:
                    var resolver = ServicePartitionResolver.GetDefault();
                    var resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(-1L), CancellationToken.None);

                    Console.WriteLine($"Partition key -1L resolves to partition {resolved.Info.Id}");
                    resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(1L), CancellationToken.None);

                    Console.WriteLine($"Partition key 1L resolves to partition {resolved.Info.Id}");

                    if (proxy == proxyPartitionOne)
                    {
                        Console.WriteLine("Using partition one (-1L)");
                    }
                    else
                    {
                        Console.WriteLine("Using partition two (1L)");
                    }
                    break;

                case ConsoleKey.D7:
                    Console.WriteLine("Enter partitionID");
                    string partitionString = Console.ReadLine();
                    if (Guid.TryParse(partitionString, out Guid partitionID))
                    {
                        var partitionSelector = PartitionSelector.PartitionIdOf(ServiceUri, partitionID);

                        // FabricClient is disposable; release it as soon as the request has been issued.
                        using (var fabricClient = new FabricClient(FabricClientRole.Admin))
                        {
                            await fabricClient.TestManager.StartPartitionDataLossAsync(Guid.NewGuid(), partitionSelector, DataLossMode.FullDataLoss);
                        }
                    }
                    else
                    {
                        Console.WriteLine($"'{partitionString}' is not a valid partition id (guid).");
                    }
                    break;

                default:
                    return;
                }
            }
        }