/// <summary>
/// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
/// The backup is retrieved from the central store.
/// This method completes and returns before the backup restore process is completely done.
/// </summary>
/// <param name="service">Service whose partition (<c>service.Context.PartitionId</c>) is restored; must not be null.</param>
/// <param name="backupMetadata">Metadata identifying the backup to restore; must not be null.</param>
/// <param name="dataLossMode">Data loss mode forwarded to <c>StartPartitionDataLossAsync</c>.</param>
/// <returns>A task that completes once the restore has been scheduled and the data loss operation has been started.</returns>
/// <exception cref="ArgumentNullException"><paramref name="service"/> or <paramref name="backupMetadata"/> is null.</exception>
/// <exception cref="Exception">Scheduling the restore or starting data loss failed; the cause is available as the inner exception.</exception>
public static async Task BeginRestoreBackup(this IBackupRestoreServiceOperations service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
{
    // Validate before the first log line: that line dereferences both arguments.
    if (service == null)
    {
        throw new ArgumentNullException(nameof(service));
    }

    if (backupMetadata == null)
    {
        throw new ArgumentNullException(nameof(backupMetadata));
    }

    service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");

    try
    {
        await service.CentralBackupStore.ScheduleBackupRestoreAsync(service.Context.PartitionId, backupMetadata.BackupId);

        var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);
        var operationId = Guid.NewGuid();

        // FabricClient is disposable; release it as soon as the request has been issued.
        using (var fabricClient = new FabricClient(FabricClientRole.Admin))
        {
            await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
            //Causes OnDataLossAsync to be called later on.
        }
    }
    catch (Exception ex)
    {
        string message = $"Failed to restore backup for partition {service.Context.PartitionId}";
        service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} failed for partition: {service.Context.PartitionId}. Message:{message} - Error: {ex.Message}");
        throw new Exception(message, ex);
    }

    service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} succeeded {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
}
/// <summary>
/// Builds the test action that moves a secondary replica as described by <paramref name="ragAction"/>.
/// </summary>
/// <param name="ragAction">Transition naming the service, partition, source node and target node; must not be null.</param>
/// <returns>A <c>MoveSecondaryAction</c> targeting the partition identified by the transition.</returns>
private FabricTestAction GetMoveSecondaryReplicaAction(MoveSecondaryReplicaStateTransitionAction ragAction)
{
    Requires.Argument("moveSecondaryReplicaStateTransitionAction", ragAction).NotNull();

    Uri targetService = ragAction.ServiceUri;
    Guid targetPartition = ragAction.PartitionId;
    string sourceNode = ragAction.NodeFrom;
    string destinationNode = ragAction.NodeTo;

    // The same description goes to the trace log and, when one is registered, to the report callback.
    string description = StringHelper.Format(
        "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tFrom: {3} To: {4}",
        ragAction.ActionType,
        targetService,
        targetPartition,
        sourceNode,
        destinationNode);

    Log.WriteInfo(TraceType, description);

    if (this.reportFunction != null)
    {
        this.reportFunction(description);
    }

    return new MoveSecondaryAction(
        sourceNode,
        destinationNode,
        PartitionSelector.PartitionIdOf(targetService, targetPartition),
        ragAction.ForceMove);
}
/// <summary>
/// Creates the <c>PartitionSelector</c> that corresponds to the current <c>ParameterSetName</c>.
/// </summary>
/// <returns>A selector built from <c>ServiceName</c> and, depending on the parameter set, <c>PartitionId</c> or <c>PartitionKey</c>.</returns>
/// <exception cref="ArgumentException">
/// The parameter set name is not one of the handled names, or the partition key cannot be parsed as a
/// <see cref="long"/> for the "ServiceNamePartitionUniformedInt" set.
/// </exception>
protected PartitionSelector GetPartitionSelector()
{
    switch (this.ParameterSetName)
    {
        case "PartitionId":
            return PartitionSelector.PartitionIdOf(this.ServiceName, this.PartitionId);

        case "ServiceNameRandomPartition":
            return PartitionSelector.RandomOf(this.ServiceName);

        case "ServiceNamePartitionSingleton":
            return PartitionSelector.SingletonOf(this.ServiceName);

        case "ServiceNamePartitionNamed":
            return PartitionSelector.PartitionKeyOf(this.ServiceName, this.PartitionKey);

        case "ServiceNamePartitionUniformedInt":
        {
            // The key arrives as text; the integer-keyed selector needs it as a long.
            long numericKey;
            bool parsed = long.TryParse(this.PartitionKey, out numericKey);
            if (!parsed)
            {
                throw new ArgumentException(StringResources.Error_InvalidPartitionKey);
            }

            return PartitionSelector.PartitionKeyOf(this.ServiceName, numericKey);
        }

        default:
            throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector);
    }
}
/// <summary>
/// Records which backup set should be restored and then starts a full data loss operation on this
/// service's partition.
/// </summary>
/// <param name="nameOfBackupSet">Name of the backup set, written to the file store via <c>WriteRestoreInformation</c>.</param>
/// <returns>A task that completes when the data loss request has been issued.</returns>
public async Task RestoreServiceAsync(string nameOfBackupSet)
{
    // Persist the restore target first, before data loss is triggered.
    this._fileStore.WriteRestoreInformation(nameOfBackupSet);

    var partitionSelector = PartitionSelector.PartitionIdOf(this.Context.ServiceName, this.Context.PartitionId);
    var operationId = Guid.NewGuid();

    // FabricClient is disposable; release it as soon as the request has been issued.
    using (var fabricClient = new FabricClient(FabricClientRole.Admin))
    {
        await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, DataLossMode.FullDataLoss);
    }
}
/// <summary>
/// Starts a partial data loss operation on the given partition, going through <c>InvokeWithRetryAsync</c>.
/// </summary>
/// <param name="dataLossGuid">Operation id passed to <c>StartPartitionDataLossAsync</c>.</param>
/// <param name="serviceNameUri">Custom service URI string; converted with <c>UtilityHelper.GetUriFromCustomUri</c>.</param>
/// <param name="partitionId">Partition id in a format accepted by <see cref="Guid.Parse(string)"/>.</param>
/// <param name="timeout">NOTE(review): not used anywhere in this method — confirm whether it should be forwarded.</param>
/// <returns>A task that completes when the retried invocation completes.</returns>
internal static async Task InitiatePartitionDataLoss(Guid dataLossGuid, string serviceNameUri, string partitionId, TimeSpan timeout)
{
    await InvokeWithRetryAsync(() =>
    {
        var testManager = FabricClient.TestManager;

        // The selector is rebuilt on every invocation of this delegate.
        var serviceUri = new Uri(UtilityHelper.GetUriFromCustomUri(serviceNameUri));
        var targetPartition = PartitionSelector.PartitionIdOf(serviceUri, Guid.Parse(partitionId));

        return testManager.StartPartitionDataLossAsync(dataLossGuid, targetPartition, DataLossMode.PartialDataLoss);
    });
}
/// <summary>
/// Backup callback: updates the backup counter, periodically prunes stored backups, occasionally
/// invokes a simulated data loss, and uploads the backup folder to the backup store.
/// </summary>
/// <param name="backupInfo">Backup description; its <c>Directory</c> is the folder that gets uploaded.</param>
/// <returns>Always <c>true</c>, including when the upload fails (the failure is only logged).</returns>
private async Task<bool> BackupCallbackAzureAsync(BackupInfo backupInfo)
{
    string backupId = Guid.NewGuid().ToString();
    CancellationToken cancellationToken = default(CancellationToken);

    long totalBackupCount;

    IReliableDictionary<int, long> countDictionary =
        await this.StateManager.GetOrAddAsync<IReliableDictionary<int, long>>(this.countDictionaryName);

    using (ITransaction txn = this.StateManager.CreateTransaction())
    {
        // Key 0 holds the running count: it starts at 0 and is incremented on every later backup.
        long count = await countDictionary.AddOrUpdateAsync(txn, 0, 0, (key, oldValue) => { return oldValue + 1; });

        totalBackupCount = count;
        await txn.CommitAsync();
    }

    ServiceEventSource.Current.Message("Backup count dictionary updated: " + totalBackupCount);

    if ((totalBackupCount % 10) == 0)
    {
        //Store no more than 10 backups at a time - the actual max might be a bit more than 10 since more backups could have been created when deletion was taking place. Keeps behind 5 backups.
        await this.backupStore.DeleteBackupsAzureAsync(cancellationToken);
    }

    if ((totalBackupCount > 10) && (DateTime.Now.Second % 20) == 0)
    {
        //Let's simulate a data loss every time the time is a multiple of 20 seconds, and a backup just completed.
        ServiceEventSource.Current.ServiceMessage(this, "Restore Started");

        using (FabricClient fabricClient = new FabricClient())
        {
            PartitionSelector partitionSelector = PartitionSelector.PartitionIdOf(
                this.ServiceInitializationParameters.ServiceName,
                this.ServiceInitializationParameters.PartitionId);

            await fabricClient.ServiceManager.InvokeDataLossAsync(partitionSelector, DataLossMode.PartialDataLoss, cancellationToken);
        }
    }

    ServiceEventSource.Current.Message("Backing up from directory, ID  : " + backupInfo.Directory + " *** " + backupId);
    try
    {
        await this.backupStore.UploadBackupFolderAsync(backupInfo.Directory, backupId, CancellationToken.None);
    }
    catch (Exception e)
    {
        // Pass the exception details as format arguments. Concatenating them into the format string
        // left "{0} {1}" without arguments and let braces in the exception text be read as placeholders.
        ServiceEventSource.Current.ServiceMessage(this, "Uploading to backup folder failed: {0} {1}", e.GetType(), e.Message);
    }

    return true;
}
/// <summary>
/// Backup callback: increments the backup counter, prunes local backups every 20th backup,
/// occasionally invokes a simulated data loss, and copies the backup folder into the backup store.
/// </summary>
/// <param name="backupInfo">Backup description; its <c>Directory</c> is the folder that gets copied.</param>
/// <returns>Always <c>true</c>.</returns>
private async Task<bool> BackupCallbackAsync(BackupInfo backupInfo)
{
    string backupId = Guid.NewGuid().ToString();

    IReliableDictionary<int, long> backupCounter =
        await this.StateManager.GetOrAddAsync<IReliableDictionary<int, long>>(this.countDictionaryName);

    long totalBackupCount;
    using (ITransaction tx = this.StateManager.CreateTransaction())
    {
        // Key 0 holds the running count: it starts at 0 and is incremented on every later backup.
        totalBackupCount = await backupCounter.AddOrUpdateAsync(tx, 0, 0, (key, previous) => previous + 1);
        await tx.CommitAsync();
    }

    ServiceEventSource.Current.ServiceMessage(this, "Backup count dictionary updated: " + totalBackupCount);
    ServiceEventSource.Current.Message("Backup count dictionary updated: " + totalBackupCount);

    bool isPruningRound = (totalBackupCount % 20) == 0;
    if (isPruningRound)
    {
        //The following limits the number of backups stored to 20 per partition. The actual max might be more than 20 per partition since more backups
        //could have been created when deletion was taking place.
        //Also depending on the backup that was restored, the count of backups could be a lot larger.
        string partitionBackupFolder = Path.Combine(this.localBackupStore, this.ServicePartition.PartitionInfo.Id.ToString());
        this.DeleteBackups(partitionBackupFolder, 5);
    }

    //Simulate a restore/data loss event randomly. This assumes that all partitions have some state at this point.
    //Five inventory items must be added for all five partitions to have state.
    if (totalBackupCount > 19 && DateTime.Now.Second % 20 == 0)
    {
        CancellationToken dataLossToken = default(CancellationToken);

        ServiceEventSource.Current.ServiceMessage(this, "Restore Started");

        using (FabricClient client = new FabricClient())
        {
            PartitionSelector thisPartition = PartitionSelector.PartitionIdOf(
                this.ServiceInitializationParameters.ServiceName,
                this.ServiceInitializationParameters.PartitionId);

            await client.ServiceManager.InvokeDataLossAsync(thisPartition, DataLossMode.PartialDataLoss, dataLossToken);
        }
    }

    await this.CopyBackupFolderAsync(
        backupInfo.Directory,
        this.ServicePartition.PartitionInfo.Id.ToString(),
        backupId,
        CancellationToken.None);

    return true;
}
/// <summary>
/// Builds the test action that removes the replica described by <paramref name="ragAction"/>.
/// </summary>
/// <param name="ragAction">Transition naming the service, partition and replica id to remove.</param>
/// <returns>A <c>RemoveReplicaAction</c> bound to a selector for that specific replica.</returns>
private FabricTestAction GetRemoveReplicaAction(ReplicaStateTransitionAction ragAction)
{
    Uri targetService = ragAction.ServiceUri;
    Guid targetPartition = ragAction.PartitionId;
    long targetReplica = ragAction.ReplicaId;

    // NOTE(review): this text is built but never logged or reported within this method — confirm whether it should be emitted.
    string description = StringHelper.Format(
        "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tReplicaId: {3}",
        ragAction.ActionType,
        ragAction.ServiceUri,
        ragAction.PartitionId,
        ragAction.ReplicaId);

    // Select the specific replica: first its partition, then the replica id within it.
    PartitionSelector owningPartition = PartitionSelector.PartitionIdOf(targetService, targetPartition);
    ReplicaSelector replicaToRemove = ReplicaSelector.ReplicaIdOf(owningPartition, targetReplica);

    return new RemoveReplicaAction(replicaToRemove);
}
/// <summary>
/// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
/// The backup is retrieved from the central store.
/// </summary>
/// <param name="service">Service whose partition (<c>service.Context.PartitionId</c>) is restored; must not be null.</param>
/// <param name="backupMetadata">Metadata identifying the backup to restore; must not be null.</param>
/// <param name="dataLossMode">Data loss mode forwarded to <c>StartPartitionDataLossAsync</c>.</param>
/// <returns>A task that completes once the backup has been scheduled and the data loss operation has been started.</returns>
/// <exception cref="ArgumentNullException"><paramref name="service"/> or <paramref name="backupMetadata"/> is null.</exception>
public static async Task BeginRestoreBackup(this IBackupRestoreServiceInternal service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
{
    // Validate before the first log line: that line dereferences both arguments.
    if (service == null)
    {
        throw new ArgumentNullException(nameof(service));
    }

    if (backupMetadata == null)
    {
        throw new ArgumentNullException(nameof(backupMetadata));
    }

    service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");

    await service.CentralBackupStore.ScheduleBackupAsync(service.Context.PartitionId, backupMetadata.BackupId);

    var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);
    var operationId = Guid.NewGuid();

    // FabricClient is disposable; release it as soon as the request has been issued.
    using (var fabricClient = new FabricClient(FabricClientRole.Admin))
    {
        await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
        //Causes OnDataLossAsync to be called.
    }

    service.LogCallback?.Invoke($"BackupRestoreService - Begun restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
}
/// <summary>
/// Finds the primary replica of the single partition of the single service of the application whose
/// type name is <c>DefaultApplicationTypeName</c>, and requests its removal.
/// </summary>
/// <remarks>
/// Blocks on the Fabric query and fault calls. Each <c>Single</c> call throws
/// <see cref="InvalidOperationException"/> when the expected "exactly one" does not hold.
/// </remarks>
private void KillPrimaryReplica()
{
    // Kill the primary
    Application application = _fabricClient.QueryManager.GetApplicationListAsync()
        .Result.Single(a => a.ApplicationTypeName == DefaultApplicationTypeName);
    Service service = _fabricClient.QueryManager.GetServiceListAsync(application.ApplicationName).Result.Single();
    Partition partition = _fabricClient.QueryManager.GetPartitionListAsync(service.ServiceName).Result.Single();

    // OfType skips anything that is not a stateful replica, so the predicate never sees null.
    StatefulServiceReplica primaryReplica =
        _fabricClient.QueryManager.GetReplicaListAsync(partition.PartitionInformation.Id)
            .Result.OfType<StatefulServiceReplica>()
            .Single(statefulServiceReplica => statefulServiceReplica.ReplicaRole == ReplicaRole.Primary);

    LogHelper.Log("Killing the primary replica at node {0}", primaryReplica.NodeName);

    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(
        PartitionSelector.PartitionIdOf(service.ServiceName, partition.PartitionInformation.Id),
        primaryReplica.Id);

    // Observe the task so a failed removal request surfaces here instead of being silently dropped.
    _fabricClient.FaultManager.RemoveReplicaAsync(replicaSelector, CompletionMode.DoNotVerify, false).Wait();
}
/// <summary>
/// Returns the partition selector expected for the given <paramref name="partitionCase"/>,
/// built from the <c>ServiceName</c>, <c>PartitionID</c>, <c>PartitionKey</c> and <c>PartitionKeyLong</c> members.
/// </summary>
/// <param name="partitionCase">Which kind of selector to build.</param>
/// <returns>The matching selector, or <c>null</c> when <paramref name="partitionCase"/> is not one of the handled cases.</returns>
public static PartitionSelector GetExpectedPartitionSelector(PartitionCase partitionCase)
{
    switch (partitionCase)
    {
        case PartitionCase.PartitionId:
            return PartitionSelector.PartitionIdOf(ServiceName, PartitionID);

        case PartitionCase.ServiceNameRandomPartition:
            return PartitionSelector.RandomOf(ServiceName);

        case PartitionCase.ServiceNamePartitionSingleton:
            return PartitionSelector.SingletonOf(ServiceName);

        case PartitionCase.ServiceNamePartitionNamed:
            return PartitionSelector.PartitionKeyOf(ServiceName, PartitionKey);

        case PartitionCase.ServiceNamePartitionUniformedInt:
            return PartitionSelector.PartitionKeyOf(ServiceName, PartitionKeyLong);

        default:
            // Unhandled cases yield no selector rather than an exception.
            return null;
    }
}
/// <summary>
/// Builds the test action that moves the primary replica as described by <paramref name="ragAction"/>.
/// </summary>
/// <param name="ragAction">Transition naming the service, partition and target node.</param>
/// <returns>A <c>MovePrimaryAction</c> targeting the partition identified by the transition.</returns>
private FabricTestAction GetMovePrimaryReplicaAction(MovePrimaryReplicaStateTransitionAction ragAction)
{
    Uri targetService = ragAction.ServiceUri;
    Guid targetPartition = ragAction.PartitionId;
    string destinationNode = ragAction.NodeTo;

    // The same description goes to the trace log and, when one is registered, to the report callback.
    string description = StringHelper.Format(
        "Generating Action: {0}\n\t\tService: {1}\n\t\tPartition: {2}\n\t\tTo: {3}",
        ragAction.ActionType,
        targetService,
        targetPartition,
        destinationNode);

    Log.WriteInfo(TraceType, description);

    if (this.reportFunction != null)
    {
        this.reportFunction(description);
    }

    return new MovePrimaryAction(
        destinationNode,
        PartitionSelector.PartitionIdOf(targetService, targetPartition),
        ragAction.ForceMove);
}
/// <summary>
/// Restores Service Fabric state. With <see cref="EventArgs.Empty"/> it actively triggers a partition
/// data loss and waits on <c>m_sem</c>; with <c>RestoreEventArgs</c> it downloads the backup archive,
/// extracts it and restores from the extracted directory.
/// </summary>
/// <param name="downloader">Source of the backup archive (read through <c>DownloadMetadataAsync</c>).</param>
/// <param name="eventArgs"><see cref="EventArgs.Empty"/> to initiate a restore, or a <c>RestoreEventArgs</c> to perform one.</param>
/// <returns>A task that completes when the selected branch has finished.</returns>
/// <exception cref="NotSupportedException"><paramref name="eventArgs"/> is neither <see cref="EventArgs.Empty"/> nor a <c>RestoreEventArgs</c>.</exception>
public async Task Restore(IPersistentDownloader downloader, EventArgs eventArgs)
{
    await m_init;

    if (eventArgs == EventArgs.Empty)
    {
        // !Note, we disable the event handler before we trigger an active restore, otherwise
        // the active restore event will notify the backup controller, while in fact the restore
        // command is issued from the controller, and SF backup manager should do it passively.
        Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: initiating a new restore operation.");
        m_svc.RequestRestore -= OnServiceFabricRequestRestore;
        try
        {
            await m_svc.FabricClient.TestManager.StartPartitionDataLossAsync(
                Guid.NewGuid(),
                PartitionSelector.PartitionIdOf(m_svc.Context.ServiceName, m_svc.Context.PartitionId),
                DataLossMode.PartialDataLoss);
            await m_sem.WaitAsync();
        }
        finally
        {
            // Re-subscribe even when starting data loss fails, so the handler is not lost permanently.
            m_svc.RequestRestore += OnServiceFabricRequestRestore;
        }

        return;
    }

    if (!(eventArgs is RestoreEventArgs rstArgs))
    {
        throw new NotSupportedException();
    }

    // Tracked outside the try block so the failure path can clean up whatever was created.
    string dir = null;
    string fname = null;
    try
    {
        var ctx = rstArgs.m_rctx;
        dir = Path.Combine(TrinityConfig.StorageRoot, Path.GetRandomFileName());
        var dsc = new RestoreDescription(dir);
        fname = Path.Combine(TrinityConfig.StorageRoot, Path.GetRandomFileName());
        using (var file = File.OpenWrite(fname))
        {
            Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Downloading ServiceFabric backup data.");
            await downloader.DownloadMetadataAsync(MetadataKey, file);
        }

        Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Decompressing ServiceFabric backup data.");
        FileUtility.CompletePath(dir, create_nonexistent: true);
        ZipFile.ExtractToDirectory(fname, dir);
        File.Delete(fname);
        Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Restoring ServiceFabric backup data.");
        await ctx.RestoreAsync(dsc);
        Directory.Delete(dir, recursive: true);
        Log.WriteLine($"{nameof(ServiceFabricBackupManager)}: Restored ServiceFabric backup data.");
        rstArgs.Complete();
    }
    catch (Exception ex)
    {
        // Do not leave the downloaded archive or the extracted files behind on failure.
        TryDeleteRestoreArtifacts(fname, dir);
        rstArgs.Complete(ex);
        throw;
    }

    // NOTE(review): the semaphore is released only on success; after a failed restore a caller waiting
    // in the active-restore branch above keeps waiting — confirm whether that is intended.
    m_sem.Release();
}

/// <summary>
/// Best-effort removal of the temporary archive and extraction directory after a failed restore.
/// I/O and access errors are deliberately ignored so they cannot mask the original failure.
/// </summary>
private static void TryDeleteRestoreArtifacts(string archivePath, string extractionDirectory)
{
    try
    {
        if (archivePath != null && File.Exists(archivePath))
        {
            File.Delete(archivePath);
        }
    }
    catch (IOException)
    {
    }
    catch (UnauthorizedAccessException)
    {
    }

    try
    {
        if (extractionDirectory != null && Directory.Exists(extractionDirectory))
        {
            Directory.Delete(extractionDirectory, recursive: true);
        }
    }
    catch (IOException)
    {
    }
    catch (UnauthorizedAccessException)
    {
    }
}
/// <summary>
/// Executes the quorum-loss step: applies the unreliable transport behaviors listed in the state,
/// restarts the target replicas, keeps the partition in quorum loss for the configured duration,
/// checks that the partition reports quorum loss, and finally removes the behaviors.
/// </summary>
/// <param name="cancellationToken">Token passed to every Fabric call and delay.</param>
/// <param name="serviceInternalFaultInfo">Forwarded to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>The step state with <c>StepStateNames.CompletedSuccessfully</c> pushed onto its progress.</returns>
/// <exception cref="FabricTransientException">The partition was not observed in quorum loss within the retry budget.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeQuorumLossState quorumLossState = Convert(this.State);

    Guid targetPartitionId = quorumLossState.Info.PartitionId;
    List<Tuple<string, string>> behaviorsToApply = quorumLossState.Info.UnreliableTransportInfo;
    List<long> replicasToFault = quorumLossState.Info.ReplicaIds;

    // Phase 1: apply one reopen-blocking behavior per (node name, behavior name) pair.
    var applyBehaviorTasks = new List<Task>();
    foreach (Tuple<string, string> nodeAndBehavior in behaviorsToApply)
    {
        string nodeName = nodeAndBehavior.Item1;
        string behaviorName = nodeAndBehavior.Item2;

        var reopenBlocker = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
        reopenBlocker.AddFilterForPartitionId(targetPartitionId);

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

        Task applyTask = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                nodeName,
                behaviorName,
                reopenBlocker,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken);
        applyBehaviorTasks.Add(applyTask);
    }

    await Task.WhenAll(applyBehaviorTasks).ConfigureAwait(false);

    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    // Phase 2: restart every target replica without verifying completion.
    var restartTasks = new List<Task>();
    foreach (long replicaId in replicasToFault)
    {
        PartitionSelector owningPartition = PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, targetPartitionId);
        ReplicaSelector replicaToRestart = ReplicaSelector.ReplicaIdOf(owningPartition, replicaId);

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);

        restartTasks.Add(
            FaultAnalysisServiceUtility.RestartReplicaAsync(
                this.FabricClient,
                replicaToRestart,
                CompletionMode.DoNotVerify,
                this.RequestTimeout,
                this.OperationTimeout,
                cancellationToken));
    }

    await Task.WhenAll(restartTasks).ConfigureAwait(false);

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    // Phase 3: hold the partition in quorum loss for the requested duration.
    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, quorumLossState.Info.QuorumLossDuration);
    await Task.Delay(quorumLossState.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

    // NOTE(review): this helper is constructed but never read afterwards — confirm whether it can be removed.
    TimeoutHelper operationTimer = new TimeoutHelper(this.OperationTimeout);

    // Phase 4: poll the partition status; each round is followed by a five second wait.
    bool observedQuorumLoss = false;
    int remainingChecks = FASConstants.QuorumLossCheckRetryCount;

    do
    {
        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);

        ServicePartitionList partitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        foreach (StatefulServicePartition candidate in partitions)
        {
            bool isTargetInQuorumLoss =
                candidate.PartitionInformation.Id == targetPartitionId
                && candidate.PartitionStatus == ServicePartitionStatus.InQuorumLoss;
            if (isTargetInQuorumLoss)
            {
                observedQuorumLoss = true;
                break;
            }
        }

        await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
    }
    while (!observedQuorumLoss && remainingChecks-- > 0);

    if (!observedQuorumLoss)
    {
        string error = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry",
            this.State.OperationId,
            this.partitionSelector.ServiceName,
            targetPartitionId);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    // Phase 5: remove the unreliable transport behaviors again.
    await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken);

    quorumLossState.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return quorumLossState;
}
/// <summary>
/// Induces quorum loss on one partition of a stateful persisted service: blocks replica reopen on the
/// selected nodes through unreliable transport behaviors, restarts the selected replicas, waits for the
/// requested duration, asserts the partition is in quorum loss, and removes the behaviors again.
/// </summary>
/// <param name="testContext">Provides the <c>FabricClient</c> and <c>ActionExecutor</c> used for all calls.</param>
/// <param name="action">Quorum loss request; its <c>Result</c> is set on success.</param>
/// <param name="cancellationToken">Token passed to every Fabric call and delay.</param>
/// <returns>A task that completes when the action has finished and cleanup has run.</returns>
/// <exception cref="InvalidOperationException">The service is not stateful, or has no persisted state.</exception>
protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
{
    ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

    var helper = new TimeoutHelper(action.ActionTimeout);

    // get info about the service so we can check type and trss
    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            action.PartitionSelector.ServiceName,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
    }

    StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;
    ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

    if (!statefulServiceDescription.HasPersistedState)
    {
        throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only"));
    }

    // figure out /which/ partition to select
    var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
    {
        RequestTimeout = action.RequestTimeout,
        ActionTimeout = helper.GetRemainingTime()
    };

    await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken);
    Guid partitionId = getPartitionStateAction.Result.PartitionId;

    // get data about replicas in that partition
    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            action.RequestTimeout,
            cancellationToken),
        helper.GetRemainingTime(),
        cancellationToken).ConfigureAwait(false);

    // Every (node name, behavior name) pair requested below is recorded here so cleanup can undo it.
    var removeUTRequestList = new List<Tuple<string, string>>();
    Dictionary<Tuple<string, string>, Task> removeUTTaskDictionary = new Dictionary<Tuple<string, string>, Task>();

    try
    {
        var stableReplicas = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

        var stableReplicasToRemove = new List<StatefulServiceReplica>();
        long replicasToRestartWithoutPrimary =
            action.QuorumLossMode == QuorumLossMode.AllReplicas
                ? stableReplicas.Length - 1
                : FabricCluster.GetWriteQuorumSize(replicasResult.Count);
        foreach (var replica in stableReplicas)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
            if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
            {
                replicasToRestartWithoutPrimary--;
            }

            if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                stableReplicasToRemove.Add(statefulReplica);
            }
        }

        // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
        var utTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            string nodeName = statefulReplica.NodeName;
            UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
            behavior.AddFilterForPartitionId(partitionId);
            string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

            removeUTRequestList.Add(new Tuple<string, string>(nodeName, behaviorName));
            utTaskList.Add(
                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        nodeName,
                        behaviorName,
                        behavior,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken));
        }

        await Task.WhenAll(utTaskList).ConfigureAwait(false);

        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);

        var restartReplicaTaskList = new List<Task>();
        foreach (var statefulReplica in stableReplicasToRemove)
        {
            ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id);

            var restartReplicaAction = new RestartReplicaAction(replicaSelector)
            {
                CompletionMode = CompletionMode.DoNotVerify,
                RequestTimeout = action.RequestTimeout,
                ActionTimeout = helper.GetRemainingTime()
            };

            restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
        }

        await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

        await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

        // validate
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                action.PartitionSelector.ServiceName,
                null,
                action.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
            helper.GetRemainingTime(),
            cancellationToken).ConfigureAwait(false);

        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                break;
            }
        }

        foreach (var removeUTParams in removeUTRequestList)
        {
            var currentParams = removeUTParams;
            Task task = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                    currentParams.Item1,  /*nodeName*/
                    currentParams.Item2,  /*behaviorName*/
                    action.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                helper.GetRemainingTime(),
                cancellationToken);

            removeUTTaskDictionary[currentParams] = task;
        }

        await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken);
    }
    finally
    {
        // Cleanup walks the list of requested behaviors, not the dictionary of removal tasks: the
        // dictionary is still empty when the failure happened before the removal loop above, and in
        // that case every requested behavior still has to be removed.
        var removeUTTaskList = new List<Task>();
        foreach (var removeUTRequest in removeUTRequestList)
        {
            var currentRemoveUTRequest = removeUTRequest;

            Task earlierRemoval;
            bool removalAttempted = removeUTTaskDictionary.TryGetValue(currentRemoveUTRequest, out earlierRemoval);
            if (!removalAttempted || earlierRemoval == null || earlierRemoval.IsFaulted)
            {
                removeUTTaskList.Add(
                    FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                            currentRemoveUTRequest.Item1, /*nodeName*/
                            currentRemoveUTRequest.Item2, /*behaviorName*/
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken));
            }
        }

        // Only wait when a removal was actually issued here; after a clean run there is nothing to settle.
        if (removeUTTaskList.Count > 0)
        {
            Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
            Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
        }
    }

    action.Result = new InvokeQuorumLossResult(getPartitionStateAction.Result);
    this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
}
/// <summary>
/// Builds a <c>ReplicaSelector</c> from a parameter set name by looking for known fragments in it:
/// first a partition fragment picks the partition selector, then a replica fragment picks the replica selector.
/// </summary>
/// <param name="partitionSetName">Parameter set name that is searched for the known fragments.</param>
/// <param name="partitionId">Partition id, used when the name contains "PartitionId".</param>
/// <param name="serviceName">Service the selectors refer to.</param>
/// <param name="partitionKey">Named key, or the text of an integer key for "PartitionUniformedInt".</param>
/// <param name="replicaOrInstanceId">Replica id used when the name contains "ReplicaId"; <c>null</c> is treated as 0.</param>
/// <returns>The replica selector matching the parameter set name.</returns>
/// <exception cref="ArgumentException">
/// No partition or replica selector matches the name, or the integer partition key cannot be parsed.
/// </exception>
internal static ReplicaSelector GetReplicaSelector(string partitionSetName, Guid partitionId, Uri serviceName, string partitionKey, long? replicaOrInstanceId)
{
    // Step 1: partition selector. "PartitionId" is tested first; a name without any "Partition"
    // fragment falls back to a random partition.
    PartitionSelector chosenPartition = null;
    if (partitionSetName.Contains("PartitionId"))
    {
        chosenPartition = PartitionSelector.PartitionIdOf(serviceName, partitionId);
    }
    else if (partitionSetName.Contains("PartitionSingleton"))
    {
        chosenPartition = PartitionSelector.SingletonOf(serviceName);
    }
    else if (partitionSetName.Contains("PartitionNamed"))
    {
        chosenPartition = PartitionSelector.PartitionKeyOf(serviceName, partitionKey);
    }
    else if (partitionSetName.Contains("PartitionUniformedInt"))
    {
        long numericKey;
        bool parsed = long.TryParse(partitionKey, out numericKey);
        if (!parsed)
        {
            throw new ArgumentException(StringResources.Error_InvalidPartitionKey);
        }

        chosenPartition = PartitionSelector.PartitionKeyOf(serviceName, numericKey);
    }
    else if (!partitionSetName.Contains("Partition"))
    {
        chosenPartition = PartitionSelector.RandomOf(serviceName);
    }

    if (chosenPartition == null)
    {
        throw new ArgumentException(StringResources.Error_CouldNotParsePartitionSelector);
    }

    // Step 2: replica selector within that partition. A name without any "Replica" fragment
    // falls back to a random replica.
    ReplicaSelector chosenReplica = null;
    if (partitionSetName.Contains("ReplicaPrimary"))
    {
        chosenReplica = ReplicaSelector.PrimaryOf(chosenPartition);
    }
    else if (partitionSetName.Contains("ReplicaRandomSecondary"))
    {
        chosenReplica = ReplicaSelector.RandomSecondaryOf(chosenPartition);
    }
    else if (partitionSetName.Contains("ReplicaId"))
    {
        chosenReplica = ReplicaSelector.ReplicaIdOf(chosenPartition, replicaOrInstanceId ?? 0);
    }
    else if (!partitionSetName.Contains("Replica"))
    {
        chosenReplica = ReplicaSelector.RandomOf(chosenPartition);
    }

    if (chosenReplica == null)
    {
        throw new ArgumentException(StringResources.Error_CouldNotParseReplicaSelector);
    }

    return chosenReplica;
}
/// <summary>
/// This API supports the Service Fabric platform and is not meant to be called from your code
/// </summary>
/// <remarks>
/// Fetches the service description, validates the service, resolves the selected partition, and then
/// keeps running replica and partition faults against that partition until the configured time to
/// run has elapsed or cancellation is requested.
/// </remarks>
/// <param name="token">This API supports the Service Fabric platform and is not meant to be called from your code</param>
/// <returns>A task that completes when the failover scenario loop has ended.</returns>
protected override async Task OnExecuteAsync(CancellationToken token)
{
    var parameters = this.failoverTestScenarioParameters;

    this.serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            parameters.PartitionSelector.ServiceName,
            parameters.RequestTimeout,
            token),
        parameters.OperationTimeout,
        token).ConfigureAwait(false);

    // Only stateful services can have persisted state; stateless ones keep the default of false.
    bool hasPersistedState = false;
    if (this.serviceDescription.IsStateful())
    {
        StatefulServiceDescription statefulDescription = this.serviceDescription as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    Log.WriteInfo(TraceType, "Validating Service health and availability");
    await this.FabricClient.TestManager.ValidateServiceAsync(
        parameters.PartitionSelector.ServiceName,
        parameters.MaxServiceStabilizationTimeout,
        token);

    Log.WriteInfo(TraceType, "Getting Selected Partition");
    var getPartitionStateAction = new GetSelectedPartitionStateAction(parameters.PartitionSelector)
    {
        RequestTimeout = parameters.RequestTimeout,
        ActionTimeout = parameters.OperationTimeout
    };

    await this.TestContext.ActionExecutor.RunAsync(getPartitionStateAction, token);
    Guid selectedPartitionId = getPartitionStateAction.Result.PartitionId;

    Log.WriteInfo(TraceType, "Running test for partition {0}", selectedPartitionId);
    this.ReportProgress("Selected partition {0} for testing failover", selectedPartitionId);

    // All faults below target this one resolved partition.
    PartitionSelector selectedPartition = PartitionSelector.PartitionIdOf(parameters.PartitionSelector.ServiceName, selectedPartitionId);

    while (parameters.TimeToRun - this.GetElapsedTime() > TimeSpan.Zero && !token.IsCancellationRequested)
    {
        if (this.serviceDescription.IsStateful())
        {
            ReplicaSelector primaryReplicaSelector = ReplicaSelector.PrimaryOf(selectedPartition);
            ReplicaSelector secondaryReplicaSelector = ReplicaSelector.RandomSecondaryOf(selectedPartition);

            // Make Primary go through RemoveReplica, RestartReplica and RestartCodePackage
            await this.TestReplicaFaultsAsync(primaryReplicaSelector, "Primary", hasPersistedState, token);

            // Make Secondary go through RemoveReplica, RestartReplica and RestartCodePackage
            await this.TestReplicaFaultsAsync(secondaryReplicaSelector, "Secondary", hasPersistedState, token);

            // Restart all secondary replicas and make sure the replica set recovers
            await this.InvokeAndValidateFaultAsync(
                "Restarting all the secondary replicas",
                () =>
                {
#pragma warning disable 618
                    return this.FabricClient.TestManager.RestartPartitionAsync(
                        selectedPartition,
                        RestartPartitionMode.OnlyActiveSecondaries,
                        parameters.OperationTimeout,
                        token);
#pragma warning restore 618
                },
                token);

            // Restart all replicas if service is persisted
            if (hasPersistedState)
            {
                await this.InvokeAndValidateFaultAsync(
                    "Restarting all replicas including Primary",
                    () =>
                    {
#pragma warning disable 618
                        return this.FabricClient.TestManager.RestartPartitionAsync(
                            selectedPartition,
                            RestartPartitionMode.AllReplicasOrInstances,
                            parameters.OperationTimeout,
                            token);
#pragma warning restore 618
                    },
                    token);
            }

            // Induce move and swap primary a few times
            await this.InvokeAndValidateFaultAsync(
                "Move Primary to a different node",
                () => this.FabricClient.FaultManager.MovePrimaryAsync(
                    string.Empty,
                    selectedPartition,
                    true,
                    parameters.OperationTimeout,
                    token),
                token);

            // Induce move secondary a few times
            await this.InvokeAndValidateFaultAsync(
                "Move Secondary to a different node",
                () => this.FabricClient.FaultManager.MoveSecondaryAsync(
                    string.Empty,
                    string.Empty,
                    selectedPartition,
                    true,
                    parameters.OperationTimeout,
                    token),
                token);
        }
        else
        {
            ReplicaSelector randomInstanceSelector = ReplicaSelector.RandomOf(selectedPartition);

            // Make Stateless Instance go through RemoveReplica, RestartReplica and RestartCodePackage
            await this.TestReplicaFaultsAsync(randomInstanceSelector, "Stateless Instance", hasPersistedState, token);

            // Restart all stateless instances
            await this.InvokeAndValidateFaultAsync(
                "Restarting all stateless instances for partition",
                () =>
                {
#pragma warning disable 618
                    return this.FabricClient.TestManager.RestartPartitionAsync(
                        selectedPartition,
                        RestartPartitionMode.AllReplicasOrInstances,
                        parameters.OperationTimeout,
                        token);
#pragma warning restore 618
                },
                token);
        }
    }
}
/// <summary>
/// Runs the interactive console menu of the demo client.
/// Two service proxies are created up front (partition keys -1L and 1L); the menu then loops,
/// letting the user read/write state, create and restore backups, list backups, resolve the
/// partition ids and trigger full data loss on a partition.
/// The loop ends when a key that is not one of the menu options (0-7) is pressed.
/// </summary>
/// <returns>A task that completes when the user leaves the menu.</returns>
private static async Task MainAsync()
{
    Console.WriteLine("Waiting for services....");
    var proxyPartitionOne = await CreateProxyAsync(-1L);
    var proxyPartitionTwo = await CreateProxyAsync(1L);
    var proxy = proxyPartitionOne;
    Console.WriteLine("Waited for services..");

    while (true)
    {
        Console.WriteLine("Press any key to continue");
        Console.ReadKey(true);
        Console.Clear();

        Console.WriteLine("Press 0 to select target partition");
        Console.WriteLine("Press 1 to get state");
        Console.WriteLine("Press 2 to set state");
        Console.WriteLine("Press 3 to create a backup");
        Console.WriteLine("Press 4 to restore a backup");
        Console.WriteLine("Press 5 to list all central backups");
        Console.WriteLine("Press 6 to list the current Service Partition Ids");
        Console.WriteLine("Press 7 to invoke full dataloss on one of the current Service's Partitions");
        Console.WriteLine("Other key to exit");

        var key = Console.ReadKey(true);
        string input;

        switch (key.Key)
        {
            case ConsoleKey.D0:
                Console.WriteLine("Type 1 for partition one, or 2 for partition two");
                key = Console.ReadKey(true);
                // Any key other than 2 selects partition one.
                if (ConsoleKey.D2 == key.Key)
                {
                    proxy = proxyPartitionTwo;
                    Console.WriteLine("Using partition two.");
                }
                else
                {
                    proxy = proxyPartitionOne;
                    Console.WriteLine("Using partition one.");
                }
                break;

            case ConsoleKey.D1:
                string state = await proxy.GetState();
                Console.WriteLine($"State: '{state}'");
                break;

            case ConsoleKey.D2:
                Console.WriteLine("Enter string to store as state:");
                input = Console.ReadLine();
                // ReadLine returns null at end of input; store an empty string in that case.
                await proxy.SetState(input ?? "");
                Console.WriteLine($"State saved: '{input}'");
                break;

            case ConsoleKey.D3:
                Console.WriteLine("Type 1 for full backup or 2 for incremental backup (incremental requires full backup to exist)");
                key = Console.ReadKey(true);
                // Any key other than 1 requests an incremental backup.
                if (ConsoleKey.D1 == key.Key)
                {
                    Console.WriteLine("Creating a full backup asynchronously...");
                    await proxy.BeginCreateBackup(BackupOption.Full);
                }
                else
                {
                    Console.WriteLine("Creating an incremental backup asynchronously...");
                    await proxy.BeginCreateBackup(BackupOption.Incremental);
                }
                break;

            case ConsoleKey.D4:
                Console.WriteLine("Starting the restore of a backup");
                Console.WriteLine("Enter central backup id (guid):");
                input = Console.ReadLine();

                // Validate the id before calling the service, so a typo costs no remote call.
                if (!Guid.TryParse(input, out Guid backupId))
                {
                    Console.WriteLine($"'{input}' is not a valid backup id (guid).");
                    break;
                }

                // Look the backup up explicitly instead of using Single(), which would throw
                // and terminate the menu when the id is unknown or ambiguous.
                var matchingBackups = (await proxy.ListAllBackups())
                    .Where(b => b.BackupId == backupId)
                    .ToList();
                if (matchingBackups.Count != 1)
                {
                    Console.WriteLine($"Expected exactly one central backup with id {backupId}, but found {matchingBackups.Count}.");
                    break;
                }

                DataLossMode lossMode = DataLossMode.FullDataLoss;
                Console.WriteLine("Type 1 for full data loss or 2 for partial data loss.");
                key = Console.ReadKey(true);
                // Any key other than 1 selects partial data loss.
                if (ConsoleKey.D1 == key.Key)
                {
                    Console.WriteLine("Restoring backup with full data loss asynchronously...");
                }
                else
                {
                    Console.WriteLine("Restoring backup with partial data loss asynchronously...");
                    lossMode = DataLossMode.PartialDataLoss;
                }

                await proxy.BeginRestoreBackup(matchingBackups[0], lossMode);
                Console.WriteLine("Restore is active. This will take some time. Check progress in SF explorer.");
                break;

            case ConsoleKey.D5:
                Console.WriteLine("List all central backups");
                var list = await proxy.ListAllBackups();
                Console.WriteLine("Original partition\t\t\tBackup Id\t\t\t\tBackup Type\tTimestamp UTC");
                Console.WriteLine(string.Join(Environment.NewLine, list.Select(data => $"{data.OriginalServicePartitionId}\t{data.BackupId}\t{data.BackupOption}\t\t{data.TimeStampUtc}")));
                break;

            case ConsoleKey.D6:
                var resolver = ServicePartitionResolver.GetDefault();
                var resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(-1L), CancellationToken.None);
                Console.WriteLine($"Partition key -1L resolves to partition {resolved.Info.Id}");
                resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(1L), CancellationToken.None);
                Console.WriteLine($"Partition key 1L resolves to partition {resolved.Info.Id}");
                if (proxy == proxyPartitionOne)
                {
                    Console.WriteLine("Using partition one (-1L)");
                }
                else
                {
                    Console.WriteLine("Using partition two (1L)");
                }
                break;

            case ConsoleKey.D7:
                Console.WriteLine("Enter partitionID");
                string partitionString = Console.ReadLine();
                if (Guid.TryParse(partitionString, out Guid partitionID))
                {
                    var partitionSelector = PartitionSelector.PartitionIdOf(ServiceUri, partitionID);
                    // FabricClient is IDisposable; release it once the data loss request has been submitted
                    // so repeated use of this menu option does not accumulate clients.
                    using (var fabricClient = new FabricClient(FabricClientRole.Admin))
                    {
                        await fabricClient.TestManager.StartPartitionDataLossAsync(Guid.NewGuid(), partitionSelector, DataLossMode.FullDataLoss);
                    }
                }
                else
                {
                    Console.WriteLine($"'{partitionString}' is not a valid partition id (guid).");
                }
                break;

            default:
                // Any key that is not a menu option ends the session.
                return;
        }
    }
}