/// <summary>
/// Rejects <c>DataLossMode.Invalid</c>; every other value is accepted without further checks.
/// </summary>
/// <param name="dataLossMode">The data loss mode supplied by the caller.</param>
/// <remarks>
/// On rejection, throws the exception built by <c>FaultAnalysisServiceUtility.CreateException</c>
/// for <c>E_INVALIDARG</c> with the "unsupported data loss mode" resource message.
/// </remarks>
private static void ThrowIfDataLossModeInvalid(DataLossMode dataLossMode)
{
    // Anything other than Invalid is let through untouched.
    if (dataLossMode != DataLossMode.Invalid)
    {
        return;
    }

    throw FaultAnalysisServiceUtility.CreateException(
        TraceType,
        Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG,
        Strings.StringResources.Error_UnsupportedDataLossMode);
}
/// <summary>
/// Replaces <c>Info</c> with a freshly constructed <c>InvokeDataLossInfo</c> that carries over only
/// the partition selector and data loss mode of the current one.
/// </summary>
public override void ClearInfo()
{
    var previous = this.Info;
    this.Info = new InvokeDataLossInfo(previous.PartitionSelector, previous.DataLossMode);
}
/// <summary>
/// Creates the info record for an invoke-data-loss command.
/// </summary>
/// <param name="partitionSelector">Selector identifying the target partition.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
public InvokeDataLossInfo(PartitionSelector partitionSelector, DataLossMode dataLossMode)
{
    this.DataLossMode = dataLossMode;
    this.PartitionSelector = partitionSelector;

    // Starts out empty.
    this.UnreliableTransportInfo = new List<Tuple<string, string>>();

    // Placeholder only: it is overwritten when the command executes and is never used
    // by the command itself.
    this.NodeName = "UNKNOWNNODE";
}
/// <summary>
/// Creates a description of an invoke-data-loss request.
/// </summary>
/// <param name="operationId">Identifier of the operation; validated through <c>Requires.Argument(...).NotNull()</c>.</param>
/// <param name="partitionSelector">Selector identifying the target partition.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
public InvokeDataLossDescription(
    Guid operationId,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode)
{
    // Validation runs before any property is assigned.
    Requires.Argument<Guid>(nameof(operationId), operationId).NotNull();

    this.DataLossMode = dataLossMode;
    this.PartitionSelector = partitionSelector;
    this.OperationId = operationId;
}
/// <summary>
/// Creates the invoke-data-loss action, forwarding the state manager, partition, state and both
/// timeouts to the base class and storing the data-loss-specific settings on this instance.
/// </summary>
/// <param name="stateManager">Passed to the base constructor.</param>
/// <param name="partition">Passed to the base constructor.</param>
/// <param name="state">Passed to the base constructor.</param>
/// <param name="partitionSelector">Selector identifying the target partition; must not be null.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
/// <param name="dataLossCheckWaitDurationInSeconds">Stored as <c>DataLossCheckWaitDurationInSeconds</c>.</param>
/// <param name="dataLossCheckPollIntervalInSeconds">Stored as <c>DataLossCheckPollIntervalInSeconds</c>.</param>
/// <param name="replicaDropWaitDurationInSeconds">Stored as <c>ReplicaDropWaitDurationInSeconds</c>.</param>
/// <param name="requestTimeout">Passed to the base constructor.</param>
/// <param name="operationTimeout">Passed to the base constructor.</param>
public InvokeDataLossAction(
    IReliableStateManager stateManager,
    IStatefulServicePartition partition,
    InvokeDataLossState state,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode,
    int dataLossCheckWaitDurationInSeconds,
    int dataLossCheckPollIntervalInSeconds,
    int replicaDropWaitDurationInSeconds,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout)
    : base(stateManager, partition, state, requestTimeout, operationTimeout)
{
    // The selector is the only argument validated here.
    ThrowIf.Null(partitionSelector, nameof(partitionSelector));

    this.PartitionSelector = partitionSelector;
    this.DataLossMode = dataLossMode;

    this.ReplicaDropWaitDurationInSeconds = replicaDropWaitDurationInSeconds;
    this.DataLossCheckPollIntervalInSeconds = dataLossCheckPollIntervalInSeconds;
    this.DataLossCheckWaitDurationInSeconds = dataLossCheckWaitDurationInSeconds;
}
/// <summary>
/// Decorates EventDocument-derived classes with event routing information: the namespace and
/// queue atoms, plus the data loss mode.
/// </summary>
/// <param name="ns">Namespace name; must be non-blank, encoded into an atom.</param>
/// <param name="queue">Queue name; must be non-blank, encoded into an atom.</param>
/// <param name="mode">Stored as <c>LossMode</c>.</param>
public EventAttribute(string ns, string queue, DataLossMode mode)
{
    // Namespace is validated, encoded and assigned before the queue is looked at.
    var namespaceText = ns.NonBlank(nameof(ns));
    Namespace = Atom.Encode(namespaceText);

    var queueText = queue.NonBlank(nameof(queue));
    Queue = Atom.Encode(queueText);

    LossMode = mode;
}
/// <summary>
/// Creates the persisted state for an invoke-data-loss command. The base class receives the
/// operation id, the <c>ActionType.InvokeDataLoss</c> action type and the fault info; the
/// partition selector and data loss mode are wrapped in a new <c>InvokeDataLossInfo</c>.
/// </summary>
/// <param name="operationId">Identifier of the operation.</param>
/// <param name="serviceInternalFaultInfo">Fault info forwarded to the base constructor.</param>
/// <param name="partitionSelector">Selector identifying the target partition.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
public InvokeDataLossState(
    Guid operationId,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode)
    : base(operationId, ActionType.InvokeDataLoss, serviceInternalFaultInfo)
{
    var info = new InvokeDataLossInfo(partitionSelector, dataLossMode);
    this.Info = info;
}
/// <summary>
/// Interactive console loop for exercising the backup/restore service.
/// Creates proxies for partition keys -1 and 1, then repeatedly shows a menu that lets the user
/// pick the target partition, get/set state, create or restore backups, list backups, list
/// partition ids, or invoke full data loss on a partition. Any other key exits.
/// </summary>
/// <returns>A task that completes when the user exits the menu.</returns>
private static async Task MainAsync()
{
    Console.WriteLine("Waiting for services....");
    var proxyPartitionOne = await CreateProxyAsync(-1L);
    var proxyPartitionTwo = await CreateProxyAsync(1L);
    var proxy = proxyPartitionOne;
    Console.WriteLine("Waited for services..");

    while (true)
    {
        Console.WriteLine($"Press any key to continue");
        Console.ReadKey(true);
        Console.Clear();

        Console.WriteLine("Press 0 to select target partition");
        Console.WriteLine("Press 1 to get state");
        Console.WriteLine("Press 2 to set state");
        Console.WriteLine("Press 3 to create a backup");
        Console.WriteLine("Press 4 to restore a backup");
        Console.WriteLine("Press 5 to list all central backups");
        Console.WriteLine("Press 6 to list the current Service Partition Ids");
        Console.WriteLine("Press 7 to invoke full dataloss on one of the current Service's Partitions");
        Console.WriteLine("Other key to exit");

        var key = Console.ReadKey(true);
        string input;

        switch (key.Key)
        {
            case ConsoleKey.D0:
                Console.WriteLine("Type 1 for partition one, or 2 for partition two");
                key = Console.ReadKey(true);
                if (ConsoleKey.D2 == key.Key)
                {
                    proxy = proxyPartitionTwo;
                    Console.WriteLine("Using partition two.");
                }
                else
                {
                    proxy = proxyPartitionOne;
                    Console.WriteLine("Using partition one.");
                }
                break;

            case ConsoleKey.D1:
                string state = await proxy.GetState();
                Console.WriteLine($"State: '{state}'");
                break;

            case ConsoleKey.D2:
                Console.WriteLine("Enter string to store as state:");
                input = Console.ReadLine();
                await proxy.SetState(input ?? "");
                Console.WriteLine($"State saved: '{input}'");
                break;

            case ConsoleKey.D3:
                Console.WriteLine("Type 1 for full backup or 2 for incremental backup (incremental requires full backup to exist)");
                key = Console.ReadKey(true);
                if (ConsoleKey.D1 == key.Key)
                {
                    Console.WriteLine("Creating a full backup asynchronously...");
                    await proxy.BeginCreateBackup(BackupOption.Full);
                }
                else
                {
                    Console.WriteLine("Creating an incremental backup asynchronously...");
                    await proxy.BeginCreateBackup(BackupOption.Incremental);
                }
                break;

            case ConsoleKey.D4:
                Console.WriteLine($"Starting the restore of a backup");
                Console.WriteLine($"Enter central backup id (guid):");
                input = Console.ReadLine();

                // Validate the id first so a typo does not cost a round trip to the service.
                Guid index;
                if (!Guid.TryParse(input, out index))
                {
                    Console.WriteLine($"'{input}' is not a valid backup id (guid).");
                    break;
                }

                // An unknown id must not tear down the whole menu loop, so look the backup up
                // without throwing and report the miss instead.
                var backups = (await proxy.ListAllBackups()).ToList();
                var backupToRestore = backups.FirstOrDefault(b => b.BackupId == index);
                if (backupToRestore == null)
                {
                    Console.WriteLine($"No central backup found with id '{index}'.");
                    break;
                }

                DataLossMode lossMode = DataLossMode.FullDataLoss;
                Console.WriteLine("Type 1 for full data loss or 2 for partial data loss.");
                key = Console.ReadKey(true);
                if (ConsoleKey.D1 == key.Key)
                {
                    Console.WriteLine("Restoring backup with full data loss asynchronously...");
                }
                else
                {
                    Console.WriteLine("Restoring backup with partial data loss asynchronously...");
                    lossMode = DataLossMode.PartialDataLoss;
                }

                await proxy.BeginRestoreBackup(backupToRestore, lossMode);
                Console.WriteLine($"Restore is active. This will take some time. Check progress in SF explorer.");
                break;

            case ConsoleKey.D5:
                Console.WriteLine($"List all central backups");
                var list = await proxy.ListAllBackups();
                Console.WriteLine($"Original partition\t\t\tBackup Id\t\t\t\tBackup Type\tTimestamp UTC");
                Console.WriteLine(string.Join(
                    Environment.NewLine,
                    list.Select(data => $"{data.OriginalServicePartitionId}\t{data.BackupId}\t{data.BackupOption}\t\t{data.TimeStampUtc}")));
                break;

            case ConsoleKey.D6:
                var resolver = ServicePartitionResolver.GetDefault();
                var resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(-1L), CancellationToken.None);
                Console.WriteLine($"Partition key -1L resolves to partition {resolved.Info.Id}");
                resolved = await resolver.ResolveAsync(ServiceUri, new ServicePartitionKey(1L), CancellationToken.None);
                Console.WriteLine($"Partition key 1L resolves to partition {resolved.Info.Id}");
                if (proxy == proxyPartitionOne)
                {
                    Console.WriteLine("Using partition one (-1L)");
                }
                else
                {
                    Console.WriteLine("Using partition two (1L)");
                }
                break;

            case ConsoleKey.D7:
                Console.WriteLine("Enter partitionID");
                string partitionString = Console.ReadLine();
                if (Guid.TryParse(partitionString, out Guid partitionID))
                {
                    var partitionSelector = PartitionSelector.PartitionIdOf(ServiceUri, partitionID);

                    // The client is only needed to submit the request; release it right after.
                    using (var fabricClient = new FabricClient(FabricClientRole.Admin))
                    {
                        await fabricClient.TestManager.StartPartitionDataLossAsync(Guid.NewGuid(), partitionSelector, DataLossMode.FullDataLoss);
                    }
                }
                else
                {
                    Console.WriteLine($"'{partitionString}' is not a valid partition id (guid).");
                }
                break;

            default:
                return;
        }
    }
}
/// <summary>
/// Creates a REST request that starts a partition data loss operation.
/// </summary>
/// <param name="fabricClient">Passed to the base constructor.</param>
/// <param name="operationId">Identifier of the operation.</param>
/// <param name="servicename">Name of the service that owns the partition.</param>
/// <param name="partitionId">Identifier of the target partition.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
/// <param name="timeout">Passed to the base constructor.</param>
public StartPartitionDataLossRestRequest(
    IFabricClient fabricClient,
    Guid operationId,
    Uri servicename,
    Guid partitionId,
    DataLossMode dataLossMode,
    TimeSpan timeout)
    : base(fabricClient, timeout)
{
    this.OperationId = operationId;
    this.ServiceName = servicename;
    this.PartitionId = partitionId;
    this.DataLossMode = dataLossMode;

    // Error codes registered as retryable for this request.
    var retryable = this.RetryErrorCodes;
    retryable.Add((uint)NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NOT_READY);
    retryable.Add((uint)NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_RECONFIGURATION_PENDING);

    // "Operation id already exists" is registered as a success code.
    var succeeding = this.SucceedErrorCodes;
    succeeding.Add((uint)NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_TEST_COMMAND_OPERATION_ID_ALREADY_EXISTS);
}
/// <summary>
/// Executes the data loss step for the partition recorded in the action state:
/// applies an unreliable transport behavior on the recorded node, removes the replicas selected
/// for the requested data loss mode, waits for them to be dropped, removes the transport
/// behavior again and finally polls the partition until its data loss number changes.
/// </summary>
/// <param name="cancellationToken">Token flowed into every fabric call and delay.</param>
/// <param name="serviceInternalFaultInfo">Passed to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>The state, with <c>StepStateNames.CompletedSuccessfully</c> pushed onto its progress stack.</returns>
/// <remarks>
/// Throws a <c>FabricTransientException</c> (NotReady) when no target replicas could be selected or
/// when the data loss number did not change within the polling window, a <c>FabricException</c>
/// (PartitionNotFound) when the partition is missing from the query result, and the
/// <c>E_INVALIDARG</c> exception from <c>FaultAnalysisServiceUtility.CreateException</c> for an
/// unsupported data loss mode.
/// </remarks>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    // Snapshot everything the step needs from the persisted info.
    PartitionSelector partitionSelector = state.Info.PartitionSelector;
    DataLossMode dataLossMode = state.Info.DataLossMode;
    long dataLossNumberBefore = state.Info.DataLossNumber;
    string fmPrimaryNodeName = state.Info.NodeName;
    Guid partitionId = state.Info.PartitionId;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;
    int targetReplicaSetSize = state.Info.TargetReplicaSetSize;

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - applying UT, partitionId={1}",
        this.State.OperationId,
        partitionId);

    // Filter "DoReconfiguration" messages for this partition only.
    System.Fabric.Common.UnreliableTransportBehavior transportBehavior =
        new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    transportBehavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            fmPrimaryNodeName,
            behaviorName,
            transportBehavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    ServiceReplicaList queriedReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Every replica of the partition is expected to be stateful; anything else trips the assert.
    List<StatefulServiceReplica> statefulReplicas = new List<StatefulServiceReplica>();
    foreach (var queried in queriedReplicas)
    {
        StatefulServiceReplica asStateful = queried as StatefulServiceReplica;
        ReleaseAssert.AssertIf(asStateful == null, "Service Replica is not of stateful type even though service is stateful");
        statefulReplicas.Add(asStateful);
    }

    // Select target replicas based on the data loss mode.
    List<StatefulServiceReplica> replicasToRemove = null;
    switch (dataLossMode)
    {
        case DataLossMode.FullDataLoss:
            replicasToRemove = GetReplicasForFullDataLoss(statefulReplicas);
            break;

        case DataLossMode.PartialDataLoss:
            replicasToRemove = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, statefulReplicas);
            break;

        default:
            throw FaultAnalysisServiceUtility.CreateException(
                StepBase.TraceType,
                Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG,
                Strings.StringResources.Error_UnsupportedDataLossMode);
    }

    if (replicasToRemove == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    foreach (var target in replicasToRemove)
    {
        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
            this.State.OperationId,
            target.Id,
            partitionId,
            target.ReplicaRole,
            target.ReplicaStatus);

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                target.NodeName,
                partitionId,
                target.Id,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveReplicaErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, replicasToRemove, cancellationToken).ConfigureAwait(false);

    await RemoveUnreliableTransportAsync(
        this.FabricClient,
        fmPrimaryNodeName,
        behaviorName,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Poll until the partition's data loss number differs from the recorded one.
    // NOTE(review): the polling window is a fixed 30 seconds, while the warning below reports
    // dataLossCheckWaitDurationInSeconds - confirm which one is intended.
    bool dataLossObserved = false;
    TimeoutHelper pollWindow = new TimeoutHelper(TimeSpan.FromSeconds(30));
    do
    {
        ServicePartitionList queriedPartitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        // Cast lazily so entries are converted one at a time, stopping at the first match.
        StatefulServicePartition matchingPartition = queriedPartitions
            .Cast<StatefulServicePartition>()
            .FirstOrDefault(p => p.PartitionInformation.Id == partitionId);

        if (matchingPartition == null)
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
        }

        long dataLossNumberAfter = matchingPartition.PrimaryEpoch.DataLossNumber;

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
            this.State.OperationId,
            partitionId,
            pollWindow.GetRemainingTime(),
            dataLossNumberBefore,
            dataLossNumberAfter);

        if (dataLossNumberAfter != dataLossNumberBefore)
        {
            dataLossObserved = true;
            break;
        }

        await System.Fabric.Common.AsyncWaiter.WaitAsync(
            TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds),
            cancellationToken).ConfigureAwait(false);
    }
    while (pollWindow.GetRemainingTime() > TimeSpan.Zero);

    if (!dataLossObserved)
    {
        // This is only viewable internally for debug. This will cause a retry of the whole flow.
        string warning = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
            this.State.OperationId,
            partitionSelector.ServiceName,
            partitionId,
            this.dataLossCheckWaitDurationInSeconds);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, warning);
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);
    return state;
}
/// <summary>
/// Creates an invoke-data-loss action for the given partition and data loss mode.
/// </summary>
/// <param name="partitionSelector">Selector identifying the target partition.</param>
/// <param name="dataLossMode">The data loss mode to apply.</param>
public InvokeDataLossAction(PartitionSelector partitionSelector, DataLossMode dataLossMode)
{
    this.DataLossMode = dataLossMode;
    this.PartitionSelector = partitionSelector;
}
/// <summary>
/// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
/// The backup is scheduled in the central store, after which a partition data loss operation is
/// started for the service's own partition.
/// </summary>
/// <param name="service">The service whose partition is restored; also supplies the log callback and central store.</param>
/// <param name="backupMetadata">Metadata identifying the backup to restore; must not be null.</param>
/// <param name="dataLossMode">The data loss mode passed to the data loss operation.</param>
/// <returns>A task that completes once the data loss operation has been started.</returns>
/// <exception cref="ArgumentNullException"><paramref name="backupMetadata"/> is null.</exception>
public static async Task BeginRestoreBackup(this IBackupRestoreServiceInternal service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
{
    // Validate before the first log line: it reads backupMetadata.BackupId and would otherwise
    // fail with a NullReferenceException instead of the intended ArgumentNullException.
    if (backupMetadata == null)
    {
        throw new ArgumentNullException(nameof(backupMetadata));
    }

    service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");

    await service.CentralBackupStore.ScheduleBackupAsync(service.Context.PartitionId, backupMetadata.BackupId);

    var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);
    var operationId = Guid.NewGuid();

    // The client is only needed to submit the request; dispose it once the call has completed.
    using (var fabricClient = new FabricClient(FabricClientRole.Admin))
    {
        // Causes OnDataLossAsync to be called.
        await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
    }

    service.LogCallback?.Invoke($"BackupRestoreService - Begun restore backup {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
}
// Use this method signature for now until the actual client interface is decided
/// <summary>
/// Validates the data loss mode, persists a new <c>InvokeDataLossState</c> through the action
/// store and then enqueues it. Any exception is traced as a warning and rethrown unchanged.
/// </summary>
/// <param name="operationId">Identifier of the operation.</param>
/// <param name="partitionSelector">Selector identifying the target partition.</param>
/// <param name="dataLossMode">The data loss mode; rejected up front when invalid.</param>
/// <param name="timeout">Timeout passed to the action store.</param>
/// <param name="serviceInternalFaultInfo">Fault info stored in the new state.</param>
public async Task ProcessDataLossCommandAsync(
    Guid operationId,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode,
    TimeSpan timeout,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    ThrowIfDataLossModeInvalid(dataLossMode);

    ActionStateBase dataLossState = new InvokeDataLossState(
        operationId,
        serviceInternalFaultInfo,
        partitionSelector,
        dataLossMode);

    try
    {
        // After this call finishes the intent has been persisted
        await this.actionStore.InitializeNewActionAsync(dataLossState, timeout);

        this.Enqueue(dataLossState);
    }
    catch (Exception ex)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, ex);
        throw;
    }
}
/// <summary>
/// Fetches raw events and, where possible, their deserialized EventDocument-derived instances,
/// returning one (raw, doc, err) tuple per fetched event.
/// </summary>
/// <param name="consumer">Event consumer implementation; must not be null</param>
/// <param name="route">Queue designator</param>
/// <param name="partition">Logical partition to fetch from <see cref="IEventConsumer.PartitionCount"/></param>
/// <param name="checkpoint">A point in time as of which to fetch</param>
/// <param name="skip">Number of events to skip in the beginning</param>
/// <param name="count">Number of events to fetch</param>
/// <param name="lossMode">Data loss tolerance</param>
/// <returns>
/// An array of tuples: the `raw` event, its converted EventDocument-derived instance `doc`, and the
/// exception `err` (if any) raised while trying to deserialize it. `doc` and `err` are never both set;
/// both are null for events that are not JSON documents or carry no content.
/// </returns>
public static async Task<IEnumerable<(Event raw, EventDocument doc, Exception err)>> FetchEventDocsAsync(
    this IEventConsumer consumer,
    Route route,
    int partition,
    ulong checkpoint,
    int skip,
    int count,
    DataLossMode lossMode = DataLossMode.Default)
{
    var fetched = await consumer.NonNull(nameof(consumer))
                                .FetchAsync(route, partition, checkpoint, skip, count, lossMode);

    // A single stream is re-bound to each event's content buffer in turn.
    using (var stream = new IO.BufferSegmentReadingStream())
    {
        (Event raw, EventDocument doc, Exception err) Materialize(Event evt)
        {
            try
            {
                var hasJsonDoc = evt.ContentType == CONTENT_TYPE_JSON_DOC && evt.Content != null;
                if (!hasJsonDoc)
                {
                    return (raw: evt, doc: null, err: null);
                }

                stream.UnsafeBindBuffer(evt.Content, 0, evt.Content.Length);
                var map = JsonReader.DeserializeDataObject(stream, EVENT_JSON_ENCODING, true) as JsonDataMap;
                var parsed = JsonReader.ToDoc<EventDocument>(map, fromUI: false);
                return (raw: evt, doc: parsed, err: null);
            }
            catch (Exception failure)
            {
                // Deserialization problems are reported per event rather than aborting the batch.
                return (raw: evt, doc: null, err: failure);
            }
        }

        // Materialize before the stream is disposed.
        return fetched.Select(evt => Materialize(evt)).ToArray();
    }
}
/// <inheritdoc />
public Task BeginRestoreBackup(BackupMetadata backupMetadata, DataLossMode dataLossMode)
    // Delegates to the shared static implementation, passing this instance as the service.
    => BackupRestoreServiceInternalExtensions.BeginRestoreBackup(this, backupMetadata, dataLossMode);
/// <inheritdoc />
public Task BeginRestoreBackup(BackupMetadata backupMetadata, DataLossMode dataLossMode)
    // Delegates to the shared static implementation, passing this instance as the service.
    => BackupRestoreServiceOperations.BeginRestoreBackup(this, backupMetadata, dataLossMode);
/// <summary>
/// Asynchronously starts a restore operation using the state indicated by <paramref name="backupMetadata"/>.
/// The restore is scheduled in the central store, after which a partition data loss operation is
/// started for the service's own partition.
/// This method completes and returns before the backup restore process is completely done.
/// </summary>
/// <param name="service">The service whose partition is restored; also supplies the log callback and central store.</param>
/// <param name="backupMetadata">Metadata identifying the backup to restore; must not be null.</param>
/// <param name="dataLossMode">The data loss mode passed to the data loss operation.</param>
/// <returns>A task that completes once the data loss operation has been started.</returns>
/// <exception cref="Exception">
/// Any failure, including a null <paramref name="backupMetadata"/>, is logged and rethrown wrapped
/// in an exception whose inner exception is the original error.
/// </exception>
public static async Task BeginRestoreBackup(this IBackupRestoreServiceOperations service, BackupMetadata backupMetadata, DataLossMode dataLossMode)
{
    // Null-conditional access: a null backupMetadata must reach the validation inside the try
    // block (and be logged/wrapped there) instead of failing here with a NullReferenceException.
    service.LogCallback?.Invoke($"BackupRestoreService - Beginning restore backup {backupMetadata?.BackupId} for partition {service.Context.PartitionId}.");

    try
    {
        if (backupMetadata == null)
        {
            throw new ArgumentNullException(nameof(backupMetadata));
        }

        await service.CentralBackupStore.ScheduleBackupRestoreAsync(service.Context.PartitionId, backupMetadata.BackupId);

        var partitionSelector = PartitionSelector.PartitionIdOf(service.Context.ServiceName, service.Context.PartitionId);
        var operationId = Guid.NewGuid();

        // The client is only needed to submit the request; dispose it once the call has completed.
        using (var fabricClient = new FabricClient(FabricClientRole.Admin))
        {
            // Causes OnDataLossAsync to be called later on.
            await fabricClient.TestManager.StartPartitionDataLossAsync(operationId, partitionSelector, dataLossMode);
        }
    }
    catch (Exception ex)
    {
        string message = $"Failed to restore backup for partition {service.Context.PartitionId}";
        service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} failed for partition: {service.Context.PartitionId}. Message:{message} - Error: {ex.Message}");
        throw new Exception(message, ex);
    }

    service.LogCallback?.Invoke($"{nameof(BackupRestoreServiceOperations)} - {nameof(BeginRestoreBackup)} succeeded {backupMetadata.BackupId} for partition {service.Context.PartitionId}.");
}