/// <summary>
/// Pulls operations from the replication stream until it hands back a null operation.
/// Every received operation is converted into a logical log record stamped with the operation's
/// sequence number, passed to <c>LogLogicalRecordOnSecondaryAsync</c>, and acknowledged from a
/// continuation attached to that record's flush task.
/// </summary>
/// <param name="replicationStream">Stream the replication operations are read from.</param>
/// <returns>
/// A task that finishes once the ReplicationFinished information record has been processed and the
/// outstanding-acknowledgement counter has dropped to zero.
/// </returns>
private async Task DrainReplicationStreamAsync(IOperationStream replicationStream)
{
    FabricEvents.Events.DrainStart(this.tracer.Type, "Replication stream");

    var allOperationsAckedTcs = new TaskCompletionSource<object>();
    var lastReplicatedRecord = LogicalLogRecord.InvalidLogicalLogRecord;
    long replicatedRecordNumber = 0;

    // The counter starts at one. That extra count is released only when the stream ends,
    // so a flush continuation can observe zero only after the pump has stopped receiving.
    long acksOutstanding = 1;
    long bytesOutstanding = 0;

    this.roleContextDrainState.OnDrainReplication();

    while (true)
    {
        var pendingRead = replicationStream.GetOperationAsync(CancellationToken.None);
        if (!pendingRead.IsCompleted)
        {
            // Nothing is available right now: start a flush (not awaited) before waiting on the stream.
            this.replicatedLogManager.LogManager.FlushAsync("DrainReplicationStream.IsEmpty").IgnoreExceptionVoid();
        }

        var operation = await pendingRead.ConfigureAwait(false);

        if (operation == null)
        {
            // A null operation ends the stream: write the ReplicationFinished marker and wait for it,
            // and for the logical records, to be processed.
            await this.replicatedLogManager.FlushInformationRecordAsync(
                InformationEvent.ReplicationFinished,
                closeLog: false,
                flushInitiator: "DrainReplicationstream.IsFinished").ConfigureAwait(false);

            await this.replicatedLogManager.LastInformationRecord.AwaitProcessing().ConfigureAwait(false);
            await this.recordsProcessor.WaitForLogicalRecordsProcessingAsync().ConfigureAwait(false);

            // Release the initial count taken when the counter was created.
            var acksLeft = Interlocked.Decrement(ref acksOutstanding);
            Utility.Assert(acksLeft >= 0, "acksPending >= 0");

            if (acksLeft != 0)
            {
                // Some flush continuations have not run yet; the last one completes the source.
                await allOperationsAckedTcs.Task.ConfigureAwait(false);
            }

            Utility.Assert(acksOutstanding == 0, "acksOutstanding == 0");
            break;
        }

        var data = operation.Data;
#if DEBUG
        ReplicatedLogManager.ValidateOperationData(data, "DrainReplicationStream LSN: " + operation.SequenceNumber);
#endif

        lastReplicatedRecord = (LogicalLogRecord)LogRecord.FromOperationData(data);
        lastReplicatedRecord.Lsn = new LogicalSequenceNumber(operation.SequenceNumber);
        await this.LogLogicalRecordOnSecondaryAsync(lastReplicatedRecord).ConfigureAwait(false);

        var acksRemaining = Interlocked.Increment(ref acksOutstanding);

        FabricEvents.Events.DrainReplicationReceive(
            this.tracer.Type,
            replicatedRecordNumber,
            (uint)lastReplicatedRecord.RecordType,
            lastReplicatedRecord.Lsn.LSN,
            acksRemaining);

        replicatedRecordNumber++;

        long operationSize = Utility.GetOperationSize(data);
        var bytesRemaining = Interlocked.Add(ref bytesOutstanding, operationSize);

        // Request a flush once the outstanding count or bytes reach half of either the secondary
        // or the primary queue limit. A memory limit takes part only when it is greater than zero.
        var settings = this.replicatorSettings.PublicSettings;

        bool secondaryQueueHalfFull =
            (settings.MaxSecondaryReplicationQueueSize / 2 <= acksRemaining)
            || ((settings.MaxSecondaryReplicationQueueMemorySize > 0)
                && (settings.MaxSecondaryReplicationQueueMemorySize / 2 <= bytesRemaining));

        bool primaryQueueHalfFull =
            (settings.MaxPrimaryReplicationQueueSize / 2 <= acksRemaining)
            || ((settings.MaxPrimaryReplicationQueueMemorySize > 0)
                && (settings.MaxPrimaryReplicationQueueMemorySize / 2 <= bytesRemaining));

        if (secondaryQueueHalfFull || primaryQueueHalfFull)
        {
            FabricEvents.Events.DrainReplicationFlush(
                this.tracer.Type,
                replicatedRecordNumber,
                lastReplicatedRecord.Lsn.LSN,
                acksRemaining,
                bytesRemaining);

            this.replicatedLogManager.LogManager.FlushAsync("DrainReplicationStream.IsFull").IgnoreExceptionVoid();
        }

        // Per-iteration copies for the continuation; lastReplicatedRecord is reassigned on the next pass.
        var flushedOperation = operation;
        var flushedRecord = lastReplicatedRecord;

        lastReplicatedRecord.AwaitFlush().IgnoreException().ContinueWith(
            async flushTask =>
            {
                var pendingAcks = Interlocked.Decrement(ref acksOutstanding);

                if (flushTask.Exception != null)
                {
                    // The flush task carries an exception: the operation is not acknowledged and the
                    // byte counter is left as is. Drain completion is still signalled when this was
                    // the last outstanding count.
                    if (pendingAcks == 0)
                    {
                        allOperationsAckedTcs.TrySetResult(null);
                    }

                    return;
                }

                var pendingBytes = Interlocked.Add(ref bytesOutstanding, -operationSize);
                Utility.Assert(
                    (pendingAcks >= 0) && (pendingBytes >= 0),
                    "(acksPending >= 0) && (bytesPending >= 0)");

                // Completion is signalled before the operation is acknowledged.
                if (pendingAcks == 0)
                {
                    allOperationsAckedTcs.TrySetResult(null);
                }

                flushedOperation.Acknowledge();

                FabricEvents.Events.DrainReplicationNoise(
                    this.tracer.Type,
                    flushedRecord.Lsn.LSN,
                    pendingAcks,
                    pendingBytes);

                await flushedRecord.AwaitApply().ConfigureAwait(false);
            }).IgnoreExceptionVoid();
    }

#if !DotNetCoreClr
    // These are new events defined in System.Fabric; existing CoreCLR apps would break if these
    // events were referenced, because they would not be found (CoreCLR apps carry System.Fabric
    // along with the application).
    // This is just a mitigation for now. The actual fix is tracked via bug# 11614507.
    FabricEvents.Events.DrainCompleted(
        this.tracer.Type,
        "Replication",
        "Completed",
        replicatedRecordNumber,
        (uint)lastReplicatedRecord.RecordType,
        lastReplicatedRecord.Lsn.LSN,
        lastReplicatedRecord.Psn.PSN,
        lastReplicatedRecord.RecordPosition);
#endif
}
/// <summary>
/// Pumps the copy stream, starting with an already-received operation, until the stream returns a
/// null operation. Every operation is converted into a logical log record, passed to
/// <c>LogLogicalRecordOnSecondaryAsync</c>, and acknowledged from a continuation attached to that
/// record's flush task. Afterwards a CopyFinished information record is flushed and the method waits
/// for it to be processed and for the outstanding-acknowledgement counter to reach zero.
/// </summary>
/// <param name="copyStream">Stream the subsequent copy operations are read from.</param>
/// <param name="operation">
/// First copy operation to process. When null, the pump loop is skipped and only the completion
/// steps run.
/// </param>
/// <param name="copiedCheckpointRecord">
/// Checkpoint record associated with the copy. While it is null, it is re-read from
/// <c>replicatedLogManager.LastInProgressCheckpointRecord</c> after each logged record.
/// </param>
/// <param name="renamedCopyLogSuccessfully">
/// Whether the copy log has already been renamed. When false, the rename is attempted once the record
/// whose LSN equals <c>copiedUptoLsn</c> has been logged.
/// </param>
/// <returns>A task that finishes when the copy drain has fully completed.</returns>
private async Task DrainCopyStreamAsync(
    IOperationStream copyStream,
    IOperation operation,
    BeginCheckpointLogRecord copiedCheckpointRecord,
    bool renamedCopyLogSuccessfully)
{
    FabricEvents.Events.DrainStart(this.tracer.Type, "Copy stream: RenamedCopyLogSuccessfully: " + renamedCopyLogSuccessfully);

    var lastCopiedRecord = LogicalLogRecord.InvalidLogicalLogRecord;

    // acksOutstanding starts at one. That extra count is released only after the pump has finished,
    // so a flush continuation can observe zero only once no more operations will be received.
    long copiedRecordNumber = 0, acksOutstanding = 1;

    TaskCompletionSource<object> allOperationsAckedTcs = new TaskCompletionSource<object>();

    try
    {
        if (operation != null)
        {
            this.roleContextDrainState.OnDrainCopy();

            do
            {
                var data = operation.Data;
#if DEBUG
                ReplicatedLogManager.ValidateOperationData(data, "DrainCopyStreamAsync LSN: " + operation.SequenceNumber);
#endif

                lastCopiedRecord = (LogicalLogRecord)LogRecord.FromOperationData(data);

                await this.LogLogicalRecordOnSecondaryAsync(lastCopiedRecord).ConfigureAwait(false);

                // After successfully appending the record into the buffer, increment the outstanding ack count
                var acksRemaining = Interlocked.Increment(ref acksOutstanding);

                FabricEvents.Events.DrainCopyReceive(
                    this.tracer.Type,
                    copiedRecordNumber,
                    lastCopiedRecord.RecordType.ToString(),
                    lastCopiedRecord.Lsn.LSN,
                    acksRemaining);

                ++copiedRecordNumber;

                // Request a flush (not awaited) once the outstanding count reaches half the copy queue size.
                if (this.replicatorSettings.PublicSettings.MaxCopyQueueSize / 2 <= acksRemaining)
                {
                    FabricEvents.Events.DrainCopyFlush(
                        this.tracer.Type,
                        copiedRecordNumber,
                        lastCopiedRecord.Lsn.LSN,
                        acksRemaining);

                    this.replicatedLogManager.LogManager.FlushAsync("DrainCopyStream.IsFull").IgnoreExceptionVoid();
                }

                // Per-iteration copies for the flush continuation; both variables are reassigned on the next pass.
                var capturedOperation = operation;
                var capturedRecord = lastCopiedRecord;

                if (copiedCheckpointRecord == null)
                {
                    copiedCheckpointRecord = this.replicatedLogManager.LastInProgressCheckpointRecord;
                    if (copiedCheckpointRecord != null)
                    {
                        Utility.Assert(
                            copiedCheckpointRecord.Lsn == this.recoveredOrCopiedCheckpointLsn.Value,
                            "copiedCheckpointRecordLsn {0} == recoveredOrCopiedCheckpointLsn {1}",
                            copiedCheckpointRecord.Lsn,
                            this.recoveredOrCopiedCheckpointLsn.Value);
                    }
                }

                // If we pumped the last operation in the copy stream (indicated by copiedUptoLsn), rename the copy log if this was a full copy,
                // as we are guaranteed that the replica has all the data needed to be promoted to an active secondary and we could not have lost any state.
                if (copiedCheckpointRecord != null
                    && copiedCheckpointRecord != BeginCheckpointLogRecord.InvalidBeginCheckpointLogRecord
                    && lastCopiedRecord.Lsn == this.copiedUptoLsn
                    && !renamedCopyLogSuccessfully) // Copied UE record could have same LSN, so this condition is needed
                {
                    await this.checkpointManager.CompleteFirstCheckpointOnIdleAndRenameLog(copiedCheckpointRecord, this.copiedUptoLsn.LSN).ConfigureAwait(false);
                    renamedCopyLogSuccessfully = true;
                }

                lastCopiedRecord.AwaitFlush().ContinueWith(
                    async task =>
                    {
                        var acksPending = Interlocked.Decrement(ref acksOutstanding);

                        if (task.Exception != null)
                        {
                            // The flush task carries an exception: the operation is not acknowledged.
                            // Signal the drain completion task if needed
                            if (acksPending == 0)
                            {
                                allOperationsAckedTcs.TrySetResult(null);
                            }

                            return;
                        }

                        capturedOperation.Acknowledge();

                        Utility.Assert(acksPending >= 0, "acksPending {0} >= 0", acksPending);

                        if (acksPending == 0)
                        {
                            allOperationsAckedTcs.TrySetResult(null);
                        }

                        FabricEvents.Events.DrainCopyNoise(
                            this.tracer.Type,
                            capturedRecord.Lsn.LSN,
                            acksPending);

                        await capturedRecord.AwaitApply().ConfigureAwait(false);
                    }).IgnoreExceptionVoid();

                var drainTask = copyStream.GetOperationAsync(CancellationToken.None);
                if (drainTask.IsCompleted == false)
                {
                    // GopalK: Currently, we cannot wait for copy to finish because copy might get
                    // abandoned if the primary fails and the product waits for pending
                    // copy operations to get acknowledged before electing a new primary
                    this.replicatedLogManager.LogManager.FlushAsync("DrainCopyStream.IsEmpty").IgnoreExceptionVoid();
                    await drainTask.ConfigureAwait(false);
                }

                // The task is complete at this point, so reading the result does not block.
                operation = drainTask.GetAwaiter().GetResult();
            }
            while (operation != null);
        }
    }

    // This finally block ensures that before we continue, we cancel the first full copy checkpoint during full build.
    // Without having this, it is possible that the above code throws out of this method and any lifecycle API like close might get stuck because
    // there is a pending checkpoint that is not yet fully processed.
    finally
    {
        // If the pump prematurely finishes for any reason, it means the copy log cannot be renamed
        if (copiedCheckpointRecord != null
            && copiedCheckpointRecord != BeginCheckpointLogRecord.InvalidBeginCheckpointLogRecord
            && !renamedCopyLogSuccessfully)
        {
            await this.checkpointManager.CancelFirstCheckpointOnIdleDueToIncompleteCopy(copiedCheckpointRecord, this.copiedUptoLsn.LSN).ConfigureAwait(false);
        }
    }

    await this.replicatedLogManager.FlushInformationRecordAsync(
        InformationEvent.CopyFinished,
        closeLog: false,
        flushInitiator: "DrainCopyStream.IsFinished").ConfigureAwait(false);

    // Awaiting processing of this record ensures that all operations in the copy stream
    // have been applied before we wait on the drain completion task below.
    await this.replicatedLogManager.LastInformationRecord.AwaitProcessing().ConfigureAwait(false);

    await this.recordsProcessor.WaitForLogicalRecordsProcessingAsync().ConfigureAwait(false);

    // Release the initial count taken when the counter was created.
    var acksOpen = Interlocked.Decrement(ref acksOutstanding);
    Utility.Assert(acksOpen >= 0, "acksOpen {0} >= 0", acksOpen);

    if (acksOpen != 0)
    {
        // Wait for all the callbacks above to finish running and acknowledging
        await allOperationsAckedTcs.Task.ConfigureAwait(false);
    }

    Utility.Assert(acksOutstanding == 0, "acksOutstanding == 0");

#if !DotNetCoreClr
    // These are new events defined in System.Fabric; existing CoreCLR apps would break if these
    // events were referenced, because they would not be found (CoreCLR apps carry System.Fabric
    // along with the application).
    // This is just a mitigation for now. The actual fix is tracked via bug# 11614507.
    FabricEvents.Events.DrainCompleted(
        this.tracer.Type,
        "Copy",
        "Completed",
        copiedRecordNumber,
        (uint)lastCopiedRecord.RecordType,
        lastCopiedRecord.Lsn.LSN,
        lastCopiedRecord.Psn.PSN,
        lastCopiedRecord.RecordPosition);
#endif
}