/// <summary>
/// Restores the checkpoint for a given checkpoint id.
/// </summary>
public Task<BoolResult> RestoreCheckpointAsync(OperationContext context, string checkpointId)
{
    context = context.CreateNested();
    return context.PerformOperationAsync(
        _tracer,
        async () =>
        {
            bool successfullyUpdatedIncrementalState = false;
            try
            {
                bool isIncrementalCheckpoint = false;
                var checkpointFileExtension = ".zip";
                if (checkpointId.EndsWith(IncrementalCheckpointIdSuffix, StringComparison.OrdinalIgnoreCase))
                {
                    isIncrementalCheckpoint = true;
                    checkpointFileExtension = ".txt";

                    // Remove the suffix to get the real checkpoint id used with central storage.
                    checkpointId = checkpointId.Substring(0, checkpointId.Length - IncrementalCheckpointIdSuffix.Length);
                }

                var checkpointFile = _checkpointStagingDirectory / $"chkpt{checkpointFileExtension}";
                var extractedCheckpointDirectory = _checkpointStagingDirectory / "chkpt";

                FileUtilities.DeleteDirectoryContents(_checkpointStagingDirectory.ToString());
                FileUtilities.DeleteDirectoryContents(extractedCheckpointDirectory.ToString());

                // Creating a working temporary folder
                using (new DisposableDirectory(_fileSystem, _checkpointStagingDirectory))
                {
                    // Getting the checkpoint from the central store
                    await _storage.TryGetFileAsync(context, checkpointId, checkpointFile).ThrowIfFailure();

                    if (isIncrementalCheckpoint)
                    {
                        var incrementalRestoreResult = await RestoreCheckpointIncrementalAsync(context, checkpointFile, extractedCheckpointDirectory);
                        successfullyUpdatedIncrementalState = incrementalRestoreResult.Succeeded;
                        incrementalRestoreResult.ThrowIfFailure();
                    }
                    else
                    {
                        await RestoreFullCheckpointAsync(checkpointFile, extractedCheckpointDirectory);
                    }

                    // Restoring the checkpoint
                    return _database.RestoreCheckpoint(context, extractedCheckpointDirectory);
                }
            }
            finally
            {
                ClearIncrementalCheckpointStateIfNeeded(context, successfullyUpdatedIncrementalState);
            }
        },
        extraStartMessage: $"CheckpointId=[{checkpointId}]",
        extraEndMessage: _ => $"CheckpointId=[{checkpointId}]");
}
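The snippet above derives the checkpoint type from an id suffix before talking to central storage. A minimal sketch of that convention, assuming IncrementalCheckpointIdSuffix is a string constant defined elsewhere in the class (its actual value is not shown here); the real code inlines this logic rather than using a helper:

// Hypothetical helper illustrating the suffix convention used above.
private static bool TryStripIncrementalSuffix(string checkpointId, string suffix, out string storageCheckpointId)
{
    if (checkpointId.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
    {
        // Incremental checkpoints are stored as a manifest (.txt) under the id without the suffix.
        storageCheckpointId = checkpointId.Substring(0, checkpointId.Length - suffix.Length);
        return true;
    }

    // Full checkpoints are stored as a single .zip archive under the id as-is.
    storageCheckpointId = checkpointId;
    return false;
}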
private async Task ProcessEventsAsync(OperationContext context, List<EventData> messages)
{
    // Creating a nested context for all the processing operations.
    context = context.CreateNested(nameof(EventHubContentLocationEventStore));

    if (messages.Count == 0)
    {
        // This probably does not actually occur, but just in case, ignore the empty message batch.
        // NOTE: We do this after logging to ensure we notice if we are getting empty message batches.
        return;
    }

    var state = new SharedEventProcessingState(context, this, messages);

    if (_eventProcessingBlocks != null)
    {
        await context
            .CreateOperation(Tracer, () => sendToActionBlockAsync(state))
            .WithOptions(traceOperationStarted: false, endMessageFactory: r => $"TotalQueueSize={Interlocked.Read(ref _queueSize)}")
            .RunAsync(caller: "SendToActionBlockAsync")
            .TraceIfFailure(context);
    }
    else
    {
        await ProcessEventsCoreAsync(new ProcessEventsInput(state, messages, actionBlockIndex: -1, store: this), EventDataSerializer);
    }

    async Task<BoolResult> sendToActionBlockAsync(SharedEventProcessingState localState)
    {
        // This local function "sends" a message into an action block based on the sender's hash code, so that events from different machines are processed in parallel.
        // (Keep in mind that data from the same machine must be processed sequentially, because event order matters.)
        // Then it creates a local counter for each processing operation to track the results for the entire batch.
        foreach (var messageGroup in messages.GroupBy(GetProcessingIndex))
        {
            int actionBlockIndex = messageGroup.Key;
            var eventProcessingBlock = _eventProcessingBlocks![actionBlockIndex];
            var input = new ProcessEventsInput(localState, messageGroup, actionBlockIndex, this);

            var sendAsyncTask = eventProcessingBlock.SendAsync(input);
            if (sendAsyncTask.Status == TaskStatus.WaitingForActivation)
            {
                // The action block is busy, which means it is most likely full.
                Tracer.Debug(context, $"Action block {actionBlockIndex} is busy. Block's queue size={eventProcessingBlock.InputCount}.");
            }

            bool success = await sendAsyncTask;
            if (!success)
            {
                // NOTE: This case should not actually occur.
                // Complete the operation in case we couldn't send to the action block, to prevent the pending event queue from getting backlogged.
                input.Complete();
                return new BoolResult("Failed to add message to an action block.");
            }
        }
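GetProcessingIndex is not shown in the snippet. Based on the comment above ("based on the sender's hash code"), a plausible sketch follows, assuming the sender machine name is attached to each message under SenderMachineKey as in the sending code further below; this is illustrative only, not the production implementation:

// Illustrative sketch of GetProcessingIndex; the real implementation is defined elsewhere.
private int GetProcessingIndex(EventData message)
{
    // Messages from the same sender map to the same action block, so per-machine ordering is preserved
    // while messages from different machines are processed in parallel.
    var sender = (string)message.Properties[SenderMachineKey];
    return (int)((uint)sender.GetHashCode() % (uint)_eventProcessingBlocks!.Length);
}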
/// <inheritdoc />
protected override async Task<BoolResult> SendEventsCoreAsync(
    OperationContext context,
    ContentLocationEventData[] events,
    CounterCollection<ContentLocationEventStoreCounters> counters)
{
    IReadOnlyList<EventData> eventDatas;
    using (counters[Serialization].Start())
    {
        eventDatas = SerializeEventData(context, events);
    }

    var operationId = Guid.NewGuid();
    context = context.CreateNested(operationId);

    for (var eventNumber = 0; eventNumber < eventDatas.Count; eventNumber++)
    {
        var eventData = eventDatas[eventNumber];
        eventData.Properties[EventFilterKey] = _configuration.Epoch;
        eventData.Properties[SenderMachineKey] = _localMachineName;
        counters[SentEventBatchCount].Increment();

        Tracer.Debug(
            context,
            $"{Tracer.Name}: Sending {eventNumber}/{events.Length} event. OpId={operationId}, Epoch='{_configuration.Epoch}', Size={eventData.Body.Count}.");
        counters[SentMessagesTotalSize].Add(eventData.Body.Count);
        eventData.Properties[OperationIdKey] = operationId.ToString();

        // Even though the Event Hub client has its own built-in retry strategy, we have to wrap all the calls into a separate
        // one to cover a few more important cases that the default strategy misses.
        await _extraEventHubClientRetryPolicy.ExecuteAsync(async () =>
        {
            try
            {
                await _partitionSender.SendAsync(eventData);
            }
            catch (ServerBusyException exception)
            {
                // TODO: Verify that the HResult is 50002. Documentation shows that this should be the error code for throttling,
                // but the documentation covers Microsoft.ServiceBus.Messaging.ServerBusyException and not Microsoft.Azure.EventHubs.ServerBusyException.
                // https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-messaging-exceptions#serverbusyexception
                Tracer.Debug(context, $"{Tracer.Name}: OpId={operationId} was throttled by EventHub. HResult={exception.HResult}");
                Tracer.TrackMetric(context, "EventHubThrottle", 1);

                throw;
            }
        });
    }

    return BoolResult.Success;
}
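_extraEventHubClientRetryPolicy is constructed elsewhere; only its ExecuteAsync(Func<Task>) shape is visible here. As an illustration of what such a wrapper does, here is a self-contained stand-in with simple exponential backoff; it is not the production policy, its exception filter, or its settings:

// Illustrative stand-in for an "extra" retry policy around transient Event Hub failures.
private static async Task ExecuteWithRetryAsync(Func<Task> action, int maxAttempts = 3)
{
    for (var attempt = 1; ; attempt++)
    {
        try
        {
            await action();
            return;
        }
        catch (Exception) when (attempt < maxAttempts)
        {
            // Back off before retrying; exceptions rethrown by the inner handler (like ServerBusyException above) land here.
            await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt)));
        }
    }
}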
/// <summary>
/// Creates a checkpoint for a given sequence point.
/// </summary>
public Task<BoolResult> CreateCheckpointAsync(OperationContext context, EventSequencePoint sequencePoint)
{
    context = context.CreateNested();

    string checkpointId = "Unknown";
    long checkpointSize = 0;
    return context.PerformOperationAsync(
        _tracer,
        async () =>
        {
            bool successfullyUpdatedIncrementalState = false;
            try
            {
                // Creating a working temporary directory
                using (new DisposableDirectory(_fileSystem, _checkpointStagingDirectory))
                {
                    // NOTE(jubayard): this needs to be done before checkpointing, because we always
                    // fetch the latest version's size in this way. This implies there may be some difference
                    // between the reported value and the actual size on disk: updates may land in between.
                    // The better alternative is to actually open the checkpoint and ask, but that seems like too much.
                    checkpointSize = _database.GetContentDatabaseSizeBytes().GetValueOrDefault(-1);

                    // Saving checkpoint for the database into the temporary folder
                    _database.SaveCheckpoint(context, _checkpointStagingDirectory).ThrowIfFailure();

                    if (_configuration.UseIncrementalCheckpointing)
                    {
                        checkpointId = await CreateCheckpointIncrementalAsync(context, sequencePoint);
                        successfullyUpdatedIncrementalState = true;
                    }
                    else
                    {
                        checkpointId = await CreateFullCheckpointAsync(context, sequencePoint);
                    }

                    return BoolResult.Success;
                }
            }
            finally
            {
                ClearIncrementalCheckpointStateIfNeeded(context, successfullyUpdatedIncrementalState);
            }
        },
        extraStartMessage: $"SequencePoint=[{sequencePoint}]",
        extraEndMessage: result => $"SequencePoint=[{sequencePoint}] Id=[{checkpointId}] SizeMb=[{(checkpointSize < 0 ? checkpointSize : checkpointSize * 1e-6)}]");
}
protected override async Task<BoolResult> StartupCoreAsync(OperationContext context)
{
    // A potential error is already traced by the Calibrate method. Ignore the result to avoid double reporting.
    await CalibrateAllAsync(context).IgnoreFailure();

    // Processing requests is a long-running operation. Schedule it onto a dedicated thread to avoid thread pool exhaustion.
    _processReserveRequestsTask = Task.Factory.StartNew(
        () => ProcessReserveRequestsAsync(context.CreateNested(nameof(QuotaKeeper))),
        TaskCreationOptions.LongRunning).Unwrap();

    // Start purging immediately on startup to clear out residual content in the cache
    // over the cache quota, if configured.
    const string Operation = "PurgeRequest";
    SendPurgeRequest(context, "Startup").FireAndForget(context, Operation);

    return BoolResult.Success;
}
private async Task ProcessEventsAsync(OperationContext context, List<EventData> messages)
{
    // Creating a nested context for all the processing operations.
    context = context.CreateNested();

    string asyncProcessing = _eventProcessingBlocks != null ? "on" : "off";
    Tracer.Info(context, $"{Tracer.Name}: Received {messages.Count} events from Event Hub. Async processing is '{asyncProcessing}'.");

    if (messages.Count == 0)
    {
        // This probably does not actually occur, but just in case, ignore the empty message batch.
        // NOTE: We do this after logging to ensure we notice if we are getting empty message batches.
        return;
    }

    var state = new SharedEventProcessingState(context, this, messages);

    if (_eventProcessingBlocks != null)
    {
        await context
            .CreateOperation(Tracer, () => sendToActionBlockAsync())
            .WithOptions(traceOperationStarted: false, endMessageFactory: r => $"TotalQueueSize={Interlocked.Read(ref _queueSize)}")
            .RunAsync(caller: "SendToActionBlockAsync")
            .TraceIfFailure(context);
    }
    else
    {
        await ProcessEventsCoreAsync(new ProcessEventsInput(state, messages, actionBlockIndex: -1, store: this), EventDataSerializer);
    }

    async Task<BoolResult> sendToActionBlockAsync()
    {
        // This local function "sends" a message into an action block based on the sender's hash code, so that events from different machines are processed in parallel.
        // (Keep in mind that data from the same machine must be processed sequentially, because event order matters.)
        // Then it creates a local counter for each processing operation to track the results for the entire batch.
        foreach (var messageGroup in messages.GroupBy(GetProcessingIndex))
        {
            int actionBlockIndex = messageGroup.Key;
            var eventProcessingBlock = _eventProcessingBlocks![actionBlockIndex];
protected override Task<BoolResult> StartupCoreAsync(OperationContext context)
{
    var bindAddress = _configuration.BindAddress;
    context.TraceInfo($"gRPC service binding on address {bindAddress}:{_configuration.Port}", component: nameof(RoxisGrpcService));

    _grpcServer = new GrpcCore.Server(GrpcEnvironment.GetServerOptions())
    {
        // TODO: perhaps multi-bind for the frontend + backend scenario?
        Ports = { new ServerPort(bindAddress, _configuration.Port, ServerCredentials.Insecure) },
        RequestCallTokensPerCompletionQueue = _configuration.RequestCallTokensPerCompletionQueue,
    };

    var metadataServiceGrpcAdapter = new RoxisGrpcAdapter(
        context.CreateNested(nameof(RoxisGrpcService)),
        _service);

    var serviceDefinition = Grpc.RoxisService.BindService(metadataServiceGrpcAdapter);
    _grpcServer.Services.Add(serviceDefinition);
    _grpcServer.Start();

    return BoolResult.SuccessTask;
}
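The snippet only covers startup. A possible shutdown counterpart, assuming the same _grpcServer field; Grpc.Core's Server exposes ShutdownAsync to stop accepting new calls and drain in-flight ones. This is a sketch, not the actual Roxis implementation:

// Sketch of a matching ShutdownCoreAsync.
protected override async Task<BoolResult> ShutdownCoreAsync(OperationContext context)
{
    if (_grpcServer != null)
    {
        // Stops listening and waits for in-flight calls to complete.
        await _grpcServer.ShutdownAsync();
    }

    return BoolResult.Success;
}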
/// <summary>
/// Creates a checkpoint for a given sequence point.
/// </summary>
public Task<BoolResult> CreateCheckpointAsync(OperationContext context, EventSequencePoint sequencePoint)
{
    context = context.CreateNested();
    return context.PerformOperationAsync(
        _tracer,
        async () =>
        {
            bool successfullyUpdatedIncrementalState = false;
            try
            {
                // Creating a working temporary directory
                using (new DisposableDirectory(_fileSystem, _checkpointStagingDirectory))
                {
                    // Saving checkpoint for the database into the temporary folder
                    _database.SaveCheckpoint(context, _checkpointStagingDirectory).ThrowIfFailure();

                    if (_configuration.UseIncrementalCheckpointing)
                    {
                        await CreateCheckpointIncrementalAsync(context, sequencePoint);
                        successfullyUpdatedIncrementalState = true;
                    }
                    else
                    {
                        await CreateFullCheckpointAsync(context, sequencePoint);
                    }

                    return BoolResult.Success;
                }
            }
            finally
            {
                ClearIncrementalCheckpointStateIfNeeded(context, successfullyUpdatedIncrementalState);
            }
        },
        extraStartMessage: $"SequencePoint=[{sequencePoint}]",
        extraEndMessage: _ => $"SequencePoint=[{sequencePoint}]");
}
/// <inheritdoc />
protected override async Task<BoolResult> StartupCoreAsync(OperationContext context)
{
    // NOTE: We create and start the content location store before the inner content store just in case the
    // inner content store starts background eviction after startup. We need the content store to be initialized
    // so that it can be queried and used to unregister content.
    await _contentLocationStoreFactory.StartupAsync(context).ThrowIfFailure();

    _contentLocationStore = await _contentLocationStoreFactory.CreateAsync(LocalMachineLocation, InnerContentStore as ILocalContentStore);

    // Initializing the inner store before initializing LocalLocationStore because
    // LocalLocationStore may use the inner store for reconciliation purposes.
    await InnerContentStore.StartupAsync(context).ThrowIfFailure();

    await _contentLocationStore.StartupAsync(context).ThrowIfFailure();

    if (_settings.EnableProactiveReplication
        && _contentLocationStore is TransitioningContentLocationStore tcs
        && InnerContentStore is ILocalContentStore localContentStore)
    {
        await ProactiveReplicationAsync(context.CreateNested(nameof(DistributedContentStore<T>)), localContentStore, tcs).ThrowIfFailure();
    }

    return BoolResult.Success;
}
private async Task ProcessEventsAsync(OperationContext context, List<EventData> messages)
{
    // Creating a nested context for all the processing operations.
    context = context.CreateNested();

    var sw = Stopwatch.StartNew();

    string asyncProcessing = _eventProcessingBlocks != null ? "on" : "off";
    Tracer.Info(context, $"{Tracer.Name}: Received {messages.Count} events from Event Hub. Async processing is '{asyncProcessing}'.");

    if (_eventProcessingBlocks != null)
    {
        // Creating a nested context to correlate all the processing operations.
        context = context.CreateNested();

        SendToActionBlockResult result = await context.PerformOperationAsync(
            Tracer,
            () => sendToActionBlockAsync(),
            traceOperationStarted: false).TraceIfFailure(context);

        printOperationResultsAsynchronously(result);
    }
    else
    {
        await ProcessEventsCoreAsync(new ProcessEventsInput(context, messages, new OperationCounters(), processingFinishedTaskSource: null), EventDataSerializer);
    }

    void printOperationResultsAsynchronously(SendToActionBlockResult results)
    {
        if (results)
        {
            Task.WhenAll(results.Value).ContinueWith(
                t =>
                {
                    var eventStoreCounters = t.GetAwaiter().GetResult()
                        .Select(c => c.EventStoreCounters)
                        .Aggregate((collection, counterCollection) => collection + counterCollection);

                    int duration = (int)sw.ElapsedMilliseconds;
                    context.LogProcessEventsOverview(eventStoreCounters, duration);
                }).IgnoreErrors();
        }
    }

    async Task<SendToActionBlockResult> sendToActionBlockAsync()
    {
        // This local function "sends" a message into an action block based on the sender's hash code, so that events from different machines are processed in parallel.
        // (Keep in mind that data from the same machine must be processed sequentially, because event order matters.)
        // Then it creates a local counter for each processing operation to track the results for the entire batch.
        var operationTasks = new List<Task<OperationCounters>>();
        foreach (var messageGroup in messages.GroupBy(GetProcessingIndex))
        {
            var eventProcessingBlock = _eventProcessingBlocks[messageGroup.Key];
            var input = ProcessEventsInput.Create(context, messageGroup);
            bool success = await eventProcessingBlock.SendAsync(input);
            if (!success)
            {
                return new SendToActionBlockResult("Failed to add message to an action block.");
            }

            Contract.Assert(input.ProcessingFinishedTaskSource != null);
            operationTasks.Add(input.ProcessingFinishedTaskSource.Value.Task);
        }

        return new SendToActionBlockResult(operationTasks);
    }
}
/// <summary>
/// Creates a checkpoint for a given sequence point.
/// </summary>
public Task<BoolResult> CreateCheckpointAsync(OperationContext context, EventSequencePoint sequencePoint)
{
    context = context.CreateNested(nameof(CheckpointManager));

    string checkpointId = "Unknown";

    double contentColumnFamilySizeMb = -1;
    double contentDataSizeMb = -1;
    double metadataColumnFamilySizeMb = -1;
    double metadataDataSizeMb = -1;
    double sizeOnDiskMb = -1;

    return context.PerformOperationAsync(
        _tracer,
        async () =>
        {
            bool successfullyUpdatedIncrementalState = false;
            try
            {
                // Creating a working temporary directory
                using (new DisposableDirectory(_fileSystem, _checkpointStagingDirectory))
                {
                    // Write out the time this checkpoint was generated to the database. This will be used by
                    // the workers to determine whether they should restore or not after a restart. The
                    // checkpoint id is generated inside the upload methods, so we only generate the guid here.
                    // Since this is only used for reporting purposes, there's no harm in it.
                    var checkpointGuid = Guid.NewGuid();
                    DatabaseWriteCheckpointCreationTime(context, checkpointGuid.ToString(), DateTime.UtcNow);

                    // NOTE(jubayard): this needs to be done before checkpointing, because we always
                    // fetch the latest version's size in this way. This implies there may be some difference
                    // between the reported value and the actual size on disk: updates may land in between.
                    // The better alternative is to actually open the checkpoint and ask, but that seems like too much.
                    if (_database is RocksDbContentLocationDatabase rocksDb)
                    {
                        contentColumnFamilySizeMb = rocksDb.GetLongProperty(
                            RocksDbContentLocationDatabase.LongProperty.LiveFilesSizeBytes,
                            RocksDbContentLocationDatabase.Entity.ContentTracking).Select(x => x * 1e-6).GetValueOrDefault(-1);

                        contentDataSizeMb = rocksDb.GetLongProperty(
                            RocksDbContentLocationDatabase.LongProperty.LiveDataSizeBytes,
                            RocksDbContentLocationDatabase.Entity.ContentTracking).Select(x => x * 1e-6).GetValueOrDefault(-1);

                        metadataColumnFamilySizeMb = rocksDb.GetLongProperty(
                            RocksDbContentLocationDatabase.LongProperty.LiveFilesSizeBytes,
                            RocksDbContentLocationDatabase.Entity.Metadata).Select(x => x * 1e-6).GetValueOrDefault(-1);

                        metadataDataSizeMb = rocksDb.GetLongProperty(
                            RocksDbContentLocationDatabase.LongProperty.LiveDataSizeBytes,
                            RocksDbContentLocationDatabase.Entity.Metadata).Select(x => x * 1e-6).GetValueOrDefault(-1);
                    }

                    // Saving checkpoint for the database into the temporary folder
                    _database.SaveCheckpoint(context, _checkpointStagingDirectory).ThrowIfFailure();

                    try
                    {
                        sizeOnDiskMb = _fileSystem
                            .EnumerateFiles(_checkpointStagingDirectory, EnumerateOptions.Recurse)
                            .Sum(fileInfo => fileInfo.Length) * 1e-6;
                    }
                    catch (IOException e)
                    {
                        _tracer.Error(context, $"Error counting size of checkpoint's staging directory `{_checkpointStagingDirectory}`: {e}");
                    }

                    if (_configuration.UseIncrementalCheckpointing)
                    {
                        checkpointId = await CreateCheckpointIncrementalAsync(context, sequencePoint, checkpointGuid);
                        successfullyUpdatedIncrementalState = true;
                    }
                    else
                    {
                        checkpointId = await CreateFullCheckpointAsync(context, sequencePoint, checkpointGuid);
                    }

                    return BoolResult.Success;
                }
            }
            finally
            {
                ClearIncrementalCheckpointStateIfNeeded(context, successfullyUpdatedIncrementalState);
            }
        },
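The four GetLongProperty calls above differ only in the property and column family being queried. A small helper sketch that would factor out the bytes-to-megabytes conversion, using only signatures already visible in the snippet (the helper itself is hypothetical, not part of the original code):

// Hypothetical helper: converts a RocksDB size property to MB, returning -1 when the property is unavailable.
private static double GetSizeMb(
    RocksDbContentLocationDatabase rocksDb,
    RocksDbContentLocationDatabase.LongProperty property,
    RocksDbContentLocationDatabase.Entity entity)
{
    return rocksDb.GetLongProperty(property, entity).Select(x => x * 1e-6).GetValueOrDefault(-1);
}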
/// <inheritdoc />
protected override async Task<BoolResult> StartupCoreAsync(OperationContext context)
{
    // NOTE: We create and start the content location store before the inner content store just in case the
    // inner content store starts background eviction after startup. We need the content store to be initialized
    // so that it can be queried and used to unregister content.
    await _contentLocationStoreFactory.StartupAsync(context).ThrowIfFailure();

    _contentLocationStore = await _contentLocationStoreFactory.CreateAsync(LocalMachineLocation);

    _distributedCopier = _distributedCopierFactory(_contentLocationStore);
    await _distributedCopier.StartupAsync(context).ThrowIfFailure();

    if (_contentLocationStore is TransitioningContentLocationStore tcs && tcs.IsLocalLocationStoreEnabled)
    {
        // Define the proactive copy logic.
        async Task<ResultBase> proactiveCopyTaskFactory(OperationContext operationContext, ContentHash hash)
        {
            var sessionResult = await _proactiveCopySession.Value;
            if (sessionResult)
            {
                return await sessionResult.Value.ProactiveCopyIfNeededAsync(operationContext, hash, tryBuildRing: false);
            }

            return new BoolResult("Failed to retrieve session for proactive copies.");
        }

        tcs.LocalLocationStore.PreStartupInitialize(context, InnerContentStore as ILocalContentStore, _distributedCopier, proactiveCopyTaskFactory);
    }

    // Initializing the inner store before initializing LocalLocationStore because
    // LocalLocationStore may use the inner store for reconciliation purposes.
    await InnerContentStore.StartupAsync(context).ThrowIfFailure();

    await _contentLocationStore.StartupAsync(context).ThrowIfFailure();

    Func<ContentHash[], Task> evictionHandler;
    var localContext = context.CreateNested();
    if (_enableDistributedEviction)
    {
        evictionHandler = hashes => EvictContentAsync(localContext, hashes);
    }
    else
    {
        evictionHandler = hashes => DistributedGarbageCollectionAsync(localContext, hashes);
    }

    // The queue is created in an unstarted state because the eviction function
    // requires the context passed at startup, so we start the queue here.
    _evictionNagleQueue.Start(evictionHandler);

    var touchContext = context.CreateNested();
    _touchNagleQueue = NagleQueue<ContentHashWithSize>.Create(
        hashes => TouchBulkAsync(touchContext, hashes),
        Redis.RedisContentLocationStoreConstants.BatchDegreeOfParallelism,
        Redis.RedisContentLocationStoreConstants.BatchInterval,
        batchSize: _locationStoreBatchSize);

    return BoolResult.Success;
}
/// <summary>
/// Restores the checkpoint for a given checkpoint state.
/// </summary>
public Task<BoolResult> RestoreCheckpointAsync(OperationContext context, CheckpointState checkpointState)
{
    context = context.CreateNested(nameof(CheckpointManager));

    var checkpointId = checkpointState.CheckpointId;
    return context.PerformOperationWithTimeoutAsync(
        _tracer,
        async nestedContext =>
        {
            bool successfullyUpdatedIncrementalState = false;
            try
            {
                bool isIncrementalCheckpoint = false;
                var checkpointFileExtension = ".zip";
                if (checkpointId.EndsWith(IncrementalCheckpointIdSuffix, StringComparison.OrdinalIgnoreCase))
                {
                    isIncrementalCheckpoint = true;
                    checkpointFileExtension = ".txt";

                    // Remove the suffix to get the real checkpoint id used with central storage.
                    checkpointId = checkpointId.Substring(0, checkpointId.Length - IncrementalCheckpointIdSuffix.Length);
                }

                var checkpointFile = _checkpointStagingDirectory / $"chkpt{checkpointFileExtension}";
                var extractedCheckpointDirectory = _checkpointStagingDirectory / "chkpt";

                FileUtilities.DeleteDirectoryContents(_checkpointStagingDirectory.ToString());
                FileUtilities.DeleteDirectoryContents(extractedCheckpointDirectory.ToString());

                // Creating a working temporary folder
                using (new DisposableDirectory(_fileSystem, _checkpointStagingDirectory))
                {
                    // Getting the checkpoint from the central store
                    await _storage.TryGetFileAsync(nestedContext, checkpointId, checkpointFile, isImmutable: true).ThrowIfFailure();

                    if (isIncrementalCheckpoint)
                    {
                        var incrementalRestoreResult = await RestoreCheckpointIncrementalAsync(nestedContext, checkpointFile, extractedCheckpointDirectory);
                        incrementalRestoreResult.ThrowIfFailure();
                    }
                    else
                    {
                        RestoreFullCheckpoint(checkpointFile, extractedCheckpointDirectory);
                    }

                    // Restoring the checkpoint
                    _database.RestoreCheckpoint(nestedContext, extractedCheckpointDirectory).ThrowIfFailure();

                    // Save the latest checkpoint info to a file in case we get restarted and want to know about the previous checkpoint.
                    WriteLatestCheckpoint(nestedContext, checkpointState);

                    successfullyUpdatedIncrementalState = true;
                    return BoolResult.Success;
                }
            }
            finally
            {
                ClearIncrementalCheckpointStateIfNeeded(nestedContext, successfullyUpdatedIncrementalState);
            }
        },
        extraStartMessage: $"CheckpointId=[{checkpointId}]",
        extraEndMessage: _ => $"CheckpointId=[{checkpointId}]",
        timeout: _configuration.RestoreCheckpointTimeout);
}
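A minimal caller sketch for the method above, assuming a CheckpointManager field named _checkpointManager and a CheckpointState obtained elsewhere (both names are illustrative); error handling follows the BoolResult pattern used throughout these snippets:

// Hypothetical usage.
var restoreResult = await _checkpointManager.RestoreCheckpointAsync(context, checkpointState);
if (!restoreResult.Succeeded)
{
    Tracer.Debug(context, $"Checkpoint restore failed: {restoreResult}");
}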