/// <summary>
/// Handles a split of the given partition: creates a lease for every child partition (seeded with the
/// parent's continuation token) and then deletes the parent's lease.
/// </summary>
/// <param name="partitionKeyRangeId">The id of the partition that was split, aka the parent partition.</param>
/// <param name="continuationToken">Continuation token of the split partition before the split.</param>
/// <param name="leaseId">The id of the lease. This is needed to avoid an extra call to ILeaseManager to get the lease by partitionId.</param>
/// <returns>True on success, false when fewer than two child partitions could be found.</returns>
private async Task<bool> HandleSplitAsync(string partitionKeyRangeId, string continuationToken, string leaseId)
{
    Debug.Assert(!string.IsNullOrEmpty(partitionKeyRangeId));
    Debug.Assert(!string.IsNullOrEmpty(leaseId));

    TraceLog.Informational(string.Format("Partition {0} is gone due to split, continuation '{1}'", partitionKeyRangeId, continuationToken));

    // Child partitions are the ranges that list the split partition among their parents.
    List<PartitionKeyRange> allRanges = await this.EnumPartitionKeyRangesAsync(this.collectionSelfLink);
    var childRanges = new List<PartitionKeyRange>(allRanges.Where(range => range.Parents.Contains(partitionKeyRangeId)));
    if (childRanges.Count < 2)
    {
        // The format argument must be supplied: a "{0}" placeholder without an argument makes
        // string.Format throw FormatException, which would hide this failure from the caller.
        TraceLog.Error(string.Format("Partition {0} had split but we failed to find at least 2 child partitions.", partitionKeyRangeId));
        return false;
    }

    // Start creating all child leases, then wait for all of them together.
    var tasks = new List<Task>();
    foreach (var childRange in childRanges)
    {
        tasks.Add(this.leaseManager.CreateLeaseIfNotExistAsync(childRange.Id, continuationToken));
        TraceLog.Informational(string.Format("Creating lease for partition '{0}' as child of partition '{1}', continuation '{2}'", childRange.Id, partitionKeyRangeId, continuationToken));
    }

    await Task.WhenAll(tasks);

    // The parent lease is removed only after every child lease creation has completed.
    await this.leaseManager.DeleteAsync(new DocumentServiceLease { Id = leaseId });
    TraceLog.Informational(string.Format("Deleted lease for gone (splitted) partition '{0}' continuation '{1}'", partitionKeyRangeId, continuationToken));

    // Note: the rest is up to lease taker, that after waking up would consume these new leases.
    return true;
}
/// <summary>
/// Called when this host releases a partition: cancels the partition's worker, closes its observer,
/// waits for the worker task to finish and removes the worker entry from the worker map.
/// </summary>
/// <param name="l">The lease of the partition being released.</param>
/// <param name="reason">The reason passed on to the observer's CloseAsync.</param>
async Task IPartitionObserver<DocumentServiceLease>.OnPartitionReleasedAsync(DocumentServiceLease l, ChangeFeedObserverCloseReason reason)
{
#if DEBUG
    Interlocked.Decrement(ref this.partitionCount);
#endif

    TraceLog.Informational(string.Format("Host '{0}' releasing partition {1}...", this.HostName, l.PartitionId));

    WorkerData workerData = null;
    if (this.partitionKeyRangeIdToWorkerMap.TryGetValue(l.PartitionId, out workerData))
    {
        workerData.Cancellation.Cancel();

        try
        {
            await workerData.Observer.CloseAsync(workerData.Context, reason);
        }
        catch (Exception ex)
        {
            // Eat all client exceptions.
            TraceLog.Error(string.Format("IChangeFeedObserver.CloseAsync: exception: {0}", ex));
        }

        await workerData.Task;
        this.partitionKeyRangeIdToWorkerMap.TryRemove(l.PartitionId, out workerData);
    }

    // Log the id from the lease rather than from workerData: workerData is null when no worker was
    // registered for this partition (TryGetValue failed) or when TryRemove did not find the entry.
    TraceLog.Informational(string.Format("Host '{0}' partition {1}: released!", this.HostName, l.PartitionId));
}
/// <summary>
/// Checkpoints the given lease at the given continuation through the checkpoint manager,
/// using the lease's sequence number incremented by one.
/// </summary>
/// <param name="lease">The lease to checkpoint.</param>
/// <param name="continuation">The continuation token to store.</param>
/// <param name="context">Observer context; its partition id is used in failure traces.</param>
/// <returns>The lease returned by the checkpoint manager.</returns>
/// <remarks>Any exception from the checkpoint manager is traced and rethrown unchanged.</remarks>
async Task<DocumentServiceLease> CheckpointAsync(DocumentServiceLease lease, string continuation, ChangeFeedObserverContext context)
{
    Debug.Assert(lease != null);
    Debug.Assert(!string.IsNullOrEmpty(continuation));

    DocumentServiceLease updatedLease = null;
    try
    {
        var nextSequenceNumber = lease.SequenceNumber + 1;
        var checkpointed = await this.checkpointManager.CheckpointAsync(lease, continuation, nextSequenceNumber);
        updatedLease = (DocumentServiceLease)checkpointed;

        Debug.Assert(updatedLease.ContinuationToken == continuation, "ContinuationToken was not updated!");
        TraceLog.Informational(string.Format("Checkpoint: partition {0}, new continuation '{1}'", lease.PartitionId, continuation));
    }
    catch (LeaseLostException)
    {
        // Losing the lease is an expected condition, hence only a warning.
        TraceLog.Warning(string.Format("Partition {0}: failed to checkpoint due to lost lease", context.PartitionKeyRangeId));
        throw;
    }
    catch (Exception ex)
    {
        TraceLog.Error(string.Format("Partition {0}: failed to checkpoint due to unexpected error: {1}", context.PartitionKeyRangeId, ex.Message));
        throw;
    }

    Debug.Assert(updatedLease != null);
    return updatedLease;
}
/// <summary>
/// Extracts the "documentsCount" value from the "x-ms-resource-usage" response header,
/// which is read as a ';'-separated list of name=value pairs.
/// </summary>
/// <param name="response">The collection response whose headers are inspected.</param>
/// <returns>
/// The parsed document count, or -1 when the header is missing, has no non-empty
/// documentsCount entry, or the first such entry cannot be parsed as Int64.
/// </returns>
internal static Int64 GetDocumentCount(ResourceResponse<DocumentCollection> response)
{
    Debug.Assert(response != null);

    var usageHeader = response.ResponseHeaders["x-ms-resource-usage"];
    if (usageHeader == null)
    {
        return -1;
    }

    foreach (var entry in usageHeader.Split(';'))
    {
        var keyValue = entry.Split('=');
        bool isDocumentsCount = keyValue.Length > 1
            && string.Equals(keyValue[0], "documentsCount", StringComparison.OrdinalIgnoreCase)
            && !string.IsNullOrEmpty(keyValue[1]);
        if (!isDocumentsCount)
        {
            continue;
        }

        Int64 count;
        if (Int64.TryParse(keyValue[1], out count))
        {
            return count;
        }

        // Only the first documentsCount entry is considered; give up if it is malformed.
        TraceLog.Error(string.Format("Failed to get document count from response, can't Int64.TryParse('{0}')", entry));
        return -1;
    }

    return -1;
}
/// <summary>
/// Asynchronously checks the current existing leases and calculates an estimate of remaining work per leased partitions.
/// </summary>
/// <remarks>
/// For every lease one change feed page (MaxItemCount = 1) is read from the lease's continuation. The
/// partition's estimate is the number parsed from the response session token minus the LSN of the first
/// returned document, floored at zero. A partition whose read fails with a DocumentClientException is
/// traced and contributes nothing to the total.
/// </remarks>
/// <returns>An estimate amount of remaining documents to be processed</returns>
public async Task<long> GetEstimatedRemainingWork()
{
    await this.InitializeAsync();

    long remaining = 0;
    ChangeFeedOptions options = new ChangeFeedOptions { MaxItemCount = 1 };

    foreach (DocumentServiceLease existingLease in await this.leaseManager.ListLeases())
    {
        options.PartitionKeyRangeId = existingLease.PartitionId;
        options.RequestContinuation = existingLease.ContinuationToken;
        IDocumentQuery<Document> query = this.documentClient.CreateDocumentChangeFeedQuery(this.collectionSelfLink, options);

        try
        {
            FeedResponse<Document> response = await query.ExecuteNextAsync<Document>();
            long parsedLSNFromSessionToken = TryConvertToNumber(ParseAmountFromSessionToken(response.SessionToken));

            // With no documents returned the two values are equal, so the partition adds zero.
            long lastSequenceNumber = response.Count > 0
                ? TryConvertToNumber(response.First().GetPropertyValue<string>(LSNPropertyName))
                : parsedLSNFromSessionToken;

            long partitionRemaining = parsedLSNFromSessionToken - lastSequenceNumber;
            remaining += partitionRemaining < 0 ? 0 : partitionRemaining;
        }
        catch (DocumentClientException ex)
        {
            StatusCode statusCode = (StatusCode)ex.StatusCode;

            if ((StatusCode.NotFound == statusCode && SubStatusCode.ReadSessionNotAvailable != (SubStatusCode)GetSubStatusCode(ex)) ||
                StatusCode.Gone == statusCode)
            {
                // We are not explicitly handling Splits here to avoid any collision with an Observer that might have picked this up and managing the split
                TraceLog.Error(string.Format("GetEstimateWork > Partition {0}: resource gone (subStatus={1}).", existingLease.PartitionId, GetSubStatusCode(ex)));
            }
            else if (StatusCode.TooManyRequests == statusCode || StatusCode.ServiceUnavailable == statusCode)
            {
                TraceLog.Warning(string.Format("GetEstimateWork > Partition {0}: retriable exception : {1}", existingLease.PartitionId, ex.Message));
            }
            else
            {
                // Report the partition id in the partition placeholder and the exception message separately.
                // ex.Message is used instead of ex.Error.Message so a missing Error object cannot make the
                // logging itself throw.
                TraceLog.Error(string.Format("GetEstimateWork > Partition {0}: Unhandled exception: {1}", existingLease.PartitionId, ex.Message));
            }
        }
    }

    return remaining;
}
/// <summary>
/// Checkpoints the partition identified by the observer context at the given continuation and
/// stores the resulting lease back on the partition's worker data.
/// </summary>
/// <param name="continuation">The continuation token to checkpoint; must not be null or empty.</param>
/// <param name="context">The observer context; must carry a non-empty PartitionKeyRangeId.</param>
/// <exception cref="ArgumentException">continuation or context.PartitionKeyRangeId is null or empty.</exception>
/// <exception cref="ArgumentNullException">context is null.</exception>
/// <exception cref="LeaseLostException">
/// No worker data or no lease is registered for the partition, or the partition's worker has been cancelled.
/// </exception>
internal async Task CheckpointAsync(string continuation, ChangeFeedObserverContext context)
{
    if (string.IsNullOrEmpty(continuation))
    {
        throw new ArgumentException("continuation");
    }

    if (context == null)
    {
        throw new ArgumentNullException("context");
    }

    string partitionId = context.PartitionKeyRangeId;
    if (string.IsNullOrEmpty(partitionId))
    {
        throw new ArgumentException("context.PartitionKeyRangeId");
    }

    WorkerData worker;
    bool hasWorker = this.partitionKeyRangeIdToWorkerMap.TryGetValue(partitionId, out worker) && worker != null;
    if (!hasWorker)
    {
        TraceLog.Warning(string.Format("CheckpointAsync: called at wrong time, failed to get worker data for partition {0}. Most likely the partition is not longer owned by this host.", partitionId));
        throw new LeaseLostException(string.Format("Failed to find lease for partition {0} in the set of owned leases.", partitionId));
    }

    if (worker.Lease == null)
    {
        TraceLog.Error(string.Format("CheckpointAsync: found the worker data but lease is null, for partition {0}. This should never happen.", partitionId));
        throw new LeaseLostException(string.Format("Failed to find lease for partition {0}.", partitionId));
    }

    // CheckpointInProgress is acquired here and released in the finally block, so the checkpoint below
    // runs between a WaitAsync/Release pair (presumably a semaphore serializing checkpoints - confirm on WorkerData).
    await worker.CheckpointInProgress.WaitAsync();
    try
    {
        if (worker.Cancellation.IsCancellationRequested)
        {
            TraceLog.Warning(string.Format("CheckpointAsync: called at wrong time, partition {0} is shutting down. The ownership of the partition by this host is about to end.", partitionId));
            throw new LeaseLostException(string.Format("CheckpointAsync: partition {0} is shutting down.", partitionId));
        }

        worker.Lease = await this.CheckpointAsync(worker.Lease, continuation, context);
    }
    finally
    {
        worker.CheckpointInProgress.Release();
    }
}
/// <summary>
/// Called when this host acquires a partition lease: creates an observer for the partition, starts a worker
/// that reads the partition's change feed and delivers it to the observer, and registers the worker in the
/// worker map under the partition id.
/// </summary>
/// <remarks>
/// The worker loops until the host is shut down, the worker is cancelled, or an unrecoverable error occurs.
/// Checkpoints are written only when IsCheckpointNeeded says so. If the worker ends with a close reason,
/// it schedules the release of the partition on a separate task.
/// </remarks>
/// <param name="lease">The acquired lease; its continuation token (if any) is where reading resumes.</param>
async Task IPartitionObserver<DocumentServiceLease>.OnPartitionAcquiredAsync(DocumentServiceLease lease)
{
    Debug.Assert(lease != null && !string.IsNullOrEmpty(lease.Owner), "lease");
    TraceLog.Informational(string.Format("Host '{0}' partition {1}: acquired!", this.HostName, lease.PartitionId));

#if DEBUG
    Interlocked.Increment(ref this.partitionCount);
#endif

    IChangeFeedObserver observer = this.observerFactory.CreateObserver();
    ChangeFeedObserverContext context = new ChangeFeedObserverContext { PartitionKeyRangeId = lease.PartitionId };
    CancellationTokenSource cancellation = new CancellationTokenSource();

    // Create ChangeFeedOptions to use for this worker.
    ChangeFeedOptions options = new ChangeFeedOptions
    {
        MaxItemCount = this.changeFeedOptions.MaxItemCount,
        PartitionKeyRangeId = this.changeFeedOptions.PartitionKeyRangeId,
        SessionToken = this.changeFeedOptions.SessionToken,
        StartFromBeginning = this.changeFeedOptions.StartFromBeginning,
        RequestContinuation = this.changeFeedOptions.RequestContinuation
    };

    // StartNew with an async lambda yields a Task<Task>; awaiting it only waits for the worker to be
    // started and returns the inner task, which represents the whole lifetime of the worker.
    var workerTask = await Task.Factory.StartNew(async () =>
    {
        ChangeFeedObserverCloseReason? closeReason = null;
        try
        {
            try
            {
                await observer.OpenAsync(context);
            }
            catch (Exception ex)
            {
                TraceLog.Error(string.Format("IChangeFeedObserver.OpenAsync exception: {0}", ex));
                closeReason = ChangeFeedObserverCloseReason.ObserverError;
                throw;
            }

            options.PartitionKeyRangeId = lease.PartitionId;
            if (!string.IsNullOrEmpty(lease.ContinuationToken))
            {
                options.RequestContinuation = lease.ContinuationToken;
            }

            CheckpointStats checkpointStats = null;
            if (!this.statsSinceLastCheckpoint.TryGetValue(lease.PartitionId, out checkpointStats) || checkpointStats == null)
            {
                // It could be that the lease was created by different host and we picked it up.
                // The update factory must not hand back a null entry: that is one of the two cases that
                // bring us here, and the stats are dereferenced unconditionally further down.
                checkpointStats = this.statsSinceLastCheckpoint.AddOrUpdate(
                    lease.PartitionId,
                    new CheckpointStats(),
                    (partitionId, existingStats) => existingStats ?? new CheckpointStats());
                TraceLog.Warning(string.Format("Added stats for partition '{0}' for which the lease was picked up after the host was started.", lease.PartitionId));
            }

            IDocumentQuery<Document> query = this.documentClient.CreateDocumentChangeFeedQuery(this.collectionSelfLink, options);

            TraceLog.Verbose(string.Format("Worker start: partition '{0}', continuation '{1}'", lease.PartitionId, lease.ContinuationToken));

            // Continuation of the last successful read; handed to the child leases if the partition splits.
            string lastContinuation = options.RequestContinuation;

            try
            {
                while (this.isShutdown == 0)
                {
                    do
                    {
                        ExceptionDispatchInfo exceptionDispatchInfo = null;
                        FeedResponse<Document> response = null;

                        try
                        {
                            response = await query.ExecuteNextAsync<Document>();
                            lastContinuation = response.ResponseContinuation;
                        }
                        catch (DocumentClientException ex)
                        {
                            // Captured so that it can be rethrown below with its original stack trace.
                            exceptionDispatchInfo = ExceptionDispatchInfo.Capture(ex);
                        }

                        if (exceptionDispatchInfo != null)
                        {
                            DocumentClientException dcex = (DocumentClientException)exceptionDispatchInfo.SourceException;

                            if (StatusCode.NotFound == (StatusCode)dcex.StatusCode && SubStatusCode.ReadSessionNotAvailable != (SubStatusCode)GetSubStatusCode(dcex))
                            {
                                // Most likely, the database or collection was removed while we were enumerating.
                                // Shut down. The user will need to start over.
                                // Note: this has to be a new task, can't await for shutdown here, as shutdown awaits for all worker tasks.
                                TraceLog.Error(string.Format("Partition {0}: resource gone (subStatus={1}). Aborting.", context.PartitionKeyRangeId, GetSubStatusCode(dcex)));
                                await Task.Factory.StartNew(() => this.StopAsync(ChangeFeedObserverCloseReason.ResourceGone));
                                break;
                            }
                            else if (StatusCode.Gone == (StatusCode)dcex.StatusCode)
                            {
                                SubStatusCode subStatusCode = (SubStatusCode)GetSubStatusCode(dcex);
                                if (SubStatusCode.PartitionKeyRangeGone == subStatusCode)
                                {
                                    bool isSuccess = await HandleSplitAsync(context.PartitionKeyRangeId, lastContinuation, lease.Id);
                                    if (!isSuccess)
                                    {
                                        TraceLog.Error(string.Format("Partition {0}: HandleSplit failed! Aborting.", context.PartitionKeyRangeId));
                                        await Task.Factory.StartNew(() => this.StopAsync(ChangeFeedObserverCloseReason.ResourceGone));
                                        break;
                                    }

                                    // Throw LeaseLostException so that we take the lease down.
                                    throw new LeaseLostException(lease, exceptionDispatchInfo.SourceException, true);
                                }
                                else if (SubStatusCode.Splitting == subStatusCode)
                                {
                                    TraceLog.Warning(string.Format("Partition {0} is splitting. Will retry to read changes until split finishes. {1}", context.PartitionKeyRangeId, dcex.Message));
                                }
                                else
                                {
                                    exceptionDispatchInfo.Throw();
                                }
                            }
                            else if (StatusCode.TooManyRequests == (StatusCode)dcex.StatusCode || StatusCode.ServiceUnavailable == (StatusCode)dcex.StatusCode)
                            {
                                TraceLog.Warning(string.Format("Partition {0}: retriable exception : {1}", context.PartitionKeyRangeId, dcex.Message));
                            }
                            else
                            {
                                exceptionDispatchInfo.Throw();
                            }

                            // Retriable failure: back off for the server-suggested interval, or the poll delay if none was given.
                            await Task.Delay(dcex.RetryAfter != TimeSpan.Zero ? dcex.RetryAfter : this.options.FeedPollDelay, cancellation.Token);
                        }

                        if (response != null)
                        {
                            if (response.Count > 0)
                            {
                                List<Document> docs = new List<Document>();
                                docs.AddRange(response);

                                try
                                {
                                    // The response is exposed on the context only for the duration of the observer call.
                                    context.FeedResponse = response;
                                    await observer.ProcessChangesAsync(context, docs);
                                }
                                catch (Exception ex)
                                {
                                    TraceLog.Error(string.Format("IChangeFeedObserver.ProcessChangesAsync exception: {0}", ex));
                                    closeReason = ChangeFeedObserverCloseReason.ObserverError;
                                    throw;
                                }
                                finally
                                {
                                    context.FeedResponse = null;
                                }
                            }

                            checkpointStats.ProcessedDocCount += (uint)response.Count;

                            if (IsCheckpointNeeded(lease, checkpointStats))
                            {
                                lease = await CheckpointAsync(lease, response.ResponseContinuation, context);
                                checkpointStats.Reset();
                            }
                            else if (response.Count > 0)
                            {
                                TraceLog.Informational(string.Format("Checkpoint: not checkpointing for partition {0}, {1} docs, new continuation '{2}' as frequency condition is not met", lease.PartitionId, response.Count, response.ResponseContinuation));
                            }
                        }
                    }
                    while (query.HasMoreResults && this.isShutdown == 0);

                    if (this.isShutdown == 0)
                    {
                        await Task.Delay(this.options.FeedPollDelay, cancellation.Token);
                    }
                } // Outer while (this.isShutdown == 0) loop.

                closeReason = ChangeFeedObserverCloseReason.Shutdown;
            }
            catch (TaskCanceledException)
            {
                // Cancellation leaves closeReason unset, so no release is scheduled below.
                Debug.Assert(cancellation.IsCancellationRequested, "cancellation.IsCancellationRequested");
                TraceLog.Informational(string.Format("Cancel signal received for partition {0} worker!", context.PartitionKeyRangeId));
            }
        }
        catch (LeaseLostException ex)
        {
            closeReason = ex.IsGone ? ChangeFeedObserverCloseReason.LeaseGone : ChangeFeedObserverCloseReason.LeaseLost;
        }
        catch (Exception ex)
        {
            TraceLog.Error(string.Format("Partition {0} exception: {1}", context.PartitionKeyRangeId, ex));

            // Keep a more specific reason (e.g. ObserverError) if one was already recorded.
            if (!closeReason.HasValue)
            {
                closeReason = ChangeFeedObserverCloseReason.Unknown;
            }
        }

        if (closeReason.HasValue)
        {
            TraceLog.Informational(string.Format("Releasing lease for partition {0} due to an error, reason: {1}!", context.PartitionKeyRangeId, closeReason.Value));

            // Note: this has to be a new task, because OnPartitionReleasedAsync awaits for worker task.
            await Task.Factory.StartNew(async () => await this.partitionManager.TryReleasePartitionAsync(context.PartitionKeyRangeId, true, closeReason.Value));
        }

        TraceLog.Informational(string.Format("Partition {0}: worker finished!", context.PartitionKeyRangeId));
    });

    // Register (or replace) the worker entry for this partition.
    var newWorkerData = new WorkerData(workerTask, observer, context, cancellation);
    this.partitionKeyRangeIdToWorkerMap.AddOrUpdate(context.PartitionKeyRangeId, newWorkerData, (string id, WorkerData d) => { return newWorkerData; });
}
/// <summary>
/// Called when this host acquires a partition lease: creates an observer for the partition, starts a worker
/// that reads the partition's change feed and delivers it to the observer, and registers the worker in the
/// worker map under the partition id.
/// </summary>
/// <remarks>
/// The worker checkpoints after every successfully delivered batch, and once on an empty response if the
/// lease has no continuation token yet. If the worker ends with a close reason, it schedules the release
/// of the partition on a separate task.
/// </remarks>
/// <param name="lease">The acquired lease; its continuation token (if any) is where reading resumes.</param>
async Task IPartitionObserver<DocumentServiceLease>.OnPartitionAcquiredAsync(DocumentServiceLease lease)
{
    Debug.Assert(lease != null && !string.IsNullOrEmpty(lease.Owner), "lease");
    TraceLog.Informational(string.Format("Host '{0}' partition {1}: acquired!", this.HostName, lease.PartitionId));

#if DEBUG
    Interlocked.Increment(ref this.partitionCount);
#endif

    IChangeFeedObserver observer = this.observerFactory.CreateObserver();
    ChangeFeedObserverContext context = new ChangeFeedObserverContext { PartitionKeyRangeId = lease.PartitionId };
    CancellationTokenSource cancellation = new CancellationTokenSource();

    // Create ChangeFeedOptions to use for this worker.
    ChangeFeedOptions options = new ChangeFeedOptions
    {
        MaxItemCount = this.changeFeedOptions.MaxItemCount,
        PartitionKeyRangeId = this.changeFeedOptions.PartitionKeyRangeId,
        SessionToken = this.changeFeedOptions.SessionToken,
        StartFromBeginning = this.changeFeedOptions.StartFromBeginning,
        RequestContinuation = this.changeFeedOptions.RequestContinuation
    };

    // StartNew with an async lambda yields a Task<Task>; awaiting it only waits for the worker to be
    // started and returns the inner task, which represents the whole lifetime of the worker.
    var workerTask = await Task.Factory.StartNew(async () =>
    {
        ChangeFeedObserverCloseReason? closeReason = null;
        try
        {
            try
            {
                await observer.OpenAsync(context);
            }
            catch (Exception ex)
            {
                TraceLog.Error(string.Format("IChangeFeedObserver.OpenAsync exception: {0}", ex));
                closeReason = ChangeFeedObserverCloseReason.ObserverError;
                throw;
            }

            options.PartitionKeyRangeId = lease.PartitionId;
            if (!string.IsNullOrEmpty(lease.ContinuationToken))
            {
                options.RequestContinuation = lease.ContinuationToken;
            }

            IDocumentQuery<Document> query = this.documentClient.CreateDocumentChangeFeedQuery(this.collectionSelfLink, options);

            TraceLog.Verbose(string.Format("Worker start: partition '{0}', continuation '{1}'", lease.PartitionId, lease.ContinuationToken));

            try
            {
                while (this.isShutdown == 0)
                {
                    do
                    {
                        DocumentClientException dcex = null;
                        FeedResponse<Document> response = null;

                        try
                        {
                            response = await query.ExecuteNextAsync<Document>();
                        }
                        catch (DocumentClientException ex)
                        {
                            // Only NotFound, TooManyRequests and ServiceUnavailable are handled below;
                            // anything else ends the worker.
                            if (StatusCode.NotFound != (StatusCode)ex.StatusCode &&
                                StatusCode.TooManyRequests != (StatusCode)ex.StatusCode &&
                                StatusCode.ServiceUnavailable != (StatusCode)ex.StatusCode)
                            {
                                throw;
                            }

                            dcex = ex;
                        }

                        if (dcex != null)
                        {
                            const int ReadSessionNotAvailable = 1002;
                            if (StatusCode.NotFound == (StatusCode)dcex.StatusCode && GetSubStatusCode(dcex) != ReadSessionNotAvailable)
                            {
                                // Most likely, the database or collection was removed while we were enumerating.
                                // Shut down. The user will need to start over.
                                // Note: this has to be a new task, can't await for shutdown here, as shutdown awaits for all worker tasks.
                                TraceLog.Error(string.Format("Partition {0}: resource gone (subStatus={1}). Aborting.", context.PartitionKeyRangeId, GetSubStatusCode(dcex)));
                                await Task.Factory.StartNew(() => this.StopAsync(ChangeFeedObserverCloseReason.ResourceGone));
                                break;
                            }
                            else
                            {
                                // NotFound can legitimately reach this branch: with sub-status ReadSessionNotAvailable
                                // it is treated as retriable, so the assert has to allow it.
                                Debug.Assert(
                                    StatusCode.NotFound == (StatusCode)dcex.StatusCode ||
                                    StatusCode.TooManyRequests == (StatusCode)dcex.StatusCode ||
                                    StatusCode.ServiceUnavailable == (StatusCode)dcex.StatusCode);
                                TraceLog.Warning(string.Format("Partition {0}: retriable exception : {1}", context.PartitionKeyRangeId, dcex.Message));

                                // Back off for the server-suggested interval, or the poll delay if none was given.
                                await Task.Delay(dcex.RetryAfter != TimeSpan.Zero ? dcex.RetryAfter : this.options.FeedPollDelay, cancellation.Token);
                            }
                        }

                        if (response != null)
                        {
                            if (response.Count > 0)
                            {
                                List<Document> docs = new List<Document>();
                                docs.AddRange(response);

                                try
                                {
                                    await observer.ProcessChangesAsync(context, docs);
                                }
                                catch (Exception ex)
                                {
                                    TraceLog.Error(string.Format("IChangeFeedObserver.ProcessChangesAsync exception: {0}", ex));
                                    closeReason = ChangeFeedObserverCloseReason.ObserverError;
                                    throw;
                                }

                                // Checkpoint after every successful delivery to the client.
                                lease = await CheckpointAsync(lease, response.ResponseContinuation, context);
                            }
                            else if (string.IsNullOrEmpty(lease.ContinuationToken))
                            {
                                // Checkpoint if we've never done that for this lease.
                                lease = await CheckpointAsync(lease, response.ResponseContinuation, context);
                            }
                        }
                    }
                    while (query.HasMoreResults && this.isShutdown == 0);

                    if (this.isShutdown == 0)
                    {
                        await Task.Delay(this.options.FeedPollDelay, cancellation.Token);
                    }
                } // Outer while (this.isShutdown == 0) loop.
            }
            catch (TaskCanceledException)
            {
                // Cancellation leaves closeReason unset, so no release is scheduled below.
                Debug.Assert(cancellation.IsCancellationRequested, "cancellation.IsCancellationRequested");
                TraceLog.Informational(string.Format("Cancel signal received for partition {0} worker!", context.PartitionKeyRangeId));
            }
        }
        catch (LeaseLostException)
        {
            closeReason = ChangeFeedObserverCloseReason.LeaseLost;
        }
        catch (Exception ex)
        {
            TraceLog.Error(string.Format("Partition {0} exception: {1}", context.PartitionKeyRangeId, ex));

            // Keep a more specific reason (e.g. ObserverError) if one was already recorded.
            if (!closeReason.HasValue)
            {
                closeReason = ChangeFeedObserverCloseReason.Unknown;
            }
        }

        if (closeReason.HasValue)
        {
            TraceLog.Informational(string.Format("Releasing lease for partition {0} due to an error, reason: {1}!", context.PartitionKeyRangeId, closeReason.Value));

            // Note: this has to be a new task, because OnPartitionReleasedAsync awaits for worker task.
            await Task.Factory.StartNew(async () => await this.partitionManager.TryReleasePartitionAsync(context.PartitionKeyRangeId, true, closeReason.Value));
        }

        TraceLog.Informational(string.Format("Partition {0}: worker finished!", context.PartitionKeyRangeId));
    });

    // Register (or replace) the worker entry for this partition.
    var newWorkerData = new WorkerData(workerTask, observer, context, cancellation);
    this.partitionKeyRangeIdToWorkerMap.AddOrUpdate(context.PartitionKeyRangeId, newWorkerData, (string id, WorkerData d) => { return newWorkerData; });
}