/// <summary>
/// Reads the catalog index, then each index page whose commit timestamp is newer than
/// <paramref name="front"/>, groups that page's items (newer than <paramref name="front"/> and
/// not newer than <paramref name="back"/>) into batches via <c>CreateBatches</c>, and hands the
/// batches to <c>OnProcessBatch</c> in ascending commit-timestamp order.
/// </summary>
/// <param name="client">HTTP client used to download the index and its pages.</param>
/// <param name="front">Cursor that is set to each batch's commit timestamp and saved after that batch is processed.</param>
/// <param name="back">Upper bound (inclusive) for the commit timestamps considered.</param>
/// <param name="cancellationToken">Token passed to every download, batch and cursor save.</param>
/// <returns>
/// The result of the last <c>OnProcessBatch</c> call; <c>false</c> when no batch was processed
/// or when a batch asked to stop.
/// </returns>
protected override async Task<bool> Fetch(CollectorHttpClient client, ReadWriteCursor front, ReadCursor back, CancellationToken cancellationToken)
{
    JObject index = await client.GetJObjectAsync(Index, cancellationToken);

    // Deferred query: it is evaluated when the foreach below first pulls from it.
    IEnumerable<CatalogItem> pendingPages =
        from token in index["items"]
        select new CatalogItem(token) into entry
        where entry.CommitTimeStamp > front.Value
        orderby entry.CommitTimeStamp
        select entry;

    bool keepGoing = false;

    foreach (CatalogItem pageEntry in pendingPages)
    {
        JObject page = await client.GetJObjectAsync(pageEntry.Uri, cancellationToken);

        JToken context;
        page.TryGetValue("@context", out context);

        // Still deferred when handed to CreateBatches; the cursor values are read on enumeration.
        var itemsInRange = page["items"]
            .Select(token => new CatalogItem(token))
            .Where(entry => entry.CommitTimeStamp > front.Value && entry.CommitTimeStamp <= back.Value);

        var sortedBatches = (await CreateBatches(itemsInRange))
            .OrderBy(candidate => candidate.CommitTimeStamp)
            .ToList();

        var finalBatch = sortedBatches.LastOrDefault();

        foreach (var current in sortedBatches)
        {
            keepGoing = await OnProcessBatch(
                client,
                current.Items.Select(item => item.Value),
                context,
                current.CommitTimeStamp,
                current.CommitTimeStamp == finalBatch.CommitTimeStamp,
                cancellationToken);

            // The cursor is persisted after every batch, whatever the batch returned.
            front.Value = current.CommitTimeStamp;
            await front.Save(cancellationToken);

            Trace.TraceInformation("CommitCatalog.Fetch front.Save has value: {0}", front);

            if (!keepGoing)
            {
                return false;
            }
        }

        // Also reached when this page produced no batches and nothing has been accepted yet.
        if (!keepGoing)
        {
            return false;
        }
    }

    return keepGoing;
}
/// <summary>
/// Downloads the catalog items of one batch, feeds them (and, when a base address is configured,
/// the catalog index) to the shared index writer, records commit metadata, and commits when
/// configured to commit every batch or when this is the last batch.
/// </summary>
/// <param name="client">HTTP client used for the downloads.</param>
/// <param name="items">Catalog entries of the batch; each is fetched through <c>FetchCatalogItems</c>.</param>
/// <param name="context">JSON-LD context of the page; not used by this method.</param>
/// <param name="commitTimeStamp">Commit timestamp passed to <c>UpdateCommitMetadata</c>.</param>
/// <param name="isLastBatch">When <c>true</c>, the index is committed regardless of <c>_commitEachBatch</c>.</param>
/// <param name="cancellationToken">Token passed to the downloads.</param>
/// <returns>Always <c>true</c>.</returns>
protected override async Task<bool> OnProcessBatch(CollectorHttpClient client, IEnumerable<JToken> items, JToken context, DateTime commitTimeStamp, bool isLastBatch, CancellationToken cancellationToken)
{
    // The catalog index is only downloaded when a base address is configured.
    JObject catalogIndex = null;
    if (_baseAddress != null)
    {
        catalogIndex = await client.GetJObjectAsync(Index, cancellationToken);
    }

    IEnumerable<JObject> catalogItems = await FetchCatalogItems(client, items, cancellationToken);

    var docsBefore = _indexWriter.NumDocs();
    string startMessage = string.Format("Index contains {0} documents.", _indexWriter.NumDocs());
    _logger.LogInformation(startMessage);

    ProcessCatalogIndex(_indexWriter, catalogIndex, _baseAddress);
    ProcessCatalogItems(_indexWriter, catalogItems, _baseAddress);

    var docsDifference = _indexWriter.NumDocs() - docsBefore;
    UpdateCommitMetadata(commitTimeStamp, docsDifference);

    string doneMessage = string.Format(
        "Processed catalog items. Index now contains {0} documents. (total uncommitted {1}, batch {2})",
        _indexWriter.NumDocs(),
        _metadataForNextCommit.Count,
        docsDifference);
    _logger.LogInformation(doneMessage);

    bool shouldCommit = _commitEachBatch || isLastBatch;
    if (shouldCommit)
    {
        EnsureCommitted();
    }

    return true;
}
/// <summary>
/// Loads the existing registration index for <paramref name="id"/>, works out how many of the
/// incoming graphs refer to versions that already exist in storage, and then saves either a
/// small or a large registration depending on the resulting package count.
/// </summary>
/// <param name="id">Package id; lower-cased (invariant) to create the storage.</param>
/// <param name="sortedGraphs">Graphs keyed by a URI string; each key is downloaded as JSON.</param>
/// <param name="storageFactory">Factory that creates the storage and supplies the base address.</param>
/// <param name="contentBaseAddress">Passed through to the save routines.</param>
/// <param name="partitionSize">Passed through to the save routines.</param>
/// <param name="packageCountThreshold">A total strictly below this value selects the small-registration path.</param>
/// <param name="cancellationToken">Token passed to storage and HTTP calls.</param>
/// <exception cref="Exception">
/// Any failure is wrapped in an <see cref="Exception"/> whose message is "Process id = {id}" and
/// whose inner exception is the original failure.
/// </exception>
public static async Task ProcessGraphs(
    string id,
    IDictionary<string, IGraph> sortedGraphs,
    StorageFactory storageFactory,
    Uri contentBaseAddress,
    int partitionSize,
    int packageCountThreshold,
    CancellationToken cancellationToken)
{
    int versionAlreadyExistsCount = 0;
    existingVersionsWithID = new List<string>();

    try
    {
        Storage storage = storageFactory.Create(id.ToLowerInvariant());

        Uri resourceUri = storage.ResolveUri("index.json");
        string json = await storage.LoadString(resourceUri, cancellationToken);

        int count = Utils.CountItems(json);

        // Determine if there are any versions that are existing already.
        // The client is created here and used nowhere else, so it is disposed as soon as the
        // loop is done instead of being left for the finalizer.
        // NOTE(review): assumes CollectorHttpClient is disposable (e.g. derives from HttpClient) — confirm.
        using (CollectorHttpClient httpClient = new CollectorHttpClient())
        {
            foreach (var graph in sortedGraphs)
            {
                JObject jsonContent = await httpClient.GetJObjectAsync(new Uri(graph.Key), cancellationToken);

                string existingId = jsonContent["@id"].ToString();

                // Char overload: ordinal search, no culture-sensitive comparison for a URI delimiter.
                string existingVersionWithId = existingId.Substring(existingId.LastIndexOf('/') + 1);

                string existingVersion = jsonContent["version"].ToString() + ".json";

                // Determine if the version is actually available.
                // In Registration blobs, the format is /packageID/packageVersion.json,
                // so to check the existence of a version we only need "version.json".
                if (storage.Exists(existingVersion))
                {
                    // When we compare later in AddExistingItems, we need the
                    // "packageId.packageversion.json" form, so store it with the id.
                    existingVersionsWithID.Add(existingVersionWithId);
                    versionAlreadyExistsCount++;
                }
            }
        }

        int total = count + sortedGraphs.Count - versionAlreadyExistsCount;

        if (total < packageCountThreshold)
        {
            await SaveSmallRegistration(storage, storageFactory.BaseAddress, sortedGraphs, contentBaseAddress, partitionSize, cancellationToken);
        }
        else
        {
            await SaveLargeRegistration(storage, storageFactory.BaseAddress, sortedGraphs, json, contentBaseAddress, partitionSize, cancellationToken);
        }
    }
    catch (Exception e)
    {
        // Kept as a plain Exception wrapper so existing callers observe the same exception type.
        throw new Exception(string.Format("Process id = {0}", id), e);
    }
}
/// <summary>
/// Starts a download for the <c>@id</c> URI of every entry in <paramref name="items"/>, waits
/// for all of them to finish, and returns the downloaded documents in input order.
/// </summary>
/// <param name="client">HTTP client used for the downloads.</param>
/// <param name="items">Entries whose <c>@id</c> property holds the document URI.</param>
/// <param name="cancellationToken">Token passed to each download.</param>
/// <returns>A deferred sequence over the results of the completed download tasks.</returns>
static async Task<IEnumerable<JObject>> FetchCatalogItems(CollectorHttpClient client, IEnumerable<JToken> items, CancellationToken cancellationToken)
{
    // ToList forces every request to be started before any of them is awaited.
    List<Task<JObject>> downloads = items
        .Select(entry => client.GetJObjectAsync(entry["@id"].ToObject<Uri>(), cancellationToken))
        .ToList();

    await Task.WhenAll(downloads);

    // Every task has completed successfully at this point, so Result does not block.
    return downloads.Select(download => download.Result);
}
/// <summary>
/// Downloads the catalog items of one batch, applies them (and, when a base address is
/// configured, the catalog index) to a freshly created index writer, expunges deletes and
/// commits with metadata derived from <paramref name="commitTimeStamp"/>.
/// </summary>
/// <param name="client">HTTP client used for the downloads.</param>
/// <param name="items">Catalog entries of the batch; each is fetched through <c>FetchCatalogItems</c>.</param>
/// <param name="context">JSON-LD context of the page; not used by this method.</param>
/// <param name="commitTimeStamp">Timestamp stored in the commit metadata and written to the trace.</param>
/// <param name="cancellationToken">Token passed to the downloads.</param>
/// <returns>Always <c>true</c>.</returns>
protected override async Task<bool> OnProcessBatch(CollectorHttpClient client, IEnumerable<JToken> items, JToken context, DateTime commitTimeStamp, CancellationToken cancellationToken)
{
    // The catalog index is only downloaded when a base address is configured.
    JObject catalogIndex = null;
    if (_baseAddress != null)
    {
        catalogIndex = await client.GetJObjectAsync(Index, cancellationToken);
    }

    IEnumerable<JObject> catalogItems = await FetchCatalogItems(client, items, cancellationToken);

    // A writer is opened per batch and disposed when the batch has been committed.
    using (IndexWriter writer = CreateIndexWriter(_directory))
    {
        Trace.TraceInformation("Index contains {0} documents", writer.NumDocs());

        ProcessCatalogIndex(writer, catalogIndex, _baseAddress);
        ProcessCatalogItems(writer, catalogItems, _baseAddress);

        writer.ExpungeDeletes();

        var commitMetadata = CreateCommitMetadata(commitTimeStamp);
        writer.Commit(commitMetadata);

        Trace.TraceInformation(
            "COMMIT index contains {0} documents commitTimeStamp {1}",
            writer.NumDocs(),
            commitTimeStamp.ToString("O"));
    }

    return true;
}
/// <summary>
/// Downloads the catalog index (timing the download through the telemetry service), projects
/// its <c>items</c> into <c>CatalogCommit</c> instances and returns those that
/// <c>GetCommitsInRange</c> selects for the two cursor values.
/// </summary>
/// <param name="client">HTTP client used to download the index.</param>
/// <param name="front">Cursor whose value is passed as the first bound to <c>GetCommitsInRange</c>.</param>
/// <param name="back">Cursor whose value is passed as the second bound to <c>GetCommitsInRange</c>.</param>
/// <param name="cancellationToken">Token passed to the download.</param>
/// <returns>The commits returned by <c>GetCommitsInRange</c>.</returns>
protected async Task<IEnumerable<CatalogCommit>> FetchCatalogCommitsAsync(
    CollectorHttpClient client,
    ReadCursor front,
    ReadCursor back,
    CancellationToken cancellationToken)
{
    var telemetryProperties = new Dictionary<string, string>
    {
        { TelemetryConstants.Uri, Index.AbsoluteUri }
    };

    JObject indexDocument;

    // Only the index download is inside the timed scope.
    using (_telemetryService.TrackDuration(TelemetryConstants.CatalogIndexReadDurationSeconds, telemetryProperties))
    {
        indexDocument = await client.GetJObjectAsync(Index, cancellationToken);
    }

    IEnumerable<CatalogCommit> allCommits =
        from entry in indexDocument["items"]
        select CatalogCommit.Create((JObject)entry);

    return GetCommitsInRange(allCommits, front.Value, back.Value);
}
/// <summary>
/// Downloads the catalog index (timing the download through the telemetry service) and returns
/// its commits that are newer than <paramref name="front"/>, ordered by commit timestamp.
/// </summary>
/// <param name="client">HTTP client used to download the index.</param>
/// <param name="front">Cursor; only commits with a timestamp strictly greater than its value are returned.</param>
/// <param name="cancellationToken">Token passed to the download.</param>
/// <returns>
/// A deferred query: the filter reads <c>front.Value</c> when the caller enumerates the result,
/// not when this method returns.
/// </returns>
protected async Task<IEnumerable<CatalogCommit>> FetchCatalogCommitsAsync(
    CollectorHttpClient client,
    ReadWriteCursor front,
    CancellationToken cancellationToken)
{
    var telemetryProperties = new Dictionary<string, string>
    {
        { TelemetryConstants.Uri, Index.AbsoluteUri }
    };

    JObject indexDocument;

    // Only the index download is inside the timed scope.
    using (_telemetryService.TrackDuration(TelemetryConstants.CatalogIndexReadDurationSeconds, telemetryProperties))
    {
        indexDocument = await client.GetJObjectAsync(Index, cancellationToken);
    }

    IEnumerable<CatalogCommit> newerCommits =
        from entry in indexDocument["items"]
        select CatalogCommit.Create((JObject)entry) into commit
        where commit.CommitTimeStamp > front.Value
        orderby commit.CommitTimeStamp
        select commit;

    return newerCommits;
}
/// <summary>
/// Reads the catalog index and, for every index page newer than <paramref name="front"/>,
/// collects the page items whose commit timestamp is greater than <paramref name="front"/> and
/// not greater than <paramref name="back"/>. Collected items are handed to <c>ProcessBatch</c>
/// each time <c>_batchSize</c> items have accumulated, and once more at the end for any remainder.
/// </summary>
/// <param name="client">HTTP client used to download the index and its pages.</param>
/// <param name="front">Cursor marking what has already been processed; re-read on every comparison and passed to <c>ProcessBatch</c>.</param>
/// <param name="back">Upper bound (inclusive) for item commit timestamps.</param>
/// <param name="cancellationToken">Token passed to every download and to <c>ProcessBatch</c>.</param>
/// <returns><c>true</c> when <c>BatchCount</c> increased during this call; otherwise <c>false</c>.</returns>
protected override async Task<bool> Fetch(CollectorHttpClient client, ReadWriteCursor front, ReadCursor back, CancellationToken cancellationToken)
{
    int beforeBatchCount = BatchCount;

    IList<JObject> items = new List<JObject>();

    JObject root = await client.GetJObjectAsync(Index, cancellationToken);

    JToken context = null;
    root.TryGetValue("@context", out context);

    IEnumerable<JToken> rootItems = root["items"].OrderBy(item => item["commitTimeStamp"].ToObject<DateTime>());

    DateTime resumeDateTime = front.Value;

    bool acceptNextBatch = true;

    foreach (JObject rootItem in rootItems)
    {
        // A batch processed on the previous page asked to stop.
        if (!acceptNextBatch)
        {
            break;
        }

        DateTime rootItemCommitTimeStamp = rootItem["commitTimeStamp"].ToObject<DateTime>();

        // Skip pages that have nothing newer than the cursor.
        if (rootItemCommitTimeStamp <= front.Value)
        {
            continue;
        }

        Uri pageUri = rootItem["@id"].ToObject<Uri>();

        // Flow the token so that cancellation is honoured while downloading pages,
        // consistent with the index download above.
        JObject page = await client.GetJObjectAsync(pageUri, cancellationToken);

        IEnumerable<JToken> pageItems = page["items"].OrderBy(item => item["commitTimeStamp"].ToObject<DateTime>());

        foreach (JObject pageItem in pageItems)
        {
            DateTime pageItemCommitTimeStamp = pageItem["commitTimeStamp"].ToObject<DateTime>();

            if (pageItemCommitTimeStamp <= front.Value)
            {
                continue;
            }

            // Items are sorted, so everything after this one is beyond the back cursor too.
            if (pageItemCommitTimeStamp > back.Value)
            {
                break;
            }

            items.Add(pageItem);
            resumeDateTime = pageItemCommitTimeStamp;

            // NOTE(review): presumably ProcessBatch empties `items` and advances `front` — confirm.
            if (items.Count == _batchSize)
            {
                acceptNextBatch = await ProcessBatch(client, items, context, front, resumeDateTime, cancellationToken);

                if (!acceptNextBatch)
                {
                    break;
                }
            }
        }
    }

    // Flush whatever is left over from the last, partially filled batch.
    if (acceptNextBatch && items.Count > 0)
    {
        await ProcessBatch(client, items, context, front, resumeDateTime, cancellationToken);
    }

    int afterBatchCount = BatchCount;

    PreviousRunBatchCount = (afterBatchCount - beforeBatchCount);

    return (PreviousRunBatchCount > 0);
}
/// <summary>
/// Fetches the catalog commits between the two cursors and, page by page, creates item batches
/// and processes them with at most <paramref name="maxConcurrentBatches"/> running at a time.
/// After all batches of a page have succeeded, <paramref name="front"/> is set to the page's
/// maximum commit timestamp and saved.
/// </summary>
/// <param name="client">HTTP client used to download catalog pages.</param>
/// <param name="front">Cursor that is advanced and saved after each fully processed page.</param>
/// <param name="back">Cursor passed to the commit fetcher and to batch creation.</param>
/// <param name="fetchCatalogCommitsAsync">Delegate that returns the commits (pages) to visit.</param>
/// <param name="createCommitItemBatchesAsync">Delegate passed to batch creation for each page.</param>
/// <param name="processCommitItemBatchAsync">Delegate that processes a single batch.</param>
/// <param name="maxConcurrentBatches">Passed to <c>StartProcessingBatchesIfNoFailures</c> as the concurrency limit.</param>
/// <param name="logger">Receives one error entry per recorded batch exception.</param>
/// <param name="cancellationToken">Token passed to downloads, batch processing and cursor saves.</param>
/// <returns><c>true</c> when at least one page produced batches; otherwise <c>false</c>.</returns>
/// <exception cref="BatchProcessingException">
/// Thrown after all in-flight batches of a page have finished when any of them faulted or was
/// canceled. The cursor is not advanced for that page.
/// </exception>
internal static async Task<bool> ProcessCatalogCommitsAsync(
    CollectorHttpClient client,
    ReadWriteCursor front,
    ReadCursor back,
    FetchCatalogCommitsAsync fetchCatalogCommitsAsync,
    CreateCommitItemBatchesAsync createCommitItemBatchesAsync,
    ProcessCommitItemBatchAsync processCommitItemBatchAsync,
    int maxConcurrentBatches,
    ILogger logger,
    CancellationToken cancellationToken)
{
    var rootItems = await fetchCatalogCommitsAsync(client, front, back, cancellationToken);

    var hasAnyBatchFailed = false;
    var hasAnyBatchBeenProcessed = false;

    foreach (CatalogCommit rootItem in rootItems)
    {
        JObject page = await client.GetJObjectAsync(rootItem.Uri, cancellationToken);
        var context = (JObject)page["@context"];
        CatalogCommitItemBatch[] batches = await CreateBatchesForAllAvailableItemsInPageAsync(
            front,
            back,
            page,
            context,
            createCommitItemBatchesAsync);

        if (!batches.Any())
        {
            continue;
        }

        hasAnyBatchBeenProcessed = true;

        DateTime maxCommitTimeStamp = GetMaxCommitTimeStamp(batches);
        var unprocessedBatches = batches.ToList();
        var processingBatches = new List<CatalogCommitItemBatchTask>();
        var exceptions = new List<Exception>();

        StartProcessingBatchesIfNoFailures(
            client,
            context,
            unprocessedBatches,
            processingBatches,
            maxConcurrentBatches,
            processCommitItemBatchAsync,
            cancellationToken);

        while (processingBatches.Any())
        {
            // DefaultIfEmpty keeps WhenAny from being called with an empty sequence.
            var activeTasks = processingBatches.Where(batch => !batch.Task.IsCompleted)
                .Select(batch => batch.Task)
                .DefaultIfEmpty(Task.CompletedTask);

            await Task.WhenAny(activeTasks);

            for (var i = 0; i < processingBatches.Count; ++i)
            {
                var task = processingBatches[i].Task;

                // Test completion FIRST. Once a task reports IsCompleted its final state is
                // fixed, so the faulted/canceled check below cannot miss a failure. Checking
                // IsFaulted before IsCompleted would let a task that fails between the two
                // reads be removed without its failure being recorded.
                if (!task.IsCompleted)
                {
                    continue;
                }

                if (task.IsFaulted || task.IsCanceled)
                {
                    hasAnyBatchFailed = true;

                    // A canceled task carries no exception; only faulted ones are collected.
                    if (task.Exception != null)
                    {
                        var exception = ExceptionUtilities.Unwrap(task.Exception);

                        exceptions.Add(exception);
                    }
                }

                processingBatches.RemoveAt(i);
                --i;
            }

            // After a failure, no new batches are started; the loop only drains those in flight.
            if (!hasAnyBatchFailed)
            {
                StartProcessingBatchesIfNoFailures(
                    client,
                    context,
                    unprocessedBatches,
                    processingBatches,
                    maxConcurrentBatches,
                    processCommitItemBatchAsync,
                    cancellationToken);
            }
        }

        if (hasAnyBatchFailed)
        {
            foreach (var exception in exceptions)
            {
                logger.LogError(_eventId, exception, Strings.BatchProcessingFailure);
            }

            var innerException = exceptions.Count == 1 ? exceptions.Single() : new AggregateException(exceptions);

            throw new BatchProcessingException(innerException);
        }

        // Every batch of this page succeeded: move the cursor to the page's newest commit.
        front.Value = maxCommitTimeStamp;

        await front.SaveAsync(cancellationToken);

        Trace.TraceInformation($"{nameof(CatalogCommitUtilities)}.{nameof(ProcessCatalogCommitsAsync)} " +
            $"{nameof(front)}.{nameof(front.Value)} saved since timestamp changed from previous: {{0}}", front);
    }

    return hasAnyBatchBeenProcessed;
}
/// <summary>
/// Fetches the catalog commits newer than <paramref name="front"/> and, for each commit page,
/// batches the page items (newer than <paramref name="front"/>, not newer than
/// <paramref name="back"/>) and processes the batches in ascending commit-timestamp order.
/// The cursor is saved when the commit timestamp changes between consecutive batches and after
/// the last batch of a page.
/// </summary>
/// <param name="client">HTTP client used to download the commit pages.</param>
/// <param name="front">Cursor that is advanced and saved as batches complete.</param>
/// <param name="back">Upper bound (inclusive) for item commit timestamps.</param>
/// <param name="cancellationToken">Token passed to downloads, batch processing and cursor saves.</param>
/// <returns>
/// The result of the last <c>OnProcessBatchAsync</c> call; <c>false</c> when no batch was
/// processed or when a batch asked to stop.
/// </returns>
protected override async Task<bool> FetchAsync(
    CollectorHttpClient client,
    ReadWriteCursor front,
    ReadCursor back,
    CancellationToken cancellationToken)
{
    IEnumerable<CatalogCommit> commits = await FetchCatalogCommitsAsync(client, front, cancellationToken);

    bool keepGoing = false;

    foreach (CatalogCommit commit in commits)
    {
        JObject page = await client.GetJObjectAsync(commit.Uri, cancellationToken);

        JToken context;
        page.TryGetValue("@context", out context);

        // Still deferred when handed to CreateBatchesAsync; the cursor values are read on enumeration.
        var itemsInRange = page["items"]
            .Select(token => CatalogCommitItem.Create((JObject)context, (JObject)token))
            .Where(entry => entry.CommitTimeStamp > front.Value && entry.CommitTimeStamp <= back.Value);

        var sortedBatches = (await CreateBatchesAsync(itemsInRange))
            .OrderBy(candidate => candidate.CommitTimeStamp)
            .ToList();

        var finalBatch = sortedBatches.LastOrDefault();
        DateTime? lastSeenCommitTimeStamp = null;

        foreach (var current in sortedBatches)
        {
            // Persist the previous timestamp only once a batch with a different timestamp is
            // about to start. If two batches share a commit timestamp and the second one fails,
            // the cursor must not have moved past the first.
            bool timestampChanged = lastSeenCommitTimeStamp.HasValue
                && lastSeenCommitTimeStamp != current.CommitTimeStamp;

            if (timestampChanged)
            {
                front.Value = lastSeenCommitTimeStamp.Value;
                await front.SaveAsync(cancellationToken);

                Trace.TraceInformation("CommitCatalog.Fetch front.Value saved since timestamp changed from previous: {0}", front);
            }

            var telemetryProperties = new Dictionary<string, string>()
            {
                { TelemetryConstants.BatchItemCount, current.Items.Count.ToString() }
            };

            using (_telemetryService.TrackDuration(TelemetryConstants.ProcessBatchSeconds, telemetryProperties))
            {
                keepGoing = await OnProcessBatchAsync(
                    client,
                    current.Items,
                    context,
                    current.CommitTimeStamp,
                    current.CommitTimeStamp == finalBatch.CommitTimeStamp,
                    cancellationToken);
            }

            // The last batch of the page always commits the cursor.
            if (ReferenceEquals(current, finalBatch))
            {
                front.Value = current.CommitTimeStamp;
                await front.SaveAsync(cancellationToken);

                Trace.TraceInformation("CommitCatalog.Fetch front.Value saved due to last batch: {0}", front);
            }

            lastSeenCommitTimeStamp = current.CommitTimeStamp;

            Trace.TraceInformation("CommitCatalog.Fetch front.Value is: {0}", front);

            if (!keepGoing)
            {
                return false;
            }
        }

        // Also reached when this page produced no batches and nothing has been accepted yet.
        if (!keepGoing)
        {
            return false;
        }
    }

    return keepGoing;
}
/// <summary>
/// Reads the catalog index and, for every index page newer than <paramref name="front"/>,
/// collects the page items whose commit timestamp is greater than <paramref name="front"/> and
/// not greater than <paramref name="back"/>. Collected items are handed to <c>ProcessBatch</c>
/// each time <c>_batchSize</c> items have accumulated, and once more at the end for any remainder.
/// </summary>
/// <param name="client">HTTP client used to download the index and its pages.</param>
/// <param name="front">Cursor marking what has already been processed; re-read on every comparison and passed to <c>ProcessBatch</c>.</param>
/// <param name="back">Upper bound (inclusive) for item commit timestamps.</param>
/// <returns><c>true</c> when <c>BatchCount</c> increased during this call; otherwise <c>false</c>.</returns>
protected override async Task<bool> Fetch(CollectorHttpClient client, ReadWriteCursor front, ReadCursor back)
{
    int batchCountAtStart = BatchCount;

    IList<JObject> pending = new List<JObject>();

    JObject index = await client.GetJObjectAsync(Index);

    JToken context;
    index.TryGetValue("@context", out context);

    IEnumerable<JToken> pagesByCommit = index["items"].OrderBy(entry => entry["commitTimeStamp"].ToObject<DateTime>());

    DateTime resumeDateTime = front.Value;
    bool keepGoing = true;

    foreach (JObject pageEntry in pagesByCommit)
    {
        // A batch processed on the previous page asked to stop.
        if (!keepGoing)
        {
            break;
        }

        // Skip pages that have nothing newer than the cursor.
        if (pageEntry["commitTimeStamp"].ToObject<DateTime>() <= front.Value)
        {
            continue;
        }

        JObject page = await client.GetJObjectAsync(pageEntry["@id"].ToObject<Uri>());

        IEnumerable<JToken> leavesByCommit = page["items"].OrderBy(entry => entry["commitTimeStamp"].ToObject<DateTime>());

        foreach (JObject leaf in leavesByCommit)
        {
            DateTime leafCommitTimeStamp = leaf["commitTimeStamp"].ToObject<DateTime>();

            if (leafCommitTimeStamp <= front.Value)
            {
                continue;
            }

            // Items are sorted, so everything after this one is beyond the back cursor too.
            if (leafCommitTimeStamp > back.Value)
            {
                break;
            }

            pending.Add(leaf);
            resumeDateTime = leafCommitTimeStamp;

            // Keep collecting until a full batch has accumulated.
            if (pending.Count != _batchSize)
            {
                continue;
            }

            keepGoing = await ProcessBatch(client, pending, context, front, resumeDateTime);

            if (!keepGoing)
            {
                break;
            }
        }
    }

    // Flush whatever is left over from the last, partially filled batch.
    if (keepGoing && pending.Count > 0)
    {
        await ProcessBatch(client, pending, context, front, resumeDateTime);
    }

    PreviousRunBatchCount = BatchCount - batchCountAtStart;

    return PreviousRunBatchCount > 0;
}