public void Enqueue(CrawlingQueueItem queueItem)
{
    var writeQueueProxy = QueueProxiesWrite.FirstOrDefault();

    // If the local queue is already full and a write proxy is present,
    // push to the proxy (global Azure queue); otherwise enqueue locally.
    if (writeQueueProxy != null && LocalQueue.Count >= MaxLocalQueueSize)
    {
        writeQueueProxy.EnqueueAsync(queueItem, QueueCancellationTokenSource)
            // No need to wait for the operation to complete successfully
            .ContinueWith(enqueueTask =>
            {
                if (enqueueTask.Status != TaskStatus.RanToCompletion)
                {
                    if (enqueueTask.IsFaulted)
                    {
                        Trace.TraceError("CrawlingQueue.[CrawlingQueueProxy].EnqueueAsync: Exception while trying to enqueue {0} [{1}]",
                            queueItem.ResourceLink, enqueueTask.Exception);
                    }

                    // Fall back to the local queue so the item is not lost
                    queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.InLocalQueue);
                    lock (LocalQueue)
                        LocalQueue.Enqueue(queueItem);
                }
            });
    }
    else
    {
        queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.InLocalQueue);
        lock (LocalQueue)
            LocalQueue.Enqueue(queueItem);
    }
}
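The matching `CrawlingQueue.DequeueAsync` (called by the processing loop further down but not shown in this excerpt) would presumably drain the local queue first and fall back to a pending read proxy. A minimal sketch under that assumption; the proxy-level `DequeueAsync` call is hypothetical, while `QueueProxies` and `IsPending()` appear elsewhere in this code:

// Hypothetical sketch of the dequeue side: prefer the local queue, then fall
// back to the first read proxy that still has items pending.
public async Task<CrawlingQueueItem> DequeueAsync()
{
    lock (LocalQueue)
    {
        if (LocalQueue.Count > 0)
            return LocalQueue.Dequeue();
    }

    var readQueueProxy = QueueProxies.FirstOrDefault(queueProxy => queueProxy.IsPending());
    if (readQueueProxy != null)
        return await readQueueProxy.DequeueAsync(QueueCancellationTokenSource);

    return null; // Both local and proxy queues are depleted
}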
public async Task EnqueueAsync(CrawlingQueueItem crawlingQueueItem, CancellationTokenSource cts)
{
    // 1. Assign an AsyncState object when creating a task, so that if the remote action fails,
    //    we still have the object to insert into the local queue
    // 2. Use a timeout value for the remote operation

    crawlingQueueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.InProxyQueue);

    await _azureCrawlingQueue.AddMessageAsync(
        CloudQueueMessage.CreateCloudQueueMessageFromByteArray(crawlingQueueItem.ToByteArray()),
        //new CloudQueueMessage(crawlingQueueItem.ToByteArray()),
        null,
        null,
        new QueueRequestOptions { },
        new OperationContext { },
        cts.Token
        );
}
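The proxy's read side is not shown in this excerpt. A minimal sketch of how a message could be fetched and turned back into a queue item, using the standard Azure queue client calls (`GetMessageAsync`, `DeleteMessageAsync`, `AsBytes`) and assuming a `FromByteArray` counterpart to `ToByteArray` (that helper is hypothetical):

// Sketch of the read side. CrawlingQueueItem.FromByteArray is an assumed
// counterpart to ToByteArray; it is not part of the code shown here.
public async Task<CrawlingQueueItem> DequeueAsync(CancellationTokenSource cts)
{
    var message = await _azureCrawlingQueue.GetMessageAsync(
        null, new QueueRequestOptions { }, new OperationContext { }, cts.Token);
    if (message == null)
        return null; // Remote queue is empty

    var queueItem = CrawlingQueueItem.FromByteArray(message.AsBytes);

    // Delete only after successful deserialization, so a failing message
    // becomes visible again after its visibility timeout expires
    await _azureCrawlingQueue.DeleteMessageAsync(message);

    return queueItem;
}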
public override Task<bool> OnBeforeEnqueueAsync(CrawlingQueueItem crawlingQueueItem)
{
    return ExecuteOnAllInterceptorsAsync(eventInterceptor =>
        eventInterceptor.OnBeforeEnqueueAsync(crawlingQueueItem)
        );
}
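`ExecuteOnAllInterceptorsAsync` is not included in this excerpt. Given the doc comments on the virtual defaults below (false prevents the operation), a plausible sketch is that all interceptors must agree before an item proceeds; the `_eventInterceptors` field and `ICrawlingEventInterceptor` type are assumptions:

// Hypothetical sketch: runs the callback on every registered interceptor and
// returns true only if none of them vetoes the operation.
private async Task<bool> ExecuteOnAllInterceptorsAsync(
    Func<ICrawlingEventInterceptor, Task<bool>> interceptorAction)
{
    foreach (var eventInterceptor in _eventInterceptors)
    {
        if (!await interceptorAction(eventInterceptor))
            return false; // Any single veto prevents the operation
    }
    return true;
}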
public async Task ProcessCrawlingQueueAsync(CrawlingQueue crawlingQueue)
{
    _crawlingParameters.CancellationTokenSource.Token.Register(() =>
        crawlingQueue.QueueCancellationTokenSource.Cancel()
        );

    var tasksLock = new System.Threading.ReaderWriterLockSlim();
    var tasks = new HashSet<Task>();

    var queueItemsProcessingSemaphore = new SemaphoreSlim(
        crawlingQueue.CrawlingConfiguration.MaxSimmultaneousQueueItemsProcessed / 2,
        crawlingQueue.CrawlingConfiguration.MaxSimmultaneousQueueItemsProcessed);

    while (await queueItemsProcessingSemaphore.WaitAsync(crawlingQueue.CrawlingConfiguration.MaxTimeToProcessOneQueueItem))
    {
        if (crawlingQueue.QueueCancellationTokenSource.IsCancellationRequested)
        {
            await Task.WhenAll(tasks.ToArray());

            // TODO: Move remaining items from the local queue to the distributed queue
            // TODO: Figure out how (and whether) to filter duplicates out of the queue.
            // We will probably have to resort to a known-urls-based duplicates check,
            // because otherwise we will drown in failing SQL queries on multiple machines.
            Trace.TraceWarning("ProcessCrawlingQueueAsync: Queue cancellation requested. Preventing dequeuing of new elements. Processing will be shut down after currently executing items are complete.");
            break;
        }

        var queueItem = await crawlingQueue.DequeueAsync();
        if (queueItem == null) // Both local and proxy queues are depleted
        {
            // NOTE: If the queue is depleted, we must wait until all running tasks have completed,
            // because they might add new items to the queue
            await Task.WhenAll(tasks.ToArray());

            // Wait for all queue proxies to complete fetching items
            // TODO: consider locking (multithreading scenario)
            var queueProxiesPending = crawlingQueue.QueueProxies.Where(queueProxy => queueProxy.IsPending()).ToArray();
            if (queueProxiesPending.Length > 0)
            {
                continue;
            }

            if (crawlingQueue.LocalQueue.Count > 0)
            {
                continue;
            }

            break;
        }

        if (!await _crawlingEventInterceptorManager.OnAfterDequeueAsync(queueItem))
        {
            // If an interceptor returns false, that is an instruction to ignore this item
            continue;
        }

        queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.Downloading);

        Task queueItemProcessingTask = null;
        queueItemProcessingTask = System.Threading.Tasks.TaskExtensions.Unwrap(
            CrawlAsync(queueItem.ResourceLink)
            .ContinueWith(async task =>
            {
                tasksLock.EnterWriteLock();
                // Remove the wrapper task that was stored in the collection, to avoid
                // infinite bloating of the collection. Removing `task` itself would be
                // a no-op: it is the antecedent, not the task we stored.
                tasks.Remove(queueItemProcessingTask);
                tasksLock.ExitWriteLock();

                try
                {
                    queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.Downloaded);

                    if (task.Status == TaskStatus.RanToCompletion)
                    {
                        var resourceContentUnits = task.Result;
                        var httpResultUnit = resourceContentUnits.OfType<HttpResultUnit>().Single();

                        queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.Processing);

                        var resourceContentUnitsProcessingCountdown = new AsyncCountdownEvent(resourceContentUnits.Count);

                        // Process resource content units extracted from the response
                        foreach (var resourceContentUnit in resourceContentUnits)
                        {
                            switch (resourceContentUnit)
                            {
                                case ExtractedLinksUnit extractedLinksUnit:
                                    if (extractedLinksUnit.ExtractedLinks.Count > 0)
                                    {
                                        var linksProcessingCountdown = new AsyncCountdownEvent(extractedLinksUnit.ExtractedLinks.Count);

                                        foreach (var extractedLink in extractedLinksUnit.ExtractedLinks)
                                        {
                                            var crawlingQueueItem = new CrawlingQueueItem(extractedLink);

                                            // Do not enqueue the item if prevented by any interceptor
                                            if (!await _crawlingEventInterceptorManager.OnBeforeEnqueueAsync(crawlingQueueItem))
                                            {
                                                linksProcessingCountdown.Signal(); // A skipped link still counts as processed
                                                continue;
                                            }

                                            crawlingQueueItem.ProcessingCompleted += () =>
                                                linksProcessingCountdown.Signal() // Count down one completed link
                                                ;
                                            crawlingQueue.Enqueue(crawlingQueueItem);
                                        }

                                        // Wait while all links are processed before counting this unit down
                                        // and, ultimately, setting Status = Processed on the parent item
                                        linksProcessingCountdown.WaitAsync()
                                            .ContinueWith(linksProcessingTask =>
                                                resourceContentUnitsProcessingCountdown.Signal()
                                                );
                                    }
                                    else
                                    {
                                        resourceContentUnitsProcessingCountdown.Signal(); // No links extracted: this unit is already processed
                                    }
                                    break;

                                case ExtractedDataUnit extractedDataUnit:
                                    if (!await _crawlingEventInterceptorManager.OnDataDocumentDownloadedAsync(
                                        queueItem.ResourceLink, // May be a DocumentLink or a FrameLink. Not quite intuitive and probably requires redesign.
                                        extractedDataUnit,
                                        httpResultUnit
                                        ))
                                    {
                                        // If any of the interceptors failed to process the download result,
                                        // AND failed to store the download result for later processing,
                                        // we ignore the item and log the error. Chances are we couldn't process
                                        // the item for a reason, and repeating would just make it stuck infinitely
                                        // (re-downloading and re-processing).
                                        // (WAS) we must re-enqueue the item, in order to ensure the results are not lost for good
                                        //crawlingQueue.EnqueueAsync(queueItem);
                                    }
                                    resourceContentUnitsProcessingCountdown.Signal();
                                    break;

                                case DownloadedFilesUnit downloadedFileUnit:
                                    // If the downloaded file is a result of redirection, we must either
                                    // explicitly declare that we're expecting a file, or throw a processing exception
                                    var fileLink = queueItem.ResourceLink as FileLink;
                                    if (fileLink == null)
                                    {
                                        Trace.TraceError($"ProcessCrawlingQueueAsync: Downloaded file unit. Resource link is of type {queueItem.ResourceLink.GetType().Name}, expecting FileLink. Preventing processing.");
                                        break;
                                    }

                                    if (!await _crawlingEventInterceptorManager.OnFileDownloadedAsync(
                                        fileLink,
                                        downloadedFileUnit,
                                        httpResultUnit
                                        ))
                                    {
                                        // Same policy as for data documents: ignore the item and log the error
                                        // rather than re-enqueueing it.
                                        // (WAS) we must re-enqueue the item, in order to ensure the results are not lost for good
                                        //crawlingQueue.EnqueueAsync(queueItem);
                                    }
                                    resourceContentUnitsProcessingCountdown.Signal();
                                    break;

                                case HttpResultUnit httpResultUnitStub:
                                    // TODO: Determine what we should do if the HTTP download failed:
                                    // re-enqueue, ignore, or alert/do something else
                                    switch (httpResultUnitStub.HttpStatus)
                                    {
                                        //case HttpStatusCode.InternalServerError: // it's likely to repeat within the same run
                                        case HttpStatusCode.GatewayTimeout:
                                        case HttpStatusCode.RequestTimeout:
                                            queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.NotLinked);
                                            crawlingQueue.Enqueue(queueItem); // Try to recrawl the item if it failed for some intermittent reason
                                            break;
                                        default:
                                            // We need to invoke ProcessingCompleted only after extracted data and links are really processed.
                                            //queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.ProcessingCompleted);
                                            break;
                                    }
                                    resourceContentUnitsProcessingCountdown.Signal();
                                    break;

                                default:
                                    throw new NotSupportedException();
                            }
                        }

                        // Do not actually wait here for related resources to finish processing;
                        // those might be extracted links or files, and there is no need to hold
                        // queue resources while linked units are downloaded.
                        // Set Processed status after all content units were registered and interceptors triggered.
                        await resourceContentUnitsProcessingCountdown.WaitAsync()
                            .ContinueWith(resourceContentUnitsProcessingTask =>
                                queueItem.ChangeStatus(CrawlingQueueItem.CrawlingStatuses.Processed)
                                );
                    }
                    else
                    {
                        Trace.TraceError("CrawlAsync: Failed for queue item {0} with exception [{1}]",
                            queueItem.ResourceLink, task.Exception);
                    }
                }
                finally
                {
                    queueItemsProcessingSemaphore.Release();
                }
            })
            );

        tasksLock.EnterWriteLock();
        tasks.Add(queueItemProcessingTask);
        tasksLock.ExitWriteLock();
    }

    await Task.WhenAll(tasks.ToArray());
}
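The countdown logic above assumes Nito.AsyncEx-style semantics for `AsyncCountdownEvent`: `Signal()` decrements the count, `AddCount()` increments it, and `WaitAsync()` completes once the count reaches zero. If that package is not available, a minimal stand-in could look like this sketch:

// Minimal stand-in sketch; prefer Nito.AsyncEx's AsyncCountdownEvent in real code.
// Signal() decrements the count and WaitAsync() completes once it reaches zero.
// AddCount() must only be called while the count is still above zero.
public sealed class AsyncCountdownEvent
{
    private readonly TaskCompletionSource<object> _tcs = new TaskCompletionSource<object>();
    private int _count;

    public AsyncCountdownEvent(int initialCount)
    {
        _count = initialCount;
        if (_count <= 0)
            _tcs.TrySetResult(null); // Nothing to wait for
    }

    public void AddCount(int diff) => Interlocked.Add(ref _count, diff);

    public void Signal()
    {
        if (Interlocked.Decrement(ref _count) <= 0)
            _tcs.TrySetResult(null);
    }

    public Task WaitAsync() => _tcs.Task;
}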
public override async Task<bool> OnBeforeEnqueueAsync(CrawlingQueueItem crawlingQueueItem)
{
    return AddKnownData(crawlingQueueItem.ResourceLink.Uri.ToString());
}
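`AddKnownData` is not shown here; for this override to act as the known-urls duplicates filter mentioned in the processing loop's TODOs, it would need to return true only the first time a URL is seen. A hypothetical sketch (field name and `System.Collections.Concurrent` usage are assumptions):

// Hypothetical sketch of a known-URLs store: true means "not seen before,
// allow the enqueue"; false means the URL is a duplicate.
private readonly ConcurrentDictionary<string, byte> _knownUrls =
    new ConcurrentDictionary<string, byte>();

private bool AddKnownData(string url)
{
    return _knownUrls.TryAdd(url, 0); // TryAdd returns false when the key already exists
}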
/// <returns>True - continue item execution, False - ignore this item</returns>
public async virtual Task<bool> OnAfterDequeueAsync(CrawlingQueueItem crawlingQueueItem)
{
    return true;
}
/// <returns>True - continue enqueue, False - prevent enqueue</returns>
public async virtual Task<bool> OnBeforeEnqueueAsync(CrawlingQueueItem crawlingQueueItem)
{
    return true;
}
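As a usage example of this contract, a custom interceptor can veto enqueueing by overriding the default. The sketch below is illustrative only: the class and base-class names, and the host filter itself, are hypothetical and not part of the original code.

// Illustrative sketch: an interceptor that only lets links from an allowed
// set of hosts enter the queue. Returning false prevents the enqueue.
public class DomainFilterCrawlingEventInterceptor : CrawlingEventInterceptor
{
    private readonly HashSet<string> _allowedHosts;

    public DomainFilterCrawlingEventInterceptor(IEnumerable<string> allowedHosts)
    {
        _allowedHosts = new HashSet<string>(allowedHosts, StringComparer.OrdinalIgnoreCase);
    }

    public override Task<bool> OnBeforeEnqueueAsync(CrawlingQueueItem crawlingQueueItem)
    {
        return Task.FromResult(_allowedHosts.Contains(crawlingQueueItem.ResourceLink.Uri.Host));
    }
}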