/// <summary>
/// Runs the base response processing and then post-processes every produced content unit:
/// extracted data records are copied into the crawling base's predefined values, and
/// extracted links are crawled right away before being removed from the unit.
/// </summary>
/// <param name="webResponse">The web response handed over to the base implementation.</param>
/// <returns>The content units produced by the base implementation.</returns>
public override async Task<IList<ResourceContentUnit>> ProcessWebResponseAsync(WebResponse webResponse)
{
    var contentUnits = await base.ProcessWebResponseAsync(webResponse);

    foreach (var contentUnit in contentUnits)
    {
        if (contentUnit is ExtractedDataUnit dataUnit)
        {
            // Publish every extracted record as a predefined value (existing keys are overwritten).
            foreach (var entry in dataUnit.ExtractedData)
            {
                _crawlingBase.PredefinedValues.Dictionary[entry.Key] = entry.Value;
            }
        }
        else if (contentUnit is ExtractedLinksUnit linksUnit)
        {
            // In case of initialization link, crawl related items immediately
            // (omitting the regular crawling queue).
            var crawlTasks = linksUnit.ExtractedLinks
                .Select(link => CrawlingEngine.CrawlAsync(link, false))
                .ToArray();
            await Task.WhenAll(crawlTasks);

            // The links have just been crawled, so they are removed from the unit.
            linksUnit.ExtractedLinks.Clear();
        }
    }

    return contentUnits;
}
/// <summary>
/// Extracts <paramref name="extractionItem"/> as a regular item and then applies the extra
/// handling required when the item is an <c>ExtractionLink</c> and/or an <c>ExtractionFrame</c>.
/// Does nothing when <paramref name="extractedItems"/> already contains the item's name.
/// </summary>
/// <param name="extractionItem">The item to extract.</param>
/// <param name="extractionItems">Passed through to <c>ExtractItem</c>.</param>
/// <param name="extractedItems">Checked for an existing entry and passed through to <c>ExtractItem</c>.</param>
/// <param name="relativeLocationBase">Optional position pointer passed through to <c>ExtractItem</c>.</param>
/// <remarks>
/// For frames this method blocks the calling thread (<c>Task.WaitAll</c>) until every frame
/// resource has been crawled, because the method itself is not asynchronous.
/// </remarks>
protected void ExtractAutoDetect(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null)
{
    // Nothing to do when the item has been extracted before.
    if (extractedItems.ContainsKey(extractionItem.Name))
    {
        return;
    }

    // Step 1: plain extraction, identical for every kind of item.
    ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

    // Step 2: a link additionally gets its link aspect extracted.
    if (extractionItem is ExtractionLink linkItem)
    {
        ExtractLink(linkItem);
    }

    // Step 3: only frames need the remaining work.
    if (!(extractionItem is ExtractionFrame frameItem))
    {
        return;
    }

    var frameName = frameItem.Name;

    // Frames are kept apart from links so the crawler does not queue and download them.
    // Per the original author's note a frame is a link as well, so its link data is
    // expected to be present in ExtractedLinks at this point.
    ExtractedFrames[frameName] = ExtractedLinks[frameName];
    ExtractedLinks.Remove(frameName);

    // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
    var frameDownloads = ExtractedFrames.GetValues(frameName)
        .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
        .ToArray();

    // Not an async context: hold this thread until all inline downloads have finished.
    Task.WaitAll(frameDownloads);

    // Collect the downloaded string content, optionally post-processed, in download order.
    var frameContents = new List<string>();
    var frameResponses = frameDownloads
        .SelectMany(download => download.Result)
        .OfType<ResponseStringUnit>();

    foreach (var frameResponse in frameResponses)
    {
        IEnumerable<string> content = new[] { frameResponse.Content };
        if (frameItem.PostProcessOnDownload)
        {
            content = PostProcess(content, extractionItem.PostProcessors, DependencyDataSource);
        }

        frameContents.AddRange(content);
    }

    // Replace the previously extracted data for the frame with its downloaded content.
    // NOTE(review): this writes to the ExtractedItems member, not the extractedItems
    // parameter — confirm that both refer to the same collection.
    ExtractedItems[frameName] = frameContents.ToArray();
}
/// <summary>
/// Crawls the initialization link (when one is set) and then returns a snapshot of the
/// currently available queue items. The status moves to <c>Fetching</c> on entry, to
/// <c>Error</c> when initialization throws, and to <c>Depleted</c> either immediately
/// (no items) or once every returned item has raised <c>ProcessingCompleted</c>.
/// </summary>
/// <param name="portionSize">Not used by this implementation.</param>
/// <param name="cts">Not used by this implementation.</param>
/// <returns>
/// The available queue items, or an empty array when initialization failed.
/// </returns>
public override async Task<IList<CrawlingQueueItem>> FetchAsync(int portionSize, CancellationTokenSource cts)
{
    ChangeStatus(Statuses.Fetching);

    // The initialization link is crawled before any queue items are handed out.
    // NOTE(review): InitializationLink is not reset here, so unless it is cleared
    // elsewhere it is crawled again on every fetch — confirm.
    if (InitializationLink != null)
    {
        try
        {
            await CrawlingEngine.CrawlAsync(InitializationLink);
        }
        catch (Exception ex)
        {
            Trace.TraceError($"{GetType().Name}.FetchAsync: Initialization failed for link {InitializationLink.Url} (Config: {InitializationLink.Config.Name}, Job: {InitializationLink.Job?.Name}) with exception [{ex}]");
            ChangeStatus(Statuses.Error);
            return new CrawlingQueueItem[0];
        }
    }

    var availableItems = QueueItemsAvailable.ToArray();

    // Nothing to hand out: the queue is depleted right away.
    if (availableItems.Length == 0)
    {
        ChangeStatus(Statuses.Depleted);
        return availableItems;
    }

    // Count down as each handed-out item reports completion.
    var pendingItems = new AsyncCountdownEvent(availableItems.Length);
    foreach (var item in availableItems)
    {
        item.ProcessingCompleted += () => { pendingItems.Signal(); };
    }

    // Deliberately not awaited: the status flips to Depleted in the background
    // once all handed-out items have completed.
    pendingItems
        .WaitAsync()
        .ContinueWith(completedWait => { ChangeStatus(Statuses.Depleted); });

    // TODO: Add predefined values validation for config/job after initialization and before crawling EntryLinks.
    return availableItems;
}