Exemplo n.º 1
0
        /// <summary>
        /// Runs the base response processing and then post-processes the produced units:
        /// extracted data records are written into the crawling base's predefined values,
        /// and extracted links are crawled right away and then cleared from their unit.
        /// </summary>
        /// <param name="webResponse">Response forwarded to the base implementation.</param>
        /// <returns>The unit list produced by the base implementation.</returns>
        public override async Task <IList <ResourceContentUnit> > ProcessWebResponseAsync(WebResponse webResponse)
        {
            var contentUnits = await base.ProcessWebResponseAsync(webResponse);

            foreach (var contentUnit in contentUnits)
            {
                if (contentUnit is ExtractedDataUnit dataUnit)
                {
                    // Every extracted record overwrites the predefined value stored under the same key
                    foreach (var record in dataUnit.ExtractedData)
                    {
                        _crawlingBase.PredefinedValues.Dictionary[record.Key] = record.Value;
                    }
                }
                else if (contentUnit is ExtractedLinksUnit linksUnit)
                {
                    // In case of initialization link, crawl related items immediately (omitting the regular crawling queue)
                    var crawlTasks = linksUnit.ExtractedLinks
                                     .Select(link => CrawlingEngine.CrawlAsync(link, false))
                                     .ToArray();

                    await Task.WhenAll(crawlTasks);

                    // All links of this unit have been crawled above, so the list is emptied
                    linksUnit.ExtractedLinks.Clear();
                }
            }

            return contentUnits;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts an item and then applies the extra handling that depends on its concrete type:
        /// link items are passed to <c>ExtractLink</c>, and frame items are additionally moved from
        /// <c>ExtractedLinks</c> to <c>ExtractedFrames</c>, downloaded inline and stored in
        /// <c>ExtractedItems</c>.
        /// </summary>
        /// <param name="extractionItem">Item to extract; skipped when <paramref name="extractedItems"/> already contains its name.</param>
        /// <param name="extractionItems">Forwarded unchanged to <c>ExtractItem</c>.</param>
        /// <param name="extractedItems">Checked for the item's name and forwarded to <c>ExtractItem</c>.</param>
        /// <param name="relativeLocationBase">Forwarded unchanged to <c>ExtractItem</c>.</param>
        /// <remarks>
        /// This method blocks the calling thread (via <c>Task.WaitAll</c>) until all frame downloads finish.
        /// NOTE(review): the downloaded frame content is written to the <c>ExtractedItems</c> member, not to the
        /// <paramref name="extractedItems"/> parameter - confirm that both refer to the same collection.
        /// </remarks>
        protected void ExtractAutoDetect(
            ExtractionItem extractionItem,
            IDictionary <string, ExtractionItem> extractionItems,
            CollectionDictionary <string, string> extractedItems,
            ResponseParserPositionPointer?relativeLocationBase = null)
        {
            // An entry with this name already exists, so there is nothing left to do
            if (extractedItems.ContainsKey(extractionItem.Name))
            {
                return;
            }

            // Step 1: extract the item as a normal item
            ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

            // Step 2: if the item is a link, extract its link aspect
            if (extractionItem is ExtractionLink linkItem)
            {
                ExtractLink(linkItem);
            }

            // Step 3: frame-specific actions. Per the original author, a frame is a link as well,
            // so its link data has already been extracted by the step above
            if (!(extractionItem is ExtractionFrame frameItem))
            {
                return;
            }

            var frameName = frameItem.Name;

            // Frames are stored separated from links, to avoid queuing and download by the crawler
            ExtractedFrames[frameName] = ExtractedLinks[frameName];
            ExtractedLinks.Remove(frameName);

            // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
            var pendingDownloads = ExtractedFrames
                                   .GetValues(frameName)
                                   .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
                                   .ToArray();

            // We're not in async context, so this thread is held until all the inline downloads complete
            Task.WaitAll(pendingDownloads);

            // Replace previously extracted data for the frame with its downloaded content
            ExtractedItems[frameName] =
                pendingDownloads
                .SelectMany(download => download.Result)
                .OfType <ResponseStringUnit>()
                .SelectMany(frameResponse =>
                {
                    IEnumerable <string> content = new [] { frameResponse.Content };

                    if (frameItem.PostProcessOnDownload)
                    {
                        content = PostProcess(content, extractionItem.PostProcessors, DependencyDataSource);
                    }

                    return content.ToArray();
                })
                .ToArray();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Crawls the initialization link (when one is set) and then returns a snapshot of the
        /// currently available queue items. The status is switched to <c>Fetching</c> on entry,
        /// to <c>Error</c> when initialization throws, and to <c>Depleted</c> either immediately
        /// (no items available) or once every returned item has raised <c>ProcessingCompleted</c>.
        /// </summary>
        /// <param name="portionSize">Not used by this implementation.</param>
        /// <param name="cts">Not used by this implementation.</param>
        /// <returns>
        /// The available queue items, or an empty array when crawling the initialization link failed.
        /// </returns>
        public override async Task <IList <CrawlingQueueItem> > FetchAsync(int portionSize, CancellationTokenSource cts)
        {
            ChangeStatus(Statuses.Fetching);

            // While an initialization link is set, it is crawled before any queue items are handed out.
            // NOTE(review): this method never resets InitializationLink - confirm it is cleared elsewhere,
            // otherwise the link is crawled again on every fetch.
            if (InitializationLink != null)
            {
                try
                {
                    await CrawlingEngine.CrawlAsync(InitializationLink);
                }
                catch (Exception ex)
                {
                    // Deliberate best-effort: log, flag the error status and hand back nothing to crawl
                    Trace.TraceError($"{GetType().Name}.FetchAsync: Initialization failed for link {InitializationLink.Url} (Config: {InitializationLink.Config.Name}, Job: {InitializationLink.Job?.Name}) with exception [{ex}]");
                    ChangeStatus(Statuses.Error);
                    return(new CrawlingQueueItem[] { });
                }
            }

            var queueItems = QueueItemsAvailable.ToArray();

            if (queueItems.Length == 0)
            {
                ChangeStatus(Statuses.Depleted);
                return(queueItems);
            }

            var queueItemsCountdown = new AsyncCountdownEvent(queueItems.Length);

            // NOTE(review): these handlers are never unsubscribed; if the same item can be returned by a
            // later fetch, it will also signal this (already completed) countdown - verify against callers.
            foreach (var queueItem in queueItems)
            {
                queueItem.ProcessingCompleted += () =>
                {
                    queueItemsCountdown.Signal();
                };
            }

            // Intentionally not awaited: the items are returned to the caller right away and the status
            // flips to Depleted later, once all of them have completed. The discard makes the
            // fire-and-forget explicit, and the explicit scheduler keeps the continuation from being
            // bound to whatever TaskScheduler.Current happens to be at the call site.
            _ = queueItemsCountdown
                .WaitAsync()
                .ContinueWith(
                    allQueuedItemsCompletedTask =>
                    {
                        ChangeStatus(Statuses.Depleted);
                    },
                    TaskScheduler.Default);

            // TODO: Add predefined values validation for config/job after initialization and before crawling EntryLinks.
            return(queueItems);
        }