Пример #1
0
        /// <summary>
        /// Extracts an item as a regular item and then applies the extra handling required by its
        /// runtime type: link extraction for <c>ExtractionLink</c> items, and inline download with
        /// value replacement for <c>ExtractionFrame</c> items.
        /// </summary>
        /// <param name="extractionItem">The item to extract.</param>
        /// <param name="extractionItems">Configured items, forwarded to <c>ExtractItem</c>.</param>
        /// <param name="extractedItems">Values extracted so far; if it already holds this item's name, nothing is done.</param>
        /// <param name="relativeLocationBase">Optional position pointer, forwarded to <c>ExtractItem</c>.</param>
        protected void ExtractAutoDetect(
            ExtractionItem extractionItem,
            IDictionary<string, ExtractionItem> extractionItems,
            CollectionDictionary<string, string> extractedItems,
            ResponseParserPositionPointer? relativeLocationBase = null)
        {
            // Nothing to do when the item has already been extracted.
            if (extractedItems.ContainsKey(extractionItem.Name))
            {
                return;
            }

            // Step 1: plain extraction, exactly as for any other item.
            ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

            // Step 2: a link additionally gets its link aspect extracted.
            if (extractionItem is ExtractionLink link)
            {
                ExtractLink(link);
            }

            // Step 3: only frames need the remaining work. Per the original notes a frame is also a link,
            // so by this point its link data has already been extracted above.
            if (!(extractionItem is ExtractionFrame frame))
            {
                return;
            }

            // Frames are kept apart from links so that the crawler does not queue and download them.
            ExtractedFrames[frame.Name] = ExtractedLinks[frame.Name];
            ExtractedLinks.Remove(frame.Name);

            // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
            var downloads = ExtractedFrames
                .GetValues(frame.Name)
                .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
                .ToArray();

            // This method is synchronous, so the current thread is held until every inline download completes.
            Task.WaitAll(downloads);

            // Produces the content of one downloaded frame, post-processed when the frame asks for it.
            string[] ReadFrameContent(ResponseStringUnit response)
            {
                IEnumerable<string> content = new[] { response.Content };

                if (frame.PostProcessOnDownload)
                {
                    content = PostProcess(content, extractionItem.PostProcessors, DependencyDataSource);
                }

                return content.ToArray();
            }

            // Replace the values initially extracted for the frame with its downloaded content.
            // NOTE(review): this writes to the ExtractedItems member, not the extractedItems parameter —
            // confirm they always refer to the same collection.
            ExtractedItems[frame.Name] = downloads
                .SelectMany(download => download.Result)
                .OfType<ResponseStringUnit>()
                .SelectMany(response => ReadFrameContent(response))
                .ToArray();
        }
Пример #2
0
        /// <summary>
        /// Builds a new <c>ExtractionItem</c> from the reader: first its attributes, then its post-processors.
        /// </summary>
        /// <param name="reader">XML reader supplying the item's data.</param>
        /// <param name="config">Website configuration, forwarded to the attribute reader.</param>
        /// <returns>The newly created and populated item.</returns>
        private static ExtractionItem ReadExtractionItemSection(XmlReader reader, WebsiteConfig config)
        {
            var item = new ExtractionItem();

            // Attributes are read before post-processors, in that order.
            ReadExtractionItemAttributes(item, reader, config);
            ReadExtractionItemPostProcessors(item, reader);

            return item;
        }
Пример #3
0
        /// <summary>
        /// Fills in an extraction item's name, value, location, context and dependency list
        /// from XML attributes obtained through the reader.
        /// </summary>
        /// <param name="extractionItem">The item to populate.</param>
        /// <param name="reader">XML reader the attributes are read from.</param>
        /// <param name="config">Supplies the fallback document type for the extraction context.</param>
        private static void ReadExtractionItemAttributes(ExtractionItem extractionItem, XmlReader reader, WebsiteConfig config)
        {
            // Deliberately called through the static class: written as reader.GetAttribute("name", "default"),
            // the call would presumably bind to XmlReader's built-in (name, namespaceURI) overload instead.
            extractionItem.Name = XmlReaderExtensions.GetAttribute(reader, "name", "default");
            extractionItem.Value = reader.GetAttribute("value");

            // Where to extract from.
            var location = reader.GetAttribute("location");
            var locationType = reader.GetAttribute("location_type", ExtractionLocation.ExtractionLocationTypes.InnerText);
            var includeChildNodes = reader.GetAttribute("include_child_nodes", true);
            extractionItem.SetExtractionLocation(location, locationType, includeChildNodes);

            // Which context to extract within; the document type falls back to the crawling settings.
            var context = reader.GetAttribute("context");
            var contextDocumentType = reader.GetAttribute("context_document_type", config.CrawlingSettings.DocumentType);
            extractionItem.SetExtractionContext(context, contextDocumentType);

            // "depends_on" is a list separated by ',', ';' or '|'; it stays null when the attribute yields null.
            var dependsOn = reader.GetAttribute<string>("depends_on", null);
            extractionItem.DependsOn = dependsOn == null
                ? null
                : dependsOn.Split(new[] { ',', ';', '|' }, StringSplitOptions.RemoveEmptyEntries);
        }
Пример #4
0
        /// <summary>
        /// Ensures that every dependency named by the item's dependency-bearing strings has been extracted.
        /// Dependencies that are themselves extraction items are extracted via <c>ExtractAutoDetect</c>;
        /// any other dependency is expected to be a predefined value of the config or the job.
        /// </summary>
        /// <param name="extractionItem">The item whose dependencies must be available.</param>
        /// <param name="extractionItems">Configured items by name, used to look up dependencies.</param>
        /// <param name="extractedItems">Values extracted so far, forwarded to <c>ExtractAutoDetect</c>.</param>
        /// <param name="relativeLocationBase">Optional position pointer, forwarded to <c>ExtractAutoDetect</c>.</param>
        private void ExtractDependencies(
            ExtractionItem extractionItem,
            IDictionary<string, ExtractionItem> extractionItems,
            CollectionDictionary<string, string> extractedItems,
            ResponseParserPositionPointer? relativeLocationBase = null
            )
        {
            // TODO:
            // A MAJOR BUG!!!
            // StringWithDependency objects are shared between threads and jobs
            // Therefore, current strategy of "Resolving" it on extraction:
            // 1) Changes instance value for everyone that might be using it
            // 2) "Resolve" stores resolved data inside the object,
            //    so next resolve with new data does nothing since the string is already resolved

            // Links are extracted as dependencies as well (like normal extraction items).
            foreach (var stringWithDependencies in extractionItem.GetStringsWithDependencies())
            {
                foreach (var dependencyName in stringWithDependencies.DependencyNames)
                {
                    // Single lookup instead of ContainsKey followed by the indexer.
                    if (extractionItems.TryGetValue(dependencyName, out var dependencyItem))
                    {
                        ExtractAutoDetect(dependencyItem, extractionItems, extractedItems, relativeLocationBase);
                        continue;
                    }

                    // A dependency that is not an extraction item must come from PredefinedValues
                    // (config-level or job-level) and needs no extraction from the page.
                    Debug.Assert(
                        _documentLink.Config.PredefinedValues.Dictionary.ContainsKey(dependencyName) ||
                        (_documentLink.Job?.PredefinedValues.Dictionary.ContainsKey(dependencyName) ?? false),
                        $"Dependency '{dependencyName}' is neither an extraction item nor a predefined value."
                        );
                }
            }
        }
Пример #5
0
        // Extracts the values of a single item and stores them in extractedItems under the item's name.
        //
        // Steps, in order:
        //   1. Return immediately if extractedItems already has an entry for the item's name.
        //   2. Extract the item's dependencies (ExtractDependencies).
        //   3. If the item has a constant Value, add it after resolving it through DependencyDataSource.
        //   4. If the item has a Location, collect values from it using either this processor (no Context)
        //      or the document processors created for the item's context (Context specified).
        //   5. Store the collected values, post-processed only when PostProcessOnExtraction is set.
        //
        // Parameters:
        //   extractionItem       - the item to extract.
        //   extractionItems      - configured items, forwarded to ExtractDependencies.
        //   extractedItems       - receives the extracted values; also used for the "already extracted" check.
        //   relativeLocationBase - optional position pointer forwarded to ExtractItemValuesFromLocation.
        //
        // NOTE(review): the context branch reads the ExtractedItems / ExtractedFrames members rather than
        // the extractedItems parameter — confirm they are expected to be the same collections.
        protected void ExtractItem(
            ExtractionItem extractionItem,
            IDictionary <string, ExtractionItem> extractionItems,
            CollectionDictionary <string, string> extractedItems,
            ResponseParserPositionPointer?relativeLocationBase = null
            )
        {
            if (extractedItems.ContainsKey(extractionItem.Name))
            {
                return; // already extracted as someone's dependency
            }
            ExtractDependencies(extractionItem, extractionItems, extractedItems, relativeLocationBase);

            var extractedValues = new List <string>();

            // constant value, if specified in config
            if (extractionItem.Value != null)
            {
                extractedValues.Add(DependencyDataSource.Resolve(extractionItem.Value));
            }

            // values, extracted from page using selector
            if (extractionItem.Location != null)
            {
                var documentParsers = new List <DocumentProcessor>();
                // If Context is not specified for the item, it is extracted from the base document, without any complexities
                if (extractionItem.Context == null)
                {
                    documentParsers.Add(this);
                }
                else
                {
                    // If Context is specified, a separate DocumentParser is created for every context item.
                    // They are created once per context item and cached, so the document is not parsed again for each element extracted
                    var contextItemName = DependencyDataSource.Resolve(extractionItem.Context.ContextItemName);

                    if (!ContextItemDocumentParsers.TryGetValue(contextItemName, out var contextDocumentParsers))
                    {
                        // Generate context parsers for this ContextItem if they have not been generated before.
                        // The (still empty) list is registered in the cache before any parser is created or parsed.
                        ContextItemDocumentParsers[contextItemName]
                              = contextDocumentParsers
                              = new List <DocumentProcessor>();

                        var contextResourceFrames = ExtractedFrames[contextItemName];
                        var contextDocumentLinks  = contextResourceFrames.OfType <DocumentLink>().ToArray();

                        // Every frame stored for the context item is expected to be a DocumentLink
                        Debug.Assert(contextResourceFrames.Count == contextDocumentLinks.Length);

                        var contextDocumentStrings = ExtractedItems.GetValues(contextItemName);

                        // Links and downloaded strings are paired by index below, so the counts must match
                        Debug.Assert(contextDocumentLinks.Length == contextDocumentStrings.Count);

                        for (var i = 0; i < contextDocumentLinks.Length; i++)
                        {
                            // TODO: Documents and Downloaded strings order is not the same. Refactor.
                            var documentString = contextDocumentStrings[i];
                            var documentLink   = contextDocumentLinks[i];

                            var contextDocumentParser =
                                _documentLink.Config.CreateDocumentStringParser(
                                    documentString,
                                    documentLink,
                                    extractionItem.Context.ContextDocumentType
                                    );

                            // Reusing document parsers globally inside the config. Otherwise we get infinite recursion.
                            // The child parser shares this processor's parser cache and extracted collections.
                            contextDocumentParser.ContextItemDocumentParsers = ContextItemDocumentParsers;
                            contextDocumentParser.ExtractedItems             = ExtractedItems;
                            contextDocumentParser.ExtractedLinks             = ExtractedLinks;
                            contextDocumentParser.ExtractedFrames            = ExtractedFrames;

                            // The parser is added to the cached list before it is parsed
                            contextDocumentParsers.Add(contextDocumentParser);
                            contextDocumentParser.Parse();
                        }
                    }

                    documentParsers.AddRange(contextDocumentParsers);
                }

                // Collect the values found at the item's location by every selected parser
                foreach (var documentParser in documentParsers)
                {
                    extractedValues.AddRange(
                        documentParser.ExtractItemValuesFromLocation(extractionItem.Location, relativeLocationBase)
                        .Select(pred => pred.ExtractedValue)
                        );
                }
            }

            // TODO: Reconsider architecture
            // Links are not post-processed at all and frames are post-processed on download only

            // apply post-processing, if specified
            extractedItems.AddValues(
                extractionItem.Name,
                extractionItem.PostProcessOnExtraction
                    ? PostProcess(extractedValues, extractionItem.PostProcessors, DependencyDataSource)
                    : extractedValues
                );
        }
Пример #6
0
 /// <summary>
 /// Reads post-processors through the single-argument overload and assigns them to the item.
 /// </summary>
 /// <param name="extractionItem">The item whose <c>PostProcessors</c> property is set.</param>
 /// <param name="reader">XML reader passed to the single-argument overload.</param>
 private static void ReadExtractionItemPostProcessors(ExtractionItem extractionItem, XmlReader reader)
     => extractionItem.PostProcessors = ReadExtractionItemPostProcessors(reader);