/// <summary>
/// Extracts <paramref name="extractionItem"/> as a regular item and then applies the additional
/// handling selected by its runtime type: link extraction for <c>ExtractionLink</c> items and
/// inline download for <c>ExtractionFrame</c> items.
/// </summary>
/// <param name="extractionItem">Item to extract.</param>
/// <param name="extractionItems">All configured items by name; forwarded to <c>ExtractItem</c>.</param>
/// <param name="extractedItems">Values extracted so far; an item whose name is already present is skipped.</param>
/// <param name="relativeLocationBase">Optional position pointer forwarded to <c>ExtractItem</c>.</param>
protected void ExtractAutoDetect(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null)
{
    if (extractedItems.ContainsKey(extractionItem.Name))
    {
        return;
    }

    // Every item is first extracted as a plain item; link/frame specifics are layered on top.
    ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

    // Links additionally get their link aspect extracted.
    if (extractionItem is ExtractionLink linkItem)
    {
        ExtractLink(linkItem);
    }

    // Only frames need the remaining steps. A frame is a link as well, so its link data
    // has already been extracted by the branch above.
    if (!(extractionItem is ExtractionFrame frameItem))
    {
        return;
    }

    // Frames are stored separately from links, to avoid queuing and download by the crawler.
    ExtractedFrames[frameItem.Name] = ExtractedLinks[frameItem.Name];
    ExtractedLinks.Remove(frameItem.Name);

    // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
    var pendingDownloads = ExtractedFrames.GetValues(frameItem.Name)
        .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
        .ToArray();

    // We're not in an async context, so this thread is held until all inline downloads complete.
    Task.WaitAll(pendingDownloads);

    // Replace the previously extracted data for the frame with its downloaded content.
    ExtractedItems[frameItem.Name] = pendingDownloads
        .SelectMany(download => download.Result)
        .OfType<ResponseStringUnit>()
        .SelectMany(response => ReadDownloadedContent(response))
        .ToArray();

    // Turns one downloaded response into its content, post-processed when the frame asks for it.
    string[] ReadDownloadedContent(ResponseStringUnit response)
    {
        IEnumerable<string> content = new[] { response.Content };
        if (frameItem.PostProcessOnDownload)
        {
            content = PostProcess(content, extractionItem.PostProcessors, DependencyDataSource);
        }

        return content.ToArray();
    }
}
/// <summary>
/// Creates a new <c>ExtractionItem</c> and fills it from <paramref name="reader"/>:
/// attributes first, then post-processors.
/// </summary>
/// <param name="reader">XML reader the item is read from.</param>
/// <param name="config">Website configuration forwarded to the attribute reader.</param>
/// <returns>The populated extraction item.</returns>
private static ExtractionItem ReadExtractionItemSection(XmlReader reader, WebsiteConfig config)
{
    var item = new ExtractionItem();

    // Attributes are read before post-processors; keep this order.
    ReadExtractionItemAttributes(item, reader, config);
    ReadExtractionItemPostProcessors(item, reader);

    return item;
}
/// <summary>
/// Populates <paramref name="extractionItem"/> from the attributes exposed by <paramref name="reader"/>:
/// <c>name</c>, <c>value</c>, <c>location</c>, <c>location_type</c>, <c>include_child_nodes</c>,
/// <c>context</c>, <c>context_document_type</c> and <c>depends_on</c>.
/// </summary>
/// <param name="extractionItem">Item to populate.</param>
/// <param name="reader">XML reader the attributes are read from.</param>
/// <param name="config">Supplies the fallback for <c>context_document_type</c> via <c>CrawlingSettings.DocumentType</c>.</param>
private static void ReadExtractionItemAttributes(ExtractionItem extractionItem, XmlReader reader, WebsiteConfig config)
{
    // Invoked as a static method on purpose (presumably): with two string arguments the
    // extension-call syntax would bind to XmlReader.GetAttribute(name, namespaceURI) instead.
    var name = XmlReaderExtensions.GetAttribute(reader, "name", "default");
    extractionItem.Name = name;

    var value = reader.GetAttribute("value");
    extractionItem.Value = value;

    var location = reader.GetAttribute("location");
    var locationType = reader.GetAttribute("location_type", ExtractionLocation.ExtractionLocationTypes.InnerText);
    var includeChildNodes = reader.GetAttribute("include_child_nodes", true);
    extractionItem.SetExtractionLocation(location, locationType, includeChildNodes);

    // NOTE(review): if CrawlingSettings.DocumentType is a string, the second call below resolves to
    // XmlReader.GetAttribute(name, namespaceURI) rather than the defaulting extension - confirm its type.
    var context = reader.GetAttribute("context");
    var contextDocumentType = reader.GetAttribute("context_document_type", config.CrawlingSettings.DocumentType);
    extractionItem.SetExtractionContext(context, contextDocumentType);

    // "depends_on" is a list separated by ',', ';' or '|'; a missing attribute leaves DependsOn null.
    var dependsOn = reader.GetAttribute<string>("depends_on", null);
    extractionItem.DependsOn = dependsOn == null
        ? null
        : dependsOn.Split(new[] { ',', ';', '|' }, StringSplitOptions.RemoveEmptyEntries);
}
/// <summary>
/// Ensures that every dependency named by the dependency-bearing strings of
/// <paramref name="extractionItem"/> has been extracted before the item itself is processed.
/// </summary>
/// <param name="extractionItem">Item whose dependencies are extracted.</param>
/// <param name="extractionItems">All configured items by name; dependencies are looked up here.</param>
/// <param name="extractedItems">Values extracted so far; forwarded to <c>ExtractAutoDetect</c>.</param>
/// <param name="relativeLocationBase">Optional position pointer forwarded to <c>ExtractAutoDetect</c>.</param>
private void ExtractDependencies(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null
)
{
    // TODO:
    // A MAJOR BUG!!!
    // StringWithDependency objects are shared between threads and jobs
    // Therefore, current strategy of "Resolving" it on extraction:
    // 1) Changes instance value for everyone that might be using it
    // 2) "Resolve" stores resolved data inside the object,
    //    so next resolve with new data does nothing since the string is already resolved

    // Links are extracted as dependencies as well (like normal extraction items).
    foreach (var stringWithDependencies in extractionItem.GetStringsWithDependencies())
    {
        foreach (var dependencyName in stringWithDependencies.DependencyNames)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            if (extractionItems.TryGetValue(dependencyName, out var dependencyItem))
            {
                ExtractAutoDetect(dependencyItem, extractionItems, extractedItems, relativeLocationBase);
                continue;
            }

            // A dependency may come from PredefinedValues (config- or job-level); in that case it
            // does not have to be extracted from any of the page items.
            Debug.Assert(
                _documentLink.Config.PredefinedValues.Dictionary.ContainsKey(dependencyName)
                || (_documentLink.Job?.PredefinedValues.Dictionary.ContainsKey(dependencyName) ?? false),
                $"Dependency '{dependencyName}' is neither a configured extraction item nor a predefined value."
            );
        }
    }
}
/// <summary>
/// Extracts the values of <paramref name="extractionItem"/> - its constant value, if configured,
/// plus everything found at its location - and stores them in <paramref name="extractedItems"/>
/// under the item's name, post-processed when <c>PostProcessOnExtraction</c> is set.
/// </summary>
/// <param name="extractionItem">Item to extract.</param>
/// <param name="extractionItems">All configured items by name; used to extract dependencies first.</param>
/// <param name="extractedItems">Receives the extracted values; an item whose name is already present is skipped.</param>
/// <param name="relativeLocationBase">Optional position pointer passed to the location lookup.</param>
protected void ExtractItem(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null
)
{
    // Already extracted as someone's dependency.
    if (extractedItems.ContainsKey(extractionItem.Name))
    {
        return;
    }

    ExtractDependencies(extractionItem, extractionItems, extractedItems, relativeLocationBase);

    var values = new List<string>();

    // Constant value, if specified in config.
    if (extractionItem.Value != null)
    {
        values.Add(DependencyDataSource.Resolve(extractionItem.Value));
    }

    // Values extracted from the page using a selector.
    if (extractionItem.Location != null)
    {
        var parsers = new List<DocumentProcessor>();
        if (extractionItem.Context != null)
        {
            // With a Context, the item is extracted from every document of the context item.
            parsers.AddRange(ResolveContextParsers());
        }
        else
        {
            // Without a Context, the item is extracted from the base document only.
            parsers.Add(this);
        }

        foreach (var parser in parsers)
        {
            var located = parser.ExtractItemValuesFromLocation(extractionItem.Location, relativeLocationBase);
            values.AddRange(located.Select(match => match.ExtractedValue));
        }
    }

    // TODO: Reconsider architecture
    // Links are not post-processed at all and frames are post-processed on download only
    extractedItems.AddValues(
        extractionItem.Name,
        extractionItem.PostProcessOnExtraction
            ? PostProcess(values, extractionItem.PostProcessors, DependencyDataSource)
            : values
    );

    // Returns the parsers for the item's context, creating and caching them on first use so the
    // context documents are parsed once rather than once per extracted element.
    IEnumerable<DocumentProcessor> ResolveContextParsers()
    {
        var contextName = DependencyDataSource.Resolve(extractionItem.Context.ContextItemName);
        if (ContextItemDocumentParsers.TryGetValue(contextName, out var contextParsers))
        {
            return contextParsers;
        }

        // Register the (still empty) list in the cache before any context document is parsed.
        contextParsers = new List<DocumentProcessor>();
        ContextItemDocumentParsers[contextName] = contextParsers;

        var contextFrames = ExtractedFrames[contextName];
        var contextLinks = contextFrames.OfType<DocumentLink>().ToArray();
        Debug.Assert(contextFrames.Count == contextLinks.Length);

        var contextContents = ExtractedItems.GetValues(contextName);
        Debug.Assert(contextLinks.Length == contextContents.Count);

        for (var index = 0; index < contextLinks.Length; index++)
        {
            // TODO: Documents and Downloaded strings order is not the same. Refactor.
            var contextParser = _documentLink.Config.CreateDocumentStringParser(
                contextContents[index],
                contextLinks[index],
                extractionItem.Context.ContextDocumentType
            );

            // The parser cache and extraction results are shared with the context parser;
            // otherwise we get infinite recursion.
            contextParser.ContextItemDocumentParsers = ContextItemDocumentParsers;
            contextParser.ExtractedItems = ExtractedItems;
            contextParser.ExtractedLinks = ExtractedLinks;
            contextParser.ExtractedFrames = ExtractedFrames;

            // Add to the shared list first, then parse - same order as before.
            contextParsers.Add(contextParser);
            contextParser.Parse();
        }

        return contextParsers;
    }
}
/// <summary>
/// Reads post-processors from <paramref name="reader"/> via the reader-only overload and
/// assigns them to <paramref name="extractionItem"/>.
/// </summary>
/// <param name="extractionItem">Item whose <c>PostProcessors</c> property is set.</param>
/// <param name="reader">XML reader the post-processors are read from.</param>
private static void ReadExtractionItemPostProcessors(ExtractionItem extractionItem, XmlReader reader)
    => extractionItem.PostProcessors = ReadExtractionItemPostProcessors(reader);