/// <summary>
/// Merges rule elements that share the same <c>Selector</c> into the first element seen for that selector.
/// </summary>
/// <param name="ruleElements">Rule elements to group and merge.</param>
/// <returns>
/// One element per distinct selector: the first element of each group, after the
/// "property" and "style" children of the remaining group members have been added to it.
/// </returns>
/// <remarks>
/// The input elements are modified in place: merged-away elements are removed from their
/// parent (when they have one), and nested "style" children of each surviving element are
/// merged recursively.
/// </remarks>
public static IEnumerable<XElement> Merge(this IEnumerable<XElement> ruleElements)
{
    // Group the incoming rules by their selector.
    var rulesBySelector = new CollectionDictionary<Selector, XElement, List<XElement>>();
    foreach (var ruleElement in ruleElements)
    {
        rulesBySelector.Add(new Selector(ruleElement), ruleElement);
    }

    var mergedRules = new List<XElement>();
    foreach (var group in rulesBySelector)
    {
        var rules = group.Value;
        if (rules.Count < 1)
        {
            continue;
        }

        // The first rule of the group absorbs the content of all the others.
        var target = rules[0];
        for (var index = 1; index < rules.Count; index++)
        {
            var duplicate = rules[index];
            duplicate.Elements("property").ForEach(target.Add);
            duplicate.Elements("style").ForEach(target.Add);

            // Only detach rules that are actually attached to a tree.
            if (duplicate.Parent != null)
            {
                duplicate.Remove();
            }
        }

        // Called for its side effect only: nested styles are merged in place,
        // so the returned sequence is intentionally discarded.
        target.Elements("style").Merge();
        mergedRules.Add(target);
    }

    return mergedRules;
}
/// <summary>
/// Extracts an item and then applies the extra handling required by its concrete kind
/// (link or frame).
/// </summary>
/// <param name="extractionItem">Item to extract.</param>
/// <param name="extractionItems">All extraction items that may be referenced as dependencies.</param>
/// <param name="extractedItems">Target collection; items already present in it are skipped.</param>
/// <param name="relativeLocationBase">Optional position that locations are resolved relative to.</param>
/// <remarks>
/// For frames this method blocks the calling thread (<c>Task.WaitAll</c>) until every frame
/// download has finished.
/// </remarks>
protected void ExtractAutoDetect(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null)
{
    if (extractedItems.ContainsKey(extractionItem.Name))
    {
        return;
    }

    // Step 1: every item is first extracted as a plain item.
    ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

    // Step 2: links additionally get their link aspect extracted
    // (link-scoped items, registration of the extracted link).
    if (extractionItem is ExtractionLink link)
    {
        ExtractLink(link);
    }

    // Step 3: frames are links too, so their link data has been extracted above.
    // Only the frame-specific work remains: download and replace the values in ExtractedItems.
    if (!(extractionItem is ExtractionFrame frame))
    {
        return;
    }

    var frameName = frame.Name;

    // Frames are stored separately from links to avoid queuing and download by the crawler.
    ExtractedFrames[frameName] = ExtractedLinks[frameName];
    ExtractedLinks.Remove(frameName);

    // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
    var downloads = ExtractedFrames.GetValues(frameName)
        .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
        .ToArray();

    // We are not in an async context, so this thread is held until all inline downloads complete.
    Task.WaitAll(downloads);

    // Replace the previously extracted data for the frame with its downloaded content.
    ExtractedItems[frameName] = downloads
        .SelectMany(download => download.Result)
        .OfType<ResponseStringUnit>()
        .SelectMany(response =>
        {
            IEnumerable<string> contents = new[] { response.Content };
            IEnumerable<string> processed = frame.PostProcessOnDownload
                ? PostProcess(contents, extractionItem.PostProcessors, DependencyDataSource)
                : contents;

            return processed.ToArray();
        })
        .ToArray();
}
/// <summary>
/// Reads the entry links from the current "entry" section of the configuration XML.
/// </summary>
/// <param name="reader">Reader positioned inside the "entry" section.</param>
/// <param name="config">Website configuration the links belong to.</param>
/// <param name="job">Website job the links belong to; may be <c>null</c>.</param>
/// <returns>The entry links, in document order.</returns>
/// <exception cref="ArgumentException">A start element other than "link" was encountered.</exception>
private static IList<ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
{
    var entryLinks = new List<ResourceLink>();

    // Read until the closing </entry> tag (or the end of the input).
    while ((reader.Name != "entry" || reader.NodeType != XmlNodeType.EndElement) && reader.Read())
    {
        if (!reader.IsStartElement())
        {
            continue;
        }

        if (reader.Name != "link")
        {
            throw new ArgumentException("Unrecognized element", reader.Name);
        }

        var extractionLink = ReadExtractionLinkSection(reader, config);

        var linkExtractedItems = new CollectionDictionary<string, StringWithDependencies>();
        foreach (var predefinedItem in extractionLink.PredefinedExtractionItems.Values)
        {
            linkExtractedItems.AddValue(predefinedItem.Name, predefinedItem.Value);
        }

        // Only values that need no resolving can serve as a data source for resolving the others.
        var constantItems = linkExtractedItems
            .Where(entry => entry.Value.Any(value => !value.RequiresResolve))
            .ToCollectionDictionary(
                entry => entry.Key,
                entry => entry.Value
                    .Where(value => !value.RequiresResolve)
                    .Select(value => value.FormatString)
            );

        // The job is optional (see the "job as CrawlingBase ?? config" fallback below),
        // so its predefined values must be read null-safely.
        var dependencyDataSource = new DependencyDataSource(
            constantItems,
            config.PredefinedValues,
            job?.PredefinedValues
        );

        // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
        var entryLink = new AutoDetectLink(
            dependencyDataSource.Resolve(extractionLink.Value),
            extractionLink.HttpMethod,
            dependencyDataSource.Resolve(extractionLink.Parameters),
            dependencyDataSource.Resolve(extractionLink.Headers),
            extractionLink.ExtractLinks,
            extractionLink.ExtractData,
            config,
            job,
            dependencyDataSource.Resolve(linkExtractedItems),
            (job as CrawlingBase ?? config)?.InitializationDocumentLink
        );

        entryLinks.Add(entryLink);
    }

    return entryLinks;
}
/// <summary>
/// Creates a post-processor configured with an output delimiter and a list of column transitions.
/// </summary>
/// <param name="outputDelimiter">Delimiter stored for use in the produced output.</param>
/// <param name="columnTransitions">
/// Column transitions. They are grouped by <c>Name</c>, and the order of first appearance of
/// each distinct name is recorded as the column order.
/// </param>
public ReformatCsvPostProcessor(string outputDelimiter, IList<CsvColumnTransition> columnTransitions)
{
    _outputDelimiter = outputDelimiter;

    // Several transitions may share a name, hence a collection dictionary keyed by name.
    _columnTransitions = columnTransitions.ToCollectionDictionary(
        transition => transition.Name,
        transition => transition);

    // Distinct names, kept in their original order of appearance.
    _columnsOrder = columnTransitions
        .Select(transition => transition.Name)
        .Distinct()
        .ToArray();
}
/// <summary>
/// Resolves every value of a collection dictionary, keeping the keys unchanged.
/// </summary>
/// <param name="dictionaryWithDependencies">Dictionary whose values are to be resolved; may be <c>null</c>.</param>
/// <returns>
/// A dictionary with the same keys and each value resolved via the single-value
/// <c>Resolve</c> overload, or <c>null</c> when the input is <c>null</c>.
/// </returns>
public CollectionDictionary<string, string> Resolve(CollectionDictionary<string, StringWithDependencies> dictionaryWithDependencies)
{
    return dictionaryWithDependencies == null
        ? null
        : dictionaryWithDependencies.ToCollectionDictionary(
            entry => entry.Key,
            entry => entry.Value.Select(item => Resolve(item)));
}
/// <summary>
/// Creates an initialization link from values that may still contain unresolved dependencies.
/// </summary>
/// <param name="urlWithDependencies">Link URL.</param>
/// <param name="httpMethod">HTTP method used to request the link.</param>
/// <param name="parametersWithDependencies">Request parameters.</param>
/// <param name="headersWithDependencies">Request headers.</param>
/// <param name="extractLinks">Forwarded to the base constructor's <c>extractLinks</c> argument.</param>
/// <param name="extractData">Forwarded to the base constructor's <c>extractData</c> argument.</param>
/// <param name="config">Website configuration the link belongs to.</param>
/// <param name="job">Website job the link belongs to.</param>
/// <param name="preExtractedItemsWithDependencies">Optional items extracted in advance.</param>
/// <remarks>
/// All arguments are forwarded to the base constructor; its final argument is always
/// <c>null</c> (presumably the referrer document link - confirm against the base class).
/// </remarks>
public InitializationLink(
    StringWithDependencies urlWithDependencies,
    string httpMethod,
    IDictionary<string, StringWithDependencies> parametersWithDependencies,
    IDictionary<string, StringWithDependencies> headersWithDependencies,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies = null)
    : base(
        urlWithDependencies,
        httpMethod,
        parametersWithDependencies,
        headersWithDependencies,
        extractLinks,
        extractData,
        config,
        job,
        preExtractedItemsWithDependencies,
        null)
{
}
/// <summary>
/// Creates an initialization link from plain string values.
/// </summary>
/// <param name="url">Link URL.</param>
/// <param name="httpMethod">HTTP method used to request the link.</param>
/// <param name="parameters">Request parameters.</param>
/// <param name="headers">Request headers.</param>
/// <param name="extractLinks">Forwarded to the base constructor's <c>extractLinks</c> argument.</param>
/// <param name="extractData">Forwarded to the base constructor's <c>extractData</c> argument.</param>
/// <param name="config">Website configuration the link belongs to.</param>
/// <param name="job">Website job the link belongs to.</param>
/// <param name="preExtractedItems">Optional items extracted in advance.</param>
/// <remarks>
/// All arguments are forwarded to the base constructor; its final argument is always
/// <c>null</c> (presumably the referrer document link - confirm against the base class).
/// </remarks>
public InitializationLink(
    string url,
    string httpMethod,
    IDictionary<string, string> parameters,
    IDictionary<string, string> headers,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, string> preExtractedItems = null)
    : base(
        url,
        httpMethod,
        parameters,
        headers,
        extractLinks,
        extractData,
        config,
        job,
        preExtractedItems,
        null)
{
}
/// <summary>
/// Console smoke test: runs a fixed script of inserts, reads, counts and a clear against the
/// static <c>CollectionDictionary</c> API and prints every returned value.
/// </summary>
/// <remarks>Nothing is asserted; the printed output has to be inspected manually.</remarks>
public void TestCollectionDictionary()
{
    // Inserts, in the exact order they are issued: { collection key, row }.
    var insertions = new[]
    {
        new[] { "Session1", "Row001" },
        new[] { "Session1", "Row002" },
        new[] { "Session1", "Row003" },
        new[] { "Session1", "Row004" },
        new[] { "Session1", "Row005" },
        new[] { "Session2", "Row010" },
        new[] { "Session2", "Row011" },
        new[] { "Session2", "Row012" },
        new[] { "Session2", "Row013" },
        new[] { "Session1", "Row006" },
        new[] { "Session1", "Row007" },
        new[] { "Session2", "Row014" },
    };

    foreach (var insertion in insertions)
    {
        Console.WriteLine(CollectionDictionary.CollectionDictionaryInsert(insertion[0], insertion[1]));
    }

    // Reads are issued in three consecutive batches: 3 x Session1, 10 x Session2, 6 x Session1.
    var readKeys = new[] { "Session1", "Session2", "Session1" };
    var readCounts = new[] { 3, 10, 6 };

    for (var batch = 0; batch < readKeys.Length; batch++)
    {
        for (var read = 0; read < readCounts[batch]; read++)
        {
            Console.WriteLine(CollectionDictionary.GetElement(readKeys[batch]));
        }
    }

    // Sizes of both collections after the reads.
    Console.WriteLine(CollectionDictionary.GetCount("Session1"));
    Console.WriteLine(CollectionDictionary.GetCount("Session2"));

    // Indexed access before and after clearing the first collection.
    Console.WriteLine(CollectionDictionary.GetElementAt("Session1", 1));
    Console.WriteLine(CollectionDictionary.ClearCollection("Session1"));
    Console.WriteLine(CollectionDictionary.GetElementAt("Session1", 1));
}
/// <summary>
/// Resolves a string with dependencies into every formatted value produced by
/// <c>FormatAllDependencyCombinations</c>.
/// </summary>
/// <param name="stringWithDependencies">String to resolve; may be <c>null</c>.</param>
/// <returns>
/// A single <c>null</c> when the argument is <c>null</c>; the format string itself when nothing
/// needs resolving; otherwise the formatted results. The sequence is produced lazily.
/// </returns>
/// <remarks>
/// Each dependency is looked up in this order: extracted items, config predefined values, job
/// predefined values. A dependency found nowhere is replaced with an empty string and reported
/// through <c>Trace.TraceError</c>.
/// </remarks>
public IEnumerable<string> ResolveAll(StringWithDependencies stringWithDependencies)
{
    if (stringWithDependencies == null)
    {
        yield return null;

        // Stop here: continuing would dereference the null argument.
        yield break;
    }

    if (!stringWithDependencies.RequiresResolve)
    {
        yield return stringWithDependencies.FormatString;

        // The format string is already the final value; no dependency lookup is needed.
        yield break;
    }

    var resolvedValues = new CollectionDictionary<string>();
    foreach (var dependencyName in stringWithDependencies.DependencyNames)
    {
        if (ExtractedItems.ContainsKey(dependencyName))
        {
            var dependencyValues = ExtractedItems[dependencyName];
            foreach (var dependencyValue in dependencyValues)
            {
                resolvedValues.AddValue(dependencyName, dependencyValue);
            }
        }
        else if (ConfigPredefinedValues.Dictionary.ContainsKey(dependencyName))
        {
            var dependencyValue = ConfigPredefinedValues.Dictionary[dependencyName];
            resolvedValues.AddValues(dependencyName, dependencyValue);
        }
        else if (JobPredefinedValues != null && JobPredefinedValues.Dictionary.ContainsKey(dependencyName))
        {
            var dependencyValue = JobPredefinedValues.Dictionary[dependencyName];
            resolvedValues.AddValues(dependencyName, dependencyValue);
        }
        else
        {
            // Fall back to an empty value so formatting can still proceed.
            resolvedValues.AddValue(dependencyName, string.Empty);

            // Join the items explicitly: interpolating the LINQ query itself would only
            // print the enumerable's type name.
            var requestedDependencies = string.Join(",", stringWithDependencies.DependencyNames);
            var availableItems = string.Join(
                ", ",
                ExtractedItems.Select(pred => string.Format("[{0}: {1}]", pred.Key, string.Join(",", pred.Value))));

            Trace.TraceError($"{GetType().Name}.ResolveAll: Could not resolve item {dependencyName} with dependencies [{requestedDependencies}] based on extracted items {availableItems}");
        }
    }

    foreach (var combination in FormatAllDependencyCombinations(stringWithDependencies, resolvedValues))
    {
        yield return combination;
    }
}
/// <summary>
/// Creates a document link from values that may still contain unresolved dependencies.
/// </summary>
/// <param name="urlWithDependencies">Link URL.</param>
/// <param name="httpMethod">HTTP method used to request the document.</param>
/// <param name="parametersWithDependencies">Request parameters.</param>
/// <param name="headersWithDependencies">Request headers.</param>
/// <param name="extractLinks">Value stored in <c>ExtractLinks</c>.</param>
/// <param name="extractData">Value stored in <c>ExtractData</c>.</param>
/// <param name="config">Website configuration the link belongs to.</param>
/// <param name="job">Website job the link belongs to.</param>
/// <param name="preExtractedItemsWithDependencies">Value stored in <c>PreExtractedItemsWithDependencies</c>.</param>
/// <param name="referrerDocumentLink">Optional referring document link, forwarded to the base constructor.</param>
public DocumentLink(
    StringWithDependencies urlWithDependencies,
    string httpMethod,
    IDictionary<string, StringWithDependencies> parametersWithDependencies,
    IDictionary<string, StringWithDependencies> headersWithDependencies,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies,
    DocumentLink referrerDocumentLink = null)
    : base(
        urlWithDependencies,
        httpMethod,
        parametersWithDependencies,
        headersWithDependencies,
        config,
        job,
        referrerDocumentLink)
{
    // Document-specific state; everything else is handled by the base constructor.
    ExtractLinks = extractLinks;
    ExtractData = extractData;
    PreExtractedItemsWithDependencies = preExtractedItemsWithDependencies;
}
/// <summary>
/// Makes sure that every item the given extraction item depends on has been extracted.
/// </summary>
/// <param name="extractionItem">Item whose dependencies are to be extracted.</param>
/// <param name="extractionItems">All extraction items, keyed by name.</param>
/// <param name="extractedItems">Collection receiving the extracted dependency values.</param>
/// <param name="relativeLocationBase">Optional position that locations are resolved relative to.</param>
private void ExtractDependencies(
    ExtractionItem extractionItem,
    IDictionary<string, ExtractionItem> extractionItems,
    CollectionDictionary<string, string> extractedItems,
    ResponseParserPositionPointer? relativeLocationBase = null)
{
    // TODO:
    // A MAJOR BUG!!!
    // StringWithDependency objects are shared between threads and jobs.
    // Therefore, the current strategy of "resolving" them on extraction:
    // 1) Changes the instance value for everyone that might be using it
    // 2) "Resolve" stores resolved data inside the object,
    //    so the next resolve with new data does nothing since the string is already resolved

    // Links are extracted as dependencies as well (like normal extraction items).
    foreach (var stringWithDependencies in extractionItem.GetStringsWithDependencies())
    {
        foreach (var dependencyName in stringWithDependencies.DependencyNames)
        {
            if (extractionItems.TryGetValue(dependencyName, out var dependencyItem))
            {
                ExtractAutoDetect(dependencyItem, extractionItems, extractedItems, relativeLocationBase);
                continue;
            }

            // Not a page item, so the dependency has to come from the predefined values
            // and needs no extraction. The check stays inside Debug.Assert so that it is
            // only evaluated in debug builds.
            Debug.Assert(
                _documentLink.Config.PredefinedValues.Dictionary.ContainsKey(dependencyName)
                || _documentLink.Job?.PredefinedValues.Dictionary.ContainsKey(dependencyName) == true
            );
        }
    }
}
/// <summary>
/// Creates a document link from plain string values.
/// </summary>
/// <param name="url">Link URL.</param>
/// <param name="httpMethod">HTTP method used to request the document.</param>
/// <param name="parameters">Request parameters.</param>
/// <param name="headers">Request headers.</param>
/// <param name="extractLinks">Value stored in <c>ExtractLinks</c>.</param>
/// <param name="extractData">Value stored in <c>ExtractData</c>.</param>
/// <param name="config">Website configuration the link belongs to.</param>
/// <param name="job">Website job the link belongs to.</param>
/// <param name="preExtractedItems">Value stored in <c>PreExtractedItems</c>.</param>
/// <param name="referrerDocumentLink">Optional referring document link, forwarded to the base constructor.</param>
public DocumentLink(
    string url,
    string httpMethod,
    IDictionary<string, string> parameters,
    IDictionary<string, string> headers,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, string> preExtractedItems,
    DocumentLink referrerDocumentLink = null)
    : base(
        url,
        httpMethod,
        parameters,
        headers,
        config,
        job,
        referrerDocumentLink)
{
    // Document-specific state; everything else is handled by the base constructor.
    ExtractLinks = extractLinks;
    ExtractData = extractData;
    PreExtractedItems = preExtractedItems;
}
/// <summary>
/// Overwrites the saved list of the given kind with the content of the current list and
/// optionally persists the application settings.
/// </summary>
/// <param name="listKind">Kind of list to synchronise.</param>
/// <param name="save">When <c>true</c>, the application settings are saved after synchronising.</param>
/// <remarks>Does nothing when either the saved or the current collection is not registered for the kind.</remarks>
private void TrySyncSavedLists(ListKind listKind, bool save = true)
{
    IList<string> savedCollection;
    CollectionPair currentCollection;

    // Both sides have to exist; the lookups short-circuit in the original order.
    if (!savedCollections.TryGetValue(listKind, out savedCollection)
        || !CollectionDictionary.TryGetValue(listKind, out currentCollection))
    {
        return;
    }

    // Replace the saved content with the current content.
    savedCollection.Clear();
    currentCollection.List.Each(savedCollection.Add);

    if (!save)
    {
        return;
    }

    SettingsService.SaveApplicationSettingsToXml(currentCharacter);
}
/// <summary>
/// Copy constructor: creates predefined values from an existing template.
/// </summary>
/// <param name="template">Instance to copy <c>Required</c> and <c>Dictionary</c> from.</param>
/// <remarks>
/// Both members are placed in newly created collections; whether the contained values
/// themselves are copied depends on the <c>CollectionDictionary</c> constructor.
/// </remarks>
public CrawlingPredefinedValues(CrawlingPredefinedValues template)
{
    // New set built from the template's required names.
    Required = template.Required.ToHashSet();

    // New dictionary built from the template's dictionary.
    Dictionary = new CollectionDictionary<string, string>(template.Dictionary);
}
/// <summary>
/// Creates a data source holding the three value stores that are assigned to
/// <c>ExtractedItems</c>, <c>ConfigPredefinedValues</c> and <c>JobPredefinedValues</c>.
/// </summary>
/// <param name="extractedItems">Items extracted so far.</param>
/// <param name="configPredefinedValues">Values predefined by the website configuration.</param>
/// <param name="jobPredefinedValues">Values predefined by the website job.</param>
public DependencyDataSource(
    CollectionDictionary<string, string> extractedItems,
    CrawlingPredefinedValues configPredefinedValues,
    CrawlingPredefinedValues jobPredefinedValues)
{
    // The references are stored as-is; nothing is copied.
    ExtractedItems = extractedItems;
    ConfigPredefinedValues = configPredefinedValues;
    JobPredefinedValues = jobPredefinedValues;
}
/// <summary>
/// Lazily creates one resource link for every value extracted for the given extraction link.
/// </summary>
/// <param name="extractionLink">Link definition whose extracted values are turned into resource links.</param>
/// <returns>
/// A <c>DocumentLink</c>, <c>FileLink</c> or <c>AutoDetectLink</c> per extracted value, depending
/// on <c>extractionLink.Type</c>. Empty when nothing has been extracted for the link.
/// </returns>
/// <exception cref="NotSupportedException">The link type is not one of the supported types.</exception>
protected IEnumerable<ResourceLink> ExtractResourceLinks(ExtractionLink extractionLink)
{
    if (!ExtractedItems.ContainsKey(extractionLink.Name))
    {
        yield break;
    }

    // The extracted values are re-read from ExtractedItems on every iteration on purpose,
    // exactly as the loop condition does.
    for (var linkIndex = 0; linkIndex < ExtractedItems[extractionLink.Name].Count; linkIndex++)
    {
        var linkUrl = ExtractedItems[extractionLink.Name][linkIndex];
        var linkPosition = new ResponseParserPositionPointer(extractionLink.Location, linkIndex);

        // Link-scoped items are located relative to the link only when configured so.
        var scopedItemsBase = extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink
            ? linkPosition
            : (ResponseParserPositionPointer?)null;

        var linkScopedItems = new CollectionDictionary<string, string>();
        foreach (var scopedItem in extractionLink.PredefinedExtractionItems.Values)
        {
            ExtractItem(scopedItem, extractionLink.PredefinedExtractionItems, linkScopedItems, scopedItemsBase);
        }

        switch (extractionLink.Type)
        {
            case ExtractionLink.LinkTypes.Document:
                yield return new DocumentLink(
                    linkUrl,
                    extractionLink.HttpMethod,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    extractionLink.ExtractLinks,
                    extractionLink.ExtractData,
                    _documentLink.Config,
                    _documentLink.Job,
                    linkScopedItems,
                    _documentLink
                );
                break;

            case ExtractionLink.LinkTypes.File:
                yield return new FileLink(
                    linkUrl,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    _documentLink.Config,
                    _documentLink.Job,
                    _documentLink
                );
                break;

            case ExtractionLink.LinkTypes.Auto:
                yield return new AutoDetectLink(
                    linkUrl,
                    extractionLink.HttpMethod,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    extractionLink.ExtractLinks,
                    extractionLink.ExtractData,
                    _documentLink.Config,
                    _documentLink.Job,
                    linkScopedItems,
                    _documentLink
                );
                break;

            default:
                throw new NotSupportedException();
        }
    }
}
/// <summary>
/// Extracts the values of a single item and stores them in <paramref name="extractedItems"/>
/// under the item's name.
/// </summary>
/// <param name="extractionItem">Item to extract.</param>
/// <param name="extractionItems">All extraction items that may be referenced as dependencies.</param>
/// <param name="extractedItems">Target collection; an item whose name is already present is skipped.</param>
/// <param name="relativeLocationBase">Optional position passed on to the location-based extraction.</param>
/// <remarks>
/// Values are gathered from the item's constant value (when set) and from its location (when
/// set). Post-processing is applied only when <c>PostProcessOnExtraction</c> is set.
/// </remarks>
protected void ExtractItem( ExtractionItem extractionItem, IDictionary <string, ExtractionItem> extractionItems, CollectionDictionary <string, string> extractedItems, ResponseParserPositionPointer?relativeLocationBase = null )
{
    if (extractedItems.ContainsKey(extractionItem.Name))
    {
        return; // already extracted as someone's dependency
    }

    // Dependencies go first, so that values this item refers to are available below.
    ExtractDependencies(extractionItem, extractionItems, extractedItems, relativeLocationBase);

    var extractedValues = new List <string>();

    // constant value, if specified in config
    if (extractionItem.Value != null)
    {
        extractedValues.Add(DependencyDataSource.Resolve(extractionItem.Value));
    }

    // values, extracted from page using selector
    if (extractionItem.Location != null)
    {
        var documentParsers = new List <DocumentProcessor>();

        // If Context is not specified for the item, it is extracted from the base document, without any complexities
        if (extractionItem.Context == null)
        {
            documentParsers.Add(this);
        }
        else
        {
            // If Context is specified, a separate DocumentParser is created for every context item.
            // Parsers are created once per context item and cached, so the document is not parsed
            // again for each element extracted.
            var contextItemName = DependencyDataSource.Resolve(extractionItem.Context.ContextItemName);
            if (!ContextItemDocumentParsers.TryGetValue(contextItemName, out var contextDocumentParsers))
            {
                // Generate context parsers for this ContextItem if they have not been generated before.
                // The (still empty) list is registered in the cache before any parser is created or run.
                ContextItemDocumentParsers[contextItemName] = contextDocumentParsers = new List <DocumentProcessor>();

                var contextResourceFrames = ExtractedFrames[contextItemName];
                var contextDocumentLinks = contextResourceFrames.OfType <DocumentLink>().ToArray();
                // Every frame stored for the context item is expected to be a DocumentLink.
                Debug.Assert(contextResourceFrames.Count == contextDocumentLinks.Length);

                var contextDocumentStrings = ExtractedItems.GetValues(contextItemName);
                // Links and downloaded strings are paired by index, so the counts are expected to match.
                Debug.Assert(contextDocumentLinks.Length == contextDocumentStrings.Count);

                for (var i = 0; i < contextDocumentLinks.Length; i++)
                {
                    // TODO: Documents and Downloaded strings order is not the same. Refactor.
                    var documentString = contextDocumentStrings[i];
                    var documentLink = contextDocumentLinks[i];
                    var contextDocumentParser = _documentLink.Config.CreateDocumentStringParser( documentString, documentLink, extractionItem.Context.ContextDocumentType );

                    // Reusing document parsers globally inside the config; otherwise we get infinite recursion.
                    // The context parser shares this parser's caches and result collections.
                    contextDocumentParser.ContextItemDocumentParsers = ContextItemDocumentParsers;
                    contextDocumentParser.ExtractedItems = ExtractedItems;
                    contextDocumentParser.ExtractedLinks = ExtractedLinks;
                    contextDocumentParser.ExtractedFrames = ExtractedFrames;

                    // The parser is added to the list before it is run.
                    contextDocumentParsers.Add(contextDocumentParser);
                    contextDocumentParser.Parse();
                }
            }

            documentParsers.AddRange(contextDocumentParsers);
        }

        // Collect the values found at the item's location from every applicable parser.
        foreach (var documentParser in documentParsers)
        {
            extractedValues.AddRange( documentParser.ExtractItemValuesFromLocation(extractionItem.Location, relativeLocationBase) .Select(pred => pred.ExtractedValue) );
        }
    }

    // TODO: Reconsider architecture
    // Links are not post-processed at all and frames are post-processed on download only
    // apply post-processing, if specified
    extractedItems.AddValues( extractionItem.Name, extractionItem.PostProcessOnExtraction ? PostProcess(extractedValues, extractionItem.PostProcessors, DependencyDataSource) : extractedValues );
}
/// <summary>
/// Formats the string once for every value position of every dependency key.
/// </summary>
/// <param name="stringWithDependencies">Supplies the format string.</param>
/// <param name="collectionDictionary">Resolved values per dependency name.</param>
/// <returns>The formatted strings, produced lazily.</returns>
/// <remarks>
/// NOTE(review): despite the name, this is not a Cartesian product. For each key, the values of
/// all keys are paired by position, so with K keys of N values each the same N strings are
/// produced K times. When another key has fewer values than the key currently being walked,
/// the list indexer fails with an out-of-range error. Confirm the intended behaviour with the
/// callers before changing it.
/// </remarks>
private IEnumerable<string> FormatAllDependencyCombinations(StringWithDependencies stringWithDependencies, CollectionDictionary<string> collectionDictionary)
{
    foreach (var pivotKey in collectionDictionary.Keys)
    {
        // Walk the positions of the pivot key's values...
        for (var position = 0; position < collectionDictionary[pivotKey].Count; position++)
        {
            // ...and take the value at the same position from every key (pivot included).
            var formatValues = new Dictionary<string, string>();
            foreach (var key in collectionDictionary.Keys)
            {
                formatValues[key] = collectionDictionary[key][position];
            }

            yield return Smart.Format(stringWithDependencies.FormatString, formatValues);
        }
    }
}
/// <summary>
/// Creates a data unit wrapping the given extracted data.
/// </summary>
/// <param name="extractedData">Extracted values; the reference is stored in <c>ExtractedData</c> as-is.</param>
public ExtractedDataUnit(CollectionDictionary<string, string> extractedData)
    => ExtractedData = extractedData;