コード例 #1
0
        /// <summary>
        /// Groups the given rule elements by a <c>Selector</c> built from each element and folds every
        /// group into the first element that was added to it.
        /// </summary>
        /// <remarks>
        /// For each later element of a group, its "property" and "style" children are added to the
        /// group's first element, and the later element is removed from its parent when it has one.
        /// The "style" children of the surviving element are then merged recursively.
        /// </remarks>
        /// <param name="ruleElements">Rule elements to merge.</param>
        /// <returns>The surviving (first) element of every group.</returns>
        public static IEnumerable <XElement> Merge(this IEnumerable <XElement> ruleElements)
        {
            var grouped = new CollectionDictionary <Selector, XElement, List <XElement> >();
            foreach (var element in ruleElements)
            {
                grouped.Add(new Selector(element), element);
            }

            var merged = new List <XElement>();
            foreach (var group in grouped)
            {
                if (group.Value.Count < 1)
                {
                    continue;
                }

                var survivor = group.Value[0];
                var index    = 1;
                while (index < group.Value.Count)
                {
                    var duplicate = group.Value[index];

                    // Move the payload of the duplicate rule over to the survivor.
                    duplicate.Elements("property").ForEach(survivor.Add);
                    duplicate.Elements("style").ForEach(survivor.Add);

                    // Detach the duplicate only if it is still attached to a tree.
                    if (duplicate.Parent != null)
                    {
                        duplicate.Remove();
                    }

                    index++;
                }

                // Invoked for its side effects on the tree; the returned sequence is not used.
                survivor.Elements("style").Merge();
                merged.Add(survivor);
            }

            return merged;
        }
コード例 #2
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
        /// <summary>
        /// Extracts an item as a regular item and then applies the extra handling required when the
        /// item is an <c>ExtractionLink</c> and/or an <c>ExtractionFrame</c>.
        /// </summary>
        /// <remarks>
        /// Returns immediately when <paramref name="extractedItems"/> already contains the item's name.
        /// For frames, the entry registered under the frame's name is moved from <c>ExtractedLinks</c> to
        /// <c>ExtractedFrames</c>, every frame resource is downloaded while this thread blocks, and the
        /// downloaded string contents replace the entry in <c>ExtractedItems</c>.
        /// NOTE(review): the replacement is written to the <c>ExtractedItems</c> member, not to the
        /// <paramref name="extractedItems"/> parameter - confirm this is intended.
        /// </remarks>
        /// <param name="extractionItem">Item to extract.</param>
        /// <param name="extractionItems">Extraction items by name; passed through to <c>ExtractItem</c>.</param>
        /// <param name="extractedItems">Destination of the extracted values.</param>
        /// <param name="relativeLocationBase">Optional position pointer passed through to <c>ExtractItem</c>.</param>
        protected void ExtractAutoDetect(
            ExtractionItem extractionItem,
            IDictionary <string, ExtractionItem> extractionItems,
            CollectionDictionary <string, string> extractedItems,
            ResponseParserPositionPointer?relativeLocationBase = null)
        {
            if (extractedItems.ContainsKey(extractionItem.Name))
            {
                return;
            }

            // Step 1: plain extraction, identical for every kind of item.
            ExtractItem(extractionItem, extractionItems, extractedItems, relativeLocationBase);

            // Step 2: link-specific handling.
            if (extractionItem is ExtractionLink link)
            {
                ExtractLink(link);
            }

            // Step 3: frame-specific handling. The code below relies on the frame's entry already being
            // present in ExtractedLinks at this point.
            if (!(extractionItem is ExtractionFrame frame))
            {
                return;
            }

            // Frames are kept apart from links so that the crawler does not queue and download them.
            ExtractedFrames[frame.Name] = ExtractedLinks[frame.Name];
            ExtractedLinks.Remove(frame.Name);

            // TODO: Download frames inline and store them in ExtractedItems (override initially extracted values)
            var downloads = ExtractedFrames
                            .GetValues(frame.Name)
                            .Select(frameLink => CrawlingEngine.CrawlAsync(frameLink, false))
                            .ToArray();

            // Not an async context: hold this thread until all inline downloads have completed.
            Task.WaitAll(downloads);

            // Replace the previously extracted frame data with the downloaded content.
            ExtractedItems[frame.Name] =
                downloads
                .SelectMany(download => download.Result)
                .OfType <ResponseStringUnit>()
                .SelectMany(response =>
                {
                    IEnumerable <string> contents = new [] { response.Content };
                    if (frame.PostProcessOnDownload)
                    {
                        contents = PostProcess(contents, extractionItem.PostProcessors, DependencyDataSource);
                    }

                    return contents.ToArray();
                })
                .ToArray();
        }
コード例 #3
0
        /// <summary>
        /// Reads the child elements of an "entry" section and builds one <c>AutoDetectLink</c> for every
        /// "link" element found.
        /// </summary>
        /// <remarks>
        /// Reading stops when the reader is positioned on the "entry" end element or has no more nodes.
        /// </remarks>
        /// <param name="reader">Reader used to read the section.</param>
        /// <param name="config">Website configuration; its predefined values take part in dependency resolution.</param>
        /// <param name="job">
        /// Website job. May be null: in that case no job-level predefined values are supplied and the
        /// initialization document link is taken from <paramref name="config"/>.
        /// </param>
        /// <returns>The entry links in document order.</returns>
        /// <exception cref="ArgumentException">A start element other than "link" is encountered.</exception>
        private static IList <ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            var result = new List <ResourceLink>();

            while (!(reader.Name == "entry" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "link":
                    var extractionLink     = ReadExtractionLinkSection(reader, config);
                    var linkExtractedItems = new CollectionDictionary <string, StringWithDependencies>();
                    foreach (var extractionItem in extractionLink.PredefinedExtractionItems.Values)
                    {
                        linkExtractedItems.AddValue(extractionItem.Name, extractionItem.Value);
                    }

                    // Only values that need no resolving can serve as a data source for resolving the others.
                    var constantItems = linkExtractedItems
                                        .Where(pred => pred.Value.Any(value => !value.RequiresResolve))
                                        .ToCollectionDictionary(
                        pred => pred.Key,
                        pred => pred.Value
                        .Where(value => !value.RequiresResolve)
                        .Select(sel => sel.FormatString)
                        );

                    // The job is optional (see the fallback to config below), so it must not be
                    // dereferenced unconditionally here.
                    var dependencyDataSource = new DependencyDataSource(
                        constantItems,
                        config.PredefinedValues,
                        job?.PredefinedValues
                        );

                    // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
                    var extractedLink = new AutoDetectLink(
                        dependencyDataSource.Resolve(extractionLink.Value),
                        extractionLink.HttpMethod,
                        dependencyDataSource.Resolve(extractionLink.Parameters),
                        dependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        config,
                        job,
                        dependencyDataSource.Resolve(linkExtractedItems),
                        (job as CrawlingBase ?? config)?.InitializationDocumentLink
                        );

                    result.Add(extractedLink);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(result);
        }
コード例 #4
0
 /// <summary>
 /// Creates a post-processor from an output delimiter and a list of column transitions.
 /// </summary>
 /// <param name="outputDelimiter">Delimiter stored for later use when producing output.</param>
 /// <param name="columnTransitions">
 /// Column transitions. They are indexed by name, and the distinct names, in order of first
 /// appearance, are stored as the column order.
 /// </param>
 public ReformatCsvPostProcessor(
     string outputDelimiter,
     IList <CsvColumnTransition> columnTransitions
     )
 {
     _outputDelimiter = outputDelimiter;

     // Several transitions may share a name, hence a multi-value dictionary.
     _columnTransitions = columnTransitions.ToCollectionDictionary(
         transition => transition.Name,
         transition => transition);

     _columnsOrder = columnTransitions
                     .Select(transition => transition.Name)
                     .Distinct()
                     .ToArray();
 }
コード例 #5
0
        /// <summary>
        /// Resolves every value of a multi-value dictionary, keeping the keys unchanged.
        /// </summary>
        /// <param name="dictionaryWithDependencies">Dictionary whose values may contain dependencies; may be null.</param>
        /// <returns>
        /// A dictionary with the same keys and each value passed through the single-value <c>Resolve</c>
        /// overload, or null when <paramref name="dictionaryWithDependencies"/> is null.
        /// </returns>
        public CollectionDictionary <string, string> Resolve(CollectionDictionary <string, StringWithDependencies> dictionaryWithDependencies)
        {
            return dictionaryWithDependencies == null
                ? null
                : dictionaryWithDependencies.ToCollectionDictionary(
                    entry => entry.Key,
                    entry => entry.Value.Select(item => Resolve(item))
                    );
        }
コード例 #6
0
ファイル: InitializationLink.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Creates an initialization link from values that may still contain unresolved dependencies.
 /// </summary>
 /// <remarks>
 /// All arguments are forwarded unchanged to the base constructor; the final base argument is
 /// always null (presumably the referrer document link - confirm against the base class).
 /// </remarks>
 /// <param name="urlWithDependencies">Link URL, possibly with dependencies.</param>
 /// <param name="httpMethod">HTTP method name.</param>
 /// <param name="parametersWithDependencies">Request parameters, possibly with dependencies.</param>
 /// <param name="headersWithDependencies">Request headers, possibly with dependencies.</param>
 /// <param name="extractLinks">Forwarded to the base constructor.</param>
 /// <param name="extractData">Forwarded to the base constructor.</param>
 /// <param name="config">Website configuration the link belongs to.</param>
 /// <param name="job">Website job the link belongs to.</param>
 /// <param name="preExtractedItemsWithDependencies">Optional pre-extracted items, possibly with dependencies.</param>
 public InitializationLink(
     StringWithDependencies urlWithDependencies,
     string httpMethod,
     IDictionary <string, StringWithDependencies> parametersWithDependencies,
     IDictionary <string, StringWithDependencies> headersWithDependencies,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary <string, StringWithDependencies> preExtractedItemsWithDependencies = null
     )
     : base(urlWithDependencies, httpMethod, parametersWithDependencies, headersWithDependencies, extractLinks, extractData, config, job, preExtractedItemsWithDependencies, null)
 {
 }
コード例 #7
0
ファイル: InitializationLink.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Creates an initialization link from plain string values.
 /// </summary>
 /// <remarks>
 /// All arguments are forwarded unchanged to the base constructor; the final base argument is
 /// always null (presumably the referrer document link - confirm against the base class).
 /// </remarks>
 /// <param name="url">Link URL.</param>
 /// <param name="httpMethod">HTTP method name.</param>
 /// <param name="parameters">Request parameters.</param>
 /// <param name="headers">Request headers.</param>
 /// <param name="extractLinks">Forwarded to the base constructor.</param>
 /// <param name="extractData">Forwarded to the base constructor.</param>
 /// <param name="config">Website configuration the link belongs to.</param>
 /// <param name="job">Website job the link belongs to.</param>
 /// <param name="preExtractedItems">Optional pre-extracted items.</param>
 public InitializationLink(
     string url,
     string httpMethod,
     IDictionary <string, string> parameters,
     IDictionary <string, string> headers,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary <string, string> preExtractedItems = null
     )
     : base(url, httpMethod, parameters, headers, extractLinks, extractData, config, job, preExtractedItems, null)
 {
 }
コード例 #8
0
        /// <summary>
        /// Exercises the static <c>CollectionDictionary</c> API with two session keys and writes the
        /// return value of every call to the console.
        /// </summary>
        /// <remarks>
        /// The call sequence is: interleaved inserts, repeated <c>GetElement</c> calls (more than the
        /// number of inserted rows for each session), the two counts, and finally an indexed read before
        /// and after clearing "Session1".
        /// </remarks>
        public void TestCollectionDictionary()
        {
            // { session, row } pairs, in insertion order.
            var insertions = new[]
            {
                new[] { "Session1", "Row001" },
                new[] { "Session1", "Row002" },
                new[] { "Session1", "Row003" },
                new[] { "Session1", "Row004" },
                new[] { "Session1", "Row005" },
                new[] { "Session2", "Row010" },
                new[] { "Session2", "Row011" },
                new[] { "Session2", "Row012" },
                new[] { "Session2", "Row013" },
                new[] { "Session1", "Row006" },
                new[] { "Session1", "Row007" },
                new[] { "Session2", "Row014" },
            };

            foreach (var insertion in insertions)
            {
                Console.WriteLine(CollectionDictionary.CollectionDictionaryInsert(insertion[0], insertion[1]));
            }

            // Runs of consecutive GetElement calls: session name and number of repetitions.
            var readSessions = new[] { "Session1", "Session2", "Session1" };
            var readCounts   = new[] { 3, 10, 6 };

            for (var run = 0; run < readSessions.Length; run++)
            {
                for (var repeat = 0; repeat < readCounts[run]; repeat++)
                {
                    Console.WriteLine(CollectionDictionary.GetElement(readSessions[run]));
                }
            }

            foreach (var session in new[] { "Session1", "Session2" })
            {
                Console.WriteLine(CollectionDictionary.GetCount(session));
            }

            // Same indexed read before and after clearing the collection.
            Console.WriteLine(CollectionDictionary.GetElementAt("Session1", 1));
            Console.WriteLine(CollectionDictionary.ClearCollection("Session1"));
            Console.WriteLine(CollectionDictionary.GetElementAt("Session1", 1));
        }
コード例 #9
0
        /// <summary>
        /// Lazily produces the resolved strings for a string with dependencies, using every available
        /// value of each dependency.
        /// </summary>
        /// <remarks>
        /// Dependency values are looked up in <c>ExtractedItems</c> first, then in the config predefined
        /// values, then in the job predefined values (when present). A dependency found in none of them
        /// is replaced by an empty string and an error is traced.
        /// </remarks>
        /// <param name="stringWithDependencies">String to resolve; may be null.</param>
        /// <returns>
        /// A single null for a null argument; the format string itself when no resolving is required;
        /// otherwise the strings produced by <c>FormatAllDependencyCombinations</c>.
        /// </returns>
        public IEnumerable <string> ResolveAll(StringWithDependencies stringWithDependencies)
        {
            if (stringWithDependencies == null)
            {
                yield return(null);

                // An iterator keeps running after "yield return"; without this the next MoveNext
                // would dereference the null argument below.
                yield break;
            }

            if (!stringWithDependencies.RequiresResolve)
            {
                yield return(stringWithDependencies.FormatString);

                // Nothing to resolve: the format string is the only result.
                yield break;
            }

            var resolvedValues = new CollectionDictionary <string>();

            foreach (var dependencyName in stringWithDependencies.DependencyNames)
            {
                if (ExtractedItems.ContainsKey(dependencyName))
                {
                    var dependencyValues = ExtractedItems[dependencyName];
                    foreach (var dependencyValue in dependencyValues)
                    {
                        resolvedValues.AddValue(dependencyName, dependencyValue);
                    }
                }
                else if (ConfigPredefinedValues.Dictionary.ContainsKey(dependencyName))
                {
                    var dependencyValue = ConfigPredefinedValues.Dictionary[dependencyName];
                    resolvedValues.AddValues(dependencyName, dependencyValue);
                }
                else if (JobPredefinedValues != null && JobPredefinedValues.Dictionary.ContainsKey(dependencyName))
                {
                    var dependencyValue = JobPredefinedValues.Dictionary[dependencyName];
                    resolvedValues.AddValues(dependencyName, dependencyValue);
                }
                else
                {
                    // Fall back to an empty value so formatting can still proceed.
                    resolvedValues.AddValue(dependencyName, string.Empty);

                    // Join the sequences explicitly: interpolating a LINQ query directly would print
                    // the iterator's type name instead of the extracted items.
                    var dependencyList = string.Join(",", stringWithDependencies.DependencyNames);
                    var extractedList  = string.Join(
                        " ",
                        ExtractedItems.Select(pred => string.Format("[{0}: {1}]", pred.Key, string.Join(",", pred.Value)))
                        );

                    Trace.TraceError($"{GetType().Name}.ResolveAll: Could not resolve item {dependencyName} with dependencies [{dependencyList}] based on extracted items {extractedList}");
                }
            }

            foreach (var combination in FormatAllDependencyCombinations(stringWithDependencies, resolvedValues))
            {
                yield return(combination);
            }
        }
コード例 #10
0
ファイル: DocumentLink.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Creates a document link from values that may still contain unresolved dependencies.
 /// </summary>
 /// <remarks>
 /// URL, method, parameters, headers, config, job and referrer are forwarded to the base
 /// constructor; the remaining arguments are stored on this instance.
 /// </remarks>
 /// <param name="urlWithDependencies">Link URL, possibly with dependencies.</param>
 /// <param name="httpMethod">HTTP method name.</param>
 /// <param name="parametersWithDependencies">Request parameters, possibly with dependencies.</param>
 /// <param name="headersWithDependencies">Request headers, possibly with dependencies.</param>
 /// <param name="extractLinks">Stored in <c>ExtractLinks</c>.</param>
 /// <param name="extractData">Stored in <c>ExtractData</c>.</param>
 /// <param name="config">Website configuration the link belongs to.</param>
 /// <param name="job">Website job the link belongs to.</param>
 /// <param name="preExtractedItemsWithDependencies">Stored in <c>PreExtractedItemsWithDependencies</c>.</param>
 /// <param name="referrerDocumentLink">Optional link of the referring document.</param>
 public DocumentLink(
     StringWithDependencies urlWithDependencies,
     string httpMethod,
     IDictionary <string, StringWithDependencies> parametersWithDependencies,
     IDictionary <string, StringWithDependencies> headersWithDependencies,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary <string, StringWithDependencies> preExtractedItemsWithDependencies,
     DocumentLink referrerDocumentLink = null
     )
     : base(urlWithDependencies, httpMethod, parametersWithDependencies, headersWithDependencies, config, job, referrerDocumentLink)
 {
     ExtractLinks = extractLinks;
     ExtractData  = extractData;
     PreExtractedItemsWithDependencies = preExtractedItemsWithDependencies;
 }
コード例 #11
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
        /// <summary>
        /// Makes sure every dependency referenced by the strings of an extraction item has been extracted.
        /// </summary>
        /// <remarks>
        /// A dependency that names another extraction item is extracted through <c>ExtractAutoDetect</c>
        /// (links are extracted as dependencies just like normal items). Any other dependency is expected
        /// to be a predefined value of the config or the job, which is checked in debug builds only.
        /// </remarks>
        /// <param name="extractionItem">Item whose dependencies are to be extracted.</param>
        /// <param name="extractionItems">Extraction items by name.</param>
        /// <param name="extractedItems">Destination of the extracted values.</param>
        /// <param name="relativeLocationBase">Optional position pointer passed through to <c>ExtractAutoDetect</c>.</param>
        private void ExtractDependencies(
            ExtractionItem extractionItem,
            IDictionary <string, ExtractionItem> extractionItems,
            CollectionDictionary <string, string> extractedItems,
            ResponseParserPositionPointer?relativeLocationBase = null
            )
        {
            // TODO:
            // A MAJOR BUG!!!
            // StringWithDependency objects are shared between threads and jobs.
            // Therefore, the current strategy of "resolving" them on extraction:
            // 1) changes the instance value for everyone that might be using it;
            // 2) stores the resolved data inside the object, so the next resolve with new data
            //    does nothing since the string is already resolved.

            foreach (var dependentString in extractionItem.GetStringsWithDependencies())
            {
                foreach (var name in dependentString.DependencyNames)
                {
                    if (extractionItems.TryGetValue(name, out var dependency))
                    {
                        ExtractAutoDetect(dependency, extractionItems, extractedItems, relativeLocationBase);
                        continue;
                    }

                    // Not a page item: it has to come from the predefined values and needs no extraction.
                    Debug.Assert(
                        _documentLink.Config.PredefinedValues.Dictionary.ContainsKey(name) ||
                        (_documentLink.Job?.PredefinedValues.Dictionary.ContainsKey(name) ?? false)
                        );
                }
            }
        }
コード例 #12
0
ファイル: DocumentLink.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Creates a document link from plain string values.
 /// </summary>
 /// <remarks>
 /// URL, method, parameters, headers, config, job and referrer are forwarded to the base
 /// constructor; the remaining arguments are stored on this instance.
 /// </remarks>
 /// <param name="url">Link URL.</param>
 /// <param name="httpMethod">HTTP method name.</param>
 /// <param name="parameters">Request parameters.</param>
 /// <param name="headers">Request headers.</param>
 /// <param name="extractLinks">Stored in <c>ExtractLinks</c>.</param>
 /// <param name="extractData">Stored in <c>ExtractData</c>.</param>
 /// <param name="config">Website configuration the link belongs to.</param>
 /// <param name="job">Website job the link belongs to.</param>
 /// <param name="preExtractedItems">Stored in <c>PreExtractedItems</c>.</param>
 /// <param name="referrerDocumentLink">Optional link of the referring document.</param>
 public DocumentLink
 (
     string url,
     string httpMethod,
     IDictionary <string, string> parameters,
     IDictionary <string, string> headers,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary <string, string> preExtractedItems,
     DocumentLink referrerDocumentLink = null
 )
     : base(url, httpMethod, parameters, headers, config, job, referrerDocumentLink)
 {
     ExtractLinks      = extractLinks;
     ExtractData       = extractData;
     PreExtractedItems = preExtractedItems;
 }
コード例 #13
0
        /// <summary>
        /// Overwrites the saved collection of a list kind with the contents of its current list and
        /// optionally saves the application settings.
        /// </summary>
        /// <remarks>
        /// Does nothing when either the saved collection or the current collection for
        /// <paramref name="listKind"/> cannot be found.
        /// </remarks>
        /// <param name="listKind">Kind of list to synchronize.</param>
        /// <param name="save">When true, the application settings are saved after the copy.</param>
        private void TrySyncSavedLists(ListKind listKind, bool save = true)
        {
            IList <string> persisted;
            CollectionPair live;

            // Both sides must exist; the saved side is looked up first.
            if (!savedCollections.TryGetValue(listKind, out persisted) ||
                !CollectionDictionary.TryGetValue(listKind, out live))
            {
                return;
            }

            persisted.Clear();
            live.List.Each(persisted.Add);

            if (!save)
            {
                return;
            }

            SettingsService.SaveApplicationSettingsToXml(currentCharacter);
        }
コード例 #14
0
 /// <summary>
 /// Creates a copy of another set of predefined values.
 /// </summary>
 /// <remarks>
 /// <c>Required</c> becomes a new hash set with the template's entries and <c>Dictionary</c> a new
 /// collection dictionary constructed from the template's dictionary.
 /// NOTE(review): whether the value lists inside the dictionary are copied or shared depends on the
 /// <c>CollectionDictionary</c> constructor - confirm.
 /// </remarks>
 /// <param name="template">Instance to copy from; must not be null.</param>
 public CrawlingPredefinedValues(CrawlingPredefinedValues template)
 {
     Required   = template.Required.ToHashSet();
     Dictionary = new CollectionDictionary <string, string>(template.Dictionary);
 }
コード例 #15
0
 /// <summary>
 /// Creates a data source from the three value sources it stores.
 /// </summary>
 /// <remarks>The arguments are stored by reference; nothing is copied.</remarks>
 /// <param name="extractedItems">Stored in <c>ExtractedItems</c>.</param>
 /// <param name="configPredefinedValues">Stored in <c>ConfigPredefinedValues</c>.</param>
 /// <param name="jobPredefinedValues">Stored in <c>JobPredefinedValues</c>.</param>
 public DependencyDataSource(CollectionDictionary <string, string> extractedItems, CrawlingPredefinedValues configPredefinedValues, CrawlingPredefinedValues jobPredefinedValues)
 {
     ExtractedItems         = extractedItems;
     ConfigPredefinedValues = configPredefinedValues;
     JobPredefinedValues    = jobPredefinedValues;
 }
コード例 #16
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
        /// <summary>
        /// Lazily builds one resource link for every value extracted under the name of the given
        /// extraction link.
        /// </summary>
        /// <remarks>
        /// For each value, the link's predefined extraction items are extracted into a fresh, link-scoped
        /// dictionary, optionally relative to the link's own position in the document. The concrete link
        /// class is chosen from <c>extractionLink.Type</c>. Nothing is yielded when no value has been
        /// extracted under the link's name.
        /// </remarks>
        /// <param name="extractionLink">Definition of the link to build resource links for.</param>
        /// <returns>The resource links, in the order of the extracted values.</returns>
        /// <exception cref="NotSupportedException">The link type is not Document, File or Auto.</exception>
        protected IEnumerable <ResourceLink> ExtractResourceLinks(ExtractionLink extractionLink)
        {
            if (!ExtractedItems.ContainsKey(extractionLink.Name))
            {
                yield break;
            }

            // The value list is re-read from ExtractedItems on every access, as extraction of the
            // link-scoped items runs in between.
            for (var position = 0; position < ExtractedItems[extractionLink.Name].Count; position++)
            {
                var linkValue       = ExtractedItems[extractionLink.Name][position];
                var positionPointer = new ResponseParserPositionPointer(extractionLink.Location, position);

                var scopedItems = new CollectionDictionary <string, string>();
                foreach (var predefinedItem in extractionLink.PredefinedExtractionItems.Values)
                {
                    ExtractItem(
                        predefinedItem,
                        extractionLink.PredefinedExtractionItems,
                        scopedItems,

                        extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink
                            ? positionPointer
                            : (ResponseParserPositionPointer?)null
                        );
                }

                switch (extractionLink.Type)
                {
                case ExtractionLink.LinkTypes.Document:
                    yield return new DocumentLink(
                        linkValue,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        scopedItems,
                        _documentLink
                        );
                    break;

                case ExtractionLink.LinkTypes.File:
                    yield return new FileLink(
                        linkValue,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        _documentLink.Config,
                        _documentLink.Job,
                        _documentLink
                        );
                    break;

                case ExtractionLink.LinkTypes.Auto:
                    yield return new AutoDetectLink(
                        linkValue,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        scopedItems,
                        _documentLink
                        );
                    break;

                default:
                    throw new NotSupportedException();
                }
            }
        }
コード例 #17
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
        /// <summary>
        /// Extracts the values of a single extraction item and adds them to
        /// <paramref name="extractedItems"/> under the item's name.
        /// </summary>
        /// <remarks>
        /// Does nothing when the name is already present in <paramref name="extractedItems"/>. Otherwise
        /// the item's dependencies are extracted first; then the resolved constant value (if the item has
        /// one) and the values found at the item's location (if it has one) are collected, post-processed
        /// when <c>PostProcessOnExtraction</c> is set, and added.
        /// When the item has a context, its location is evaluated against one document processor per
        /// context document instead of against this processor; those processors are created once per
        /// context item name and cached in <c>ContextItemDocumentParsers</c>.
        /// </remarks>
        /// <param name="extractionItem">Item to extract.</param>
        /// <param name="extractionItems">Extraction items by name; used to extract dependencies.</param>
        /// <param name="extractedItems">Destination of the extracted values.</param>
        /// <param name="relativeLocationBase">Optional position pointer passed to the location extraction.</param>
        protected void ExtractItem(
            ExtractionItem extractionItem,
            IDictionary <string, ExtractionItem> extractionItems,
            CollectionDictionary <string, string> extractedItems,
            ResponseParserPositionPointer?relativeLocationBase = null
            )
        {
            if (extractedItems.ContainsKey(extractionItem.Name))
            {
                return; // already extracted as someone's dependency
            }
            // Dependencies go first so that resolving below can use their values.
            ExtractDependencies(extractionItem, extractionItems, extractedItems, relativeLocationBase);

            var extractedValues = new List <string>();

            // constant value, if specified in config
            if (extractionItem.Value != null)
            {
                extractedValues.Add(DependencyDataSource.Resolve(extractionItem.Value));
            }

            // values, extracted from page using selector
            if (extractionItem.Location != null)
            {
                var documentParsers = new List <DocumentProcessor>();
                // If Context is not specified for item, then item is extracted from base document, without any complexities
                if (extractionItem.Context == null)
                {
                    documentParsers.Add(this);
                }
                else
                {
                    // If Context is specified, we're creating a separate DocumentParser for every context item
                    // Create once per Context item, so the document is not parsed again for each element extracted
                    var contextItemName = DependencyDataSource.Resolve(extractionItem.Context.ContextItemName);

                    if (!ContextItemDocumentParsers.TryGetValue(contextItemName, out var contextDocumentParsers))
                    {
                        // Generate context parsers for this ContextItem if they have not been generated before.
                        // The (still empty) list is registered in the cache before it is filled.
                        ContextItemDocumentParsers[contextItemName]
                              = contextDocumentParsers
                              = new List <DocumentProcessor>();

                        var contextResourceFrames = ExtractedFrames[contextItemName];
                        var contextDocumentLinks  = contextResourceFrames.OfType <DocumentLink>().ToArray();

                        // Debug-only check: every frame of the context item is expected to be a DocumentLink.
                        Debug.Assert(contextResourceFrames.Count == contextDocumentLinks.Length);

                        var contextDocumentStrings = ExtractedItems.GetValues(contextItemName);

                        // Debug-only check: one downloaded document string is expected per document link.
                        Debug.Assert(contextDocumentLinks.Length == contextDocumentStrings.Count);

                        for (var i = 0; i < contextDocumentLinks.Length; i++)
                        {
                            // TODO: Documents and Downloaded strings order is not the same. Refactor.
                            var documentString = contextDocumentStrings[i];
                            var documentLink   = contextDocumentLinks[i];

                            var contextDocumentParser =
                                _documentLink.Config.CreateDocumentStringParser(
                                    documentString,
                                    documentLink,
                                    extractionItem.Context.ContextDocumentType
                                    );

                            // Reusing document parsers globally inside the config. Otherwise we get infinite recursion.
                            // The context parser shares this processor's caches and result collections.
                            contextDocumentParser.ContextItemDocumentParsers = ContextItemDocumentParsers;
                            contextDocumentParser.ExtractedItems             = ExtractedItems;
                            contextDocumentParser.ExtractedLinks             = ExtractedLinks;
                            contextDocumentParser.ExtractedFrames            = ExtractedFrames;

                            // The parser is added to the list before Parse() is called.
                            contextDocumentParsers.Add(contextDocumentParser);
                            contextDocumentParser.Parse();
                        }
                    }

                    documentParsers.AddRange(contextDocumentParsers);
                }

                // Collect the values found at the item's location in every applicable document.
                foreach (var documentParser in documentParsers)
                {
                    extractedValues.AddRange(
                        documentParser.ExtractItemValuesFromLocation(extractionItem.Location, relativeLocationBase)
                        .Select(pred => pred.ExtractedValue)
                        );
                }
            }

            // TODO: Reconsider architecture
            // Links are not post-processed at all and frames are post-processed on download only

            // apply post-processing, if specified
            extractedItems.AddValues(
                extractionItem.Name,
                extractionItem.PostProcessOnExtraction
                    ? PostProcess(extractedValues, extractionItem.PostProcessors, DependencyDataSource)
                    : extractedValues
                );
        }
コード例 #18
0
        /// <summary>
        /// Lazily formats the format string of <paramref name="stringWithDependencies"/> once per value
        /// of every dependency.
        /// </summary>
        /// <remarks>
        /// Despite the name, the values are paired row by row rather than as a cartesian product: for
        /// each key, row <c>n</c> of that key is combined with row <c>n</c> of every other key, and the
        /// whole pass is repeated for every key.
        /// When another key has fewer values than the key driving the pass, its last value is reused;
        /// when it has no values at all, an empty string is used (the same placeholder <c>ResolveAll</c>
        /// uses for dependencies it cannot resolve). Previously both situations ended the enumeration
        /// with an out-of-range exception; results for lists of equal length are unchanged.
        /// </remarks>
        /// <param name="stringWithDependencies">Supplies the format string.</param>
        /// <param name="collectionDictionary">Values available for each dependency name.</param>
        /// <returns>The formatted strings, grouped by driving key in key order.</returns>
        private IEnumerable <string> FormatAllDependencyCombinations(StringWithDependencies stringWithDependencies, CollectionDictionary <string> collectionDictionary)
        {
            foreach (var drivingKey in collectionDictionary.Keys)
            {
                var drivingValues = collectionDictionary[drivingKey];

                for (var row = 0; row < drivingValues.Count; row++)
                {
                    var formatDictionary = new Dictionary <string, string>();

                    foreach (var key in collectionDictionary.Keys)
                    {
                        var values = collectionDictionary[key];

                        if (values.Count == 0)
                        {
                            // No value available for this dependency: substitute an empty string.
                            formatDictionary[key] = string.Empty;
                            continue;
                        }

                        // Clamp to the last available value so shorter lists never index out of range.
                        // For the driving key itself row is always in range, so it gets its own row.
                        var index = row < values.Count ? row : values.Count - 1;
                        formatDictionary[key] = values[index];
                    }

                    yield return(Smart.Format(stringWithDependencies.FormatString, formatDictionary));
                }
            }
        }
コード例 #19
0
ファイル: ExtractedDataUnit.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Creates a data unit that wraps the given extracted data.
 /// </summary>
 /// <param name="extractedData">Extracted values by item name; stored by reference in <c>ExtractedData</c>.</param>
 public ExtractedDataUnit(CollectionDictionary <string, string> extractedData)
 {
     ExtractedData = extractedData;
 }