示例#1
0
        /// <summary>
        /// Reads a job-like element into a new <see cref="WebsiteJob"/>. The caller in this file uses it
        /// for both "job" and "template" elements.
        /// </summary>
        /// <param name="reader">Reader positioned on the element's start tag.</param>
        /// <param name="config">Configuration the job is created for; also handed to the child-section readers.</param>
        /// <returns>The job populated from the "name" attribute and the "initialization", "entry" and "dictionary" children.</returns>
        /// <exception cref="ArgumentException">A child start element with any other name is encountered.</exception>
        private static WebsiteJob ReadWebsiteJobNode(XmlTextReader reader, WebsiteConfig config)
        {
            var job = new WebsiteJob(config);

            // Keep the name the job already has when the attribute is passed as the fallback value.
            job.Name = reader.GetAttribute <string>("name", job.Name);
            var holderNodeName = reader.Name;

            // A self-closing element (<job ... />) has no matching end tag. Without this guard the loop
            // below would keep reading and consume the nodes that follow the element.
            if (reader.IsEmptyElement)
            {
                return(job);
            }

            // Read until the end tag that closes the element we started on (or until the input runs out).
            while (!(reader.Name == holderNodeName && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "initialization":
                    job.InitializationDocumentLink = ReadInitializationDocumentSection(reader, config, job);
                    break;

                case "entry":
                    job.EntryLinks = ReadEntryLinksSection(reader, config, job);
                    break;

                case "dictionary":
                    job.PredefinedValues = ReadPredefinedValuesSection(reader, config);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(job);
        }
示例#2
0
        /// <summary>
        /// Fills an existing <see cref="ExtractionLink"/> from the current element: first the attributes
        /// shared with extraction items, then the link-specific attributes, then the child sections
        /// "predefined_items", "parameters", "headers" and "post_processors".
        /// </summary>
        /// <param name="reader">Reader positioned on the link element.</param>
        /// <param name="config">Configuration passed through to the nested readers.</param>
        /// <param name="extractionLink">Link instance to populate; its current values serve as attribute defaults.</param>
        private static void ReadExtractionLinkSection(XmlReader reader, WebsiteConfig config, ExtractionLink extractionLink)
        {
            ReadExtractionItemAttributes(extractionLink, reader, config);

            // Link-specific attributes.
            extractionLink.ExtractLinks = reader.GetAttribute("extract_links", extractionLink.ExtractLinks);
            extractionLink.ExtractData = reader.GetAttribute("extract_data", extractionLink.ExtractData);
            extractionLink.HttpMethod = reader.GetAttribute <string>("method", extractionLink.HttpMethod);
            extractionLink.Type = reader.GetAttribute("type", ExtractionLink.LinkTypes.Auto);

            // "depends_on" is a list separated by ',', ';' or '|'; the property stays null when the attribute is absent.
            var dependsOnText = reader.GetAttribute <string>("depends_on", null);
            extractionLink.DependsOn = dependsOnText == null
                ? null
                : dependsOnText.Split(new[] { ',', ';', '|' }, StringSplitOptions.RemoveEmptyEntries);

            // Child sections; names not listed here are ignored.
            reader.ProcessChildren((childName, childReader) =>
            {
                if (childName == "predefined_items")
                {
                    extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink =
                        childReader.GetAttribute("relative", extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink);
                    extractionLink.PredefinedExtractionItems = ReadExtractionItemsSection(childReader, config);
                }
                else if (childName == "parameters")
                {
                    extractionLink.Parameters = ReadExtractionLinkParametersSection(childReader);
                }
                else if (childName == "headers")
                {
                    extractionLink.Headers = ReadHttpHeadersSection(childReader);
                }
                else if (childName == "post_processors")
                {
                    ReadExtractionItemPostProcessors(extractionLink, childReader);
                }
            });
        }
示例#3
0
        /// <summary>
        /// Reads the jobs section into <paramref name="config"/>: the "parallelism" attribute, every "job"
        /// child (added to <c>JobsByName</c>) and every "template" child (added to <c>JobTemplatesByName</c>).
        /// Other child elements are ignored.
        /// </summary>
        /// <param name="reader">Reader positioned on the section's start tag.</param>
        /// <param name="config">Configuration that receives the jobs and templates.</param>
        private static void ReadWebsiteJobsSectionIntoConfig(XmlTextReader reader, WebsiteConfig config)
        {
            // Null when the attribute is absent.
            config.JobsProcessedInParallel = reader.GetAttribute("parallelism", (int?)null);

            var sectionRootNodeName = reader.Name;

            // A self-closing section has no end tag to stop on; without this guard the loop below
            // would run on into whatever follows the section.
            if (reader.IsEmptyElement)
            {
                return;
            }

            while (!(reader.Name == sectionRootNodeName && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "job":
                    var job = ReadWebsiteJobNode(reader, config);
                    config.JobsByName.Add(job.Name, job);
                    break;

                case "template":
                    // Templates share the job element layout, so the same node reader is used.
                    var template = ReadWebsiteJobNode(reader, config);
                    config.JobTemplatesByName.Add(template.Name, template);
                    break;
                }
            }
        }
示例#4
0
        /// <summary>
        /// Creates a new <see cref="ExtractionLink"/> and populates it from the current element
        /// by delegating to the three-argument overload.
        /// </summary>
        /// <param name="reader">Reader positioned on the link element.</param>
        /// <param name="config">Configuration passed through to the populating overload.</param>
        /// <returns>The newly created, populated link.</returns>
        private static ExtractionLink ReadExtractionLinkSection(XmlReader reader, WebsiteConfig config)
        {
            var link = new ExtractionLink();
            ReadExtractionLinkSection(reader, config, link);
            return link;
        }
示例#5
0
        /// <summary>
        /// Reads the "entry" section and builds one <see cref="AutoDetectLink"/> per "link" child.
        /// </summary>
        /// <param name="reader">Reader positioned on the "entry" start tag.</param>
        /// <param name="config">Configuration supplying config-level predefined values.</param>
        /// <param name="job">
        /// Job the entry links belong to, or <c>null</c> for config-level entry links
        /// (<c>ReadWebsiteConfig</c> passes <c>null</c>).
        /// </param>
        /// <returns>The entry links in document order; empty when the section has no links.</returns>
        /// <exception cref="ArgumentException">A child start element other than "link" is encountered.</exception>
        private static IList <ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            var result = new List <ResourceLink>();

            // A self-closing <entry/> has no end tag; without this guard the loop below would run on
            // into the nodes that follow the section.
            if (reader.IsEmptyElement)
            {
                return(result);
            }

            while (!(reader.Name == "entry" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "link":
                    var extractionLink     = ReadExtractionLinkSection(reader, config);
                    var linkExtractedItems = new CollectionDictionary <string, StringWithDependencies>();

                    // The only assignment of PredefinedExtractionItems visible here happens when a
                    // "predefined_items" child is present, so tolerate a link without one.
                    if (extractionLink.PredefinedExtractionItems != null)
                    {
                        foreach (var extractionItem in extractionLink.PredefinedExtractionItems.Values)
                        {
                            linkExtractedItems.AddValue(extractionItem.Name, extractionItem.Value);
                        }
                    }

                    // Only values that need no further resolving are offered as a data source.
                    // job is null for config-level entry links, so it must not be dereferenced directly.
                    // NOTE(review): confirm DependencyDataSource accepts a null predefined-values argument.
                    var dependencyDataSource = new DependencyDataSource(
                        linkExtractedItems
                        .Where(pred => pred.Value.Any(value => !value.RequiresResolve))
                        .ToCollectionDictionary(
                            pred => pred.Key,
                            pred => pred.Value
                            .Where(value => !value.RequiresResolve)
                            .Select(sel => sel.FormatString)
                            ),
                        config.PredefinedValues,
                        job?.PredefinedValues
                        );

                    // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
                    var extractedLink = new AutoDetectLink(
                        dependencyDataSource.Resolve(extractionLink.Value),
                        extractionLink.HttpMethod,
                        dependencyDataSource.Resolve(extractionLink.Parameters),
                        dependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        config,
                        job,
                        dependencyDataSource.Resolve(linkExtractedItems),
                        // Prefer the job's initialization link; fall back to the config's when there is no job.
                        (job as CrawlingBase ?? config)?.InitializationDocumentLink
                        );

                    result.Add(extractedLink);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(result);
        }
示例#6
0
        /// <summary>
        /// Creates a new <see cref="ExtractionItem"/> from the current element by reading its
        /// attributes and then its post-processors.
        /// </summary>
        /// <param name="reader">Reader positioned on the item element.</param>
        /// <param name="config">Configuration passed to the attribute reader.</param>
        /// <returns>The newly created, populated item.</returns>
        private static ExtractionItem ReadExtractionItemSection(XmlReader reader, WebsiteConfig config)
        {
            var item = new ExtractionItem();

            // Attributes are read before the post-processors, as in the original order.
            ReadExtractionItemAttributes(item, reader, config);
            ReadExtractionItemPostProcessors(item, reader);

            return item;
        }
示例#7
0
        /// <summary>
        /// Creates a new <see cref="ExtractionFrame"/>, populates it with the extraction-link reader
        /// and then forces its type to <c>ExtractionLink.LinkTypes.Document</c>.
        /// </summary>
        /// <param name="reader">Reader positioned on the frame element.</param>
        /// <param name="config">Configuration passed to the link reader.</param>
        /// <returns>The newly created, populated frame.</returns>
        private static ExtractionFrame ReadExtractionFrameSection(XmlReader reader, WebsiteConfig config)
        {
            var frame = new ExtractionFrame();
            ReadExtractionLinkSection(reader, config, frame);

            // Set after reading so it overrides whatever the "type" attribute produced.
            frame.Type = ExtractionLink.LinkTypes.Document;
            return frame;
        }
示例#8
0
        /// <summary>
        /// Reads the whole document into a new <see cref="WebsiteConfig"/>, dispatching each
        /// recognized start element to its section reader. Unrecognized elements are skipped.
        /// </summary>
        /// <param name="reader">Reader over the configuration document.</param>
        /// <returns>The populated configuration.</returns>
        private static WebsiteConfig ReadWebsiteConfig(XmlTextReader reader)
        {
            var config = new WebsiteConfig();

            while (reader.Read())
            {
                // Only start tags carry a section to dispatch on.
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "config":
                    config.Name = reader.GetAttribute("name");
                    break;

                case "settings":
                    config.CrawlingSettings = ReadWebsiteCrawlingSettingsSection(reader, config);
                    break;

                case "dictionary":
                    config.PredefinedValues = ReadPredefinedValuesSection(reader, config);
                    break;

                case "initialization":
                    // Config-level section: there is no owning job.
                    config.InitializationDocumentLink = ReadInitializationDocumentSection(reader, config, null);
                    break;

                case "entry":
                    // Config-level section: there is no owning job.
                    config.EntryLinks = ReadEntryLinksSection(reader, config, null);
                    break;

                case "jobs":
                    ReadWebsiteJobsSectionIntoConfig(reader, config);
                    break;

                case "extraction":
                    config.ExtractionItems = ReadExtractionItemsSection(reader, config);
                    break;
                }
            }

            return config;
        }
示例#9
0
        /// <summary>
        /// Reads the "dictionary" section: every "item" child contributes a named list of values and
        /// may be flagged as required.
        /// </summary>
        /// <param name="reader">Reader positioned on the "dictionary" start tag.</param>
        /// <param name="config">Not used by this method.</param>
        /// <returns>The predefined values; empty when the section has no items.</returns>
        private static CrawlingPredefinedValues ReadPredefinedValuesSection(XmlReader reader, WebsiteConfig config)
        {
            var result = new CrawlingPredefinedValues();

            // A self-closing <dictionary/> has no end tag; without this guard the loop below would
            // run on into the nodes that follow the section.
            if (reader.IsEmptyElement)
            {
                return(result);
            }

            while (!(reader.Name == "dictionary" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (reader.IsStartElement("item"))
                {
                    var name = reader.GetAttribute("name");

                    // The explicit type argument is required: XmlReader itself declares
                    // GetAttribute(string name, string namespaceURI), and a plain two-string call binds to
                    // that instance method instead of the default-value extension, so the "," default
                    // would never be applied and the values would not be split.
                    var valuesSeparator = reader.GetAttribute <string>("values_separator", ",");

                    var values       = new List <string>();
                    var valuesString = reader.GetAttribute("values");
                    if (!string.IsNullOrEmpty(valuesString))
                    {
                        values.AddRange(valuesString.Split(new[] { valuesSeparator }, StringSplitOptions.RemoveEmptyEntries));
                    }

                    // A later item with the same name replaces the earlier one.
                    result.Dictionary[name] = values;

                    if (reader.GetAttribute("required", false))
                    {
                        result.Required.Add(name);
                    }
                }
            }

            return(result);
        }
示例#10
0
        /// <summary>
        /// Populates the attribute-driven part of an extraction item: name, value, extraction
        /// location, extraction context and dependency list.
        /// </summary>
        /// <param name="extractionItem">Item to populate.</param>
        /// <param name="reader">Reader positioned on the item's element.</param>
        /// <param name="config">Configuration whose crawling settings supply the default context document type.</param>
        private static void ReadExtractionItemAttributes(ExtractionItem extractionItem, XmlReader reader, WebsiteConfig config)
        {
            // Called as a static method so the (name, default) form is used rather than
            // XmlReader.GetAttribute(name, namespaceURI).
            extractionItem.Name  = XmlReaderExtensions.GetAttribute(reader, "name", "default");
            extractionItem.Value = reader.GetAttribute("value");

            // Where to extract from.
            var location          = reader.GetAttribute("location");
            var locationType      = reader.GetAttribute("location_type", ExtractionLocation.ExtractionLocationTypes.InnerText);
            var includeChildNodes = reader.GetAttribute("include_child_nodes", true);
            extractionItem.SetExtractionLocation(location, locationType, includeChildNodes);

            // Context in which the location is evaluated.
            // NOTE(review): if DocumentType is declared as string, the call below binds to
            // XmlReader.GetAttribute(name, namespaceURI) rather than the default-value extension - confirm.
            var context             = reader.GetAttribute("context");
            var contextDocumentType = reader.GetAttribute("context_document_type", config.CrawlingSettings.DocumentType);
            extractionItem.SetExtractionContext(context, contextDocumentType);

            // "depends_on" is a list separated by ',', ';' or '|'; the property stays null when the attribute is absent.
            var dependsOnText = reader.GetAttribute <string>("depends_on", null);
            extractionItem.DependsOn = dependsOnText == null
                ? null
                : dependsOnText.Split(new[] { ',', ';', '|' }, StringSplitOptions.RemoveEmptyEntries);
        }
示例#11
0
        /// <summary>
        /// Reads a section of extraction definitions ("frame", "link" and "item" children) into a
        /// dictionary keyed by each definition's name.
        /// </summary>
        /// <param name="reader">Reader positioned on the section's start tag.</param>
        /// <param name="config">Configuration passed to the per-element readers.</param>
        /// <returns>The extraction items by name; empty when the section has no children.</returns>
        /// <exception cref="ArgumentException">
        /// A child start element with any other name is encountered, or two definitions share a name
        /// (thrown by the dictionary's <c>Add</c>).
        /// </exception>
        private static IDictionary <string, ExtractionItem> ReadExtractionItemsSection(XmlReader reader, WebsiteConfig config)
        {
            var result = new Dictionary <string, ExtractionItem>();

            // A self-closing section has no end tag; without this guard the loop below would run on
            // into the nodes that follow the section.
            if (reader.IsEmptyElement)
            {
                return(result);
            }

            // Stops on the "extraction" end tag or when the reader has no more nodes.
            while (!(reader.Name == "extraction" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "frame":
                    var extractionFrame = ReadExtractionFrameSection(reader, config);
                    result.Add(extractionFrame.Name, extractionFrame);
                    break;

                case "link":
                    var extractionLink = ReadExtractionLinkSection(reader, config);
                    result.Add(extractionLink.Name, extractionLink);
                    break;

                case "item":
                    var extractionItem = ReadExtractionItemSection(reader, config);
                    result.Add(extractionItem.Name, extractionItem);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(result);
        }
示例#12
0
        /// <summary>
        /// Reads the "initialization" section into an <see cref="InitializationLink"/>: the "url" and
        /// "method" attributes plus the optional "parameters", "headers" and "extraction" children.
        /// Other child elements are ignored.
        /// </summary>
        /// <param name="reader">Reader positioned on the "initialization" start tag.</param>
        /// <param name="config">Configuration the link belongs to.</param>
        /// <param name="job">Owning job; the config-level caller in this file passes <c>null</c>.</param>
        /// <returns>The initialization link with its extraction-items override attached.</returns>
        private static InitializationLink ReadInitializationDocumentSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            var url        = reader.GetAttribute("url");
            var httpMethod = reader.GetAttribute <string>("method", System.Net.WebRequestMethods.Http.Get);

            IDictionary <string, string>         parameters = null;
            IDictionary <string, string>         headers    = null;
            IDictionary <string, ExtractionItem> extractionItemsOverride = new Dictionary <string, ExtractionItem>();

            // <initialization url="..." /> is self-closing and has no end tag to stop on. Without this
            // check the loop would keep reading and consume the sections that follow it.
            if (!reader.IsEmptyElement)
            {
                while (!(reader.Name == "initialization" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
                {
                    if (!reader.IsStartElement())
                    {
                        continue;
                    }

                    switch (reader.Name)
                    {
                    case "parameters":
                        // Only the format string of each parameter value is kept.
                        parameters = ReadExtractionLinkParametersSection(reader).ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                        break;

                    case "headers":
                        // Only the format string of each header value is kept.
                        headers = ReadHttpHeadersSection(reader).ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                        break;

                    case "extraction":
                        extractionItemsOverride = ReadExtractionItemsSection(reader, config);
                        break;
                    }
                }
            }

            var result = new InitializationLink(
                url,
                httpMethod,
                parameters,
                headers,
                true,
                true,
                config,
                job
                );

            result.ExtractionItemsOverride = extractionItemsOverride;

            return(result);
        }
示例#13
0
        //private static IDictionary<string, CrawlingConditional> ReadCrawlingConditionalsSection(XmlReader reader)
        //{
        //    var result = new Dictionary<string, CrawlingConditional>();
        //    while (!(reader.Name == "conditionals" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
        //    {
        //        if (reader.IsStartElement("conditional"))
        //        {
        //            var conditional = new CrawlingConditional();
        //            result[conditional.Name] = conditional;

        //            conditional.Name = reader.GetAttribute("name");

        //            if (reader.IsStartElement("conditional"))
        //            {
        //                reader.GetAttribute(ref conditional.Action, "action");
        //                reader.GetAttribute(ref conditional.Logic, "logic");

        //                conditional.Conditions = new List<CrawlingConditional.Condition>();

        //                do
        //                {
        //                    if (reader.IsStartElement("condition"))
        //                    {
        //                        Debug.Assert(!string.IsNullOrEmpty(reader.GetAttribute("type")));

        //                        var type = CrawlingConditional.Condition.ConditionType.Equals;
        //                        reader.GetAttribute(ref type, "type");

        //                        string items = reader.GetAttribute("items");
        //                        string argument = reader.GetAttribute("argument");

        //                        conditional.Conditions.Add(new CrawlingConditional.Condition(type, argument, items));
        //                    }
        //                }
        //                while (!(reader.Name == "conditional" && reader.NodeType == XmlNodeType.EndElement) && reader.Read());
        //            }
        //        }
        //    }

        //    return result;
        //}

        //private static IDictionary<string, CrawlingCustomAction> ReadCrawlingCustomActionsSection(XmlReader reader)
        //{
        //    string tagName = reader.Name;
        //    var result = new Dictionary<string, CrawlingCustomAction>(StringComparer.OrdinalIgnoreCase);

        //    if (!reader.IsEmptyElement)
        //    {
        //        while (!(reader.Name == tagName && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
        //        {
        //            if (reader.IsStartElement("action"))
        //            {
        //                var action = new CrawlingCustomAction(
        //                    reader.GetAttribute("id"),
        //                    reader.GetAttribute("language"),
        //                    reader.ReadElementContentAsString()
        //                );

        //                result.Add(action.Id, action);
        //            }
        //        }
        //    }

        //    return result.Count > 0
        //        ? result
        //        : null;
        //}

        /// <summary>
        /// Reads the "settings" section into a new <see cref="WebsiteCrawlingSettings"/>. Recognized
        /// children are "download" (including its "user_agents" and "headers" children), "processing",
        /// "locale" and "error_handling".
        /// </summary>
        /// <param name="reader">Reader positioned on the "settings" start tag.</param>
        /// <param name="config">Not used by this method.</param>
        /// <returns>The settings, with fields left untouched where the document supplies nothing to assign.</returns>
        /// <exception cref="ArgumentException">A child start element with any other name is encountered.</exception>
        private static WebsiteCrawlingSettings ReadWebsiteCrawlingSettingsSection(XmlReader reader, WebsiteConfig config)
        {
            var result = new WebsiteCrawlingSettings();

            // A self-closing <settings/> has no end tag; without this guard the loop below would run
            // on into the nodes that follow the section.
            if (reader.IsEmptyElement)
            {
                return(result);
            }

            while (!(reader.Name == "settings" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
            {
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "download":
                    reader.GetAttribute(ref result.DownloadTimeout, "timeout");
                    reader.GetAttribute(ref result.DownloadDelay, "delay");
                    reader.GetAttribute(ref result.DownloadDelayRandomizationMax, "delay_randomization_max");
                    reader.GetAttribute(ref result.PersistCookies, "persist_cookies");
                    //reader.GetAttribute(ref result.SupplyRefererUrl, "supply_referer_url");
                    reader.GetAttribute(ref result.KeepAlive, "keep_alive");

                    reader.GetAttribute(ref result.UseProxy, "use_proxy");
                    reader.GetAttribute(ref result.UseProxyForImages, "use_proxy_for_images");
                    reader.GetAttribute(ref result.UseProxyForInlineDownloads, "use_proxy_for_inline_downloads");
                    reader.GetAttribute(ref result.UseProxyForLinkDownloads, "use_proxy_for_link_downloads");

                    reader.GetAttribute(ref result.PessimizeProxyFailures, "pessimize_proxy_failures");
                    reader.GetAttribute(ref result.RegisterProxyFailures, "register_proxy_failures");

                    // Read as text first and parsed only when a non-empty value was obtained.
                    string proxyBlockPeriodString = null;
                    reader.GetAttribute(ref proxyBlockPeriodString, "proxy_block_period");
                    if (!string.IsNullOrEmpty(proxyBlockPeriodString))
                    {
                        result.ProxyBlockPeriod = TimeSpan.Parse(proxyBlockPeriodString);
                    }

                    reader.GetAttribute(ref result.ExclusiveProxyLocking, "exclusive_proxy_locking");
                    reader.GetAttribute(ref result.ProxyInheritance, "proxy_inheritance");
                    reader.GetAttribute(ref result.IgnoreServerErrors, "ignore_server_errors");
                    //reader.GetAttribute(ref result.ValidateActionName, "validate_action");

                    string fallBackEncoding = reader.GetAttribute("fall_back_encoding");
                    if (!string.IsNullOrEmpty(fallBackEncoding))
                    {
                        result.FallBackEncoding = Encoding.GetEncoding(fallBackEncoding);
                    }

                    //string failUrlRegex = reader.GetAttribute("fail_url_regex");
                    //if (!string.IsNullOrEmpty(failUrlRegex))
                    //    result.FailUrlRegex = new Regex(failUrlRegex, RegexOptions.Compiled | RegexOptions.IgnoreCase);

                    //string proxyFailUrlRegex = reader.GetAttribute("proxy_fail_url_regex");
                    //if (!string.IsNullOrEmpty(proxyFailUrlRegex))
                    //    result.ProxyFailUrlRegex = new Regex(proxyFailUrlRegex, RegexOptions.Compiled | RegexOptions.IgnoreCase);

                    reader.GetAttribute(ref result.MaxThreads, "max_threads");

                    reader.GetAttribute(ref result.MaxExistingIncrementalUrls, "max_existing_incremental_urls");
                    reader.GetAttribute(ref result.MaxOutdatedIncrementalUrls, "max_outdated_incremental_urls");
                    //reader.GetAttribute(ref result.Output, "output");
                    reader.GetAttribute(ref result.DefaultBufferSize, "default_buffer_size");
                    //reader.GetAttribute(ref result.CompleteIncrementalRecrawl, "complete_incremental_recrawl");
                    //reader.GetAttribute(ref result.CrossJobKnownUrls, "cross_job_known_urls");

                    reader.GetAttribute(ref result.ValidateDomain, "validate_domain");
                    //reader.GetAttribute(ref result.DownloadInlineItemsImmediately, "download_inline_items_immediately");

                    // "domains" is a '|'-separated list, stored case-insensitively.
                    var domains = reader.GetAttribute("domains");
                    if (!string.IsNullOrEmpty(domains))
                    {
                        result.Domains = new HashSet <string>(domains.Split('|'), StringComparer.OrdinalIgnoreCase);
                    }

                    // Read children
                    var userAgents = new List <string>();
                    reader.ProcessChildren((childName, childReader) =>
                    {
                        switch (childName)
                        {
                        case "user_agents":
                            childReader.ProcessChildren((userAgentsChildName, userAgentsChildReader) =>
                            {
                                switch (userAgentsChildName)
                                {
                                case "user_agent":
                                    // Advance one node and take its value as the user-agent string.
                                    userAgentsChildReader.Read();
                                    userAgents.Add(userAgentsChildReader.Value);
                                    break;
                                }
                            });
                            break;

                        case "headers":
                            // Headers whose value still requires resolving are dropped.
                            result.Headers = ReadHttpHeadersSection(childReader)
                                             .Where(pred => !pred.Value.RequiresResolve)
                                             .ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                            break;
                        }
                    });

                    // Assign only when at least one user agent was listed.
                    if (userAgents.Count > 0)
                    {
                        result.UserAgents = userAgents;
                    }

                    break;

                case "processing":
                    reader.GetAttribute(ref result.DocumentType, "document_type");
                    reader.GetAttribute(ref result.UrlUniquePartRegex, "url_unique_part_regex");
                    //reader.GetAttribute(ref result.PreserveLinebreaks, "preserve_linebreaks");
                    //reader.GetAttribute(ref result.IndexingMode, "indexing_mode");
                    //reader.GetAttribute(ref result.OverwriteFutureDates, "overwrite_future_dates");
                    reader.GetAttribute(ref result.PersistIgnoredUrls, "persist_ignored_urls");
                    //reader.GetAttribute(ref result.OutputData, "output_data");
                    //reader.GetAttribute(ref result.OutputPiratedData, "output_pirated_data");
                    //reader.GetAttribute(ref result.OutputFiles, "output_files");
                    reader.GetAttribute(ref result.PageGenuityMarkerItemId, "page_genuity_marker_item");
                    break;

                case "locale":
                    string timeZone = reader.GetAttribute("time_zone");
                    if (!string.IsNullOrEmpty(timeZone))
                    {
                        result.TimeZone = TimeZoneInfo.FindSystemTimeZoneById(timeZone);
                    }

                    string culture = reader.GetAttribute("culture");
                    if (!string.IsNullOrEmpty(culture))
                    {
                        result.Culture = CultureInfo.GetCultureInfo(culture);
                    }

                    break;

                case "error_handling":
                    reader.GetAttribute(ref result.PageErrorRetryTimes, "page_retry_times");
                    reader.GetAttribute(ref result.ImageErrorRetryTimes, "image_retry_times");
                    reader.GetAttribute(ref result.ErrorRetryTimeout, "retry_timeout");
                    reader.GetAttribute(ref result.TraceDownloadErrors, "trace_download_errors");
                    reader.GetAttribute(ref result.TraceProxyErrors, "trace_proxy_errors");
                    break;

                //case "author":
                //reader.GetAttribute(ref result.ExtractAuthor, "extract");
                //break;
                default:
                    // No break after the throw: it would be unreachable.
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(result);
        }