/// <summary>
        /// Reads every <c>//Crawlers/Crawler</c> element of <paramref name="xmlDoc"/> and appends one
        /// <see cref="CrawlerRequestConfig"/> per element to <paramref name="lstReqCfg"/>.
        /// </summary>
        /// <remarks>
        /// A crawler element whose <c>key</c> attribute is missing or empty is skipped.
        /// The <c>description</c> attribute, the <c>Request/@source</c> attribute and the
        /// <c>Request/Url/Address</c>, <c>Request/Url/Pattern</c> and <c>Request/ContentType</c>
        /// elements are optional: the matching property is only assigned when the node exists.
        /// <c>Request/Method</c> is trimmed and upper-cased; when it is missing or blank the
        /// upper-cased text of <c>HttpMethod.Get</c> is used instead.
        /// </remarks>
        /// <param name="xmlDoc">Configuration document to read the crawler definitions from.</param>
        /// <param name="lstReqCfg">List that receives the parsed crawler configurations.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="xmlDoc"/> or <paramref name="lstReqCfg"/> is <c>null</c>.
        /// </exception>
        private static void ConvertCrawlerConfiguration(XmlDocument xmlDoc, List<RequestConfig> lstReqCfg)
        {
            if (xmlDoc == null)
            {
                throw new ArgumentNullException("xmlDoc");
            }

            if (lstReqCfg == null)
            {
                throw new ArgumentNullException("lstReqCfg");
            }

            XmlNodeList crawlerNodes = xmlDoc.SelectNodes("//Crawlers/Crawler");
            if (crawlerNodes == null)
            {
                return;
            }

            foreach (XmlNode configNode in crawlerNodes)
            {
                // The indexer returns null for an absent attribute, so check before reading Value.
                XmlAttributeCollection attributes = configNode.Attributes;
                XmlAttribute keyAttribute = attributes != null ? attributes["key"] : null;
                if (keyAttribute == null || string.IsNullOrEmpty(keyAttribute.Value))
                {
                    // A crawler without a key is not added to the list.
                    continue;
                }

                CrawlerRequestConfig curCrawlerReqConfig = new CrawlerRequestConfig();
                curCrawlerReqConfig.CrawlerKey = keyAttribute.Value;

                XmlAttribute descriptionAttribute = attributes["description"];
                if (descriptionAttribute != null)
                {
                    curCrawlerReqConfig.CrawlerDescription = descriptionAttribute.Value;
                }

                // Each XPath is evaluated once and the resulting node is reused.
                XmlNode sourceNode = configNode.SelectSingleNode("Request/@source");
                if (sourceNode != null)
                {
                    curCrawlerReqConfig.Source = sourceNode.Value;
                }

                XmlNode addressNode = configNode.SelectSingleNode("Request/Url/Address");
                if (addressNode != null)
                {
                    curCrawlerReqConfig.RequestUrl = addressNode.InnerText;
                }

                XmlNode patternNode = configNode.SelectSingleNode("Request/Url/Pattern");
                if (patternNode != null)
                {
                    curCrawlerReqConfig.RequestUrlPattern = patternNode.InnerText;
                }

                XmlNode methodNode = configNode.SelectSingleNode("Request/Method");
                string requestMethod = methodNode != null ? methodNode.InnerText.Trim() : string.Empty;
                if (requestMethod.Length == 0)
                {
                    // No usable method configured: fall back to GET.
                    requestMethod = HttpMethod.Get.ToString();
                }

                // Invariant casing keeps the verb stable under every culture
                // (culture-sensitive ToUpper maps 'i' to a dotted capital I in Turkish cultures).
                curCrawlerReqConfig.RequestMethod = requestMethod.ToUpperInvariant();

                XmlNode contentTypeNode = configNode.SelectSingleNode("Request/ContentType");
                if (contentTypeNode != null)
                {
                    curCrawlerReqConfig.ContentType = contentTypeNode.InnerText;
                }

                //TODO other configuration item to be done

                lstReqCfg.Add(curCrawlerReqConfig);
            }
        }
Example #2
0
        /// <summary>
        /// Creates a crawler for the request configuration registered under
        /// <paramref name="crawlerKey"/> and prepares its HTTP request.
        /// </summary>
        /// <param name="crawlerKey">Key passed to <c>CrawlerConfigHelper.GetCrawlerRequestConfig</c>.</param>
        /// <param name="dicParameters">
        /// Optional parameters stored in the configuration's <c>UrlParas</c>; may be <c>null</c>.
        /// </param>
        /// <exception cref="ArgumentException">
        /// No <see cref="CrawlerRequestConfig"/> is returned for <paramref name="crawlerKey"/>.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The configured request URL does not produce an <see cref="HttpWebRequest"/>.
        /// </exception>
        public CommonCrawler(string crawlerKey, Dictionary<String, String> dicParameters = null)
        {
            CrawlerRequestConfig reqConfig = CrawlerConfigHelper.GetCrawlerRequestConfig(crawlerKey) as CrawlerRequestConfig;
            if (reqConfig == null)
            {
                // The 'as' cast yields null for a missing entry or an entry of another type.
                throw new ArgumentException(
                    "No crawler request configuration found for key '" + crawlerKey + "'.",
                    "crawlerKey");
            }

            // NOTE(review): this writes to the object returned by CrawlerConfigHelper; if that
            // instance is shared between crawlers the parameters leak across them - confirm.
            reqConfig.UrlParas = dicParameters;
            // CrawlerRequestConfig stores the request configuration; it is read-only to guard against invalid modification.
            CrawlerRequestConfig = reqConfig;

            // Lift the limit on concurrent HTTP connections. This is done before the request is
            // created because the default only applies to ServicePoint objects initialized afterwards.
            ServicePointManager.DefaultConnectionLimit = Int32.MaxValue;

            // webRequest (declared outside this constructor) must hold an HttpWebRequest.
            HttpWebRequest httpRequest = WebRequest.Create(reqConfig.RequestUrl) as HttpWebRequest;
            if (httpRequest == null)
            {
                // WebRequest.Create returned a non-HTTP request type for this URL.
                throw new InvalidOperationException(
                    "Request URL '" + reqConfig.RequestUrl + "' of crawler '" + crawlerKey + "' is not an HTTP(S) address.");
            }

            this.webRequest = httpRequest;

            // Initialize the web request from the request configuration items.
            InitWebRequest(reqConfig);
        }