Ejemplo n.º 1
0
        /// <summary>
        /// Fills the form controls from <c>Form1.curCrawlSettings</c> and from the
        /// static option values exposed by <c>Form1</c>.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void SettingsForm_Load(object sender, EventArgs e)
        {
            CrawlSettings current = Form1.curCrawlSettings;

            // Text boxes backed by the crawl settings object.
            deviceTxt.Text       = current.DeviceId;
            timestampTxt.Text    = current.timestamp;
            signTxt.Text         = current.sign;
            refleshTokenTxt.Text = current.RefleshToken;
            accessTokenTxt.Text  = current.AccessToken;

            // Controls backed by static values on Form1.
            enterpriseIpTxt.Text      = Form1.enterpriseIp;
            isProvinceCHK.Checked     = Form1.IsProvince;
            onlyDateUpdateCHK.Checked = Form1.OnlyDateUpdate;
            industryCHK.Checked       = Form1.IndustrySearch;
            GRegistCapiBeginTxt.Text  = Form1.GRegistCapiBegin;
            GRegistCapiEndTxt.Text    = Form1.GRegistCapiEnd;

            // Rebuild the device list and select the entry matching the current device id.
            comboBox1.Items.Clear();
            foreach (BsonDocument account in mainForm.GetAppDeviceAccount)
            {
                var deviceId = account.Text("deviceId");
                int position = comboBox1.Items.Add(deviceId);
                if (deviceId == current.DeviceId)
                {
                    comboBox1.SelectedIndex = position;
                }
            }

            // NOTE(review): SelectedText is assigned here rather than Text/SelectedItem;
            // confirm that this is the intended way to show the stored search key type.
            if (!string.IsNullOrEmpty(Form1.SearchKeyType))
            {
                searchKeyTypeComBox.SelectedText = Form1.SearchKeyType;
            }

            // Show the predefined keywords one per line, but only when there are any.
            if (Form1.PreKeyWordList.Any())
            {
                keyWordRTxt.Text = string.Join("\n", Form1.PreKeyWordList);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a new <c>CrawlSettings</c> whose
        /// <c>TaskHandlerOptions.BubbleUpExceptions</c> flag is set to <c>true</c>.
        /// </summary>
        /// <returns>The newly created settings object.</returns>
        protected CrawlSettings GetTestSettings()
        {
            // Nested initializer: sets the flag on the TaskHandlerOptions instance
            // that the new settings object already exposes.
            return new CrawlSettings
            {
                TaskHandlerOptions = { BubbleUpExceptions = true }
            };
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Crawls the "EmptySite" test site with four extra "delay" URIs queued on the
        /// request processor, using a 150 ms request timeout and 3 retries, and asserts
        /// that every crawled URI whose path contains "delay" ends with
        /// <c>CrawlStatus.MaxRetries</c> and has no content.
        /// </summary>
        public async Task AutoRetryOnFailure()
        {
            var siteContext = new SiteContext { SiteFolder = "EmptySite" };
            var crawler     = GetTestSiteCrawler(siteContext);

            var processorOptions = new RequestProcessorOptions
            {
                DelayBetweenRequestStart        = TimeSpan.Zero,
                MaxNumberOfSimultaneousRequests = 4,
                TimeoutBeforeThrottle           = TimeSpan.Zero,
                DelayJitter                     = TimeSpan.Zero,
                RequestTimeout                  = TimeSpan.FromMilliseconds(150)
            };
            var settings = new CrawlSettings
            {
                NumberOfRetries         = 3,
                RequestProcessor        = GetLoggedRequestProcessor(),
                RequestProcessorOptions = processorOptions
            };

            // Presumably these endpoints answer after ~300 ms, i.e. slower than the
            // 150 ms timeout above - confirm against the test site implementation.
            var delayedAddresses = new[]
            {
                "http://localhost/delay/300/300ms-delay-1",
                "http://localhost/delay/300/300ms-delay-2",
                "http://localhost/delay/300/300ms-delay-3",
                "http://localhost/delay/300/300ms-delay-4"
            };
            foreach (var address in delayedAddresses)
            {
                settings.RequestProcessor.Add(new Uri(address));
            }

            var results = await crawler.Crawl(new Uri("http://localhost/"), settings);

            var delayedCrawls = results.CrawledUris
                .Where(c => c.Location.PathAndQuery.Contains("delay"))
                .ToArray();

            foreach (var delayed in delayedCrawls)
            {
                Assert.AreEqual(CrawlStatus.MaxRetries, delayed.Status);
                Assert.IsNull(delayed.Content);
            }
        }
        /// <summary>
        /// Stores the supplied objects in <c>Settings</c>, <c>filter</c> and <c>dataop</c>,
        /// and creates two new Bloom filters for <c>pageUrlfilter</c> and <c>urlfilter</c>.
        /// </summary>
        /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
        /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
        /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
        public RegisterEnterpriseAddInfoCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
        {
            // Same constructor argument for both newly created filters.
            const int newFilterArgument = 2000000;

            Settings = _Settings;
            filter   = _filter;
            dataop   = _dataop;

            pageUrlfilter = new BloomFilter<string>(newFilterArgument);
            urlfilter     = new BloomFilter<string>(newFilterArgument);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Crawls the "BasicSite" test site with "test-domain.com" listed in
        /// <c>HostAliases</c> and asserts that the index page links to
        /// http://test-domain.com, that this URI was crawled, and that its last
        /// request returned <c>HttpStatusCode.OK</c>.
        /// </summary>
        public async Task AllowedExternalSitesAreCrawled()
        {
            var siteContext = new SiteContext { SiteFolder = "BasicSite" };
            var crawler     = GetTestSiteCrawler(siteContext);
            var settings    = new CrawlSettings
            {
                HostAliases             = new[] { "test-domain.com" },
                RequestProcessor        = GetLoggedRequestProcessor(),
                RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
            };

            var result = await crawler.Crawl(new Uri("http://localhost/"), settings);

            var indexUri    = new Uri("http://localhost/index.html");
            var externalUri = new Uri("http://test-domain.com");

            // The index page must contain a link to the external host.
            var indexCrawl      = result.CrawledUris.FirstOrDefault(c => c.Location == indexUri);
            var linksToExternal = indexCrawl.Content.Links.Any(l => l.Location == externalUri);
            Assert.IsTrue(linksToExternal);

            // The external host itself must have been crawled successfully.
            var externalCrawl = result.CrawledUris.FirstOrDefault(c => c.Location == externalUri);
            Assert.IsNotNull(externalCrawl);

            var lastRequest = externalCrawl.Requests.LastOrDefault();
            Assert.AreEqual(HttpStatusCode.OK, lastRequest.StatusCode);
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Crawls with a default-constructed <c>CrawlSettings</c> and asserts that the
 /// number of results equals the number of keys in <c>configurations</c>.
 /// </summary>
 public void Test_Crawler_Returns_All_With_No_Filters()
 {
     IConfigurationEnumerator crawler = new ConfigurationEnumerator(fileSystemProvider, configurationProvider);
     CrawlSettings crawlerSettings = new CrawlSettings();

     // Count the results exactly once. Previously the count was stored in an
     // unused local and the result sequence was enumerated a second time
     // inside the assertion.
     var actualCount = crawler.Crawl(crawlerSettings).Count();

     Assert.AreEqual(configurations.Keys.Count, actualCount);
 }
Ejemplo n.º 7
0
        /// <summary>
        /// Calls <c>CookieRequester.PostRequest</c> with default settings and a small
        /// POST payload, then asserts that <c>settings.CookieContainer</c> is null.
        /// </summary>
        public void PostRequestTest()
        {
            // NOTE(review): the request URI is an external host, so this test
            // presumably needs network access - confirm.
            var settings = new CrawlSettings();
            var args = new RequestArgs
            {
                PostParameters = "a=1234",
                RequestUri     = "http://www.wqii.com.cn"
            };
            ILog log = LogManager.GetLogger("requestLogger");

            CookieRequester.PostRequest(settings, args, log);

            Assert.AreEqual(null, settings.CookieContainer);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Stores the supplied collaborators, creates a new <c>RobotsPageParser</c>
        /// and queues <paramref name="baseUri"/> through <c>AddRequest</c>.
        /// </summary>
        /// <param name="baseUri">Assigned to <c>BaseUri</c> and passed to <c>AddRequest</c>.</param>
        /// <param name="robotsFile">Assigned to <c>RobotsFile</c>.</param>
        /// <param name="httpClient">Assigned to <c>HttpClient</c>.</param>
        /// <param name="crawlSettings">Assigned to <c>Settings</c>.</param>
        /// <param name="logger">Assigned to <c>Logger</c>; optional, defaults to <c>null</c>.</param>
        public CrawlRunner(
            Uri baseUri,
            RobotsFile robotsFile,
            HttpClient httpClient,
            CrawlSettings crawlSettings,
            ILogger logger = null)
        {
            // Values handed in by the caller.
            BaseUri = baseUri;
            RobotsFile = robotsFile;
            HttpClient = httpClient;
            Settings = crawlSettings;
            Logger = logger;

            // Created here rather than injected.
            RobotsPageParser = new RobotsPageParser();

            // Runs last, after all members above have been assigned.
            AddRequest(baseUri);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Creates the concrete crawler object for the given type name.
        /// </summary>
        /// <param name="Name">Type name resolved with <see cref="Type.GetType(string, bool)"/>.</param>
        /// <param name="_Settings">Crawl settings passed to the crawler constructor.</param>
        /// <param name="_filter">Bloom filter passed to the crawler constructor.</param>
        /// <param name="_dataop">Data operation object passed to the crawler constructor.</param>
        /// <returns>
        /// The created crawler, or <c>null</c> when a <see cref="TypeLoadException"/>
        /// is raised while resolving or instantiating the type. Other exceptions propagate.
        /// </returns>
        public ISimpleCrawler Create(string Name, CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
        {
            try
            {
                Type crawlerType = Type.GetType(Name, true);
                return (ISimpleCrawler)Activator.CreateInstance(crawlerType, _Settings, _filter, _dataop);
            }
            catch (TypeLoadException ex)
            {
                // Callers rely on null for an unknown type, so the best-effort contract
                // is kept; the failure is traced instead of being swallowed silently.
                System.Diagnostics.Trace.TraceWarning(
                    "Crawler type '{0}' could not be loaded: {1}", Name, ex.Message);
                return null;
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Crawls http://localhost/ on the "BasicSite" test site using the logged
        /// request processor and the no-delay processor options.
        /// </summary>
        /// <returns>The result produced by the crawl.</returns>
        private async Task<CrawlResult> GetCrawlResult()
        {
            var siteContext = new SiteContext { SiteFolder = "BasicSite" };
            var crawler     = GetTestSiteCrawler(siteContext);

            var settings = new CrawlSettings
            {
                RequestProcessor        = GetLoggedRequestProcessor(),
                RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
            };

            var startUri = new Uri("http://localhost/");
            var outcome  = await crawler.Crawl(startUri, settings);
            return outcome;
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Sends <c>args.PostParameters</c> as a UTF-8 encoded POST body to
        /// <c>args.RequestUri</c> and, when the server answers with HTTP 200, stores the
        /// cookies received for that request in <c>settings.CookieContainer</c>.
        /// </summary>
        /// <param name="settings">
        /// Supplies the user agent (<c>CrawConfiguration.UserAgentString</c>) and receives
        /// the cookie container on success.
        /// </param>
        /// <param name="args">Supplies the request URI and the POST parameters.</param>
        /// <returns>
        /// The same <paramref name="settings"/> instance; its cookie container is only
        /// replaced when the response status is <see cref="HttpStatusCode.OK"/>.
        /// </returns>
        public static CrawlSettings PostRequest(CrawlSettings settings,RequestArgs args)
        {
            // Fix: the container used to be left null, so cookie.Add(...) threw a
            // NullReferenceException on every successful response.
            CookieContainer cookie = new CookieContainer();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(args.RequestUri));
            request.Method = "POST";
            request.Accept = "*/*";
            request.UserAgent = settings.CrawConfiguration.UserAgentString;
            // Fix: without a container on the request, HttpWebRequest does not capture
            // the cookies returned by the server. With it attached, the container is
            // populated by the request itself.
            request.CookieContainer = cookie;

            byte[] transferData = Encoding.UTF8.GetBytes(args.PostParameters);
            request.ContentLength = transferData.Length;

            using (Stream stream = request.GetRequestStream())
            {
                stream.Write(transferData, 0, transferData.Length);
            }

            HttpWebResponse response = null;
            try
            {
                response = (HttpWebResponse)request.GetResponse();
            }
            catch (WebException e)
            {
                // Protocol errors (4xx/5xx) still carry a response; it may be null
                // when no response was received at all.
                response = (HttpWebResponse)e.Response;
            }
            catch (Exception e)
            {
                Console.WriteLine(string.Format("Error occured when getting cookie , Reason :{0}", e.Message));
            }

            // Fix: the response was never released. A using statement accepts a null
            // resource, so the "no response" branch needs no special handling.
            using (response)
            {
                if (response != null)
                {
                    Console.WriteLine(string.Format("Server Return StatusCode {0}", response.StatusCode));
                    if (response.StatusCode == HttpStatusCode.OK)
                    {
                        settings.CookieContainer = cookie;
                        Console.WriteLine(string.Format("Cookie (key:{0}) is achieved", args.RequestUri));
                    }
                }
                else
                {
                    Console.WriteLine("No Response From Server");
                }
            }

            return settings;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Crawls the "BasicSite" test site with <c>MaxNumberOfPagesToCrawl</c> set to
        /// <paramref name="maxPages"/> and asserts that exactly that many URIs were crawled.
        /// </summary>
        /// <param name="maxPages">Page limit applied to the crawl and expected result count.</param>
        public async Task MaximumPagesCrawledFollowed(int maxPages)
        {
            var siteContext = new SiteContext { SiteFolder = "BasicSite" };
            var crawler     = GetTestSiteCrawler(siteContext);

            var settings = new CrawlSettings
            {
                RequestProcessor        = GetLoggedRequestProcessor(),
                RequestProcessorOptions = GetNoDelayRequestProcessorOptions(),
                MaxNumberOfPagesToCrawl = maxPages
            };

            var startUri = new Uri("http://localhost/");
            var result   = await crawler.Crawl(startUri, settings);

            var crawledCount = result.CrawledUris.Count();
            Assert.AreEqual(maxPages, crawledCount);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Builds a POST request for <c>args.RequestUri</c>, writes
        /// <c>args.PostParameters</c> (UTF-8 encoded) to its request stream and returns
        /// it wrapped in a new <c>RequestContext</c>.
        /// </summary>
        /// <param name="settings">Supplies the user agent via <c>CrawConfiguration.UserAgentString</c>.</param>
        /// <param name="args">Supplies the request URI and the POST parameters.</param>
        /// <returns>A <c>RequestContext</c> whose <c>Request</c> is the prepared request.</returns>
        public static RequestContext BuildRequestContext(CrawlSettings settings, RequestArgs args)
        {
            var context = new RequestContext();

            var targetUri  = new Uri(args.RequestUri);
            var webRequest = (HttpWebRequest)WebRequest.Create(targetUri);
            webRequest.Method    = "POST";
            webRequest.Accept    = "*/*";
            webRequest.UserAgent = settings.CrawConfiguration.UserAgentString;

            // The content length is set before the request stream is obtained.
            byte[] payload = Encoding.UTF8.GetBytes(args.PostParameters);
            webRequest.ContentLength = payload.Length;

            using (Stream body = webRequest.GetRequestStream())
            {
                body.Write(payload, 0, payload.Length);
            }

            context.Request = webRequest;
            return context;
        }
Ejemplo n.º 14
0
        ///// <summary>
        /////  分类信息
        ///// </summary>
        //public string DataTableNameCategory
        //{
        //    get { return "CategoryInfo_MT"; }

        //}

        /// <summary>
        /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c> and
        /// <c>dataop</c>, and creates a new Bloom filter for <c>guidFilter</c>.
        /// </summary>
        /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
        /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
        /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
        public HuiCongMaterialAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
        {
            Settings = _Settings;
            filter   = _filter;
            dataop   = _dataop;

            guidFilter = new BloomFilter<string>(9000000);
        }
Ejemplo n.º 15
0
        ///// <summary>
        /////  分类信息
        ///// </summary>
        //public string DataTableNameCategory
        //{
        //    get { return "CategoryInfo_MT"; }

        //}

        /// <summary>
        /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c> and
        /// <c>dataop</c>, and creates a new Bloom filter for <c>guidFilter</c>.
        /// </summary>
        /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
        /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
        /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
        public DoctorHospitalDetailAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
        {
            Settings = _Settings;
            filter   = _filter;
            dataop   = _dataop;

            guidFilter = new BloomFilter<string>(9000000);
        }
Ejemplo n.º 16
0
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public MiEggListCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 17
0
 // URLs without a county/city.
 List<BsonDocument> allLandUrlList = new List<BsonDocument>();

 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public LandFangXYInfoUpdateEXCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 18
0
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public SiMuListCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter = _filter;
     dataop = _dataop;
 }
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public SoHuBuildingDetailCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public LandFangCityRegionUpdateCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public FangProjectDetailCrawler_NanNing(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 22
0
 // URLs without a county/city.
 List<BsonDocument> allLandUrlList = new List<BsonDocument>();

 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public ProfileCompany56JobDetailCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public QCCEnterpriseCrawler_FOSHAN(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 24
0
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public QCCEnterpriseDetailInfoAnalyse(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 25
0
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public PhantomJSCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
 // URLs without a county/city.
 List<BsonDocument> allLandUrlList = new List<BsonDocument>();

 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public DistrictLeaderCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 27
0
        ///// <summary>
        /////  分类信息
        ///// </summary>
        //public string DataTableNameCategory
        //{
        //    get { return "CategoryInfo_MT"; }

        //}

        /// <summary>
        /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c> and
        /// <c>dataop</c>, and creates a new Bloom filter for <c>guidFilter</c>.
        /// </summary>
        /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
        /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
        /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
        public MeiTuCityBusinessEnterpriseAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
        {
            Settings = _Settings;
            filter   = _filter;
            dataop   = _dataop;

            guidFilter = new BloomFilter<string>(9000000);
        }
Ejemplo n.º 28
0
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public MeiTuCityBusinessDistinctAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 29
0
        /// <summary>
        /// Passes a newly constructed <c>CrawlSettings</c> object to <c>View</c> and
        /// returns the result.
        /// </summary>
        /// <returns>The action result produced by <c>View</c>.</returns>
        public ActionResult CrawlSettings()
        {
            return View(new CrawlSettings());
        }
 /// <summary>
 /// Constructor: stores the supplied objects in <c>Settings</c>, <c>filter</c>
 /// and <c>dataop</c>.
 /// </summary>
 /// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
 /// <param name="_filter">Bloom filter assigned to <c>filter</c>.</param>
 /// <param name="_dataop">Data operation object assigned to <c>dataop</c>.</param>
 public PlantDetailInfoCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
 {
     Settings = _Settings;
     filter   = _filter;
     dataop   = _dataop;
 }
Ejemplo n.º 31
0
 /// <summary>
 /// Chains to the constructor that takes <c>setting.CrawConfiguration</c>, then
 /// keeps the cookie container of <paramref name="setting"/> in <c>_cookieContainer</c>.
 /// </summary>
 /// <param name="setting">Supplies the crawl configuration and the cookie container.</param>
 public AuthenticPageRequester(CrawlSettings setting)
     : this(setting.CrawConfiguration)
 {
     var sharedCookies = setting.CookieContainer;
     _cookieContainer = sharedCookies;
 }