/// <summary>
/// Populates the settings form's controls from <c>Form1.curCrawlSettings</c> and from the
/// static option values exposed by <c>Form1</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void SettingsForm_Load(object sender, EventArgs e)
{
    CrawlSettings settings = Form1.curCrawlSettings;

    // Device / signature / token fields come from the current crawl settings.
    this.deviceTxt.Text = settings.DeviceId;
    this.timestampTxt.Text = settings.timestamp;
    this.signTxt.Text = settings.sign;
    this.refleshTokenTxt.Text = settings.RefleshToken;
    this.accessTokenTxt.Text = settings.AccessToken;

    // The remaining options are read from static members of Form1.
    this.enterpriseIpTxt.Text = Form1.enterpriseIp;
    this.isProvinceCHK.Checked = Form1.IsProvince;
    this.onlyDateUpdateCHK.Checked = Form1.OnlyDateUpdate;
    this.industryCHK.Checked = Form1.IndustrySearch;
    this.GRegistCapiBeginTxt.Text = Form1.GRegistCapiBegin;
    this.GRegistCapiEndTxt.Text = Form1.GRegistCapiEnd;

    // Rebuild the device list and pre-select the entry matching the current device id.
    this.comboBox1.Items.Clear();
    foreach (BsonDocument account in this.mainForm.GetAppDeviceAccount)
    {
        // Look the id up once per account instead of once for Add and again for the comparison.
        var deviceId = account.Text("deviceId");
        int index = this.comboBox1.Items.Add(deviceId);
        if (deviceId == settings.DeviceId)
        {
            this.comboBox1.SelectedIndex = index;
        }
    }

    if (!string.IsNullOrEmpty(Form1.SearchKeyType))
    {
        this.searchKeyTypeComBox.SelectedText = Form1.SearchKeyType;
    }

    // Any() stops at the first element; Count() > 0 may walk the whole sequence.
    if (Form1.PreKeyWordList.Any())
    {
        this.keyWordRTxt.Text = string.Join("\n", Form1.PreKeyWordList);
    }
}
/// <summary>
/// Creates a <c>CrawlSettings</c> instance whose
/// <c>TaskHandlerOptions.BubbleUpExceptions</c> flag is set to <c>true</c>.
/// </summary>
/// <returns>The newly created settings object.</returns>
protected CrawlSettings GetTestSettings()
{
    CrawlSettings testSettings = new CrawlSettings();
    testSettings.TaskHandlerOptions.BubbleUpExceptions = true;
    return testSettings;
}
/// <summary>
/// Crawls the "EmptySite" test site with four extra "delay" URLs queued, three retries and a
/// 150 ms request timeout, then asserts that every crawled URI whose path contains "delay"
/// has status <c>CrawlStatus.MaxRetries</c> and no content.
/// </summary>
public async Task AutoRetryOnFailure()
{
    var crawler = GetTestSiteCrawler(new SiteContext { SiteFolder = "EmptySite" });

    var settings = new CrawlSettings
    {
        NumberOfRetries = 3,
        RequestProcessor = GetLoggedRequestProcessor(),
        RequestProcessorOptions = new RequestProcessorOptions
        {
            DelayBetweenRequestStart = TimeSpan.Zero,
            MaxNumberOfSimultaneousRequests = 4,
            TimeoutBeforeThrottle = TimeSpan.Zero,
            DelayJitter = TimeSpan.Zero,
            RequestTimeout = TimeSpan.FromMilliseconds(150)
        }
    };

    // Queue the delayed endpoints in the same order as before.
    var delayedUrls = new[]
    {
        "http://localhost/delay/300/300ms-delay-1",
        "http://localhost/delay/300/300ms-delay-2",
        "http://localhost/delay/300/300ms-delay-3",
        "http://localhost/delay/300/300ms-delay-4"
    };
    foreach (var url in delayedUrls)
    {
        settings.RequestProcessor.Add(new Uri(url));
    }

    var results = await crawler.Crawl(new Uri("http://localhost/"), settings);

    var delayedCrawls = results.CrawledUris
        .Where(c => c.Location.PathAndQuery.Contains("delay"))
        .ToArray();

    foreach (var delayed in delayedCrawls)
    {
        Assert.AreEqual(CrawlStatus.MaxRetries, delayed.Status);
        Assert.IsNull(delayed.Content);
    }
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance and
/// creates two additional string Bloom filters (<c>pageUrlfilter</c> and <c>urlfilter</c>).
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public RegisterEnterpriseAddInfoCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;

    // Both filters are constructed with 2000000 (presumably a capacity; confirm against BloomFilter).
    this.pageUrlfilter = new BloomFilter<string>(2000000);
    this.urlfilter = new BloomFilter<string>(2000000);
}
/// <summary>
/// Crawls the "BasicSite" test site with "test-domain.com" configured as a host alias and
/// asserts that the index page links to that host, that the host itself was crawled, and that
/// its last request returned HTTP 200.
/// </summary>
public async Task AllowedExternalSitesAreCrawled()
{
    var crawler = GetTestSiteCrawler(new SiteContext { SiteFolder = "BasicSite" });
    var settings = new CrawlSettings
    {
        HostAliases = new[] { "test-domain.com" },
        RequestProcessor = GetLoggedRequestProcessor(),
        RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
    };

    var result = await crawler.Crawl(new Uri("http://localhost/"), settings);

    // The index page must contain a link to the aliased external host.
    var indexUri = new Uri("http://localhost/index.html");
    var indexCrawl = result.CrawledUris.FirstOrDefault(c => c.Location == indexUri);
    var externalUri = new Uri("http://test-domain.com");
    Assert.IsTrue(indexCrawl.Content.Links.Any(l => l.Location == externalUri));

    // The external host must have been crawled, and its last request must have succeeded.
    var externalCrawl = result.CrawledUris.FirstOrDefault(c => c.Location == externalUri);
    Assert.IsNotNull(externalCrawl);
    Assert.AreEqual(HttpStatusCode.OK, externalCrawl.Requests.LastOrDefault().StatusCode);
}
/// <summary>
/// Crawls with default <c>CrawlSettings</c> and asserts that the number of results equals the
/// number of keys in <c>configurations</c>.
/// </summary>
public void Test_Crawler_Returns_All_With_No_Filters()
{
    IConfigurationEnumerator crawler = new ConfigurationEnumerator(fileSystemProvider, configurationProvider);
    CrawlSettings crawlerSettings = new CrawlSettings();

    var results = crawler.Crawl(crawlerSettings);

    // Count once and reuse the value: the original computed this local, never used it, and
    // then called Count() again, which re-enumerates the results if they are lazily produced.
    var count = results.Count();

    Assert.AreEqual(configurations.Keys.Count, count);
}
/// <summary>
/// Calls <c>CookieRequester.PostRequest</c> with default settings, a small POST payload and
/// the "requestLogger" logger, then asserts that <c>settings.CookieContainer</c> is null.
/// </summary>
public void PostRequestTest()
{
    var settings = new CrawlSettings();
    var args = new RequestArgs()
    {
        PostParameters = "a=1234",
        RequestUri = "http://www.wqii.com.cn"
    };
    ILog requestLog = LogManager.GetLogger("requestLogger");

    CookieRequester.PostRequest(settings, args, requestLog);

    Assert.AreEqual(null, settings.CookieContainer);
}
/// <summary>
/// Stores the supplied collaborators on the new instance, creates a <c>RobotsPageParser</c>,
/// and passes <paramref name="baseUri"/> to <c>AddRequest</c>.
/// </summary>
/// <param name="baseUri">Assigned to <c>BaseUri</c> and passed to <c>AddRequest</c>.</param>
/// <param name="robotsFile">Assigned to <c>RobotsFile</c>.</param>
/// <param name="httpClient">Assigned to <c>HttpClient</c>.</param>
/// <param name="crawlSettings">Assigned to <c>Settings</c>.</param>
/// <param name="logger">Optional logger assigned to <c>Logger</c>; defaults to null.</param>
public CrawlRunner(Uri baseUri, RobotsFile robotsFile, HttpClient httpClient, CrawlSettings crawlSettings, ILogger logger = null)
{
    this.BaseUri = baseUri;
    this.RobotsFile = robotsFile;
    this.HttpClient = httpClient;
    this.Settings = crawlSettings;
    this.Logger = logger;
    this.RobotsPageParser = new RobotsPageParser();

    // Runs last, after every member above has been assigned.
    this.AddRequest(baseUri);
}
/// <summary>
/// Creates the concrete crawler object: resolves the type named by <paramref name="Name"/>
/// and instantiates it with the three supplied constructor arguments.
/// </summary>
/// <param name="Name">Type name passed to <c>Type.GetType</c>.</param>
/// <param name="_Settings">Crawl settings forwarded to the crawler's constructor.</param>
/// <param name="_filter">String Bloom filter forwarded to the crawler's constructor.</param>
/// <param name="_dataop">Data-operation object forwarded to the crawler's constructor.</param>
/// <returns>
/// The created crawler, or <c>null</c> when a <see cref="TypeLoadException"/> is raised while
/// resolving or creating it.
/// </returns>
public ISimpleCrawler Create(string Name, CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    ISimpleCrawler myExecuteTran = null;
    try
    {
        // throwOnError = true: an unknown type name raises TypeLoadException instead of returning null.
        Type type = Type.GetType(Name, true);
        myExecuteTran = (ISimpleCrawler)Activator.CreateInstance(type, _Settings, _filter, _dataop);
    }
    catch (TypeLoadException)
    {
        // Deliberate best-effort: an unresolvable type name yields a null result rather than
        // an exception. Callers must check the return value for null.
    }

    return myExecuteTran;
}
/// <summary>
/// Crawls the "BasicSite" test site starting at <c>http://localhost/</c> using the logged
/// request processor and the no-delay processor options.
/// </summary>
/// <returns>The result returned by the crawler.</returns>
private async Task<CrawlResult> GetCrawlResult()
{
    var crawler = GetTestSiteCrawler(new SiteContext { SiteFolder = "BasicSite" });
    var settings = new CrawlSettings
    {
        RequestProcessor = GetLoggedRequestProcessor(),
        RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
    };

    var startUri = new Uri("http://localhost/");
    CrawlResult crawlResult = await crawler.Crawl(startUri, settings);
    return crawlResult;
}
/// <summary>
/// Sends <c>args.PostParameters</c> (UTF-8 encoded) as a POST to <c>args.RequestUri</c> and,
/// when the server answers with HTTP 200, stores the cookies received in
/// <c>settings.CookieContainer</c>.
/// </summary>
/// <param name="settings">
/// Supplies the user-agent string; its <c>CookieContainer</c> is replaced on a 200 response.
/// </param>
/// <param name="args">Supplies the request URI and the POST body.</param>
/// <returns>The same <paramref name="settings"/> instance that was passed in.</returns>
public static CrawlSettings PostRequest(CrawlSettings settings, RequestArgs args)
{
    // The original left this null and then called cookie.Add(...), which threw a
    // NullReferenceException on every successful response.
    CookieContainer cookie = new CookieContainer();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(args.RequestUri));
    request.Method = "POST";
    request.Accept = "*/*";
    request.UserAgent = settings.CrawConfiguration.UserAgentString;

    // A container must be attached to the request for response cookies to be captured;
    // HttpWebRequest then stores the cookies it receives directly in this container.
    request.CookieContainer = cookie;

    byte[] transferData = Encoding.UTF8.GetBytes(args.PostParameters);
    request.ContentLength = transferData.Length;
    using (Stream stream = request.GetRequestStream())
    {
        stream.Write(transferData, 0, transferData.Length);
    }

    HttpWebResponse response = null;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        // Protocol errors (4xx/5xx) still carry a response; it may be null for transport failures.
        response = (HttpWebResponse)e.Response;
    }
    catch (Exception e)
    {
        Console.WriteLine(string.Format("Error occured when getting cookie , Reason :{0}", e.Message));
    }

    // using tolerates a null resource and guarantees the connection is released.
    using (response)
    {
        // Handle the status code.
        if (response != null)
        {
            Console.WriteLine(string.Format("Server Return StatusCode {0}", response.StatusCode));
            if (response.StatusCode == HttpStatusCode.OK)
            {
                // The container attached to the request already holds the response cookies.
                settings.CookieContainer = cookie;
                Console.WriteLine(string.Format("Cookie (key:{0}) is achieved", args.RequestUri));
            }
        }
        else
        {
            Console.WriteLine("No Response From Server");
        }
    }

    return settings;
}
/// <summary>
/// Crawls the "BasicSite" test site with <c>MaxNumberOfPagesToCrawl</c> set to
/// <paramref name="maxPages"/> and asserts that exactly that many URIs were crawled.
/// </summary>
/// <param name="maxPages">Page limit applied to the crawl and expected crawled-URI count.</param>
public async Task MaximumPagesCrawledFollowed(int maxPages)
{
    var crawler = GetTestSiteCrawler(new SiteContext { SiteFolder = "BasicSite" });

    // The page limit stays last so the properties are assigned in the original order.
    var settings = new CrawlSettings
    {
        RequestProcessor = GetLoggedRequestProcessor(),
        RequestProcessorOptions = GetNoDelayRequestProcessorOptions(),
        MaxNumberOfPagesToCrawl = maxPages
    };

    var startUri = new Uri("http://localhost/");
    var result = await crawler.Crawl(startUri, settings);

    Assert.AreEqual(maxPages, result.CrawledUris.Count());
}
/// <summary>
/// Builds a POST <see cref="HttpWebRequest"/> for <c>args.RequestUri</c>, writes
/// <c>args.PostParameters</c> (UTF-8 encoded) to its request stream, and wraps the request in
/// a new <c>RequestContext</c>. The response is not read here.
/// </summary>
/// <param name="settings">Supplies the user-agent string.</param>
/// <param name="args">Supplies the request URI and the POST body.</param>
/// <returns>A <c>RequestContext</c> whose <c>Request</c> is the prepared request.</returns>
public static RequestContext BuildRequestContext(CrawlSettings settings, RequestArgs args)
{
    var context = new RequestContext();

    var targetUri = new Uri(args.RequestUri);
    var webRequest = (HttpWebRequest)WebRequest.Create(targetUri);
    webRequest.Method = "POST";
    webRequest.Accept = "*/*";
    webRequest.UserAgent = settings.CrawConfiguration.UserAgentString;

    // Encode the body and declare its length before opening the request stream.
    byte[] payload = Encoding.UTF8.GetBytes(args.PostParameters);
    webRequest.ContentLength = payload.Length;
    using (Stream body = webRequest.GetRequestStream())
    {
        body.Write(payload, 0, payload.Length);
    }

    context.Request = webRequest;
    return context;
}
///// <summary>
///// Category information
///// </summary>
//public string DataTableNameCategory
//{
//    get { return "CategoryInfo_MT"; }
//}

/// <summary>
/// Constructor: stores the supplied settings, filter and data-operation object on the new
/// instance and creates the <c>guidFilter</c> Bloom filter.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public HuiCongMaterialAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;

    // Constructed with 9000000 (presumably a capacity; confirm against BloomFilter).
    this.guidFilter = new BloomFilter<string>(9000000);
}
///// <summary>
///// Category information
///// </summary>
//public string DataTableNameCategory
//{
//    get { return "CategoryInfo_MT"; }
//}

/// <summary>
/// Constructor: stores the supplied settings, filter and data-operation object on the new
/// instance and creates the <c>guidFilter</c> Bloom filter.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public DoctorHospitalDetailAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;

    // Constructed with 9000000 (presumably a capacity; confirm against BloomFilter).
    this.guidFilter = new BloomFilter<string>(9000000);
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public MiEggListCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
// URLs that have no county/city.
List<BsonDocument> allLandUrlList = new List<BsonDocument>();

/// <summary>
/// Constructor: stores the supplied settings, filter and data-operation object on the new
/// instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public LandFangXYInfoUpdateEXCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public SiMuListCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    // Member and parameter names differ, so no "this." qualifier is needed.
    Settings = _Settings;
    filter = _filter;
    dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public SoHuBuildingDetailCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public LandFangCityRegionUpdateCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public FangProjectDetailCrawler_NanNing(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
// URLs that have no county/city.
List<BsonDocument> allLandUrlList = new List<BsonDocument>();

/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public ProfileCompany56JobDetailCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public QCCEnterpriseCrawler_FOSHAN(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public QCCEnterpriseDetailInfoAnalyse(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Constructor: stores the supplied settings, filter and data-operation object on the new
/// instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public PhantomJSCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
// URLs that have no county/city.
List<BsonDocument> allLandUrlList = new List<BsonDocument>();

/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public DistrictLeaderCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
///// <summary>
///// Category information
///// </summary>
//public string DataTableNameCategory
//{
//    get { return "CategoryInfo_MT"; }
//}

/// <summary>
/// Constructor: stores the supplied settings, filter and data-operation object on the new
/// instance and creates the <c>guidFilter</c> Bloom filter.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public MeiTuCityBusinessEnterpriseAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;

    // Constructed with 9000000 (presumably a capacity; confirm against BloomFilter).
    this.guidFilter = new BloomFilter<string>(9000000);
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public MeiTuCityBusinessDistinctAPPCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Returns the view for this action with a newly created <c>CrawlSettings</c> instance as
/// its model.
/// </summary>
/// <returns>The result produced by <c>View</c>.</returns>
public ActionResult CrawlSettings()
{
    return View(new CrawlSettings());
}
/// <summary>
/// Stores the supplied settings, filter and data-operation object on the new instance.
/// </summary>
/// <param name="_Settings">Crawl settings assigned to <c>Settings</c>.</param>
/// <param name="_filter">String Bloom filter assigned to <c>filter</c>.</param>
/// <param name="_dataop">Data-operation object assigned to <c>dataop</c>.</param>
public PlantDetailInfoCrawler(CrawlSettings _Settings, BloomFilter<string> _filter, DataOperation _dataop)
{
    this.Settings = _Settings;
    this.filter = _filter;
    this.dataop = _dataop;
}
/// <summary>
/// Chains to the constructor that takes <c>setting.CrawConfiguration</c>, then stores
/// <c>setting.CookieContainer</c> in <c>_cookieContainer</c>.
/// </summary>
/// <param name="setting">
/// Crawl settings providing the configuration and the cookie container. It is dereferenced
/// without a null check.
/// </param>
public AuthenticPageRequester(CrawlSettings setting)
    : this(setting.CrawConfiguration)
{
    this._cookieContainer = setting.CookieContainer;
}