public void When_HtmlHasSeveralHyperlinks_Then_ReturnSeveralChildrenNodes()
{
    // Arrange: the root page links to three pages, each of which is a leaf.
    ArrangeMocks(DomainBase, new List<Node> { HomeNode, AboutNode, ContactNode });
    ArrangeMocks(HomeNode.Uri, new List<Node>());
    ArrangeMocks(AboutNode.Uri, new List<Node>());
    ArrangeMocks(ContactNode.Uri, new List<Node>());
    var sut = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var root = sut.Crawl(DomainBase);

    // Assert: every hyperlink becomes a direct child, in discovery order.
    Assert.AreEqual(3, root.Nodes.Count);
    Assert.AreEqual(HomeNode.Uri, root.Nodes[0].Uri);
    Assert.AreEqual(AboutNode.Uri, root.Nodes[1].Uri);
    Assert.AreEqual(ContactNode.Uri, root.Nodes[2].Uri);
}
public void When_PageContainsVisitedNode_Then_DontRepeatVisitedNode()
{
    // Arrange: both child pages link back to the already-visited base node,
    // which must not be crawled a second time.
    ArrangeMocks(DomainBase, new List<Node> { HomeNode, AboutNode });
    ArrangeMocks(HomeNode.Uri, new List<Node> { BaseNode });
    ArrangeMocks(AboutNode.Uri, new List<Node> { BaseNode });
    var sut = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var root = sut.Crawl(DomainBase);

    // Assert: the two children are present, and the back-links to the base
    // node are recorded but not expanded again (no grandchildren under them).
    Assert.AreEqual(2, root.Nodes.Count);
    Assert.AreEqual(HomeNode.Uri, root.Nodes[0].Uri);
    Assert.AreEqual(AboutNode.Uri, root.Nodes[1].Uri);
    Assert.AreEqual(0, root.Nodes[0].Nodes[0].Nodes.Count);
    Assert.AreEqual(0, root.Nodes[1].Nodes[0].Nodes.Count);
}
/// <summary>
/// Entry point: wires the crawling pipeline (filter -> stats -> writer) and
/// crawls the job listing site, writing the results to a CSV file.
/// </summary>
static void Main(string[] args)
{
    ICrawlingFilterDetail crawlingFilterDetail = new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, crawlingFilterDetail);
    IResultWriter resultWriter = new ResultWriter(crawlingStats);

    var walter = new WebCrawler(crawlingStats, resultWriter, new Clock());

    // Fix: the return value is not consumed here; discard it explicitly
    // instead of binding it to an unused local (the original 'result'
    // produced an unused-variable warning).
    _ = walter.Crawl(new Uri("https://www.xn--jobbrse-d1a.com/list/jobtitle/"), @"c:\temp\WalterResult.csv");
}
static void Main(string[] args)
{
    // Seed URLs come from app config; "Seeds" is a comma-separated list.
    var seedUrls = ConfigurationManager.AppSettings["Seeds"].Split(',').ToList();

    var analyzer = new StockPageAnalyzer();
    var crawler = new WebCrawler(seedUrls, analyzer);

    // Crawl every seed, then rank and print what was collected.
    crawler.Crawl();
    analyzer.RankAll();
    analyzer.PrintRanked();
}
public void When_HtmlDoesntHaveHyperlinks_Then_ReturnNoChildrenNodes()
{
    // Arrange: the root page contains no hyperlinks at all.
    ArrangeMocks(DomainBase, new List<Node>());
    var sut = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var root = sut.Crawl(DomainBase);

    // Assert: a link-free page yields a childless root node.
    Assert.AreEqual(0, root.Nodes.Count);
}
public void When_DomainIsNotFound_Then_ReturnNoChildrenNodes()
{
    // Arrange: the HTTP mock reports the domain as unreachable (success flag
    // false, empty body), so there is nothing to parse.
    ArrangeMocks(DomainBase, false, string.Empty, DomainBase, new List<Node>());
    var sut = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var root = sut.Crawl(DomainBase);

    // Assert: an unreachable domain yields a childless root node.
    Assert.AreEqual(0, root.Nodes.Count);
}
public void Check_For_Deadlinks()
{
    // Arrange: host and its synonyms can be overridden via environment
    // variables; fall back to the production host when unset.
    var host = Environment.GetEnvironmentVariable("Red-Folder_Tests_Acceptance_Host");
    var hostSynonyms = Environment.GetEnvironmentVariable("Red-Folder_Tests_Acceptance_Host_Synonyms");

    var crawlRequest = new CrawlRequest();
    crawlRequest.Host = string.IsNullOrWhiteSpace(host) ? "https://www.red-folder.com" : host;
    crawlRequest.HostSynonyms = string.IsNullOrWhiteSpace(hostSynonyms)
        ? new List<string>()
        : hostSynonyms.Split(';').ToList();

    // Act
    var sut = new WebCrawler();
    sut.Crawl(crawlRequest);

    // Assert: the crawl found no dead links; ToString() carries the details
    // on failure.
    Assert.True(sut.Valid(), sut.ToString());
}
public void When_HtmlHasOneHyperlinks_Then_ReturnOneChildNode()
{
    // Arrange: the root page links to a single leaf page.
    ArrangeMocks(DomainBase, new List<Node> { HomeNode });
    ArrangeMocks(HomeNode.Uri, new List<Node>());
    var sut = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var root = sut.Crawl(DomainBase);

    // Assert: exactly one child, and it is the linked page.
    Assert.AreEqual(1, root.Nodes.Count);
    Assert.AreEqual(HomeNode.Uri, root.Nodes[0].Uri);
}
protected void Application_Start()
{
    AreaRegistration.RegisterAllAreas();
    RouteConfig.RegisterRoutes(RouteTable.Routes);
    GlobalConfiguration.Configure(WebApiConfig.Register);

    // Kick off a crawl every 10 seconds, skipping ticks while a previous
    // crawl is still in progress.
    // NOTE(review): both the crawler and the timer are captured only by this
    // closure — confirm they stay reachable for the application's lifetime.
    var crawler = new WebCrawler(25);
    var timer = new Timer(10000);
    timer.Elapsed += (sender, eventArgs) =>
    {
        if (!crawler.IsCrawling)
        {
            crawler.Crawl();
        }
    };
    timer.Start();
}
/// <summary>
/// Downloads the given page and scrapes anything that looks like an
/// "IP:PORT" pair out of its text.
/// </summary>
/// <param name="url">Page to crawl and scan for proxy entries.</param>
/// <returns>Parsed proxies; empty when the crawl fails or nothing matches.</returns>
private static IEnumerable <Proxy> getProxiesFromURL(string url)
{
    List <Proxy> proxies = new List <Proxy>();

    WebCrawler crawler = new WebCrawler();
    crawler.Crawl(url);

    // Bug fix: 'urlText' used to start as null and was only appended to when
    // the crawl completed, so urlText.Replace(...) below threw a
    // NullReferenceException whenever the crawl did not finish. Starting from
    // an empty string makes a failed crawl simply yield no matches.
    string urlText = string.Empty;
    if (crawler.IsCrawlingCompleted)
    {
        urlText += crawler.CrawledText;
    }

    //IP:PORT pattern (whitespaces between ip and port are ignored)
    //TODO: Can find a better regex
    string pattern = @"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s{0,}[\:*\s{0,}]([0-9]{1,5}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])";

    // Strip line breaks so a pair split across lines can still match.
    string refinedSourceCode = urlText.Replace("\n", "");
    refinedSourceCode = refinedSourceCode.Replace("\r", "");

    MatchCollection matchCollection = Regex.Matches(refinedSourceCode, pattern);

    //TODO: Should be multi-threaded
    foreach (Match match in matchCollection)
    {
        try
        {
            //Split IP and PORT info
            string[] array = match.Value.Split(new[] { ' ', ':' });
            Proxy proxy = new Proxy();
            proxy.IP = array[0];
            proxy.Port = array[1];
            //TODO: Check socks type and timeout - if it is live (set ProxyType)
            proxies.Add(proxy);
        }
        //Deliberate best-effort: ignore entries that fail to parse.
        catch (Exception)
        {
        }
    }

    return proxies;
}
// Entry point: rebuilds resultData.db from scratch by scraping two
// university listing sites and inserting the scraped rows via their
// repositories, each inside its own transaction.
static void Main(string[] args)
{
    // Start from a clean database file on every run.
    var dbFile = "resultData.db";
    if (File.Exists(dbFile)) { File.Delete(dbFile); }
    using (IUnitOfWork uow = new SQLiteUnitOfwork(dbFile))
    {
        // First source: 4icu.org — one crawled item per table row matched by
        // the XPath expression.
        var list = WebCrawler.Crawl <ICU4Crawler>("https://www.4icu.org/de/a-z/", "/html/body/div[2]/div/div[2]/div/table/tbody/tr");
        var rep1 = new ICU4Repository(uow);
        try
        {
            uow.BeginTrans();
            foreach (dynamic d in list)
            {
                // Dump every scraped field to the console before inserting,
                // for manual inspection of the crawl output.
                System.Console.WriteLine($@"{d.UniId} - {d.UniName} - {d.City} - {d.Rank} - {d.Acronym} - {d.Founded} -------------------------------------------------------------- - {d.Address} - {d.Tel} - {d.Fax} - {d.TuitionLclStdnB} - {d.TuitionIntStdnB} - {d.TuitionLclStdnM} - {d.TuitionIntStdnM} -------------------------------------------------------------- - {d.Gender} - {d.InterNlStdn} - {d.SelectionType} - {d.StudentEnrollment} - {d.AcademicStaff} - {d.ControlType} -------------------------------------------------------------- - {d.EntityType} - {d.AcademicCalendar} - {d.CampusSetting} - {d.ReligiousAffiliation} - {d.Library} - {d.Housing} -------------------------------------------------------------- - {d.SportFacilities} - {d.FinancialAids} - {d.StudyAbroad} - {d.DistanceLearning} -------------------------------------------------------------- - Graduate - {d.GraduateB} - {d.GraduateM} - {d.GraduateP} - {d.ArtsHumanitiesB} - {d.ArtsHumanitiesM} - {d.ArtsHumanitiesP} -------------------------------------------------------------- - Graduate - {d.BusinessSocialB} - {d.BusinessSocialM} - {d.BusinessSocialP} - {d.LanguageCulturalB} - {d.LanguageCulturalM} - {d.LanguageCulturalP} -------------------------------------------------------------- - Graduate - {d.MedicineHealthB} - {d.MedicineHealthM} - {d.MedicineHealthP} - {d.EngineeringB} - {d.EngineeringM} - {d.EngineeringP} -------------------------------------------------------------- - Graduate - {d.ScienceTechnologyB} - {d.ScienceTechnologyM} - {d.ScienceTechnologyP} -------------------------------------------------------------- 
==============================================================");
                rep1.InsertUni(d);
            }
            // Builds a JavaScript snippet listing the distinct city names.
            // NOTE(review): 's' is assigned but never read afterwards — looks
            // like leftover debugging code; confirm before removing.
            var s = "let cities=[\"" + string.Join("\",\"", ((IEnumerable <dynamic>)list).Select(new Func <dynamic, string>(a => a.City?.Replace(" ...", ""))).Distinct());
            s += "\"];";
            uow.Commit();
        }
        catch (System.Exception ex)
        {
            // NOTE(review): bare rethrow with no rollback or logging, and 'ex'
            // is unused — consider removing the catch or handling the error.
            throw;
        }
        // Second source: indiaeducation.net (Germany, universities A-F).
        list = WebCrawler.Crawl <IndiaeducationCrawler>("https://www.indiaeducation.net/studyabroad/germany/list-of-universities-a-f.aspx", "/div[@id=\"artBody\"]/ul[]/li[]/a");
        var rep2 = new IndiaeducationRepository(uow);
        try
        {
            uow.BeginTrans();
            foreach (dynamic d in list)
            {
                // Insert is currently disabled for this source; rows are only
                // printed for inspection.
                //rep2.InsertUni(d);
                System.Console.WriteLine($"{d.UniId} - {d.UniName} - {d.City} - {d.Rank} - {d.Founded}");
            }
            uow.Commit();
        }
        catch (System.Exception ex)
        {
            // NOTE(review): same bare rethrow pattern as above; 'ex' unused.
            throw;
        }
    }
    //var rep = new UniRepository("CollectData.sqlite");
    //using (var uow=rep.CreateUOW())
    //{
    //    uow.BeginTrans();
    //    rep.InsertUni(new { UniName="Test Uni 1", Rank=1, City="Test City" });
    //    uow.Commit();
    //    uow.BeginTrans();
    //    rep.InsertUni(new { UniName="Test Uni 1", Rank=1, City="Test City" });
    //    uow.Commit();
    //}
}