public void When_HtmlHasSeveralHyperlinks_Then_ReturnSeveralChildrenNodes()
        {
            // Arrange: the domain base is arranged with three nodes; each of those
            // is arranged with an empty node list.
            var rootLinks = new List<Node> { HomeNode, AboutNode, ContactNode };
            ArrangeMocks(DomainBase, rootLinks);
            ArrangeMocks(HomeNode.Uri, new List<Node>());
            ArrangeMocks(AboutNode.Uri, new List<Node>());
            ArrangeMocks(ContactNode.Uri, new List<Node>());
            var crawler = new WebCrawler(_factoryMock.Object, _httpMock.Object);

            // Act
            var tree = crawler.Crawl(DomainBase);

            // Assert: three children, in the same order as they were arranged.
            var children = tree.Nodes;
            Assert.AreEqual(3, children.Count);
            Assert.AreEqual(HomeNode.Uri, children[0].Uri);
            Assert.AreEqual(AboutNode.Uri, children[1].Uri);
            Assert.AreEqual(ContactNode.Uri, children[2].Uri);
        }
        public void When_PageContainsVisitedNode_Then_DontRepeatVisitedNode()
        {
            // Arrange: the domain base is arranged with Home and About, and each of
            // those two is arranged with BaseNode as its only entry.
            var rootLinks = new List<Node> { HomeNode, AboutNode };
            ArrangeMocks(DomainBase, rootLinks);
            ArrangeMocks(HomeNode.Uri, new List<Node> { BaseNode });
            ArrangeMocks(AboutNode.Uri, new List<Node> { BaseNode });
            var crawler = new WebCrawler(_factoryMock.Object, _httpMock.Object);

            // Act
            var tree = crawler.Crawl(DomainBase);

            // Assert: two top-level children, and the single child under each of
            // them has no children of its own.
            Assert.AreEqual(2, tree.Nodes.Count);

            var home = tree.Nodes[0];
            Assert.AreEqual(HomeNode.Uri, home.Uri);

            var about = tree.Nodes[1];
            Assert.AreEqual(AboutNode.Uri, about.Uri);

            Assert.AreEqual(0, home.Nodes[0].Nodes.Count);
            Assert.AreEqual(0, about.Nodes[0].Nodes.Count);
        }
Esempio n. 3
0
 /// <summary>
 /// Wires up the crawling filter, stats collector and result writer, then runs
 /// a single crawl of the job list URL, passing the CSV path to the crawler.
 /// </summary>
 static void Main(string[] args)
 {
     const string startAddress = "https://www.xn--jobbrse-d1a.com/list/jobtitle/";
     const string outputPath = @"c:\temp\WalterResult.csv";

     ICrawlingFilterDetail filterDetail = new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");
     ICrawlingStats stats = new XingCrawlingStats(new[] { "jobdetail" }, filterDetail);
     IResultWriter writer = new ResultWriter(stats);

     var crawler = new WebCrawler(stats, writer, new Clock());

     // The value returned by Crawl was never read, so it is not stored.
     crawler.Crawl(new Uri(startAddress), outputPath);
 }
Esempio n. 4
0
        /// <summary>
        /// Reads the comma-separated "Seeds" app setting, crawls from those URLs,
        /// then ranks and prints the analyzer's results.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        /// <exception cref="System.InvalidOperationException">
        /// The "Seeds" app setting is missing or blank.
        /// </exception>
        static void Main(string[] args)
        {
            // AppSettings returns null for a missing key; fail with a clear message
            // instead of a NullReferenceException on Split.
            string seedsSetting = ConfigurationManager.AppSettings["Seeds"];
            if (string.IsNullOrWhiteSpace(seedsSetting))
            {
                throw new System.InvalidOperationException(
                    "The \"Seeds\" app setting is missing or empty; expected a comma-separated list of seed URLs.");
            }

            // Trim each entry and drop blanks so "a, b," yields exactly "a" and "b".
            var seedUrls = seedsSetting
                .Split(new[] { ',' }, System.StringSplitOptions.RemoveEmptyEntries)
                .Select(seed => seed.Trim())
                .Where(seed => seed.Length > 0)
                .ToList();

            StockPageAnalyzer analyzer = new StockPageAnalyzer();
            WebCrawler        crawler  = new WebCrawler(seedUrls, analyzer);

            crawler.Crawl();
            analyzer.RankAll();
            analyzer.PrintRanked();
        }
        public void When_HtmlDoesntHaveHyperlinks_Then_ReturnNoChildrenNodes()
        {
            // Arrange: the domain base is arranged with an empty node list.
            var noLinks = new List<Node>();
            ArrangeMocks(DomainBase, noLinks);
            var crawler = new WebCrawler(_factoryMock.Object, _httpMock.Object);

            // Act
            var tree = crawler.Crawl(DomainBase);

            // Assert: the crawl result has no children.
            Assert.AreEqual(0, tree.Nodes.Count);
        }
        public void When_DomainIsNotFound_Then_ReturnNoChildrenNodes()
        {
            // Arrange: uses the long ArrangeMocks overload with a false flag, an
            // empty string and an empty node list for the domain base.
            var noLinks = new List<Node>();
            ArrangeMocks(DomainBase, false, string.Empty, DomainBase, noLinks);
            var crawler = new WebCrawler(_factoryMock.Object, _httpMock.Object);

            // Act
            var tree = crawler.Crawl(DomainBase);

            // Assert: the crawl result has no children.
            Assert.AreEqual(0, tree.Nodes.Count);
        }
Esempio n. 7
0
        public void Check_For_Deadlinks()
        {
            // Host and its synonyms come from environment variables; when they are
            // unset or blank, fall back to the public site and no synonyms.
            var configuredHost     = Environment.GetEnvironmentVariable("Red-Folder_Tests_Acceptance_Host");
            var configuredSynonyms = Environment.GetEnvironmentVariable("Red-Folder_Tests_Acceptance_Host_Synonyms");

            string targetHost = configuredHost;
            if (string.IsNullOrWhiteSpace(configuredHost))
            {
                targetHost = "https://www.red-folder.com";
            }

            List<string> synonyms;
            if (string.IsNullOrWhiteSpace(configuredSynonyms))
            {
                synonyms = new List<string>();
            }
            else
            {
                // Synonyms are supplied as a ';'-separated list.
                synonyms = configuredSynonyms.Split(';').ToList();
            }

            var request = new CrawlRequest
            {
                Host         = targetHost,
                HostSynonyms = synonyms
            };

            var crawler = new WebCrawler();
            crawler.Crawl(request);

            // The crawler's string form is used as the failure message.
            Assert.True(crawler.Valid(), crawler.ToString());
        }
        public void When_HtmlHasOneHyperlinks_Then_ReturnOneChildNode()
        {
            // Arrange: the domain base is arranged with HomeNode only, and HomeNode
            // is arranged with an empty node list.
            var rootLinks = new List<Node> { HomeNode };
            ArrangeMocks(DomainBase, rootLinks);
            ArrangeMocks(HomeNode.Uri, new List<Node>());
            var crawler = new WebCrawler(_factoryMock.Object, _httpMock.Object);

            // Act
            var tree = crawler.Crawl(DomainBase);

            // Assert: exactly one child, carrying HomeNode's URI.
            var children = tree.Nodes;
            Assert.AreEqual(1, children.Count);
            Assert.AreEqual(HomeNode.Uri, children[0].Uri);
        }
Esempio n. 9
0
        /// <summary>
        /// Registers areas, routes and the Web API configuration, then starts a
        /// timer whose Elapsed handler starts a crawl unless one is in progress.
        /// </summary>
        protected void Application_Start()
        {
            AreaRegistration.RegisterAllAreas();
            RouteConfig.RegisterRoutes(RouteTable.Routes);

            GlobalConfiguration.Configure(WebApiConfig.Register);

            var crawler   = new WebCrawler(25);
            var scheduler = new Timer(10000);

            scheduler.Elapsed += (sender, elapsedArgs) =>
            {
                // Skip this tick while the crawler reports that it is still crawling.
                if (!crawler.IsCrawling)
                {
                    crawler.Crawl();
                }
            };
            scheduler.Start();
        }
Esempio n. 10
0
        /// <summary>
        /// Crawls <paramref name="url"/> and extracts every IP/port pair found in
        /// the crawled text.
        /// </summary>
        /// <param name="url">Address handed to <c>WebCrawler.Crawl</c>.</param>
        /// <returns>
        /// One <c>Proxy</c> per match that splits cleanly into an IP and a port.
        /// Empty when the crawl did not complete or nothing matched.
        /// </returns>
        private static IEnumerable <Proxy> getProxiesFromURL(string url)
        {
            List <Proxy> proxies = new List <Proxy>();

            WebCrawler crawler = new WebCrawler();

            crawler.Crawl(url);

            // No completed crawl means no text to scan. Previously the text stayed
            // null in this case and the Replace call below threw.
            if (!crawler.IsCrawlingCompleted)
            {
                return(proxies);
            }

            // Concatenation turns a null CrawledText into an empty string.
            string urlText = string.Empty + crawler.CrawledText;

            //IP:PORT pattern (whitespaces between ip and port are ignored)
            //TODO: Can find a better regex
            string pattern =
                @"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s{0,}[\:*\s{0,}]([0-9]{1,5}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])";

            // Line breaks are stripped before matching.
            string refinedSourceCode = urlText.Replace("\n", "").Replace("\r", "");
            MatchCollection matchCollection = Regex.Matches(refinedSourceCode, pattern);

            char[] separators = { ' ', ':' };

            //TODO: Should be multi-threaded
            foreach (Match match in matchCollection)
            {
                // Split IP and PORT info. Empty entries are removed so that inputs
                // such as "1.2.3.4 :80" or "1.2.3.4: 80" yield the real port rather
                // than an empty string in the second slot.
                string[] parts = match.Value.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                // Skip anything that did not split into exactly IP + port; this
                // replaces the former catch-all that hid the index error.
                if (parts.Length != 2)
                {
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.IP   = parts[0];
                proxy.Port = parts[1];
                //TODO: Check socks type and timeout - if it is live (set ProxyType)
                proxies.Add(proxy);
            }
            return(proxies);
        }
        /// <summary>
        /// Recreates the result database file, crawls two university listings and
        /// prints every crawled record; records from the first listing are also
        /// inserted through <c>ICU4Repository</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var dbFile = "resultData.db";

            // Start from a fresh database file on every run.
            if (File.Exists(dbFile))
            {
                File.Delete(dbFile);
            }
            using (IUnitOfWork uow = new SQLiteUnitOfwork(dbFile))
            {
                var list = WebCrawler.Crawl <ICU4Crawler>("https://www.4icu.org/de/a-z/", "/html/body/div[2]/div/div[2]/div/table/tbody/tr");
                var rep1 = new ICU4Repository(uow);

                // The former try/catch only rethrew, so exceptions now propagate
                // directly; the using block still disposes the unit of work.
                uow.BeginTrans();
                foreach (dynamic d in list)
                {
                    System.Console.WriteLine($@"{d.UniId} - {d.UniName} - {d.City} - {d.Rank} - {d.Acronym} - {d.Founded} 
--------------------------------------------------------------
- {d.Address} - {d.Tel} - {d.Fax} - {d.TuitionLclStdnB} - {d.TuitionIntStdnB} - {d.TuitionLclStdnM} - {d.TuitionIntStdnM}
--------------------------------------------------------------
- {d.Gender} - {d.InterNlStdn} - {d.SelectionType} - {d.StudentEnrollment} - {d.AcademicStaff} - {d.ControlType} 
--------------------------------------------------------------
- {d.EntityType} - {d.AcademicCalendar} - {d.CampusSetting} - {d.ReligiousAffiliation} - {d.Library} - {d.Housing} 
--------------------------------------------------------------
- {d.SportFacilities} - {d.FinancialAids} - {d.StudyAbroad} - {d.DistanceLearning}
--------------------------------------------------------------
- Graduate  - {d.GraduateB} - {d.GraduateM} - {d.GraduateP} - {d.ArtsHumanitiesB} - {d.ArtsHumanitiesM} - {d.ArtsHumanitiesP}
--------------------------------------------------------------
- Graduate  - {d.BusinessSocialB} - {d.BusinessSocialM} - {d.BusinessSocialP} - {d.LanguageCulturalB} - {d.LanguageCulturalM} - {d.LanguageCulturalP}
--------------------------------------------------------------
- Graduate  - {d.MedicineHealthB} - {d.MedicineHealthM} - {d.MedicineHealthP} - {d.EngineeringB} - {d.EngineeringM} - {d.EngineeringP}
--------------------------------------------------------------
- Graduate  - {d.ScienceTechnologyB} - {d.ScienceTechnologyM} - {d.ScienceTechnologyP}
--------------------------------------------------------------
==============================================================");
                    rep1.InsertUni(d);
                }

                // NOTE(review): this string of distinct city names is built but
                // never read afterwards - confirm whether it should be written
                // somewhere or removed.
                var s = "let cities=[\"" + string.Join("\",\"", ((IEnumerable <dynamic>)list).Select(new Func <dynamic, string>(a => a.City?.Replace(" ...", ""))).Distinct());

                s += "\"];";
                uow.Commit();

                list = WebCrawler.Crawl <IndiaeducationCrawler>("https://www.indiaeducation.net/studyabroad/germany/list-of-universities-a-f.aspx",
                                                                "/div[@id=\"artBody\"]/ul[]/li[]/a");
                var rep2 = new IndiaeducationRepository(uow);

                uow.BeginTrans();
                foreach (dynamic d in list)
                {
                    // Records from this source are only printed; the insert is disabled.
                    //rep2.InsertUni(d);
                    System.Console.WriteLine($"{d.UniId} - {d.UniName} - {d.City} - {d.Rank}  - {d.Founded}");
                }
                uow.Commit();
            }
        }