Example #1
static void Main(string[] args)
{
    // Match pages whose "jobdetail-iframe" element has a src attribute pointing to /jobdetail.
    ICrawlingFilterDetail crawlingFilterDetail = new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");

    // Collect statistics for crawled pages whose URL contains the "jobdetail" keyword.
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, crawlingFilterDetail);
    IResultWriter resultWriter = new ResultWriter(crawlingStats);

    // Crawl the job listing and write the collected results to a CSV file.
    var walter = new WebCrawler(crawlingStats, resultWriter, new Clock());
    var result = walter.Crawl(new Uri("https://www.xn--jobbrse-d1a.com/list/jobtitle/"), @"c:\temp\WalterResult.csv");
}
Example #2
public void count_pages_containing_specific_keywords()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var page = new CrawledPage(new Uri("http://a.com/jobdetail"));

    crawlingStats.ProcessCrawledPage(page);

    // The page URL contains the "jobdetail" keyword, so it is counted once.
    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}
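
Examples #2 and #3 reference a _crawlingFilterDetail field that is not part of the snippets. A minimal sketch of how such a fixture field could be initialized, assuming the same constructor arguments as in Example #4 (the actual test class may set it up differently):

// Hypothetical fixture field: the filter detail used by the tests above.
// The constructor arguments mirror Example #4; the real setup may differ.
private readonly ICrawlingFilterDetail _crawlingFilterDetail =
    new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");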
Example #3
public void ignore_duplicated_pages()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var page = new CrawledPage(new Uri("https://www.xn--jobbrse-d1a.com/jobdetail/?rid=101496772&qid=36120&fid=97&_uri=am9idGl0bGU9TWFya2V0aW5nJnJhZGl1cz0xMCZjb3VudHJ5PSZjYXRlZ29yeT0mYWdlbmN5PTAmY2FyZWVyPSZwYXJ0dGltZT0wJnNvcnQ9ZGF0ZSZwYWdlPTEmcnBwPTEwJmRhdGU9JnFkYXRlPTIwMTYtMDItMjImam9iaWQ9MSZ0b3RhbD0yNzI1Mw=="));

    // Processing the same page twice must not inflate the count.
    crawlingStats.ProcessCrawledPage(page);
    crawlingStats.ProcessCrawledPage(page);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}
Example #4
public void count_pages_containing_specific_content_with_specific_filter_detail()
{
    ICrawlingFilterDetail crawlingFilterDetail = new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, crawlingFilterDetail);

    // Load a saved job-detail page so the filter detail can inspect its HTML content.
    CrawledPage page = new CrawledPage(new Uri("http://a.com/jobdetail"))
    {
        Content = new PageContent
        {
            Text = GetFileContent("TestPages\\StellenangebotOeffnen.html")
        }
    };

    crawlingStats.ProcessCrawledPage(page);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificDetails);
}
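
GetFileContent is a test helper that is not shown in these examples. A minimal sketch, assuming it simply reads a saved HTML page relative to the test output directory (the real helper may resolve paths differently):

// Hypothetical helper (requires System and System.IO): reads a saved HTML test page
// from the test assembly's output directory.
private static string GetFileContent(string relativePath)
{
    var fullPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, relativePath);
    return File.ReadAllText(fullPath);
}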