public void GetPageCrawlerDetailTest()
        {
            // Verifies that PageCrawlDetail extracts the expected number of links
            // from two saved copies of tyre-shopper.co.uk pages (home and about-us).
            // The original also constructed bo.Web.WebPageContent instances that
            // were never read; those unused locals have been removed.
            string fileName = "tyre-shopper.co.uk.home.html";
            string path     = GetUnitTestDataFilePath(fileName);
            string page     = GetFileContents(fileName);
            Uri uri         = new Uri(path);

            bo.Web.PageCrawlDetail pageCrawlDetail = new bo.Web.PageCrawlDetail(uri);
            pageCrawlDetail.PageUri = new Uri("http://www.tyre-shopper.co.uk");
            pageCrawlDetail.LoadContent(page, HttpStatusCode.OK);
            pageCrawlDetail.LoadUris();
            // AreEqual reports expected vs actual on failure, unlike IsTrue(x == n).
            Assert.AreEqual(37, pageCrawlDetail.AllLinks.Count);

            fileName                = "tyre-shopper.co.uk.aboutUs.html";
            path                    = GetUnitTestDataFilePath(fileName);
            page                    = GetFileContents(fileName);
            uri                     = new Uri(path);
            pageCrawlDetail         = new bo.Web.PageCrawlDetail(uri);
            pageCrawlDetail.PageUri = new Uri("http://www.tyre-shopper.co.uk/about-us/");
            pageCrawlDetail.LoadContent(page, HttpStatusCode.OK);
            pageCrawlDetail.LoadUris();
            Assert.AreEqual(22, pageCrawlDetail.AllLinks.Count);
        }
        // ---- Example #2 ----
        private static void ProcessUrl(string url)
        {
            // Crawls a single start URL: validates and fetches it, retrieves the
            // site's robots.txt, then runs the crawler and writes the completed
            // page details to CSV at outputFilePath.
            content = new bo.Web.WebPageContent();
            content.SetRequestFromURL(url);

            if (bo.Helpers.WebHelper.IsUrlValid(url) && content.TryGetResponse())
            {
                // Fetch /robots.txt resolved against the site root.
                Uri baseUri   = new Uri(url);
                Uri robotsUri = new Uri(bo.Helpers.WebHelper.CombineUrl(baseUri, "/robots.txt"));
                content = new bo.Web.WebPageContent();
                content.SetRequestFromURL(robotsUri.AbsoluteUri);
                string responseString = string.Empty;
                if (content.TryGetResponse())
                {
                    responseString = bo.Helpers.WebHelper.ResponseToString(content.response);
                }

                // NOTE(review): when no robots.txt exists, responseString is empty —
                // assumes ParseRobotsTxt treats empty input as "allow all"; confirm.
                bo.Web.RobotsTxt robots = bo.Helpers.RobotsHelper.ParseRobotsTxt(responseString);

                bo.Web.PageCrawlDetail pageDetails = new bo.Web.PageCrawlDetail(baseUri);

                try
                {
                    // Start the crawl and block until it completes.
                    bo.Crawler.CrawlerProcessing crw = new bo.Crawler.CrawlerProcessing(NoOfThreads, SleepTime, baseUri, robots);
                    crw.Crawl(pageDetails).Wait();
                    bo.Helpers.FileHelper.PageDetailToCSV(crw.CrawlList.UrlsCompleted, outputFilePath);
                }
                catch (AggregateException e)
                {
                    // Task.Wait wraps crawl failures in an AggregateException;
                    // surface each inner cause before pausing for the operator.
                    foreach (var ex in e.InnerExceptions)
                    {
                        Console.WriteLine(ex.InnerException);
                    }
                    Console.ReadLine();
                }
            }
            else
            {
                // BUG FIX: the composed message was built but never used — the raw
                // URL was written and logged instead. Emit the informative message.
                string info = "Invalid URL: " + url;
                Console.WriteLine(info);
                log.Info(info);
            }
        }
        public void TestToCSV()
        {
            // Verifies that PageDetailToCSV writes a CSV file for a small list
            // of crawl details.
            string filePath = "testToCSV.csv";
            List<bo.Web.PageCrawlDetail> list = new List<bo.Web.PageCrawlDetail>();

            bo.Web.PageCrawlDetail n1 = new bo.Web.PageCrawlDetail(new Uri("http://www.google.co.uk"));
            n1.PageTitle  = "Google Search Engine";
            n1.StatusCode = System.Net.HttpStatusCode.NotFound;
            list.Add(n1);

            bo.Web.PageCrawlDetail n2 = new bo.Web.PageCrawlDetail(new Uri("http://www.yahoo.co.uk"));
            n2.PageTitle  = "Yahoo Search Engine";
            n2.StatusCode = System.Net.HttpStatusCode.OK;
            list.Add(n2);

            bo.Helpers.FileHelper.PageDetailToCSV(list, filePath);

            // BUG FIX: the File.Exists result was discarded, so the test asserted
            // nothing. Assert that the CSV file was actually created.
            Assert.IsTrue(File.Exists(filePath));
        }
        public void CrawlListUnitTest()
        {
            // Exercises CrawlList de-duplication plus robots.txt and same-domain
            // filtering of candidate URLs.
            bo.Web.RobotsTxt robots = new bo.Web.RobotsTxt(new Uri("http://www.tyre-shopper.co.uk"));
            robots.DisallowedList.Add("/type-type");
            bo.Crawler.CrawlList list = new bo.Crawler.CrawlList(robots);

            // NOTE(review): "BastUri" looks like a typo for "BaseUri" in the
            // RobotsTxt class — renaming the property is outside this file's scope.
            bo.Web.PageCrawlDetail page1 = new bo.Web.PageCrawlDetail(robots.BastUri);

            // A fresh URL is accepted once, then rejected as a duplicate.
            Assert.IsTrue(list.UrlToBeAdded(page1));
            list.AddUrl(page1);
            Assert.IsFalse(list.UrlToBeAdded(page1));

            // Dequeue the pending page and mark it completed; it must still be
            // rejected. Only the side effect of GetNext matters here, so the
            // previously unused local has been removed.
            list.GetNext();
            list.UrlsCompleted.Add(page1);
            Assert.IsFalse(list.UrlToBeAdded(page1));

            // URLs under a robots.txt-disallowed path are rejected.
            bo.Web.PageCrawlDetail page3 = new bo.Web.PageCrawlDetail(new Uri("http://www.tyre-shopper.co.uk/type-type"));
            Assert.IsFalse(list.UrlToBeAdded(page3));

            // URLs outside the crawl's base domain are rejected.
            bo.Web.PageCrawlDetail page4 = new bo.Web.PageCrawlDetail(new Uri("http://www.facebook.com"));
            Assert.IsFalse(list.UrlToBeAdded(page4));
        }