Example #1
        public void TestChainTwoLinks()
        {
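            // Arrange: a strict IWebInteractor mock serves three pages that form a
            // two-link chain (root -> /csse.html -> /abbe.html -> page with no links).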
            MockRepository mocks = new MockRepository();
            var web = mocks.StrictMock<IWebInteractor>();
            var mockFSI = MockRepository.GenerateStub<IFileSystemInteractor>();

            CrawlResult testResult = new CrawlResult();
            testResult.ReturnCode = 200;
            testResult.ReturnStatus = "OK";
            testResult.Html = "href=\"/csse.html\"";

            var resultTwo = new CrawlResult();
            resultTwo.ReturnCode = 200;
            resultTwo.ReturnStatus = "OK";
            resultTwo.Html = "href=\"/abbe.html\"";

            var resultThree = new CrawlResult();
            resultThree.ReturnCode = 200;
            resultThree.ReturnStatus = "OK";
            resultThree.Html = "";

            Expect.On(web).Call(web.GetPage("www.test.com")).Return(testResult);
            Expect.On(web).Call(web.GetPage("www.test.com/csse.html")).Return(resultTwo);
            Expect.On(web).Call(web.GetPage("www.test.com/abbe.html")).Return(resultThree);

            mocks.ReplayAll();

            Bot b = new Bot(new Website("www.test.com","simplepath"),null,null,web,mockFSI);
            List<CrawlResult> results = b.CrawlSite(2);

            mocks.VerifyAll();

            Assert.AreEqual(3,results.Count);
        }
Example #2
        static void Main(string[] args)
        {
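            // Create a polite crawler with its default configuration and subscribe to its
            // crawl lifecycle events; the handlers themselves are defined elsewhere in this class.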
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            // Crawl() is synchronous; it does not return until the crawl has completed.
            CrawlResult result = crawler.Crawl(new Uri("https://www.goodreads.com/quotes/tag/motivation?page=2"));

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            Console.ReadLine();
        }
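The four event handlers wired up above are not defined in this example. A minimal sketch of two of them, following the handler shape shown in the Abot 1.x README (PageCrawlStartingArgs / PageCrawlCompletedArgs, with HttpStatusCode coming from System.Net), might look like the following; treat the exact property accesses as assumptions if you are on a newer Abot release:

        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // Announce each page just before Abot requests it.
            Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
        }

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Report success or failure for each crawled page.
            CrawledPage crawledPage = e.CrawledPage;
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page {0} failed", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page {0} succeeded", crawledPage.Uri.AbsoluteUri);
        }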
Example #3
        void testWebAbot()
        {
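            // Build a CrawlConfiguration that tunes timeouts, concurrency, crawl limits, and the
            // user-agent string, then run a synchronous crawl of the URL entered in textBox1.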
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 50;
            crawlConfig.MaxPagesToCrawl      = 3000;
            crawlConfig.UserAgentString      = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko";
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
            crawlConfig.IsSendingCookiesEnabled         = false;
            crawlConfig.HttpServicePointConnectionLimit = 200;
            crawlConfig.MaxCrawlDepth   = 10000;
            crawlConfig.IsExternalPageCrawlingEnabled = true;

            crawlConfig.MaxPagesToCrawlPerDomain = 10000;

            crawlConfig.MaxLinksPerPage = 0;
            crawlConfig.IsHttpRequestAutoRedirectsEnabled = true;
            crawlConfig.MaxMemoryUsageInMb = 0;


            var crawler = new PoliteWebCrawler(crawlConfig);

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(textBox1.Text)); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
Example #4
        public CrawlResult GetPage(string url)
        {
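            // Fetch the page over plain HTTP (no auto-redirects) and copy the status code,
            // status text, and raw HTML into a CrawlResult.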
            CrawlResult cr = new CrawlResult();

            int indexOfSlash = url.IndexOf('/');
            string urlExtension = "";
            if (indexOfSlash > -1)
                urlExtension = url.Substring(indexOfSlash);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://" + url);
            request.Method = "GET";
            request.AllowAutoRedirect = false;

            try
            {
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (var reader = new StreamReader(response.GetResponseStream()))
                {
                    var html = reader.ReadToEnd();

                    cr.ReturnCode = (int)response.StatusCode;
                    cr.ReturnStatus = response.StatusDescription;
                    cr.Html = html;
                }
            }
            catch (WebException we)
            {
            //                var code = ((HttpWebResponse)we.Response).StatusCode;
            //                cr.ReturnCode = (int) code;
            //                cr.ReturnStatus = code.ToString();
            //                _log.writeError(String.Format(@"Error: {0} ({1}) {2}",((HttpWebResponse)we.Response).ResponseUri, (int)code, code));
                _log.writeError("Error getting site " + url);

            }

            return cr;
        }
Example #5
        public void TestStatus404()
        {
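            // Arrange: the web mock returns a 404 for the root page; the crawl result list
            // should surface that status code unchanged.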
            MockRepository mocks = new MockRepository();
            var web = mocks.StrictMock<IWebInteractor>();
            var site = new Website("www.whocares.com", "whatever");
            var mockFSI = MockRepository.GenerateStub<IFileSystemInteractor>();

            var retVal = new CrawlResult();
            retVal.ReturnCode = 404;
            retVal.Html = @"";

            Expect.On(web).Call(web.GetPage("www.whocares.com")).Return(retVal);
            mocks.ReplayAll();

            var useableBot = new Bot(site, null, null, web, mockFSI);
            var check = useableBot.CrawlSite(1);

            Assert.IsTrue(check[0].ReturnCode == 404);

            mocks.VerifyAll();
        }
Example #6
        public void TestHttpInternalWithWwwLink()
        {
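            // Arrange: the root page links to an absolute http://www.test.com URL; the bot
            // should treat it as internal and crawl it exactly once.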
            MockRepository mocks = new MockRepository();
            var web = mocks.StrictMock<IWebInteractor>();
            var site = new Website("www.test.com", "whatever");
            var mockFSI = MockRepository.GenerateStub<IFileSystemInteractor>();

            CrawlResult testResult = new CrawlResult();
            testResult.ReturnCode = 200;
            testResult.ReturnStatus = "OK";
            testResult.Html = "href=\"http://www.test.com/csse.html\"";

            var resultTwo = new CrawlResult();
            resultTwo.ReturnCode = 200;
            resultTwo.ReturnStatus = "OK";
            resultTwo.Html = "";

            Expect.On(web).Call(web.GetPage("www.test.com")).Return(testResult);
            Expect.On(web).Call(web.GetPage("www.test.com/csse.html")).Return(resultTwo);

            mocks.ReplayAll();

            Bot b = new Bot(site,null,null,web,mockFSI);
            List<CrawlResult> results = b.CrawlSite(2);

            mocks.VerifyAll();

            Assert.AreEqual(2,results.Count);
        }
Example #7
        public CrawlResult CrawlPage(CrawlPlan plan)
        {
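            // Load plan.AbsoluteUri in a Selenium WebDriver ("driver" and "_pageLoadWait" are
            // members of the containing class), capture the rendered page source, and collect
            // every <a href> link, optionally hovering elements to reveal more of them.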
            var crawlResult = new CrawlResult(plan.AbsoluteUri);


            string driverPageSource;

            //using (IWebDriver driver = new ChromeDriver(new ChromeOptions() { }))
            //{
            //WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(10));

            try
            {
                //todo: disable redirect
                driver.Navigate().GoToUrl(plan.AbsoluteUri);

                if (_pageLoadWait > 0)
                {
                    Thread.Sleep(TimeSpan.FromSeconds(_pageLoadWait));
                }

                driverPageSource = driver.PageSource;
            }
            catch (Exception e)
            {
                Console.WriteLine(e);

                crawlResult.BrowserFailedAt        = DateTime.UtcNow;
                crawlResult.BrowserFailedException = e.ToString();

                //throw;

                return(crawlResult);
            }



            //string html = driver.ExecuteJavaScript<string>("return document.documentElement.outerHTML;");


            //driver.FindElement(By.Name("q")).SendKeys("cheese" + Keys.Enter);
            //IWebElement firstResult = wait.Until(ExpectedConditions.ElementExists(By.TagName("h1")));
            //Console.WriteLine(firstResult.GetAttribute("textContent"));
            //}



            //crawlResult.Doc = doc;
            crawlResult.BrowserCrawledAt = DateTime.UtcNow;
            crawlResult.BrowserContent   = driverPageSource;



            var linkSets = new List <List <string> >();

            //find links
            var linkElements = driver.FindElements(By.XPath("//a[@href]"));

            if (linkElements != null && linkElements.Count > 0)
            {
                linkSets.Add(linkElements.Select(o => o.GetAttribute("href")).ToList());
            }

            //------------------------------------------
            //Some webpages DO NOT show all elements unless interacted with (e.g. hovering).
            //Try hovering and finding more content.
            //------------------------------------------
            var hoverableList = new string[] {
                //"//p[.='Creators']", "//p[.='Learn more']", "//p[.='Crypto Community']"
            };

            if (hoverableList != null && hoverableList.Length > 0)
            {
                var action = new Actions(driver);
                foreach (var hoverableXPath in hoverableList)
                {
                    var hoverableElement = driver.FindElement(By.XPath(hoverableXPath));
                    if (hoverableElement != null)
                    {
                        action.MoveToElement(hoverableElement).Perform();
                        linkElements = driver.FindElements(By.XPath("//a[@href]"));
                        if (linkElements != null && linkElements.Count > 0)
                        {
                            linkSets.Add(linkElements.Select(o => o.GetAttribute("href")).ToList());
                        }
                    }
                }
            }

            var linksToSave = new List <CrawledLink>();

            foreach (var linkSet in linkSets)
            {
                //Console.WriteLine($"\tfound {links.Count} child links.");
                foreach (var href in linkSet)
                {
                    //var href = link.GetAttribute("href");
                    //if(href.Contains("fortnite-stats/")) Debugger.Break();

                    var decoded = HttpUtility.HtmlDecode(href);
                    //if (decoded != href)
                    //    Debugger.Break();

                    if (decoded == "" || decoded.StartsWith("javascript:") || decoded.StartsWith("mailto:") || decoded.StartsWith("skype:"))
                    {
                        continue;
                    }

                    try
                    {
                        var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);

                        //no duplicated links
                        if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri))
                        {
                            linksToSave.Add(new CrawledLink()
                            {
                                AbsoluteUri       = childUri.AbsoluteUri,
                                IsBrowserRequired = true,
                            });
                        }
                    }
                    catch (UriFormatException e) //for malformed uris, just add them to the list without using System.Uri
                    {
                        if (linksToSave.All(o => o.AbsoluteUri != decoded))
                        {
                            linksToSave.Add(new CrawledLink()
                            {
                                AbsoluteUri       = decoded,
                                IsBrowserRequired = true,
                            });
                        }

                        Console.WriteLine(e);
                    }
                }
            }
            crawlResult.LinkAbsoluteUris = linksToSave;

            return(crawlResult);
        }
Example #8
        public CrawlResult CrawlPage(CrawlPlan plan)
        {
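            // Load plan.AbsoluteUri with HtmlAgilityPack's HtmlWeb, record status code and timing,
            // resolve 3xx redirect targets, and extract the <a href> links and canonical <link>
            // into the CrawlResult.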
            var crawlResult = new CrawlResult(plan.AbsoluteUri);

            var web = new HtmlWeb();

            web.CaptureRedirect = true;
            web.PreRequest      = request => WebPreRequest(request);
            web.PostResponse    = (request, response) => HtmlWeb_PostResponse(request, response);
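            // WebPreRequest and HtmlWeb_PostResponse are callbacks defined elsewhere in this class;
            // _redirectLocation (read in the 3xx branch below) is presumably captured there.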

            var stopWatch = new Stopwatch();

            stopWatch.Start();

            //Console.WriteLine($"loading...\t{plan.AbsoluteUri}");
            HtmlDocument doc;

            try
            {
                doc = web.Load(plan.AbsoluteUri);
            }
            catch (Exception e)
            {
                Console.WriteLine(e);

                //if (e is HtmlWebException && e.Message.StartsWith("Unsupported uri scheme:"))
                //{
                //}
                //else
                //{
                crawlResult.FailedAt      = DateTime.UtcNow;
                crawlResult.FailException = e.ToString();
                //}

                return(crawlResult);
            }

            stopWatch.Stop();

            var statusCode       = (int)web.StatusCode;
            var statusCodeString = web.StatusCode.ToString();
            var timeTaken        = stopWatch.Elapsed.TotalSeconds;

            crawlResult.CrawledAt     = DateTime.UtcNow;
            crawlResult.StatusCodeStr = statusCodeString;
            crawlResult.StatusCode    = statusCode;
            crawlResult.TimeTaken     = (decimal?)timeTaken;

            //not 200 OK
            if (web.StatusCode != HttpStatusCode.OK)
            {
                //Console.WriteLine($"\tstatus code = {statusCode}");

                //3xx redirect
                if (statusCode / 100 == 3)
                {
                    try
                    {
                        //Console.WriteLine($"\tfound redirect {statusCode} {_redirectLocation}");
                        var locationUri = Util.GetUriObjectFromUriString(_redirectLocation, plan.AbsoluteUri);
                        //Console.WriteLine($"\tfound new location {locationUri.AbsoluteUri}");

                        crawlResult.LocationAbsoluteUri = locationUri.AbsoluteUri;
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(e);
                    }
                }

                return(crawlResult);
            }

            //not a document
            if (doc.ParsedText == null)
            {
                return(crawlResult);
            }

            //crawlResult.Doc = doc;
            crawlResult.ContentLength = doc.ParsedText.Length;
            crawlResult.Content       = doc.ParsedText;

            //find links
            var links = doc.DocumentNode.SelectNodes("//a[@href]");

            if (links != null)
            {
                //Console.WriteLine($"\tfound {links.Count} child links.");
                var linksToSave = new List <CrawledLink>();
                foreach (var link in links)
                {
                    var href = link.Attributes["href"].Value;
                    //if(href.Contains("fortnite-stats/")) Debugger.Break();

                    var decoded = HttpUtility.HtmlDecode(href);
                    //if (decoded != href)
                    //    Debugger.Break();

                    if (decoded == "" || decoded.StartsWith("javascript:") || decoded.StartsWith("mailto:") || decoded.StartsWith("skype:"))
                    {
                        continue;
                    }

                    try
                    {
                        var childUri = Util.GetUriObjectFromUriString(decoded, plan.AbsoluteUri);

                        //no duplicated links
                        if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri))
                        {
                            linksToSave.Add(new CrawledLink()
                            {
                                AbsoluteUri = childUri.AbsoluteUri,
                            });
                        }
                    }
                    catch (UriFormatException e) //for malformed uris, just add them to the list without using System.Uri
                    {
                        if (linksToSave.All(o => o.AbsoluteUri != decoded))
                        {
                            linksToSave.Add(new CrawledLink()
                            {
                                AbsoluteUri = decoded,
                            });
                        }

                        Console.WriteLine(e);
                    }
                }

                crawlResult.LinkAbsoluteUris = linksToSave;
            }

            //find canonical
            var canonicalLinks = doc.DocumentNode.SelectNodes("//link[@rel='canonical']");

            if (canonicalLinks != null && canonicalLinks.Count > 0)
            {
                var canonicalLinkValue = canonicalLinks[0].Attributes["href"].Value;
                //Console.WriteLine($"\tfound canonical");
                crawlResult.Canonical = canonicalLinkValue;
            }

            return(crawlResult);
        }