Example #1
        static void DoCrawl()
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 5000;
            crawlConfig.UserAgentString      = "abot v1.0 http://code.google.com/p/abot";
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

            //Will use app.config for configuration
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

            Console.WriteLine("jumlah crawled content :" + result.CrawlContext.CrawledCount);
            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
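The four event handlers wired above are not shown in this example. As a minimal sketch of a completion handler, assuming the stock Abot PageCrawlCompletedArgs signature (the body below is illustrative, not this repo's actual handler; HttpStatusCode needs System.Net):

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            //Assumed implementation: report success or failure per page
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
        }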
Example #2
        /// <summary>
        /// Initializes the crawler from configuration and stores a definition of the instance
        /// </summary>
        /// <param name="seedUrl"></param>
        /// <param name="sessionId"></param>
        /// <param name="crawlerId"></param>
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
        {
            var config      = new CrawlConfiguration();
            var abotSection = AbotConfigurationSectionHandler.LoadFromXml();

            if (abotSection != null)
            {
                config = abotSection.Convert();
                _logger.InfoFormat("CrawlConfiguration loaded from app.config");
            }
            else
            {
                config.CrawlTimeoutSeconds                = 100;
                config.MaxConcurrentThreads               = 1;
                config.MaxPagesToCrawl                    = long.MaxValue;
                config.IsExternalPageCrawlingEnabled      = false;
                config.IsExternalPageLinksCrawlingEnabled = false;
                config.MinCrawlDelayPerDomainMilliSeconds = 10000;
                config.DownloadableContentTypes           = "text/html, text/plain";
                config.IsHttpRequestAutoRedirectsEnabled  = true;
                config.IsUriRecrawlingEnabled             = false;
                config.UserAgentString                    = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0";
                _logger.InfoFormat("CrawlConfiguration default loaded");
            }

            return(InitializeCrawler(seedUrl, sessionId, crawlerId, config));
        }
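The four-argument InitializeCrawler overload that this method delegates to is not part of the snippet. A minimal sketch under the assumption that it simply builds the crawler from the prepared config (the _crawler field and the return policy are assumptions):

        //Hypothetical sketch of the overload called above; per the XML doc
        //comment it also stores a definition of the instance
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _crawler = new PoliteWebCrawler(config);
            _logger.InfoFormat("Crawler {0} initialized for session {1} with seed {2}", crawlerId, sessionId, seedUrl);
            return true;
        }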
Example #3
        public AbotConfigurationSectionHandlerTest()
        {
            var builder = new ConfigurationBuilder();

            builder.AddJsonFile("appsettings.json");
            var cr = builder.Build();

            _uut = new AbotConfigurationSectionHandler(cr);
        }
Example #4
        private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
        {
            AbotConfigurationSectionHandler configFromFile = AbotConfigurationSectionHandler.LoadFromXml();

            if (configFromFile == null)
            {
                throw new ApplicationException("Config section \"abot\" was NOT found");
            }

            return(configFromFile.Convert());
        }
Example #5
        private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
        {
            AbotConfigurationSectionHandler configFromFile = AbotConfigurationSectionHandler.LoadFromXml();

            if (configFromFile == null)
            {
                throw new InvalidOperationException("abot config section was NOT found");
            }

            _logger.DebugFormat("abot config section was found");
            return(configFromFile.Convert());
        }
Example #6
        public void Crawl_RetryEnabled_VerifyCrawlResultIsAsExpected()
        {
            new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            configuration.MaxRetryCount = 3;
            configuration.MinRetryDelayInMilliseconds = 2000;

            base.CrawlAndAssert(new PoliteWebCrawler(configuration));
        }
Example #7
        private static PoliteWebCrawler CreateCrawler(int recursionDepth, int maxLinks)
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxCrawlDepth        = recursionDepth;
            crawlConfig.MaxConcurrentThreads = 20;
            crawlConfig.MaxLinksPerPage      = maxLinks;
            //Must pass an AngleSharpHyperlinkParser instance to override the default HAP-based parser, which is incompatible with my installed HAP dll
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, new AngleSharpHyperlinkParser(), null, null, null);

            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlStartingAsync  += crawler_ProcessPageCrawlStarting;
            return(crawler);
        }
Example #8
        private CrawlConfiguration GetCrawlConfigurationFromConfigFile()
        {
            AbotConfigurationSectionHandler configFromFile = null;

            try { configFromFile = AbotConfigurationSectionHandler.LoadFromXml(); } catch { /*section missing or unreadable; fall through to the null check*/ }

            if (configFromFile == null)
            {
                _logger.DebugFormat("abot config section was NOT found");
                return(null);
            }

            _logger.DebugFormat("abot config section was found");
            return(configFromFile.Convert());
        }
Example #9
        private WebCrawler CreateCrawler(IThreadManager threadManager)
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 10;//this overrides the config value
            crawlConfig.MaxCrawlDepth        = 3;

            //Will use the manually created crawlConfig object created above
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, new AmazonPageDecisionMaker(_amazonHelper), threadManager, null, new AmazonPageRequester(crawlConfig), new AmazonHyperLinkParser(_amazonHelper), null, null, null);

            crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            return(crawler);
        }
Example #10
        private void ExecuteStartCrawlCommand()
        {
            try
            {
                CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
                crawlConfig.MaxCrawlDepth   = Convert.ToInt32(MaxDepth); //this overrides the config value
                crawlConfig.MaxPagesToCrawl = Convert.ToInt32(MaxPages); //this overrides the config value

                WelcomeTitle = string.Empty;
                AbotManager m = new AbotManager();
                m.MessageUpdate += M_MessageUpdate;
                m.RunCrawl(_crawlUrl, _localFolder, crawlConfig);
            }
            catch (Exception ex)
            {
                M_MessageUpdate(this, new MessageEventArgs(ex.Message));
            }
        }
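AbotManager here is project code rather than an Abot type. A sketch of what it might look like, assuming it wraps a PoliteWebCrawler and forwards status text through its MessageUpdate event (MessageEventArgs is likewise project code, constructed from a string as on the catch path above):

        public class AbotManager
        {
            public event EventHandler<MessageEventArgs> MessageUpdate;

            public void RunCrawl(string url, string localFolder, CrawlConfiguration config)
            {
                //localFolder would control where crawled pages are saved; omitted here
                var crawler = new PoliteWebCrawler(config);
                CrawlResult result = crawler.Crawl(new Uri(url));
                MessageUpdate?.Invoke(this, new MessageEventArgs(
                    result.ErrorOccurred ? result.ErrorException.Message : "Crawl completed"));
            }
        }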
Example #11
        public async Task Crawl_RetryEnabled_VerifyCrawlResultIsAsExpected()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            var builder = new ConfigurationBuilder();

            builder.AddJsonFile("appsettings.json");
            var cr = builder.Build();

            CrawlConfiguration configuration = new AbotConfigurationSectionHandler(cr).Convert();

            configuration.MaxRetryCount = 3;
            configuration.MinRetryDelayInMilliseconds = 2000;

            await base.CrawlAndAssertAsync(new PoliteWebCrawler(configuration));
        }
Example #12
        private PoliteWebCrawler SetUp()
        {
            XmlConfigurator.Configure();

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 5;//this overrides the config value

            // Careful with the depth; even 0 already returns plenty of records
            crawlConfig.MaxCrawlDepth = 0;

            //Will use app.config for configuration
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage(ShouldCrawlPage);

            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return(crawler);
        }
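The ShouldCrawlPage predicate registered above (and again in Example #14) is not shown. Abot expects a delegate that returns a CrawlDecision; a minimal sketch, with a same-host filter standing in for the repo's real logic:

        private CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            //Assumed rule: only follow pages on the seed host
            if (!pageToCrawl.Uri.Host.Equals(crawlContext.RootUri.Host))
                return new CrawlDecision { Allow = false, Reason = "External page" };

            return new CrawlDecision { Allow = true };
        }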
Example #13
        public void SectionHandlerDefaults_MatchPocoDefaults()
        {
            _uut = new AbotConfigurationSectionHandler();
            CrawlConfiguration pocoDefaults = new CrawlConfiguration();

            Assert.AreEqual(pocoDefaults.ConfigurationExtensions.Count, _uut.ExtensionValues.Count);
            Assert.AreEqual(pocoDefaults.CrawlTimeoutSeconds, _uut.CrawlBehavior.CrawlTimeoutSeconds);
            Assert.AreEqual(pocoDefaults.DownloadableContentTypes, _uut.CrawlBehavior.DownloadableContentTypes);
            Assert.AreEqual(pocoDefaults.IsExternalPageCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageCrawlingEnabled);
            Assert.AreEqual(pocoDefaults.IsExternalPageLinksCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
            Assert.AreEqual(pocoDefaults.IsRespectRobotsDotTextEnabled, _uut.Politeness.IsRespectRobotsDotTextEnabled);
            Assert.AreEqual(pocoDefaults.IsRespectMetaRobotsNoFollowEnabled, _uut.Politeness.IsRespectMetaRobotsNoFollowEnabled);
            Assert.AreEqual(pocoDefaults.IsRespectHttpXRobotsTagHeaderNoFollowEnabled, _uut.Politeness.IsRespectHttpXRobotsTagHeaderNoFollowEnabled);
            Assert.AreEqual(pocoDefaults.IsRespectAnchorRelNoFollowEnabled, _uut.Politeness.IsRespectAnchorRelNoFollowEnabled);
            Assert.AreEqual(pocoDefaults.IsIgnoreRobotsDotTextIfRootDisallowedEnabled, _uut.Politeness.IsIgnoreRobotsDotTextIfRootDisallowedEnabled);
            Assert.AreEqual(pocoDefaults.IsUriRecrawlingEnabled, _uut.CrawlBehavior.IsUriRecrawlingEnabled);
            Assert.AreEqual(pocoDefaults.MaxConcurrentThreads, _uut.CrawlBehavior.MaxConcurrentThreads);
            Assert.AreEqual(pocoDefaults.MaxRobotsDotTextCrawlDelayInSeconds, _uut.Politeness.MaxRobotsDotTextCrawlDelayInSeconds);
            Assert.AreEqual(pocoDefaults.MaxPagesToCrawl, _uut.CrawlBehavior.MaxPagesToCrawl);
            Assert.AreEqual(pocoDefaults.MaxPagesToCrawlPerDomain, _uut.CrawlBehavior.MaxPagesToCrawlPerDomain);
            Assert.AreEqual(pocoDefaults.MinCrawlDelayPerDomainMilliSeconds, _uut.Politeness.MinCrawlDelayPerDomainMilliSeconds);
            Assert.AreEqual(pocoDefaults.UserAgentString, _uut.CrawlBehavior.UserAgentString);
            Assert.AreEqual(pocoDefaults.RobotsDotTextUserAgentString, _uut.Politeness.RobotsDotTextUserAgentString);
            Assert.AreEqual(pocoDefaults.MaxPageSizeInBytes, _uut.CrawlBehavior.MaxPageSizeInBytes);
            Assert.AreEqual(pocoDefaults.HttpServicePointConnectionLimit, _uut.CrawlBehavior.HttpServicePointConnectionLimit);
            Assert.AreEqual(pocoDefaults.IsSslCertificateValidationEnabled, _uut.CrawlBehavior.IsSslCertificateValidationEnabled);
            Assert.AreEqual(pocoDefaults.HttpRequestTimeoutInSeconds, _uut.CrawlBehavior.HttpRequestTimeoutInSeconds);
            Assert.AreEqual(pocoDefaults.HttpRequestMaxAutoRedirects, _uut.CrawlBehavior.HttpRequestMaxAutoRedirects);
            Assert.AreEqual(pocoDefaults.IsHttpRequestAutoRedirectsEnabled, _uut.CrawlBehavior.IsHttpRequestAutoRedirectsEnabled);
            Assert.AreEqual(pocoDefaults.IsHttpRequestAutomaticDecompressionEnabled, _uut.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
            Assert.AreEqual(pocoDefaults.IsSendingCookiesEnabled, _uut.CrawlBehavior.IsSendingCookiesEnabled);
            Assert.AreEqual(pocoDefaults.MaxMemoryUsageCacheTimeInSeconds, _uut.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);
            Assert.AreEqual(pocoDefaults.MaxMemoryUsageInMb, _uut.CrawlBehavior.MaxMemoryUsageInMb);
            Assert.AreEqual(pocoDefaults.MinAvailableMemoryRequiredInMb, _uut.CrawlBehavior.MinAvailableMemoryRequiredInMb);
            Assert.AreEqual(pocoDefaults.MaxCrawlDepth, _uut.CrawlBehavior.MaxCrawlDepth);
            Assert.AreEqual(pocoDefaults.MaxLinksPerPage, _uut.CrawlBehavior.MaxLinksPerPage);
            Assert.AreEqual(pocoDefaults.IsForcedLinkParsingEnabled, _uut.CrawlBehavior.IsForcedLinkParsingEnabled);
            Assert.AreEqual(pocoDefaults.MaxRetryCount, _uut.CrawlBehavior.MaxRetryCount);
            Assert.AreEqual(pocoDefaults.MinRetryDelayInMilliseconds, _uut.CrawlBehavior.MinRetryDelayInMilliseconds);
        }
Example #14
        public PoliteWebCrawler CreateCrawler()
        {
            _dataFinder = new DataFinder(new KomputronikDataExtractor());

            XmlConfigurator.Configure();

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 15;//this overrides the config value

            crawlConfig.MaxCrawlDepth = 15;

            //Will use app.config for configuration
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return(crawler);
        }
Example #15
        public int DoCrawl()
        {
            CrawlConfiguration CConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            CConfig.MaxConcurrentThreads        = maxConcurrentThreads;
            CConfig.MaxPagesToCrawl             = maxPagesToCrawl;
            CConfig.CrawlTimeoutSeconds         = crawlTimeoutSeconds;
            CConfig.HttpRequestTimeoutInSeconds = httpRequestTimeoutInSeconds;
            CConfig.LoginUser     = loginUser;
            CConfig.LoginPassword = loginPassword;

            Console.WriteLine("Doing Crawl With Slack " + (slackBotEnabled ? "Enabled" : "Disabled"));

            PoliteWebCrawler crawler = new PoliteWebCrawler(CConfig, null, null, null, null, null, null, null, null);

            //PoliteWebCrawler crawler = new PoliteWebCrawler();

            errors = new List<Errors>();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(URL)); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }

            IEnumerable<Errors> EnumList = errors.AsEnumerable();

            //Scan the HTTP status code range for any collected errors
            for (int i = 0; i < 525; i++)
            {
                if (EnumList.Any(x => x.ErrorCode == i))
                {
                    returnInt = 1;
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine(i + " (" + getErrorName(i) + ") Errors:");
                    slackMessage           += i + " (" + getErrorName(i) + ") Errors:\n";
                    Console.ForegroundColor = ConsoleColor.Red;
                    foreach (Errors err in EnumList.Where(x => x.ErrorCode == i))
                    {
                        Console.WriteLine("   " + err.ErrorURL);
                        slackMessage += "   " + err.ErrorURL + "\n";
                    }
                }
            }

            Console.ResetColor();

            if (slackMessage == "")
            {
                slackMessage = "No Errors In WebPage!";
            }

            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Done");
            Console.ResetColor();
            return(returnInt);
        }
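The Errors type and getErrorName helper used above are project-specific and not shown. A minimal sketch under those assumptions, mapping codes through System.Net.HttpStatusCode:

        public class Errors
        {
            public int ErrorCode { get; set; }   //HTTP status code of the failed request
            public string ErrorURL { get; set; } //page that returned it
        }

        private static string getErrorName(int code)
        {
            //Use the framework's name for the status code where one exists
            return Enum.IsDefined(typeof(HttpStatusCode), code)
                ? ((HttpStatusCode)code).ToString()
                : "Unknown";
        }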
Example #16
        static void Main(string[] args)
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 5;//this overrides the config value
            crawlConfig.MaxCrawlDepth        = 0;
            //Pass the modified config in explicitly; the parameterless
            //constructor would ignore it and load app.config instead
            crawler = new PoliteWebCrawler(crawlConfig);
            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //var doc = new HtmlDocument();
            //doc.Load(@"C:\Users\lucao\Downloads\keketest.html");
            //var embedNodes = doc.DocumentNode.SelectSingleNode("//script[contains(text(), 'thunder_url')]");
            //var domain = Regex.Match(embedNodes.InnerText, @".*domain.*'(.*)'").Groups[1].ToString();
            //var thunder_url = Regex.Match(embedNodes.InnerText, ".*thunder_url.*\"(.*)\"").Groups[1].ToString();
            //var downloadMp3Link = domain + thunder_url;


            CrawlResult result;

            for (int i = 58; i > 30; i--) //walk the article list pages from 58 down to 31
            {
                DownloadLinkList.Clear();
                Thread.Sleep(60000);
                result = crawler.Crawl(new Uri($"http://www.kekenet.com/Article/15410/List_{i}.shtml"));
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }

                if (DownloadLinkList.Count > 0)
                {
                    DownloadMP3LinkList.Clear();
                    foreach (var link in DownloadLinkList)
                    {
                        var sub_crawler = new PoliteWebCrawler();
                        sub_crawler.PageCrawlStartingAsync        += sub_crawler_ProcessPageCrawlStarting;
                        sub_crawler.PageCrawlCompletedAsync       += sub_crawler_ProcessPageCrawlCompleted;
                        sub_crawler.PageCrawlDisallowedAsync      += sub_crawler_PageCrawlDisallowed;
                        sub_crawler.PageLinksCrawlDisallowedAsync += sub_crawler_PageLinksCrawlDisallowed;
                        sub_crawler.Crawl(new Uri(link));
                        Thread.Sleep(20000);
                        sub_crawler?.Dispose();
                    }
                }
                //"http://k6.kekenet.com/Sound/2018/01/scad180110.mp3"
                if (DownloadMP3LinkList.Count > 0)
                {
                    foreach (var mp3Link in DownloadMP3LinkList)
                    {
                        WebClient client = new WebClient();
                        Uri       ur     = new Uri(mp3Link);
                        client.DownloadProgressChanged += WebClientDownloadProgressChanged;
                        client.DownloadDataCompleted   += WebClientDownloadCompleted;
                        var file = @"C:\Users\lucao\Downloads\keke\" + mp3Link.Split('/').Last();
                        client.DownloadFile(ur, file);
                        Thread.Sleep(60000);
                    }
                }
            }
        }
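One caveat in Example #16: WebClient raises DownloadProgressChanged and DownloadDataCompleted only from its *Async methods, so the synchronous DownloadFile call above never fires the two handlers it registers. A sketch of an equivalent download that does report progress, reusing the ur and file variables (the blocking wait is an assumption to preserve the one-file-at-a-time flow):

        using (var client = new WebClient())
        {
            var done = new System.Threading.ManualResetEventSlim();
            client.DownloadProgressChanged += (s, e) => Console.Write("\r{0}%", e.ProgressPercentage);
            client.DownloadFileCompleted   += (s, e) => done.Set();
            client.DownloadFileAsync(ur, file);
            done.Wait(); //block like the original synchronous call did
        }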