Exemple #1
0
        public void RunWebCrawler(string rootURI, string[] uriContains)
        {
            if (rootURI == null || rootURI == "")
                throw new ArgumentNullException();

            Uri uri = new Uri(rootURI);

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
            crawlConfig.CrawlTimeoutSeconds = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl = 1000;

            XmlConfigurator.Configure();

            PoliteWebCrawler webCrawler = new PoliteWebCrawler();

            webCrawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            webCrawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            webCrawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            webCrawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            DBConfiguration dbConfig = DBConfiguration.GetDBConfiguration();
            dbConfig = DBConfigurationSectionHandler.LoadFromXml().Convert();

            IDatabase database = DBFactory.GetDatabase(dbConfig.DatabaseType);
            Func<IDatabase> databaseGenerator = database.GetDatabaseGenerator();
            _databaseConnectionPool = new DBConnectionPool(databaseGenerator);

            string userGUID = Guid.NewGuid().ToString();
            CreateProject(rootURI, userGUID);

            if (uriContains != null)
            {
                foreach (string uriContent in uriContains)
                {
                    //webCrawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
                    //{
                    //    CrawlDecision decision = new CrawlDecision();
                    //    Match match = Regex.Match(pageToCrawl.Uri.ToString(), uriContent, RegexOptions.IgnoreCase);
                    //    if (!match.Success)
                    //        return new CrawlDecision { Allow = false, Reason = "Include all uri parts" };
                    //
                    //    return decision;
                    //});
                }
            }

            CrawlResult result = webCrawler.Crawl(uri);

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with ERROR!!!", result.RootUri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            _databaseConnectionPool.CloseAllConnections();
            _databaseConnectionPool = null;
        }
Exemple #2
0
        public void RunWebCrawler(String rootURI, String[] uriContains)
        {
            if (rootURI == null || rootURI == "")
                throw new ArgumentNullException();

            Uri uri = new Uri(rootURI);

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
            crawlConfig.CrawlTimeoutSeconds = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl = 1000;

            XmlConfigurator.Configure();

            PoliteWebCrawler webCrawler = new PoliteWebCrawler();

            webCrawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            webCrawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            webCrawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            webCrawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            DBConfiguration dbConfig = DBConfiguration.GetDBConfiguration();
            dbConfig = DBConfigurationSectionHandler.LoadFromXml().Convert();

            IDatabase database = DBFactory.GetDatabase(dbConfig.DatabaseType);
            Func<IDatabase> databaseGenerator = database.GetDatabaseGenerator();
            _databaseConnectionPool = new DBConnectionPool(databaseGenerator);

            if (uriContains != null)
            {
                foreach (String uriContent in uriContains)
                {

                }
            }

            CrawlResult result = webCrawler.Crawl(uri);

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with ERROR!!!", result.RootUri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            _databaseConnectionPool.CloseAllConnections();
            _databaseConnectionPool = null;
        }