Beispiel #1
0
        private void InitBrowser()
        {
            browser.LoadingStateChanged += OnLoadingStateChanged;

            // browser.StatusMessage += OnBrowserStatusMessage;
            // browser.TitleChanged += OnBrowserTitleChanged;
            browser.AddressChanged += OnBrowserAddressChanged;
            browser.ConsoleMessage += Browser_ConsoleMessage;
            browser.FrameLoadEnd   += OnFrameLoadEnd;

            browser.RequestHandler = new AllRequestHandler();

            var requesthander = browser.RequestHandler as AllRequestHandler;

            requesthander.NotifyData += Requesthander_NotifyData;

            //  browser.JsDialogHandler = new AllJsDialogHandler(); ..no use

            _WebPageManager = new WebPageManager(browser);

            if (Uri.IsWellFormedUriString(urlText.Text, UriKind.RelativeOrAbsolute))
            {
                //  browser.Load(urlText.Text);
            }
        }
        public void test_fruit_strawberry_ladder()
        {
            var firstPageLink = "https://en.wikipedia.org/wiki/Fruit";
            var endPageLink   = "https://en.wikipedia.org/wiki/Strawberry";

            var startPage = new WebPage(WebPageManager.GetPageToString(firstPageLink), firstPageLink);
            var endPage   = new WebPage(WebPageManager.GetPageToString(endPageLink), endPageLink);

            var expected = new List <WebPage>()
            {
                startPage,
                endPage
            };

            var wikiRacer = new WikiRacer(endPage);
            var result    = wikiRacer.GetLadder(startPage);
        }
Beispiel #3
0
        /// <summary>
        ///     Initializes a new instance of the <see cref = "Crawl" /> class.
        /// </summary>
        /// <param name = "crawler">The crawler.</param>
        /// <param name="actionManager"></param>
        /// <param name="crawlRequestManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name="htmlManager"></param>
        /// <param name="politenessManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "processData">if set to <c>true</c> [process data].</param>
        public Crawl(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CookieManager cookieManager, CrawlRequestManager <TArachnodeDAO> crawlRequestManager, DataTypeManager <TArachnodeDAO> dataTypeManager, DiscoveryManager <TArachnodeDAO> discoveryManager, EncodingManager <TArachnodeDAO> encodingManager, HtmlManager <TArachnodeDAO> htmlManager, PolitenessManager <TArachnodeDAO> politenessManager, ProxyManager <TArachnodeDAO> proxyManager, RuleManager <TArachnodeDAO> ruleManager, bool processData)
        {
            _applicationSettings = applicationSettings;
            _webSettings         = webSettings;

            UncrawledCrawlRequests = new PriorityQueue <CrawlRequest <TArachnodeDAO> >();
            UnassignedDiscoveries  = new HashSet <string>();

            _crawler = crawler;

            _crawlInfo.MaximumCrawlDepth = 1;

            _actionManager       = actionManager;
            _consoleManager      = consoleManager;
            _cookieManager       = cookieManager;
            _crawlRequestManager = crawlRequestManager;
            _dataTypeManager     = dataTypeManager;
            _discoveryManager    = discoveryManager;
            _encodingManager     = encodingManager;
            _htmlManager         = htmlManager;
            _politenessManager   = politenessManager;
            _proxyManager        = proxyManager;
            _ruleManager         = ruleManager;

            _processData = processData;

            _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, false, false);
            _arachnodeDAO.ApplicationSettings = applicationSettings;

            //_arachnodeDAO.OpenCommandConnections();

            _dataManager    = new DataManager <TArachnodeDAO>(_applicationSettings, _webSettings, _actionManager, _dataTypeManager, _discoveryManager, _ruleManager, _arachnodeDAO);
            _fileManager    = new FileManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
            _imageManager   = new ImageManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
            _webClient      = new WebClient <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager, _cookieManager, _proxyManager);
            _webPageManager = new WebPageManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _htmlManager, _arachnodeDAO);
        }
Beispiel #4
0
        /// <summary>
        ///     Process a range of WebPageID after crawling.  Useful if crawled WebPages were not processed at crawl time according to desired ApplicationSettings configuration.
        ///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
        ///     This method is not when crawling, rather during post-processing.
        /// </summary>
        /// <param name = "webPageIDLowerBound"></param>
        /// <param name = "webPageIDUpperBound"></param>
        public static void ProcessWebPages(Crawler <TArachnodeDAO> crawler, long webPageIDLowerBound, long webPageIDUpperBound)
        {
            //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

            ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            ActionManager <TArachnodeDAO>  actionManager  = new ActionManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CookieManager cookieManager = new CookieManager();;
            MemoryManager <TArachnodeDAO>      memoryManager      = new MemoryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            RuleManager <TArachnodeDAO>        ruleManager        = new RuleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CacheManager <TArachnodeDAO>       cacheManager       = new CacheManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
            Cache <TArachnodeDAO>            cache            = new Cache <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
            DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
            HtmlManager <TArachnodeDAO>      htmlManager      = new HtmlManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager);

            //load the CrawlActions, CrawlRules and EngineActions...
            ruleManager.ProcessCrawlRules(crawler);
            actionManager.ProcessCrawlActions(crawler);
            actionManager.ProcessEngineActions(crawler);

            //these three methods are called in the Engine.
            UserDefinedFunctions.RefreshAllowedExtensions(true);
            UserDefinedFunctions.RefreshAllowedSchemes(true);
            UserDefinedFunctions.RefreshDisallowed();

            //instantiate a WebClient to access the ResponseHeaders...
            WebClient <TArachnodeDAO> webClient = new WebClient <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

            webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

            WebPageManager <TArachnodeDAO> webPageManager = new WebPageManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, htmlManager, arachnodeDAO);

            for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
            {
                ArachnodeDataSet.WebPagesRow webPagesRow = null;

                try
                {
                    //get the WebPage from the database.  we need the source data as we don't store this in the index.
                    //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
                    webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                    if (webPagesRow != null)
                    {
                        if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                        {
                            if (File.Exists(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType)))
                            {
                                using (StreamReader streamReader = File.OpenText(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType)))
                                {
                                    webPagesRow.Source = Encoding.UTF8.GetBytes(streamReader.ReadToEnd());
                                }
                            }
                            else
                            {
                                Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                                if (OnWebPageProcessed != null)
                                {
                                    OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                                }
                            }
                        }

                        ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);

                        Console.WriteLine("WebPageID: " + i + " was processed successfully.");
                        if (OnWebPageProcessed != null)
                        {
                            OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was processed successfully.", null, null);
                        }
                    }
                }
                catch (Exception exception)
                {
                    Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                    Console.WriteLine(exception.Message);

                    if (OnWebPageProcessed != null)
                    {
                        OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                        OnWebPageProcessed.BeginInvoke(webPagesRow, exception.Message, null, null);
                    }

                    arachnodeDAO.InsertException(null, null, exception, false);
                }
            }

            //stop the CrawlActions, CrawlRules and EngineActions...
            ruleManager.Stop();
            actionManager.Stop();
        }
Beispiel #5
0
        /// <summary>
        ///     Processes a WebPagesRow after crawling.
        /// </summary>
        /// <param name = "webPagesRow">The web pages row.</param>
        /// <param name="webClient"></param>
        /// <param name="actionManager"></param>
        /// <param name="consoleManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name="memoryManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name = "imageManager">The image manager.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
            crawlRequest.Discovery.ID            = webPagesRow.ID;
            crawlRequest.Data         = webPagesRow.Source;
            crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
            crawlRequest.Encoding     = Encoding.GetEncoding(webPagesRow.CodePage);
            crawlRequest.ProcessData  = true;
            crawlRequest.WebClient    = webClient;

            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
            foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            //now, process the bytes...
            encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

            if (applicationSettings.InsertWebPages)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
            crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
 /// <summary>
 ///     Processes the web page.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request.</param>
 /// <param name = "webPageManager">The web page manager.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 protected abstract void ProcessWebPage(CrawlRequest <TArachnodeDAO> crawlRequest, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO);
 /// <summary>
 ///     Processes the crawl request.
 /// </summary>
 /// <param name = "crawlRequest">The crawl request.</param>
 /// <param name = "fileManager">The file manager.</param>
 /// <param name = "imageManager">The image manager.</param>
 /// <param name = "webPageManager">The web page manager.</param>
 /// <param name = "arachnodeDAO">The arachnode DAO.</param>
 public abstract void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, FileManager <TArachnodeDAO> fileManager, ImageManager <TArachnodeDAO> imageManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO);