Esempio n. 1
0
        /// <summary>
        ///     Renders the stored source of a previously crawled WebPage, identified by the
        ///     'discoveryID' and 'absoluteUri' query string parameters.  The source is read
        ///     from the database row when present, otherwise from the downloaded copy on disk.
        ///     Any failure is surfaced via uxLblException and logged through ArachnodeDAO.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            EnableViewState = false;

            try
            {
                //both parameters must be present, in this exact order, for the request to be serviced...
                bool hasExpectedParameters = Request.QueryString.Count == 2 &&
                                             Request.QueryString.AllKeys[0] == "discoveryID" &&
                                             Request.QueryString.AllKeys[1] == "absoluteUri";

                if (!hasExpectedParameters)
                {
                    return;
                }

                ArachnodeDataSet.WebPagesRow webPage = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

                if (webPage == null)
                {
                    uxLblException.Text = "The WebPage was not found in the database.";
                    uxLblException.Visible = true;

                    return;
                }

                string source;

                if (webPage.Source != null && webPage.Source.Length != 0)
                {
                    //the source bytes are stored in the database row - decode them with the row's code page...
                    source = Encoding.GetEncoding(webPage.CodePage).GetString(webPage.Source);
                }
                else
                {
                    //fall back to the copy persisted on disk at crawl time...
                    string discoveryPath = DiscoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, webPage.AbsoluteUri, webPage.FullTextIndexType);

                    if (!File.Exists(discoveryPath))
                    {
                        uxLblException.Text = "The WebPage source was not found in the database or on disk.";
                        uxLblException.Visible = true;

                        return;
                    }

                    source = File.ReadAllText(discoveryPath, Encoding.GetEncoding(webPage.CodePage));
                }

                //Request.Url.Scheme + "://" + Request.Url.Authority

                //ANODET: Should this be a configuration setting?  Perhaps - hotlinking isn't exactly polite, but does provide the best user experience.  (Version 1.5)
                uxLWebPage.Text = HtmlManager.CreateHtmlDocument(webPage.AbsoluteUri, webPage.FullTextIndexType, source, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, true).DocumentNode.OuterHtml;
            }
            catch (Exception exception)
            {
                //show the failure to the user and record it for diagnostics...
                uxLblException.Text = exception.Message;
                uxLblException.Visible = true;

                ArachnodeDAO.InsertException(null, null, exception, false);
            }
        }
Esempio n. 2
0
        /// <summary>
        ///     Loads the WebPage row matching the currently selected ID, updates the discovery
        ///     path link, and optionally triggers an immediate view when auto-view is enabled.
        /// </summary>
        private void nudWebPageID_ValueChanged(object sender, EventArgs e)
        {
            _webPagesRow = _arachnodeDAO.GetWebPage(nudWebPageID.Value.ToString());

            if (_webPagesRow == null)
            {
                //no matching row - hide the path link and report the miss in the browser control...
                llWebPageDiscoveryPathDirectory.Visible = false;
                wbMain.DocumentText = "The WebPage with the ID of " + nudWebPageID.Value + " does not exist.";

                return;
            }

            //resolve where the downloaded copy of this WebPage lives on disk...
            _webPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedWebPagesDirectory, _webPagesRow.AbsoluteUri, _webPagesRow.FullTextIndexType);

            llWebPageDiscoveryPathDirectory.Visible = true;
            llWebPageDiscoveryPathDirectory.Text = Path.GetDirectoryName(_webPageDiscoveryPath);

            if (cbAutoView.Checked)
            {
                //auto-view is on - render the page immediately as if the view button were clicked...
                btnViewWebPage_Click(sender, e);
            }
        }
        /// <summary>
        ///     Process a range of FileIDs after crawling.  Useful if crawled Files were not processed at crawl time according to desired ApplicationSettings configuration.
        ///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
        ///     This method is not used when crawling, rather during post-processing.
        /// </summary>
        /// <param name = "crawler">The Crawler whose ApplicationSettings and WebSettings configure the processing pipeline.</param>
        /// <param name = "fileIDLowerBound">The inclusive lower bound of the FileID range to process.</param>
        /// <param name = "fileIDUpperBound">The inclusive upper bound of the FileID range to process.</param>
        public static void ProcessFiles(Crawler<TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
        {
            //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

            //build the manager graph required by ProcessFile, mirroring what the Engine wires up at crawl time...
            ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CookieManager cookieManager = new CookieManager();
            MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
            Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);

            DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

            //load the CrawlActions, CrawlRules and EngineActions...
            ruleManager.ProcessCrawlRules(crawler);
            actionManager.ProcessCrawlActions(crawler);
            actionManager.ProcessEngineActions(crawler);

            //these three methods are called in the Engine.
            UserDefinedFunctions.RefreshAllowedExtensions(true);
            UserDefinedFunctions.RefreshAllowedSchemes(true);
            UserDefinedFunctions.RefreshDisallowed();

            //instantiate a WebClient to access the ResponseHeaders...
            WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

            webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

            FileManager<TArachnodeDAO> fileManager = new FileManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

            for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
            {
                ArachnodeDataSet.FilesRow filesRow = null;

                try
                {
                    //get the File from the database.  we need the source data as we don't store this in the index.
                    //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
                    filesRow = arachnodeDAO.GetFile(i.ToString());

                    if (filesRow != null)
                    {
                        if (filesRow.Source == null || filesRow.Source.Length == 0)
                        {
                            //compute the path once - it was previously recomputed for the Exists check and the read...
                            string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType);

                            if (File.Exists(discoveryPath))
                            {
                                filesRow.Source = File.ReadAllBytes(discoveryPath);
                            }
                            else
                            {
                                Console.WriteLine("FileID: " + i + " was NOT processed successfully.");

                                //capture the delegate locally so a concurrent unsubscribe can't NRE between the null check and the invocation...
                                var onFileProcessed = OnFileProcessed;
                                if (onFileProcessed != null)
                                {
                                    onFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                                }

                                //NOTE(review): execution deliberately falls through to ProcessFile even when no source was found on disk, matching the original behavior - confirm this is intentional.
                            }
                        }

                        ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                        Console.WriteLine("FileID: " + i + " was processed successfully.");

                        var onFileProcessedSuccess = OnFileProcessed;
                        if (onFileProcessedSuccess != null)
                        {
                            onFileProcessedSuccess.BeginInvoke(filesRow, "FileID: " + i + " was processed successfully.", null, null);
                        }
                    }
                }
                catch (Exception exception)
                {
                    Console.WriteLine("FileID: " + i + " was NOT processed successfully.");
                    Console.WriteLine(exception.Message);

                    var onFileProcessedFailure = OnFileProcessed;
                    if (onFileProcessedFailure != null)
                    {
                        onFileProcessedFailure.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                        onFileProcessedFailure.BeginInvoke(filesRow, exception.Message, null, null);
                    }

                    arachnodeDAO.InsertException(null, null, exception, false);
                }
            }

            //stop the CrawlActions, CrawlRules and EngineActions...
            ruleManager.Stop();
            actionManager.Stop();
        }