示例#1
0
        /// <summary>
        ///     Gets the source of a WebPage as text: from the database row when the row carries the source,
        ///     otherwise from the downloaded file on disk.
        /// </summary>
        /// <param name = "webPageAbsoluteUriOrID">The AbsoluteUri or ID of the WebPage; passed as-is to <c>arachnodeDAO.GetWebPage</c>.</param>
        /// <param name = "arachnodeDAO">The DAO used to look up the WebPage.</param>
        /// <returns>The source decoded with the row's CodePage, or null when no WebPage row was found.</returns>
        /// <exception cref = "Exception">
        ///     Thrown when <c>ApplicationSettings.DownloadedWebPagesDirectory</c> is null, or when the source is neither in the database nor on disk.
        /// </exception>
        public override string GetWebPageSource(string webPageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
        {
            if (ApplicationSettings.DownloadedWebPagesDirectory == null)
            {
                throw new Exception("_applicationSettings.DownloadedWebPagesDirectory is null.  This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
            }

            ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(webPageAbsoluteUriOrID);

            if (webPagesRow == null)
            {
                return(null);
            }

            //a null Source is treated the same as an empty one: fall back to the copy on disk.
            if (webPagesRow.Source != null && webPagesRow.Source.Length != 0)
            {
                return(Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source));
            }

            string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

            if (!File.Exists(discoveryPath))
            {
                throw new Exception("Could not find the WebPage Source in the database or on disk.");
            }

            return(File.ReadAllText(discoveryPath, Encoding.GetEncoding(webPagesRow.CodePage)));
        }
示例#2
0
        /// <summary>
        ///     Renders a stored WebPage into uxLWebPage.  The request must carry exactly the query string keys
        ///     "discoveryID" and "absoluteUri", in that order; any other request is ignored.
        ///     Failures are shown in uxLblException and recorded through ArachnodeDAO.InsertException.
        /// </summary>
        /// <param name = "sender">The source of the event.</param>
        /// <param name = "e">The event data.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            EnableViewState = false;

            try
            {
                bool isExpectedQueryString = Request.QueryString.Count == 2 && Request.QueryString.AllKeys[0] == "discoveryID" && Request.QueryString.AllKeys[1] == "absoluteUri";

                if (!isExpectedQueryString)
                {
                    return;
                }

                ArachnodeDataSet.WebPagesRow row = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

                if (row == null)
                {
                    uxLblException.Text = "The WebPage was not found in the database.";
                    uxLblException.Visible = true;

                    return;
                }

                string html;

                if (row.Source != null && row.Source.Length != 0)
                {
                    //the database row carries the source.
                    html = Encoding.GetEncoding(row.CodePage).GetString(row.Source);
                }
                else
                {
                    //no source in the row; look for the downloaded copy on disk.
                    string pathOnDisk = DiscoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, row.AbsoluteUri, row.FullTextIndexType);

                    if (!File.Exists(pathOnDisk))
                    {
                        uxLblException.Text = "The WebPage source was not found in the database or on disk.";
                        uxLblException.Visible = true;

                        return;
                    }

                    html = File.ReadAllText(pathOnDisk, Encoding.GetEncoding(row.CodePage));
                }

                //ANODET: Should this be a configuration setting?  Perhaps - hotlinking isn't exactly polite, but does provide the best user experience.  (Version 1.5)
                uxLWebPage.Text = HtmlManager.CreateHtmlDocument(row.AbsoluteUri, row.FullTextIndexType, html, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, true).DocumentNode.OuterHtml;
            }
            catch (Exception failure)
            {
                uxLblException.Text = failure.Message;
                uxLblException.Visible = true;

                ArachnodeDAO.InsertException(null, null, failure, false);
            }
        }
示例#3
0
        /// <summary>
        ///     Handles the Load event of the Page control.
        ///     Renders a WebPage into uxLWebPage, reading the source from the file named by the encrypted "webPage"
        ///     query string value when that file exists, otherwise from the database row for "discoveryID".
        ///     The request must carry exactly the keys discoveryID, absoluteUri, webPage, codePage and fullTextIndexType, in that order;
        ///     any other request is ignored.  Exceptions are recorded through ArachnodeDAO.InsertException and not rethrown.
        /// </summary>
        /// <param name = "sender">The source of the event.</param>
        /// <param name = "e">The <see cref = "System.EventArgs" /> instance containing the event data.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            EnableViewState = false;

            try
            {
                if (Request.QueryString.Count == 5 && Request.QueryString.AllKeys[0] == "discoveryID" && Request.QueryString.AllKeys[1] == "absoluteUri" && Request.QueryString.AllKeys[2] == "webPage" && Request.QueryString.AllKeys[3] == "codePage" && Request.QueryString.AllKeys[4] == "fullTextIndexType")
                {
                    string source = null;

                    //decrypt once; the decrypted value is used as the path of the WebPage on disk.
                    string webPagePath = Encryption.DecryptRijndaelManaged(Request.QueryString["webPage"]);

                    if (File.Exists(webPagePath))
                    {
                        source = File.ReadAllText(webPagePath, Encoding.GetEncoding(int.Parse(Request.QueryString["codePage"])));
                    }
                    else
                    {
                        ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

                        //a null Source is treated like an empty one so the 'not found' branch below reports it.
                        if (webPagesRow != null && webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                        {
                            source = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                        }
                    }

                    if (source != null)
                    {
                        //ANODET: Should this be a configuration setting?  Perhaps - hotlinking isn't exactly polite, but does provide the best user experience.  (Version 1.5)
                        uxLWebPage.Text = HtmlManager.CreateHtmlDocument(Request.QueryString["absoluteUri"], Request.QueryString["fullTextIndexType"], source, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, false).DocumentNode.OuterHtml;
                    }
                    else
                    {
                        uxLWebPage.Text = "The WebPage source was not found in the database or on disk.";

                        //thrown and immediately caught; presumably so the recorded exception carries a stack trace.
                        try
                        {
                            throw new Exception("The WebPage source for " + HttpUtility.UrlDecode(Request.QueryString["absoluteUri"]) + " was not found in the database or on disk.");
                        }
                        catch (Exception exception)
                        {
                            ArachnodeDAO.InsertException(null, null, exception, false);
                        }
                    }
                }
            }
            catch (Exception exception)
            {
                ArachnodeDAO.InsertException(null, null, exception, false);
            }
        }
示例#4
0
        /// <summary>
        ///     Prepends a post-processing status message to rtbPostProcessingStatus and caps the retained text at 10000 characters.
        ///     The update is handed to BeginInvoke rather than run inline.
        /// </summary>
        /// <param name = "webPagesRow">The WebPage the message refers to; not used by this handler.</param>
        /// <param name = "message">The status message to show.</param>
        private void WebPageUtilities_OnWebPageProcessed(ArachnodeDataSet.WebPagesRow webPagesRow, string message)
        {
            MethodInvoker prependMessage = delegate
            {
                //newest message first.
                string combined = message + Environment.NewLine + rtbPostProcessingStatus.Text;

                rtbPostProcessingStatus.Text = combined;

                //re-read the control's text before trimming, as the control owns the stored value.
                string current = rtbPostProcessingStatus.Text;

                if (current.Length > 10000)
                {
                    rtbPostProcessingStatus.Text = current.Substring(0, 10000);
                }
            };

            BeginInvoke(prependMessage);
        }
示例#5
0
        /// <summary>
        ///     Loads the WebPage whose ID is the current value of nudWebPageID and shows the directory of its discovery path,
        ///     or reports in wbMain that no such WebPage exists.  When cbAutoView is checked the WebPage is viewed immediately.
        /// </summary>
        /// <param name = "sender">The source of the event.</param>
        /// <param name = "e">The event data.</param>
        private void nudWebPageID_ValueChanged(object sender, EventArgs e)
        {
            string webPageID = nudWebPageID.Value.ToString();

            _webPagesRow = _arachnodeDAO.GetWebPage(webPageID);

            if (_webPagesRow == null)
            {
                llWebPageDiscoveryPathDirectory.Visible = false;
                wbMain.DocumentText = "The WebPage with the ID of " + nudWebPageID.Value + " does not exist.";

                return;
            }

            _webPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedWebPagesDirectory, _webPagesRow.AbsoluteUri, _webPagesRow.FullTextIndexType);

            llWebPageDiscoveryPathDirectory.Visible = true;
            llWebPageDiscoveryPathDirectory.Text    = Path.GetDirectoryName(_webPageDiscoveryPath);

            if (!cbAutoView.Checked)
            {
                return;
            }

            //reuse the button's handler so auto-view and a manual click behave the same.
            btnViewWebPage_Click(sender, e);
        }
示例#6
0
        /// <summary>
        ///     Runs a search through SearchManager.GetDocuments and converts each Lucene document into a <c>Document</c> with a summary.
        ///     The text used for the summary is read from the document's discovery path on disk when that file exists, otherwise from the database.
        /// </summary>
        /// <param name = "query">The query text.</param>
        /// <param name = "discoveryType">The name of a DiscoveryType value; defaults to "WebPage" when null or empty.</param>
        /// <param name = "pageNumber">The page to return; 0 is replaced by 1.</param>
        /// <param name = "pageSize">The number of documents per page; 0 is replaced by 20.</param>
        /// <param name = "shouldDocumentsBeClustered">Passed through to SearchManager.GetDocuments and SearchManager.Summarize.</param>
        /// <param name = "sort">Passed through to SearchManager.GetDocuments unchanged.</param>
        /// <returns>
        ///     The converted results, or null when the search itself failed (the exception is recorded through ArachnodeDAO.InsertException).
        ///     A document whose conversion throws is recorded and left out of the results.
        /// </returns>
        public SearchResults <Document> Search(string query, string discoveryType, int pageNumber, int pageSize, bool shouldDocumentsBeClustered, string sort)
        {
            try
            {
                Global.RefreshIndexSearcher();

                if (string.IsNullOrEmpty(discoveryType))
                {
                    discoveryType = "WebPage";
                }
                if (pageNumber == 0)
                {
                    pageNumber = 1;
                }
                if (pageSize == 0)
                {
                    pageSize = 20;
                }

                SearchResults <Lucene.Net.Documents.Document> searchResults = SearchManager.GetDocuments(Global.DefaultQueryParser, Global.CustomQueryParser, Global.IndexSearcher, query, (DiscoveryType)Enum.Parse(typeof(DiscoveryType), discoveryType), pageNumber, pageSize, shouldDocumentsBeClustered, sort, WebSettings.MaximumNumberOfDocumentsToReturnPerSearch);

                SearchResults <Document> searchResults2 = new SearchResults <Document>();

                searchResults2.Documents         = new List <Document>(searchResults.Documents.Count);
                searchResults2.TotalNumberOfHits = searchResults.TotalNumberOfHits;

                foreach (Lucene.Net.Documents.Document document in searchResults.Documents)
                {
                    try
                    {
                        Document document2 = new Document();

                        document2.AbsoluteUri   = document.GetField("absoluteuri").StringValue();
                        document2.Created       = DateTools.StringToDate(document.GetField("created").StringValue());
                        document2.DiscoveryID   = long.Parse(document.GetField("discoveryid").StringValue());
                        document2.DiscoveryPath = document.GetField("discoverypath").StringValue();
                        document2.Domain        = document.GetField("domain").StringValue();
                        document2.Extension     = document.GetField("extension").StringValue();
                        document2.Host          = document.GetField("host").StringValue();
                        document2.Scheme        = document.GetField("scheme").StringValue();
                        document2.Score         = float.Parse(document.GetField("relevancyscore").StringValue());
                        document2.Strength      = float.Parse(document.GetField("strength").StringValue());

                        string text = null;

                        if (File.Exists(document2.DiscoveryPath))
                        {
                            text = File.ReadAllText(document2.DiscoveryPath, Encoding.GetEncoding(int.Parse(document.GetField("codepage").StringValue())));
                        }
                        else
                        {
                            ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(document2.DiscoveryID.ToString());

                            //an empty Source means the row does not carry the source; treat it as missing rather than summarizing an empty string.
                            if (webPagesRow != null && webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                            {
                                text = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                            }
                        }

                        if (text != null)
                        {
                            document2.Summary = SearchManager.Summarize(searchResults.Query, searchResults.WildcardSafeQuery, shouldDocumentsBeClustered, text);
                        }
                        else
                        {
                            document2.Summary = "The WebPage source was not found in the database or on disk.";

                            //thrown and immediately caught; presumably so the recorded exception carries a stack trace.
                            try
                            {
                                throw new Exception("The WebPage source for " + document2.AbsoluteUri + " was not found in the database or on disk.");
                            }
                            catch (Exception exception)
                            {
                                ArachnodeDAO.InsertException(null, null, exception, false);
                            }
                        }

                        document2.Title = document.GetField("title").StringValue();

                        //'updated' is the only field checked for null before use.
                        if (document.GetField("updated") != null)
                        {
                            document2.Updated = DateTools.StringToDate(document.GetField("updated").StringValue());
                        }

                        searchResults2.Documents.Add(document2);
                    }
                    catch (Exception exception)
                    {
                        //one bad document must not fail the whole search; record it and move on.
                        ArachnodeDAO.InsertException(null, null, exception, false);
                    }
                }

                return(searchResults2);
            }
            catch (Exception exception)
            {
                ArachnodeDAO.InsertException(null, null, exception, false);
            }

            return(null);
        }
        /// <summary>
        ///     Handles the Load event of the Page control.
        ///     Renders one SearchResult control per document in Results, then the clustering toggle, the page-number links
        ///     and the Previous/Next links.  The request must carry exactly the keys query, discoveryType, pageNumber,
        ///     pageSize and shouldDocumentsBeClustered, in that order; any other request, or a non-positive pageSize, renders nothing.
        /// </summary>
        /// <param name = "sender">The source of the event.</param>
        /// <param name = "e">The <see cref = "System.EventArgs" /> instance containing the event data.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (Request.QueryString.Count == 5 && Request.QueryString.AllKeys[0] == "query" && Request.QueryString.AllKeys[1] == "discoveryType" && Request.QueryString.AllKeys[2] == "pageNumber" && Request.QueryString.AllKeys[3] == "pageSize" && Request.QueryString.AllKeys[4] == "shouldDocumentsBeClustered")
            {
                string query      = Request.QueryString["query"];
                int    pageNumber = int.Parse(Request.QueryString["pageNumber"]);
                int    pageSize   = int.Parse(Request.QueryString["pageSize"]);
                bool   shouldDocumentsBeClustered = Request.QueryString["shouldDocumentsBeClustered"] == "1";

                //a page size of zero or less keeps the page-link loop condition true indefinitely and would divide by zero
                //when computing uxHlNext.Visible; treat it like any other malformed request.
                if (pageSize <= 0)
                {
                    return;
                }

                //encoded once; every link built below carries these values in its query string.
                string encodedQuery         = HttpUtility.UrlEncode(query);
                string encodedDiscoveryType = HttpUtility.UrlEncode(Request.QueryString["discoveryType"]);
                string clusteredParameter   = shouldDocumentsBeClustered ? "&shouldDocumentsBeClustered=1" : "&shouldDocumentsBeClustered=0";

                if (Results != null)
                {
                    for (int i = 0; i < Results.Documents.Count; i++)
                    {
                        SearchResult searchResult = (SearchResult)LoadControl("SearchResult.ascx");

                        searchResult.Document = Results.Documents[i];
                        searchResult.InitializeAsUserControl(Page);
                        searchResult.ID = "uxUcSearchResult_" + i;

                        //AN will no longer populate the filesystem from database sources...

                        string text          = null;
                        string discoveryPath = Results.Documents[i].GetField("discoverypath").StringValue();

                        if (File.Exists(discoveryPath))
                        {
                            text = File.ReadAllText(discoveryPath, Encoding.GetEncoding(int.Parse(Results.Documents[i].GetField("codepage").StringValue())));
                        }
                        else
                        {
                            ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(Results.Documents[i].GetField("discoveryid").StringValue());

                            //a null Source is treated like an empty one so the 'not found' branch below reports it.
                            if (webPagesRow != null && webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                            {
                                text = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                            }
                        }

                        if (text != null)
                        {
                            searchResult.Summary = SearchManager.Summarize(Results.Query, Results.WildcardSafeQuery, shouldDocumentsBeClustered, text);
                        }
                        else
                        {
                            string notFoundMessage = "The WebPage source for " + Results.Documents[i].GetField("absoluteuri").StringValue() + " was not found in the database or on disk.";

                            searchResult.Summary = notFoundMessage;

                            //thrown and immediately caught; presumably so the recorded exception carries a stack trace.
                            try
                            {
                                throw new Exception(notFoundMessage);
                            }
                            catch (Exception exception)
                            {
                                ArachnodeDAO.InsertException(null, null, exception, false);
                            }

                            //the hit count feeds the paging links below.
                            Results.TotalNumberOfHits--;
                        }

                        uxPhSearchResults.Controls.Add(searchResult);
                    }

                    //the toggle link points at the current request with the clustering flag flipped.
                    if (shouldDocumentsBeClustered)
                    {
                        uxHlShouldDocumentsBeClustered.NavigateUrl = Request.Url.AbsoluteUri.Replace("shouldDocumentsBeClustered=1", "shouldDocumentsBeClustered=0");
                    }
                    else
                    {
                        uxHlShouldDocumentsBeClustered.NavigateUrl = Request.Url.AbsoluteUri.Replace("shouldDocumentsBeClustered=0", "shouldDocumentsBeClustered=1");
                    }

                    uxHlShouldDocumentsBeClustered.Visible = true;

                    //create the page links.
                    for (int i = 1; i * pageSize <= Results.TotalNumberOfHits + pageSize && i * pageSize <= WebSettings.MaximumNumberOfDocumentsToReturnPerSearch; i++)
                    {
                        HyperLink hyperLink = new HyperLink();

                        if (pageNumber != i)
                        {
                            hyperLink.CssClass = "pageNumber";

                            //the query is URL-encoded here exactly as it is for the Previous/Next links.
                            hyperLink.NavigateUrl = Request.Url.LocalPath + "?query=" + encodedQuery + "&discoveryType=" + encodedDiscoveryType + "&pageNumber=" + i + "&pageSize=" + pageSize + clusteredParameter;
                        }
                        else
                        {
                            //the current page is rendered without a link target.
                            hyperLink.CssClass = "currentPageNumber";
                        }

                        hyperLink.Text = i.ToString();

                        uxPhPages.Controls.Add(hyperLink);
                    }

                    uxLblPage.Visible = true;

                    //Previous and Next both keep the current clustering mode.
                    uxHlPrevious.Visible     = pageNumber > 1;
                    uxHlPrevious.NavigateUrl = "~/Search.aspx?query=" + encodedQuery + "&discoveryType=" + encodedDiscoveryType + "&pageNumber=" + (pageNumber - 1) + "&pageSize=" + pageSize + clusteredParameter;

                    uxHlNext.Visible     = pageNumber < Results.TotalNumberOfHits / pageSize && pageNumber < WebSettings.MaximumNumberOfDocumentsToReturnPerSearch / pageSize;
                    uxHlNext.NavigateUrl = "~/Search.aspx?query=" + encodedQuery + "&discoveryType=" + encodedDiscoveryType + "&pageNumber=" + (pageNumber + 1) + "&pageSize=" + pageSize + clusteredParameter;
                }
            }
        }
示例#8
0
        /// <summary>
        ///     Process a range of WebPageID after crawling.  Useful if crawled WebPages were not processed at crawl time according to desired ApplicationSettings configuration.
        ///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
        ///     This method is not used when crawling, but rather during post-processing.
        ///     A WebPage whose source is in neither the database nor on disk is reported as not processed and skipped.
        ///     Progress is written to the Console and raised through OnWebPageProcessed; per-WebPage exceptions are recorded and do not stop the run.
        /// </summary>
        /// <param name = "crawler">The crawler whose ApplicationSettings and WebSettings configure the managers created here.</param>
        /// <param name = "webPageIDLowerBound">The first WebPageID to process (inclusive).</param>
        /// <param name = "webPageIDUpperBound">The last WebPageID to process (inclusive).</param>
        public static void ProcessWebPages(Crawler <TArachnodeDAO> crawler, long webPageIDLowerBound, long webPageIDUpperBound)
        {
            //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

            ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            ActionManager <TArachnodeDAO>  actionManager  = new ActionManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CookieManager cookieManager = new CookieManager();
            MemoryManager <TArachnodeDAO>      memoryManager      = new MemoryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            RuleManager <TArachnodeDAO>        ruleManager        = new RuleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CacheManager <TArachnodeDAO>       cacheManager       = new CacheManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
            Cache <TArachnodeDAO>            cache            = new Cache <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
            DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
            HtmlManager <TArachnodeDAO>      htmlManager      = new HtmlManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager);

            //load the CrawlActions, CrawlRules and EngineActions...
            ruleManager.ProcessCrawlRules(crawler);
            actionManager.ProcessCrawlActions(crawler);
            actionManager.ProcessEngineActions(crawler);

            try
            {
                //these three methods are called in the Engine.
                UserDefinedFunctions.RefreshAllowedExtensions(true);
                UserDefinedFunctions.RefreshAllowedSchemes(true);
                UserDefinedFunctions.RefreshDisallowed();

                //instantiate a WebClient to access the ResponseHeaders...
                WebClient <TArachnodeDAO> webClient = new WebClient <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

                webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

                WebPageManager <TArachnodeDAO> webPageManager = new WebPageManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, htmlManager, arachnodeDAO);

                for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
                {
                    ArachnodeDataSet.WebPagesRow webPagesRow = null;

                    try
                    {
                        //get the WebPage from the database.  we need the source data as we don't store this in the index.
                        //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
                        webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                        if (webPagesRow != null)
                        {
                            if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                            {
                                string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

                                if (File.Exists(discoveryPath))
                                {
                                    //read and re-encode with the row's CodePage, as Source is decoded with that CodePage wherever it is read.
                                    Encoding encoding = Encoding.GetEncoding(webPagesRow.CodePage);

                                    webPagesRow.Source = encoding.GetBytes(File.ReadAllText(discoveryPath, encoding));
                                }
                                else
                                {
                                    Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                                    if (OnWebPageProcessed != null)
                                    {
                                        OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                                    }

                                    //there is no source to process; do not fall through and also report success.
                                    continue;
                                }
                            }

                            ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);

                            Console.WriteLine("WebPageID: " + i + " was processed successfully.");
                            if (OnWebPageProcessed != null)
                            {
                                OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was processed successfully.", null, null);
                            }
                        }
                    }
                    catch (Exception exception)
                    {
                        Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                        Console.WriteLine(exception.Message);

                        if (OnWebPageProcessed != null)
                        {
                            OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                            OnWebPageProcessed.BeginInvoke(webPagesRow, exception.Message, null, null);
                        }

                        arachnodeDAO.InsertException(null, null, exception, false);
                    }
                }
            }
            finally
            {
                //stop the CrawlActions, CrawlRules and EngineActions, even when the run ends with an exception...
                ruleManager.Stop();
                actionManager.Stop();
            }
        }
示例#9
0
        /// <summary>
        ///     Processes a stored WebPagesRow as though it had just been crawled: a CrawlRequest is rebuilt from the row
        ///     (AbsoluteUri, Source, CodePage, CrawlDepth and ResponseHeaders), the bytes are decoded, the WebPage is
        ///     optionally re-inserted, managed, scanned for EmailAddresses and HyperLinks, and the PostRequest CrawlActions are performed.
        /// </summary>
        /// <param name = "applicationSettings">The application settings; InsertWebPages, InsertWebPageSource, ClassifyAbsoluteUris, ExtractWebPageMetaData, InsertWebPageMetaData and SaveDiscoveredWebPagesToDisk are read here.</param>
        /// <param name = "webSettings">The web settings handed to the managers created by this method.</param>
        /// <param name = "crawler">The crawler handed to the Crawl created by this method.</param>
        /// <param name = "webPagesRow">The web pages row to process.</param>
        /// <param name = "webClient">The web client assigned to the CrawlRequest; its HttpWebResponse.Headers are cleared and repopulated from the row.</param>
        /// <param name = "cache">The cache handed to the CrawlRequestManager and the PolitenessManager.</param>
        /// <param name = "actionManager">The action manager used to perform the PostRequest CrawlActions.</param>
        /// <param name = "consoleManager">The console manager handed to the managers created by this method.</param>
        /// <param name = "crawlerPeerManager">Not used by this method.</param>
        /// <param name = "discoveryManager">The discovery manager; also closes and disposes the ManagedDiscovery when processing completes.</param>
        /// <param name = "memoryManager">Not used by this method.</param>
        /// <param name = "ruleManager">The rule manager handed to the Crawl created by this method.</param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
            crawlRequest.Discovery.ID            = webPagesRow.ID;
            crawlRequest.Data         = webPagesRow.Source;
            crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
            crawlRequest.Encoding     = Encoding.GetEncoding(webPagesRow.CodePage);
            crawlRequest.ProcessData  = true;
            crawlRequest.WebClient    = webClient;

            //the headers of the supplied WebClient are replaced by the headers stored with the WebPagesRow...
            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
            foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                //everything before the first ':' is the header name...
                string name = responseHeaderSplit[0];

                System.Data.SqlTypes.SqlString value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true);

                //SqlString.Value throws when the SqlString is Null (e.g. a stored line that isn't a 'name: value' pair)...
                //skip that line instead of aborting the processing of the entire WebPage...
                if (value.IsNull)
                {
                    continue;
                }

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value.Value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            //now, process the bytes...
            encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

            //when inserting, the Discovery.ID is replaced by the ID returned from the insert; otherwise the ID of the WebPagesRow is kept...
            if (applicationSettings.InsertWebPages)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
            crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
示例#10
0
        /// <summary>
        ///     Issues a GET for the CrawlRequest, determines its DataType, applies the PreGet CrawlRules and CrawlActions, and then
        ///     decides (per DiscoveryType) whether a previously stored copy of the Discovery can be reused or whether the content
        ///     has to be downloaded (or rendered) again.  crawlRequest.Data is never left null: an empty array is assigned when nothing was loaded.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request to process; its DataType, ProcessData and Data members are assigned here.</param>
        /// <param name = "obeyCrawlRules">If true, the PreGet CrawlRules are evaluated for the crawl request.</param>
        /// <param name = "executeCrawlActions">If true, the PreGet CrawlActions are performed for the crawl request.</param>
        public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
        {
            IssueWebRequest(crawlRequest, "GET");

            crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

            if (obeyCrawlRules)
            {
                //the return value is ignored; the outcome is read from crawlRequest.IsDisallowed below...
                //NOTE(review): presumably IsDisallowed(...) sets crawlRequest.IsDisallowed - confirm.
                _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO);
            }

            if (executeCrawlActions)
            {
                _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO);
            }

            if (!crawlRequest.IsDisallowed)
            {
                try
                {
                    if (crawlRequest.WebClient.HttpWebResponse != null)
                    {
                        //process by default; the checks below may decide that a stored copy can be reused instead...
                        crawlRequest.ProcessData = true;

                        bool isLastModifiedOutdated = true;

                        try
                        {
                            //NOTE(review): an inequality comparison against DateTime.Now is true unless both values match exactly, so the switch below is effectively always entered - confirm the intent.
                            isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now;
                        }
                        catch (Exception exception)
                        {
                            //reading LastModified failed; the exception is recorded and isLastModifiedOutdated keeps its default of true...
                            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                        }

                        if (isLastModifiedOutdated)
                        {
                            //decide, per DiscoveryType, whether the Discovery must be processed (ProcessData = true) or whether the stored copy is reused (ProcessData = false)...
                            switch (crawlRequest.DataType.DiscoveryType)
                            {
                            case DiscoveryType.File:
                                if (ApplicationSettings.AssignFileAndImageDiscoveries)     //ANODET: robots.txt
                                {
                                    ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri);

                                    if (filesRow == null)
                                    {
                                        //never stored before...
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        if (!filesRow.IsResponseHeadersNull())
                                        {
                                            DateTime lastModified;

                                            SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false);

                                            if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                            {
                                                //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                                if ((crawlRequest.WebClient.HttpWebResponse).LastModified > lastModified)
                                                {
                                                    crawlRequest.ProcessData = true;
                                                }
                                                else
                                                {
                                                    crawlRequest.ProcessData = false;
                                                }
                                            }
                                            else
                                            {
                                                //the stored 'Last-Modified' value is missing or cannot be parsed; the stored copy is reused...
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            //no ResponseHeaders were stored, so there is nothing to compare against...
                                            crawlRequest.ProcessData = true;
                                        }

                                        if (!crawlRequest.ProcessData)
                                        {
                                            //reuse the stored bytes: the 'Source' column first, then the file on disk...
                                            if (filesRow.Source.Length != 0)
                                            {
                                                crawlRequest.Data = filesRow.Source;
                                            }
                                            else
                                            {
                                                string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                if (File.Exists(discoveryPath))
                                                {
                                                    crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                }
                                                else
                                                {
                                                    //the exception is thrown and caught immediately so that it can be recorded through InsertException...
                                                    try
                                                    {
                                                        throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory.  Therefore, the data was re-downloaded from the server.  The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false.");
                                                    }
                                                    catch (Exception exception)
                                                    {
                                                        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                    }

                                                    //no stored copy could be found; fall back to downloading...
                                                    crawlRequest.ProcessData = true;
                                                }
                                            }
                                        }
                                    }
                                }
                                else
                                {
                                    //Files are not processed when AssignFileAndImageDiscoveries is disabled...
                                    crawlRequest.ProcessData = false;
                                }
                                break;

                            case DiscoveryType.Image:
                                //same decision as for Files, against the Images table and the DownloadedImagesDirectory...
                                if (ApplicationSettings.AssignFileAndImageDiscoveries)
                                {
                                    ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                    if (imagesRow == null)
                                    {
                                        //never stored before...
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        if (!imagesRow.IsResponseHeadersNull())
                                        {
                                            DateTime lastModified;

                                            SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false);

                                            if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                            {
                                                //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                                if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                                {
                                                    crawlRequest.ProcessData = true;
                                                }
                                                else
                                                {
                                                    crawlRequest.ProcessData = false;
                                                }
                                            }
                                            else
                                            {
                                                //the stored 'Last-Modified' value is missing or cannot be parsed; the stored copy is reused...
                                                crawlRequest.ProcessData = false;
                                            }

                                            if (!crawlRequest.ProcessData)
                                            {
                                                //reuse the stored bytes: the 'Source' column first, then the file on disk...
                                                if (imagesRow.Source.Length != 0)
                                                {
                                                    crawlRequest.Data = imagesRow.Source;
                                                }
                                                else
                                                {
                                                    string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                    if (File.Exists(discoveryPath))
                                                    {
                                                        crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                    }
                                                    else
                                                    {
                                                        //the exception is thrown and caught immediately so that it can be recorded through InsertException...
                                                        try
                                                        {
                                                            throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory.  Therefore, the data was downloaded from the server.  The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared.  A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false.");
                                                        }
                                                        catch (Exception exception)
                                                        {
                                                            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                        }

                                                        //no stored copy could be found; fall back to downloading...
                                                        crawlRequest.ProcessData = true;
                                                    }
                                                }
                                            }
                                        }
                                        else
                                        {
                                            //no ResponseHeaders were stored, so there is nothing to compare against...
                                            crawlRequest.ProcessData = true;
                                        }
                                    }
                                }
                                else
                                {
                                    //Images are not processed when AssignFileAndImageDiscoveries is disabled...
                                    crawlRequest.ProcessData = false;
                                }
                                break;

                            case DiscoveryType.WebPage:
                                ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (webPagesRow == null)
                                {
                                    //never stored before...
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    //unlike Files and Images, WebPages are compared against the row's LastDiscovered value rather than a stored 'Last-Modified' header...
                                    if ((crawlRequest.WebClient.HttpWebResponse).LastModified > webPagesRow.LastDiscovered)
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = false;
                                    }

                                    if (!crawlRequest.ProcessData)
                                    {
                                        //reuse the stored bytes: the 'Source' column first, then the file on disk...
                                        if (webPagesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = webPagesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                //the exception is thrown and caught immediately so that it can be recorded through InsertException...
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory.  Therefore, the data was re-downloaded from the server.  The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                //no stored copy could be found; fall back to downloading...
                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                                break;

                            case DiscoveryType.None:
                                crawlRequest.ProcessData = true;
                                break;
                            }
                        }
                    }
                }
                catch (Exception exception)
                {
                    //any failure while deciding is recorded; ProcessData keeps whatever value it had when the exception occurred...
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }

                if (crawlRequest.ProcessData)
                {
                    //NOTE(review): this block is empty and has no effect...
                    if (crawlRequest.Data != null)
                    {
                    }

                    if (crawlRequest.RenderType == RenderType.None)
                    {
                        //no rendering requested: download the raw bytes over HTTP or FTP...
                        if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp")
                        {
                            //a response obtained with HEAD carries no body; issue a GET before downloading...
                            if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD")
                            {
                                IssueWebRequest(crawlRequest, "GET");
                            }

                            if (crawlRequest.WebClient.HttpWebResponse != null)
                            {
                                //the two boolean arguments are true when the ContentEncoding of the response is 'gzip' and 'deflate', respectively...
                                crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                            }
                        }
                        else
                        {
                            crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri);
                        }
                    }
                    else
                    {
                        //rendering requested: the Data, Encoding and Html members are taken from the rendered document instead of a download...
                        RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType);

                        if (rendererResponse != null)
                        {
                            if (rendererResponse.HTMLDocumentClass != null)
                            {
                                crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset);

                                string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML;

                                crawlRequest.Data         = crawlRequest.Encoding.GetBytes(outerHTML);
                                crawlRequest.DecodedHtml  = HttpUtility.HtmlDecode(outerHTML);
                                crawlRequest.Html         = outerHTML;
                                crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass;
                            }

                            crawlRequest.RendererMessage = rendererResponse.RendererMessage;
                        }
                    }
                }
            }
            else
            {
                //NOTE(review): this block is empty and has no effect...
                if (crawlRequest.Data == null)
                {
                }
            }

            //callers always receive a non-null Data array...
            if (crawlRequest.Data == null)
            {
                crawlRequest.Data = new byte[0];
            }
        }
        /// <summary>
        ///     Verifies that each WebPagesRow between Min(ID) and Max(ID) of the WebPages table has at least one document
        ///     in the Lucene index whose 'discoveryid' matches the row's ID and whose 'discoverytype' is 'webpage'.
        /// </summary>
        public void TestThatAllWebPagesAreInTheIndex()
        {
            int minID;
            int maxID;

            //the connection is only needed to read the ID range; 'using' closes and disposes it even if a query throws...
            using (SqlConnection sqlConnection = new SqlConnection("Data Source=.;Initial Catalog=arachnode.net;Integrated Security=True;Connection Timeout=3600;"))
            {
                using (SqlCommand sqlCommand = new SqlCommand("Select Min(ID) From WebPages", sqlConnection))
                {
                    sqlConnection.Open();

                    using (SqlDataReader sqlDataReader = sqlCommand.ExecuteReader())
                    {
                        sqlDataReader.Read();
                        minID = int.Parse(sqlDataReader.GetValue(0).ToString());
                    }

                    sqlCommand.CommandText = "Select Max(ID) From WebPages";

                    using (SqlDataReader sqlDataReader = sqlCommand.ExecuteReader())
                    {
                        sqlDataReader.Read();
                        maxID = int.Parse(sqlDataReader.GetValue(0).ToString());
                    }
                }
            }

            ApplicationSettings applicationSettings = new ApplicationSettings();

            ArachnodeDAO arachnodeDAO = new ArachnodeDAO(applicationSettings.ConnectionString);

            IndexSearcher indexSearcher = new IndexSearcher(FSDirectory.Open(new DirectoryInfo("M:\\LDNI")), true);

            try
            {
                StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
                QueryParser      queryParser      = new QueryParser("discoveryid", standardAnalyzer);

                for (int i = minID; i <= maxID; i++)
                {
                    Debug.Print(i.ToString());

                    ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                    //not every ID between Min(ID) and Max(ID) necessarily has a row; there is nothing to verify for a missing WebPage...
                    if (webPagesRow == null)
                    {
                        continue;
                    }

                    Query query = queryParser.Parse("\"" + webPagesRow.ID + "\"");

                    Hits hits = indexSearcher.Search(query);

                    bool containsTheWebPage = false;

                    for (int j = 0; j < hits.Length(); j++)
                    {
                        if (hits.Doc(j).GetField("discoverytype").StringValue() == "webpage")
                        {
                            containsTheWebPage = true;

                            //one matching document is sufficient...
                            break;
                        }
                    }

                    if (!containsTheWebPage)
                    {
                        //ANODET: Set Breakpoint...
                    }

                    Assert.IsTrue(containsTheWebPage, "WebPageID: " + i + " was not found in the index.");
                }
            }
            finally
            {
                //release the index even when an assertion fails...
                indexSearcher.Close();
            }
        }