/// <summary>
///     Gets the source for a WebPage, preferring the copy stored in the database and falling back
///     to the file saved under ApplicationSettings.DownloadedWebPagesDirectory.
/// </summary>
/// <param name="webPageAbsoluteUriOrID">The AbsoluteUri or the ID of the WebPage.</param>
/// <param name="arachnodeDAO">The ArachnodeDAO used to look up the WebPages row.</param>
/// <returns>The decoded WebPage source, or null when the WebPages row does not exist.</returns>
/// <exception cref="Exception">
///     Thrown when the Application configuration was not initialized, or when the source exists in
///     neither the database nor on disk.
/// </exception>
public override string GetWebPageSource(string webPageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
{
    if (ApplicationSettings.DownloadedWebPagesDirectory == null)
    {
        throw new Exception("_applicationSettings.DownloadedWebPagesDirectory is null. This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
    }

    string webPageSource = null;

    ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(webPageAbsoluteUriOrID);

    if (webPagesRow != null)
    {
        //BUGFIX: guard against a null Source as well as an empty one - other call sites treat both as "not stored in the database"...
        if (webPagesRow.Source != null && webPagesRow.Source.Length != 0)
        {
            webPageSource = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
        }
        else
        {
            //fall back to the copy saved on disk...
            string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

            if (!File.Exists(discoveryPath))
            {
                throw new Exception("Could not find the WebPage Source in the database or on disk.");
            }

            webPageSource = File.ReadAllText(discoveryPath, Encoding.GetEncoding(webPagesRow.CodePage));
        }
    }

    return (webPageSource);
}
/// <summary>
///     Handles the Load event of the Page control: looks up the requested WebPage and renders its
///     source, reading from the database when available and from disk otherwise.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
protected void Page_Load(object sender, EventArgs e)
{
    EnableViewState = false;

    try
    {
        bool hasExpectedQueryString = Request.QueryString.Count == 2 &&
                                      Request.QueryString.AllKeys[0] == "discoveryID" &&
                                      Request.QueryString.AllKeys[1] == "absoluteUri";

        if (!hasExpectedQueryString)
        {
            return;
        }

        ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

        if (webPagesRow == null)
        {
            uxLblException.Text = "The WebPage was not found in the database.";
            uxLblException.Visible = true;
            return;
        }

        string webPageSource;

        if (webPagesRow.Source != null && webPagesRow.Source.Length != 0)
        {
            //the database holds the source...
            webPageSource = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
        }
        else
        {
            //fall back to the copy saved on disk...
            string pathOnDisk = DiscoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

            if (!File.Exists(pathOnDisk))
            {
                uxLblException.Text = "The WebPage source was not found in the database or on disk.";
                uxLblException.Visible = true;
                return;
            }

            webPageSource = File.ReadAllText(pathOnDisk, Encoding.GetEncoding(webPagesRow.CodePage));
        }

        //Request.Url.Scheme + "://" + Request.Url.Authority
        //ANODET: Should this be a configuration setting? Perhaps - hotlinking isn't exactly polite, but does provide the best user experience. (Version 1.5)
        uxLWebPage.Text = HtmlManager.CreateHtmlDocument(webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType, webPageSource, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, true).DocumentNode.OuterHtml;
    }
    catch (Exception exception)
    {
        uxLblException.Text = exception.Message;
        uxLblException.Visible = true;

        ArachnodeDAO.InsertException(null, null, exception, false);
    }
}
/// <summary>
///     Handles the Load event of the Page control: renders a WebPage from a decrypted on-disk
///     path, falling back to the database copy when the file is not present on disk.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
protected void Page_Load(object sender, EventArgs e)
{
    EnableViewState = false;

    try
    {
        if (Request.QueryString.Count == 5 && Request.QueryString.AllKeys[0] == "discoveryID" && Request.QueryString.AllKeys[1] == "absoluteUri" && Request.QueryString.AllKeys[2] == "webPage" && Request.QueryString.AllKeys[3] == "codePage" && Request.QueryString.AllKeys[4] == "fullTextIndexType")
        {
            string source = null;

            //decrypt once - the "webPage" parameter carries the encrypted on-disk path. (the original code decrypted the same value twice.)
            string webPagePath = Encryption.DecryptRijndaelManaged(Request.QueryString["webPage"]);

            if (File.Exists(webPagePath))
            {
                source = File.ReadAllText(webPagePath, Encoding.GetEncoding(int.Parse(Request.QueryString["codePage"])));
            }
            else
            {
                ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

                //BUGFIX: guard against a null Source as well as an empty one, as the sibling viewer page does...
                if (webPagesRow != null && webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                {
                    source = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                }
            }

            if (source != null)
            {
                //ANODET: Should this be a configuration setting? Perhaps - hotlinking isn't exactly polite, but does provide the best user experience. (Version 1.5)
                uxLWebPage.Text = HtmlManager.CreateHtmlDocument(Request.QueryString["absoluteUri"], Request.QueryString["fullTextIndexType"], source, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, false).DocumentNode.OuterHtml;
            }
            else
            {
                uxLWebPage.Text = "The WebPage source was not found in the database or on disk.";

                //throw/catch so the inserted exception carries a stack trace...
                try
                {
                    throw new Exception("The WebPage source for " + HttpUtility.UrlDecode(Request.QueryString["absoluteUri"]) + " was not found in the database or on disk.");
                }
                catch (Exception exception)
                {
                    ArachnodeDAO.InsertException(null, null, exception, false);
                }
            }
        }
    }
    catch (Exception exception)
    {
        ArachnodeDAO.InsertException(null, null, exception, false);
    }
}
/// <summary>
///     Prepends a post-processing status message to the status RichTextBox, marshaling the update
///     onto the UI thread and capping the displayed text at 10,000 characters.
/// </summary>
/// <param name="webPagesRow">The WebPages row that was processed (required by the event signature; not used here).</param>
/// <param name="message">The status message to display.</param>
private void WebPageUtilities_OnWebPageProcessed(ArachnodeDataSet.WebPagesRow webPagesRow, string message)
{
    BeginInvoke(new MethodInvoker(delegate
                                      {
                                          //newest message first...
                                          rtbPostProcessingStatus.Text = message + Environment.NewLine + rtbPostProcessingStatus.Text;

                                          //keep the status text from growing without bound...
                                          if (rtbPostProcessingStatus.Text.Length > 10000)
                                          {
                                              rtbPostProcessingStatus.Text = rtbPostProcessingStatus.Text.Substring(0, 10000);
                                          }
                                      }));
}
/// <summary>
///     Handles the ValueChanged event of the WebPageID NumericUpDown: loads the selected WebPage,
///     updates the discovery-path link, and optionally auto-views the WebPage.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
private void nudWebPageID_ValueChanged(object sender, EventArgs e)
{
    _webPagesRow = _arachnodeDAO.GetWebPage(nudWebPageID.Value.ToString());

    if (_webPagesRow == null)
    {
        llWebPageDiscoveryPathDirectory.Visible = false;

        wbMain.DocumentText = "The WebPage with the ID of " + nudWebPageID.Value + " does not exist.";
        return;
    }

    _webPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedWebPagesDirectory, _webPagesRow.AbsoluteUri, _webPagesRow.FullTextIndexType);

    llWebPageDiscoveryPathDirectory.Visible = true;
    llWebPageDiscoveryPathDirectory.Text = Path.GetDirectoryName(_webPageDiscoveryPath);

    //render the WebPage immediately when auto-view is enabled...
    if (cbAutoView.Checked)
    {
        btnViewWebPage_Click(sender, e);
    }
}
/// <summary>
///     Executes a search against the Lucene index and projects the matching Lucene documents into
///     serializable Documents, generating a summary for each from the stored WebPage source.
/// </summary>
/// <param name="query">The search query.</param>
/// <param name="discoveryType">The DiscoveryType to search; defaults to "WebPage" when null or empty.</param>
/// <param name="pageNumber">The 1-based page number; defaults to 1 when 0.</param>
/// <param name="pageSize">The page size; defaults to 20 when 0.</param>
/// <param name="shouldDocumentsBeClustered">Whether the documents should be clustered.</param>
/// <param name="sort">The sort expression, passed through to SearchManager.GetDocuments as-is.</param>
/// <returns>The projected search results, or null when the search itself throws.</returns>
public SearchResults<Document> Search(string query, string discoveryType, int pageNumber, int pageSize, bool shouldDocumentsBeClustered, string sort)
{
    try
    {
        Global.RefreshIndexSearcher();

        //apply defaults for unspecified parameters...
        if (string.IsNullOrEmpty(discoveryType))
        {
            discoveryType = "WebPage";
        }

        if (pageNumber == 0)
        {
            pageNumber = 1;
        }

        if (pageSize == 0)
        {
            pageSize = 20;
        }

        //NOTE(review): the original code contained an empty "if (string.IsNullOrEmpty(sort))" branch - presumably a default sort was intended but never implemented. the (possibly empty) sort value is passed through unchanged, as before.

        SearchResults<Lucene.Net.Documents.Document> searchResults = SearchManager.GetDocuments(Global.DefaultQueryParser, Global.CustomQueryParser, Global.IndexSearcher, query, (DiscoveryType)Enum.Parse(typeof(DiscoveryType), discoveryType), pageNumber, pageSize, shouldDocumentsBeClustered, sort, WebSettings.MaximumNumberOfDocumentsToReturnPerSearch);

        SearchResults<Document> searchResults2 = new SearchResults<Document>();

        searchResults2.Documents = new List<Document>(searchResults.Documents.Count);
        searchResults2.TotalNumberOfHits = searchResults.TotalNumberOfHits;

        foreach (Lucene.Net.Documents.Document document in searchResults.Documents)
        {
            try
            {
                Document document2 = new Document();

                document2.AbsoluteUri = document.GetField("absoluteuri").StringValue();
                document2.Created = DateTools.StringToDate(document.GetField("created").StringValue());
                document2.DiscoveryID = long.Parse(document.GetField("discoveryid").StringValue());
                document2.DiscoveryPath = document.GetField("discoverypath").StringValue();
                document2.Domain = document.GetField("domain").StringValue();
                document2.Extension = document.GetField("extension").StringValue();
                document2.Host = document.GetField("host").StringValue();
                document2.Scheme = document.GetField("scheme").StringValue();
                document2.Score = float.Parse(document.GetField("relevancyscore").StringValue());
                document2.Strength = float.Parse(document.GetField("strength").StringValue());

                //prefer the source saved on disk; fall back to the database copy...
                string text = null;

                if (File.Exists(document2.DiscoveryPath))
                {
                    text = File.ReadAllText(document2.DiscoveryPath, Encoding.GetEncoding(int.Parse(document.GetField("codepage").StringValue())));
                }
                else
                {
                    ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(document2.DiscoveryID.ToString());

                    if (webPagesRow != null && webPagesRow.Source != null)
                    {
                        text = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                    }
                }

                if (text != null)
                {
                    document2.Summary = SearchManager.Summarize(searchResults.Query, searchResults.WildcardSafeQuery, shouldDocumentsBeClustered, text);
                }
                else
                {
                    document2.Summary = "The WebPage source was not found in the database or on disk.";

                    //throw/catch so the inserted exception carries a stack trace...
                    try
                    {
                        throw new Exception("The WebPage source for " + document2.AbsoluteUri + " was not found in the database or on disk.");
                    }
                    catch (Exception exception)
                    {
                        ArachnodeDAO.InsertException(null, null, exception, false);
                    }
                }

                document2.Title = document.GetField("title").StringValue();

                if (document.GetField("updated") != null)
                {
                    document2.Updated = DateTools.StringToDate(document.GetField("updated").StringValue());
                }

                searchResults2.Documents.Add(document2);
            }
            catch (Exception exception)
            {
                //a single malformed index document should not fail the entire result set...
                ArachnodeDAO.InsertException(null, null, exception, false);
            }
        }

        return (searchResults2);
    }
    catch (Exception exception)
    {
        ArachnodeDAO.InsertException(null, null, exception, false);
    }

    return (null);
}
/// <summary>
///     Handles the Load event of the Page control: renders the search results, the clustering
///     toggle link, the page-number links, and the previous/next navigation links.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
protected void Page_Load(object sender, EventArgs e)
{
    if (Request.QueryString.Count == 5 && Request.QueryString.AllKeys[0] == "query" && Request.QueryString.AllKeys[1] == "discoveryType" && Request.QueryString.AllKeys[2] == "pageNumber" && Request.QueryString.AllKeys[3] == "pageSize" && Request.QueryString.AllKeys[4] == "shouldDocumentsBeClustered")
    {
        string query = Request.QueryString["query"];
        int pageNumber = int.Parse(Request.QueryString["pageNumber"]);
        int pageSize = int.Parse(Request.QueryString["pageSize"]);
        bool shouldDocumentsBeClustered = Request.QueryString["shouldDocumentsBeClustered"] == "1";

        if (Results != null)
        {
            for (int i = 0; i < Results.Documents.Count; i++)
            {
                SearchResult searchResult = (SearchResult)LoadControl("SearchResult.ascx");
                searchResult.Document = Results.Documents[i];
                searchResult.InitializeAsUserControl(Page);
                searchResult.ID = "uxUcSearchResult_" + i;

                //AN will no longer populate the filesystem from database sources...
                //prefer the source saved on disk; fall back to the database copy...
                string text = null;

                if (File.Exists(Results.Documents[i].GetField("discoverypath").StringValue()))
                {
                    text = File.ReadAllText(Results.Documents[i].GetField("discoverypath").StringValue(), Encoding.GetEncoding(int.Parse(Results.Documents[i].GetField("codepage").StringValue())));
                }
                else
                {
                    ArachnodeDataSet.WebPagesRow webPagesRow = ArachnodeDAO.GetWebPage(Results.Documents[i].GetField("discoveryid").StringValue());

                    //BUGFIX: guard against a null Source as well as an empty one...
                    if (webPagesRow != null && webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                    {
                        text = Encoding.GetEncoding(webPagesRow.CodePage).GetString(webPagesRow.Source);
                    }
                }

                if (text != null)
                {
                    searchResult.Summary = SearchManager.Summarize(Results.Query, Results.WildcardSafeQuery, shouldDocumentsBeClustered, text);
                }
                else
                {
                    searchResult.Summary = "The WebPage source for " + Results.Documents[i].GetField("absoluteuri").StringValue() + " was not found in the database or on disk.";

                    //throw/catch so the inserted exception carries a stack trace...
                    try
                    {
                        throw new Exception("The WebPage source for " + Results.Documents[i].GetField("absoluteuri").StringValue() + " was not found in the database or on disk.");
                    }
                    catch (Exception exception)
                    {
                        ArachnodeDAO.InsertException(null, null, exception, false);
                    }

                    Results.TotalNumberOfHits--;
                }

                uxPhSearchResults.Controls.Add(searchResult);
            }

            //the toggle link flips the current clustering setting...
            if (shouldDocumentsBeClustered)
            {
                uxHlShouldDocumentsBeClustered.NavigateUrl = Request.Url.AbsoluteUri.Replace("shouldDocumentsBeClustered=1", "shouldDocumentsBeClustered=0");
            }
            else
            {
                uxHlShouldDocumentsBeClustered.NavigateUrl = Request.Url.AbsoluteUri.Replace("shouldDocumentsBeClustered=0", "shouldDocumentsBeClustered=1");
            }

            uxHlShouldDocumentsBeClustered.Visible = true;

            //create the page links.
            for (int i = 1; i * pageSize <= Results.TotalNumberOfHits + pageSize && i * pageSize <= WebSettings.MaximumNumberOfDocumentsToReturnPerSearch; i++)
            {
                HyperLink hyperLink = new HyperLink();

                if (pageNumber != i)
                {
                    hyperLink.CssClass = "pageNumber";
                    //BUGFIX: UrlEncode the query, as the previous/next links already did...
                    hyperLink.NavigateUrl = Request.Url.LocalPath + "?query=" + HttpUtility.UrlEncode(query) + "&discoveryType=" + Request.QueryString["discoveryType"] + "&pageNumber=" + i + "&pageSize=" + pageSize;

                    if (shouldDocumentsBeClustered)
                    {
                        hyperLink.NavigateUrl += "&shouldDocumentsBeClustered=1";
                    }
                    else
                    {
                        hyperLink.NavigateUrl += "&shouldDocumentsBeClustered=0";
                    }
                }
                else
                {
                    hyperLink.CssClass = "currentPageNumber";
                }

                hyperLink.Text = i.ToString();

                uxPhPages.Controls.Add(hyperLink);
            }

            uxLblPage.Visible = true;

            uxHlPrevious.Visible = pageNumber > 1;
            uxHlPrevious.NavigateUrl = "~/Search.aspx?query=" + HttpUtility.UrlEncode(query) + "&discoveryType=" + Request.QueryString["discoveryType"] + "&pageNumber=" + (pageNumber - 1) + "&pageSize=" + pageSize;

            //BUGFIX: the previous link formerly INVERTED the clustering setting; preserve it, as the page-number and next links do...
            if (shouldDocumentsBeClustered)
            {
                uxHlPrevious.NavigateUrl += "&shouldDocumentsBeClustered=1";
            }
            else
            {
                uxHlPrevious.NavigateUrl += "&shouldDocumentsBeClustered=0";
            }

            uxHlNext.Visible = pageNumber < Results.TotalNumberOfHits / pageSize && pageNumber < WebSettings.MaximumNumberOfDocumentsToReturnPerSearch / pageSize;
            uxHlNext.NavigateUrl = "~/Search.aspx?query=" + HttpUtility.UrlEncode(query) + "&discoveryType=" + Request.QueryString["discoveryType"] + "&pageNumber=" + (pageNumber + 1) + "&pageSize=" + pageSize;

            if (shouldDocumentsBeClustered)
            {
                uxHlNext.NavigateUrl += "&shouldDocumentsBeClustered=1";
            }
            else
            {
                uxHlNext.NavigateUrl += "&shouldDocumentsBeClustered=0";
            }
        }
    }
}
/// <summary>
///     Process a range of WebPageIDs after crawling. Useful if crawled WebPages were not processed at crawl time according to desired ApplicationSettings configuration.
///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
///     This method is not used when crawling, rather during post-processing.
/// </summary>
/// <param name="crawler">The crawler supplying the ApplicationSettings and WebSettings.</param>
/// <param name="webPageIDLowerBound">The inclusive lower bound of the WebPageID range.</param>
/// <param name="webPageIDUpperBound">The inclusive upper bound of the WebPageID range.</param>
public static void ProcessWebPages(Crawler<TArachnodeDAO> crawler, long webPageIDLowerBound, long webPageIDUpperBound)
{
    //do not assign the application settings. doing so will override the ApplicationSetting you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called in the Engine.
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders...
    WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    WebPageManager<TArachnodeDAO> webPageManager = new WebPageManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, htmlManager, arachnodeDAO);

    for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
    {
        ArachnodeDataSet.WebPagesRow webPagesRow = null;

        try
        {
            //get the WebPage from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
            webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

            if (webPagesRow != null)
            {
                if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                {
                    //compute the discovery path once rather than twice, as the original code did...
                    string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

                    if (File.Exists(discoveryPath))
                    {
                        using (StreamReader streamReader = File.OpenText(discoveryPath))
                        {
                            //NOTE(review): the on-disk source is re-encoded as UTF-8 regardless of webPagesRow.CodePage - confirm this is intended for non-UTF-8 pages.
                            webPagesRow.Source = Encoding.UTF8.GetBytes(streamReader.ReadToEnd());
                        }
                    }
                    else
                    {
                        //NOTE(review): processing still proceeds below even when no source could be found - confirm this fall-through is intended.
                        Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");

                        if (OnWebPageProcessed != null)
                        {
                            OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                        }
                    }
                }

                ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);

                Console.WriteLine("WebPageID: " + i + " was processed successfully.");

                if (OnWebPageProcessed != null)
                {
                    OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            if (OnWebPageProcessed != null)
            {
                OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                OnWebPageProcessed.BeginInvoke(webPagesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
/// <summary>
///     Processes a WebPagesRow after crawling by rebuilding a CrawlRequest from the stored row
///     (source bytes, encoding, crawl depth and response headers) and re-running the post-crawl
///     pipeline: encoding, optional re-insertion, disk management, e-mail address/hyperlink
///     extraction and post-request CrawlActions.
/// </summary>
/// <param name="applicationSettings">The application settings governing insertion/extraction behavior.</param>
/// <param name="webSettings">The web settings.</param>
/// <param name="crawler">The crawler.</param>
/// <param name="webPagesRow">The WebPages row to process; supplies the AbsoluteUri, Source, CodePage, CrawlDepth and ResponseHeaders.</param>
/// <param name="webClient">The web client whose HttpWebResponse headers are cleared and repopulated from the row.</param>
/// <param name="cache">The cache.</param>
/// <param name="actionManager">The action manager used for post-request CrawlActions.</param>
/// <param name="consoleManager">The console manager.</param>
/// <param name="crawlerPeerManager">The crawler peer manager.</param>
/// <param name="discoveryManager">The discovery manager used to close and dispose the managed discovery.</param>
/// <param name="memoryManager">The memory manager.</param>
/// <param name="ruleManager">The rule manager.</param>
/// <param name="webPageManager">The web page manager used to (re)manage the WebPage on disk.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    //build the managers the Crawl requires; these are per-call so this method can be invoked outside an Engine...
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    //populate the CrawlRequest from the stored row: source bytes, depth and the encoding recorded at crawl time...
    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
    crawlRequest.Discovery.ID = webPagesRow.ID;
    crawlRequest.Data = webPagesRow.Source;
    crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
    crawlRequest.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    //discard whatever headers the shared WebClient last received; they belong to a different request...
    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
    //the split yields the header name; the value is re-extracted from the full string so values containing ':' are preserved...
    foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    //now, process the bytes...
    encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

    //re-insert the WebPage when configured to do so; the Discovery.ID is refreshed from the insert...
    if (applicationSettings.InsertWebPages)
    {
        crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] {}, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

    //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
    crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
    crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
///     Performs the "GET" phase of a crawl: issues the WebRequest, determines the DataType, applies the
///     PreGet CrawlRules/CrawlActions, and then — per DiscoveryType — compares the server's 'Last-Modified'
///     response header against what is already stored (Files/Images/WebPages tables or the download
///     directories on disk) to decide whether the data must be re-downloaded or can be reused as-is.
///     Populates crawlRequest.Data (never left null; defaults to an empty byte[]).
/// </summary>
/// <param name="crawlRequest">The CrawlRequest being processed; Data/ProcessData/Encoding/Html/HtmlDocument are populated here.</param>
/// <param name="obeyCrawlRules">Whether PreGet CrawlRules are evaluated (these may set crawlRequest.IsDisallowed).</param>
/// <param name="executeCrawlActions">Whether PreGet CrawlActions are executed.</param>
public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    //issue the initial request and classify the response before any rules/actions run...
    IssueWebRequest(crawlRequest, "GET");

    crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

    if (obeyCrawlRules)
    {
        _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO);
    }

    if (executeCrawlActions)
    {
        _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO);
    }

    if (!crawlRequest.IsDisallowed)
    {
        try
        {
            if (crawlRequest.WebClient.HttpWebResponse != null)
            {
                //assume the data must be (re-)downloaded until the staleness checks below prove otherwise...
                crawlRequest.ProcessData = true;

                bool isLastModifiedOutdated = true;

                try
                {
                    //NOTE(review): HttpWebResponse.LastModified defaults to ~DateTime.Now when the
                    //'Last-Modified' header is absent, so this exact-inequality comparison is almost always
                    //true — presumably "assume outdated unless proven otherwise"; confirm intent.
                    isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now;
                }
                catch (Exception exception)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }

                if (isLastModifiedOutdated)
                {
                    //per-DiscoveryType: try to reuse stored data (database 'Source' column, else disk) when the stored copy is not stale...
                    switch (crawlRequest.DataType.DiscoveryType)
                    {
                        case DiscoveryType.File:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries) //ANODET: robots.txt
                            {
                                ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (filesRow == null)
                                {
                                    //never seen before - download it...
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!filesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;

                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if ((crawlRequest.WebClient.HttpWebResponse).LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            //stored headers have no parseable 'Last-Modified' - reuse the stored copy...
                                            crawlRequest.ProcessData = false;
                                        }
                                    }
                                    else
                                    {
                                        //no stored headers to compare against - download...
                                        crawlRequest.ProcessData = true;
                                    }

                                    if (!crawlRequest.ProcessData)
                                    {
                                        //reuse: prefer the database 'Source' column, then fall back to disk...
                                        if (filesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = filesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                //NOTE(review): the exception is thrown and immediately caught purely to record a diagnostic via InsertException...
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory. Therefore, the data was re-downloaded from the server. The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.Image:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries)
                            {
                                ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (imagesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!imagesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;

                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = false;
                                        }

                                        //NOTE(review): unlike the File case above, this reuse block is nested INSIDE the
                                        //!IsResponseHeadersNull() branch; observable behavior matches (ProcessData is true
                                        //when headers are null), but the inconsistency is worth confirming/unifying.
                                        if (!crawlRequest.ProcessData)
                                        {
                                            if (imagesRow.Source.Length != 0)
                                            {
                                                crawlRequest.Data = imagesRow.Source;
                                            }
                                            else
                                            {
                                                string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                if (File.Exists(discoveryPath))
                                                {
                                                    crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                }
                                                else
                                                {
                                                    //thrown/caught purely to log a diagnostic - see the File case above...
                                                    try
                                                    {
                                                        throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory. Therefore, the data was downloaded from the server. The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared. A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false.");
                                                    }
                                                    catch (Exception exception)
                                                    {
                                                        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                    }

                                                    crawlRequest.ProcessData = true;
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.WebPage:
                            ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri);

                            if (webPagesRow == null)
                            {
                                crawlRequest.ProcessData = true;
                            }
                            else
                            {
                                //WebPages compare against LastDiscovered rather than a stored 'Last-Modified' response header...
                                if ((crawlRequest.WebClient.HttpWebResponse).LastModified > webPagesRow.LastDiscovered)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    crawlRequest.ProcessData = false;
                                }

                                if (!crawlRequest.ProcessData)
                                {
                                    if (webPagesRow.Source.Length != 0)
                                    {
                                        crawlRequest.Data = webPagesRow.Source;
                                    }
                                    else
                                    {
                                        string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                        if (File.Exists(discoveryPath))
                                        {
                                            crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                        }
                                        else
                                        {
                                            //thrown/caught purely to log a diagnostic - see the File case above...
                                            try
                                            {
                                                throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory. Therefore, the data was re-downloaded from the server. The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false.");
                                            }
                                            catch (Exception exception)
                                            {
                                                _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                            }

                                            crawlRequest.ProcessData = true;
                                        }
                                    }
                                }
                            }
                            break;
                        case DiscoveryType.None:
                            crawlRequest.ProcessData = true;
                            break;
                    }
                }
            }
        }
        catch (Exception exception)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
        }
    }

    if (crawlRequest.ProcessData)
    {
        //NOTE(review): empty block - presumably a debugger breakpoint anchor; confirm before removing...
        if (crawlRequest.Data != null)
        {
        }

        if (crawlRequest.RenderType == RenderType.None)
        {
            if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp")
            {
                //a HEAD probe satisfied the initial request - re-issue as GET so a body is available to download...
                if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD")
                {
                    IssueWebRequest(crawlRequest, "GET");
                }

                if (crawlRequest.WebClient.HttpWebResponse != null)
                {
                    //the two booleans select gzip/deflate decompression based on the Content-Encoding response header...
                    crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                }
            }
            else
            {
                crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri);
            }
        }
        else
        {
            //render the page (e.g. to execute script) and harvest the resulting DOM instead of the raw bytes...
            RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType);

            if (rendererResponse != null)
            {
                if (rendererResponse.HTMLDocumentClass != null)
                {
                    crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset);

                    string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML;

                    crawlRequest.Data = crawlRequest.Encoding.GetBytes(outerHTML);
                    crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(outerHTML);
                    crawlRequest.Html = outerHTML;
                    crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass;
                }

                crawlRequest.RendererMessage = rendererResponse.RendererMessage;
            }
        }
    }
    else
    {
        //NOTE(review): empty block - presumably a debugger breakpoint anchor; confirm before removing...
        if (crawlRequest.Data == null)
        {
        }
    }

    //downstream consumers expect a non-null Data buffer...
    if (crawlRequest.Data == null)
    {
        crawlRequest.Data = new byte[0];
    }
}
/// <summary>
///     Verifies that every WebPage stored in the WebPages database table (by ID, from Min(ID) to Max(ID))
///     has a corresponding "webpage" Document in the Lucene index at M:\LDNI, searched via the
///     'discoveryid' field.
/// </summary>
public void TestThatAllWebPagesAreInTheIndex()
{
    int minID;
    int maxID;

    //using blocks guarantee the connection/command are disposed even when an Assert throws (the
    //original leaked the connection on failure and never closed the IndexSearcher)...
    using (SqlConnection sqlConnection = new SqlConnection("Data Source=.;Initial Catalog=arachnode.net;Integrated Security=True;Connection Timeout=3600;"))
    using (SqlCommand sqlCommand = new SqlCommand("Select Min(ID) From WebPages", sqlConnection))
    {
        sqlConnection.Open();

        //ExecuteScalar is the idiomatic way to read a single aggregate value...
        minID = int.Parse(sqlCommand.ExecuteScalar().ToString());

        sqlCommand.CommandText = "Select Max(ID) From WebPages";
        maxID = int.Parse(sqlCommand.ExecuteScalar().ToString());

        ApplicationSettings applicationSettings = new ApplicationSettings();
        ArachnodeDAO arachnodeDAO = new ArachnodeDAO(applicationSettings.ConnectionString);

        IndexSearcher indexSearcher = new IndexSearcher(FSDirectory.Open(new DirectoryInfo("M:\\LDNI")), true);

        try
        {
            StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
            QueryParser queryParser = new QueryParser("discoveryid", standardAnalyzer);

            for (int i = minID; i <= maxID; i++)
            {
                Debug.Print(i.ToString());

                ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                //exact-phrase query against the 'discoveryid' field...
                Query query = queryParser.Parse("\"" + webPagesRow.ID + "\"");

                Hits hits = indexSearcher.Search(query);

                bool containsTheWebPageAbsoluteUri = false;

                for (int j = 0; j < hits.Length(); j++)
                {
                    if (hits.Doc(j).GetField("discoverytype").StringValue() == "webpage")
                    {
                        containsTheWebPageAbsoluteUri = true;
                        //found - no need to scan the remaining hits...
                        break;
                    }
                }

                if (!containsTheWebPageAbsoluteUri)
                {
                    //ANODET: Set Breakpoint...
                }

                Assert.IsTrue(containsTheWebPageAbsoluteUri);
            }
        }
        finally
        {
            //release the index file handles regardless of test outcome...
            indexSearcher.Close();
        }
    }
}