/// <summary>
/// Decodes a downloaded web page and, depending on the supplied flags, extracts its
/// metadata (parsed HTML document, tags and text), inserts that metadata through the
/// DAO, and/or opens a writer at the page's discovery path and writes the decoded source to it.
/// </summary>
/// <param name = "webPageID">The web page ID, passed through to the metadata insert.</param>
/// <param name = "absoluteUri">The absolute URI of the web page.</param>
/// <param name = "source">The raw bytes of the web page.</param>
/// <param name = "encoding">The encoding used to decode <paramref name = "source" /> and to write the page to disk.</param>
/// <param name = "fullTextIndexType">The full text index type, passed through to the discovery path lookup.</param>
/// <param name = "extractWebPageMetaData">if set to <c>true</c> [extract web page meta data].</param>
/// <param name = "insertWebPageMetaData">if set to <c>true</c> [insert web page meta data]. Only honored when <paramref name = "extractWebPageMetaData" /> is also <c>true</c>.</param>
/// <param name = "saveWebPageToDisk">if set to <c>true</c> [save web page to disk].</param>
/// <returns>
/// The populated <see cref = "ManagedWebPage" />, or <c>null</c> if an exception was thrown while managing the page.
/// When <paramref name = "saveWebPageToDisk" /> is <c>true</c> the returned instance carries an open StreamWriter
/// that this method neither flushes nor closes.
/// </returns>
public override ManagedWebPage ManageWebPage(long webPageID, string absoluteUri, byte[] source, Encoding encoding, string fullTextIndexType, bool extractWebPageMetaData, bool insertWebPageMetaData, bool saveWebPageToDisk)
{
    // Declared outside the try so the catch block can release a writer that was opened before the failure.
    ManagedWebPage managedWebPage = null;

    try
    {
        managedWebPage = new ManagedWebPage();

        string source2 = null;

        // The decoded string is only needed for metadata extraction and for saving to disk.
        if (extractWebPageMetaData || saveWebPageToDisk)
        {
            source2 = encoding.GetString(source);
        }

        if (extractWebPageMetaData)
        {
            // Tags and text are extracted from the HTML-decoded source; the HTML document is built from the undecoded string.
            string source3 = HttpUtility.HtmlDecode(source2);

            //ANODET: Enable the HtmlAgilityPack to work with bytes.
            managedWebPage.HtmlDocument = _htmlManager.CreateHtmlDocument(source2, Encoding.Unicode);

            managedWebPage.Tags = UserDefinedFunctions.ExtractTags(source3).Value;
            managedWebPage.Text = UserDefinedFunctions.ExtractText(source3).Value;

            // NOTE: an experimental character-by-character tag stripper (compared against regular expressions)
            // previously lived here as commented-out code; it was never active and has been removed.

            if (insertWebPageMetaData)
            {
                _arachnodeDAO.InsertWebPageMetaData(webPageID, absoluteUri, encoding.GetBytes(managedWebPage.Text), managedWebPage.HtmlDocument.DocumentNode.OuterHtml);
            }
        }

        if (saveWebPageToDisk)
        {
            managedWebPage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, absoluteUri, fullTextIndexType);

            // The writer is intentionally left open on success (presumably closed by the caller - confirm).
            managedWebPage.StreamWriter = new StreamWriter(managedWebPage.DiscoveryPath, false, encoding);

            managedWebPage.StreamWriter.Write(source2);
        }

        return managedWebPage;
    }
    catch (Exception exception)
    {
        // The page is discarded on failure (null is returned), so a writer opened before the failure
        // would otherwise keep its file handle until finalization. Release it here.
        if (managedWebPage != null && managedWebPage.StreamWriter != null)
        {
            try
            {
                managedWebPage.StreamWriter.Dispose();
            }
            catch (Exception)
            {
                // Best effort: the original failure is the one worth recording below.
            }

            managedWebPage.StreamWriter = null;
        }

        //ANODET: Long paths...
#if !DEMO
        _arachnodeDAO.InsertException(absoluteUri, null, exception, false);
#endif
    }

    return null;
}