Ejemplo n.º 1
0
        /// <summary>
        ///     Builds a <see cref="ManagedWebPage"/> from the downloaded bytes of a web page:
        ///     optionally extracts (and stores) its meta data and optionally opens a file on disk
        ///     and writes the decoded page to it.
        /// </summary>
        /// <param name = "webPageID">The web page ID, forwarded when the meta data is inserted.</param>
        /// <param name = "absoluteUri">The absolute URI of the web page.</param>
        /// <param name = "source">The raw bytes of the web page.</param>
        /// <param name = "encoding">The encoding used to decode <paramref name = "source" /> and to write the page to disk.</param>
        /// <param name = "fullTextIndexType">The full text index type, passed to the discovery manager when the on-disk path is resolved.</param>
        /// <param name = "extractWebPageMetaData">if set to <c>true</c>, the HTML document, tags and text are extracted from the page.</param>
        /// <param name = "insertWebPageMetaData">if set to <c>true</c>, the extracted meta data is inserted; only honored when <paramref name = "extractWebPageMetaData" /> is also <c>true</c>.</param>
        /// <param name = "saveWebPageToDisk">if set to <c>true</c>, the decoded page is written to its discovery path.</param>
        /// <returns>
        ///     The populated <see cref="ManagedWebPage"/>, or <c>null</c> when any step throws
        ///     (the exception is recorded through the DAO in non-DEMO builds and is not rethrown).
        /// </returns>
        public override ManagedWebPage ManageWebPage(long webPageID, string absoluteUri, byte[] source, Encoding encoding, string fullTextIndexType, bool extractWebPageMetaData, bool insertWebPageMetaData, bool saveWebPageToDisk)
        {
            // Tracked outside the try block so the file handle can be released when a later step throws;
            // otherwise the writer would only be reachable through the discarded ManagedWebPage.
            StreamWriter streamWriter = null;

            try
            {
                ManagedWebPage managedWebPage = new ManagedWebPage();

                string decodedSource = null;

                // The decoded string is needed by both the extraction and the save-to-disk paths.
                if (extractWebPageMetaData || saveWebPageToDisk)
                {
                    decodedSource = encoding.GetString(source);
                }

                if (extractWebPageMetaData)
                {
                    // Tags and text are extracted from the HTML-decoded form; the HtmlDocument is built from the undecoded string.
                    string htmlDecodedSource = HttpUtility.HtmlDecode(decodedSource);

                    //ANODET: Enable the HtmlAgilityPack to work with bytes.
                    managedWebPage.HtmlDocument = _htmlManager.CreateHtmlDocument(decodedSource, Encoding.Unicode);
                    managedWebPage.Tags         = UserDefinedFunctions.ExtractTags(htmlDecodedSource).Value;
                    managedWebPage.Text         = UserDefinedFunctions.ExtractText(htmlDecodedSource).Value;

                    // NOTE: an experimental, commented-out character-by-character tag stripper (kept for comparison
                    // against the ExtractText approach) used to live here; it was dead code and has been removed.

                    if (insertWebPageMetaData)
                    {
                        _arachnodeDAO.InsertWebPageMetaData(webPageID, absoluteUri, encoding.GetBytes(managedWebPage.Text), managedWebPage.HtmlDocument.DocumentNode.OuterHtml);
                    }
                }

                if (saveWebPageToDisk)
                {
                    managedWebPage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, absoluteUri, fullTextIndexType);

                    streamWriter = new StreamWriter(managedWebPage.DiscoveryPath, false, encoding);

                    // The writer is handed to the returned page still open.
                    // NOTE(review): presumably the caller flushes/closes it - confirm against the call sites.
                    managedWebPage.StreamWriter = streamWriter;

                    managedWebPage.StreamWriter.Write(decodedSource);
                }

                return managedWebPage;
            }
            catch (Exception exception)
            {
                // The ManagedWebPage is discarded on failure, so nobody else can close the writer.
                if (streamWriter != null)
                {
                    try
                    {
                        streamWriter.Dispose();
                    }
                    catch (Exception)
                    {
                        // Best effort only: the original failure is the one worth reporting, and this method must not throw.
                    }
                }

                //ANODET: Long paths...
#if !DEMO
                _arachnodeDAO.InsertException(absoluteUri, null, exception, false);
#endif
            }

            return null;
        }