Example #1
0
        /// <summary>
        ///     Decompresses a LZF compressed byte array and decodes the result as UTF-8 text.
        /// </summary>
        /// <param name = "compressedHtml">LZF compressed data.</param>
        /// <returns>
        ///     Decompressed string data, or an empty string when the input is empty or the
        ///     decompressor reports that no bytes were produced.
        /// </returns>
        /// <exception cref = "ArgumentNullException"><paramref name = "compressedHtml" /> is <c>null</c>.</exception>
        public static string DecompressLZF(byte[] compressedHtml)
        {
            if (compressedHtml == null)
            {
                throw new ArgumentNullException("compressedHtml");
            }

            if (compressedHtml.Length == 0)
            {
                return(string.Empty);
            }

            // First guess for the output size, and how the guess grows when it turns out to be too small.
            const int initialExpansionFactor = 10;
            const int growthFactor           = 4;
            const int maxAttempts            = 3;

            var lzf = new LZF();

            // Computed as long so that a large input cannot overflow int before the clamp is applied.
            var capacity = Math.Min((long) compressedHtml.Length * initialExpansionFactor, int.MaxValue);

            for (var attempt = 1; ; attempt++)
            {
                var decompressedHtml = new byte[capacity];
                var size             = lzf.Decompress(compressedHtml, compressedHtml.Length, decompressedHtml, decompressedHtml.Length);

                if (size != 0)
                {
                    // Decode only the bytes that were written; this avoids copying the buffer just to trim it.
                    return(System.Text.Encoding.UTF8.GetString(decompressedHtml, 0, size));
                }

                // NOTE(review): assumes LZF.Decompress returns 0 when the output buffer is too small
                // (it may also return 0 for corrupt input) - confirm against the LZF implementation.
                // The retries are bounded so that undecodable input still ends with an empty string.
                if (attempt >= maxAttempts || capacity >= int.MaxValue)
                {
                    return(string.Empty);
                }

                capacity = Math.Min(capacity * growthFactor, int.MaxValue);
            }
        }
Example #2
0
        /// <summary>
        ///     Strips the configured content (line breaks, white space, non-printable characters and selected tags)
        ///     from the crawl request's HTML and writes the result back to the crawl request.
        /// </summary>
        /// <remarks>
        ///     When the crawl request has no HTML nothing is modified.  Otherwise <c>Html</c> receives the stripped
        ///     markup, <c>DecodedHtml</c> receives its HTML-decoded form, and <c>Data</c> receives the UTF-8 bytes of
        ///     the stripped markup - LZF compressed when compression is enabled.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.  Not used by this method.</param>
        /// <returns>
        ///     Always <c>false</c>; this method only rewrites the crawl request and never disallows it.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            if (string.IsNullOrEmpty(crawlRequest.Html))
            {
                return(false);
            }

            var html = crawlRequest.Html;

            // Line-break characters are matched ordinally: a culture-sensitive search can treat "\r\n" as a
            // single unit and then fails to find the individual "\r" or "\n".
            // NOTE(review): assumes the Replace helper forwards the comparison to its string search - confirm.
            if (_removeCarriageReturns)
            {
                html = Replace(html, "\r", "", StringComparison.Ordinal);
            }
            if (_removeNewLines)
            {
                html = Replace(html, "\n", "", StringComparison.Ordinal);
            }
            if (_removeMultiWhiteSpaces)
            {
                html = MultiWhiteSpace.Replace(html, "");
            }
            if (_removeNonPrintableChars)
            {
                html = NonPrintableChars.Replace(html, "");
            }
            if (_removeHeadTags)
            {
                html = HeadTag.Replace(html, "");
            }
            if (_removeScriptTags)
            {
                html = ScriptTag.Replace(html, "");
            }
            if (_removeStyleTags)
            {
                html = StyleTag.Replace(html, "");
            }
            if (_removeCommentsTags)
            {
                html = CommentsTag.Replace(html, "");
            }
            if (_removeLinkTags)
            {
                html = LinkTag.Replace(html, "");
            }
            if (_removeIFrameTags)
            {
                html = IFrameTag.Replace(html, "");
            }
            if (_removeMetaTags)
            {
                html = MetaTag.Replace(html, "");
            }

            var data = System.Text.Encoding.UTF8.GetBytes(html);

            if (_lzfCompress)
            {
                var lzf = new LZF();

                // The output buffer is given fixed headroom beyond the input size (margin kept from the
                // original code).
                var destination = new byte[data.Length + 36000];
                int size        = lzf.Compress(data, data.Length, destination, destination.Length);

                // A non-positive size leaves crawlRequest.Data as it was.
                // NOTE(review): presumably this means compression failed - confirm whether the uncompressed
                // bytes should be stored in that case.
                if (size > 0)
                {
                    // Trim the buffer to the bytes actually written.
                    Array.Resize(ref destination, size);
                    crawlRequest.Data = destination;
                }
            }
            else
            {
                crawlRequest.Data = data;
            }

            crawlRequest.Html        = html;
            crawlRequest.DecodedHtml = System.Web.HttpUtility.HtmlDecode(html);

            return(false);
        }