/// <summary>
/// Decompresses a LZF compressed byte array to a string.
/// </summary>
/// <param name="compressedHtml">LZF compressed data.</param>
/// <returns>
/// Decompressed string data.
/// </returns>
public static string DecompressLZF(byte[] compressedHtml)
{
    var lzf = new LZF();

    // NOTE(review): assumes the decompressed payload never exceeds 10x the
    // compressed size — confirm against the compression site's worst-case ratio.
    var buffer = new byte[compressedHtml.Length * 10];

    var decompressedLength = lzf.Decompress(compressedHtml, compressedHtml.Length, buffer, buffer.Length);

    // Trim the scratch buffer down to the bytes actually produced before decoding.
    Array.Resize(ref buffer, decompressedLength);

    return System.Text.Encoding.UTF8.GetString(buffer);
}
/// <summary>
/// Strips configured markup and whitespace from the crawl request's HTML, stores the
/// result (optionally LZF-compressed) back onto the request, and never disallows it.
/// </summary>
/// <param name="crawlRequest">The crawl request; its Html, Data and DecodedHtml are updated in place.</param>
/// <param name="arachnodeDAO">The arachnode DAO (unused here; required by the override signature).</param>
/// <returns>
/// Always <c>false</c> — this rule transforms content rather than filtering requests.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    if (string.IsNullOrEmpty(crawlRequest.Html))
    {
        return false;
    }

    var html = crawlRequest.Html;

    if (_removeCarriageReturns)
    {
        html = Replace(html, "\r", "", StringComparison.CurrentCulture);
    }
    if (_removeNewLines)
    {
        html = Replace(html, "\n", "", StringComparison.CurrentCulture);
    }
    if (_removeMultiWhiteSpaces)
    {
        html = MultiWhiteSpace.Replace(html, "");
    }
    if (_removeNonPrintableChars)
    {
        html = NonPrintableChars.Replace(html, "");
    }
    if (_removeHeadTags)
    {
        html = HeadTag.Replace(html, "");
    }
    if (_removeScriptTags)
    {
        html = ScriptTag.Replace(html, "");
    }
    if (_removeStyleTags)
    {
        html = StyleTag.Replace(html, "");
    }
    if (_removeCommentsTags)
    {
        html = CommentsTag.Replace(html, "");
    }
    if (_removeLinkTags)
    {
        html = LinkTag.Replace(html, "");
    }
    if (_removeIFrameTags)
    {
        html = IFrameTag.Replace(html, "");
    }
    if (_removeMetaTags)
    {
        html = MetaTag.Replace(html, "");
    }

    if (_lzfCompress)
    {
        // Only allocate the compressor when compression is actually requested
        // (the original constructed it unconditionally).
        var lzf = new LZF();

        var data = System.Text.Encoding.UTF8.GetBytes(html);

        // LZF can expand incompressible input; the 36000-byte slack mirrors the
        // original sizing. NOTE(review): confirm this covers LZF's worst case.
        var destination = new byte[data.Length + 36000];

        var size = lzf.Compress(data, data.Length, destination, destination.Length);

        if (size > 0)
        {
            // Bulk copy replaces the original byte-by-byte loop (which also
            // mixed a UInt32 index with an int bound).
            var compressed = new byte[size];
            Array.Copy(destination, compressed, size);

            crawlRequest.Data = compressed;
        }
        else
        {
            // FIX: the original left Data unassigned when Compress returned 0;
            // fall back to the uncompressed bytes so Data is always populated.
            crawlRequest.Data = data;
        }
    }
    else
    {
        crawlRequest.Data = System.Text.Encoding.UTF8.GetBytes(html);
    }

    crawlRequest.Html = html;
    crawlRequest.DecodedHtml = System.Web.HttpUtility.HtmlDecode(html);

    return false;
}