/// <summary>
/// Runs an HTML fragment through Tidy (strict doctype, XHTML output) and returns only the
/// markup that follows the last <c>&lt;body&gt;</c> tag of the generated document, with the
/// closing <c>&lt;/body&gt;</c> and <c>&lt;/html&gt;</c> tags removed.
/// </summary>
/// <param name="inputstr">The HTML fragment to convert. May be null or blank.</param>
/// <returns>
/// The tidied body markup, or an empty string when the input is null/blank or the Tidy output
/// contains no <c>&lt;body&gt;</c> tag.
/// </returns>
public static string Convert(string inputstr)
{
    if (string.IsNullOrWhiteSpace(inputstr))
    {
        return "";
    }

    Tidy t = new Tidy();
    t.Options.DocType = DocType.Strict;
    t.Options.Xhtml = true;
    // The bytes handed to Tidy below are UTF-8 and its output is decoded as UTF-8,
    // so Tidy is told explicitly to work in UTF-8 as well.
    t.Options.CharEncoding = CharEncoding.UTF8;

    string temp;
    // The original used UTF8Encoding.Default, which resolves to the static Encoding.Default
    // (not necessarily UTF-8) and therefore did not match the UTF-8 decode of the output.
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(inputstr)))
    using (MemoryStream output = new MemoryStream())
    {
        t.Parse(input, output, new TidyMessageCollection());
        temp = Encoding.UTF8.GetString(output.ToArray());
    }

    const string bodyOpenTag = "<body>";
    int bodyStart = temp.LastIndexOf(bodyOpenTag, StringComparison.Ordinal);
    if (bodyStart < 0)
    {
        // No body tag in the output: return nothing instead of failing in Substring(-1).
        return "";
    }

    return temp.Substring(bodyStart + bodyOpenTag.Length)
               .Replace("</body>", "")
               .Replace("</html>", "");
}
/// <summary>
/// Passes <paramref name="markup"/> (encoded as UTF-8 bytes) through the supplied Tidy
/// instance and returns the text Tidy wrote to its output stream.
/// </summary>
/// <param name="markup">The markup to parse.</param>
/// <param name="tidy">The Tidy instance that performs the parse.</param>
/// <param name="tidyMessages">Receives the messages collected during the parse.</param>
/// <returns>The text read back from Tidy's output stream.</returns>
/// <exception cref="InvalidOperationException">
/// The message collection reports one or more errors; the exception message lists every
/// error-level entry.
/// </exception>
private static string ParseMarkup(string markup, Tidy tidy, out TidyMessageCollection tidyMessages)
{
    tidyMessages = new TidyMessageCollection();

    string parsedMarkup;
    using (var source = new MemoryStream(Encoding.UTF8.GetBytes(markup)))
    using (var sink = new MemoryStream())
    {
        tidy.Parse(source, sink, tidyMessages);

        // Rewind so the reader starts at the beginning of what Tidy wrote.
        sink.Position = 0;
        using (var reader = new C1StreamReader(sink))
        {
            parsedMarkup = reader.ReadToEnd();
        }
    }

    if (tidyMessages.Errors <= 0)
    {
        return parsedMarkup;
    }

    var errors = new StringBuilder();
    foreach (TidyMessage entry in tidyMessages)
    {
        if (entry.Level != MessageLevel.Error)
        {
            continue;
        }

        errors.AppendLine(entry.ToString());
    }

    throw new InvalidOperationException($"Failed to parse html:\n\n{errors}");
}
/// <summary>
/// Cleans HTML documents or fragments into XHTML conformant markup.
/// </summary>
/// <param name="htmlMarkup">The html to clean.</param>
/// <returns>
/// A <see cref="TidyHtmlResult"/> whose <c>Output</c> is the parsed XHTML document and whose
/// <c>ErrorSummary</c> holds the warning-level Tidy messages, one per line.
/// </returns>
public static TidyHtmlResult TidyHtml(string htmlMarkup)
{
    Tidy tidy = GetXhtmlConfiguredTidy();

    List<string> prefixedElementNames = LocateNamespacePrefixedElementNames(htmlMarkup);
    Dictionary<string, string> prefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);

    // Prefixed element names that do not start with any of the declared prefixes.
    List<string> undeclaredElementNames = prefixedElementNames
        .Where(name => prefixToUri.All(declaration => !name.StartsWith(declaration.Key)))
        .ToList();

    AllowNamespacePrefixedElementNames(tidy, prefixedElementNames);
    AllowHtml5ElementNames(tidy);

    TidyMessageCollection tidyMessages;
    string xhtml = ParseMarkup(htmlMarkup, tidy, out tidyMessages);

    // Make sure the root element carries the XHTML namespace declaration.
    if (xhtml.IndexOf("<html>") >= 0)
    {
        xhtml = xhtml.Replace("<html>", "<html xmlns=\"http://www.w3.org/1999/xhtml\">");
    }

    if (xhtml.IndexOf("xmlns=\"http://www.w3.org/1999/xhtml\"") < 0)
    {
        xhtml = xhtml.Replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"");
    }

    xhtml = RemoveDuplicateAttributes(xhtml);
    xhtml = RemoveXmlDeclarations(xhtml);
    xhtml = UndoLowerCasingOfElementNames(xhtml, prefixedElementNames);
    xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, prefixToUri);

    // Only warnings go into the summary.
    var warnings = new StringBuilder();
    foreach (TidyMessage entry in tidyMessages)
    {
        if (entry.Level != MessageLevel.Warning)
        {
            continue;
        }

        warnings.AppendLine(entry.ToString());
    }

    List<string> undeclaredPrefixes = undeclaredElementNames
        .Select(name => name.Substring(0, name.IndexOf(':')))
        .Union(LocateAttributeNamespacePrefixes(xhtml))
        .Distinct()
        .Where(prefix => IsValidXmlName(prefix))
        .ToList();

    XDocument outputResult;
    if (!undeclaredElementNames.Any())
    {
        outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
    }
    else
    {
        // Bind every collected prefix to the marker namespace '#bad' on a wrapper element so
        // the markup parses, then strip all attributes and elements in that namespace.
        string badDeclared = string.Join(" ", undeclaredPrefixes.Select(p => $"xmlns:{p}='#bad'"));
        XDocument badDoc = XDocument.Parse($"<root {badDeclared}>{xhtml}</root>");

        badDoc.Descendants().Attributes().Where(a => a.Name.Namespace == "#bad").Remove();
        badDoc.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();

        // Unwrap: the first descendant of the wrapper becomes the result's root.
        outputResult = new XDocument(badDoc.Root.Descendants().First());
    }

    return new TidyHtmlResult
    {
        Output = outputResult,
        ErrorSummary = warnings.ToString()
    };
}
/// <summary>
/// Cleans an HTML fragment with Tidy (UTF-8, strict doctype, XHTML output, font tags dropped,
/// logical emphasis, "make clean") and returns the markup found inside the generated
/// <c>&lt;body&gt;</c> element.
/// </summary>
/// <param name="str">The HTML to clean. May be null.</param>
/// <returns>
/// The body content with one line break removed from each end, or an empty string when
/// <paramref name="str"/> is null or the Tidy output has no body element.
/// </returns>
public static string tidy(string str)
{
    if (str == null)
    {
        return "";
    }

    Tidy parser = new Tidy();
    parser.Options.CharEncoding = CharEncoding.UTF8;
    parser.Options.DocType = DocType.Strict;
    parser.Options.DropFontTags = true;
    parser.Options.LogicalEmphasis = true;
    parser.Options.Xhtml = true;
    parser.Options.MakeClean = true;
    parser.Options.TidyMark = true;
    parser.Options.TabSize = 0;

    string outputString;
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(str)))
    using (MemoryStream output = new MemoryStream())
    {
        // Pasted Word documents have been observed to lose content at this step.
        parser.Parse(input, output, new TidyMessageCollection());
        outputString = Encoding.UTF8.GetString(output.ToArray());
    }

    // Locate the opening tag by its name so a body tag that carries attributes is found too.
    int openTagStart = outputString.IndexOf("<body", StringComparison.Ordinal);
    if (openTagStart < 0)
    {
        return "";
    }

    int openTagEnd = outputString.IndexOf('>', openTagStart);
    if (openTagEnd < 0)
    {
        return "";
    }

    int contentStart = openTagEnd + 1;
    int contentEnd = outputString.IndexOf("</body>", contentStart, StringComparison.Ordinal);
    if (contentEnd < 0)
    {
        return "";
    }

    string body = outputString.Substring(contentStart, contentEnd - contentStart);

    // The original skipped a fixed two characters on each side (assuming "\r\n"), which cut
    // into the content whenever that assumption did not hold. Remove a line break only if
    // one is actually there.
    if (body.StartsWith("\r\n", StringComparison.Ordinal))
    {
        body = body.Substring(2);
    }
    else if (body.StartsWith("\n", StringComparison.Ordinal))
    {
        body = body.Substring(1);
    }

    if (body.EndsWith("\r\n", StringComparison.Ordinal))
    {
        body = body.Substring(0, body.Length - 2);
    }
    else if (body.EndsWith("\n", StringComparison.Ordinal))
    {
        body = body.Substring(0, body.Length - 1);
    }

    return body;
}
// Repairs bad HTML with Tidy (http://sourceforge.net/projects/tidynet/) using its default
// options and returns everything Tidy wrote, decoded as UTF-8.
static string CleanHtml(string badHtmlString)
{
    Tidy tidy = new Tidy();
    byte[] rawMarkup = Encoding.UTF8.GetBytes(badHtmlString);

    MemoryStream source = new MemoryStream(rawMarkup);
    MemoryStream sink = new MemoryStream();
    tidy.Parse(source, sink, new TidyMessageCollection());

    byte[] cleaned = sink.ToArray();
    return Encoding.UTF8.GetString(cleaned);
}
/// <summary>
/// Produces an indented HTML serialization of the field value. The value is run through
/// Tidy, the text between the body tags is extracted and trimmed, and the result (wrapped
/// in line breaks) is stored in <c>args.ValueSerialized</c>.
/// </summary>
/// <param name="args">The pipeline arguments; must not be null.</param>
/// <remarks>
/// Does nothing when a serialized value already exists, when the normal value is blank, or
/// when the field type key is not one of <c>supportedFieldTypeKeys</c> (compared
/// case-insensitively with the invariant culture).
/// </remarks>
protected override void DoProcess(FieldSerializationPipelineArgs args)
{
    Assert.ArgumentNotNull(args, "args");

    if (args.ValueSerialized != null)
    {
        return;
    }

    if (string.IsNullOrWhiteSpace(args.ValueNormal))
    {
        return;
    }

    bool isSupportedType = supportedFieldTypeKeys.Any(
        key => key.Equals(args.FieldTypeKey, StringComparison.InvariantCultureIgnoreCase));
    if (!isSupportedType)
    {
        return;
    }

    Tidy tidy = new Tidy();
    var options = tidy.Options;
    options.DocType = DocType.Omit;
    options.TidyMark = true;
    options.IndentContent = true;

    TidyMessageCollection messages = new TidyMessageCollection();

    using (MemoryStream source = new MemoryStream())
    using (MemoryStream sink = new MemoryStream())
    {
        byte[] valueBytes = Encoding.UTF8.GetBytes(args.ValueNormal);
        source.Write(valueBytes, 0, valueBytes.Length);
        source.Position = 0;

        tidy.Parse(source, sink, messages);

        string html = Encoding.UTF8.GetString(sink.ToArray());

        const string bodyTag = "<body>";
        const string bodyCloseTag = "</body>";

        // Drop everything up to and including the opening body tag...
        int bodyStart = html.IndexOf(bodyTag);
        if (bodyStart > 0)
        {
            html = html.Substring(bodyStart + bodyTag.Length);
        }

        // ...and everything from the closing body tag onwards.
        int bodyEnd = html.IndexOf(bodyCloseTag);
        if (bodyEnd > 0)
        {
            html = html.Substring(0, bodyEnd);
        }

        args.ValueSerialized = Environment.NewLine + html.Trim() + Environment.NewLine;
        args.FieldSerializationType = FieldSerializationType.Html;
    }
}
/// <summary>
/// Indents the given html source.
/// </summary>
/// <param name="htmlSource">The html source.</param>
/// <returns>A string with the new source.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlSource"/> is null.</exception>
public String IndentContent(String htmlSource)
{
    if (htmlSource == null)
    {
        throw new ArgumentNullException(nameof(htmlSource));
    }

    Tidy tidy = new Tidy();
    tidy.Options.IndentContent = true;
    // The source used to be passed as UTF-16 (Encoding.Unicode), for which Tidy offers no
    // character encoding mode. Pass UTF-8 bytes instead and tell Tidy so, so that the bytes
    // it reads and the bytes decoded below use the same encoding.
    tidy.Options.CharEncoding = CharEncoding.UTF8;

    TidyMessageCollection tmc = new TidyMessageCollection();
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
    using (MemoryStream output = new MemoryStream())
    {
        tidy.Parse(input, output, tmc);
        return Encoding.UTF8.GetString(output.ToArray());
    }
}
/// <summary>
/// Cleans up HTML with Tidy using the Word 2000 and "make clean" options and passes the
/// result through <c>RemoveTidyAdditions</c>.
/// </summary>
/// <param name="text">The HTML to tidy.</param>
/// <returns>The value returned by <c>RemoveTidyAdditions</c> for the tidied markup.</returns>
public static string TidyHtml(string text)
{
    byte[] utf8Markup = Encoding.UTF8.GetBytes(text);
    var source = new MemoryStream(utf8Markup);
    var sink = new MemoryStream();

    var tidy = new Tidy();
    var options = tidy.Options;

    // An earlier configuration (XHTML output, strict doctype, UTF-8 char encoding, logical
    // emphasis, smart indent, ...) was disabled: XHTML output caused problems with "font"
    // tags, occasionally mangling them into "fontface=...".
    options.TidyMark = false;
    options.MakeClean = true;
    options.Word2000 = true;
    options.EncloseText = true;

    // Required to stop spaces being removed, and tabs added etc...
    options.Spaces = 0;
    options.WrapLen = 32000;

    tidy.Parse(source, sink, new TidyMessageCollection());

    string tidied = Encoding.UTF8.GetString(sink.ToArray());
    return RemoveTidyAdditions(tidied);
}
/// <summary>
/// Converts HTML to XHTML with Tidy and returns the children of the resulting body element
/// as an XML string, with an <c>xmlns</c> attribute (value <c>XhtmlNamespace</c>) appended
/// to each top-level element.
/// </summary>
/// <param name="source">The HTML to convert.</param>
/// <returns>The inner XML of a body element holding the converted nodes.</returns>
public static String ConvertHtmlToXhtml(String source)
{
    byte[] sourceBytes = Encoding.UTF8.GetBytes(source);
    MemoryStream input = new MemoryStream(sourceBytes);
    MemoryStream output = new MemoryStream();
    TidyMessageCollection messages = new TidyMessageCollection();

    Tidy tidy = new Tidy();
    var options = tidy.Options;
    options.DocType = DocType.Omit;
    options.DropFontTags = true;
    options.LogicalEmphasis = true;
    options.Xhtml = true;
    options.XmlOut = true;
    options.MakeClean = true;
    options.TidyMark = false;
    options.NumEntities = true;
    tidy.Parse(input, output, messages);

    // Target document: an empty body element that collects the converted nodes.
    XmlDocument collector = new XmlDocument();
    collector.LoadXml("<body />");
    XmlNode collectorBody = collector.SelectSingleNode("/body");

    // Tidy's output, loaded as XML.
    XmlDocument tidied = new XmlDocument();
    string tidiedXml = Encoding.UTF8.GetString(output.ToArray());
    tidied.LoadXml(tidiedXml);

    XmlAttribute namespaceAttribute = tidied.CreateAttribute("xmlns");
    namespaceAttribute.Value = XhtmlNamespace;

    XmlNode tidiedBody = tidied.SelectSingleNode("/html/body");
    for (XmlNode child = tidiedBody.FirstChild; child != null; child = child.NextSibling)
    {
        if (child.NodeType == XmlNodeType.Element)
        {
            // NOTE(review): the same attribute instance is appended to every element;
            // presumably each deep import below copies it before the next element
            // takes it over - confirm.
            child.Attributes.Append(namespaceAttribute);
        }

        XmlNode imported = collector.ImportNode(child, true);
        collectorBody.AppendChild(imported);
    }

    return collectorBody.InnerXml;
}
/// <summary>
/// Uses Tidy.Net to clean a html source.
/// </summary>
/// <remarks>
/// Whether the Word 2000 clean-up is applied is taken from <c>isWordHtml</c>, which is not a
/// parameter of this method but a member declared elsewhere (not visible from here).
/// </remarks>
/// <param name="htmlSource">The original html source.</param>
/// <returns>
/// The cleaned Html, or the unchanged <paramref name="htmlSource"/> when Tidy throws a
/// <see cref="FormatException"/> (the exception is logged).
/// </returns>
public string Clean(string htmlSource)
{
    Tidy tidy = new Tidy();

    // Options required for xhtml conversion.
    tidy.Options.DocType = DocType.Strict;
    tidy.Options.DropFontTags = true;
    tidy.Options.LogicalEmphasis = true;
    tidy.Options.Xhtml = true;
    tidy.Options.XmlOut = true;
    tidy.Options.MakeClean = true;
    tidy.Options.TidyMark = false;
    tidy.Options.DropEmptyParas = true;
    tidy.Options.IndentContent = true;
    tidy.Options.SmartIndent = true;
    tidy.Options.Word2000 = isWordHtml;
    tidy.Options.EncloseBlockText = true;
    tidy.Options.XmlTags = true;
    tidy.Options.FixComments = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
    using (MemoryStream output = new MemoryStream())
    {
        try
        {
            tidy.Parse(input, output, tmc);
        }
        catch (FormatException ex)
        {
            // Best effort: hand back the original source when Tidy cannot process it.
            Log.Exception(ex);
            return htmlSource;
        }

        return Encoding.UTF8.GetString(output.ToArray());
    }
}
/// <summary>
/// Runs <paramref name="initialContent"/> through Tidy (Word 2000 clean-up, XHTML output)
/// and stores the markup found between the body tags in <paramref name="cleanContent"/>.
/// </summary>
/// <param name="initialContent">The HTML to clean; only read by this method.</param>
/// <param name="cleanContent">
/// Receives the Tidy output with everything up to and including the opening body tag removed
/// and everything from the closing body tag onwards removed. When the output has no opening
/// body tag the leading part is left untouched.
/// </param>
private void cleanContent(ref String initialContent, ref String cleanContent)
{
    Tidy tidy = new Tidy();
    tidy.Options.Word2000 = true;
    tidy.Options.Xhtml = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(initialContent)))
    using (MemoryStream output = new MemoryStream())
    {
        tidy.Parse(input, output, tmc);
        cleanContent = Encoding.UTF8.GetString(output.ToArray());
    }

    // Delete the header: everything up to the end of the opening body tag. The tag is matched
    // by name only because it may carry attributes. Without the guard a missing body tag made
    // the second search start at index -1, which throws.
    int bodyTagStart = cleanContent.IndexOf("<body", StringComparison.Ordinal);
    if (bodyTagStart >= 0)
    {
        int bodyTagEnd = cleanContent.IndexOf('>', bodyTagStart);
        if (bodyTagEnd >= 0)
        {
            cleanContent = cleanContent.Remove(0, bodyTagEnd + 1);
        }
    }

    // Delete the footer: everything from the closing body tag onwards.
    int closingTagStart = cleanContent.IndexOf("</body", StringComparison.Ordinal);
    if (closingTagStart >= 0)
    {
        cleanContent = cleanContent.Remove(closingTagStart);
    }
}
/// <summary>
/// Parses markup into an <see cref="XDocument"/>. The markup is first handed to
/// <c>XhtmlDocument.Parse</c> as-is; only if that throws is it repaired with Tidy and
/// parsed again.
/// </summary>
/// <param name="xmlMarkup">The markup to parse.</param>
/// <returns>The parsed document.</returns>
public static XDocument TidyXml(string xmlMarkup)
{
    try
    {
        return XhtmlDocument.Parse(xmlMarkup);
    }
    catch (Exception)
    {
        // Direct parsing failed - fall through to the slower Tidy-based path.
    }

    Tidy xmlTidy = GetXmlConfiguredTidy();

    List<string> prefixedElementNames = LocateNamespacePrefixedElementNames(xmlMarkup);
    AllowNamespacePrefixedElementNames(xmlTidy, prefixedElementNames);
    AllowHtml5ElementNames(xmlTidy);

    TidyMessageCollection unusedMessages;
    string repaired = ParseMarkup(xmlMarkup, xmlTidy, out unusedMessages);
    string deduplicated = RemoveDuplicateAttributes(repaired);

    return XDocument.Parse(deduplicated);
}
/// <summary>
/// Cleans HTML documents or fragments into XHTML conformant markup
/// </summary>
/// <param name="htmlMarkup">The html to clean</param>
/// <returns>
/// A <see cref="TidyHtmlResult"/> whose <c>Output</c> is the parsed XHTML document and whose
/// <c>ErrorSummary</c> holds the warning-level Tidy messages, one per line.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Tidy reported one or more errors; the message lists every error-level entry.
/// </exception>
public static TidyHtmlResult TidyHtml(string htmlMarkup)
{
    byte[] htmlByteArray = Encoding.UTF8.GetBytes(htmlMarkup);

    Tidy tidy = GetXhtmlConfiguredTidy();

    List<string> namespacePrefixedElementNames = LocateNamespacePrefixedElementNames(htmlMarkup);
    Dictionary<string, string> namespacePrefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);

    // Prefixed element names that do not start with any of the declared prefixes.
    List<string> badNamespacePrefixedElementNames = namespacePrefixedElementNames
        .Where(s => !namespacePrefixToUri.Any(d => s.StartsWith(d.Key)))
        .ToList();

    AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
    AllowHtml5ElementNames(tidy);

    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    string xhtml;

    using (MemoryStream inputStream = new MemoryStream(htmlByteArray))
    using (MemoryStream outputStream = new MemoryStream())
    {
        tidy.Parse(inputStream, outputStream, tidyMessages);
        outputStream.Position = 0;

        // The reader is disposable and was previously left undisposed.
        using (C1StreamReader sr = new C1StreamReader(outputStream))
        {
            xhtml = sr.ReadToEnd();
        }
    }

    if (tidyMessages.Errors > 0)
    {
        StringBuilder errorMessageBuilder = new StringBuilder();
        foreach (TidyMessage message in tidyMessages)
        {
            if (message.Level == MessageLevel.Error)
            {
                errorMessageBuilder.AppendLine(message.ToString());
            }
        }

        throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
    }

    // Make sure the root element carries the XHTML namespace declaration.
    if (xhtml.IndexOf("<html>") > -1)
    {
        xhtml = xhtml.Replace("<html>", "<html xmlns=\"http://www.w3.org/1999/xhtml\">");
    }

    if (xhtml.IndexOf("xmlns=\"http://www.w3.org/1999/xhtml\"") == -1)
    {
        xhtml = xhtml.Replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"");
    }

    xhtml = RemoveDuplicateAttributes(xhtml);
    xhtml = RemoveXmlDeclarations(xhtml);
    xhtml = UndoLowerCasingOfElementNames(xhtml, namespacePrefixedElementNames);
    xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, namespacePrefixToUri);

    // Only warnings go into the summary; errors have already caused an exception above.
    StringBuilder messageBuilder = new StringBuilder();
    foreach (TidyMessage message in tidyMessages)
    {
        if (message.Level == MessageLevel.Warning)
        {
            messageBuilder.AppendLine(message.ToString());
        }
    }

    List<string> badNamespacePrefixes = badNamespacePrefixedElementNames
        .Select(n => n.Substring(0, n.IndexOf(':')))
        .Union(LocateAttributeNamespacePrefixes(xhtml))
        .Distinct()
        .Where(f => IsValidXmlName(f))
        .ToList();

    XDocument outputResult;
    if (badNamespacePrefixedElementNames.Any())
    {
        // Bind every collected prefix to the marker namespace '#bad' on a wrapper element so
        // the markup parses, then strip all attributes and elements in that namespace.
        string badDeclared = string.Join(" ", badNamespacePrefixes.Select(p => string.Format("xmlns:{0}='#bad'", p)).ToArray());
        XDocument badDoc = XDocument.Parse(string.Format("<root {0}>{1}</root>", badDeclared, xhtml));

        badDoc.Descendants().Attributes().Where(e => e.Name.Namespace == "#bad").Remove();
        badDoc.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();

        // Unwrap: the first descendant of the wrapper becomes the result's root.
        outputResult = new XDocument(badDoc.Root.Descendants().First());
    }
    else
    {
        outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
    }

    return new TidyHtmlResult { Output = outputResult, ErrorSummary = messageBuilder.ToString() };
}
/// <summary>
/// Builds an HTML page from the <c>Strings.BasicHtmlPage</c> template: fills in title and
/// charset, adds a script (and, for some libraries, stylesheet) reference for every library
/// whose version property is not the empty string, leaves the body empty and, when
/// <c>HtmlTidy</c> is set, runs the page through Tidy.
/// </summary>
/// <remarks>Sets <c>Charset</c> to "UTF-8" when it is the empty string.</remarks>
/// <returns>The generated page markup.</returns>
public string Generate()
{
    if (this.Charset == string.Empty)
    {
        this.Charset = "UTF-8";
    }

    // Collect the head references in the same order as before.
    StringBuilder head = new StringBuilder();

    if (this.Jquery != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryUrl, this.Jquery));
    }

    if (this.JqueryMobile != string.Empty)
    {
        head.AppendFormat(Strings.CssFrame, string.Format(Strings.JqueryMobileCssUrl, this.JqueryMobile));
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryMobileJsUrl, this.JqueryMobile));
    }

    if (this.JqueryUI != string.Empty)
    {
        head.AppendFormat(Strings.CssFrame, string.Format(Strings.JqueryUICssUrl, this.JqueryUI));
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryUIJsUrl, this.JqueryUI));
    }

    if (this.Angular != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.AngularUrl, this.Angular));
    }

    if (this.Dojo != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.DojoUrl, this.Dojo));
    }

    if (this.ExtJS != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ExtJsUrl, this.ExtJS));
    }

    if (this.MooTools != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.MooToolsUrl, this.MooTools));
    }

    if (this.Protoptype != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.PrototypeUrl, this.Protoptype));
    }

    if (this.Scriptaculous != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ScriptaculousUrl, this.Scriptaculous));
    }

    if (this.SWFObject != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.SWFObjectUrl, this.SWFObject));
    }

    if (this.ThreeJS != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ThreeJsUrl, this.ThreeJS));
    }

    if (this.WebFontLoader != string.Empty)
    {
        head.AppendFormat(Strings.ScriptFrame, string.Format(Strings.WebFontLoaderUrl, this.WebFontLoader));
    }

    // Fill the template placeholders; the body is intentionally left empty.
    string page = Strings.BasicHtmlPage
        .Replace("{TITLE}", this.Title)
        .Replace("{CHARSET}", this.Charset)
        .Replace("{OTHER_HEAD}", head.ToString())
        .Replace("{BODY}", "");

    if (!this.HtmlTidy)
    {
        return page;
    }

    // Optional clean-up pass through Tidy.
    Tidy tidy = new Tidy();
    var options = tidy.Options;
    options.DocType = DocType.Strict;
    options.DropFontTags = true;
    options.LogicalEmphasis = true;
    options.Xhtml = true;
    options.XmlOut = true;
    options.MakeClean = true;
    options.TidyMark = false;

    byte[] pageBytes = Encoding.UTF8.GetBytes(page);
    MemoryStream source = new MemoryStream(pageBytes);
    MemoryStream sink = new MemoryStream();
    tidy.Parse(source, sink, new TidyMessageCollection());

    return Encoding.UTF8.GetString(sink.ToArray());
}