/// <summary>
/// Runs the supplied HTML through Tidy configured for XHTML output (doctype omitted,
/// UTF-8) and returns the result re-serialized through <see cref="XElement"/>.
/// </summary>
/// <param name="html">The HTML markup to tidy.</param>
/// <returns>The tidied markup as produced by <c>XElement.Parse(...).ToString()</c>.</returns>
public string DoTidy(string html)
{
    Tidy.Core.Tidy document = new Tidy.Core.Tidy();
    TidyMessageCollection messageCollection = new TidyMessageCollection();

    document.Options.DocType = DocType.Omit;
    document.Options.Xhtml = true;
    document.Options.CharEncoding = CharEncoding.Utf8;
    document.Options.LogicalEmphasis = true;
    document.Options.MakeClean = false;
    document.Options.QuoteNbsp = false;
    document.Options.SmartIndent = false;
    document.Options.IndentContent = false;
    document.Options.TidyMark = false;
    document.Options.DropFontTags = false;
    document.Options.QuoteAmpersand = true;
    document.Options.DropEmptyParas = true;

    byte[] array = Encoding.UTF8.GetBytes(html);
    string tidyXhtml;

    using (MemoryStream input = new MemoryStream(array))
    using (MemoryStream output = new MemoryStream())
    {
        document.Parse(input, output, messageCollection);

        // ToArray() copies the whole buffer, so take a single snapshot and decode that.
        byte[] tidied = output.ToArray();
        tidyXhtml = Encoding.UTF8.GetString(tidied, 0, tidied.Length);
    }

    return XElement.Parse(tidyXhtml).ToString();
}
/// <summary>
/// Feeds <paramref name="markup"/> (encoded as UTF-8) to the supplied Tidy instance and
/// returns the text read back from Tidy's output.
/// </summary>
/// <param name="markup">The markup to parse.</param>
/// <param name="tidy">The already configured Tidy instance to parse with.</param>
/// <param name="tidyMessages">Receives the message collection filled in by Tidy.</param>
/// <returns>The text Tidy wrote to its output stream.</returns>
/// <exception cref="InvalidOperationException">Thrown when Tidy reported at least one error.</exception>
private static string ParseMarkup(string markup, Tidy tidy, out TidyMessageCollection tidyMessages)
{
    tidyMessages = new TidyMessageCollection();

    string parsedText;
    byte[] markupBytes = Encoding.UTF8.GetBytes(markup);

    using (var source = new MemoryStream(markupBytes))
    using (var sink = new MemoryStream())
    {
        tidy.Parse(source, sink, tidyMessages);

        // Rewind so the reader sees everything Tidy just wrote.
        sink.Position = 0;
        using (var reader = new C1StreamReader(sink))
        {
            parsedText = reader.ReadToEnd();
        }
    }

    if (tidyMessages.Errors <= 0)
    {
        return parsedText;
    }

    // Collect only error-level messages for the exception text.
    var errors = new StringBuilder();
    foreach (TidyMessage message in tidyMessages)
    {
        if (message.Level != MessageLevel.Error)
        {
            continue;
        }

        errors.AppendLine(message.ToString());
    }

    throw new InvalidOperationException($"Failed to parse html:\n\n{errors}");
}
/// <summary>
/// Parses XML markup into an <see cref="XDocument"/>. Markup that parses directly is
/// returned as is; otherwise it is run through an XML-configured Tidy first.
/// </summary>
/// <param name="xmlMarkup">The XML markup to clean.</param>
/// <returns>The parsed document.</returns>
/// <exception cref="InvalidOperationException">Thrown when Tidy reported at least one error.</exception>
public static XDocument TidyXml(string xmlMarkup)
{
    try
    {
        return XhtmlDocument.Parse(xmlMarkup);
    }
    catch (Exception)
    {
        // Deliberate best effort: the markup is not parseable as is, so take the slow road below.
    }

    byte[] xmlByteArray = Encoding.UTF8.GetBytes(xmlMarkup);

    Tidy tidy = GetXmlConfiguredTidy();
    List<string> namespacePrefixedElementNames = LocateNamespacePrefixedElementNames(xmlMarkup);
    AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
    AllowHtml5ElementNames(tidy);

    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    string xml;

    using (MemoryStream inputStream = new MemoryStream(xmlByteArray))
    using (MemoryStream outputStream = new MemoryStream())
    {
        tidy.Parse(inputStream, outputStream, tidyMessages);
        outputStream.Position = 0;

        // Dispose the reader deterministically instead of leaving it to the GC.
        using (C1StreamReader sr = new C1StreamReader(outputStream))
        {
            xml = sr.ReadToEnd();
        }
    }

    if (tidyMessages.Errors > 0)
    {
        StringBuilder errorMessageBuilder = new StringBuilder();
        foreach (TidyMessage message in tidyMessages)
        {
            if (message.Level == MessageLevel.Error)
            {
                errorMessageBuilder.AppendLine(message.ToString());
            }
        }

        throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
    }

    xml = RemoveDuplicateAttributes(xml);

    return XDocument.Parse(xml);
}
/// <summary>
/// Tidies an HTML string into strict XHTML and returns only the content found between
/// the <c>&lt;body&gt;</c> and <c>&lt;/body&gt;</c> tags of Tidy's output.
/// </summary>
/// <param name="str">The HTML to tidy; may be null.</param>
/// <returns>
/// The body content, or an empty string when <paramref name="str"/> is null or no
/// attribute-less <c>&lt;body&gt;</c> / <c>&lt;/body&gt;</c> pair is found in the output.
/// </returns>
public static string tidy(string str)
{
    if (str == null)
    {
        return "";
    }

    Tidy parser = new Tidy();
    TidyMessageCollection msg = new TidyMessageCollection();

    parser.Options.CharEncoding = CharEncoding.UTF8;
    parser.Options.DocType = DocType.Strict;
    parser.Options.DropFontTags = true;
    parser.Options.LogicalEmphasis = true;
    parser.Options.Xhtml = true;
    parser.Options.MakeClean = true;
    parser.Options.TidyMark = true;
    parser.Options.TabSize = 0;

    string outputString;
    using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(str)))
    using (MemoryStream output = new MemoryStream())
    {
        // Known issue (translated from the original comment): markup pasted from Word
        // documents has been seen to lose code at this step.
        parser.Parse(input, output, msg);
        outputString = Encoding.UTF8.GetString(output.ToArray());
    }

    // Ordinal search: these are markup tokens, not linguistic text.
    int bodyBegin = outputString.IndexOf("<body>", System.StringComparison.Ordinal);
    int bodyEnd = outputString.IndexOf("</body>", System.StringComparison.Ordinal);
    if (bodyBegin <= 0 || bodyEnd <= 0 || bodyEnd < bodyBegin)
    {
        return "";
    }

    // Offsets kept from the original: skip "<body>" (6 chars) plus 2 more characters, and
    // drop 2 characters before "</body>" -- presumably the line breaks Tidy emits around
    // the body content (TODO confirm against Tidy's actual output).
    int length = System.Math.Max(0, bodyEnd - bodyBegin - 10);
    return outputString.Substring(bodyBegin + 8, length);
}
/// <summary>
/// Cleans bad HTML by running it through Tidy with default options.
/// </summary>
/// <param name="badHtmlString">The HTML to clean; encoded as UTF-8 for Tidy.</param>
/// <returns>Tidy's output decoded as UTF-8.</returns>
static string CleanHtml(string badHtmlString)
{
    Tidy tidy = new Tidy();

    // Load the UTF-8 bytes into the input stream and rewind it for reading.
    byte[] rawBytes = Encoding.UTF8.GetBytes(badHtmlString);
    MemoryStream source = new MemoryStream();
    MemoryStream sink = new MemoryStream();
    source.Write(rawBytes, 0, rawBytes.Length);
    source.Position = 0;

    TidyMessageCollection collectedMessages = new TidyMessageCollection();
    tidy.Parse(source, sink, collectedMessages);

    byte[] cleanedBytes = sink.ToArray();
    return Encoding.UTF8.GetString(cleanedBytes);
}
/// <summary>
/// Serializes a supported field's value as tidied, indented HTML, keeping only what
/// Tidy placed between <c>&lt;body&gt;</c> and <c>&lt;/body&gt;</c>.
/// </summary>
/// <param name="args">
/// The pipeline arguments. Nothing is done when a serialized value already exists, the
/// normal value is blank, or the field type key is not among the supported keys.
/// </param>
protected override void DoProcess(FieldSerializationPipelineArgs args)
{
    Assert.ArgumentNotNull(args, "args");

    if (args.ValueSerialized != null)
    {
        return;
    }

    if (string.IsNullOrWhiteSpace(args.ValueNormal))
    {
        return;
    }

    bool isSupportedType = supportedFieldTypeKeys.Any(k => k.Equals(args.FieldTypeKey, StringComparison.InvariantCultureIgnoreCase));
    if (!isSupportedType)
    {
        return;
    }

    Tidy tidy = new Tidy();
    tidy.Options.DocType = DocType.Omit;
    tidy.Options.TidyMark = true;
    tidy.Options.IndentContent = true;

    TidyMessageCollection tmc = new TidyMessageCollection();

    using (MemoryStream input = new MemoryStream())
    using (MemoryStream output = new MemoryStream())
    {
        byte[] valueBytes = Encoding.UTF8.GetBytes(args.ValueNormal);
        input.Write(valueBytes, 0, valueBytes.Length);
        input.Position = 0;

        tidy.Parse(input, output, tmc);
        string html = Encoding.UTF8.GetString(output.ToArray());

        const string bodyTag = "<body>";
        const string bodyCloseTag = "</body>";

        // Drop everything up to and including the opening body tag, when present.
        int openAt = html.IndexOf(bodyTag);
        if (openAt > 0)
        {
            html = html.Substring(openAt + bodyTag.Length);
        }

        // Drop the closing body tag and everything after it, when present.
        int closeAt = html.IndexOf(bodyCloseTag);
        if (closeAt > 0)
        {
            html = html.Substring(0, closeAt);
        }

        html = html.Trim();

        args.ValueSerialized = Environment.NewLine + html + Environment.NewLine;
        args.FieldSerializationType = FieldSerializationType.Html;
    }
}
/// <summary>
/// Re-indents the given HTML source using Tidy's IndentContent option.
/// </summary>
/// <param name="htmlSource">The HTML source to indent.</param>
/// <returns>Tidy's output decoded back into a string.</returns>
public String IndentContent(String htmlSource)
{
    Tidy tidy = new Tidy();
    tidy.Options.IndentContent = true;

    TidyMessageCollection messages = new TidyMessageCollection();

    // NOTE(review): the text is round-tripped as UTF-16 (Encoding.Unicode) and no
    // CharEncoding option is set on Tidy here -- confirm this matches what Tidy expects.
    byte[] sourceBytes = Encoding.Unicode.GetBytes(htmlSource);

    MemoryStream source = new MemoryStream();
    MemoryStream sink = new MemoryStream();
    source.Write(sourceBytes, 0, sourceBytes.Length);
    source.Position = 0;

    tidy.Parse(source, sink, messages);

    return Encoding.Unicode.GetString(sink.ToArray());
}
/// <summary>
/// Tidies the given markup with Word 2000 cleanup enabled and passes the result
/// through <c>RemoveTidyAdditions</c>.
/// </summary>
/// <param name="text">The markup to tidy; encoded as UTF-8 for Tidy.</param>
/// <returns>The value returned by <c>RemoveTidyAdditions</c> for Tidy's UTF-8 decoded output.</returns>
public static string TidyHtml(string text)
{
    var tidy = new Tidy();
    var collected = new TidyMessageCollection();

    var source = new MemoryStream();
    var sink = new MemoryStream();

    var textBytes = Encoding.UTF8.GetBytes(text);
    source.Write(textBytes, 0, textBytes.Length);
    source.Position = 0;

    // An earlier option set (XHTML output, strict doctype, UTF-8, logical emphasis,
    // smart indent, quoted ampersands, dropping empty paragraphs) was disabled because it
    // caused problems handling "font" tags, occasionally mangling them to "fontface=...".
    tidy.Options.TidyMark = false;
    tidy.Options.MakeClean = true;
    tidy.Options.Word2000 = true;
    tidy.Options.EncloseText = true;

    // Required to stop spaces being removed, and tabs added etc...
    tidy.Options.Spaces = 0;
    tidy.Options.WrapLen = 32000;

    tidy.Parse(source, sink, collected);

    string tidied = Encoding.UTF8.GetString(sink.ToArray());
    return RemoveTidyAdditions(tidied);
}
/// <summary>
/// Converts HTML to XHTML with Tidy and returns the children of the resulting
/// <c>/html/body</c> element, with an <c>xmlns</c> attribute set to
/// <c>XhtmlNamespace</c> on each child element.
/// </summary>
/// <param name="source">The HTML to convert; encoded as UTF-8 for Tidy.</param>
/// <returns>The inner XML of a scratch <c>body</c> element holding the imported children.</returns>
public static String ConvertHtmlToXhtml(String source)
{
    MemoryStream htmlIn = new MemoryStream(Encoding.UTF8.GetBytes(source));
    MemoryStream xhtmlOut = new MemoryStream();
    TidyMessageCollection messages = new TidyMessageCollection();

    Tidy tidy = new Tidy();
    tidy.Options.DocType = DocType.Omit;
    tidy.Options.DropFontTags = true;
    tidy.Options.LogicalEmphasis = true;
    tidy.Options.Xhtml = true;
    tidy.Options.XmlOut = true;
    tidy.Options.MakeClean = true;
    tidy.Options.TidyMark = false;
    tidy.Options.NumEntities = true;
    tidy.Parse(htmlIn, xhtmlOut, messages);

    XmlDocument tidied = new XmlDocument();

    // Scratch document whose <body/> collects the converted nodes.
    XmlDocument target = new XmlDocument();
    target.LoadXml("<body />");
    XmlNode targetBody = target.SelectSingleNode("/body");

    tidied.LoadXml(Encoding.UTF8.GetString(xhtmlOut.ToArray()));

    XmlAttribute namespaceAttribute = tidied.CreateAttribute("xmlns");
    namespaceAttribute.Value = XhtmlNamespace;

    XmlNode sourceBody = tidied.SelectSingleNode("/html/body");
    foreach (XmlNode child in sourceBody.ChildNodes)
    {
        // NOTE(review): the same attribute instance is appended to every element. The
        // deep import happens immediately afterwards, before the attribute is appended
        // to the next node, so this ordering must be kept -- confirm before changing.
        if (child.NodeType == XmlNodeType.Element)
        {
            child.Attributes.Append(namespaceAttribute);
        }

        XmlNode imported = target.ImportNode(child, true);
        targetBody.AppendChild(imported);
    }

    return targetBody.InnerXml;
}
/// <summary>
/// Cleans and pretty-prints HTML with TidyNet: smart indent, 4 spaces, WrapLen 0.
/// </summary>
/// <param name="dirtyHtml">The HTML to clean.</param>
/// <returns>Tidy's output decoded with <see cref="Encoding.Default"/>.</returns>
private static string CleanHtml(string dirtyHtml)
{
    var tidy = new TidyNet.Tidy();
    tidy.Options.SmartIndent = true;
    tidy.Options.IndentAttributes = false;
    tidy.Options.WrapLen = 0;
    tidy.Options.Spaces = 4;

    var messages = new TidyMessageCollection();

    // NOTE(review): Encoding.Default is used for both directions -- confirm this is the
    // intended encoding for the callers' input.
    byte[] dirtyBytes = Encoding.Default.GetBytes(dirtyHtml);

    using (var source = new MemoryStream(dirtyBytes))
    using (var sink = new MemoryStream())
    {
        tidy.Parse(source, sink, messages);

        byte[] cleanBytes = sink.ToArray();
        return Encoding.Default.GetString(cleanBytes);
    }
}
/// <summary>
/// Pretty-prints HTML with Tidy (smart indent, 8-space indentation, WrapLen 0) and then
/// replaces every run of 8 spaces in the output with a tab character.
/// </summary>
/// <param name="dirtyHtml">The HTML to pretty-print.</param>
/// <param name="messages">Receives the message collection filled in by Tidy.</param>
/// <returns>The tab-indented output, decoded with <see cref="Encoding.Default"/>.</returns>
public static string PrettyPrint(string dirtyHtml, out TidyMessageCollection messages)
{
    const int spaces = 8;

    var tidy = new Tidy.Core.Tidy();
    tidy.Options.SmartIndent = true;
    tidy.Options.IndentAttributes = false;
    tidy.Options.WrapLen = 0;
    tidy.Options.Spaces = spaces;

    messages = new TidyMessageCollection();

    byte[] dirtyBytes = Encoding.Default.GetBytes(dirtyHtml);

    using (var source = new MemoryStream(dirtyBytes))
    using (var sink = new MemoryStream())
    {
        tidy.Parse(source, sink, messages);

        string indentedWithSpaces = Encoding.Default.GetString(sink.ToArray());

        // One indentation step (a run of `spaces` blanks) becomes one tab.
        string oneIndent = new string(' ', spaces);
        return indentedWithSpaces.Replace(oneIndent, "\t");
    }
}
/// <summary>
/// Uses Tidy.Net to clean an HTML source into XHTML.
/// </summary>
/// <remarks>
/// Word 2000 cleanup is switched on or off by <c>isWordHtml</c>, which is not a parameter
/// of this method; it is read from a member declared outside this block.
/// </remarks>
/// <param name="htmlSource">The original HTML source.</param>
/// <returns>
/// The cleaned markup, or <paramref name="htmlSource"/> unchanged when Tidy throws a
/// <see cref="FormatException"/> (the exception is logged).
/// </returns>
public string Clean(string htmlSource)
{
    Tidy tidy = new Tidy();

    // Options required for XHTML conversion.
    tidy.Options.DocType = DocType.Strict;
    tidy.Options.DropFontTags = true;
    tidy.Options.LogicalEmphasis = true;
    tidy.Options.Xhtml = true;
    tidy.Options.XmlOut = true;
    tidy.Options.MakeClean = true;
    tidy.Options.TidyMark = false;
    tidy.Options.DropEmptyParas = true;
    tidy.Options.IndentContent = true;
    tidy.Options.SmartIndent = true;
    tidy.Options.Word2000 = isWordHtml;
    tidy.Options.EncloseBlockText = true;
    tidy.Options.XmlTags = true;
    tidy.Options.FixComments = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    byte[] byteArray = Encoding.UTF8.GetBytes(htmlSource);

    using (MemoryStream input = new MemoryStream(byteArray))
    using (MemoryStream output = new MemoryStream())
    {
        try
        {
            tidy.Parse(input, output, tmc);
        }
        catch (FormatException ex)
        {
            // Best effort: fall back to the untouched source when Tidy cannot parse it.
            Log.Exception(ex);
            return htmlSource;
        }

        return Encoding.UTF8.GetString(output.ToArray());
    }
}
/// <summary>
/// Tidies <paramref name="initialContent"/> as XHTML with Word 2000 cleanup and stores
/// the result in <paramref name="cleanContent"/>, with everything up to the end of the
/// opening <c>&lt;body ...&gt;</c> tag and everything from <c>&lt;/body</c> onwards removed.
/// </summary>
/// <param name="initialContent">The markup to clean; it is read but not modified here.</param>
/// <param name="cleanContent">Receives the cleaned body content.</param>
private void cleanContent(ref String initialContent, ref String cleanContent)
{
    Tidy tidy = new Tidy();
    // Only Word 2000 cleanup and XHTML output are enabled; stricter options (strict
    // doctype, dropping font tags, logical emphasis, XmlOut, MakeClean) were left
    // disabled by the original author.
    tidy.Options.Word2000 = true;
    tidy.Options.Xhtml = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    byte[] byteArray = Encoding.UTF8.GetBytes(initialContent);

    using (MemoryStream input = new MemoryStream(byteArray))
    using (MemoryStream output = new MemoryStream())
    {
        tidy.Parse(input, output, tmc);
        cleanContent = Encoding.UTF8.GetString(output.ToArray());
    }

    // Delete the header: everything up to and including the opening body tag.
    // Guarded so that output without a <body> tag no longer makes IndexOf throw on a
    // start index of -1; in that case the text is left as is.
    int bodyStart = cleanContent.IndexOf("<body", System.StringComparison.Ordinal);
    if (bodyStart >= 0)
    {
        int bodyTagEnd = cleanContent.IndexOf(">", bodyStart, System.StringComparison.Ordinal);
        if (bodyTagEnd >= 0)
        {
            cleanContent = cleanContent.Remove(0, bodyTagEnd + 1);
        }
    }

    // Delete the footer: the closing body tag and everything after it.
    int bodyClose = cleanContent.IndexOf("</body", System.StringComparison.Ordinal);
    if (bodyClose >= 0)
    {
        cleanContent = cleanContent.Remove(bodyClose);
    }
}
/// <summary>
/// Shortens an HTML formatted string while keeping HTML formatting and complete words
/// (also removes line breaks at the end of the shortened string).
/// </summary>
/// <param name="input">The HTML formatted string.</param>
/// <param name="inputIsShortened">Set to true when the input string has been shortened.</param>
/// <param name="length">The approximate length of the output string (default: 300).</param>
/// <param name="elipsis">Ellipsis text to append to the output string (use string.Empty when no ellipsis should be added, default: ...).</param>
/// <returns>
/// The input unchanged when it is no longer than <paramref name="length"/>; otherwise the
/// shortened text, repaired by Tidy, with the ellipsis appended (inside a trailing
/// <c>&lt;/p&gt;</c> when the result ends with one).
/// </returns>
public static string ShortenHtml(this string input, out bool inputIsShortened, int length = 300, string elipsis = "...")
{
    inputIsShortened = false;
    if (input.Length <= length)
    {
        return input;
    }

    // Normalize line breaks so a trailing one can be recognized below.
    input = input.Replace("<br />", "<br/>");

    // Cut at the first space at or after `length` so no word is split; without such a
    // space the whole text is kept. The normalization above can make the text shorter
    // than `length`, in which case there is nothing to cut (previously Substring threw).
    int cut = length < input.Length ? input.IndexOf(' ', length) : -1;
    string substring = cut >= 0 ? input.Substring(0, cut) : input;
    substring = substring.Trim();

    // Remove line breaks at the end of the shortened text.
    while (substring.EndsWith("<br/>", System.StringComparison.Ordinal))
    {
        substring = substring.Substring(0, substring.Length - 5).Trim();
    }

    if (input.Length > substring.Length)
    {
        inputIsShortened = true;
    }

    substring = substring.Replace("<br/>", "<br />");

    // The cut may have left elements unclosed; let Tidy repair the fragment as XHTML.
    Tidy tidy = new Tidy();
    tidy.Options.DocType = DocType.Omit;
    tidy.Options.CharEncoding = CharEncoding.UTF8;
    tidy.Options.Xhtml = true;
    tidy.Options.NumEntities = true;
    TidyMessageCollection tmc = new TidyMessageCollection();

    string tidyResult;
    using (MemoryStream inputStream = new MemoryStream(Encoding.UTF8.GetBytes(substring)))
    using (MemoryStream outputStream = new MemoryStream())
    {
        tidy.Parse(inputStream, outputStream, tmc);
        tidyResult = Encoding.UTF8.GetString(outputStream.ToArray());
    }

    // Keep only the body content of the document Tidy produced.
    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.LoadXml(tidyResult);
    tidyResult = xmlDoc.SelectSingleNode("//body").InnerXml;

    if (string.IsNullOrEmpty(elipsis))
    {
        return tidyResult;
    }

    // Place the ellipsis inside a trailing paragraph rather than after it.
    if (tidyResult.EndsWith("</p>", System.StringComparison.Ordinal))
    {
        return string.Concat(tidyResult.Substring(0, tidyResult.Length - 4), elipsis, "</p>");
    }

    return string.Concat(tidyResult, elipsis);
}
/// <summary>
/// Parses the input stream and writes the result to the output stream.
/// </summary>
/// <param name="input">The input stream.</param>
/// <param name="output">The output stream.</param>
/// <param name="messages">The collection that receives the messages; a fresh collection is used when null.</param>
public void Parse(Stream input, Stream output, TidyMessageCollection messages=null)
{
    if (messages == null)
    {
        messages = new TidyMessageCollection();
    }

    ParseInternal(input, output, messages);
}
/// <summary>
/// Cleans HTML documents or fragments into XHTML conformant markup.
/// </summary>
/// <remarks>
/// Tidy warnings are collected into the result's ErrorSummary; Tidy errors abort with an
/// exception. Namespace-prefixed element names whose prefix does not start with any
/// prefix found by LocateNamespacePrefixToUriDeclarations are treated as "bad" and are
/// removed from the output.
/// </remarks>
/// <param name="htmlMarkup">The html to clean</param>
/// <returns>A fully structured XHTML document, incl. html, head and body elements.</returns>
/// <exception cref="InvalidOperationException">Thrown when Tidy reports one or more errors.</exception>
public static TidyHtmlResult TidyHtml(string htmlMarkup)
{
    byte[] htmlByteArray = Encoding.UTF8.GetBytes(htmlMarkup);
    Tidy tidy = GetXhtmlConfiguredTidy();
    // Gather prefixed element names and prefix declarations from the raw markup.
    List <string> namespacePrefixedElementNames = LocateNamespacePrefixedElementNames(htmlMarkup);
    Dictionary <string, string> namespacePrefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);
    // "Bad" names: prefixed element names that start with none of the declared prefixes.
    List <string> badNamespacePrefixedElementNames = namespacePrefixedElementNames.Where(s => namespacePrefixToUri.Where(d => s.StartsWith(d.Key)).Any() == false).ToList();
    AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
    AllowHtml5ElementNames(tidy);
    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    string xhtml = "";
    using (MemoryStream inputStream = new MemoryStream(htmlByteArray))
    {
        using (MemoryStream outputStream = new MemoryStream())
        {
            tidy.Parse(inputStream, outputStream, tidyMessages);
            // Rewind so the reader sees everything Tidy wrote.
            outputStream.Position = 0;
            // NOTE(review): this reader is not disposed here -- confirm whether that is intended.
            C1StreamReader sr = new C1StreamReader(outputStream);
            xhtml = sr.ReadToEnd();
        }
    }
    // Any error-level message aborts; only error messages go into the exception text.
    if (tidyMessages.Errors > 0)
    {
        StringBuilder errorMessageBuilder = new StringBuilder();
        foreach (TidyMessage message in tidyMessages)
        {
            if (message.Level == MessageLevel.Error)
            {
                errorMessageBuilder.AppendLine(message.ToString());
            }
        }
        throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
    }
    // Give a bare <html> tag an empty default namespace declaration.
    if (xhtml.IndexOf("<html>") > -1)
    {
        xhtml = xhtml.Replace("<html>", "<html xmlns=\"\">");
    }
    // If no empty default namespace declaration exists anywhere yet, add one after "<html".
    if (xhtml.IndexOf("xmlns=\"\"") == -1)
    {
        xhtml = xhtml.Replace("<html", "<html xmlns=\"\"");
    }
    // Post-process the text; these steps run in this fixed order.
    xhtml = RemoveDuplicateAttributes(xhtml);
    xhtml = RemoveXmlDeclarations(xhtml);
    xhtml = UndoLowerCasingOfElementNames(xhtml, namespacePrefixedElementNames);
    xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, namespacePrefixToUri);
    // Warning-level messages are reported back to the caller as the error summary.
    StringBuilder messageBuilder = new StringBuilder();
    foreach (TidyMessage message in tidyMessages)
    {
        if (message.Level == MessageLevel.Warning)
        {
            messageBuilder.AppendLine(message.ToString());
        }
    }
    // Prefixes of the bad element names plus the attribute prefixes found in the output,
    // de-duplicated and limited to valid XML names.
    List <string> badNamespacePrefixes = badNamespacePrefixedElementNames.Select(n => n.Substring(0, n.IndexOf(':'))).Union(LocateAttributeNamespacePrefixes(xhtml)).Distinct().Where(f => IsValidXmlName(f)).ToList();
    XDocument outputResult;
    if (badNamespacePrefixedElementNames.Any())
    {
        // Wrap the markup in a root that binds every bad prefix to the '#bad' namespace so
        // it parses, then remove all attributes and elements that ended up in '#bad'.
        string badDeclared = string.Join(" ", badNamespacePrefixes.Select(p => string.Format("xmlns:{0}='#bad'", p)).ToArray());
        XDocument badDoc = XDocument.Parse(string.Format("<root {0}>{1}</root>", badDeclared, xhtml));
        badDoc.Descendants().Attributes().Where(e => e.Name.Namespace == "#bad").Remove();
        badDoc.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();
        // The first descendant of the wrapper root becomes the root of the result.
        outputResult = new XDocument(badDoc.Root.Descendants().First());
    }
    else
    {
        outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
    }
    return(new TidyHtmlResult { Output = outputResult, ErrorSummary = messageBuilder.ToString() });
}
/// <summary>
/// Builds the HTML page from <c>Strings.BasicHtmlPage</c>: one script (and, for jQuery
/// Mobile and jQuery UI, stylesheet) reference is added to the head for every library
/// whose version property is not empty, the placeholders are filled in, and the page is
/// optionally run through Tidy.
/// </summary>
/// <remarks>An empty <c>Charset</c> is set to "UTF-8" on this instance as a side effect.</remarks>
/// <returns>The generated page markup.</returns>
public string Generate()
{
    if (this.Charset == string.Empty)
    {
        this.Charset = "UTF-8";
    }

    // Head references, appended in a fixed library order.
    StringBuilder headBuilder = new StringBuilder();

    if (this.Jquery != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.JqueryUrl, this.Jquery)));
    }

    if (this.JqueryMobile != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.CssFrame, string.Format(Strings.JqueryMobileCssUrl, this.JqueryMobile)));
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.JqueryMobileJsUrl, this.JqueryMobile)));
    }

    if (this.JqueryUI != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.CssFrame, string.Format(Strings.JqueryUICssUrl, this.JqueryUI)));
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.JqueryUIJsUrl, this.JqueryUI)));
    }

    if (this.Angular != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.AngularUrl, this.Angular)));
    }

    if (this.Dojo != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.DojoUrl, this.Dojo)));
    }

    if (this.ExtJS != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.ExtJsUrl, this.ExtJS)));
    }

    if (this.MooTools != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.MooToolsUrl, this.MooTools)));
    }

    if (this.Protoptype != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.PrototypeUrl, this.Protoptype)));
    }

    if (this.Scriptaculous != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.ScriptaculousUrl, this.Scriptaculous)));
    }

    if (this.SWFObject != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.SWFObjectUrl, this.SWFObject)));
    }

    if (this.ThreeJS != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.ThreeJsUrl, this.ThreeJS)));
    }

    if (this.WebFontLoader != string.Empty)
    {
        headBuilder.Append(string.Format(Strings.ScriptFrame, string.Format(Strings.WebFontLoaderUrl, this.WebFontLoader)));
    }

    // Fill in the page template; the body placeholder is emptied.
    string page = Strings.BasicHtmlPage
        .Replace("{TITLE}", this.Title)
        .Replace("{CHARSET}", this.Charset)
        .Replace("{OTHER_HEAD}", headBuilder.ToString())
        .Replace("{BODY}", "");

    if (!this.HtmlTidy)
    {
        return page;
    }

    // Optional clean-up pass producing strict XHTML.
    Tidy tidy = new Tidy();
    tidy.Options.DocType = DocType.Strict;
    tidy.Options.DropFontTags = true;
    tidy.Options.LogicalEmphasis = true;
    tidy.Options.Xhtml = true;
    tidy.Options.XmlOut = true;
    tidy.Options.MakeClean = true;
    tidy.Options.TidyMark = false;

    TidyMessageCollection messages = new TidyMessageCollection();
    MemoryStream source = new MemoryStream();
    MemoryStream sink = new MemoryStream();

    byte[] pageBytes = Encoding.UTF8.GetBytes(page);
    source.Write(pageBytes, 0, pageBytes.Length);
    source.Position = 0;

    tidy.Parse(source, sink, messages);

    return Encoding.UTF8.GetString(sink.ToArray());
}
/// <summary>
/// Parses the input string and writes the resulting markup to the output stream.
/// </summary>
/// <remarks>
/// The markup is written through a <see cref="StreamWriter"/> that is disposed before
/// returning, which also closes <paramref name="output"/>.
/// </remarks>
/// <param name="input">The markup to parse.</param>
/// <param name="output">The stream that receives the parsed markup.</param>
/// <param name="messages">The collection that receives the messages; may be null.</param>
public void Parse(string input, Stream output, TidyMessageCollection messages=null)
{
    // Parse first: the writer is only created once parsing has succeeded.
    string parsedMarkup = Parse(input, messages);

    using (StreamWriter outputWriter = new StreamWriter(output))
    {
        outputWriter.Write(parsedMarkup);
    }
}
/// <summary>
/// Parses the input stream and returns the resulting markup as a string.
/// </summary>
/// <param name="input">The stream to parse.</param>
/// <param name="messages">The collection that receives the messages; may be null.</param>
/// <returns>The parsed markup, read back from an in-memory buffer.</returns>
public string Parse(Stream input, TidyMessageCollection messages=null)
{
    var buffer = new MemoryStream();
    Parse(input, buffer, messages);

    // Rewind so the reader starts at the beginning of what was written.
    buffer.Position = 0;

    using (var bufferReader = new StreamReader(buffer))
    {
        string parsedMarkup = bufferReader.ReadToEnd();
        return parsedMarkup;
    }
}
/// <summary>
/// Parses the input string and returns the resulting markup as a string.
/// </summary>
/// <param name="input">The markup to parse.</param>
/// <param name="messages">The collection that receives the messages; may be null.</param>
/// <returns>The parsed markup.</returns>
public string Parse(string input, TidyMessageCollection messages=null)
{
    var memoryStream = new MemoryStream();
    using (var writer = new StreamWriter(memoryStream))
    {
        writer.Write(input);

        // StreamWriter buffers its output, so push the text into the stream and rewind
        // it; otherwise the parser is handed a stream that is empty (or partially
        // written) and positioned at its end.
        writer.Flush();
        memoryStream.Position = 0;

        return Parse(memoryStream, messages);
    }
}
/// <summary>
/// Parses the input stream as strict XHTML and returns the result as an <see cref="XElement"/>.
/// </summary>
/// <remarks>
/// This sets DocType, QuoteNbsp, XmlOut and Xhtml on this instance's Options before
/// parsing and does not restore them afterwards.
/// </remarks>
/// <param name="input">The stream to parse.</param>
/// <param name="messages">The collection that receives the messages; may be null.</param>
/// <returns>The parsed markup loaded with <c>XElement.Parse</c>.</returns>
public XElement ParseXml(Stream input, TidyMessageCollection messages=null)
{
    // Force the options needed to obtain XML that XElement can load.
    Options.DocType = DocType.Strict;
    Options.QuoteNbsp = false;
    Options.XmlOut = true;
    Options.Xhtml = true;

    string xhtmlText = Parse(input, messages);
    XElement root = XElement.Parse(xhtmlText);
    return root;
}
/// <summary>
/// Parses the input stream and returns the DOM adapter of the resulting document node.
/// Pretty-printing to <paramref name="output"/> is delegated to ParseInternal.
/// </summary>
/// <param name="input">The stream to parse.</param>
/// <param name="output">The stream handed to ParseInternal for output.</param>
/// <param name="messages">The collection that receives the messages.</param>
/// <returns>The document's adapter, or null when ParseInternal returns no document.</returns>
internal virtual IDocument ParseDom(Stream input, Stream output, TidyMessageCollection messages)
{
    Node parsedRoot = ParseInternal(input, output, messages);

    return parsedRoot == null
        ? null
        : (IDocument) parsedRoot.Adapter;
}
/// <summary>
/// Internal routine that actually does the parsing: builds the document tree from the
/// input stream, applies the clean-up steps selected in the options and, when no errors
/// were recorded, pretty-prints the tree to the output stream (or bursts it into slides).
/// </summary>
/// <param name="input">The stream to parse. When null, nothing is parsed and null is returned.</param>
/// <param name="output">The stream to pretty-print to; nothing is printed when it is null.</param>
/// <param name="messages">The collection assigned to the lexer's Messages.</param>
/// <returns>
/// The root node of the parsed document, or null when <paramref name="input"/> is null or
/// a node integrity check fails.
/// </returns>
internal Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
{
    Node document = null;
    Out o = new OutImpl(); /* normal output stream */

    /* ensure config is self-consistent */
    _options.Adjust();

    if (input != null)
    {
        var lexer = new Lexer(new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize), _options) { Messages = messages };

        /* store pointer to lexer in input stream to allow character encoding errors to be reported */
        lexer.Input.Lexer = lexer;

        /* Tidy doesn't alter the doctype for generic XML docs */
        Node doctype;
        if (_options.XmlTags)
        {
            document = ParserImpl.ParseXmlDocument(lexer);
        }
        else
        {
            document = ParserImpl.ParseDocument(lexer);

            /* give up (after reporting) when the parsed tree is inconsistent */
            if (!document.CheckNodeIntegrity())
            {
                Report.BadTree(lexer);
                return null;
            }

            var cleaner = new Clean(_options.TagTable);

            /* simplifies <b><b> ... </b> ...</b> etc. */
            cleaner.NestedEmphasis(document);

            /* cleans up <dir>indented text</dir> etc. */
            cleaner.List2Bq(document);
            cleaner.Bq2Div(document);

            /* replaces i by em and b by strong */
            if (_options.LogicalEmphasis)
            {
                cleaner.EmFromI(document);
            }

            if (_options.Word2000 && cleaner.IsWord2000(document, _options.TagTable))
            {
                /* prune Word2000's <![if ...]> ... <![endif]> */
                cleaner.DropSections(lexer, document);

                /* drop style & class attributes and empty p, span elements */
                cleaner.CleanWord2000(lexer, document);
            }

            /* replaces presentational markup by style rules */
            if (_options.MakeClean || _options.DropFontTags)
            {
                cleaner.CleanTree(lexer, document);
            }

            /* re-check the tree after the clean-up passes */
            if (!document.CheckNodeIntegrity())
            {
                Report.BadTree(lexer);
                return null;
            }

            /* the doctype is captured before it is patched below; it is passed to ReportVersion */
            doctype = document.FindDocType();
            if (document.Content != null)
            {
                if (_options.Xhtml)
                {
                    lexer.SetXhtmlDocType(document);
                }
                else
                {
                    lexer.FixDocType(document);
                }
                if (_options.TidyMark)
                {
                    lexer.AddGenerator(document);
                }
            }

            /* ensure presence of initial <?XML version="1.0"?> */
            if (_options.XmlOut && _options.XmlPi)
            {
                lexer.FixXmlPi(document);
            }

            if (document.Content != null)
            {
                Report.ReportVersion(lexer, doctype);
                Report.ReportNumWarnings(lexer);
            }
        }

        if (lexer.Messages.Errors > 0)
        {
            Report.NeedsAuthorIntervention(lexer);
        }

        o.State = StreamIn.FSM_ASCII;
        o.Encoding = _options.CharEncoding;

        /* output is only produced when no errors were recorded */
        if (lexer.Messages.Errors == 0)
        {
            PPrint pprint;
            if (_options.BurstSlides)
            {
                /* remove doctype to avoid potential clash with markup introduced when bursting into slides */
                /* discard the document type */
                doctype = document.FindDocType();
                if (doctype != null)
                {
                    Node.DiscardElement(doctype);
                }

                /* slides use transitional features */
                lexer.Versions |= HtmlVersion.Html40Loose;

                /* and patch up doctype to match */
                if (_options.Xhtml)
                {
                    lexer.SetXhtmlDocType(document);
                }
                else
                {
                    lexer.FixDocType(document);
                }

                /* find the body element which may be implicit */
                Node body = document.FindBody(_options.TagTable);
                if (body != null)
                {
                    pprint = new PPrint(_options);
                    Report.ReportNumberOfSlides(lexer, pprint.CountSlides(body));
                    pprint.CreateSlides(lexer, document);
                }
                else
                {
                    Report.MissingBody(lexer);
                }
            }
            else if (output != null)
            {
                /* pretty-print the tree to the caller's stream, as XML or HTML depending on XmlTags */
                pprint = new PPrint(_options);
                o.Output = output;
                if (_options.XmlTags)
                {
                    pprint.PrintXmlTree(o, 0, 0, lexer, document);
                }
                else
                {
                    pprint.PrintTree(o, 0, 0, lexer, document);
                }
                pprint.FlushLine(o, 0);
            }
        }

        Report.ErrorSummary(lexer);
    }
    return document;
}