/// <summary>
/// Rewrites <paramref name="content"/> as XHTML: the text is run through HTML Tidy,
/// the tidied <c>div</c> inside <c>body</c> is extracted, and the result is stored
/// back on the content with its type set to "xhtml".
/// </summary>
/// <param name="content">The content to clean; its Text and Type are overwritten.</param>
/// <exception cref="FailedToCleanContentException">
/// Thrown when Tidy produces no output; the message lists the error-level Tidy messages.
/// </exception>
public void CleanContentTrusted(AtomContent content)
{
    Logger.Info("Cleaning content to be valid xhtml.");

    XNamespace xhtml = Atom.XhtmlNs;

    // "html" content has no wrapper element yet, so give it an XHTML div.
    string markup = content.Text;
    if (content.Type == "html")
    {
        markup = "<div xmlns=\"" + xhtml.NamespaceName + "\">" + markup + "</div>";
    }

    Tidy tidy = new Tidy();
    var options = tidy.Options;
    options.DocType = DocType.Strict;
    options.LogicalEmphasis = true;
    options.Xhtml = true;
    options.XmlOut = true;
    options.MakeClean = true;
    options.TidyMark = false;
    options.QuoteNbsp = false;
    options.NumEntities = true;
    options.CharEncoding = CharEncoding.UTF8;
    options.FixBackslash = true;
    options.FixComments = true;

    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    using (MemoryStream source = new MemoryStream())
    using (MemoryStream tidied = new MemoryStream())
    {
        byte[] sourceBytes = Encoding.UTF8.GetBytes(markup);
        source.Write(sourceBytes, 0, sourceBytes.Length);
        source.Position = 0;

        tidy.Parse(source, tidied, tidyMessages);
        markup = Encoding.UTF8.GetString(tidied.ToArray());

        if (string.IsNullOrEmpty(markup))
        {
            string header = string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, tidyMessages.Errors);
            string[] errorLines = tidyMessages.Cast<TidyMessage>()
                .Where(m => m.Level == MessageLevel.Error)
                .Select(m => m.ToString())
                .ToArray();
            throw new FailedToCleanContentException(header + string.Join(Environment.NewLine, errorLines));
        }
    }

    // Tidy wraps the fragment in html/body; keep only the div.
    XElement div = XElement.Parse(markup)
        .Element(xhtml + "body")
        .Element(xhtml + "div");

    // Drop the xmlns attributes Tidy adds on descendants when they repeat the XHTML namespace.
    div.Descendants()
        .Select(d => d.Attribute("xmlns"))
        .Where(a => a != null && a.Value == xhtml.NamespaceName)
        .Remove();

    content.Type = "xhtml";
    content.Text = div.ToString(SaveOptions.None);
}
/// <summary>
/// Page load handler. Requests where IsMembershipUser is 1 or UserId is 0 are redirected
/// to reterror.aspx; otherwise the posted "mycontent" form value is run through Tidy and
/// the body markup (or the Tidy error list when Tidy yields nothing) is put into
/// <c>content.Value</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Page_Load(System.Object sender, System.EventArgs e)
{
    _messageHelper = contentAPI.EkMsgRef;

    bool accessAllowed = contentAPI.RequestInformationRef.IsMembershipUser != 1
        && contentAPI.RequestInformationRef.UserId != 0;
    if (!accessAllowed)
    {
        string errorUrl = contentAPI.ApplicationPath + "reterror.aspx?info="
            + Server.UrlEncode(_messageHelper.GetMessage("msg login cms user"));
        Response.Redirect(errorUrl, false);
        return;
    }

    this.pageTitle.Text = (new ApplicationAPI()).EkMsgRef.GetMessage("ektron translation");
    htmleditor = this.Request.Form["htmleditor"];
    htmcontent = this.Request.Form["mycontent"];

    // Normalise the posted markup with Tidy.
    TidyNet.Tidy tidy = new TidyNet.Tidy();
    tidy.Options.BreakBeforeBR = true;
    tidy.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
    tidy.Options.DocType = TidyNet.DocType.Omit;
    tidy.Options.DropEmptyParas = false;
    tidy.Options.MakeClean = true;
    tidy.Options.NumEntities = true;
    tidy.Options.QuoteAmpersand = true;
    tidy.Options.QuoteMarks = false;
    tidy.Options.QuoteNbsp = true;
    tidy.Options.RawOut = false;
    tidy.Options.TidyMark = false;
    tidy.Options.Word2000 = true;
    tidy.Options.XmlOut = true;

    TidyNet.TidyMessageCollection tidyMessages = new TidyNet.TidyMessageCollection();
    byte[] postedBytes = Encoding.UTF8.GetBytes(htmcontent);
    System.IO.MemoryStream source = new System.IO.MemoryStream();
    System.IO.MemoryStream tidied = new System.IO.MemoryStream();
    source.Write(postedBytes, 0, postedBytes.Length);
    source.Position = 0;
    tidy.Parse(source, tidied, tidyMessages);
    string tidyOutput = Encoding.UTF8.GetString(tidied.ToArray());
    tidied.Close();
    source.Close();

    bool tidyFailed = (tidyOutput == "") && (tidyMessages.Errors > 0);
    if (!tidyFailed)
    {
        // Keep only what sits between <body> and the closing body/html tags.
        string withoutHead = System.Text.RegularExpressions.Regex.Replace(tidyOutput, "[\\w\\W]*?<body>", "");
        content.Value = withoutHead.Replace("</body>\r\n</html>", "");
        return;
    }

    // Tidy produced nothing: show its error-level messages instead.
    StringBuilder errorMarkup = new StringBuilder();
    foreach (TidyNet.TidyMessage message in tidyMessages)
    {
        if (message.Level != TidyNet.MessageLevel.Error)
        {
            continue;
        }
        errorMarkup.Append(message.ToString()).Append("<br />");
    }
    htmcontent = errorMarkup.ToString();
    content.Value = htmcontent;
}
/// <summary>
/// Parses the markup read from <paramref name="input"/> and writes the result to
/// <paramref name="output"/>.
/// </summary>
/// <param name="input">The input stream.</param>
/// <param name="output">The output stream.</param>
/// <param name="messages">The collection passed on to the parser for its messages.</param>
/// <remarks>An <see cref="IOException"/> raised while parsing is swallowed.</remarks>
public virtual void Parse(Stream input, Stream output, TidyMessageCollection messages)
{
    try
    {
        Parse(input, null, output, messages);
    }
    catch (IOException)
    {
        // Also covers FileNotFoundException, which derives from IOException.
    }
}
/// <summary>
/// Parses <paramref name="input"/> and returns the DOM document adapter of the root node.
/// The output stream is handed to ParseInternal unchanged.
/// </summary>
/// <returns>The root node's adapter as an IDocument, or null when parsing yields no root node.</returns>
internal virtual IDocument ParseDom(Stream input, Stream Output, TidyMessageCollection messages)
{
    Node root = ParseInternal(input, Output, messages);
    if (root == null)
    {
        return null;
    }

    return (IDocument)root.Adapter;
}
/// <summary>
/// Reads the statement file and runs its html through the HtmlTidy library, first
/// replacing the £ sign with `, as the lib does not seem to support iso-8859-1, the
/// encoding for the original statement. The js script tags are commented out
/// afterwards, as these fail parsing.
/// </summary>
/// <param name="statement">Path of the statement file to read.</param>
/// <returns>A wellformed xml (bank) statement</returns>
public string TidyStatement(string statement)
{
    Encoding encoding = Encoding.GetEncoding(ReadEncoding);

    // Dispose the reader so the file handle is released as soon as the content is read.
    string sourceStatementFileContent;
    using (var reader = new StreamReader(statement, encoding))
    {
        sourceStatementFileContent = reader.ReadToEnd();
    }

    var tmc = new TidyMessageCollection();
    byte[] bytes = encoding.GetBytes(sourceStatementFileContent.Replace('£', '`'));

    string outputResult;
    using (var input = new MemoryStream(bytes))
    using (var output = new MemoryStream())
    {
        _tidy.Parse(input, output, tmc);
        outputResult = encoding.GetString(output.ToArray());
    }

    outputResult = StringUtils.InsertStringInString(outputResult, @"<script", @"<!--", true);
    return StringUtils.InsertStringInString(outputResult, @"</script>", @"-->", false);
}
/// <summary>
/// Runs the html source through Tidy with content indentation switched on.
/// </summary>
/// <param name="htmlSource">The html source to indent.</param>
/// <returns>The Tidy output decoded as a string.</returns>
public String IndentContent(String htmlSource)
{
    Tidy indenter = new Tidy();
    indenter.Options.IndentContent = true;

    // The source is encoded and decoded with Encoding.Unicode on both sides of the Tidy call.
    byte[] sourceBytes = Encoding.Unicode.GetBytes(htmlSource);
    MemoryStream source = new MemoryStream();
    source.Write(sourceBytes, 0, sourceBytes.Length);
    source.Position = 0;

    MemoryStream indented = new MemoryStream();
    indenter.Parse(source, indented, new TidyMessageCollection());

    return Encoding.Unicode.GetString(indented.ToArray());
}
/// <summary>
/// Parses <paramref name="input"/> and returns the root node, delegating to the
/// overload that also accepts a file name (passed as null here).
/// </summary>
/// <returns>The root node, or null when an <see cref="IOException"/> is raised while parsing.</returns>
internal virtual Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
{
    try
    {
        return ParseInternal(input, null, output, messages);
    }
    catch (IOException)
    {
        // Also covers FileNotFoundException, which derives from IOException.
        return null;
    }
}
/// <summary>
/// Cleans a html source with Tidy.Net, configured for xhtml output.
/// </summary>
/// <param name="htmlSource">The original html source.</param>
/// <returns>
/// The cleaned html, or <paramref name="htmlSource"/> unchanged when Tidy throws a
/// <see cref="FormatException"/> (the exception is logged).
/// </returns>
/// <remarks>
/// The Word2000 option is taken from <c>isWordHtml</c>, which is not a parameter of this
/// method; presumably a member of the enclosing type (confirm).
/// </remarks>
public string Clean(string htmlSource)
{
    Tidy cleaner = new Tidy();

    // Options required for xhtml conversion.
    cleaner.Options.DocType = DocType.Strict;
    cleaner.Options.DropFontTags = true;
    cleaner.Options.LogicalEmphasis = true;
    cleaner.Options.Xhtml = true;
    cleaner.Options.XmlOut = true;
    cleaner.Options.MakeClean = true;
    cleaner.Options.TidyMark = false;
    cleaner.Options.DropEmptyParas = true;
    cleaner.Options.IndentContent = true;
    cleaner.Options.SmartIndent = true;
    cleaner.Options.Word2000 = isWordHtml;
    cleaner.Options.EncloseBlockText = true;
    cleaner.Options.XmlTags = true;
    cleaner.Options.FixComments = true;

    byte[] sourceBytes = Encoding.UTF8.GetBytes(htmlSource);
    MemoryStream source = new MemoryStream();
    source.Write(sourceBytes, 0, sourceBytes.Length);
    source.Position = 0;

    MemoryStream cleaned = new MemoryStream();
    TidyMessageCollection messages = new TidyMessageCollection();
    try
    {
        cleaner.Parse(source, cleaned, messages);
    }
    catch (FormatException ex)
    {
        Log.Exception(ex);
        return htmlSource;
    }

    return Encoding.UTF8.GetString(cleaned.ToArray());
}
/// <summary>
/// Downloads http://192.168.100.1/indexData.htm, converts it to XHTML with Tidy and
/// parses the uptime found in the second cell of the third row of the second table.
/// </summary>
/// <returns>The uptime read from the page.</returns>
/// <exception cref="InvalidOperationException">The expected uptime cell is not present in the page.</exception>
public TimeSpan GetUptime()
{
    TidyMessageCollection tmc = new TidyMessageCollection();
    var d = new System.Xml.XmlDocument();

    using (MemoryStream xhtmlStream = new MemoryStream())
    {
        var r = System.Net.WebRequest.Create("http://192.168.100.1/indexData.htm");
        r.Timeout = 5000;

        // The using blocks dispose the response and its stream; no explicit Close is needed.
        using (var res = r.GetResponse())
        using (var htmlStream = res.GetResponseStream())
        {
            tidy.Parse(htmlStream, xhtmlStream, tmc);
        }

        xhtmlStream.Position = 0;
        d.Load(xhtmlStream);
    }

    var navigator = d.CreateNavigator();
    XmlNamespaceManager manager = new XmlNamespaceManager(navigator.NameTable);
    manager.AddNamespace("x", d.DocumentElement.NamespaceURI); // http://www.w3.org/1999/xhtml

    var eUptime = (XmlElement)d.SelectSingleNode("x:html/x:body/x:table[2]/x:tbody/x:tr[3]/x:td[2]", manager);
    if (eUptime == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException when the page layout differs.
        throw new InvalidOperationException("The uptime cell was not found in the status page.");
    }

    // Turn "<d> days <h>h:<m>m:<s>s"-style text into a colon-separated value TimeSpan.Parse accepts.
    var v = eUptime.InnerText
        .Replace(" days ", ":")
        .Replace("h", "")
        .Replace("m", "")
        .Replace("s", "");
    return TimeSpan.Parse(v);
}
/// <summary>
/// Converts html to xhtml with Tidy and returns the children of the tidied body element
/// as an xml string.
/// </summary>
/// <param name="source">The html to convert.</param>
/// <returns>
/// The inner xml of a rebuilt body element; every element directly under the tidied body
/// gets an xmlns attribute set to XhtmlNamespace before it is copied.
/// </returns>
public static String ConvertHtmlToXhtml(String source)
{
    MemoryStream htmlInput = new MemoryStream(Encoding.UTF8.GetBytes(source));
    MemoryStream xhtmlOutput = new MemoryStream();

    Tidy converter = new Tidy();
    converter.Options.DocType = DocType.Omit;
    converter.Options.DropFontTags = true;
    converter.Options.LogicalEmphasis = true;
    converter.Options.Xhtml = true;
    converter.Options.XmlOut = true;
    converter.Options.MakeClean = true;
    converter.Options.TidyMark = false;
    converter.Options.NumEntities = true;
    converter.Parse(htmlInput, xhtmlOutput, new TidyMessageCollection());

    // Target document that collects the copied body children.
    XmlDocument target = new XmlDocument();
    target.LoadXml("<body />");
    XmlNode targetBody = target.SelectSingleNode("/body");

    XmlDocument tidied = new XmlDocument();
    tidied.LoadXml(Encoding.UTF8.GetString(xhtmlOutput.ToArray()));

    XmlAttribute namespaceAttribute = tidied.CreateAttribute("xmlns");
    namespaceAttribute.Value = XhtmlNamespace;

    XmlNode tidiedBody = tidied.SelectSingleNode("/html/body");
    foreach (XmlNode child in tidiedBody.ChildNodes)
    {
        // The attribute is appended before the deep import so the imported copy carries it.
        if (child.NodeType == XmlNodeType.Element)
        {
            child.Attributes.Append(namespaceAttribute);
        }

        XmlNode imported = target.ImportNode(child, true);
        targetBody.AppendChild(imported);
    }

    return targetBody.InnerXml;
}
/// <summary>
/// Stream-only entry point: forwards to the ParseInternal overload that also takes a
/// file name, passing null for the file.
/// </summary>
/// <returns>The root node, or null when parsing raises an <see cref="IOException"/>.</returns>
internal virtual Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
{
    Node root;
    try
    {
        root = ParseInternal(input, null, output, messages);
    }
    catch (IOException)
    {
        // FileNotFoundException is an IOException, so one handler covers both.
        root = null;
    }

    return root;
}
/// <summary>
/// Parses the input stream via ParseInternal and exposes the resulting root node as a DOM document.
/// </summary>
/// <returns>The adapter of the root node cast to IDocument, or null when there is no root node.</returns>
internal virtual IDocument ParseDom(Stream input, Stream Output, TidyMessageCollection messages)
{
    Node parsedRoot = ParseInternal(input, Output, messages);
    return parsedRoot == null ? null : (IDocument)parsedRoot.Adapter;
}
/// <summary>
/// Parses markup taken from a stream or a file and writes the result to the output stream.
/// </summary>
/// <param name="input">The stream to read from.</param>
/// <param name="file">The file to read from.</param>
/// <param name="Output">The stream that receives the output.</param>
/// <param name="messages">The collection passed on to the parser for its messages.</param>
public void Parse(Stream input, string file, Stream Output, TidyMessageCollection messages)
{
    // The root node returned by ParseInternal is not used here.
    ParseInternal(input, file, Output, messages);
}
/// <summary>
/// Posts the chapter title to the e-home.no search form, tidies the returned page into
/// xhtml and extracts one result per table row whose id attribute is 17 characters long.
/// </summary>
/// <param name="chapterInfo">Supplies the title to search for.</param>
/// <returns>The id and first-cell text of every matching row.</returns>
/// <exception cref="Exception">Tidy reported one or more errors for the returned page.</exception>
public override List<SearchResult> Search(ChapterInfo chapterInfo)
{
    // Form body captured from the search page; only txtTitle varies.
    const string formTemplate = "__VIEWSTATE=%2FwEPDwUKLTM3MTkwMDA5NQ9kFgICAQ9kFgICDQ88KwALAGRkg%2BhH%2F3tiaQDjnQncD1sYDdeni%2BA%3D&txtTitle={0}&btnSearch=Search&__EVENTVALIDATION=%2FwEWAwLXiqPdDAL55JyzBAKln%2FPuCgMJnDvHIVAx2tPEYdjNUbwqrR67";

    string pageHtml;
    using (WebClient client = new WebClient())
    {
        client.Headers["Content-Type"] = "application/x-www-form-urlencoded";
        Uri searchUri = new Uri("http://www.e-home.no/metaservices/search.aspx");
        string formBody = string.Format(formTemplate, HttpUtility.UrlEncode(chapterInfo.Title));
        pageHtml = client.UploadString(searchUri, "POST", formBody);
    }

    Tidy tidy = new Tidy();
    var options = tidy.Options;
    options.DocType = DocType.Strict;
    options.LogicalEmphasis = true;
    options.Xhtml = true;
    options.XmlOut = true;
    options.MakeClean = true;
    options.TidyMark = false;
    options.QuoteNbsp = false;
    options.NumEntities = true;
    options.CharEncoding = CharEncoding.UTF8;
    options.FixBackslash = true;
    options.FixComments = true;

    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    string pageXhtml;
    using (MemoryStream source = new MemoryStream())
    using (MemoryStream tidied = new MemoryStream())
    {
        byte[] pageBytes = Encoding.UTF8.GetBytes(pageHtml);
        source.Write(pageBytes, 0, pageBytes.Length);
        source.Position = 0;
        tidy.Parse(source, tidied, tidyMessages);
        pageXhtml = Encoding.UTF8.GetString(tidied.ToArray());
    }

    if (tidyMessages.Errors > 0)
    {
        string header = string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, tidyMessages.Errors);
        string[] errorLines = tidyMessages.Cast<TidyMessage>()
            .Where(m => m.Level == MessageLevel.Error)
            .Select(m => m.ToString())
            .ToArray();
        throw new Exception(header + string.Join(Environment.NewLine, errorLines));
    }

    XNamespace ns = "http://www.w3.org/1999/xhtml";
    XDocument searchXhtml = XDocument.Parse(pageXhtml);

    // Result rows are the tr elements whose id attribute has exactly 17 characters.
    IEnumerable<XElement> resultRows = searchXhtml.Descendants(ns + "tr")
        .Where(tr =>
        {
            XAttribute id = tr.Attribute("id");
            return id != null && id.Value.Length == 17;
        });

    Debug.Write(resultRows.Count());

    var titles = resultRows.Select(tr => new SearchResult()
    {
        Id = (string)tr.Attribute("id"),
        Name = (string)tr.Elements(ns + "td").First()
    });

    // The query is materialised only after the completion callback, as before.
    OnSearchComplete();
    return titles.ToList();
}
/// <summary>
/// Cleans HTML documents or fragments into XHTML conformant markup.
/// </summary>
/// <param name="htmlMarkup">The html to clean</param>
/// <returns>
/// A result whose Output is the parsed XHTML document and whose ErrorSummary holds the
/// warning-level Tidy messages, one per line.
/// </returns>
/// <exception cref="InvalidOperationException">Tidy reported one or more errors.</exception>
public static TidyHtmlResult TidyHtml(string htmlMarkup)
{
    const string xhtmlNamespaceDeclaration = "xmlns=\"http://www.w3.org/1999/xhtml\"";

    byte[] htmlByteArray = Encoding.UTF8.GetBytes(htmlMarkup);
    Tidy tidy = GetXhtmlConfiguredTidy();

    List<string> namespacePrefixedElementNames = LocateNamespacePrefixedElementNames(htmlMarkup);
    Dictionary<string, string> namespacePrefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);

    // Prefixed element names for which no declared prefix is a leading match.
    List<string> badNamespacePrefixedElementNames = namespacePrefixedElementNames
        .Where(name => !namespacePrefixToUri.Any(declaration => name.StartsWith(declaration.Key)))
        .ToList();

    AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
    AllowHtml5ElementNames(tidy);

    TidyMessageCollection tidyMessages = new TidyMessageCollection();
    string xhtml = "";
    using (MemoryStream inputStream = new MemoryStream(htmlByteArray))
    using (MemoryStream outputStream = new MemoryStream())
    {
        tidy.Parse(inputStream, outputStream, tidyMessages);
        outputStream.Position = 0;
        C1StreamReader reader = new C1StreamReader(outputStream);
        xhtml = reader.ReadToEnd();
    }

    if (tidyMessages.Errors > 0)
    {
        StringBuilder errorMessageBuilder = new StringBuilder();
        foreach (TidyMessage message in tidyMessages)
        {
            if (message.Level != MessageLevel.Error)
            {
                continue;
            }
            errorMessageBuilder.AppendLine(message.ToString());
        }
        throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
    }

    // Make sure the html element declares the XHTML namespace.
    if (xhtml.IndexOf("<html>") > -1)
    {
        xhtml = xhtml.Replace("<html>", "<html " + xhtmlNamespaceDeclaration + ">");
    }
    if (xhtml.IndexOf(xhtmlNamespaceDeclaration) == -1)
    {
        xhtml = xhtml.Replace("<html", "<html " + xhtmlNamespaceDeclaration);
    }

    xhtml = RemoveDuplicateAttributes(xhtml);
    xhtml = RemoveXmlDeclarations(xhtml);
    xhtml = UndoLowerCasingOfElementNames(xhtml, namespacePrefixedElementNames);
    xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, namespacePrefixToUri);

    StringBuilder warningBuilder = new StringBuilder();
    foreach (TidyMessage message in tidyMessages)
    {
        if (message.Level != MessageLevel.Warning)
        {
            continue;
        }
        warningBuilder.AppendLine(message.ToString());
    }

    List<string> badNamespacePrefixes = badNamespacePrefixedElementNames
        .Select(name => name.Substring(0, name.IndexOf(':')))
        .Union(LocateAttributeNamespacePrefixes(xhtml))
        .Distinct()
        .Where(prefix => IsValidXmlName(prefix))
        .ToList();

    XDocument outputResult;
    if (!badNamespacePrefixedElementNames.Any())
    {
        outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
    }
    else
    {
        // Bind the bad prefixes to a marker namespace so the markup parses, then drop
        // every attribute and element that ended up in that namespace.
        string[] markerDeclarations = badNamespacePrefixes
            .Select(prefix => string.Format("xmlns:{0}='#bad'", prefix))
            .ToArray();
        string badDeclared = string.Join(" ", markerDeclarations);

        XDocument badDoc = XDocument.Parse(string.Format("<root {0}>{1}</root>", badDeclared, xhtml));
        badDoc.Descendants().Attributes().Where(a => a.Name.Namespace == "#bad").Remove();
        badDoc.Descendants().Where(el => el.Name.Namespace == "#bad").Remove();

        outputResult = new XDocument(badDoc.Root.Descendants().First());
    }

    return new TidyHtmlResult
    {
        Output = outputResult,
        ErrorSummary = warningBuilder.ToString()
    };
}
/// <summary>
/// Runs the html through Tidy (xhtml output, char encoding taken from
/// UmbracoSettings.TidyCharEncoding) and returns what follows the first body tag.
/// </summary>
/// <param name="htmlString">The html to tidy.</param>
/// <returns>
/// The second segment obtained by splitting the Tidy output on body tags, or
/// "[tidy error]" when the output contains no body tag.
/// </returns>
public static string ValidateHtml(string htmlString)
{
    var tidy = new Tidy();
    var tidyOptions = tidy.Options;
    tidyOptions.DocType = DocType.Omit;
    tidyOptions.Xhtml = true;
    tidyOptions.XmlOut = true;
    tidyOptions.LogicalEmphasis = true;
    tidyOptions.MakeClean = true;
    tidyOptions.TidyMark = false;
    tidyOptions.CharEncoding = (CharEncoding)Enum.Parse(typeof(TidyNet.CharEncoding), UmbracoSettings.TidyCharEncoding);

    var htmlBytes = Encoding.UTF8.GetBytes(htmlString);
    var source = new MemoryStream();
    source.Write(htmlBytes, 0, htmlBytes.Length);
    source.Position = 0;

    var tidied = new MemoryStream();
    tidy.Parse(source, tidied, new TidyMessageCollection());
    var tidiedMarkup = Encoding.UTF8.GetString(tidied.ToArray());

    // Split on opening and closing body tags; index 1 is the text after the first one.
    var splitOptions = System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace
        | System.Text.RegularExpressions.RegexOptions.Multiline
        | System.Text.RegularExpressions.RegexOptions.IgnoreCase;
    string[] segments = System.Text.RegularExpressions.Regex.Split(tidiedMarkup, @"</{0,1}body[^>]*>", splitOptions);

    return segments.Length > 1 ? segments[1] : "[tidy error]";
}
/// <summary>
/// Runs the text through Tidy and passes the output to RemoveTidyAdditions.
/// </summary>
/// <param name="text">The html text to tidy.</param>
/// <returns>The value returned by RemoveTidyAdditions for the tidied markup.</returns>
private static string TidyHtml(string text)
{
    var source = new MemoryStream();
    var tidied = new MemoryStream();
    var textBytes = Encoding.UTF8.GetBytes(text);
    source.Write(textBytes, 0, textBytes.Length);
    source.Position = 0;

    var tidy = new Tidy();
    var options = tidy.Options;

    // Xhtml stays off: it causes problems with "font" tags, which can get
    // mangled into "fontface=...".
    options.MakeClean = false;
    options.DocType = DocType.Strict;
    options.CharEncoding = CharEncoding.UTF8;
    options.LogicalEmphasis = true;
    options.SmartIndent = true;
    options.IndentContent = true;
    options.TidyMark = false;
    options.QuoteAmpersand = true;
    options.DropFontTags = false;
    options.DropEmptyParas = true;

    // Required to stop spaces being removed, tabs being added, etc.
    options.Spaces = 0;
    options.WrapLen = 32000;

    tidy.Parse(source, tidied, new TidyMessageCollection());

    string tidiedMarkup = Encoding.UTF8.GetString(tidied.ToArray());
    return RemoveTidyAdditions(tidiedMarkup);
}
/// <summary>
/// Strips the tags named in the tag list (em, span, u, a) from the text, keeping the text
/// between them, and then runs the result through Tidy.
/// </summary>
/// <param name="strText">The markup to strip; must not be null.</param>
/// <returns>
/// The Tidy output, or the stripped but untidied markup when Tidy reports errors and
/// produces no output.
/// </returns>
/// <remarks>
/// The previous implementation looked for the end of a tag with
/// <c>(position + 1).ToString().IndexOf(strText)</c>, which never matches, so no tag was
/// ever removed. The searches now start at the intended position inside the text.
/// </remarks>
private string RemoveHTML(string strText)
{
    // Tags that are removed (matching is case-sensitive, as in the list).
    const string TagList = ";em;span;u;a;";
    // Tags removed together with their content. A tag must also be in TagList for this
    // to apply; with the current lists no tag is in both.
    const string BlockTagList = ";APPLET;";

    StringBuilder stripped = new StringBuilder();
    int tagStart = strText.IndexOf('<');
    while (tagStart >= 0)
    {
        int tagEnd = strText.IndexOf('>', tagStart + 1);
        if (tagEnd < 0)
        {
            // Unterminated tag: keep the remainder verbatim.
            break;
        }

        // The tag name is the text after '<' up to the first whitespace.
        string tagName = strText.Substring(tagStart + 1, tagEnd - tagStart - 1)
            .Replace('\r', ' ')
            .Replace('\n', ' ');
        int firstSpace = tagName.IndexOf(' ');
        if (firstSpace >= 0)
        {
            tagName = tagName.Substring(0, firstSpace);
        }

        bool isOpeningTag = true;
        if (tagName.StartsWith("/", StringComparison.Ordinal))
        {
            tagName = tagName.Substring(1);
            isOpeningTag = false;
        }

        string listKey = ";" + tagName + ";";
        if (TagList.IndexOf(listKey, StringComparison.Ordinal) >= 0)
        {
            int removeThrough = tagEnd;
            if (isOpeningTag && BlockTagList.IndexOf(listKey, StringComparison.Ordinal) >= 0)
            {
                // Block tag: drop everything through its closing tag, or to the end of
                // the text when no closing tag exists.
                removeThrough = strText.Length - 1;
                int closingStart = strText.IndexOf("</" + tagName, tagStart + 1, StringComparison.Ordinal);
                if (closingStart >= 0)
                {
                    int closingEnd = strText.IndexOf('>', closingStart + 1);
                    if (closingEnd >= 0)
                    {
                        removeThrough = closingEnd;
                    }
                }
            }

            stripped.Append(strText, 0, tagStart);
            strText = strText.Substring(removeThrough + 1);
        }
        else
        {
            // Not a listed tag: keep the '<' and continue scanning after it.
            stripped.Append(strText, 0, tagStart + 1);
            strText = strText.Substring(tagStart + 1);
        }

        tagStart = strText.IndexOf('<');
    }
    stripped.Append(strText);

    // NOTE(review): if the first literal is a plain space this call is a no-op; it may
    // originally have been a non-breaking space - confirm.
    string strResult = stripped.ToString().Replace(" ", " ");

    // also run Tidy on the text
    TidyNet.Tidy tidydoc = new TidyNet.Tidy();
    tidydoc.Options.RawOut = false;
    tidydoc.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
    tidydoc.Options.DocType = TidyNet.DocType.Omit;
    tidydoc.Options.TidyMark = false;
    tidydoc.Options.Word2000 = true;
    tidydoc.Options.QuoteNbsp = true;
    tidydoc.Options.QuoteAmpersand = true;
    tidydoc.Options.NumEntities = false;
    tidydoc.Options.QuoteMarks = true;
    tidydoc.Options.Xhtml = false;
    tidydoc.Options.MakeClean = true;

    TidyNet.TidyMessageCollection messageCollection = new TidyNet.TidyMessageCollection();
    string strTidyResult;
    using (System.IO.MemoryStream tidyin = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(strResult)))
    using (System.IO.MemoryStream tidyout = new System.IO.MemoryStream())
    {
        tidydoc.Parse(tidyin, tidyout, messageCollection);
        strTidyResult = Encoding.UTF8.GetString(tidyout.ToArray());
    }

    if ((strTidyResult == "") && (messageCollection.Errors > 0))
    {
        // Tidy failed: fall back to the untidied text. The Tidy messages are not
        // returned to the caller (the old code built an error string but discarded it).
        return strResult;
    }

    return strTidyResult;
}
/// <summary> Internal routine that actually does the parsing. The caller
/// can pass either an InputStream or file name. If both are passed,
/// the file name is preferred.
/// </summary>
/// <param name="input">Stream to parse; replaced by a FileStream when <paramref name="file"/> is given, and by standard input when both are null.</param>
/// <param name="file">Path of a file to open for reading, or null.</param>
/// <param name="Output">Stream the document is pretty-printed to when no errors were recorded and BurstSlides is off; may be null.</param>
/// <param name="messages">Collection assigned to the lexer for its messages.</param>
/// <returns>The parsed document node, or null when a node integrity check fails.</returns>
internal Node ParseInternal(Stream input, string file, Stream Output, TidyMessageCollection messages)
{
    Lexer lexer;
    Node document = null;
    Node doctype;
    Out o = new OutImpl(); /* normal output stream */
    PPrint pprint;

    /* ensure config is self-consistent */
    _options.Adjust();

    /* a file name takes precedence over the stream; with neither, read standard input */
    if (file != null)
    {
        input = new FileStream(file, FileMode.Open, FileAccess.Read);
    }
    else if (input == null)
    {
        input = Console.OpenStandardInput();
    }

    if (input != null)
    {
        lexer = new Lexer(new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize), _options);
        lexer.messages = messages;

        /* store pointer to lexer in input stream to allow character encoding errors to be reported */
        lexer.input.Lexer = lexer;

        /* Tidy doesn't alter the doctype for generic XML docs */
        if (_options.XmlTags)
        {
            document = ParserImpl.parseXMLDocument(lexer);
        }
        else
        {
            document = ParserImpl.parseDocument(lexer);

            // NOTE(review): this early return (and the one further down) happens before the
            // close block below, so a FileStream opened from 'file' is not closed on this path.
            if (!document.CheckNodeIntegrity())
            {
                Report.BadTree(lexer);
                return null;
            }

            Clean cleaner = new Clean(_options.tt);

            /* simplifies <b><b> ... </b> ...</b> etc. */
            cleaner.NestedEmphasis(document);

            /* cleans up <dir>indented text</dir> etc. */
            cleaner.List2BQ(document);
            cleaner.BQ2Div(document);

            /* replaces i by em and b by strong */
            if (_options.LogicalEmphasis)
            {
                cleaner.EmFromI(document);
            }

            if (_options.Word2000 && cleaner.IsWord2000(document, _options.tt))
            {
                /* prune Word2000's <![if ...]> ... <![endif]> */
                cleaner.DropSections(lexer, document);

                /* drop style & class attributes and empty p, span elements */
                cleaner.CleanWord2000(lexer, document);
            }

            /* replaces presentational markup by style rules */
            if (_options.MakeClean || _options.DropFontTags)
            {
                cleaner.CleanTree(lexer, document);
            }

            /* re-check the tree after the clean-up passes */
            if (!document.CheckNodeIntegrity())
            {
                Report.BadTree(lexer);
                return null;
            }

            /* remember the doctype found in the input; it is reported below */
            doctype = document.FindDocType();
            if (document.Content != null)
            {
                if (_options.Xhtml)
                {
                    lexer.SetXhtmlDocType(document);
                }
                else
                {
                    lexer.FixDocType(document);
                }

                if (_options.TidyMark)
                {
                    lexer.AddGenerator(document);
                }
            }

            /* ensure presence of initial <?XML version="1.0"?> */
            if (_options.XmlOut && _options.XmlPi)
            {
                lexer.FixXmlPI(document);
            }

            if (document.Content != null)
            {
                Report.ReportVersion(lexer, doctype);
                Report.ReportNumWarnings(lexer);
            }
        }

        // Try to close the input stream, but only if we created it.
        // NOTE(review): the second operand compares against Console.OpenStandardOutput(),
        // not the standard input stream opened above. Because this block only runs when
        // 'file' is non-null, 'input' is the FileStream created above - confirm intent.
        if ((file != null) && (input != Console.OpenStandardOutput()))
        {
            try
            {
                input.Close();
            }
            catch (IOException)
            {
            }
        }

        if (lexer.messages.Errors > 0)
        {
            Report.NeedsAuthorIntervention(lexer);
        }

        o.State = StreamIn.FSM_ASCII;
        o.Encoding = _options.CharEncoding;

        /* output is produced only when no errors were recorded */
        if (lexer.messages.Errors == 0)
        {
            if (_options.BurstSlides)
            {
                Node body;

                body = null;
                /* remove doctype to avoid potential clash with markup introduced when bursting into slides */
                /* discard the document type */
                doctype = document.FindDocType();

                if (doctype != null)
                {
                    Node.DiscardElement(doctype);
                }

                /* slides use transitional features */
                lexer.versions |= HtmlVersion.Html40Loose;

                /* and patch up doctype to match */
                if (_options.Xhtml)
                {
                    lexer.SetXhtmlDocType(document);
                }
                else
                {
                    lexer.FixDocType(document);
                }

                /* find the body element which may be implicit */
                body = document.FindBody(_options.tt);

                if (body != null)
                {
                    pprint = new PPrint(_options);
                    Report.ReportNumberOfSlides(lexer, pprint.CountSlides(body));
                    pprint.CreateSlides(lexer, document);
                }
                else
                {
                    Report.MissingBody(lexer);
                }
            }
            else if (Output != null)
            {
                /* pretty print the tree to the caller's output stream */
                pprint = new PPrint(_options);
                o.Output = Output;

                if (_options.XmlTags)
                {
                    pprint.PrintXmlTree(o, (short) 0, 0, lexer, document);
                }
                else
                {
                    pprint.PrintTree(o, (short) 0, 0, lexer, document);
                }

                pprint.FlushLine(o, 0);
            }
        }

        Report.ErrorSummary(lexer);
    }
    return document;
}
/// <summary>
/// Click handler: reads the field ids from CIA_Fields, tidies the matching
/// "&lt;folder&gt;\&lt;id&gt;.html" file for each id and parses every
/// td[@class='category_data'] cell, appending the output to the output text box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <exception cref="Exception">The field query failed; the original exception is attached as inner exception.</exception>
private void buttonGo_Click(object sender, EventArgs e)
{
    // I had to throw in the len as Budget Revenue and Expenditures have the same ID except the Expenditures has a 1 at the end.
    Exception ex = _db.ExecuteSqlReader("SELECT * FROM CIA_Fields WHERE fieldid > 2000 AND LEN(fieldid) = 4");
    if (ex != null)
    {
        // Keep the original exception (and its stack trace) as the inner exception.
        throw new Exception(ex.Message, ex);
    }

    var fieldIDs = new List<int>();
    try
    {
        while (_db.Reader.Read())
        {
            fieldIDs.Add((int)_db.Reader["FieldID"]);
        }
    }
    finally
    {
        // Close the reader even when reading a row fails.
        _db.Reader.Close();
    }

    foreach (var f in fieldIDs)
    {
        textBoxOutput.Text += f + Environment.NewLine;

        // Scope the file stream so its handle is released after each file.
        string result;
        using (var input = File.OpenRead(textFolder.Text + "\\" + f + ".html"))
        using (var output = new MemoryStream())
        {
            var tmc = new TidyMessageCollection();
            var tidy = new Tidy();
            tidy.Options.DocType = DocType.Strict;
            tidy.Options.DropFontTags = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.Xhtml = true;
            tidy.Options.XmlOut = true;
            tidy.Options.MakeClean = true;
            tidy.Options.TidyMark = false;
            tidy.Options.WrapLen = 0;
            tidy.Parse(input, output, tmc);
            result = Encoding.UTF8.GetString(output.ToArray());
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(result);

        var categoryData = doc.DocumentNode.SelectNodes("//td[@class='category_data']");
        if (categoryData == null)
        {
            textBoxOutput.Text += f + ": NO DATA" + Environment.NewLine;
            continue;
        }

        foreach (var i in categoryData)
        {
            if (i == null)
            {
                continue;
            }

            // The id of the cell's grandparent node is looked up in the country tag list;
            // cells without a match are skipped.
            var tagID = _countryTagList.SingleOrDefault(a => a.Key == i.ParentNode.ParentNode.Id);
            if (tagID.Key == null)
            {
                continue;
            }

            switch (f)
            {
                case 2085:
                    Parse.Parse2085(textBoxOutput, i.InnerText, f, tagID.Value);
                    break;
                case 2091:
                    Parse.Parse2091(textBoxOutput, i.InnerText, f, tagID.Value);
                    break;
                case 2121:
                    Parse.Parse2121(textBoxOutput, i.InnerText, f, tagID.Value);
                    break;
                case 2056:
                    Parse.Parse2056(textBoxOutput, i.InnerText, f, tagID.Value);
                    break;
                default:
                    textBoxOutput.Text += Parse.ParseTableData(i.InnerText, f, tagID.Value);
                    break;
            }
        }
    }
}
/// <summary>
/// Shortens a HTML formatted string, while keeping HTML formatting and complete words
/// (also removes line-breaks at the end of the shortened string).
/// </summary>
/// <param name="input">The HTML formatted string</param>
/// <param name="inputIsShortened">Output boolean telling if the input string has been shortened</param>
/// <param name="length">The approximate length of the output string (default: 300)</param>
/// <param name="elipsis">Elipsis text to append to the output string (use string.Empty when elipsis should not be added, default: ...)</param>
/// <returns>The shortened input string with HTML formatting</returns>
public static string ShortenHtml(this string input, out bool inputIsShortened, int length = 300, string elipsis = "...")
{
    inputIsShortened = false;
    if (input.Length <= length)
    {
        return input;
    }

    input = input.Replace("<br />", "<br/>");

    // Cut at the first space at or after 'length' so words stay complete. Normalising
    // "<br />" can make the input shorter than 'length'; in that case keep all of it
    // instead of letting Substring throw.
    int cut = input.Length;
    if (input.Length > length)
    {
        int nextSpace = input.IndexOf(' ', length);
        if (nextSpace >= 0)
        {
            cut = nextSpace;
        }
    }

    string substring = input.Substring(0, cut).Trim();

    // Drop line breaks left dangling at the end of the cut.
    while (substring.EndsWith("<br/>"))
    {
        substring = substring.Substring(0, substring.Length - 5).Trim();
    }

    inputIsShortened = input.Length > substring.Length;

    substring = substring.Replace("<br/>", "<br />");

    // Let Tidy close any tags the cut left open.
    Tidy tidy = new Tidy();
    tidy.Options.DocType = DocType.Omit;
    tidy.Options.CharEncoding = CharEncoding.UTF8;
    tidy.Options.Xhtml = true;
    tidy.Options.NumEntities = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    string tidyResult;
    using (MemoryStream inputStream = new MemoryStream(Encoding.UTF8.GetBytes(substring)))
    using (MemoryStream outputStream = new MemoryStream())
    {
        tidy.Parse(inputStream, outputStream, tmc);
        tidyResult = Encoding.UTF8.GetString(outputStream.ToArray());
    }

    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.LoadXml(tidyResult);
    tidyResult = xmlDoc.SelectSingleNode("//body").InnerXml;

    if (string.IsNullOrEmpty(elipsis))
    {
        return tidyResult;
    }

    // Put the elipsis inside a trailing paragraph rather than after it.
    if (tidyResult.EndsWith("</p>"))
    {
        return string.Concat(tidyResult.Substring(0, tidyResult.Length - 4), elipsis, "</p>");
    }

    return string.Concat(tidyResult, elipsis);
}
/// <summary>
/// Runs <paramref name="initialContent"/> through Tidy (Word2000 clean-up, xhtml output)
/// and stores the markup found between the body tags in <paramref name="cleanContent"/>.
/// </summary>
/// <param name="initialContent">The html to clean; it is only read.</param>
/// <param name="cleanContent">
/// Receives the cleaned markup. When the Tidy output contains no body tag it receives the
/// Tidy output as is.
/// </param>
private void cleanContent(ref String initialContent, ref String cleanContent)
{
    Tidy tidy = new Tidy();
    tidy.Options.Word2000 = true;
    tidy.Options.Xhtml = true;

    TidyMessageCollection tmc = new TidyMessageCollection();
    byte[] byteArray = Encoding.UTF8.GetBytes(initialContent);
    using (MemoryStream input = new MemoryStream(byteArray))
    using (MemoryStream output = new MemoryStream())
    {
        tidy.Parse(input, output, tmc);
        cleanContent = Encoding.UTF8.GetString(output.ToArray());
    }

    // Delete the header: everything up to and including the opening body tag.
    // Guarded so that output without a body tag no longer makes IndexOf throw.
    int bodyStart = cleanContent.IndexOf("<body", StringComparison.Ordinal);
    if (bodyStart >= 0)
    {
        int bodyTagEnd = cleanContent.IndexOf(">", bodyStart, StringComparison.Ordinal);
        if (bodyTagEnd >= 0)
        {
            cleanContent = cleanContent.Remove(0, bodyTagEnd + 1);
        }
    }

    // Delete the footer: the closing body tag and everything after it.
    int bodyClose = cleanContent.IndexOf("</body", StringComparison.Ordinal);
    if (bodyClose >= 0)
    {
        cleanContent = cleanContent.Remove(bodyClose);
    }
}
/// <summary>
/// Internal routine that actually does the parsing. The caller
/// can pass either an input stream or a file name. If both are passed,
/// the file name is preferred.
/// </summary>
/// <param name="input">Stream to read the markup from. Ignored when <paramref name="file"/>
/// is given; when both are null, standard input is read instead.</param>
/// <param name="file">Path of a file to open read-only and parse, or null.</param>
/// <param name="Output">Stream the document is printed to when no errors were recorded and
/// slides are not being burst; may be null to skip printing.</param>
/// <param name="messages">Collection handed to the lexer as its message sink.</param>
/// <returns>The parsed document, or null when the tree fails its integrity check.</returns>
internal Node ParseInternal(Stream input, string file, Stream Output, TidyMessageCollection messages)
{
    Lexer lexer;
    Node document = null;
    Node doctype;
    Out o = new OutImpl(); /* normal output stream */
    PPrint pprint;

    /* ensure config is self-consistent */
    _options.Adjust();

    // Only the file stream opened here is ours to close; a caller-supplied stream
    // and standard input are left open, as before.
    bool ownsInput = false;
    if (file != null)
    {
        input = new FileStream(file, FileMode.Open, FileAccess.Read);
        ownsInput = true;
    }
    else if (input == null)
    {
        input = Console.OpenStandardInput();
    }

    try
    {
        if (input != null)
        {
            lexer = new Lexer(new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize), _options);
            lexer.messages = messages;

            /*
             * store pointer to lexer in input stream
             * to allow character encoding errors to be
             * reported
             */
            lexer.input.Lexer = lexer;

            /* Tidy doesn't alter the doctype for generic XML docs */
            if (_options.XmlTags)
            {
                document = ParserImpl.parseXMLDocument(lexer);
            }
            else
            {
                document = ParserImpl.parseDocument(lexer);

                if (!document.CheckNodeIntegrity())
                {
                    Report.BadTree(lexer);
                    return null;
                }

                Clean cleaner = new Clean(_options.tt);

                /* simplifies <b><b> ... </b> ...</b> etc. */
                cleaner.NestedEmphasis(document);

                /* cleans up <dir>indented text</dir> etc. */
                cleaner.List2BQ(document);
                cleaner.BQ2Div(document);

                /* replaces i by em and b by strong */
                if (_options.LogicalEmphasis)
                {
                    cleaner.EmFromI(document);
                }

                if (_options.Word2000 && cleaner.IsWord2000(document, _options.tt))
                {
                    /* prune Word2000's <![if ...]> ... <![endif]> */
                    cleaner.DropSections(lexer, document);

                    /* drop style & class attributes and empty p, span elements */
                    cleaner.CleanWord2000(lexer, document);
                }

                /* replaces presentational markup by style rules */
                if (_options.MakeClean || _options.DropFontTags)
                {
                    cleaner.CleanTree(lexer, document);
                }

                if (!document.CheckNodeIntegrity())
                {
                    Report.BadTree(lexer);
                    return null;
                }

                doctype = document.FindDocType();
                if (document.Content != null)
                {
                    if (_options.Xhtml)
                    {
                        lexer.SetXhtmlDocType(document);
                    }
                    else
                    {
                        lexer.FixDocType(document);
                    }

                    if (_options.TidyMark)
                    {
                        lexer.AddGenerator(document);
                    }
                }

                /* ensure presence of initial <?XML version="1.0"?> */
                if (_options.XmlOut && _options.XmlPi)
                {
                    lexer.FixXmlPI(document);
                }

                if (document.Content != null)
                {
                    Report.ReportVersion(lexer, doctype);
                    Report.ReportNumWarnings(lexer);
                }
            }

            // Close the input stream, but only if we created it. This deliberately happens
            // before anything is printed, exactly where the stream used to be closed.
            if (ownsInput)
            {
                ownsInput = false;
                try
                {
                    input.Close();
                }
                catch (IOException)
                {
                }
            }

            if (lexer.messages.Errors > 0)
            {
                Report.NeedsAuthorIntervention(lexer);
            }

            o.State = StreamIn.FSM_ASCII;
            o.Encoding = _options.CharEncoding;

            if (lexer.messages.Errors == 0)
            {
                if (_options.BurstSlides)
                {
                    /*
                     * remove doctype to avoid potential clash with
                     * markup introduced when bursting into slides
                     */
                    /* discard the document type */
                    doctype = document.FindDocType();
                    if (doctype != null)
                    {
                        Node.DiscardElement(doctype);
                    }

                    /* slides use transitional features */
                    lexer.versions |= HtmlVersion.Html40Loose;

                    /* and patch up doctype to match */
                    if (_options.Xhtml)
                    {
                        lexer.SetXhtmlDocType(document);
                    }
                    else
                    {
                        lexer.FixDocType(document);
                    }

                    /* find the body element which may be implicit */
                    Node body = document.FindBody(_options.tt);
                    if (body != null)
                    {
                        pprint = new PPrint(_options);
                        Report.ReportNumberOfSlides(lexer, pprint.CountSlides(body));
                        pprint.CreateSlides(lexer, document);
                    }
                    else
                    {
                        Report.MissingBody(lexer);
                    }
                }
                else if (Output != null)
                {
                    pprint = new PPrint(_options);
                    o.Output = Output;

                    if (_options.XmlTags)
                    {
                        pprint.PrintXmlTree(o, (short)0, 0, lexer, document);
                    }
                    else
                    {
                        pprint.PrintTree(o, (short)0, 0, lexer, document);
                    }

                    pprint.FlushLine(o, 0);
                }
            }

            Report.ErrorSummary(lexer);
        }
    }
    finally
    {
        // Reached with ownsInput still set only on an early return or an exception:
        // make sure the file stream we opened does not leak on those paths.
        if (ownsInput)
        {
            try
            {
                input.Close();
            }
            catch (IOException)
            {
            }
        }
    }

    return document;
}
/// <summary>
/// Turns an HTML document or fragment into XHTML conformant markup.
/// </summary>
/// <param name="xmlMarkup">The html to clean</param>
/// <returns>The markup parsed as a document, either directly or after a pass through Tidy.</returns>
/// <exception cref="InvalidOperationException">Tidy reported one or more errors.</exception>
public static XDocument TidyXml(string xmlMarkup)
{
    // Fast path: markup that already parses needs no tidying.
    try
    {
        return XhtmlDocument.Parse(xmlMarkup);
    }
    catch (Exception)
    {
        // take the slow road below...
    }

    byte[] markupBytes = Encoding.UTF8.GetBytes(xmlMarkup);

    Tidy cleaner = GetXmlConfiguredTidy();
    List<string> prefixedNames = LocateNamespacePrefixedElementNames(xmlMarkup);
    AllowNamespacePrefixedElementNames(cleaner, prefixedNames);
    AllowHtml5ElementNames(cleaner);

    TidyMessageCollection messages = new TidyMessageCollection();
    string tidied = "";

    using (MemoryStream source = new MemoryStream(markupBytes))
    using (MemoryStream sink = new MemoryStream())
    {
        cleaner.Parse(source, sink, messages);

        // Rewind so the reader sees what Tidy just wrote.
        sink.Position = 0;
        C1StreamReader reader = new C1StreamReader(sink);
        tidied = reader.ReadToEnd();
    }

    bool hasErrors = messages.Errors > 0;
    if (!hasErrors)
    {
        return XDocument.Parse(RemoveDuplicateAttributes(tidied));
    }

    // Report only the messages at error level, one per line.
    StringBuilder errorText = new StringBuilder();
    foreach (TidyMessage entry in messages)
    {
        if (entry.Level == MessageLevel.Error)
        {
            errorText.AppendLine(entry.ToString());
        }
    }

    throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorText.ToString()));
}
/// <summary>
/// Cleans bad HTML using Tidy (http://sourceforge.net/projects/tidynet/).
/// </summary>
/// <param name="badHtmlString">The markup to clean.</param>
/// <returns>Whatever Tidy wrote to its output stream, decoded as UTF-8. The messages Tidy
/// collects while parsing are not inspected.</returns>
static string CleanHtml (string badHtmlString)
{
	Tidy tidy = new Tidy ();
	TidyMessageCollection tidyMsg = new TidyMessageCollection ();

	// Both streams are scoped to the parse so they are released even when Tidy throws.
	using (MemoryStream input = new MemoryStream (Encoding.UTF8.GetBytes (badHtmlString)))
	using (MemoryStream output = new MemoryStream ()) {
		tidy.Parse (input, output, tidyMsg);
		return Encoding.UTF8.GetString (output.ToArray ());
	}
}
/// <summary>
/// Internal routine that parses a file by name. The file is opened read-only, handed to
/// the stream-based ParseInternal overload and closed again afterwards. When no file name
/// is given, standard input is parsed instead.
/// </summary>
/// <param name="file">Path of the file to parse, or null to read standard input.</param>
/// <param name="Output">Passed through to the stream-based overload.</param>
/// <param name="messages">Passed through to the stream-based overload.</param>
/// <returns>The node returned by the stream-based overload.</returns>
internal Node ParseInternal(string file, Stream Output, TidyMessageCollection messages)
{
    if (file == null)
    {
        // Standard input is not ours to close.
        return ParseInternal(Console.OpenStandardInput(), Output, messages);
    }

    Stream input = new FileStream(file, FileMode.Open, FileAccess.Read);
    try
    {
        return ParseInternal(input, Output, messages);
    }
    finally
    {
        // We created the stream, so we close it - also when parsing throws.
        // A failure to close is ignored, as it always was.
        try
        {
            input.Close();
        }
        catch (IOException)
        {
        }
    }
}