예제 #1
0
        /// <summary>
        /// Runs <paramref name="inputstr"/> through Tidy configured for strict XHTML and returns
        /// the markup that follows the last <c>&lt;body&gt;</c> tag of the Tidy output, with the
        /// <c>&lt;body&gt;</c>, <c>&lt;/body&gt;</c> and <c>&lt;/html&gt;</c> tags removed.
        /// </summary>
        /// <param name="inputstr">The HTML to convert; may be null, empty or whitespace.</param>
        /// <returns>
        /// The body content produced by Tidy, or an empty string when the input is null/blank
        /// or the Tidy output contains no <c>&lt;body&gt;</c> tag.
        /// </returns>
        public static string Convert(string inputstr)
        {
            // Also covers null, which previously failed on inputstr.Trim().
            if (string.IsNullOrWhiteSpace(inputstr))
            {
                return "";
            }

            Tidy t = new Tidy();

            t.Options.DocType = DocType.Strict;
            t.Options.Xhtml   = true;

            string temp;

            // NOTE(review): the original spelled this "UTF8Encoding.Default", which resolves to the
            // inherited static Encoding.Default (the platform default), not UTF-8. The behaviour is
            // kept as-is here; confirm which input encoding Tidy is expected to receive.
            using (MemoryStream input = new MemoryStream(Encoding.Default.GetBytes(inputstr)))
            using (MemoryStream output = new MemoryStream())
            {
                t.Parse(input, output, new TidyMessageCollection());
                temp = Encoding.UTF8.GetString(output.ToArray());
            }

            // Substring(-1) used to throw when Tidy produced no <body> tag (e.g. empty output).
            int bodyStart = temp.LastIndexOf("<body>", StringComparison.Ordinal);
            if (bodyStart < 0)
            {
                return "";
            }

            return temp.Substring(bodyStart)
                       .Replace("<body>", "")
                       .Replace("</body>", "")
                       .Replace("</html>", "");
        }
예제 #2
0
        /// <summary>
        /// Feeds <paramref name="markup"/> (as UTF-8 bytes) to <paramref name="tidy"/> and returns
        /// the text Tidy wrote to its output stream.
        /// </summary>
        /// <param name="markup">The markup to parse.</param>
        /// <param name="tidy">The Tidy instance that performs the parse.</param>
        /// <param name="tidyMessages">Receives a new collection holding the messages reported during the parse.</param>
        /// <returns>The parsed output, read back through a <c>C1StreamReader</c>.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the message collection reports one or more errors; the exception message
        /// lists every error-level message.
        /// </exception>
        private static string ParseMarkup(string markup, Tidy tidy, out TidyMessageCollection tidyMessages)
        {
            tidyMessages = new TidyMessageCollection();

            string parsed;

            using (var source = new MemoryStream(Encoding.UTF8.GetBytes(markup)))
            using (var target = new MemoryStream())
            {
                tidy.Parse(source, target, tidyMessages);

                // Rewind so the reader sees everything Tidy wrote.
                target.Position = 0;
                using (var reader = new C1StreamReader(target))
                {
                    parsed = reader.ReadToEnd();
                }
            }

            if (tidyMessages.Errors <= 0)
            {
                return parsed;
            }

            // At least one error: collect the error-level messages into the exception text.
            var errorText = new StringBuilder();
            foreach (TidyMessage entry in tidyMessages)
            {
                if (entry.Level != MessageLevel.Error)
                {
                    continue;
                }

                errorText.AppendLine(entry.ToString());
            }

            throw new InvalidOperationException("Failed to parse html:\n\n" + errorText);
        }
예제 #3
0
        /// <summary>
        /// Runs HTML (a document or a fragment) through Tidy and loads the result as an XDocument.
        /// </summary>
        /// <param name="htmlMarkup">The html to clean</param>
        /// <returns>
        /// A <c>TidyHtmlResult</c> whose <c>Output</c> is the parsed document and whose
        /// <c>ErrorSummary</c> contains the warning-level Tidy messages, one per line.
        /// </returns>
        public static TidyHtmlResult TidyHtml(string htmlMarkup)
        {
            const string xhtmlNamespaceAttribute = "xmlns=\"http://www.w3.org/1999/xhtml\"";

            Tidy configuredTidy = GetXhtmlConfiguredTidy();

            List <string> prefixedNames = LocateNamespacePrefixedElementNames(htmlMarkup);
            Dictionary <string, string> prefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);

            // Prefixed element names for which no declared prefix is a leading match.
            List <string> undeclaredNames = prefixedNames
                                            .Where(name => prefixToUri.All(declaration => !name.StartsWith(declaration.Key)))
                                            .ToList();

            AllowNamespacePrefixedElementNames(configuredTidy, prefixedNames);
            AllowHtml5ElementNames(configuredTidy);

            TidyMessageCollection messages;
            string markup = ParseMarkup(htmlMarkup, configuredTidy, out messages);

            // Make sure the root element carries the XHTML namespace declaration.
            if (markup.IndexOf("<html>") > -1)
            {
                markup = markup.Replace("<html>", "<html " + xhtmlNamespaceAttribute + ">");
            }

            if (markup.IndexOf(xhtmlNamespaceAttribute) == -1)
            {
                markup = markup.Replace("<html", "<html " + xhtmlNamespaceAttribute);
            }

            markup = RemoveDuplicateAttributes(markup);
            markup = RemoveXmlDeclarations(markup);
            markup = UndoLowerCasingOfElementNames(markup, prefixedNames);
            markup = UndoLowerCasingOfNamespacePrefixes(markup, prefixToUri);

            // Only warnings go into the summary.
            StringBuilder warnings = new StringBuilder();
            foreach (TidyMessage entry in messages)
            {
                if (entry.Level != MessageLevel.Warning)
                {
                    continue;
                }

                warnings.AppendLine(entry.ToString());
            }

            List <string> undeclaredPrefixes = undeclaredNames
                                               .Select(name => name.Substring(0, name.IndexOf(':')))
                                               .Union(LocateAttributeNamespacePrefixes(markup))
                                               .Distinct()
                                               .Where(prefix => IsValidXmlName(prefix))
                                               .ToList();

            XDocument document;

            if (undeclaredNames.Count > 0)
            {
                // Bind every undeclared prefix to a throw-away namespace on a wrapper element so the
                // markup parses, then remove everything that ended up in that namespace.
                string declarations = string.Join(" ", undeclaredPrefixes.Select(prefix => "xmlns:" + prefix + "='#bad'"));
                XDocument wrapped   = XDocument.Parse("<root " + declarations + ">" + markup + "</root>");

                wrapped.Descendants().Attributes().Where(a => a.Name.Namespace == "#bad").Remove();
                wrapped.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();

                document = new XDocument(wrapped.Root.Descendants().First());
            }
            else
            {
                document = XDocument.Parse(markup, LoadOptions.PreserveWhitespace);
            }

            TidyHtmlResult result = new TidyHtmlResult();
            result.Output       = document;
            result.ErrorSummary = warnings.ToString();
            return result;
        }
예제 #4
0
        /// <summary>
        /// Cleans <paramref name="str"/> with Tidy (UTF-8, strict XHTML, font tags dropped,
        /// logical emphasis, MakeClean) and returns the markup found between the
        /// <c>&lt;body&gt;</c> and <c>&lt;/body&gt;</c> tags of the Tidy output.
        /// </summary>
        /// <param name="str">The HTML to clean; may be null.</param>
        /// <returns>
        /// The body content without its surrounding line breaks, or an empty string when
        /// <paramref name="str"/> is null or no <c>&lt;body&gt;</c>/<c>&lt;/body&gt;</c> pair is found.
        /// </returns>
        public static string tidy(string str)
        {
            if (str == null)
            {
                return "";
            }

            Tidy tidy = new Tidy();

            tidy.Options.CharEncoding    = CharEncoding.UTF8;
            tidy.Options.DocType         = DocType.Strict;
            tidy.Options.DropFontTags    = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.Xhtml           = true;
            tidy.Options.MakeClean       = true;
            tidy.Options.TidyMark        = true;
            tidy.Options.TabSize         = 0;

            string outputString;

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(str)))
            using (MemoryStream output = new MemoryStream())
            {
                // Known issue noted by the original author: markup pasted from Word documents
                // has been observed to lose content during this parse.
                tidy.Parse(input, output, new TidyMessageCollection());
                outputString = Encoding.UTF8.GetString(output.ToArray());
            }

            const string openTag  = "<body>";
            const string closeTag = "</body>";

            int bodybegin = outputString.IndexOf(openTag, System.StringComparison.Ordinal);
            if (bodybegin < 0)
            {
                return "";
            }

            int contentStart = bodybegin + openTag.Length;
            int bodyend      = outputString.IndexOf(closeTag, contentStart, System.StringComparison.Ordinal);
            if (bodyend < 0)
            {
                return "";
            }

            // The original used fixed offsets (+8 / -10) that assumed exactly one CRLF after <body>
            // and before </body>; with LF-only or inline output that cut off or dropped content.
            // Take the exact span between the tags and strip the surrounding line breaks instead.
            return outputString.Substring(contentStart, bodyend - contentStart).Trim('\r', '\n');
        }
예제 #5
0
        /// <summary>
        /// Passes the given markup through Tidy (http://sourceforge.net/projects/tidynet/) with
        /// default options and returns whatever Tidy wrote, decoded as UTF-8.
        /// </summary>
        /// <param name="badHtmlString">The markup to clean.</param>
        /// <returns>The Tidy output as a string.</returns>
        static string CleanHtml(string badHtmlString)
        {
            Tidy cleaner = new Tidy();

            // Load the UTF-8 bytes into the source stream and rewind it for reading.
            byte[]       rawMarkup = Encoding.UTF8.GetBytes(badHtmlString);
            MemoryStream source    = new MemoryStream();
            source.Write(rawMarkup, 0, rawMarkup.Length);
            source.Seek(0, SeekOrigin.Begin);

            MemoryStream sink = new MemoryStream();
            cleaner.Parse(source, sink, new TidyMessageCollection());

            byte[] cleaned = sink.ToArray();
            return Encoding.UTF8.GetString(cleaned);
        }
        /// <summary>
        /// Serializes a field value as tidied HTML: runs <c>args.ValueNormal</c> through Tidy, keeps
        /// what lies between <c>&lt;body&gt;</c> and <c>&lt;/body&gt;</c>, trims it and stores it,
        /// wrapped in newlines, in <c>args.ValueSerialized</c>.
        /// </summary>
        /// <param name="args">The pipeline arguments; must not be null.</param>
        protected override void DoProcess(FieldSerializationPipelineArgs args)
        {
            Assert.ArgumentNotNull(args, "args");

            // Skip when a serialized value is already present, when there is nothing to
            // serialize, or when the field type is not one of the supported keys.
            if (args.ValueSerialized != null)
            {
                return;
            }

            if (string.IsNullOrWhiteSpace(args.ValueNormal))
            {
                return;
            }

            bool isSupportedType = supportedFieldTypeKeys.Any(
                key => key.Equals(args.FieldTypeKey, StringComparison.InvariantCultureIgnoreCase));
            if (!isSupportedType)
            {
                return;
            }

            Tidy formatter = new Tidy();

            formatter.Options.DocType       = DocType.Omit;
            formatter.Options.TidyMark      = true;
            formatter.Options.IndentContent = true;

            TidyMessageCollection messages = new TidyMessageCollection();

            using (MemoryStream source = new MemoryStream())
            using (MemoryStream sink = new MemoryStream())
            {
                byte[] valueBytes = Encoding.UTF8.GetBytes(args.ValueNormal);
                source.Write(valueBytes, 0, valueBytes.Length);
                source.Position = 0;
                formatter.Parse(source, sink, messages);

                string markup = Encoding.UTF8.GetString(sink.ToArray());

                const string openTag  = "<body>";
                const string closeTag = "</body>";

                // Drop everything up to and including <body> ...
                int openAt = markup.IndexOf(openTag);
                if (openAt > 0)
                {
                    markup = markup.Substring(openAt + openTag.Length);
                }

                // ... and everything from </body> onwards (searched in the already shortened text).
                int closeAt = markup.IndexOf(closeTag);
                if (closeAt > 0)
                {
                    markup = markup.Substring(0, closeAt);
                }

                markup = markup.Trim();

                args.ValueSerialized        = Environment.NewLine + markup + Environment.NewLine;
                args.FieldSerializationType = FieldSerializationType.Html;
            }
        }
예제 #7
0
        /// <summary>
        /// Indents the given html source by running it through Tidy with
        /// <c>IndentContent</c> enabled.
        /// </summary>
        /// <param name="htmlSource">The html source.</param>
        /// <returns>A string with the new source, as written by Tidy.</returns>
        public String IndentContent(String htmlSource)
        {
            Tidy tidy = new Tidy();

            tidy.Options.IndentContent = true;

            // The original encoded the input and decoded the output as UTF-16 (Encoding.Unicode)
            // without telling Tidy which encoding to use. Use UTF-8 on both sides and configure
            // Tidy to match, so non-ASCII text survives the round trip.
            // NOTE(review): assumes Tidy offers no UTF-16 mode - confirm against the Tidy.NET
            // CharEncoding enumeration.
            tidy.Options.CharEncoding = CharEncoding.UTF8;

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, new TidyMessageCollection());
                return Encoding.UTF8.GetString(output.ToArray());
            }
        }
예제 #8
0
        /// <summary>
        /// Cleans <paramref name="text"/> with Tidy (MakeClean, Word2000 and EncloseText enabled)
        /// and passes the UTF-8 decoded output through <c>RemoveTidyAdditions</c>.
        /// </summary>
        /// <param name="text">The markup to tidy.</param>
        /// <returns>The result of <c>RemoveTidyAdditions</c> applied to the Tidy output.</returns>
        public static string TidyHtml(string text)
        {
            var cleaner = new Tidy();
            var log     = new TidyMessageCollection();

            // Load the UTF-8 bytes into the source stream and rewind it for reading.
            byte[] payload = Encoding.UTF8.GetBytes(text);
            var    source  = new MemoryStream();
            var    sink    = new MemoryStream();
            source.Write(payload, 0, payload.Length);
            source.Seek(0, SeekOrigin.Begin);

            // An earlier, stricter option set (Xhtml, DocType.Strict, CharEncoding.UTF8,
            // LogicalEmphasis, SmartIndent, IndentContent, QuoteAmpersand, DropEmptyParas, ...)
            // was disabled because it caused problems with "font" tags: it could mangle a
            // font tag into "fontface=...".
            cleaner.Options.TidyMark    = false;
            cleaner.Options.MakeClean   = true;
            cleaner.Options.Word2000    = true;
            cleaner.Options.EncloseText = true;

            // Required to stop spaces being removed, and tabs added etc...
            cleaner.Options.Spaces  = 0;
            cleaner.Options.WrapLen = 32000;

            cleaner.Parse(source, sink, log);

            string tidied = Encoding.UTF8.GetString(sink.ToArray());
            return RemoveTidyAdditions(tidied);
        }
        /// <summary>
        /// Runs <paramref name="source"/> through Tidy (XHTML / XML output, no doctype, numeric
        /// entities), loads the result as XML and returns the children of <c>/html/body</c>
        /// serialized as a string, after an <c>xmlns</c> attribute set to <c>XhtmlNamespace</c>
        /// has been appended to each direct child element.
        /// </summary>
        /// <param name="source">The HTML to convert.</param>
        /// <returns>The inner XML of a new <c>body</c> element holding the imported child nodes.</returns>
        public static String ConvertHtmlToXhtml(String source)
        {
            MemoryStream input  = new MemoryStream(Encoding.UTF8.GetBytes(source));
            MemoryStream output = new MemoryStream();

            TidyMessageCollection tmc = new TidyMessageCollection();
            Tidy tidy = new Tidy();

            tidy.Options.DocType         = DocType.Omit;
            tidy.Options.DropFontTags    = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.Xhtml           = true;
            tidy.Options.XmlOut          = true;
            tidy.Options.MakeClean       = true;
            tidy.Options.TidyMark        = false;
            tidy.Options.NumEntities     = true;


            tidy.Parse(input, output, tmc);

            // x holds the Tidy output; xhtml is a separate document with an empty <body/> that
            // collects the imported nodes.
            XmlDocument x     = new XmlDocument();
            XmlDocument xhtml = new XmlDocument();

            xhtml.LoadXml("<body />");
            XmlNode xhtmlBody = xhtml.SelectSingleNode("/body");

            x.LoadXml(Encoding.UTF8.GetString(output.ToArray()));
            XmlAttribute ns = x.CreateAttribute("xmlns");

            // XhtmlNamespace is defined outside this method - presumably the XHTML namespace URI.
            ns.Value = XhtmlNamespace;
            // NOTE(review): SelectSingleNode returns null when nothing matches this un-namespaced
            // path, and the loop below would then throw a NullReferenceException - confirm that the
            // Tidy output always matches "/html/body".
            XmlNode body = x.SelectSingleNode("/html/body");

            foreach (XmlNode node in body.ChildNodes)
            {
                if (node.NodeType == XmlNodeType.Element)
                {
                    // The same XmlAttribute instance is appended to every element in turn.
                    // NOTE(review): this presumably works because the deep ImportNode copy below
                    // is made before the attribute is attached to the next element - keep this order.
                    node.Attributes.Append(ns);
                }

                xhtmlBody.AppendChild(xhtml.ImportNode(node, true));
            }
            return(xhtmlBody.InnerXml);
        }
예제 #10
0
        /// <summary>
        /// Uses Tidy.Net to clean a html source, producing XHTML/XML output. Whether the Word 2000
        /// clean-up is applied is taken from <c>isWordHtml</c>, which is declared outside this
        /// method (it is not a parameter).
        /// </summary>
        /// <param name="htmlSource">The original html source.</param>
        /// <returns>
        /// The cleaned Html, or <paramref name="htmlSource"/> unchanged when Tidy throws a
        /// <see cref="FormatException"/> (the exception is logged).
        /// </returns>
        public string Clean(string htmlSource)
        {
            Tidy tidy = new Tidy();

            // Options required for xhtml conversion.
            tidy.Options.DocType          = DocType.Strict;
            tidy.Options.DropFontTags     = true;
            tidy.Options.LogicalEmphasis  = true;
            tidy.Options.Xhtml            = true;
            tidy.Options.XmlOut           = true;
            tidy.Options.MakeClean        = true;
            tidy.Options.TidyMark         = false;
            tidy.Options.DropEmptyParas   = true;
            tidy.Options.IndentContent    = true;
            tidy.Options.SmartIndent      = true;
            tidy.Options.Word2000         = isWordHtml;
            tidy.Options.EncloseBlockText = true;
            tidy.Options.XmlTags          = true;
            tidy.Options.FixComments      = true;

            TidyMessageCollection messages = new TidyMessageCollection();

            // Encoding happens outside the try block, exactly as before.
            byte[]       sourceBytes = Encoding.UTF8.GetBytes(htmlSource);
            MemoryStream source      = new MemoryStream(sourceBytes);
            MemoryStream sink        = new MemoryStream();

            try
            {
                tidy.Parse(source, sink, messages);
            }
            catch (FormatException ex)
            {
                // Best effort: log the failure and hand the input back untouched.
                Log.Exception(ex);
                return htmlSource;
            }

            return Encoding.UTF8.GetString(sink.ToArray());
        }
예제 #11
0
        /// <summary>
        /// Runs <paramref name="initialContent"/> through Tidy (Word2000 clean-up, XHTML output)
        /// and stores the markup found inside the body element in <paramref name="cleanContent"/>.
        /// </summary>
        /// <param name="initialContent">The markup to clean; it is only read.</param>
        /// <param name="cleanContent">
        /// Receives the Tidy output with everything up to and including the opening
        /// <c>&lt;body ...&gt;</c> tag and everything from <c>&lt;/body</c> onwards removed. When
        /// a tag is not found, the corresponding part is left in place.
        /// </param>
        private void cleanContent(ref String initialContent, ref String cleanContent)
        {
            Tidy tidy = new Tidy();

            // Only these two options are active; the original also carried a commented-out set
            // (DocType.Strict, DropFontTags, LogicalEmphasis, XmlOut, TidyMark, MakeClean).
            tidy.Options.Word2000 = true;
            tidy.Options.Xhtml    = true;

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(initialContent)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, new TidyMessageCollection());
                cleanContent = Encoding.UTF8.GetString(output.ToArray());
            }

            // Delete header: everything up to the end of the opening <body ...> tag.
            // Guarded, because IndexOf(">", -1) throws when no <body tag is present.
            int bodyStart = cleanContent.IndexOf("<body", StringComparison.Ordinal);
            if (bodyStart >= 0)
            {
                int bodyTagEnd = cleanContent.IndexOf(">", bodyStart, StringComparison.Ordinal);
                if (bodyTagEnd >= 0)
                {
                    cleanContent = cleanContent.Remove(0, bodyTagEnd + 1);
                }
            }

            // Delete footer: the closing body tag and everything after it.
            int bodyClose = cleanContent.IndexOf("</body", StringComparison.Ordinal);
            if (bodyClose >= 0)
            {
                cleanContent = cleanContent.Remove(bodyClose);
            }
        }
예제 #12
0
        /// <summary>
        /// Parses <paramref name="xmlMarkup"/> into an XDocument. The markup is first handed to
        /// <c>XhtmlDocument.Parse</c>; if that throws, it is repaired with the Tidy instance
        /// returned by <c>GetXmlConfiguredTidy</c> and parsed again.
        /// </summary>
        /// <param name="xmlMarkup">The markup to clean</param>
        /// <returns>The parsed document.</returns>
        public static XDocument TidyXml(string xmlMarkup)
        {
            // Fast path: the markup may already be parseable as it is.
            try
            {
                return XhtmlDocument.Parse(xmlMarkup);
            }
            catch (Exception)
            {
                // Deliberately ignored - fall through to the slower Tidy-based repair.
            }

            Tidy xmlTidy = GetXmlConfiguredTidy();

            AllowNamespacePrefixedElementNames(xmlTidy, LocateNamespacePrefixedElementNames(xmlMarkup));
            AllowHtml5ElementNames(xmlTidy);

            // The Tidy messages are not used on this path.
            TidyMessageCollection unusedMessages;
            string repaired = RemoveDuplicateAttributes(ParseMarkup(xmlMarkup, xmlTidy, out unusedMessages));

            return XDocument.Parse(repaired);
        }
예제 #13
0
        /// <summary>
        /// Cleans HTML documents or fragments into XHTML conformant markup
        /// </summary>
        /// <param name="htmlMarkup">The html to clean</param>
        /// <returns>
        /// A <c>TidyHtmlResult</c> whose <c>Output</c> is the parsed XHTML document and whose
        /// <c>ErrorSummary</c> lists the warning-level Tidy messages, one per line.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when Tidy reports one or more errors; the message lists every error-level message.
        /// </exception>
        public static TidyHtmlResult TidyHtml(string htmlMarkup)
        {
            byte[] htmlByteArray = Encoding.UTF8.GetBytes(htmlMarkup);

            Tidy tidy = GetXhtmlConfiguredTidy();

            List <string> namespacePrefixedElementNames      = LocateNamespacePrefixedElementNames(htmlMarkup);
            Dictionary <string, string> namespacePrefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);

            // Prefixed element names for which no declared prefix is a leading match.
            List <string> badNamespacePrefixedElementNames = namespacePrefixedElementNames
                                                             .Where(s => !namespacePrefixToUri.Any(d => s.StartsWith(d.Key)))
                                                             .ToList();

            AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
            AllowHtml5ElementNames(tidy);

            TidyMessageCollection tidyMessages = new TidyMessageCollection();
            string xhtml;

            using (MemoryStream inputStream = new MemoryStream(htmlByteArray))
            using (MemoryStream outputStream = new MemoryStream())
            {
                tidy.Parse(inputStream, outputStream, tidyMessages);

                // Rewind so the reader sees everything Tidy wrote.
                outputStream.Position = 0;

                // The reader was previously left undisposed; release it deterministically.
                using (C1StreamReader sr = new C1StreamReader(outputStream))
                {
                    xhtml = sr.ReadToEnd();
                }
            }

            if (tidyMessages.Errors > 0)
            {
                StringBuilder errorMessageBuilder = new StringBuilder();
                foreach (TidyMessage message in tidyMessages)
                {
                    if (message.Level == MessageLevel.Error)
                    {
                        errorMessageBuilder.AppendLine(message.ToString());
                    }
                }
                throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
            }

            // Make sure the root element carries the XHTML namespace declaration.
            if (xhtml.IndexOf("<html>") > -1)
            {
                xhtml = xhtml.Replace("<html>", "<html xmlns=\"http://www.w3.org/1999/xhtml\">");
            }

            if (xhtml.IndexOf("xmlns=\"http://www.w3.org/1999/xhtml\"") == -1)
            {
                xhtml = xhtml.Replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"");
            }

            xhtml = RemoveDuplicateAttributes(xhtml);
            xhtml = RemoveXmlDeclarations(xhtml);
            xhtml = UndoLowerCasingOfElementNames(xhtml, namespacePrefixedElementNames);
            xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, namespacePrefixToUri);

            // Only warnings are reported back in the summary.
            StringBuilder messageBuilder = new StringBuilder();

            foreach (TidyMessage message in tidyMessages)
            {
                if (message.Level == MessageLevel.Warning)
                {
                    messageBuilder.AppendLine(message.ToString());
                }
            }

            List <string> badNamespacePrefixes = badNamespacePrefixedElementNames
                                                 .Select(n => n.Substring(0, n.IndexOf(':')))
                                                 .Union(LocateAttributeNamespacePrefixes(xhtml))
                                                 .Distinct()
                                                 .Where(f => IsValidXmlName(f))
                                                 .ToList();

            XDocument outputResult;

            if (badNamespacePrefixedElementNames.Any())
            {
                // Bind every undeclared prefix to a throw-away namespace on a wrapper element so the
                // markup parses, then remove everything that ended up in that namespace.
                string    badDeclared = string.Join(" ", badNamespacePrefixes.Select(p => string.Format("xmlns:{0}='#bad'", p)).ToArray());
                XDocument badDoc      = XDocument.Parse(string.Format("<root {0}>{1}</root>", badDeclared, xhtml));
                badDoc.Descendants().Attributes().Where(e => e.Name.Namespace == "#bad").Remove();
                badDoc.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();
                outputResult = new XDocument(badDoc.Root.Descendants().First());
            }
            else
            {
                outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
            }

            return(new TidyHtmlResult {
                Output = outputResult, ErrorSummary = messageBuilder.ToString()
            });
        }
예제 #14
0
        /// <summary>
        /// Builds the page markup from <c>Strings.BasicHtmlPage</c>: adds one script (and, where
        /// applicable, stylesheet) reference to the head for every library property that is not
        /// the empty string, then fills in the {TITLE}, {CHARSET}, {OTHER_HEAD} and {BODY}
        /// placeholders. An empty <c>Charset</c> is set to "UTF-8" as a side effect.
        /// </summary>
        /// <returns>The generated page, passed through Tidy first when <c>HtmlTidy</c> is set.</returns>
        public string Generate()
        {
            string page = Strings.BasicHtmlPage;

            if (this.Charset == string.Empty)
            {
                this.Charset = "UTF-8";
            }

            #region HeadInit
            StringBuilder headMarkup = new StringBuilder();

            if (this.Jquery != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryUrl, this.Jquery));
            }
            if (this.JqueryMobile != string.Empty)
            {
                headMarkup.AppendFormat(Strings.CssFrame, string.Format(Strings.JqueryMobileCssUrl, this.JqueryMobile));
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryMobileJsUrl, this.JqueryMobile));
            }
            if (this.JqueryUI != string.Empty)
            {
                headMarkup.AppendFormat(Strings.CssFrame, string.Format(Strings.JqueryUICssUrl, this.JqueryUI));
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.JqueryUIJsUrl, this.JqueryUI));
            }
            if (this.Angular != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.AngularUrl, this.Angular));
            }
            if (this.Dojo != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.DojoUrl, this.Dojo));
            }
            if (this.ExtJS != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ExtJsUrl, this.ExtJS));
            }
            if (this.MooTools != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.MooToolsUrl, this.MooTools));
            }
            if (this.Protoptype != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.PrototypeUrl, this.Protoptype));
            }
            if (this.Scriptaculous != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ScriptaculousUrl, this.Scriptaculous));
            }
            if (this.SWFObject != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.SWFObjectUrl, this.SWFObject));
            }
            if (this.ThreeJS != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.ThreeJsUrl, this.ThreeJS));
            }
            if (this.WebFontLoader != string.Empty)
            {
                headMarkup.AppendFormat(Strings.ScriptFrame, string.Format(Strings.WebFontLoaderUrl, this.WebFontLoader));
            }
            #endregion

            // Fill in the template placeholders; the body is always left empty.
            page = page.Replace("{TITLE}", this.Title)
                       .Replace("{CHARSET}", this.Charset)
                       .Replace("{OTHER_HEAD}", headMarkup.ToString())
                       .Replace("{BODY}", "");

            #region HtmlTidy
            if (!this.HtmlTidy)
            {
                return page;
            }

            Tidy formatter = new Tidy();
            formatter.Options.DocType         = DocType.Strict;
            formatter.Options.DropFontTags    = true;
            formatter.Options.LogicalEmphasis = true;
            formatter.Options.Xhtml           = true;
            formatter.Options.XmlOut          = true;
            formatter.Options.MakeClean       = true;
            formatter.Options.TidyMark        = false;

            TidyMessageCollection messages = new TidyMessageCollection();

            byte[]       pageBytes = Encoding.UTF8.GetBytes(page);
            MemoryStream source    = new MemoryStream();
            MemoryStream sink      = new MemoryStream();
            source.Write(pageBytes, 0, pageBytes.Length);
            source.Position = 0;

            formatter.Parse(source, sink, messages);
            page = Encoding.UTF8.GetString(sink.ToArray());
            #endregion

            return page;
        }