/// <summary>
/// Runs the text of <paramref name="content"/> through HTML Tidy and stores the
/// resulting XHTML <c>div</c> back on the same object.
/// </summary>
/// <param name="content">
/// Content to clean in place. On success its Type becomes "xhtml" and its Text
/// becomes the serialized <c>div</c> element.
/// </param>
/// <exception cref="FailedToCleanContentException">
/// Tidy produced no output; the message lists the error-level Tidy messages.
/// </exception>
public void CleanContentTrusted(AtomContent content)
    {
      Logger.Info("Cleaning content to be valid xhtml.");

      // "html" content is wrapped in an XHTML-namespaced div before tidying;
      // that div is what gets extracted again below.
      string markup = content.Text;
      if (content.Type == "html")
      {
        markup = "<div xmlns=\"" + Atom.XhtmlNs.NamespaceName + "\">" + markup + "</div>";
      }

      Tidy cleaner = new Tidy();
      var options = cleaner.Options;
      options.DocType = DocType.Strict;
      options.LogicalEmphasis = true;
      options.Xhtml = true;
      options.XmlOut = true;
      options.MakeClean = true;
      options.TidyMark = false;
      options.QuoteNbsp = false;
      options.NumEntities = true;
      options.CharEncoding = CharEncoding.UTF8;
      options.FixBackslash = true;
      options.FixComments = true;

      TidyMessageCollection messages = new TidyMessageCollection();
      using (MemoryStream source = new MemoryStream())
      using (MemoryStream sink = new MemoryStream())
      {
        byte[] raw = Encoding.UTF8.GetBytes(markup);
        source.Write(raw, 0, raw.Length);
        source.Position = 0;
        cleaner.Parse(source, sink, messages);
        markup = Encoding.UTF8.GetString(sink.ToArray());
      }

      // Empty output is the only failure signal checked here.
      if (string.IsNullOrEmpty(markup))
      {
        string header = string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, messages.Errors);
        string[] errorLines = messages.Cast<TidyMessage>()
          .Where(m => m.Level == MessageLevel.Error)
          .Select(m => m.ToString())
          .ToArray();
        throw new FailedToCleanContentException(header + string.Join(Environment.NewLine, errorLines));
      }

      // Tidy emits a full document; keep only html/body/div.
      XElement body = XElement.Parse(markup).Element(Atom.XhtmlNs + "body");
      XElement div = body.Element(Atom.XhtmlNs + "div");

      // Drop the XHTML xmlns declarations repeated on descendants of the div.
      var repeatedDeclarations =
        from element in div.Descendants()
        let declaration = element.Attribute("xmlns")
        where declaration != null && declaration.Value == Atom.XhtmlNs.NamespaceName
        select declaration;
      repeatedDeclarations.Remove();

      content.Type = "xhtml";
      content.Text = div.ToString(SaveOptions.None);
    }
Example #2
0
    /// <summary>
    /// Page load handler. Redirects to reterror.aspx when the caller is a membership
    /// user or has user id 0; otherwise runs the posted "mycontent" form value through
    /// HTML Tidy and puts the result (or the Tidy error messages) into <c>content.Value</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Page_Load(System.Object sender, System.EventArgs e)
    {
        _messageHelper = contentAPI.EkMsgRef;
        if (contentAPI.RequestInformationRef.IsMembershipUser == 1 || contentAPI.RequestInformationRef.UserId == 0)
        {
            Response.Redirect(contentAPI.ApplicationPath + "reterror.aspx?info=" + Server.UrlEncode(_messageHelper.GetMessage("msg login cms user")), false);
            return;
        }
        //Put user code to initialize the page here
        this.pageTitle.Text = (new ApplicationAPI()).EkMsgRef.GetMessage("ektron translation");
        htmleditor = this.Request.Form["htmleditor"];
        htmcontent = this.Request.Form["mycontent"];

        // also run Tidy on the text
        TidyNet.Tidy objTidy = new TidyNet.Tidy();
        objTidy.Options.BreakBeforeBR = true;
        objTidy.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
        objTidy.Options.DocType = TidyNet.DocType.Omit;
        objTidy.Options.DropEmptyParas = false;
        objTidy.Options.MakeClean = true;
        objTidy.Options.NumEntities = true;
        objTidy.Options.QuoteAmpersand = true;
        objTidy.Options.QuoteMarks = false;
        objTidy.Options.QuoteNbsp = true;
        objTidy.Options.RawOut = false;
        objTidy.Options.TidyMark = false;
        objTidy.Options.Word2000 = true;
        objTidy.Options.XmlOut = true;
        TidyNet.TidyMessageCollection messageCollection = new TidyNet.TidyMessageCollection();

        // using-blocks release both streams even when Parse throws; the explicit
        // Close() calls they replace were only reached on the success path.
        string strTidyResult;
        using (System.IO.MemoryStream streamIn = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(htmcontent)))
        using (System.IO.MemoryStream streamOut = new System.IO.MemoryStream())
        {
            objTidy.Parse(streamIn, streamOut, messageCollection);
            strTidyResult = Encoding.UTF8.GetString(streamOut.ToArray());
        }

        if ((strTidyResult == "") && (messageCollection.Errors > 0))
        {
            // No output and at least one error: show the error-level messages instead.
            StringBuilder errors = new StringBuilder();
            foreach (TidyNet.TidyMessage msg in messageCollection)
            {
                if (msg.Level == TidyNet.MessageLevel.Error)
                {
                    errors.Append(msg.ToString()).Append("<br />");
                }
            }

            htmcontent = errors.ToString();
            content.Value = htmcontent;
        }
        else
        {
            // Strip everything up to and including <body>, plus the closing body/html tags.
            strTidyResult = System.Text.RegularExpressions.Regex.Replace(strTidyResult, "[\\w\\W]*?<body>", "").Replace("</body>" + "\r\n" + "</html>", "");
            content.Value = strTidyResult;
        }
    }
 /// <summary>
 /// Applies this parser's Tidy settings to <paramref name="tidy"/>: doctype omitted,
 /// XHTML and XML output enabled, Latin-1 character encoding, QuoteNbsp disabled.
 /// </summary>
 /// <param name="tidy">The Tidy instance whose options are modified.</param>
 private static void SetParameters(Tidy tidy)
 {
     var options = tidy.Options;

     options.DocType = DocType.Omit;
     options.Xhtml = true;
     options.XmlOut = true;
     options.CharEncoding = CharEncoding.Latin1;
     options.QuoteNbsp = false;
 }
Example #4
0
        /// <summary>
        /// Indents the given html source by running it through Tidy with
        /// <c>IndentContent</c> enabled.
        /// </summary>
        /// <param name="htmlSource">The html source.</param>
        /// <returns>A string with the new source.</returns>
        public String IndentContent(String htmlSource)
        {
            Tidy tidy = new Tidy();
            tidy.Options.IndentContent = true;
            // The bytes handed to Tidy and the bytes read back must use the encoding
            // Tidy is configured for. The previous code passed UTF-16 bytes
            // (Encoding.Unicode) without telling Tidy anything about the encoding.
            tidy.Options.CharEncoding = CharEncoding.UTF8;
            TidyMessageCollection tmc = new TidyMessageCollection();

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, tmc);
                return Encoding.UTF8.GetString(output.ToArray());
            }
        }
 /// <summary>
 /// Creates the resolver and stores a Tidy instance configured for strict-doctype
 /// XHTML/XML output with font tags dropped, logical emphasis, clean markup and
 /// no Tidy meta mark.
 /// </summary>
 public UptimeResolver()
 {
     // Configure a local first so the field only ever holds a fully set-up instance.
     Tidy configured = new Tidy();
     configured.Options.DocType = DocType.Strict;
     configured.Options.DropFontTags = true;
     configured.Options.LogicalEmphasis = true;
     configured.Options.Xhtml = true;
     configured.Options.XmlOut = true;
     configured.Options.MakeClean = true;
     configured.Options.TidyMark = false;

     tidy = configured;
 }
 /// <summary>
 /// Type initializer: builds the Tidy instance stored in the static <c>tidy</c>
 /// field (UTF-8, strict doctype, XHTML output, 4-space indent unit, 100-column wrap).
 /// </summary>
 static HtmlCleaner()
 {
     Tidy configured = new Tidy();
     var options = configured.Options;

     options.CharEncoding = CharEncoding.UTF8;
     options.DocType = DocType.Strict;
     options.DropFontTags = true;
     options.DropEmptyParas = true;
     options.IndentContent = false;
     options.LogicalEmphasis = true;
     options.MakeClean = false;
     options.SmartIndent = true;
     options.Spaces = 4;
     options.TidyMark = false;
     options.QuoteAmpersand = true;
     options.WrapLen = 100;
     options.Xhtml = true;

     tidy = configured;
 }
        /// <summary>
        /// Re-formats <paramref name="dirtyHtml"/> with Tidy (smart indent, 4 spaces,
        /// no line wrapping, attributes not indented) and returns Tidy's output.
        /// </summary>
        /// <param name="dirtyHtml">Markup to clean; encoded with <c>Encoding.Default</c>.</param>
        /// <returns>Tidy's output decoded with <c>Encoding.Default</c>.</returns>
        private static string CleanHtml(string dirtyHtml)
        {
            TidyNet.Tidy formatter = new TidyNet.Tidy();
            var options = formatter.Options;
            options.SmartIndent = true;
            options.IndentAttributes = false;
            options.WrapLen = 0;
            options.Spaces = 4;

            TidyMessageCollection log = new TidyMessageCollection();

            // NOTE(review): Encoding.Default is used in both directions and no
            // CharEncoding is set on Tidy — confirm this is right for non-ASCII input.
            byte[] rawInput = Encoding.Default.GetBytes(dirtyHtml);

            using (MemoryStream source = new MemoryStream(rawInput))
            using (MemoryStream sink = new MemoryStream())
            {
                formatter.Parse(source, sink, log);
                byte[] rawOutput = sink.ToArray();
                return Encoding.Default.GetString(rawOutput);
            }
        }
        /// <summary>
        /// Uses Tidy.Net to clean a html source.
        /// </summary>
        /// <param name="htmlSource">The original html source.</param>
        /// <returns>
        /// The cleaned Html, or <paramref name="htmlSource"/> unchanged when Tidy throws
        /// a <see cref="FormatException"/> (which is logged).
        /// </returns>
        /// <remarks>
        /// <c>isWordHtml</c> (whether the source is Microsoft Word output) is not a
        /// parameter of this method; it is resolved from the enclosing scope —
        /// presumably a member of the class, TODO confirm.
        /// </remarks>
        public string Clean(string htmlSource)
        {
            Tidy tidy = new Tidy();
            //Options required for xhtml conversion.
            tidy.Options.DocType = DocType.Strict;
            tidy.Options.DropFontTags = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.Xhtml = true;
            tidy.Options.XmlOut = true;
            tidy.Options.MakeClean = true;
            tidy.Options.TidyMark = false;
            tidy.Options.DropEmptyParas = true;
            tidy.Options.IndentContent = true;
            tidy.Options.SmartIndent = true;
            tidy.Options.Word2000 = isWordHtml;
            tidy.Options.EncloseBlockText = true;

            tidy.Options.XmlTags = true;
            tidy.Options.FixComments = true;
            TidyMessageCollection tmc = new TidyMessageCollection();

            // Dispose both streams on every exit path, including the early return below.
            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
            using (MemoryStream output = new MemoryStream())
            {
                try
                {
                    tidy.Parse(input, output, tmc);
                }
                catch (FormatException ex)
                {
                    // Best effort: log and hand back the untouched source.
                    Log.Exception(ex);
                    return htmlSource;
                }

                return Encoding.UTF8.GetString(output.ToArray());
            }
        }
Example #9
0
        /// <summary>
        /// Converts an HTML fragment to XHTML with Tidy and returns the children of
        /// the resulting body, each element child tagged with an <c>xmlns</c>
        /// attribute set to <c>XhtmlNamespace</c>.
        /// </summary>
        /// <param name="source">HTML to convert.</param>
        /// <returns>The inner XML of a fresh <c>body</c> element holding the converted nodes.</returns>
        public static String ConvertHtmlToXhtml(String source)
        {
            TidyMessageCollection tmc = new TidyMessageCollection();
            Tidy tidy = new Tidy();

            tidy.Options.DocType = DocType.Omit;
            tidy.Options.DropFontTags = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.Xhtml = true;
            tidy.Options.XmlOut = true;
            tidy.Options.MakeClean = true;
            tidy.Options.TidyMark = false;
            tidy.Options.NumEntities = true;

            // Streams are scoped to the Tidy call so they are disposed even if it throws.
            string tidied;
            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(source)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, tmc);
                tidied = Encoding.UTF8.GetString(output.ToArray());
            }

            XmlDocument x = new XmlDocument();
            XmlDocument xhtml = new XmlDocument();
            xhtml.LoadXml("<body />");
            XmlNode xhtmlBody = xhtml.SelectSingleNode("/body");

            x.LoadXml(tidied);
            XmlAttribute ns = x.CreateAttribute("xmlns");
            ns.Value = XhtmlNamespace;
            XmlNode body = x.SelectSingleNode("/html/body");
            foreach (XmlNode node in body.ChildNodes)
            {
                // The single attribute instance is re-appended to each element in turn;
                // the node is copied by ImportNode right afterwards, before the next
                // Append is made.
                if (node.NodeType == XmlNodeType.Element)
                    node.Attributes.Append(ns);

                xhtmlBody.AppendChild(xhtml.ImportNode(node, true));
            }
            return xhtmlBody.InnerXml;
        }
 /// <summary>
 /// Cleans bad html using TIDY (http://sourceforge.net/projects/tidynet/) with
 /// its default options.
 /// </summary>
 /// <param name="badHtmlString">Markup to clean; passed to Tidy as UTF-8 bytes.</param>
 /// <returns>Tidy's output decoded as UTF-8.</returns>
 static string CleanHtml(string badHtmlString)
 {
     Tidy tidy = new Tidy ();
     TidyMessageCollection tidyMsg = new TidyMessageCollection ();

     // NOTE(review): no CharEncoding is set on Tidy although the bytes are UTF-8 —
     // confirm the default handles non-ASCII input as intended.
     using (MemoryStream input = new MemoryStream (Encoding.UTF8.GetBytes (badHtmlString)))
     using (MemoryStream output = new MemoryStream ())
     {
         tidy.Parse (input, output, tidyMsg);
         return Encoding.UTF8.GetString (output.ToArray ());
     }
 }
Example #11
0
        /// <summary>
        /// Click handler: loads the field ids from CIA_Fields, tidies the matching
        /// "&lt;id&gt;.html" file from the folder in <c>textFolder</c>, and feeds every
        /// <c>td.category_data</c> cell to the parser for that field, appending
        /// progress to <c>textBoxOutput</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void buttonGo_Click(object sender, EventArgs e)
        {
            // I had to throw in the len as Budget Revenue and Expenditures have the same ID except the Expenditures has a 1 at the end.
            Exception ex = _db.ExecuteSqlReader("SELECT * FROM CIA_Fields WHERE fieldid > 2000 AND LEN(fieldid) = 4");

            if (ex != null)
            {
                // Same type and message as before, but keep the original as InnerException
                // so its stack trace is not lost.
                throw new Exception(ex.Message, ex);
            }

            var fieldIDs = new List<int>();
            try
            {
                while (_db.Reader.Read())
                {
                    fieldIDs.Add((int)_db.Reader["FieldID"]);
                }
            }
            finally
            {
                // Close the reader even when a row fails to read or convert.
                _db.Reader.Close();
            }

            foreach (var f in fieldIDs)
            {
                textBoxOutput.Text += f + Environment.NewLine;

                string result;
                // The file handle used to stay open after every iteration; scope it
                // (and the output buffer) to the Tidy call.
                using (var input = File.OpenRead(textFolder.Text + "\\" + f + ".html"))
                using (var output = new MemoryStream())
                {
                    var tmc = new TidyMessageCollection();

                    var tidy = new Tidy();
                    tidy.Options.DocType = DocType.Strict;
                    tidy.Options.DropFontTags = true;
                    tidy.Options.LogicalEmphasis = true;
                    tidy.Options.Xhtml = true;
                    tidy.Options.XmlOut = true;
                    tidy.Options.MakeClean = true;
                    tidy.Options.TidyMark = false;
                    tidy.Options.WrapLen = 0;
                    tidy.Parse(input, output, tmc);

                    result = Encoding.UTF8.GetString(output.ToArray());
                }

                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(result);

                var categoryData = doc.DocumentNode.SelectNodes("//td[@class='category_data']");
                if (categoryData == null)
                {
                    textBoxOutput.Text += f + ": NO DATA" + Environment.NewLine;
                    continue;
                }

                foreach (var i in categoryData)
                {
                    if (i == null)
                    {
                        continue;
                    }

                    // The id of the cell's grandparent node is looked up in the country tag list.
                    var tagID = _countryTagList.SingleOrDefault(a => a.Key == i.ParentNode.ParentNode.Id);
                    if (tagID.Key == null)
                    {
                        continue;
                    }

                    switch (f)
                    {
                        case 2085:
                            Parse.Parse2085(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2091:
                            Parse.Parse2091(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2121:
                            Parse.Parse2121(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2056:
                            Parse.Parse2056(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        default:
                            textBoxOutput.Text += Parse.ParseTableData(i.InnerText, f, tagID.Value);
                            break;
                    }
                }
            }
        }
 /// <summary>
 /// Creates the parser with a Tidy instance configured by <c>SetParameters</c>.
 /// </summary>
 public SmileHtmlStatementParser()
 {
     Tidy configured = new Tidy();
     SetParameters(configured);

     _tidy = configured;
 }
Example #13
0
        /// <summary>
        /// Runs <paramref name="text"/> through Tidy (UTF-8, strict doctype, indented,
        /// font tags kept) and passes the output to <c>RemoveTidyAdditions</c>.
        /// </summary>
        /// <param name="text">Markup to tidy.</param>
        /// <returns>Whatever <c>RemoveTidyAdditions</c> returns for Tidy's output.</returns>
        private static string TidyHtml(string text)
        {
            var doc = new Tidy();
            var messages = new TidyMessageCollection();

            // Disabled as it causes problems handling "font" tags
            // There are occurences when it will muck up a font tag to "fontface=...etc...
            //doc.Options.Xhtml = true;
            doc.Options.MakeClean = false;
            doc.Options.DocType = DocType.Strict;
            doc.Options.CharEncoding = CharEncoding.UTF8;
            doc.Options.LogicalEmphasis = true;

            doc.Options.SmartIndent = true;
            doc.Options.IndentContent = true;
            doc.Options.TidyMark = false;
            doc.Options.QuoteAmpersand = true;
            doc.Options.DropFontTags = false;
            doc.Options.DropEmptyParas = true;

            // Required to stop spaces being removed, and tabs added etc...
            doc.Options.Spaces = 0;
            doc.Options.WrapLen = 32000;

            // Scope the streams to the Tidy call so they are disposed even on failure.
            string tidied;
            using (var input = new MemoryStream(Encoding.UTF8.GetBytes(text)))
            using (var output = new MemoryStream())
            {
                doc.Parse(input, output, messages);
                tidied = Encoding.UTF8.GetString(output.ToArray());
            }

            return RemoveTidyAdditions(tidied);
        }
        /// <summary>
        /// Builds a Tidy instance for XML output: raw UTF-8 output, no doctype, no
        /// wrapping, XHTML mode off, numeric entities on, QuoteNbsp off.
        /// </summary>
        /// <returns>A new, configured Tidy instance.</returns>
        private static Tidy GetXmlConfiguredTidy()
        {
            Tidy tidy = new Tidy();
            var options = tidy.Options;

            options.RawOut = true;
            options.TidyMark = false;

            options.CharEncoding = CharEncoding.UTF8;
            options.DocType = DocType.Omit;
            options.WrapLen = 0;

            options.Xhtml = false;
            options.XmlOut = true;

            options.QuoteNbsp = false;
            options.NumEntities = true;

            return tidy;
        }
        /// <summary>
        /// Builds a Tidy instance for XHTML output: raw UTF-8 output, no doctype, no
        /// wrapping, Word 2000 cleanup, empty paragraphs dropped, element pruning off.
        /// </summary>
        /// <returns>A new, configured Tidy instance.</returns>
        private static Tidy GetXhtmlConfiguredTidy()
        {
            Tidy tidy = new Tidy();
            var options = tidy.Options;

            options.RawOut = true;
            options.TidyMark = false;

            options.CharEncoding = CharEncoding.UTF8;
            options.DocType = DocType.Omit;
            options.WrapLen = 0;

            options.BreakBeforeBR = true;
            options.DropEmptyParas = true;
            options.Word2000 = true;
            options.MakeClean = false;
            options.Xhtml = true;

            options.QuoteNbsp = false;
            options.NumEntities = true;
            options.AllowElementPruning = false;
            options.LogicalEmphasis = true;

            return tidy;
        }
        /// <summary>
        /// Posts the chapter title to the e-home.no metaservices search page, tidies
        /// the returned HTML into XHTML and maps every <c>tr</c> whose id is 17
        /// characters long to a <see cref="SearchResult"/>.
        /// </summary>
        /// <param name="chapterInfo">Supplies the title to search for.</param>
        /// <returns>One result per matching row; Name is the text of the row's first cell.</returns>
        /// <exception cref="Exception">Tidy reported at least one error for the response.</exception>
        public override List<SearchResult> Search(ChapterInfo chapterInfo)
        {
            string html = string.Empty;
            using (WebClient client = new WebClient())
            {
                client.Headers["Content-Type"] = "application/x-www-form-urlencoded";
                Uri endpoint = new Uri("http://www.e-home.no/metaservices/search.aspx");
                // Form body with the page's fixed view state / event validation tokens;
                // only the title varies.
                string form = string.Format("__VIEWSTATE=%2FwEPDwUKLTM3MTkwMDA5NQ9kFgICAQ9kFgICDQ88KwALAGRkg%2BhH%2F3tiaQDjnQncD1sYDdeni%2BA%3D&txtTitle={0}&btnSearch=Search&__EVENTVALIDATION=%2FwEWAwLXiqPdDAL55JyzBAKln%2FPuCgMJnDvHIVAx2tPEYdjNUbwqrR67", HttpUtility.UrlEncode(chapterInfo.Title));
                html = client.UploadString(endpoint, "POST", form);
            }

            Tidy cleaner = new Tidy();
            var options = cleaner.Options;
            options.DocType = DocType.Strict;
            options.LogicalEmphasis = true;
            options.Xhtml = true;
            options.XmlOut = true;
            options.MakeClean = true;
            options.TidyMark = false;
            options.QuoteNbsp = false;
            options.NumEntities = true;
            options.CharEncoding = CharEncoding.UTF8;
            options.FixBackslash = true;
            options.FixComments = true;

            TidyMessageCollection messages = new TidyMessageCollection();
            using (MemoryStream source = new MemoryStream())
            using (MemoryStream sink = new MemoryStream())
            {
                byte[] raw = Encoding.UTF8.GetBytes(html);
                source.Write(raw, 0, raw.Length);
                source.Position = 0;
                cleaner.Parse(source, sink, messages);
                html = Encoding.UTF8.GetString(sink.ToArray());
            }

            if (messages.Errors > 0)
            {
                string header = string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, messages.Errors);
                string[] errorLines = messages.Cast<TidyMessage>()
                    .Where(m => m.Level == MessageLevel.Error)
                    .Select(m => m.ToString())
                    .ToArray();
                throw new Exception(header + string.Join(Environment.NewLine, errorLines));
            }

            XNamespace ns = "http://www.w3.org/1999/xhtml";
            XDocument searchXhtml = XDocument.Parse(html);

            // Result rows are recognised purely by having an id of exactly 17 characters.
            Func<XElement, bool> isResultRow = tr =>
            {
                XAttribute id = tr.Attribute("id");
                return id != null && id.Value.Length == 17;
            };

            Debug.Write(searchXhtml.Descendants(ns + "tr").Where(isResultRow).Count());

            // Deferred query: it is only enumerated by ToList() after OnSearchComplete().
            var titles =
                from tr in searchXhtml.Descendants(ns + "tr").Where(isResultRow)
                select new SearchResult()
                {
                    Id = (string)tr.Attribute("id"),
                    Name = (string)tr.Elements(ns + "td").First()
                };

            OnSearchComplete();
            return titles.ToList();
        }
Example #17
0
        /// <summary>
        /// Shortens a HTML formatted string, while keeping HTML formatting and complete words (also removes line-breaks at the end of the shortened string)
        /// </summary>
        /// <param name="input">The HTML formatted string</param>
        /// <param name="inputIsShortened">Output boolean telling if the input string has been shortened</param>
        /// <param name="length">The approximate length of the output string (default: 300)</param>
        /// <param name="elipsis">Ellipsis text to append to the output string (use string.Empty when no ellipsis should be added, default: ...)</param>
        /// <returns>
        /// The input itself when it is not longer than <paramref name="length"/>; otherwise the
        /// shortened text after it has been repaired by Tidy, with the ellipsis appended.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public static string ShortenHtml(this string input, out bool inputIsShortened, int length = 300, string elipsis = "...")
        {
            inputIsShortened = false;

            if (input == null)
                throw new ArgumentNullException("input");

            if (input.Length <= length)
                return input;

            input = input.Replace("<br />", "<br/>");

            // Normalizing "<br />" can make the text shorter than 'length', so clamp the
            // cut position; the unclamped Substring(0, length) used to throw in that case.
            int cut = Math.Min(length, input.Length);

            // Extend the cut to the next space so a word is never split. One IndexOf
            // replaces the former character-by-character copy loop.
            int boundary = input.IndexOf(' ', cut);
            string substring = (boundary < 0 ? input : input.Substring(0, boundary)).Trim();

            // Drop line breaks left dangling at the end of the shortened text.
            while (substring.EndsWith("<br/>", StringComparison.Ordinal))
            {
                substring = substring.Substring(0, substring.Length - 5).Trim();
            }

            if (input.Length > substring.Length)
                inputIsShortened = true;

            substring = substring.Replace("<br/>", "<br />");

            // Tidy repairs the markup after the cut.
            Tidy tidy = new Tidy();
            tidy.Options.DocType = DocType.Omit;
            tidy.Options.CharEncoding = CharEncoding.UTF8;
            tidy.Options.Xhtml = true;
            tidy.Options.NumEntities = true;

            TidyMessageCollection tmc = new TidyMessageCollection();

            string tidyResult;
            using (MemoryStream inputStream = new MemoryStream(Encoding.UTF8.GetBytes(substring)))
            using (MemoryStream outputStream = new MemoryStream())
            {
                tidy.Parse(inputStream, outputStream, tmc);
                tidyResult = Encoding.UTF8.GetString(outputStream.ToArray());
            }

            // Only the content of the body element is returned.
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.LoadXml(tidyResult);
            tidyResult = xmlDoc.SelectSingleNode("//body").InnerXml;

            if (!string.IsNullOrEmpty(elipsis))
            {
                // Keep the ellipsis inside a trailing paragraph instead of after it.
                if (tidyResult.EndsWith("</p>", StringComparison.Ordinal))
                    return string.Concat(tidyResult.Substring(0, tidyResult.Length - 4), elipsis, "</p>");
                return string.Concat(tidyResult, elipsis);
            }
            return tidyResult;
        }
        /// <summary>
        /// Runs <paramref name="initialContent"/> through Tidy (Word 2000 cleanup, XHTML)
        /// and stores what lies between the body tags in <paramref name="cleanContent"/>.
        /// </summary>
        /// <param name="initialContent">Markup to clean; only read here.</param>
        /// <param name="cleanContent">
        /// Receives the cleaned markup. When Tidy's output has no body tag it receives
        /// the output unchanged.
        /// </param>
        private void cleanContent(ref String initialContent,ref String cleanContent)
        {
            Tidy tidy = new Tidy();
            tidy.Options.Word2000 = true;
            tidy.Options.Xhtml = true;

            TidyMessageCollection tmc = new TidyMessageCollection();

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(initialContent)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, tmc);
                cleanContent = Encoding.UTF8.GetString(output.ToArray());
            }

            //Delete header & footer
            int bodyStart = cleanContent.IndexOf("<body", StringComparison.Ordinal);
            if (bodyStart >= 0)
            {
                // Without this guard a missing <body (e.g. empty Tidy output) made the
                // next IndexOf throw because of the -1 start index.
                int bodyTagEnd = cleanContent.IndexOf(">", bodyStart, StringComparison.Ordinal);
                cleanContent = cleanContent.Remove(0, bodyTagEnd + 1);
            }

            int bodyClose = cleanContent.IndexOf("</body", StringComparison.Ordinal);
            if (bodyClose >= 0)
                cleanContent = cleanContent.Remove(bodyClose);
        }
        /// <summary>
        /// Tidies <paramref name="htmlString"/> into XHTML and returns the part of the
        /// output that follows the first body tag (up to the next body tag).
        /// </summary>
        /// <param name="htmlString">Markup to validate.</param>
        /// <returns>The body content, or the literal "[tidy error]" when the output contains no body tag.</returns>
        public static string ValidateHtml(string htmlString)
        {
            var tidy = new Tidy();
            tidy.Options.DocType = DocType.Omit;
            tidy.Options.Xhtml = true;
            tidy.Options.XmlOut = true;
            tidy.Options.LogicalEmphasis = true;
            tidy.Options.MakeClean = true;
            tidy.Options.TidyMark = false;
            // NOTE(review): the encoding comes from configuration, but the bytes below are
            // always UTF-8 — confirm the setting is expected to be UTF8.
            tidy.Options.CharEncoding = (CharEncoding)Enum.Parse(typeof(TidyNet.CharEncoding), UmbracoSettings.TidyCharEncoding);

            var tmc = new TidyMessageCollection();

            // Scope the streams to the Tidy call so they are disposed even on failure.
            string result;
            using (var input = new MemoryStream(Encoding.UTF8.GetBytes(htmlString)))
            using (var output = new MemoryStream())
            {
                tidy.Parse(input, output, tmc);
                result = Encoding.UTF8.GetString(output.ToArray());
            }

            var regex = @"</{0,1}body[^>]*>";
            var options = ((System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace | System.Text.RegularExpressions.RegexOptions.Multiline)
                | System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            var reg = new System.Text.RegularExpressions.Regex(regex, options);

            // Splitting on opening/closing body tags puts the body content at index 1.
            string[] s = reg.Split(result);
            if (s.Length > 1)
            {
                return s[1];
            }
            return "[tidy error]";
        }
Example #20
0
    /// <summary>
    /// Removes the opening and closing tags listed in TAGLIST (em, span, u, a) from
    /// <paramref name="strText"/>, keeps every other tag, replaces "&amp;#160;" with a
    /// space and finally runs the text through Tidy.
    /// </summary>
    /// <param name="strText">Markup to strip.</param>
    /// <returns>
    /// Tidy's output, or the stripped but un-tidied text when Tidy returns nothing and
    /// reports errors.
    /// </returns>
    private string RemoveHTML(string strText)
    {
        const string TAGLIST = ";em;span;u;a;";
        const string BLOCKTAGLIST = ";APPLET;";

        var stripped = new StringBuilder();

        // nPos* are 1-based positions where 0 means "not found".
        var nPos1 = strText.IndexOf('<') + 1;
        while (nPos1 > 0)
        {
            // Find the '>' that closes the tag opened at nPos1. The previous expression
            // searched for strText inside the text of a number, so it never matched and
            // no tag was ever removed.
            var nPos2 = strText.IndexOf('>', nPos1) + 1;
            if (nPos2 > 0)
            {
                // Tag name = text between '<' and '>' up to the first whitespace.
                string tagName = strText.Substring(nPos1, nPos2 - nPos1 - 1)
                    .Replace("\r", " ")
                    .Replace("\n", " ");

                int firstSpace = tagName.IndexOf(' ');
                if (firstSpace >= 0)
                {
                    tagName = tagName.Substring(0, firstSpace);
                }

                // A closing tag is matched by name only and never starts a block search.
                bool searchForBlock = true;
                if (tagName.StartsWith("/", StringComparison.Ordinal))
                {
                    tagName = tagName.Substring(1);
                    searchForBlock = false;
                }

                string key = ";" + tagName + ";";
                bool remove = TAGLIST.IndexOf(key, StringComparison.Ordinal) >= 0;

                if (remove && searchForBlock && BLOCKTAGLIST.IndexOf(key, StringComparison.Ordinal) >= 0)
                {
                    // Block tags are removed together with their content: skip to the end
                    // of the matching closing tag, or to the end of the text if there is none.
                    nPos2 = strText.Length;
                    var nPos3 = strText.IndexOf("</" + tagName, nPos1, StringComparison.Ordinal) + 1;
                    if (nPos3 > 0)
                    {
                        nPos3 = strText.IndexOf('>', nPos3) + 1;
                        if (nPos3 > 0)
                        {
                            nPos2 = nPos3;
                        }
                    }
                }

                if (remove)
                {
                    // Keep the text before the tag, continue after its '>'.
                    stripped.Append(strText, 0, nPos1 - 1);
                    strText = strText.Substring(nPos2);
                }
                else
                {
                    // Keep the '<' as well and continue right after it.
                    stripped.Append(strText, 0, nPos1);
                    strText = strText.Substring(nPos1);
                }
            }
            else
            {
                // Unterminated tag: keep the remainder untouched.
                stripped.Append(strText);
                strText = "";
            }

            nPos1 = strText.IndexOf('<') + 1;
        }
        stripped.Append(strText);

        string strResult = stripped.ToString().Replace("&#160;", " ");

        // also run Tidy on the text
        TidyNet.Tidy tidydoc = new TidyNet.Tidy();
        tidydoc.Options.RawOut = false;
        tidydoc.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
        tidydoc.Options.DocType = TidyNet.DocType.Omit;
        tidydoc.Options.TidyMark = false;
        tidydoc.Options.Word2000 = true;
        tidydoc.Options.QuoteNbsp = true;
        tidydoc.Options.QuoteAmpersand = true;
        tidydoc.Options.NumEntities = false;
        tidydoc.Options.QuoteMarks = true;
        tidydoc.Options.Xhtml = false;
        tidydoc.Options.MakeClean = true;
        TidyNet.TidyMessageCollection messageCollection = new TidyNet.TidyMessageCollection();

        string strTidyResult;
        using (System.IO.MemoryStream tidyin = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(strResult)))
        using (System.IO.MemoryStream tidyout = new System.IO.MemoryStream())
        {
            tidydoc.Parse(tidyin, tidyout, messageCollection);
            strTidyResult = Encoding.UTF8.GetString(tidyout.ToArray());
        }

        if ((strTidyResult == "") && (messageCollection.Errors > 0))
        {
            // Tidy failed: fall back to the un-tidied text. (The error messages used to
            // be collected into a local here but were never returned.)
            return strResult;
        }

        return strTidyResult;
    }
 /// <summary>
 /// Registers every name in <c>Html5specificElementNames</c>, lower-cased, as a tag
 /// on <paramref name="tidy"/>.
 /// </summary>
 /// <param name="tidy">The Tidy instance to extend.</param>
 private static void AllowHtml5ElementNames(Tidy tidy)
 {
     foreach (string elementName in Html5specificElementNames)
     {
         // Element names are identifiers, not user text: lower-case them
         // culture-invariantly so e.g. a Turkish locale cannot change the result.
         tidy.Options.AddTag(elementName.ToLowerInvariant());
     }
 }
 /// <summary>
 /// Registers the given element names, lower-cased, as tags on <paramref name="tidy"/>,
 /// skipping "f:function" and "f:param".
 /// </summary>
 /// <param name="tidy">The Tidy instance to extend.</param>
 /// <param name="elementNames">Namespace-prefixed element names to allow.</param>
 private static void AllowNamespacePrefixedElementNames(Tidy tidy, List<string> elementNames)
 {
     // f:* written into TidyNet.dll to fix http://compositec1.codeplex.com/workitem/1144
     foreach (string elementName in elementNames.Where(en => en != "f:function" && en != "f:param"))
     {
         // Culture-invariant lower-casing: these are identifiers, not user text.
         tidy.Options.AddTag(elementName.ToLowerInvariant());
     }
 }