/// <summary>
/// Runs the content's markup through HTML Tidy and stores the resulting XHTML
/// <c>div</c> element back on <paramref name="content"/>, switching its type to "xhtml".
/// </summary>
/// <param name="content">
/// Content whose <c>Text</c> and <c>Type</c> are read and then overwritten.
/// </param>
/// <exception cref="FailedToCleanContentException">
/// Thrown when Tidy produces no output, or when the output does not contain the
/// <c>body/div</c> element this method extracts.
/// </exception>
public void CleanContentTrusted(AtomContent content)
    {
      Logger.Info("Cleaning content to be valid xhtml.");

      string text = content.Text;
      if (content.Type == "html")
      {
        // Wrap html in a namespaced div so a single element can be pulled back out below.
        text = "<div xmlns=\"" + Atom.XhtmlNs.NamespaceName + "\">" + text + "</div>";
      }

      Tidy tidy = new Tidy();
      tidy.Options.DocType = DocType.Strict;
      tidy.Options.LogicalEmphasis = true;
      tidy.Options.Xhtml = true;
      tidy.Options.XmlOut = true;
      tidy.Options.MakeClean = true;
      tidy.Options.TidyMark = false;
      tidy.Options.QuoteNbsp = false;
      tidy.Options.NumEntities = true;
      tidy.Options.CharEncoding = CharEncoding.UTF8;
      tidy.Options.FixBackslash = true;
      tidy.Options.FixComments = true;

      TidyMessageCollection tmc = new TidyMessageCollection();
      using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(text)))
      using (MemoryStream output = new MemoryStream())
      {
        tidy.Parse(input, output, tmc);
        text = Encoding.UTF8.GetString(output.ToArray());
      }

      // Empty output is treated as failure; report the error-level messages.
      if (string.IsNullOrEmpty(text))
      {
        throw new FailedToCleanContentException(
          string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, tmc.Errors)
          + string.Join(Environment.NewLine,
          tmc.Cast<TidyMessage>()
          .Where(m => m.Level == MessageLevel.Error)
          .Select(m => m.ToString()).ToArray()));
      }

      // Tidy wraps the fragment in html/body; extract the div again. Either lookup
      // can come back null (for example when the input was never wrapped in a div),
      // so fail with the domain exception instead of a NullReferenceException.
      XElement body = XElement.Parse(text).Element(Atom.XhtmlNs + "body");
      XElement div = body == null ? null : body.Element(Atom.XhtmlNs + "div");
      if (div == null)
      {
        throw new FailedToCleanContentException(
          "HTML Tidy output did not contain the expected body/div element.");
      }

      // Remove the redundant xhtml namespace declarations found on descendants.
      div.Descendants()
        .Attributes("xmlns")
        .Where(a => a.Value == Atom.XhtmlNs.NamespaceName)
        .Remove();

      content.Type = "xhtml";
      content.Text = div.ToString(SaveOptions.None);
    }
コード例 #2
0
ファイル: translate.aspx.cs プロジェクト: jaytem/minGit
    /// <summary>
    /// Page load handler: redirects membership or anonymous users to the error page,
    /// otherwise runs the posted "mycontent" form value through Tidy and stores the
    /// outcome in <c>content.Value</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Page_Load(System.Object sender, System.EventArgs e)
    {
        _messageHelper = contentAPI.EkMsgRef;
        if (contentAPI.RequestInformationRef.IsMembershipUser == 1 || contentAPI.RequestInformationRef.UserId == 0)
        {
            Response.Redirect(contentAPI.ApplicationPath + "reterror.aspx?info=" + Server.UrlEncode(_messageHelper.GetMessage("msg login cms user")), false);
            return;
        }

        this.pageTitle.Text = (new ApplicationAPI()).EkMsgRef.GetMessage("ektron translation");
        htmleditor = this.Request.Form["htmleditor"];
        htmcontent = this.Request.Form["mycontent"];

        // also run Tidy on the text
        TidyNet.Tidy objTidy = new TidyNet.Tidy();
        objTidy.Options.BreakBeforeBR = true;
        objTidy.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
        objTidy.Options.DocType = TidyNet.DocType.Omit;
        objTidy.Options.DropEmptyParas = false;
        objTidy.Options.MakeClean = true;
        objTidy.Options.NumEntities = true;
        objTidy.Options.QuoteAmpersand = true;
        objTidy.Options.QuoteMarks = false;
        objTidy.Options.QuoteNbsp = true;
        objTidy.Options.RawOut = false;
        objTidy.Options.TidyMark = false;
        objTidy.Options.Word2000 = true;
        objTidy.Options.XmlOut = true;
        TidyNet.TidyMessageCollection messageCollection = new TidyNet.TidyMessageCollection();

        // Request.Form returns null when the field was not posted; encode an empty
        // document in that case instead of letting GetBytes throw.
        byte[] byteArray = Encoding.UTF8.GetBytes(htmcontent ?? string.Empty);
        string strTidyResult;
        using (System.IO.MemoryStream streamIn = new System.IO.MemoryStream(byteArray))
        using (System.IO.MemoryStream streamOut = new System.IO.MemoryStream())
        {
            objTidy.Parse(streamIn, streamOut, messageCollection);
            strTidyResult = Encoding.UTF8.GetString(streamOut.ToArray());
        }

        if ((strTidyResult == "") && (messageCollection.Errors > 0))
        {
            // No output and errors reported: show the error messages instead.
            foreach (TidyNet.TidyMessage msg in messageCollection)
            {
                if (msg.Level == TidyNet.MessageLevel.Error)
                {
                    strTidyResult = strTidyResult + msg.ToString() + "<br />";
                }
            }

            htmcontent = strTidyResult;
            content.Value = htmcontent;
        }
        else
        {
            // Strip everything up to and including <body>, and the closing body/html pair.
            strTidyResult = System.Text.RegularExpressions.Regex.Replace(strTidyResult, "[\\w\\W]*?<body>", "").Replace("</body>" + "\r\n" + "</html>", "");
            content.Value = strTidyResult;
        }
    }
コード例 #3
0
ファイル: Tidy.cs プロジェクト: succ1984/demosolution
 /// <summary>
 /// Parses <paramref name="input"/> and writes the result to <paramref name="output"/>.
 /// </summary>
 /// <param name="input">The stream to read from.</param>
 /// <param name="output">The stream to write to.</param>
 /// <param name="messages">The message collection passed on to the parser.</param>
 /// <remarks>
 /// I/O exceptions raised by the underlying overload are swallowed; the caller
 /// receives no indication that they occurred.
 /// </remarks>
 public virtual void Parse(Stream input, Stream output, TidyMessageCollection messages)
 {
     try
     {
         // No file name is supplied, only the stream.
         Parse(input, null, output, messages);
     }
     catch (IOException)
     {
         // FileNotFoundException derives from IOException, so this single handler
         // covers both cases and ignores them.
     }
 }
コード例 #4
0
ファイル: Tidy.cs プロジェクト: succ1984/demosolution
        /// <summary>
        /// Parses <paramref name="input"/> via <c>ParseInternal</c> and exposes the
        /// resulting root node as an <c>IDocument</c>.
        /// </summary>
        /// <param name="input">Stream to parse.</param>
        /// <param name="Output">Stream forwarded to <c>ParseInternal</c> as the output target.</param>
        /// <param name="messages">Message collection forwarded to <c>ParseInternal</c>.</param>
        /// <returns>
        /// The root node's <c>Adapter</c> cast to <c>IDocument</c>, or <c>null</c> when
        /// parsing produced no node.
        /// </returns>
        internal virtual IDocument ParseDom(Stream input, Stream Output, TidyMessageCollection messages)
        {
            Node root = ParseInternal(input, Output, messages);
            if (root == null)
            {
                return null;
            }

            return (IDocument)root.Adapter;
        }
コード例 #5
0
 /// <summary>
 /// Reads the statement file, runs its html through the HtmlTidy library and
 /// comments out the script tags in the result. Before tidying, every £ sign is
 /// replaced with `, as the library does not seem to support iso-8859-1, the
 /// encoding of the original statement.
 /// </summary>
 /// <param name="statement">
 /// Path of the statement file to read (it is passed to the <c>StreamReader</c>
 /// constructor that takes a file path), decoded with <c>ReadEncoding</c>.
 /// </param>
 /// <returns>A wellformed xml (bank) statement</returns>
 public string TidyStatement(string statement)
 {
     var encoding = Encoding.GetEncoding(ReadEncoding);

     // Dispose the reader so the file handle is released as soon as the content is read.
     string sourceStatementFileContent;
     using (var reader = new StreamReader(statement, encoding))
     {
         sourceStatementFileContent = reader.ReadToEnd();
     }

     var tmc = new TidyMessageCollection();
     var bytes = encoding.GetBytes(sourceStatementFileContent.Replace('£', '`'));
     string outputResult;
     using (var input = new MemoryStream(bytes))
     using (var output = new MemoryStream())
     {
         _tidy.Parse(input, output, tmc);
         outputResult = encoding.GetString(output.ToArray());
     }

     // Wrap script elements in comment markers, as script tags fail parsing.
     outputResult = StringUtils.InsertStringInString(outputResult, @"<script", @"<!--", true);
     return StringUtils.InsertStringInString(outputResult, @"</script>", @"-->", false);
 }
コード例 #6
0
        /// <summary>
        /// Runs the given html source through Tidy with content indentation enabled.
        /// </summary>
        /// <param name="htmlSource">The html source; it is encoded as UTF-16 before parsing.</param>
        /// <returns>Tidy's output, decoded as UTF-16.</returns>
        public String IndentContent(String htmlSource)
        {
            Tidy formatter = new Tidy();
            formatter.Options.IndentContent = true;
            TidyMessageCollection messages = new TidyMessageCollection();

            // NOTE(review): the bytes are UTF-16 but Options.CharEncoding is left at its
            // default here - confirm the two agree.
            using (MemoryStream source = new MemoryStream(Encoding.Unicode.GetBytes(htmlSource)))
            using (MemoryStream indented = new MemoryStream())
            {
                formatter.Parse(source, indented, messages);
                return Encoding.Unicode.GetString(indented.ToArray());
            }
        }
コード例 #7
0
ファイル: Tidy.cs プロジェクト: succ1984/demosolution
        /// <summary>
        /// Parses <paramref name="input"/> by delegating to the overload that also
        /// accepts a file name (passing <c>null</c> for it) and returns the root node.
        /// </summary>
        /// <param name="input">Stream to parse.</param>
        /// <param name="output">Stream forwarded as the output target.</param>
        /// <param name="messages">Message collection forwarded to the parser.</param>
        /// <returns>The root node, or <c>null</c> when an I/O exception was raised.</returns>
        internal virtual Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
        {
            try
            {
                return ParseInternal(input, null, output, messages);
            }
            catch (IOException)
            {
                // Covers FileNotFoundException as well (it derives from IOException);
                // the failure is swallowed and reported only as a null result.
                return null;
            }
        }
コード例 #8
0
        /// <summary>
        /// Uses Tidy.Net to clean a html source into xhtml.
        /// </summary>
        /// <param name="htmlSource">The original html source.</param>
        /// <returns>
        /// The cleaned markup, or <paramref name="htmlSource"/> unchanged when Tidy
        /// raises a <see cref="FormatException"/> (which is logged).
        /// </returns>
        /// <remarks>
        /// NOTE(review): <c>isWordHtml</c> is not a parameter of this method; presumably
        /// it is a member declared elsewhere in the class - confirm.
        /// </remarks>
        public string Clean(string htmlSource)
        {
            Tidy cleaner = new Tidy();

            // Options required for xhtml conversion.
            cleaner.Options.DocType = DocType.Strict;
            cleaner.Options.DropFontTags = true;
            cleaner.Options.LogicalEmphasis = true;
            cleaner.Options.Xhtml = true;
            cleaner.Options.XmlOut = true;
            cleaner.Options.MakeClean = true;
            cleaner.Options.TidyMark = false;
            cleaner.Options.DropEmptyParas = true;
            cleaner.Options.IndentContent = true;
            cleaner.Options.SmartIndent = true;
            cleaner.Options.Word2000 = isWordHtml;
            cleaner.Options.EncloseBlockText = true;

            cleaner.Options.XmlTags = true;
            cleaner.Options.FixComments = true;

            TidyMessageCollection messages = new TidyMessageCollection();
            using (MemoryStream source = new MemoryStream(Encoding.UTF8.GetBytes(htmlSource)))
            using (MemoryStream cleaned = new MemoryStream())
            {
                try
                {
                    cleaner.Parse(source, cleaned, messages);
                }
                catch (FormatException ex)
                {
                    // Fall back to the untouched input when Tidy cannot handle it.
                    Log.Exception(ex);
                    return htmlSource;
                }

                return Encoding.UTF8.GetString(cleaned.ToArray());
            }
        }
コード例 #9
0
        /// <summary>
        /// Downloads the status page at http://192.168.100.1/indexData.htm, converts
        /// it to XML with Tidy and parses the uptime value from a fixed table cell.
        /// </summary>
        /// <returns>The uptime parsed from the cell text.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the expected table cell is not present in the tidied page.
        /// </exception>
        public TimeSpan GetUptime()
        {
            TidyMessageCollection tmc = new TidyMessageCollection();
            var d = new System.Xml.XmlDocument();

            using (MemoryStream xhtmlStream = new MemoryStream())
            {
                var r = System.Net.WebRequest.Create("http://192.168.100.1/indexData.htm");
                r.Timeout = 5000;
                using (var res = r.GetResponse())
                using (var htmlStream = res.GetResponseStream())
                {
                    tidy.Parse(htmlStream, xhtmlStream, tmc);
                }

                // Rewind so the document is loaded from the start of Tidy's output.
                xhtmlStream.Position = 0;
                d.Load(xhtmlStream);
            }

            var navigator = d.CreateNavigator();

            // Bind prefix "x" to the document's own namespace so the XPath below matches.
            XmlNamespaceManager manager = new XmlNamespaceManager(navigator.NameTable);
            manager.AddNamespace("x", d.DocumentElement.NamespaceURI); // http://www.w3.org/1999/xhtml

            var eUptime = d.SelectSingleNode("x:html/x:body/x:table[2]/x:tbody/x:tr[3]/x:td[2]", manager) as XmlElement;
            if (eUptime == null)
            {
                // The page layout differs from what the XPath expects; fail with a
                // descriptive error rather than a NullReferenceException.
                throw new InvalidOperationException("Uptime cell was not found in the status page.");
            }

            // Reduce the cell text to a colon-separated value: " days " becomes ":" and
            // the h/m/s unit letters are removed.
            var v = eUptime.InnerText;
            v = v.Replace(" days ", ":");
            v = v.Replace("h", "");
            v = v.Replace("m", "");
            v = v.Replace("s", "");

            return TimeSpan.Parse(v);
        }
コード例 #10
0
        /// <summary>
        /// Converts html to xhtml with Tidy and returns the markup of the body's
        /// child nodes, with an xmlns attribute appended to each child element.
        /// </summary>
        /// <param name="source">The html to convert; encoded as UTF-8 for Tidy.</param>
        /// <returns>The inner xml of a body element holding the imported child nodes.</returns>
        public static String ConvertHtmlToXhtml(String source)
        {
            byte[] sourceBytes = Encoding.UTF8.GetBytes(source);
            string tidied;
            using (MemoryStream htmlIn = new MemoryStream(sourceBytes))
            using (MemoryStream xhtmlOut = new MemoryStream())
            {
                TidyMessageCollection messages = new TidyMessageCollection();
                Tidy converter = new Tidy();

                converter.Options.DocType = DocType.Omit;
                converter.Options.DropFontTags = true;
                converter.Options.LogicalEmphasis = true;
                converter.Options.Xhtml = true;
                converter.Options.XmlOut = true;
                converter.Options.MakeClean = true;
                converter.Options.TidyMark = false;
                converter.Options.NumEntities = true;

                converter.Parse(htmlIn, xhtmlOut, messages);
                tidied = Encoding.UTF8.GetString(xhtmlOut.ToArray());
            }

            // Target document: an empty body that collects the imported nodes.
            XmlDocument fragmentDoc = new XmlDocument();
            fragmentDoc.LoadXml("<body />");
            XmlNode fragmentBody = fragmentDoc.SelectSingleNode("/body");

            XmlDocument tidiedDoc = new XmlDocument();
            tidiedDoc.LoadXml(tidied);

            // One attribute instance is re-appended to each element in turn; each node
            // is imported (deep copy) right after, while it still carries the attribute.
            XmlAttribute namespaceAttribute = tidiedDoc.CreateAttribute("xmlns");
            namespaceAttribute.Value = XhtmlNamespace;

            XmlNode tidiedBody = tidiedDoc.SelectSingleNode("/html/body");
            foreach (XmlNode child in tidiedBody.ChildNodes)
            {
                if (child.NodeType == XmlNodeType.Element)
                {
                    child.Attributes.Append(namespaceAttribute);
                }

                XmlNode imported = fragmentDoc.ImportNode(child, true);
                fragmentBody.AppendChild(imported);
            }

            return fragmentBody.InnerXml;
        }
コード例 #11
0
ファイル: Tidy.cs プロジェクト: bgarrels/betterpoeditor
        /// <summary>
        /// Parses the given stream and returns the root node, forwarding to the
        /// overload that additionally takes a file name (<c>null</c> is passed for it).
        /// </summary>
        /// <param name="input">The stream to parse.</param>
        /// <param name="output">The stream handed on as the output target.</param>
        /// <param name="messages">The message collection handed on to the parser.</param>
        /// <returns>The parsed root node; <c>null</c> if an I/O exception occurred.</returns>
        internal virtual Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
        {
            Node root;

            try
            {
                root = ParseInternal(input, null, output, messages);
            }
            catch (IOException)
            {
                // FileNotFoundException is an IOException too; both are ignored and
                // surface to the caller as a null node.
                root = null;
            }

            return root;
        }
コード例 #12
0
ファイル: Tidy.cs プロジェクト: bgarrels/betterpoeditor
 /// <summary>
 /// Parses the input stream with <c>ParseInternal</c> and returns the root
 /// node's adapter as an <c>IDocument</c>.
 /// </summary>
 /// <param name="input">The stream to parse.</param>
 /// <param name="Output">The stream passed to <c>ParseInternal</c> as the output target.</param>
 /// <param name="messages">The message collection passed to <c>ParseInternal</c>.</param>
 /// <returns>The document, or <c>null</c> if <c>ParseInternal</c> returned no node.</returns>
 internal virtual IDocument ParseDom(Stream input, Stream Output, TidyMessageCollection messages)
 {
     Node parsed = ParseInternal(input, Output, messages);
     return parsed != null ? (IDocument) parsed.Adapter : null;
 }
コード例 #13
0
ファイル: Tidy.cs プロジェクト: bgarrels/betterpoeditor
 /// <summary>
 /// Parses the input stream or file and writes to the output.
 /// Forwards all arguments to <c>ParseInternal</c> and discards the node it returns.
 /// </summary>
 /// <remarks>
 /// No exceptions are caught here, so anything <c>ParseInternal</c> throws
 /// propagates to the caller.
 /// </remarks>
 /// <param name="input">The input stream</param>
 /// <param name="file">The input file name; may be passed as <c>null</c>, as the stream-only overload does</param>
 /// <param name="Output">The output stream</param>
 /// <param name="messages">The collection passed on to the parser for its messages</param>
 public void Parse(Stream input, string file, Stream Output, TidyMessageCollection messages)
 {
     ParseInternal(input, file, Output, messages);
 }
コード例 #14
0
ファイル: Tidy.cs プロジェクト: bgarrels/betterpoeditor
 /// <summary>
 /// Parses the input stream and writes to the output, using the overload that
 /// also accepts a file name (with <c>null</c> for the file).
 /// </summary>
 /// <param name="input">The input stream</param>
 /// <param name="output">The output stream</param>
 /// <param name="messages">The collection passed on to the parser for its messages</param>
 /// <remarks>I/O exceptions are swallowed and not reported to the caller.</remarks>
 public virtual void Parse(Stream input, Stream output, TidyMessageCollection messages)
 {
     try
     {
         Parse(input, null, output, messages);
     }
     catch (IOException)
     {
         // Handles FileNotFoundException too, since it derives from IOException;
         // both are ignored.
     }
 }
コード例 #15
0
        /// <summary>
        /// Posts the chapter title to the e-home.no search page, tidies the returned
        /// html into xhtml and extracts one <c>SearchResult</c> per matching table row.
        /// </summary>
        /// <param name="chapterInfo">Supplies the <c>Title</c> that is searched for.</param>
        /// <returns>
        /// Results built from every <c>tr</c> whose <c>id</c> attribute is 17 characters
        /// long: the id, and the text of the row's first <c>td</c>.
        /// </returns>
        /// <exception cref="Exception">Thrown when Tidy reports one or more errors.</exception>
        public override List<SearchResult> Search(ChapterInfo chapterInfo)
        {
            string markup = string.Empty;
            using (WebClient client = new WebClient())
            {
                client.Headers["Content-Type"] = "application/x-www-form-urlencoded";
                Uri endpoint = new Uri("http://www.e-home.no/metaservices/search.aspx");

                // Form body with fixed view state / event validation values; only the title varies.
                string form = string.Format("__VIEWSTATE=%2FwEPDwUKLTM3MTkwMDA5NQ9kFgICAQ9kFgICDQ88KwALAGRkg%2BhH%2F3tiaQDjnQncD1sYDdeni%2BA%3D&txtTitle={0}&btnSearch=Search&__EVENTVALIDATION=%2FwEWAwLXiqPdDAL55JyzBAKln%2FPuCgMJnDvHIVAx2tPEYdjNUbwqrR67", HttpUtility.UrlEncode(chapterInfo.Title));
                markup = client.UploadString(endpoint, "POST", form);
            }

            Tidy converter = new Tidy();
            converter.Options.DocType = DocType.Strict;
            converter.Options.LogicalEmphasis = true;
            converter.Options.Xhtml = true;
            converter.Options.XmlOut = true;
            converter.Options.MakeClean = true;
            converter.Options.TidyMark = false;
            converter.Options.QuoteNbsp = false;
            converter.Options.NumEntities = true;
            converter.Options.CharEncoding = CharEncoding.UTF8;
            converter.Options.FixBackslash = true;
            converter.Options.FixComments = true;

            TidyMessageCollection messages = new TidyMessageCollection();
            using (MemoryStream htmlIn = new MemoryStream(Encoding.UTF8.GetBytes(markup)))
            using (MemoryStream xhtmlOut = new MemoryStream())
            {
                converter.Parse(htmlIn, xhtmlOut, messages);
                markup = Encoding.UTF8.GetString(xhtmlOut.ToArray());
            }

            if (messages.Errors > 0)
            {
                string header = string.Format("{0} HTML Tidy Error(s)" + Environment.NewLine, messages.Errors);
                string[] errorLines = messages.Cast<TidyMessage>()
                    .Where(m => m.Level == MessageLevel.Error)
                    .Select(m => m.ToString())
                    .ToArray();
                throw new Exception(header + string.Join(Environment.NewLine, errorLines));
            }

            XNamespace ns = "http://www.w3.org/1999/xhtml";
            XDocument searchXhtml = XDocument.Parse(markup);

            // Result rows are recognised by an id attribute of exactly 17 characters.
            Func<XElement, bool> isResultRow =
                tr => tr.Attribute("id") != null && tr.Attribute("id").Value.Length == 17;

            int rowCount = searchXhtml.Descendants(ns + "tr").Where(isResultRow).Count();
            Debug.Write(rowCount);

            // Deferred query: it is only evaluated by ToList() below, after the
            // completion callback has been raised.
            IEnumerable<SearchResult> titles = searchXhtml.Descendants(ns + "tr")
                .Where(isResultRow)
                .Select(tr => new SearchResult()
                {
                    Id = (string)tr.Attribute("id"),
                    Name = (string)tr.Elements(ns + "td").First()
                });

            OnSearchComplete();
            return titles.ToList();
        }
コード例 #16
0
        /// <summary>
        /// Cleans HTML documents or fragments into XHTML conformant markup
        /// </summary>
        /// <param name="htmlMarkup">The html to clean</param>
        /// <returns>
        /// A result whose <c>Output</c> is the parsed XHTML document (a fully structured
        /// XHTML document, incl. html, head and body elements) and whose
        /// <c>ErrorSummary</c> lists Tidy's warning-level messages, one per line.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when Tidy reports one or more errors; the message lists them.
        /// </exception>
        public static TidyHtmlResult TidyHtml(string htmlMarkup)
        {
            byte[] htmlByteArray = Encoding.UTF8.GetBytes(htmlMarkup);

            Tidy tidy = GetXhtmlConfiguredTidy();

            // Collect prefixed element names and prefix declarations from the raw markup.
            // A "bad" name is one that starts with none of the declared prefixes.
            List<string> namespacePrefixedElementNames = LocateNamespacePrefixedElementNames(htmlMarkup);
            Dictionary<string, string> namespacePrefixToUri = LocateNamespacePrefixToUriDeclarations(htmlMarkup);
            List<string> badNamespacePrefixedElementNames = namespacePrefixedElementNames.Where(s => namespacePrefixToUri.Where(d => s.StartsWith(d.Key)).Any() == false).ToList();
            AllowNamespacePrefixedElementNames(tidy, namespacePrefixedElementNames);
            AllowHtml5ElementNames(tidy);

            TidyMessageCollection tidyMessages = new TidyMessageCollection();
            string xhtml = "";

            using (MemoryStream inputStream = new MemoryStream(htmlByteArray))
            {
                using (MemoryStream outputStream = new MemoryStream())
                {
                    tidy.Parse(inputStream, outputStream, tidyMessages);
                    // Rewind so the reader starts at the beginning of Tidy's output.
                    outputStream.Position = 0;
                    C1StreamReader sr = new C1StreamReader(outputStream);
                    xhtml = sr.ReadToEnd();
                }
            }

            // Any error-level message aborts the conversion.
            if (tidyMessages.Errors > 0)
            {
                StringBuilder errorMessageBuilder = new StringBuilder();
                foreach (TidyMessage message in tidyMessages)
                {
                    if (message.Level == MessageLevel.Error)
                        errorMessageBuilder.AppendLine(message.ToString());
                }
                throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorMessageBuilder.ToString()));
            }

            // Make sure the html element declares the XHTML namespace: first handle a
            // bare <html> tag, then an html tag that has attributes but no declaration.
            if (xhtml.IndexOf("<html>")>-1)
            {
                xhtml = xhtml.Replace("<html>", "<html xmlns=\"http://www.w3.org/1999/xhtml\">");
            }

            if (xhtml.IndexOf("xmlns=\"http://www.w3.org/1999/xhtml\"") == -1)
            {
                xhtml = xhtml.Replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"");
            }

            // Text-level clean-up passes, applied in this order.
            xhtml = RemoveDuplicateAttributes(xhtml);
            xhtml = RemoveXmlDeclarations(xhtml);
            xhtml = UndoLowerCasingOfElementNames(xhtml, namespacePrefixedElementNames);
            xhtml = UndoLowerCasingOfNamespacePrefixes(xhtml, namespacePrefixToUri);
            // Warnings do not fail the call; they are returned as the error summary.
            StringBuilder messageBuilder = new StringBuilder();
            foreach (TidyMessage message in tidyMessages)
            {
                if (message.Level == MessageLevel.Warning)
                    messageBuilder.AppendLine(message.ToString());
            }

            // Prefixes of the bad element names plus the attribute prefixes found in the
            // output, de-duplicated and limited to valid XML names.
            List<string> badNamespacePrefixes = badNamespacePrefixedElementNames.Select(n => n.Substring(0, n.IndexOf(':'))).Union(LocateAttributeNamespacePrefixes(xhtml)).Distinct().Where(f => IsValidXmlName(f)).ToList();

            XDocument outputResult;
            if (badNamespacePrefixedElementNames.Any())
            {
                // Wrap the markup in a root that binds every bad prefix to the placeholder
                // namespace '#bad' so it can be parsed, then remove the attributes and
                // elements that live in that namespace.
                string badDeclared = string.Join(" ", badNamespacePrefixes.Select(p => string.Format("xmlns:{0}='#bad'", p)).ToArray());
                XDocument badDoc = XDocument.Parse(string.Format("<root {0}>{1}</root>", badDeclared, xhtml));
                badDoc.Descendants().Attributes().Where(e => e.Name.Namespace == "#bad").Remove();
                badDoc.Descendants().Where(e => e.Name.Namespace == "#bad").Remove();
                // The first descendant of the temporary root becomes the new document root.
                outputResult = new XDocument(badDoc.Root.Descendants().First());
            }
            else
            {
                outputResult = XDocument.Parse(xhtml, LoadOptions.PreserveWhitespace);
            }

            return new TidyHtmlResult { Output = outputResult, ErrorSummary = messageBuilder.ToString() };
        }
        /// <summary>
        /// Runs <paramref name="htmlString"/> through Tidy (xhtml/xml output) and
        /// returns the part of the output that follows the first body tag match.
        /// </summary>
        /// <param name="htmlString">The html to tidy; encoded as UTF-8 for parsing.</param>
        /// <returns>
        /// The second segment obtained by splitting the output on body tags, or the
        /// literal "[tidy error]" when the output contains no body tag.
        /// </returns>
        public static string ValidateHtml(string htmlString)
        {
            Tidy validator = new Tidy();
            validator.Options.DocType = DocType.Omit;
            validator.Options.Xhtml = true;
            validator.Options.XmlOut = true;
            validator.Options.LogicalEmphasis = true;
            validator.Options.MakeClean = true;
            validator.Options.TidyMark = false;
            // NOTE(review): the encoding is configurable here while the bytes below are
            // always UTF-8 - confirm the setting is expected to match.
            validator.Options.CharEncoding = (CharEncoding)Enum.Parse(typeof(TidyNet.CharEncoding), UmbracoSettings.TidyCharEncoding);

            TidyMessageCollection messages = new TidyMessageCollection();
            string tidied;
            using (MemoryStream source = new MemoryStream(Encoding.UTF8.GetBytes(htmlString)))
            using (MemoryStream target = new MemoryStream())
            {
                validator.Parse(source, target, messages);
                tidied = Encoding.UTF8.GetString(target.ToArray());
            }

            // Matches opening and closing body tags, with or without attributes.
            System.Text.RegularExpressions.RegexOptions splitOptions =
                System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace
                | System.Text.RegularExpressions.RegexOptions.Multiline
                | System.Text.RegularExpressions.RegexOptions.IgnoreCase;
            System.Text.RegularExpressions.Regex bodyTag =
                new System.Text.RegularExpressions.Regex(@"</{0,1}body[^>]*>", splitOptions);

            string[] segments = bodyTag.Split(tidied);
            return segments.Length > 1 ? segments[1] : "[tidy error]";
        }
コード例 #18
0
ファイル: Tidy.cs プロジェクト: succ1984/demosolution
 /// <summary>
 /// Parses the input stream or file and writes to the output.
 /// Forwards all arguments to <c>ParseInternal</c> and discards the node it returns.
 /// </summary>
 /// <remarks>
 /// No exceptions are caught here, so anything <c>ParseInternal</c> throws
 /// propagates to the caller.
 /// NOTE(review): how <c>input</c> and <c>file</c> are prioritised is decided inside
 /// <c>ParseInternal</c> - confirm there.
 /// </remarks>
 /// <param name="input">The input stream</param>
 /// <param name="file">The input file</param>
 /// <param name="Output">The output stream</param>
 /// <param name="messages">The collection passed on to the parser for its messages</param>
 public void Parse(Stream input, string file, Stream Output, TidyMessageCollection messages)
 {
     ParseInternal(input, file, Output, messages);
 }
コード例 #19
0
ファイル: HtmlUtility.cs プロジェクト: peterkhoa/hdtl
        /// <summary>
        /// Tidies <paramref name="text"/> (UTF-8, strict doctype, indented output) and
        /// passes the result through <c>RemoveTidyAdditions</c>.
        /// </summary>
        /// <param name="text">The html to tidy.</param>
        /// <returns>Whatever <c>RemoveTidyAdditions</c> returns for Tidy's output.</returns>
        private static string TidyHtml(string text)
        {
            Tidy tidier = new Tidy();
            TidyMessageCollection tidyMessages = new TidyMessageCollection();

            using (MemoryStream source = new MemoryStream(Encoding.UTF8.GetBytes(text)))
            using (MemoryStream target = new MemoryStream())
            {
                // Xhtml output is intentionally left off: it causes problems handling
                // "font" tags, occasionally mangling one into "fontface=...".
                tidier.Options.MakeClean = false;
                tidier.Options.DocType = DocType.Strict;
                tidier.Options.CharEncoding = CharEncoding.UTF8;
                tidier.Options.LogicalEmphasis = true;

                tidier.Options.SmartIndent = true;
                tidier.Options.IndentContent = true;
                tidier.Options.TidyMark = false;
                tidier.Options.QuoteAmpersand = true;
                tidier.Options.DropFontTags = false;
                tidier.Options.DropEmptyParas = true;

                // Required to stop spaces being removed, tabs being added and so on.
                tidier.Options.Spaces = 0;
                tidier.Options.WrapLen = 32000;

                tidier.Parse(source, target, tidyMessages);

                string tidied = Encoding.UTF8.GetString(target.ToArray());
                return RemoveTidyAdditions(tidied);
            }
        }
コード例 #20
0
ファイル: compare.aspx.cs プロジェクト: jaytem/minGit
    /// <summary>
    /// Strips the inline tags listed in the tag list (em, span, u, a) from
    /// <paramref name="strText"/>, keeping the text between them, replaces
    /// <c>&amp;#160;</c> entities with spaces and finally runs the result through Tidy.
    /// </summary>
    /// <param name="strText">The markup to process; <c>null</c> is treated as empty.</param>
    /// <returns>
    /// Tidy's output, or the stripped but untidied markup when Tidy produced no
    /// output and reported errors.
    /// </returns>
    private string RemoveHTML(string strText)
    {
        // Tags whose markup is removed (content is kept). Matching is case-sensitive.
        const string TAGLIST = ";em;span;u;a;";
        // Tags that are removed together with their content, up to the closing tag.
        // A tag must also be present in TAGLIST for this to apply.
        const string BLOCKTAGLIST = ";APPLET;";

        if (strText == null)
        {
            strText = string.Empty;
        }

        StringBuilder stripped = new StringBuilder();
        int open = strText.IndexOf('<');
        while (open >= 0)
        {
            // The previous code searched for the text inside a number's string form,
            // which never matched, so no tag was ever removed. Look for the closing
            // bracket of this tag instead.
            int close = strText.IndexOf('>', open + 1);
            if (close < 0)
            {
                // Unterminated tag: keep the remainder verbatim.
                break;
            }

            // Tag name = text between the brackets, up to the first whitespace.
            string tagName = strText.Substring(open + 1, close - open - 1)
                .Replace("\r", " ")
                .Replace("\n", " ");
            int space = tagName.IndexOf(' ');
            if (space >= 0)
            {
                tagName = tagName.Substring(0, space);
            }

            bool isOpeningTag = true;
            if (tagName.StartsWith("/", StringComparison.Ordinal))
            {
                tagName = tagName.Substring(1);
                isOpeningTag = false;
            }

            string key = ";" + tagName + ";";
            if (TAGLIST.IndexOf(key, StringComparison.Ordinal) >= 0)
            {
                int resumeAt = close + 1;
                if (isOpeningTag && BLOCKTAGLIST.IndexOf(key, StringComparison.Ordinal) >= 0)
                {
                    // Block tag: drop everything through the end of its closing tag,
                    // or through the end of the text when there is none.
                    resumeAt = strText.Length;
                    int endTag = strText.IndexOf("</" + tagName, open + 1, StringComparison.Ordinal);
                    if (endTag >= 0)
                    {
                        int endClose = strText.IndexOf('>', endTag + 1);
                        if (endClose >= 0)
                        {
                            resumeAt = endClose + 1;
                        }
                    }
                }

                // Keep the text before the tag, skip the tag itself.
                stripped.Append(strText, 0, open);
                strText = strText.Substring(resumeAt);
            }
            else
            {
                // Not a tag to remove: keep the text through "<" and rescan after it.
                stripped.Append(strText, 0, open + 1);
                strText = strText.Substring(open + 1);
            }

            open = strText.IndexOf('<');
        }

        stripped.Append(strText);
        string strResult = stripped.ToString().Replace("&#160;", " ");

        // also run Tidy on the text
        TidyNet.Tidy tidydoc = new TidyNet.Tidy();
        tidydoc.Options.RawOut = false;
        tidydoc.Options.CharEncoding = TidyNet.CharEncoding.UTF8;
        tidydoc.Options.DocType = TidyNet.DocType.Omit;
        tidydoc.Options.TidyMark = false;
        tidydoc.Options.Word2000 = true;
        tidydoc.Options.QuoteNbsp = true;
        tidydoc.Options.QuoteAmpersand = true;
        tidydoc.Options.NumEntities = false;
        tidydoc.Options.QuoteMarks = true;
        tidydoc.Options.Xhtml = false;
        tidydoc.Options.MakeClean = true;
        TidyNet.TidyMessageCollection messageCollection = new TidyNet.TidyMessageCollection();

        string strTidyResult;
        using (System.IO.MemoryStream tidyin = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(strResult)))
        using (System.IO.MemoryStream tidyout = new System.IO.MemoryStream())
        {
            tidydoc.Parse(tidyin, tidyout, messageCollection);
            strTidyResult = Encoding.UTF8.GetString(tidyout.ToArray());
        }

        // When Tidy fails outright (no output plus errors), fall back to the stripped
        // markup; the error messages were never returned to the caller, so they are
        // no longer collected.
        if ((strTidyResult == "") && (messageCollection.Errors > 0))
        {
            return strResult;
        }

        return strTidyResult;
    }
コード例 #21
0
ファイル: Tidy.cs プロジェクト: bgarrels/betterpoeditor
        /// <summary>
        /// Internal routine that actually does the parsing. The caller can pass
        /// either an input stream or a file name; if both are passed, the file
        /// name is preferred. When neither is passed, standard input is read.
        /// </summary>
        /// <param name="input">Stream to read markup from; ignored when <paramref name="file"/> is not null.</param>
        /// <param name="file">Path of a file to open for reading, or null to read from <paramref name="input"/>.</param>
        /// <param name="Output">Stream the document is printed to when there are no errors; may be null to skip printing.</param>
        /// <param name="messages">Collection attached to the lexer; its error count decides whether output is produced.</param>
        /// <returns>The parsed document tree, or null when the tree fails an integrity check.</returns>
        internal Node ParseInternal(Stream input, string file, Stream Output, TidyMessageCollection messages)
        {
            Lexer lexer;
            Node document = null;
            Node doctype;
            Out o = new OutImpl(); /* normal output stream */
            PPrint pprint;

            /* only a stream opened here from a file name is ours to close */
            bool ownsInput = file != null;

            /* ensure config is self-consistent */
            _options.Adjust();

            if (ownsInput)
            {
                input = new FileStream(file, FileMode.Open, FileAccess.Read);
            }
            else if (input == null)
            {
                input = Console.OpenStandardInput();
            }

            if (input != null)
            {
                try
                {
                    lexer = new Lexer(new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize), _options);
                    lexer.messages = messages;

                    /*
                    store pointer to lexer in input stream
                    to allow character encoding errors to be
                    reported
                    */
                    lexer.input.Lexer = lexer;

                    /* Tidy doesn't alter the doctype for generic XML docs */
                    if (_options.XmlTags)
                    {
                        document = ParserImpl.parseXMLDocument(lexer);
                    }
                    else
                    {
                        document = ParserImpl.parseDocument(lexer);

                        if (!document.CheckNodeIntegrity())
                        {
                            Report.BadTree(lexer);
                            return null;
                        }

                        Clean cleaner = new Clean(_options.tt);

                        /* simplifies <b><b> ... </b> ...</b> etc. */
                        cleaner.NestedEmphasis(document);

                        /* cleans up <dir>indented text</dir> etc. */
                        cleaner.List2BQ(document);
                        cleaner.BQ2Div(document);

                        /* replaces i by em and b by strong */
                        if (_options.LogicalEmphasis)
                        {
                            cleaner.EmFromI(document);
                        }

                        if (_options.Word2000 && cleaner.IsWord2000(document, _options.tt))
                        {
                            /* prune Word2000's <![if ...]> ... <![endif]> */
                            cleaner.DropSections(lexer, document);

                            /* drop style & class attributes and empty p, span elements */
                            cleaner.CleanWord2000(lexer, document);
                        }

                        /* replaces presentational markup by style rules */
                        if (_options.MakeClean || _options.DropFontTags)
                        {
                            cleaner.CleanTree(lexer, document);
                        }

                        if (!document.CheckNodeIntegrity())
                        {
                            Report.BadTree(lexer);
                            return null;
                        }

                        /* remember the original doctype before it is rewritten; it is reported below */
                        doctype = document.FindDocType();
                        if (document.Content != null)
                        {
                            if (_options.Xhtml)
                            {
                                lexer.SetXhtmlDocType(document);
                            }
                            else
                            {
                                lexer.FixDocType(document);
                            }

                            if (_options.TidyMark)
                            {
                                lexer.AddGenerator(document);
                            }
                        }

                        /* ensure presence of initial <?XML version="1.0"?> */
                        if (_options.XmlOut && _options.XmlPi)
                        {
                            lexer.FixXmlPI(document);
                        }

                        if (document.Content != null)
                        {
                            Report.ReportVersion(lexer, doctype);
                            Report.ReportNumWarnings(lexer);
                        }
                    }
                }
                finally
                {
                    /*
                    Close the input stream, but only if we created it. Doing this in a
                    finally block also covers the early returns above and any exception
                    thrown while parsing, so the file handle is never leaked.
                    */
                    if (ownsInput)
                    {
                        try
                        {
                            input.Close();
                        }
                        catch (IOException)
                        {
                            /* best effort: a failed close must not hide the parse outcome */
                        }
                    }
                }

                if (lexer.messages.Errors > 0)
                {
                    Report.NeedsAuthorIntervention(lexer);
                }

                o.State = StreamIn.FSM_ASCII;
                o.Encoding = _options.CharEncoding;

                /* output is only produced for documents that parsed without errors */
                if (lexer.messages.Errors == 0)
                {
                    if (_options.BurstSlides)
                    {
                        Node body;

                        body = null;
                        /*
                        remove doctype to avoid potential clash with
                        markup introduced when bursting into slides
                        */
                        /* discard the document type */
                        doctype = document.FindDocType();

                        if (doctype != null)
                        {
                            Node.DiscardElement(doctype);
                        }

                        /* slides use transitional features */
                        lexer.versions |= HtmlVersion.Html40Loose;

                        /* and patch up doctype to match */
                        if (_options.Xhtml)
                        {
                            lexer.SetXhtmlDocType(document);
                        }
                        else
                        {
                            lexer.FixDocType(document);
                        }

                        /* find the body element which may be implicit */
                        body = document.FindBody(_options.tt);

                        if (body != null)
                        {
                            pprint = new PPrint(_options);
                            Report.ReportNumberOfSlides(lexer, pprint.CountSlides(body));
                            pprint.CreateSlides(lexer, document);
                        }
                        else
                        {
                            Report.MissingBody(lexer);
                        }
                    }
                    else if (Output != null)
                    {
                        pprint = new PPrint(_options);
                        o.Output = Output;

                        if (_options.XmlTags)
                        {
                            pprint.PrintXmlTree(o, (short) 0, 0, lexer, document);
                        }
                        else
                        {
                            pprint.PrintTree(o, (short) 0, 0, lexer, document);
                        }

                        pprint.FlushLine(o, 0);
                    }
                }

                Report.ErrorSummary(lexer);
            }

            return document;
        }
コード例 #22
0
ファイル: Form1.cs プロジェクト: MSU-IBC/CIA-Import
        /// <summary>
        /// Click handler for the Go button. Reads the field ids from CIA_Fields,
        /// then for each id runs "&lt;folder&gt;\&lt;id&gt;.html" through Tidy, selects the
        /// <c>td</c> cells with class <c>category_data</c> and hands each cell's text
        /// to the matching <c>Parse</c> routine, appending progress to the output box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void buttonGo_Click(object sender, EventArgs e)
        {
            // I had to throw in the len as Budget Revenue and Expenditures have the same ID except the Expenditures has a 1 at the end.
            Exception ex = _db.ExecuteSqlReader("SELECT * FROM CIA_Fields WHERE fieldid > 2000 AND LEN(fieldid) = 4");

            if (ex != null)
            {
                // Same exception type and message as before, but keep the original
                // as InnerException so its stack trace is not thrown away.
                throw new Exception(ex.Message, ex);
            }

            var fieldIDs = new List<int>();
            try
            {
                while (_db.Reader.Read())
                {
                    fieldIDs.Add((int)_db.Reader["FieldID"]);
                }
            }
            finally
            {
                // Release the reader even when a row fails to read or cast.
                _db.Reader.Close();
            }

            foreach (var f in fieldIDs)
            {
                textBoxOutput.Text += f + Environment.NewLine;

                string result;

                // Both streams are scoped to a single file so the file handle is
                // released before the next iteration instead of leaking.
                using (var input = File.OpenRead(textFolder.Text + "\\" + f + ".html"))
                using (var output = new MemoryStream())
                {
                    var tmc = new TidyMessageCollection();

                    var tidy = new Tidy();
                    tidy.Options.DocType = DocType.Strict;
                    tidy.Options.DropFontTags = true;
                    tidy.Options.LogicalEmphasis = true;
                    tidy.Options.Xhtml = true;
                    tidy.Options.XmlOut = true;
                    tidy.Options.MakeClean = true;
                    tidy.Options.TidyMark = false;
                    tidy.Options.WrapLen = 0;
                    tidy.Parse(input, output, tmc);

                    result = Encoding.UTF8.GetString(output.ToArray());
                }

                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(result);

                var categoryData = doc.DocumentNode.SelectNodes("//td[@class='category_data']");
                if (categoryData == null)
                {
                    textBoxOutput.Text += f + ": NO DATA" + Environment.NewLine;
                    continue;
                }

                foreach (var i in categoryData)
                {
                    if (i == null)
                    {
                        continue;
                    }

                    // The id of the cell's grandparent node is looked up in the country tag list;
                    // cells without a matching entry are skipped.
                    var tagID = _countryTagList.SingleOrDefault(a => a.Key == i.ParentNode.ParentNode.Id);
                    if (tagID.Key == null)
                    {
                        continue;
                    }

                    switch (f)
                    {
                        case 2085:
                            Parse.Parse2085(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2091:
                            Parse.Parse2091(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2121:
                            Parse.Parse2121(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        case 2056:
                            Parse.Parse2056(textBoxOutput, i.InnerText, f, tagID.Value);
                            break;
                        default:
                            textBoxOutput.Text += Parse.ParseTableData(i.InnerText, f, tagID.Value);
                            break;
                    }
                }
            }
        }
コード例 #23
0
        /// <summary>
        /// Shortens an HTML formatted string while keeping HTML formatting and complete words
        /// (also removes line breaks at the end of the shortened string).
        /// </summary>
        /// <param name="input">The HTML formatted string. Null is returned unchanged.</param>
        /// <param name="inputIsShortened">Output boolean telling if the input string has been shortened</param>
        /// <param name="length">The approximate length of the output string (default: 300)</param>
        /// <param name="elipsis">Ellipsis text to append to the output string (use string.Empty when no ellipsis should be added, default: ...)</param>
        /// <returns>
        /// The input itself when it is null or no longer than <paramref name="length"/>;
        /// otherwise the shortened text after a Tidy pass, with the ellipsis appended
        /// (placed inside a trailing &lt;/p&gt; when there is one).
        /// </returns>
        public static string ShortenHtml(this string input, out bool inputIsShortened, int length = 300, string elipsis = "...")
        {
            inputIsShortened = false;

            if (input == null || input.Length <= length)
            {
                return input;
            }

            // Work with the compact form so trailing breaks can be stripped by a fixed width.
            input = input.Replace("<br />", "<br/>");

            // Cut at the first space at or after `length` so no word is split. The
            // normalisation above may have shrunk the text below `length`; in that
            // case there is nothing to search and the whole text is kept.
            int cut = length < input.Length ? input.IndexOf(' ', length) : -1;
            string substring = cut < 0 ? input : input.Substring(0, cut);

            substring = substring.Trim();
            while (substring.EndsWith("<br/>"))
            {
                substring = substring.Substring(0, substring.Length - 5);
                substring = substring.Trim();
            }

            if (input.Length > substring.Length)
            {
                inputIsShortened = true;
            }

            substring = substring.Replace("<br/>", "<br />");

            // The cut can leave elements unclosed; Tidy is run over the fragment
            // and its XHTML output is parsed below.
            Tidy tidy = new Tidy();
            tidy.Options.DocType = DocType.Omit;
            tidy.Options.CharEncoding = CharEncoding.UTF8;
            tidy.Options.Xhtml = true;
            tidy.Options.NumEntities = true;

            TidyMessageCollection tmc = new TidyMessageCollection();
            string tidyResult;

            using (MemoryStream inputStream = new MemoryStream(Encoding.UTF8.GetBytes(substring)))
            using (MemoryStream outputStream = new MemoryStream())
            {
                tidy.Parse(inputStream, outputStream, tmc);
                tidyResult = Encoding.UTF8.GetString(outputStream.ToArray());
            }

            // Tidy emits a whole document; only the body's content is returned.
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.LoadXml(tidyResult);
            tidyResult = xmlDoc.SelectSingleNode("//body").InnerXml;

            if (!string.IsNullOrEmpty(elipsis))
            {
                // Keep the ellipsis inside the last paragraph rather than after it.
                if (tidyResult.EndsWith("</p>"))
                {
                    return string.Concat(tidyResult.Substring(0, tidyResult.Length - 4), elipsis, "</p>");
                }
                return string.Concat(tidyResult, elipsis);
            }
            return tidyResult;
        }
コード例 #24
0
        /// <summary>
        /// Runs <paramref name="initialContent"/> through Tidy (Word2000 and XHTML options on)
        /// and stores what lies between the opening and closing body tags of the
        /// result in <paramref name="cleanContent"/>.
        /// </summary>
        /// <param name="initialContent">Markup to clean; it is read but not reassigned.</param>
        /// <param name="cleanContent">
        /// Receives the cleaned markup. When the Tidy output contains no body tag,
        /// the output is stored as-is.
        /// </param>
        private void cleanContent(ref String initialContent,ref String cleanContent)
        {
            Tidy tidy = new Tidy();
            tidy.Options.Word2000 = true;
            tidy.Options.Xhtml = true;

            TidyMessageCollection tmc = new TidyMessageCollection();

            using (MemoryStream input = new MemoryStream(Encoding.UTF8.GetBytes(initialContent)))
            using (MemoryStream output = new MemoryStream())
            {
                tidy.Parse(input, output, tmc);
                cleanContent = Encoding.UTF8.GetString(output.ToArray());
            }

            // Delete header: everything up to and including the end of the <body ...> tag.
            // The tag is missing when Tidy produced no document (for example empty output);
            // searching from index -1 would throw, so the text is left alone then.
            int bodyOpen = cleanContent.IndexOf("<body", StringComparison.Ordinal);
            if (bodyOpen >= 0)
            {
                int bodyOpenEnd = cleanContent.IndexOf(">", bodyOpen, StringComparison.Ordinal);
                if (bodyOpenEnd >= 0)
                {
                    cleanContent = cleanContent.Remove(0, bodyOpenEnd + 1);
                }
            }

            // Delete footer: the closing body tag and everything after it.
            int bodyClose = cleanContent.IndexOf("</body", StringComparison.Ordinal);
            if (bodyClose >= 0)
            {
                cleanContent = cleanContent.Remove(bodyClose);
            }
        }
コード例 #25
0
ファイル: Tidy.cs プロジェクト: succ1984/demosolution
        /// <summary>
        /// Internal routine that actually does the parsing. The caller can pass
        /// either an input stream or a file name; if both are passed, the file
        /// name is preferred. When neither is passed, standard input is read.
        /// </summary>
        /// <param name="input">Stream to read markup from; ignored when <paramref name="file"/> is not null.</param>
        /// <param name="file">Path of a file to open for reading, or null to read from <paramref name="input"/>.</param>
        /// <param name="Output">Stream the document is printed to when there are no errors; may be null to skip printing.</param>
        /// <param name="messages">Collection attached to the lexer; its error count decides whether output is produced.</param>
        /// <returns>The parsed document tree, or null when the tree fails an integrity check.</returns>
        internal Node ParseInternal(Stream input, string file, Stream Output, TidyMessageCollection messages)
        {
            Lexer  lexer;
            Node   document = null;
            Node   doctype;
            Out    o = new OutImpl();          /* normal output stream */
            PPrint pprint;

            /* only a stream opened here from a file name is ours to close */
            bool ownsInput = file != null;

            /* ensure config is self-consistent */
            _options.Adjust();

            if (ownsInput)
            {
                input = new FileStream(file, FileMode.Open, FileAccess.Read);
            }
            else if (input == null)
            {
                input = Console.OpenStandardInput();
            }

            if (input != null)
            {
                try
                {
                    lexer          = new Lexer(new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize), _options);
                    lexer.messages = messages;

                    /*
                     * store pointer to lexer in input stream
                     * to allow character encoding errors to be
                     * reported
                     */
                    lexer.input.Lexer = lexer;

                    /* Tidy doesn't alter the doctype for generic XML docs */
                    if (_options.XmlTags)
                    {
                        document = ParserImpl.parseXMLDocument(lexer);
                    }
                    else
                    {
                        document = ParserImpl.parseDocument(lexer);

                        if (!document.CheckNodeIntegrity())
                        {
                            Report.BadTree(lexer);
                            return null;
                        }

                        Clean cleaner = new Clean(_options.tt);

                        /* simplifies <b><b> ... </b> ...</b> etc. */
                        cleaner.NestedEmphasis(document);

                        /* cleans up <dir>indented text</dir> etc. */
                        cleaner.List2BQ(document);
                        cleaner.BQ2Div(document);

                        /* replaces i by em and b by strong */
                        if (_options.LogicalEmphasis)
                        {
                            cleaner.EmFromI(document);
                        }

                        if (_options.Word2000 && cleaner.IsWord2000(document, _options.tt))
                        {
                            /* prune Word2000's <![if ...]> ... <![endif]> */
                            cleaner.DropSections(lexer, document);

                            /* drop style & class attributes and empty p, span elements */
                            cleaner.CleanWord2000(lexer, document);
                        }

                        /* replaces presentational markup by style rules */
                        if (_options.MakeClean || _options.DropFontTags)
                        {
                            cleaner.CleanTree(lexer, document);
                        }

                        if (!document.CheckNodeIntegrity())
                        {
                            Report.BadTree(lexer);
                            return null;
                        }

                        /* remember the original doctype before it is rewritten; it is reported below */
                        doctype = document.FindDocType();
                        if (document.Content != null)
                        {
                            if (_options.Xhtml)
                            {
                                lexer.SetXhtmlDocType(document);
                            }
                            else
                            {
                                lexer.FixDocType(document);
                            }

                            if (_options.TidyMark)
                            {
                                lexer.AddGenerator(document);
                            }
                        }

                        /* ensure presence of initial <?XML version="1.0"?> */
                        if (_options.XmlOut && _options.XmlPi)
                        {
                            lexer.FixXmlPI(document);
                        }

                        if (document.Content != null)
                        {
                            Report.ReportVersion(lexer, doctype);
                            Report.ReportNumWarnings(lexer);
                        }
                    }
                }
                finally
                {
                    /*
                     * Close the input stream, but only if we created it. Doing this in a
                     * finally block also covers the early returns above and any exception
                     * thrown while parsing, so the file handle is never leaked.
                     */
                    if (ownsInput)
                    {
                        try
                        {
                            input.Close();
                        }
                        catch (IOException)
                        {
                            /* best effort: a failed close must not hide the parse outcome */
                        }
                    }
                }

                if (lexer.messages.Errors > 0)
                {
                    Report.NeedsAuthorIntervention(lexer);
                }

                o.State    = StreamIn.FSM_ASCII;
                o.Encoding = _options.CharEncoding;

                /* output is only produced for documents that parsed without errors */
                if (lexer.messages.Errors == 0)
                {
                    if (_options.BurstSlides)
                    {
                        Node body;

                        body = null;

                        /*
                         * remove doctype to avoid potential clash with
                         * markup introduced when bursting into slides
                         */
                        /* discard the document type */
                        doctype = document.FindDocType();

                        if (doctype != null)
                        {
                            Node.DiscardElement(doctype);
                        }

                        /* slides use transitional features */
                        lexer.versions |= HtmlVersion.Html40Loose;

                        /* and patch up doctype to match */
                        if (_options.Xhtml)
                        {
                            lexer.SetXhtmlDocType(document);
                        }
                        else
                        {
                            lexer.FixDocType(document);
                        }

                        /* find the body element which may be implicit */
                        body = document.FindBody(_options.tt);

                        if (body != null)
                        {
                            pprint = new PPrint(_options);
                            Report.ReportNumberOfSlides(lexer, pprint.CountSlides(body));
                            pprint.CreateSlides(lexer, document);
                        }
                        else
                        {
                            Report.MissingBody(lexer);
                        }
                    }
                    else if (Output != null)
                    {
                        pprint   = new PPrint(_options);
                        o.Output = Output;

                        if (_options.XmlTags)
                        {
                            pprint.PrintXmlTree(o, (short)0, 0, lexer, document);
                        }
                        else
                        {
                            pprint.PrintTree(o, (short)0, 0, lexer, document);
                        }

                        pprint.FlushLine(o, 0);
                    }
                }

                Report.ErrorSummary(lexer);
            }

            return document;
        }
コード例 #26
0
        /// <summary>
        /// Turns HTML documents or fragments into XHTML conformant markup.
        /// </summary>
        /// <param name="xmlMarkup">The html to clean</param>
        /// <returns>
        /// The direct parse of <paramref name="xmlMarkup"/> when it already parses;
        /// otherwise the parse of Tidy's output for it.
        /// </returns>
        /// <exception cref="InvalidOperationException">Tidy reported one or more errors.</exception>
        public static XDocument TidyXml(string xmlMarkup)
        {
            // Fast path: markup that parses as given needs no tidying.
            try
            {
                return XhtmlDocument.Parse(xmlMarkup);
            }
            catch (Exception)
            {
                // Did not parse directly; fall through to the slower Tidy-based path.
            }

            byte[] markupBytes = Encoding.UTF8.GetBytes(xmlMarkup);

            Tidy tidy = GetXmlConfiguredTidy();

            // Register the namespace-prefixed and HTML5 element names with Tidy before parsing.
            AllowNamespacePrefixedElementNames(tidy, LocateNamespacePrefixedElementNames(xmlMarkup));
            AllowHtml5ElementNames(tidy);

            TidyMessageCollection tidyMessages = new TidyMessageCollection();
            string tidiedXml;

            using (MemoryStream source = new MemoryStream(markupBytes))
            using (MemoryStream sink = new MemoryStream())
            {
                tidy.Parse(source, sink, tidyMessages);

                // Rewind so the reader sees what Tidy just wrote.
                sink.Position = 0;
                tidiedXml = new C1StreamReader(sink).ReadToEnd();
            }

            if (tidyMessages.Errors <= 0)
            {
                return XDocument.Parse(RemoveDuplicateAttributes(tidiedXml));
            }

            // Errors were reported: collect the error-level messages, one per line, and fail.
            StringBuilder errorLines = new StringBuilder();
            foreach (TidyMessage message in tidyMessages)
            {
                if (message.Level != MessageLevel.Error)
                {
                    continue;
                }
                errorLines.AppendLine(message.ToString());
            }

            throw new InvalidOperationException(string.Format("Failed to parse html:\n\n{0}", errorLines.ToString()));
        }
コード例 #27
0
 /// <summary>
 /// Runs an HTML string through Tidy with its default options and returns
 /// Tidy's output decoded as UTF-8.
 /// </summary>
 /// <param name="badHtmlString">The markup to clean.</param>
 /// <returns>The text Tidy wrote to its output stream.</returns>
 static string CleanHtml(string badHtmlString)
 {
     //Clean bad html using TIDY
     // http://sourceforge.net/projects/tidynet/
     Tidy tidy = new Tidy ();
     byte[] badHtml = Encoding.UTF8.GetBytes (badHtmlString);
     TidyMessageCollection tidyMsg = new TidyMessageCollection ();

     // A stream built over the byte array starts at position 0, so no
     // write-then-rewind is needed; both streams are disposed on exit.
     using (MemoryStream input = new MemoryStream (badHtml))
     using (MemoryStream output = new MemoryStream ())
     {
         tidy.Parse (input, output, tidyMsg);
         return Encoding.UTF8.GetString (output.ToArray ());
     }
 }
コード例 #28
0
ファイル: Tidy.cs プロジェクト: Ark-kun/TidyNet
		/// <summary>
		/// Opens the input for a parse and delegates to the stream-based
		/// ParseInternal overload: the named file when <paramref name="file"/> is
		/// given, otherwise standard input.
		/// </summary>
		/// <param name="file">Path of the file to read, or null to read standard input.</param>
		/// <param name="Output">Passed through unchanged to the stream-based overload.</param>
		/// <param name="messages">Passed through unchanged to the stream-based overload.</param>
		/// <returns>Whatever the stream-based overload returns.</returns>
		internal Node ParseInternal(string file, Stream Output, TidyMessageCollection messages)
		{
			// Only a stream opened here from a file name is ours to close;
			// standard input is left open.
			bool ownsInput = file != null;
			Stream input = ownsInput
				? new FileStream(file, FileMode.Open, FileAccess.Read)
				: Console.OpenStandardInput();

			try
			{
				return ParseInternal(input, Output, messages);
			}
			finally
			{
				// Close the stream only if we created it. The previous check compared
				// it with a freshly opened standard OUTPUT stream, which is never the
				// same object; the ownership flag says what was intended. Using finally
				// also releases the file when the parse throws.
				if (ownsInput)
				{
					try
					{
						input.Close();
					}
					catch (IOException)
					{
						// best effort: a failed close must not hide the parse outcome
					}
				}
			}
		}