Ejemplo n.º 1
0
 /// <summary>
 /// Demo entry point: parses a sample string containing inline markup tags with
 /// <c>TagParser</c> and reads the <c>ArrayOfIndices</c> of the parsed tag at index 2.
 /// </summary>
 /// <param name="args">Command-line arguments (not used).</param>
 public static void Main(string[] args)
 {
     string sampleMarkup = "<shake amount=3>BOO!</shake> Ah, <b>hahaha</b>. Did I <wave direction=updown>scare</wave> you? ";

     var tagParser = new TagParser.TagParser();
     tagParser.ParseText(sampleMarkup);

     // The value is not used further; it is fetched only to demonstrate access to the
     // parsed tag at index 2 (presumably the <wave> tag - verify against TagParser).
     int[] thirdTagIndices = tagParser.tags[2].ArrayOfIndices;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Converts tokenized HTML into XML-style markup. Elements listed in
        /// <c>EmptyElements</c> are written with empty-element syntax, and unbalanced
        /// end-tags are repaired by inserting the missing start- or end-tags.
        /// </summary>
        /// <param name="html">The HTML text to convert.</param>
        /// <returns>
        /// The converted markup, prefixed with an XML declaration; or <c>null</c> if any
        /// exception is thrown during conversion (the exception is logged).
        /// </returns>
        public static string ToXml(string html)
        {
            StringBuilder result = new StringBuilder();

            // Standard XML declaration.
            result.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

            ParseReader    reader       = new ParseReader(html);
            TagParser      parser       = new TagParser(reader);
            Stack <string> nestingStack = new Stack <string>();

            try
            {
                ParseToken token = parser.GetNextToken();

                // Ignore leading white-space and any leading DOCTYPE.
                while (token is SpacesToken || token is NewlineToken || token is DoctypeToken)
                {
                    token = parser.GetNextToken();
                }

                while (!(token is EOFToken))
                {
                    Log.DebugFormat("Token = {0}", token);
                    if (token is TagToken)
                    {
                        TagToken t = (TagToken)token;
                        if (!t.Tag.IsEndTag)
                        {
                            // Deal with start-tag. Typically this will be new element nesting.
                            Tag startTag = t.Tag;
                            if (startTag is EmptyElement)
                            {
                                result.Append(((EmptyElement)startTag).ToString());
                            }
                            else
                            {
                                // Tags that are always empty elements are converted to empty elements here.
                                // Element names are pushed onto the stack to balance elements with missing end-tag.
                                // Culture-invariant lowercasing keeps tag-name matching independent of the current culture.
                                string startTagName = startTag.Name.ToLowerInvariant();
                                Log.DebugFormat("startTagName = {0}", startTagName);
                                if (EmptyElements.Contains(startTagName))
                                {
                                    result.Append((new EmptyElement(startTag)).ToString());
                                }
                                else
                                {
                                    result.Append(startTag.ToString());
                                    nestingStack.Push(startTagName);
                                }
                            }
                        }
                        else
                        {
                            // Deal with end-tag.
                            Tag endTag = t.Tag;

                            // Remove the '/' from beginning of the tag-name for comparison.
                            string endTagName = endTag.Name.Substring(1).ToLowerInvariant();
                            Log.DebugFormat("endTagName = {0}", endTagName);

                            // Ignore some end-tags for empty elements that are handled with or without empty element syntax.
                            if (EmptyElements.Contains(endTagName))
                            {
                                Log.InfoFormat("Ignoring redundant end-tag: {0}", endTagName);
                            }
                            else if (nestingStack.Count == 0)
                            {
                                // Stack<T>.Peek() throws on an empty stack instead of returning null,
                                // so the "nothing left open" case has to be detected through Count.
                                Log.WarnFormat("Ignoring extra content at end of document! </{0}> ({1})", endTagName, parser.GetCharacterPosition());
                            }
                            else
                            {
                                // Keep element tags matched appropriately.
                                if (nestingStack.Peek().Equals(endTagName))
                                {
                                    nestingStack.Pop();
                                }
                                else if (SingleElements.Contains(endTagName) && nestingStack.Contains(endTagName))
                                {
                                    // Pair all the previous unmatched tags for these important structural elements.
                                    // These elements appear only once, so should never be automatically closed.
                                    // The Contains() guard above guarantees this loop stops before the stack is empty.
                                    while (!nestingStack.Peek().Equals(endTagName))
                                    {
                                        string missingEndTag = "</" + nestingStack.Pop() + ">";
                                        Log.WarnFormat("Adding a missing end-tag! {0} ({1})", missingEndTag, parser.GetCharacterPosition());
                                        result.Append(missingEndTag);
                                    }

                                    // Remove the current item from the stack, as it has been paired now.
                                    nestingStack.Pop();
                                }
                                else
                                {
                                    // Insert a matching start-tag before the unbalanced end-tag found.
                                    // This also covers a structural element whose start-tag was never opened.
                                    string missingStartTag = "<" + endTagName + ">";
                                    Log.WarnFormat("Adding a missing start-tag! {0} ({1})", missingStartTag, parser.GetCharacterPosition());
                                    result.Append(missingStartTag);
                                }

                                // Write the current element end-tag.
                                result.Append("</").Append(endTagName).Append('>');
                            }
                        }
                    }
                    else if (token is WordToken)
                    {
                        WordToken t = (WordToken)token;
                        result.Append(t.Word);
                    }
                    else if (token is SpacesToken)
                    {
                        SpacesToken t = (SpacesToken)token;
                        result.Append(t.Spaces);
                    }
                    else if (token is NumberToken)
                    {
                        NumberToken t = (NumberToken)token;
                        result.Append(t.Number);
                    }
                    else if (token is EntityReferenceToken)
                    {
                        EntityReferenceToken t = (EntityReferenceToken)token;
                        result.Append(XmlEntity(t.Name));
                    }
                    else if (token is PunctuationToken)
                    {
                        PunctuationToken t = (PunctuationToken)token;
                        result.Append(t.Character);
                    }
                    else if (token is CharacterEntityToken)
                    {
                        CharacterEntityToken t = (CharacterEntityToken)token;
                        result.Append(t.Character);
                    }
                    else if (token is NewlineToken)
                    {
                        result.Append('\n');
                    }
                    else if (token is ScriptToken)
                    {
                        ScriptToken t = (ScriptToken)token;
                        if (t.Script.Length > 0)
                        {
                            // Script element contents are often empty.
                            // NOTE: Removing any prior use of CDATA section in script, to avoid conflict.
                            string script = t.Script.Replace("<![CDATA[", "").Replace("]]>", "");
                            result.Append("/*<![CDATA[*/").Append(script).Append("/*]]>*/");
                        }
                    }
                    else if (token is CDataToken)
                    {
                        CDataToken t = (CDataToken)token;
                        result.Append("<![CDATA[").Append(t.Data).Append("]]>");
                    }
                    else if (token is CommentToken)
                    {
                        CommentToken t = (CommentToken)token;
                        result.Append("<!--").Append(t.Comment).Append("-->");
                    }
                    else if (token is DoctypeToken)
                    {
                        // Ignore.
                    }
                    else if (token is ProcessingInstructionToken)
                    {
                        // Ignore.
                    }
                    else
                    {
                        Log.WarnFormat("Unexpected token! {0}", token);
                    }
                    token = parser.GetNextToken();
                }

                Log.Info(parser.GetCompletionReport());
            }
            catch (Exception ex)
            {
                // Best-effort conversion: any failure is logged and reported to the caller as null.
                Log.Error("EXCEPTION", ex);
                result = null;
            }

            return(result == null ? null : result.ToString());
        }