/// <summary>
/// (Re)initialises the parser: stores the input stream, sets <c>MaxErrors</c> to 1000
/// and resets the error, warning and recovery counters to zero.
/// </summary>
/// <param name="stream">Character input stream.</param>
private void Init(ParseReader stream)
{
    // Value assigned to MaxErrors on every (re)initialisation.
    const int defaultMaxErrors = 1000;

    Stream = stream;
    MaxErrors = defaultMaxErrors;

    // All counters start again from zero.
    NumErrors = 0;
    NumWarnings = 0;
    NumRecoveries = 0;
}
/// <summary>
/// Constructor for the abstract parser class; all set-up is delegated to <c>Init</c>.
/// </summary>
/// <param name="stream">Character input stream.</param>
protected Parser(ParseReader stream)
{
    this.Init(stream);
}
/// <summary>
/// Converts HTML text into XML text by re-emitting its token stream, converting known empty
/// elements to empty-element syntax and inserting missing start/end tags to keep elements paired.
/// </summary>
/// <param name="html">HTML source text to convert.</param>
/// <returns>
/// The generated XML, beginning with an XML declaration, or <c>null</c> when an exception is
/// raised during conversion (the exception is logged).
/// </returns>
public static string ToXml(string html)
{
    StringBuilder result = new StringBuilder();

    // Standard XML file header.
    result.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

    ParseReader reader = new ParseReader(html);
    TagParser parser = new TagParser(reader);

    // Names of the elements that are currently open, innermost on top.
    Stack<string> nestingStack = new Stack<string>();

    try
    {
        ParseToken token = parser.GetNextToken();

        // Ignore leading white-space (and any leading DOCTYPE).
        while (token is SpacesToken || token is NewlineToken || token is DoctypeToken)
        {
            token = parser.GetNextToken();
        }

        while (!(token is EOFToken))
        {
            Log.DebugFormat("Token = {0}", token);

            if (token is TagToken)
            {
                TagToken t = (TagToken)token;
                if (!t.Tag.IsEndTag)
                {
                    // Deal with start-tag. Typically this will be new element nesting.
                    Tag startTag = t.Tag;
                    if (startTag is EmptyElement)
                    {
                        result.Append(((EmptyElement)startTag).ToString());
                    }
                    else
                    {
                        // Tags that are always empty elements are converted to empty elements here.
                        // Other element names are pushed onto the stack so that missing end-tags
                        // can be balanced later.
                        // Invariant lower-casing keeps the lookups independent of the current culture.
                        string startTagName = startTag.Name.ToLowerInvariant();
                        Log.DebugFormat("startTagName = {0}", startTagName);
                        if (EmptyElements.Contains(startTagName))
                        {
                            result.Append((new EmptyElement(startTag)).ToString());
                        }
                        else
                        {
                            result.Append(startTag.ToString());
                            nestingStack.Push(startTagName);
                        }
                    }
                }
                else
                {
                    // Deal with end-tag.
                    Tag endTag = t.Tag;

                    // Remove the '/' from beginning of the tag-name for comparison.
                    string endTagName = endTag.Name.Substring(1).ToLowerInvariant();
                    Log.DebugFormat("endTagName = {0}", endTagName);

                    if (EmptyElements.Contains(endTagName))
                    {
                        // End-tags of empty elements are redundant: the start-tag was already
                        // written using empty-element syntax.
                        Log.InfoFormat("Ignoring redundant end-tag: {0}", endTagName);
                    }
                    else if (nestingStack.Count == 0)
                    {
                        // Stack<T>.Peek() throws on an empty stack instead of returning null,
                        // so emptiness must be tested explicitly. Nothing is open: drop the tag.
                        Log.WarnFormat("Ignoring extra content at end of document! </{0}> ({1})", endTagName, parser.GetCharacterPosition());
                    }
                    else
                    {
                        // Keep element tags matched appropriately.
                        if (nestingStack.Peek().Equals(endTagName))
                        {
                            nestingStack.Pop();
                        }
                        else if (SingleElements.Contains(endTagName) && nestingStack.Contains(endTagName))
                        {
                            // Important structural element that appears only once: close every
                            // unmatched element opened inside it. The Contains() guard above
                            // guarantees the loop stops before the stack runs empty.
                            while (nestingStack.Peek() != endTagName)
                            {
                                string missingEndTag = "</" + nestingStack.Pop() + ">";
                                Log.WarnFormat("Adding a missing end-tag! {0} ({1})", missingEndTag, parser.GetCharacterPosition());
                                result.Append(missingEndTag);
                            }

                            // Remove the current item from the stack, as it has been paired now.
                            nestingStack.Pop();
                        }
                        else
                        {
                            // Insert a matching start-tag before the unbalanced end-tag found.
                            // This also covers a structural element that was never opened.
                            string missingStartTag = "<" + endTagName + ">";
                            Log.WarnFormat("Adding a missing start-tag! {0} ({1})", missingStartTag, parser.GetCharacterPosition());
                            result.Append(missingStartTag);
                        }

                        // Write the current element end-tag.
                        result.Append("</").Append(endTagName).Append('>');
                    }
                }
            }
            else if (token is WordToken)
            {
                WordToken t = (WordToken)token;
                result.Append(t.Word);
            }
            else if (token is SpacesToken)
            {
                SpacesToken t = (SpacesToken)token;
                result.Append(t.Spaces);
            }
            else if (token is NumberToken)
            {
                NumberToken t = (NumberToken)token;
                result.Append(t.Number);
            }
            else if (token is EntityReferenceToken)
            {
                EntityReferenceToken t = (EntityReferenceToken)token;
                result.Append(XmlEntity(t.Name));
            }
            else if (token is PunctuationToken)
            {
                PunctuationToken t = (PunctuationToken)token;
                result.Append(t.Character);
            }
            else if (token is CharacterEntityToken)
            {
                CharacterEntityToken t = (CharacterEntityToken)token;
                result.Append(t.Character);
            }
            else if (token is NewlineToken)
            {
                result.Append('\n');
            }
            else if (token is ScriptToken)
            {
                ScriptToken t = (ScriptToken)token;

                // Script element contents are often empty.
                if (t.Script.Length > 0)
                {
                    // NOTE: Removing any prior use of CDATA section in script, to avoid conflict.
                    string script = t.Script.Replace("<![CDATA[", "").Replace("]]>", "");
                    result.Append("/*<![CDATA[*/").Append(script).Append("/*]]>*/");
                }
            }
            else if (token is CDataToken)
            {
                CDataToken t = (CDataToken)token;
                result.Append("<![CDATA[").Append(t.Data).Append("]]>");
            }
            else if (token is CommentToken)
            {
                CommentToken t = (CommentToken)token;
                result.Append("<!--").Append(t.Comment).Append("-->");
            }
            else if (token is DoctypeToken)
            {
                // Ignore.
            }
            else if (token is ProcessingInstructionToken)
            {
                // Ignore.
            }
            else
            {
                Log.WarnFormat("Unexpected token! {0}", token);
            }

            token = parser.GetNextToken();
        }

        Log.Info(parser.GetCompletionReport());
    }
    catch (Exception ex)
    {
        // Any failure discards the partial output; callers receive null.
        Log.Error("EXCEPTION", ex);
        result = null;
    }

    return result == null ? null : result.ToString();
}