/**
 * Reports the current tag to the document handler and keeps the nesting counter in step.
 * @param start if true the tag has just been opened: the nesting counter is incremented and
 *              StartElement is fired with the current tag and attributes; if false the tag is
 *              being closed: the nesting counter is decremented and EndElement is fired.
 */
private void ProcessTag(bool start) {
    // Closing tag: handle it first and leave.
    if (!start) {
        nested--;
        doc.EndElement(tag);
        return;
    }

    // Opening tag.
    nested++;
    doc.StartElement(tag, attributes);
}
/**
 * Reports the current tag to the document handler and keeps the nesting counter in step.
 * @param start if true the tag has just been opened: the nesting counter is incremented and
 *              StartElement is fired with the current tag and attributes; if false the tag is
 *              being closed: the nesting counter is decremented and EndElement is fired.
 */
private void ProcessTag(bool start) {
    if (start) {
        nested++;
        doc.StartElement(tag, attributes);
        return;
    }

    // Closing a tag that the new-line handler classifies as a new-line tag clears
    // the nowhite flag; per the original note, white space following new lines
    // needs to be ignored in HTML.
    bool closesNewLineTag = newLineHandler.IsNewLineTag(tag);
    if (closesNewLineTag) {
        nowhite = false;
    }

    nested--;
    doc.EndElement(tag);
}
/**
 * Parses the XML document firing the events to the handler.
 * @param doc the document handler; receives StartDocument, StartElement, Text,
 *            EndElement and EndDocument
 * @param comment receives the text of each comment; may be null, in which case
 *                comments are discarded
 * @param reader the document. The encoding is already resolved. The reader is not closed
 * @param html if true the input is treated leniently as HTML: parsing starts in text
 *             mode, tag and attribute names are lower-cased, unquoted and value-less
 *             attributes are accepted, and parsing runs until the end of the input
 *             instead of stopping when the root element closes
 * @throws IOException on error
 */
public static void Parse(ISimpleXMLDocHandler doc, ISimpleXMLDocHandlerComment comment, TextReader reader, bool html) {
    Stack st = new Stack();
    int depth = 0;
    int mode = PRE;
    int c = 0;
    int quotec = '"';
    StringBuilder sb = new StringBuilder();
    StringBuilder etag = new StringBuilder();
    String tagName = null;
    String lvalue = null;
    String rvalue = null;
    Hashtable attrs = null;
    doc.StartDocument();
    int line = 1, col = 0;
    bool eol = false;
    if (html) {
        mode = TEXT;
    }
    int pushBack = -1;
    while (true) {
        // A pushed-back character has already been normalized and counted when it
        // was first read, so it must bypass the end-of-line handling below.
        bool replayed = pushBack != -1;
        if (replayed) {
            c = pushBack;
            pushBack = -1;
        } else {
            c = reader.Read();
        }
        if (c == -1) {
            break;
        }

        // We need to map \r, \r\n, and \n to \n
        // See XML spec section 2.11
        if (!replayed) {
            if (c == '\n' && eol) {
                // The LF of a CRLF pair: the CR was already reported as \n.
                eol = false;
                continue;
            }
            eol = false;
            if (c == '\n') {
                line++;
                col = 0;
            } else if (c == '\r') {
                eol = true;
                c = '\n';
                line++;
                col = 0;
            } else {
                col++;
            }
        }

        if (mode == DONE) {
            doc.EndDocument();
            return;

        // We are between tags collecting text.
        } else if (mode == TEXT) {
            if (c == '<') {
                st.Push(mode);
                mode = START_TAG;
                if (sb.Length > 0) {
                    doc.Text(sb.ToString());
                    sb.Length = 0;
                }
            } else if (c == '&') {
                st.Push(mode);
                mode = ENTITY;
                etag.Length = 0;
            } else {
                sb.Append((char)c);
            }

        // we are processing a closing tag: e.g. </foo>
        } else if (mode == CLOSE_TAG) {
            if (c == '>') {
                mode = PopMode(st);
                tagName = sb.ToString();
                if (html) {
                    tagName = tagName.ToLower(CultureInfo.InvariantCulture);
                }
                sb.Length = 0;
                depth--;
                if (!html && depth == 0) {
                    mode = DONE;
                }
                doc.EndElement(tagName);
            } else {
                if (!char.IsWhiteSpace((char)c)) {
                    sb.Append((char)c);
                }
            }

        // we are processing CDATA
        } else if (mode == CDATA) {
            if (c == '>' && EndsWithPair(sb, ']')) {
                sb.Length = sb.Length - 2;
                doc.Text(sb.ToString());
                sb.Length = 0;
                mode = PopMode(st);
            } else {
                sb.Append((char)c);
            }

        // we are processing a comment. We are inside
        // the <!-- .... --> looking for the -->.
        } else if (mode == COMMENT) {
            if (c == '>' && EndsWithPair(sb, '-')) {
                if (comment != null) {
                    sb.Length = sb.Length - 2;
                    comment.Comment(sb.ToString());
                }
                sb.Length = 0;
                mode = PopMode(st);
            } else {
                sb.Append((char)c);
            }

        // We are outside the root tag element
        } else if (mode == PRE) {
            if (c == '<') {
                mode = TEXT;
                st.Push(mode);
                mode = START_TAG;
            }

        // We are inside one of these <? ... ?>
        // or one of these <!DOCTYPE ... >
        } else if (mode == DOCTYPE) {
            if (c == '>') {
                mode = PopMode(st);
                if (mode == TEXT) {
                    mode = PRE;
                }
            }

        // we have just seen a < and
        // are wondering what we are looking at
        // <foo>, </foo>, <!-- ... --->, etc.
        } else if (mode == START_TAG) {
            mode = PopMode(st);
            if (c == '/') {
                st.Push(mode);
                mode = CLOSE_TAG;
            } else if (c == '?') {
                mode = DOCTYPE;
            } else {
                st.Push(mode);
                mode = OPEN_TAG;
                tagName = null;
                attrs = new Hashtable();
                sb.Append((char)c);
            }

        // we are processing an entity, e.g. &lt;, &#187;, etc.
        } else if (mode == ENTITY) {
            if (c == ';') {
                mode = PopMode(st);
                String cent = etag.ToString();
                etag.Length = 0;
                if (cent.StartsWith("#", StringComparison.Ordinal)) {
                    AppendNumericEntity(sb, cent);
                } else {
                    char ce = EntitiesToUnicode.DecodeEntity(cent);
                    if (ce == '\0') {
                        sb.Append('&').Append(cent).Append(';');
                    } else {
                        sb.Append(ce);
                    }
                }
            } else if ((c != '#' && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) || etag.Length >= 7) {
                // Not an entity after all: emit what was collected literally and
                // reprocess the current character in the mode we came from.
                mode = PopMode(st);
                pushBack = c;
                sb.Append('&').Append(etag.ToString());
                etag.Length = 0;
            } else {
                etag.Append((char)c);
            }

        // we have just seen something like this:
        // <foo a="b"/
        // and are looking for the final >.
        } else if (mode == SINGLE_TAG) {
            if (tagName == null) {
                tagName = sb.ToString();
            }
            if (html) {
                tagName = tagName.ToLower(CultureInfo.InvariantCulture);
            }
            if (c != '>') {
                Exc("Expected > for tag: <" + tagName + "/>", line, col);
            }
            doc.StartElement(tagName, attrs);
            doc.EndElement(tagName);
            if (!html && depth == 0) {
                // An empty root element ends the XML document.
                doc.EndDocument();
                return;
            }
            sb.Length = 0;
            attrs = new Hashtable();
            tagName = null;
            mode = PopMode(st);

        // we are processing something
        // like this <foo ... >. It could
        // still be a <!-- ... --> or something.
        } else if (mode == OPEN_TAG) {
            if (c == '>') {
                if (tagName == null) {
                    tagName = sb.ToString();
                }
                if (html) {
                    tagName = tagName.ToLower(CultureInfo.InvariantCulture);
                }
                sb.Length = 0;
                depth++;
                doc.StartElement(tagName, attrs);
                tagName = null;
                attrs = new Hashtable();
                mode = PopMode(st);
            } else if (c == '/') {
                mode = SINGLE_TAG;
            } else if (c == '-' && sb.ToString().Equals("!-")) {
                mode = COMMENT;
                sb.Length = 0;
            } else if (c == '[' && sb.ToString().Equals("![CDATA")) {
                mode = CDATA;
                sb.Length = 0;
            } else if (c == 'E' && sb.ToString().Equals("!DOCTYP")) {
                sb.Length = 0;
                mode = DOCTYPE;
            } else if (char.IsWhiteSpace((char)c)) {
                tagName = sb.ToString();
                if (html) {
                    tagName = tagName.ToLower(CultureInfo.InvariantCulture);
                }
                sb.Length = 0;
                mode = IN_TAG;
            } else {
                sb.Append((char)c);
            }

        // We are processing the quoted right-hand side
        // of an element's attribute. In HTML mode a quote
        // character of ' ' marks an unquoted value.
        } else if (mode == QUOTE) {
            if (html && quotec == ' ' && c == '>') {
                rvalue = sb.ToString();
                sb.Length = 0;
                attrs[lvalue] = rvalue;
                mode = PopMode(st);
                doc.StartElement(tagName, attrs);
                depth++;
                tagName = null;
                attrs = new Hashtable();
            } else if (html && quotec == ' ' && char.IsWhiteSpace((char)c)) {
                rvalue = sb.ToString();
                sb.Length = 0;
                attrs[lvalue] = rvalue;
                mode = IN_TAG;
            } else if (html && quotec == ' ') {
                sb.Append((char)c);
            } else if (c == quotec) {
                rvalue = sb.ToString();
                sb.Length = 0;
                attrs[lvalue] = rvalue;
                mode = IN_TAG;

            // See section the XML spec, section 3.3.3
            // on normalization processing.
            } else if (" \r\n\u0009".IndexOf((char)c) >= 0) {
                sb.Append(' ');
            } else if (c == '&') {
                st.Push(mode);
                mode = ENTITY;
                etag.Length = 0;
            } else {
                sb.Append((char)c);
            }

        // We have seen the '=' of an attribute and are looking for its value.
        } else if (mode == ATTRIBUTE_RVALUE) {
            if (c == '"' || c == '\'') {
                quotec = c;
                mode = QUOTE;
            } else if (char.IsWhiteSpace((char)c)) {
                // skip white space before the value
            } else if (html && c == '>') {
                attrs[lvalue] = sb.ToString();
                sb.Length = 0;
                mode = PopMode(st);
                doc.StartElement(tagName, attrs);
                depth++;
                tagName = null;
                attrs = new Hashtable();
            } else if (html) {
                // Unquoted HTML attribute value: collect it in QUOTE mode
                // with ' ' as the pseudo quote character.
                sb.Append((char)c);
                quotec = ' ';
                mode = QUOTE;
            } else {
                Exc("Error in attribute processing", line, col);
            }

        // We are collecting an attribute name.
        } else if (mode == ATTRIBUTE_LVALUE) {
            if (char.IsWhiteSpace((char)c)) {
                lvalue = sb.ToString();
                if (html) {
                    lvalue = lvalue.ToLower(CultureInfo.InvariantCulture);
                }
                sb.Length = 0;
                mode = ATTRIBUTE_EQUAL;
            } else if (c == '=') {
                lvalue = sb.ToString();
                if (html) {
                    lvalue = lvalue.ToLower(CultureInfo.InvariantCulture);
                }
                sb.Length = 0;
                mode = ATTRIBUTE_RVALUE;
            } else if (html && c == '>') {
                sb.Length = 0;
                mode = PopMode(st);
                doc.StartElement(tagName, attrs);
                depth++;
                tagName = null;
                attrs = new Hashtable();
            } else {
                sb.Append((char)c);
            }

        // We have an attribute name and are looking for the '='.
        } else if (mode == ATTRIBUTE_EQUAL) {
            if (c == '=') {
                mode = ATTRIBUTE_RVALUE;
            } else if (char.IsWhiteSpace((char)c)) {
                // skip white space before the '='
            } else if (html && c == '>') {
                sb.Length = 0;
                mode = PopMode(st);
                doc.StartElement(tagName, attrs);
                depth++;
                tagName = null;
                attrs = new Hashtable();
            } else if (html && c == '/') {
                sb.Length = 0;
                mode = SINGLE_TAG;
            } else if (html) {
                // The previous attribute had no value; start the next name.
                sb.Length = 0;
                sb.Append((char)c);
                mode = ATTRIBUTE_LVALUE;
            } else {
                Exc("Error in attribute processing.", line, col);
            }

        // We are inside a tag, after its name, between attributes.
        } else if (mode == IN_TAG) {
            if (c == '>') {
                mode = PopMode(st);
                doc.StartElement(tagName, attrs);
                depth++;
                tagName = null;
                attrs = new Hashtable();
            } else if (c == '/') {
                mode = SINGLE_TAG;
            } else if (char.IsWhiteSpace((char)c)) {
                // skip white space between attributes
            } else {
                mode = ATTRIBUTE_LVALUE;
                sb.Append((char)c);
            }
        }
    }

    if (html || mode == DONE) {
        if (html && mode == TEXT) {
            doc.Text(sb.ToString());
        }
        doc.EndDocument();
    } else {
        Exc("missing end tag", line, col);
    }
}

/**
 * Tells whether the last two characters of the buffer are both the given character.
 * Compares characters directly, so no string is allocated and no culture rules apply.
 */
private static bool EndsWithPair(StringBuilder sb, char ch) {
    int len = sb.Length;
    return len >= 2 && sb[len - 1] == ch && sb[len - 2] == ch;
}

/**
 * Appends the character for a numeric character reference ("#NNN" or "#xHHH", without
 * the surrounding '&' and ';') to the buffer. Code points above U+FFFF are appended as
 * a surrogate pair; anything that is not a number in the range 0..0x10FFFF is appended
 * literally as "&" + cent + ";".
 */
private static void AppendNumericEntity(StringBuilder sb, String cent) {
    bool hex = cent.StartsWith("#x", StringComparison.Ordinal);
    String digits = cent.Substring(hex ? 2 : 1);
    NumberStyles style = hex ? NumberStyles.AllowHexSpecifier : NumberStyles.Integer;
    int code;
    bool parsed = int.TryParse(digits, style, CultureInfo.InvariantCulture, out code);
    if (parsed && code >= 0 && code <= 0xFFFF) {
        sb.Append((char)code);
    } else if (parsed && code > 0xFFFF && code <= 0x10FFFF) {
        sb.Append(char.ConvertFromUtf32(code));
    } else {
        sb.Append('&').Append(cent).Append(';');
    }
}