// Handles a closing tag: moves gCurrentTreeNode / gCurrentNode back up the tree.
//   sTagName : name of the tag being closed, as supplied by the caller.
// In normalized mode (gbNormalizeXml) the structural tags and the table /
// definition-list tags are resolved from the tracked state (gTable,
// gDefinitionList); every other tag is resolved by name.
private void TagEnd(string sTagName)
{
    if (gbNormalizeXml)
    {
        HtmlTagType closedType = HtmlTags.GetHtmlTagType(sTagName);

        // closing html, head or body changes nothing in normalized mode
        if (closedType == HtmlTagType.Html || closedType == HtmlTagType.Head || closedType == HtmlTagType.Body)
            return;

        if (closedType == HtmlTagType.Title)
        {
            // leave the title node, go back to the current tree position
            gCurrentNode = gCurrentTreeNode;
            return;
        }

        if (closedType == HtmlTagType.Table)
        {
            if (gTable != null)
            {
                // continue after the table, then restore the enclosing table (if any)
                gCurrentTreeNode = GetParentXXNode(gTable.Table);
                gCurrentNode = gCurrentTreeNode;
                if (gTableStack.Count == 0)
                    gTable = null;
                else
                    gTable = gTableStack.Pop();
            }
            return;
        }

        if (closedType == HtmlTagType.DL)
        {
            if (gDefinitionList != null)
            {
                // same handling as for a table, using the definition-list stack
                gCurrentTreeNode = GetParentXXNode(gDefinitionList);
                gCurrentNode = gCurrentTreeNode;
                if (gDefinitionListStack.Count == 0)
                    gDefinitionList = null;
                else
                    gDefinitionList = gDefinitionListStack.Pop();
            }
            return;
        }

        // tags that are only handled specially while a table is open
        if (gTable != null)
        {
            if (closedType == HtmlTagType.THead || closedType == HtmlTagType.TBody || closedType == HtmlTagType.TFoot)
            {
                gCurrentTreeNode = gTable.Table;
                gCurrentNode = gCurrentTreeNode;
                gTable.Body = null;
                return;
            }

            if (closedType == HtmlTagType.ColGroup)
            {
                gCurrentTreeNode = gTable.Table;
                gCurrentNode = gCurrentTreeNode;
                gTable.ColGroup = null;
                return;
            }

            if (closedType == HtmlTagType.Col)
            {
                if (gTable.Col != null)
                {
                    gCurrentTreeNode = GetParentXXNode(gTable.Col);
                    gCurrentNode = gCurrentTreeNode;
                    gTable.Col = null;
                }
                return;
            }

            if (closedType == HtmlTagType.TR)
            {
                if (gTable.Row != null)
                {
                    gCurrentTreeNode = GetParentXXNode(gTable.Row);
                    gCurrentNode = gCurrentTreeNode;
                    gTable.Row = null;
                }
                return;
            }

            if (closedType == HtmlTagType.TH || closedType == HtmlTagType.TD)
            {
                if (gTable.Data != null)
                {
                    gCurrentTreeNode = GetParentXXNode(gTable.Data);
                    gCurrentNode = gCurrentTreeNode;
                    gTable.Data = null;
                }
                return;
            }
        }
    }

    // Generic case. GetParentXXNodeByName replaces a former inline loop that
    // walked up from gCurrentTreeNode (node = node.ParentNode) until a node
    // whose Name equals sTagName was found. When such a node exists the tree
    // position becomes its parent; otherwise the tree position is unchanged.
    XXXNode closedNode = GetParentXXNodeByName(gCurrentTreeNode, sTagName);
    if (closedNode != null)
        gCurrentTreeNode = GetParentXXNode(closedNode);
    gCurrentNode = gCurrentTreeNode;
}
// Handles an opening tag: creates the element and attaches it to the tree.
//   sTagName : tag name, already normalized by the caller (the in-method
//              character replacements were disabled in favour of the caller).
//   bTagEnd  : true when the tag is opened and closed at once; such a tag
//              never becomes the new tree position.
// Sets gbNoTag to true when the tag is skipped, which makes GenerateXml
// ignore the properties that follow it.
private void TagBegin(string sTagName, bool bTagEnd)
{
    gbNoTag = false;

    HtmlTagType kind = HtmlTags.GetHtmlTagType(sTagName);
    HtmlTag descriptor = HtmlTags.GetHtmlTag(kind);

    if (gbNormalizeXml)
    {
        switch (kind)
        {
            case HtmlTagType.Html:
            case HtmlTagType.Head:
                // these nodes already exist, the tag itself is dropped
                gbNoTag = true;
                return;

            case HtmlTagType.Body:
                gbNoTag = true;
                if (!gbBody)
                {
                    // first body tag: continue inside the pre-built body node
                    gbBody = true;
                    gCurrentTreeNode = gBodyNode;
                    gCurrentNode = gCurrentTreeNode;
                }
                return;

            case HtmlTagType.Title:
                if (gbTitle)
                {
                    // only the first title is used
                    gbNoTag = true;
                }
                else if (!bTagEnd)
                {
                    gbTitle = true;
                    gCurrentNode = gTitleNode;
                }
                return;
        }

        // the first tag that is not a head-category tag starts the body
        if (!gbBody && descriptor.TagCategory != HtmlTagCategory.Head)
        {
            gbBody = true;
            gCurrentTreeNode = gBodyNode;
            gCurrentNode = gCurrentTreeNode;
        }
    }

    gCurrentNode = CreateElement(sTagName);

    if (gbNormalizeXml)
    {
        bool opensContainer = !bTagEnd;

        if (opensContainer && kind == HtmlTagType.Table)
        {
            // remember the enclosing table before starting a nested one
            if (gTable != null)
                gTableStack.Push(gTable);
            gTable = new HtmlTable();
            gTable.Table = gCurrentNode;
            AddElement(gCurrentTreeNode, gCurrentNode);
            gCurrentTreeNode = gCurrentNode;
            return;
        }

        // when this returns true the tag needs no further handling here
        if (TagBeginTableCategory(descriptor, bTagEnd))
            return;

        if (opensContainer && kind == HtmlTagType.DL)
        {
            // remember the enclosing definition list before starting a nested one
            if (gDefinitionList != null)
                gDefinitionListStack.Push(gDefinitionList);
            gDefinitionList = gCurrentNode;
            AddElement(gCurrentTreeNode, gCurrentNode);
            gCurrentTreeNode = gCurrentNode;
            return;
        }

        // when this returns true the tag needs no further handling here
        if (TagBeginDefinitionListCategory(descriptor, bTagEnd))
            return;

        // $$pb to be re-checked: handling of a <p> without a closing </p>
        // (tracking the last opened <p> in gLastPNode) is disabled. If it is
        // restored, gLastPNode must at least be reset when one of its parents
        // is closed.
    }

    AddElement(gCurrentTreeNode, gCurrentNode);

    // descend into the new element unless it cannot have an end tag
    if (!bTagEnd && descriptor.EndBoundType != HtmlBoundType.Forbidden)
        gCurrentTreeNode = gCurrentNode;
}
// Reads every item from gHTMLReader and builds the XML tree through
// AddText / AddComment / AddAttribute / TagBegin / TagEnd.
// The reader is always closed, even when the conversion fails.
//
// When gbNormalizeXml is true:
//  - the html, head, title and body tags are created automatically
//  - html, head, title and body tags met in the source are not taken into account
//  - only title and meta tags go into the head part, all other tags go into the body part
//  - a meta tag placed after the beginning of the body part stays in the body part
//  - only the first title tag is used and placed in the head part, the others are ignored
private void GenerateXml()
{
    try
    {
        InitXml();
        gTableStack = new Stack<HtmlTable>();
        gTable = null;
        gDefinitionListStack = new Stack<XXXNode>();
        gDefinitionList = null;
        gbNoTag = false;
        gbBody = false;
        gbTitle = false;

        while (gHTMLReader.Read())
        {
            if (gHTMLReader.IsText || gHTMLReader.IsComment)
            {
                // the first real text (not a separator) starts the body part
                if (gHTMLReader.IsText && !gHTMLReader.IsTextSeparator && !gbBody)
                {
                    gbBody = true;
                    gCurrentNode = gCurrentTreeNode = gBodyNode;
                }

                if (!gbGenerateXmlNodeOnly)
                {
                    if (gbReadCommentInText || gHTMLReader.IsText)
                    {
                        AddText(gCurrentNode, gHTMLReader.Value);
                    }
                    else
                    {
                        // adjust the comment text with gCommentCorrection and
                        // make sure it does not end with '-'
                        string s = gHTMLReader.Value;
                        s = gCommentCorrection.Replace(s, "-");
                        if (s.EndsWith("-")) s += " ";
                        AddComment(gCurrentNode, s);
                    }
                }
            }
            else if (gHTMLReader.IsDocType)
            {
                AddAttribute(gHtmlNode, "doctype", gHTMLReader.DocType);
            }
            else if (gHTMLReader.IsProperty)
            {
                // properties of a skipped tag (gbNoTag) are skipped too
                if (gbGenerateXmlNodeOnly || gbNoTag) continue;
                try
                {
                    // invariant lower-casing: HTML names must not depend on the
                    // current culture (e.g. Turkish dotless i)
                    string sPropertyName = gReplace.Replace(gHTMLReader.PropertyName, "");
                    sPropertyName = sPropertyName.ToLowerInvariant();
                    if (sPropertyName == "") sPropertyName = "__value";

                    // change of 28/01/2014: hexadecimal value 0x03 is an invalid character,
                    // found in a meta "keywords" content of
                    // http://www.reseau-gesat.com/Gesat/Yvelines,78/Fontenay-le-Fleury,31443/esat-cotra,e1596/
                    string propertyValue = gHTMLReader.PropertyValue;
                    if (propertyValue != null) propertyValue = propertyValue.Replace("\x03", "");
                    AddAttribute(gCurrentNode, sPropertyName, propertyValue);

                    if (gHTMLReader.IsMarkBeginEnd)
                    {
                        // the element was created by TagBegin under its normalized name,
                        // so it must be closed under the same normalized name
                        string sClosedTagName = gReplace.Replace(gHTMLReader.MarkName.ToLowerInvariant(), "_");
                        if (sClosedTagName == "") sClosedTagName = "_";
                        TagEnd(sClosedTagName);
                    }
                }
                catch (global::System.Exception ex)
                {
                    // best effort: a rejected attribute must not abort the conversion,
                    // but the error is reported instead of being silently dropped
                    global::System.Diagnostics.Trace.WriteLine("error in GenerateXml() : " + ex.Message);
                }
            }
            else if (gHTMLReader.IsMarkBeginEnd || gHTMLReader.IsMarkBegin || gHTMLReader.IsMarkEnd)
            {
                // one normalization shared by the three kinds of tag
                string sTagName = gReplace.Replace(gHTMLReader.MarkName.ToLowerInvariant(), "_");
                if (sTagName == "") sTagName = "_";

                if (gHTMLReader.IsMarkBeginEnd)
                    TagBegin(sTagName, true);
                else if (gHTMLReader.IsMarkBegin)
                    TagBegin(sTagName, false);
                else
                    TagEnd(sTagName);
            }
        }
    }
    finally
    {
        gHTMLReader.Close();
    }
}
// Handles a closing tag: moves _currentTreeNode / _currentNode back up the tree.
//   sTagName : name of the tag being closed, as supplied by the caller.
// In normalized mode (_normalizeXml) the structural tags and the table /
// definition-list tags are resolved from the tracked state (_table,
// _definitionList); every other tag is resolved by name.
private void TagEnd(string sTagName)
{
    if (_normalizeXml)
    {
        HtmlTagType closedType = HtmlTags.GetHtmlTagType(sTagName);

        // closing html, head or body changes nothing in normalized mode
        if (closedType == HtmlTagType.Html || closedType == HtmlTagType.Head || closedType == HtmlTagType.Body)
            return;

        if (closedType == HtmlTagType.Title)
        {
            // leave the title node, go back to the current tree position
            _currentNode = _currentTreeNode;
            return;
        }

        if (closedType == HtmlTagType.Table)
        {
            if (_table != null)
            {
                // continue after the table, then restore the enclosing table (if any)
                _currentTreeNode = GetParentXXNode(_table.Table);
                _currentNode = _currentTreeNode;
                if (_tableStack.Count == 0)
                    _table = null;
                else
                    _table = _tableStack.Pop();
            }
            return;
        }

        if (closedType == HtmlTagType.DL)
        {
            if (_definitionList != null)
            {
                // same handling as for a table, using the definition-list stack
                _currentTreeNode = GetParentXXNode(_definitionList);
                _currentNode = _currentTreeNode;
                if (_definitionListStack.Count == 0)
                    _definitionList = null;
                else
                    _definitionList = _definitionListStack.Pop();
            }
            return;
        }

        // tags that are only handled specially while a table is open
        if (_table != null)
        {
            if (closedType == HtmlTagType.THead || closedType == HtmlTagType.TBody || closedType == HtmlTagType.TFoot)
            {
                _currentTreeNode = _table.Table;
                _currentNode = _currentTreeNode;
                _table.Body = null;
                return;
            }

            if (closedType == HtmlTagType.ColGroup)
            {
                _currentTreeNode = _table.Table;
                _currentNode = _currentTreeNode;
                _table.ColGroup = null;
                return;
            }

            if (closedType == HtmlTagType.Col)
            {
                if (_table.Col != null)
                {
                    _currentTreeNode = GetParentXXNode(_table.Col);
                    _currentNode = _currentTreeNode;
                    _table.Col = null;
                }
                return;
            }

            if (closedType == HtmlTagType.TR)
            {
                if (_table.Row != null)
                {
                    _currentTreeNode = GetParentXXNode(_table.Row);
                    _currentNode = _currentTreeNode;
                    _table.Row = null;
                }
                return;
            }

            if (closedType == HtmlTagType.TH || closedType == HtmlTagType.TD)
            {
                if (_table.Data != null)
                {
                    _currentTreeNode = GetParentXXNode(_table.Data);
                    _currentNode = _currentTreeNode;
                    _table.Data = null;
                }
                return;
            }
        }
    }

    // Generic case: presumably GetParentXXNodeByName finds the nearest
    // enclosing node named sTagName, starting at _currentTreeNode (TODO
    // confirm). When a node is returned the tree position becomes its parent;
    // otherwise the tree position is unchanged.
    XXXNode closedNode = GetParentXXNodeByName(_currentTreeNode, sTagName);
    if (closedNode != null)
        _currentTreeNode = GetParentXXNode(closedNode);
    _currentNode = _currentTreeNode;
}
// Handles an opening tag: creates the element and attaches it to the tree.
//   sTagName : tag name, already normalized by the caller (the in-method
//              character replacements were disabled in favour of the caller).
//   bTagEnd  : true when the tag is opened and closed at once; such a tag
//              never becomes the new tree position.
// Sets _noTag to true when the tag is skipped, which makes GenerateXml
// ignore the properties that follow it.
private void TagBegin(string sTagName, bool bTagEnd)
{
    _noTag = false;

    HtmlTagType kind = HtmlTags.GetHtmlTagType(sTagName);
    HtmlTag descriptor = HtmlTags.GetHtmlTag(kind);

    if (_normalizeXml)
    {
        switch (kind)
        {
            case HtmlTagType.Html:
            case HtmlTagType.Head:
                // these nodes already exist, the tag itself is dropped
                _noTag = true;
                return;

            case HtmlTagType.Body:
                _noTag = true;
                if (!_body)
                {
                    // first body tag: continue inside the pre-built body node
                    _body = true;
                    _currentTreeNode = _bodyNode;
                    _currentNode = _currentTreeNode;
                }
                return;

            case HtmlTagType.Title:
                if (_title)
                {
                    // only the first title is used
                    _noTag = true;
                }
                else if (!bTagEnd)
                {
                    _title = true;
                    _currentNode = _titleNode;
                }
                return;
        }

        // $$pb change of 11/01/2015: the automatic switch to the body node on
        // the first tag that is not a head-category tag is disabled:
        //if (!_body && tag.TagCategory != HtmlTagCategory.Head)
        //{
        //    _body = true;
        //    _currentNode = _currentTreeNode = _bodyNode;
        //}
    }

    _currentNode = CreateElement(sTagName);

    if (_normalizeXml)
    {
        bool opensContainer = !bTagEnd;

        if (opensContainer && kind == HtmlTagType.Table)
        {
            // remember the enclosing table before starting a nested one
            if (_table != null)
                _tableStack.Push(_table);
            _table = new HtmlTable();
            _table.Table = _currentNode;
            AddElement(_currentTreeNode, _currentNode);
            _currentTreeNode = _currentNode;
            return;
        }

        // when this returns true the tag needs no further handling here
        if (TagBeginTableCategory(descriptor, bTagEnd))
            return;

        if (opensContainer && kind == HtmlTagType.DL)
        {
            // remember the enclosing definition list before starting a nested one
            if (_definitionList != null)
                _definitionListStack.Push(_definitionList);
            _definitionList = _currentNode;
            AddElement(_currentTreeNode, _currentNode);
            _currentTreeNode = _currentNode;
            return;
        }

        // when this returns true the tag needs no further handling here
        if (TagBeginDefinitionListCategory(descriptor, bTagEnd))
            return;

        // $$pb to be re-checked: handling of a <p> without a closing </p>
        // (tracking the last opened <p>) is disabled. If it is restored, the
        // tracked <p> must at least be reset when one of its parents is closed.
    }

    AddElement(_currentTreeNode, _currentNode);

    // descend into the new element unless it cannot have an end tag
    if (!bTagEnd && descriptor.EndBoundType != HtmlBoundType.Forbidden)
        _currentTreeNode = _currentNode;
}
// Reads every item from _htmlReader and builds the XML tree through
// AddText / AddComment / AddAttribute / TagBegin / TagEnd.
// The reader is always closed, even when the conversion fails.
//
// Original notes for _normalizeXml = true:
//  - the html, head, title and body tags are created automatically
//  - html, head, title and body tags met in the source are not taken into account
//  - only title and meta tags go into the head part, all other tags go into the body part
//  - a meta tag placed after the beginning of the body part stays in the body part
//  - only the first title tag is used and placed in the head part, the others are ignored
// NOTE(review): since the change of 11/01/2015 text no longer switches the
// current node to the body node here, so the notes about head/body placement
// may be out of date - confirm.
private void GenerateXml()
{
    try
    {
        InitXml();
        _tableStack = new Stack<HtmlTable>();
        _table = null;
        _definitionListStack = new Stack<XXXNode>();
        _definitionList = null;
        _noTag = false;
        _body = false;
        _title = false;

        while (_htmlReader.Read())
        {
            if (_htmlReader.IsText || _htmlReader.IsComment)
            {
                // $$pb change of 11/01/2015: the first real text no longer starts the body part
                //if (_htmlReader.IsText && !_htmlReader.IsTextSeparator && !_htmlReader.IsScript && !_body)
                //{
                //    _body = true;
                //    _currentNode = _currentTreeNode = _bodyNode;
                //}
                if (!_generateXmlNodeOnly)
                {
                    if (_readCommentInText || _htmlReader.IsText)
                    {
                        AddText(_currentNode, _htmlReader.Value);
                    }
                    else
                    {
                        // adjust the comment text with _commentCorrection and
                        // make sure it does not end with '-'
                        string s = _htmlReader.Value;
                        s = _commentCorrection.Replace(s, "-");
                        if (s.EndsWith("-")) s += " ";
                        AddComment(_currentNode, s);
                    }
                }
            }
            else if (_htmlReader.IsDocType)
            {
                AddAttribute(_htmlNode, "doctype", _htmlReader.DocType);
            }
            else if (_htmlReader.IsProperty)
            {
                // properties of a skipped tag (_noTag) are skipped too
                if (_generateXmlNodeOnly || _noTag) continue;
                try
                {
                    // invariant lower-casing: HTML names must not depend on the
                    // current culture (e.g. Turkish dotless i)
                    string sPropertyName = _replace.Replace(_htmlReader.PropertyName, "");
                    sPropertyName = sPropertyName.ToLowerInvariant();
                    if (sPropertyName == "") sPropertyName = "__value";

                    // change of 28/01/2014: hexadecimal value 0x03 is an invalid character,
                    // found in a meta "keywords" content of
                    // http://www.reseau-gesat.com/Gesat/Yvelines,78/Fontenay-le-Fleury,31443/esat-cotra,e1596/
                    string propertyValue = _htmlReader.PropertyValue;
                    if (propertyValue != null) propertyValue = propertyValue.Replace("\x03", "");
                    AddAttribute(_currentNode, sPropertyName, propertyValue);

                    if (_htmlReader.IsMarkBeginEnd)
                    {
                        // the element was created by TagBegin under its normalized name,
                        // so it must be closed under the same normalized name
                        string sClosedTagName = _replace.Replace(_htmlReader.MarkName.ToLowerInvariant(), "_");
                        if (sClosedTagName == "") sClosedTagName = "_";
                        TagEnd(sClosedTagName);
                    }
                }
                catch (Exception ex)
                {
                    // best effort: a rejected attribute is reported and the conversion goes on
                    Trace.WriteLine("error in HtmlToXml.GenerateXml() : line {0} column {1}", _htmlReader.Line, _htmlReader.Column);
                    Trace.WriteLine(ex.Message);
                }
            }
            else if (_htmlReader.IsMarkBeginEnd || _htmlReader.IsMarkBegin || _htmlReader.IsMarkEnd)
            {
                // one normalization shared by the three kinds of tag
                string sTagName = _replace.Replace(_htmlReader.MarkName.ToLowerInvariant(), "_");
                if (sTagName == "") sTagName = "_";

                if (_htmlReader.IsMarkBeginEnd)
                    TagBegin(sTagName, true);
                else if (_htmlReader.IsMarkBegin)
                    TagBegin(sTagName, false);
                else
                    TagEnd(sTagName);
            }
        }
    }
    finally
    {
        _htmlReader.Close();
    }
}