/// <summary>
/// Processes an end tag by moving the current insertion point back up the tree.
/// </summary>
/// <remarks>
/// When <c>_normalizeXml</c> is set, structural tags get dedicated handling:
/// html/head/body end tags are ignored, title restores the tree node, and table / dl
/// end tags (and table sub-elements) climb back relative to the tracked table or
/// definition-list state. Any tag not handled that way falls through to the generic
/// lookup via <c>GetParentNodeByName</c>.
/// </remarks>
/// <param name="tagName">Name of the tag being closed.</param>
private void TagEnd(string tagName)
{
    if (_normalizeXml)
    {
        HtmlTagType closedType = HtmlTags.GetHtmlTagType(tagName);
        switch (closedType)
        {
            // These end tags never change the insertion point.
            case HtmlTagType.Html:
            case HtmlTagType.Head:
            case HtmlTagType.Body:
                return;

            case HtmlTagType.Title:
                _currentNode = _currentTreeNode;
                return;

            case HtmlTagType.Table:
                if (_table != null)
                {
                    _currentTreeNode = _table.Table.Parent;
                    _currentNode = _currentTreeNode;
                    // Resume the enclosing table, if one was suspended.
                    _table = _tableStack.Count > 0 ? _tableStack.Pop() : null;
                }
                return;

            case HtmlTagType.DL:
                if (_definitionList != null)
                {
                    _currentTreeNode = _definitionList.Parent;
                    _currentNode = _currentTreeNode;
                    // Resume the enclosing definition list, if one was suspended.
                    _definitionList = _definitionListStack.Count > 0 ? _definitionListStack.Pop() : null;
                }
                return;

            // Table sub-elements only get special handling while a table is open;
            // otherwise they fall through to the generic lookup below.
            case HtmlTagType.THead:
            case HtmlTagType.TBody:
            case HtmlTagType.TFoot:
                if (_table == null)
                {
                    break;
                }
                _currentTreeNode = _table.Table;
                _currentNode = _currentTreeNode;
                _table.Body = null;
                return;

            case HtmlTagType.ColGroup:
                if (_table == null)
                {
                    break;
                }
                _currentTreeNode = _table.Table;
                _currentNode = _currentTreeNode;
                _table.ColGroup = null;
                return;

            case HtmlTagType.Col:
                if (_table == null)
                {
                    break;
                }
                if (_table.Col != null)
                {
                    _currentTreeNode = _table.Col.Parent;
                    _currentNode = _currentTreeNode;
                    _table.Col = null;
                }
                return;

            case HtmlTagType.TR:
                if (_table == null)
                {
                    break;
                }
                if (_table.Row != null)
                {
                    _currentTreeNode = _table.Row.Parent;
                    _currentNode = _currentTreeNode;
                    _table.Row = null;
                }
                return;

            case HtmlTagType.TH:
            case HtmlTagType.TD:
                if (_table == null)
                {
                    break;
                }
                if (_table.Data != null)
                {
                    _currentTreeNode = _table.Data.Parent;
                    _currentNode = _currentTreeNode;
                    _table.Data = null;
                }
                return;
        }
    }

    // Generic case: look the tag up from the current tree node and, when found,
    // continue from its parent. When not found the tree node stays where it is.
    XNode matchingNode = GetParentNodeByName(_currentTreeNode, tagName);
    if (matchingNode != null)
    {
        _currentTreeNode = matchingNode.Parent;
    }
    _currentNode = _currentTreeNode;
}
/// <summary>
/// Reads every node produced by <c>_htmlReader.Read()</c> and builds the corresponding
/// XML document through a fresh <c>XDocumentCreator</c>.
/// </summary>
/// <remarks>
/// Tag and attribute names are lower-cased with the invariant culture so that the
/// generated names (and the tag-type lookups based on them) do not depend on the
/// current thread culture.
/// </remarks>
/// <returns>The <c>XDocument</c> exposed by the creator once all nodes have been processed.</returns>
public XDocument CreateXml()
{
    // Note kept from the original author: HtmlReader_v4 does not manage ReadCommentInText.
    _xdCreator = new XDocumentCreator();
    InitXml();

    // Reset the per-conversion state.
    _tableStack = new Stack<HtmlTable_v3>();
    _table = null;
    _definitionListStack = new Stack<XElement>();
    _definitionList = null;
    _noTag = false;
    _body = false;
    _title = false;

    foreach (HtmlNode htmlNode in _htmlReader.Read())
    {
        if (htmlNode.Type == HtmlNodeType.Text || htmlNode.Type == HtmlNodeType.Comment)
        {
            if (!_generateXmlNodeOnly)
            {
                if (htmlNode.Type == HtmlNodeType.Text)
                {
                    AddText(_currentNode, ((HtmlNodeText)htmlNode).Text);
                }
                else if (_readCommentInText)
                {
                    // Comments are emitted as plain text when requested.
                    AddText(_currentNode, ((HtmlNodeComment)htmlNode).Comment);
                }
                else
                {
                    string comment = ((HtmlNodeComment)htmlNode).Comment;
                    comment = _commentCorrection.Replace(comment, "-");
                    // An XML comment must not end with '-', so pad it with a space.
                    if (comment.EndsWith("-", StringComparison.Ordinal))
                    {
                        comment += " ";
                    }
                    _xdCreator.AddComment(_currentNode, comment);
                }
            }
        }
        else if (htmlNode.Type == HtmlNodeType.Script)
        {
            // Script content is added as text even when _generateXmlNodeOnly is set.
            AddText(_currentNode, ((HtmlNodeScript)htmlNode).Script);
        }
        else if (htmlNode.Type == HtmlNodeType.DocumentType)
        {
            // The doctype is stored as an attribute on _htmlNode, not on the current node.
            _xdCreator.AddAttribute(_htmlNode, "doctype", ((HtmlNodeDocType)htmlNode).DocType);
        }
        else if (htmlNode.Type == HtmlNodeType.Property)
        {
            if (_generateXmlNodeOnly || _noTag)
            {
                continue;
            }

            HtmlNodeProperty htmlNodeProperty = (HtmlNodeProperty)htmlNode;
            try
            {
                string propertyName = htmlNodeProperty.Name;
                propertyName = _nameCorrection.Replace(propertyName, "");
                propertyName = propertyName.ToLowerInvariant();
                if (propertyName == "")
                {
                    propertyName = "__value";
                }

                // Change of 28/01/2014: hexadecimal value 0x03 is an invalid character.
                // Found in http://www.reseau-gesat.com/Gesat/Yvelines,78/Fontenay-le-Fleury,31443/esat-cotra,e1596/
                // (inside the content attribute of the "keywords" meta tag).
                string propertyValue = htmlNodeProperty.Value;
                if (propertyValue != null)
                {
                    propertyValue = propertyValue.Replace("\x03", "");
                }

                _xdCreator.AddAttribute(_currentNode, propertyName, propertyValue);
            }
            catch (Exception ex)
            {
                // Best effort: a bad attribute is reported and skipped, the conversion goes on.
                Trace.WriteLine($"error in HtmlToXml_v2.CreateXml() : line {htmlNode.Line} column {htmlNode.Column}");
                Trace.WriteLine(ex.Message);
            }
        }
        else if (htmlNode.Type == HtmlNodeType.OpenTag)
        {
            HtmlNodeOpenTag htmlNodeOpenTag = (HtmlNodeOpenTag)htmlNode;
            string tagName = htmlNodeOpenTag.Name.ToLowerInvariant();
            tagName = _nameCorrection.Replace(tagName, "_");
            if (tagName == "")
            {
                tagName = "_";
            }
            AddTagBegin(tagName);
        }
        else if (htmlNode.Type == HtmlNodeType.EndTag)
        {
            HtmlNodeEndTag htmlNodeEndTag = (HtmlNodeEndTag)htmlNode;
            string tagName = htmlNodeEndTag.Name.ToLowerInvariant();
            tagName = _nameCorrection.Replace(tagName, "_");
            if (tagName == "")
            {
                tagName = "_";
            }
            TagEnd(tagName);
        }
    }

    return _xdCreator.XDocument;
}
/// <summary>
/// Processes an opening tag: creates the matching element (unless the tag is one of the
/// structural tags absorbed by normalization) and updates the current insertion point.
/// </summary>
/// <remarks>
/// <c>_noTag</c> is reset on entry and set again whenever the tag produces no element
/// of its own (html, head, body, or a repeated title while normalizing).
/// </remarks>
/// <param name="tagName">Name of the tag being opened; also used as the element name.</param>
private void AddTagBegin(string tagName)
{
    _noTag = false;
    HtmlTagType tagType = HtmlTags.GetHtmlTagType(tagName);
    HtmlTag tag = HtmlTags.GetHtmlTag(tagType);

    if (_normalizeXml)
    {
        switch (tagType)
        {
            case HtmlTagType.Html:
            case HtmlTagType.Head:
                _noTag = true;
                return;

            case HtmlTagType.Body:
                _noTag = true;
                if (!_body)
                {
                    // First body tag: continue under the body node.
                    _body = true;
                    _currentTreeNode = _bodyNode;
                    _currentNode = _currentTreeNode;
                }
                return;

            case HtmlTagType.Title:
                if (_title)
                {
                    // Only the first title is taken into account.
                    _noTag = true;
                }
                else
                {
                    _title = true;
                    // _currentTreeNode is deliberately left untouched here.
                    _currentNode = _titleNode;
                }
                return;

            case HtmlTagType.Table:
                _currentTreeNode = _currentNode = _xdCreator.AddElement(_currentTreeNode, tagName);
                if (_table != null)
                {
                    // Nested table: suspend the enclosing one.
                    _tableStack.Push(_table);
                }
                _table = new HtmlTable_v3();
                _table.Table = _currentNode;
                return;
        }

        // Order matters: the table-category check runs before the DL test.
        if (TagBeginTableCategory(tag, tagName))
        {
            return;
        }

        if (tagType == HtmlTagType.DL)
        {
            _currentTreeNode = _currentNode = _xdCreator.AddElement(_currentTreeNode, tagName);
            if (_definitionList != null)
            {
                // Nested definition list: suspend the enclosing one.
                _definitionListStack.Push(_definitionList);
            }
            _definitionList = _currentNode;
            return;
        }

        if (TagBeginDefinitionListCategory(tag, tagName))
        {
            return;
        }

        // TODO (from the original author, to be re-checked): a p element without an end
        // tag is not handled; a "last p" marker would at least have to be cleared when
        // one of its parents is closed.
    }

    // Generic element: it becomes the new tree node only if it may have an end tag.
    _currentNode = _xdCreator.AddElement(_currentTreeNode, tagName);
    if (tag.EndBoundType != HtmlBoundType.Forbidden)
    {
        _currentTreeNode = _currentNode;
    }
}