Example #1
0
        /// <summary>
        /// Handles an HTML end tag by moving the current insertion point back up the XML tree.
        /// </summary>
        /// <remarks>
        /// When <c>_normalizeXml</c> is set, html/head/body end tags are ignored, a title end tag
        /// restores the current node to the current tree node, and table / definition-list related
        /// tags are closed through the tracked <c>_table</c> and <c>_definitionList</c> state.
        /// Every other tag is closed by looking up a node with <c>GetParentNodeByName</c> and
        /// continuing from that node's parent.
        /// </remarks>
        /// <param name="tagName">Name of the tag being closed.</param>
        private void TagEnd(string tagName)
        {
            if (_normalizeXml)
            {
                HtmlTagType closingType = HtmlTags.GetHtmlTagType(tagName);

                // End tags of the document skeleton never change the insertion point.
                if (closingType == HtmlTagType.Html
                    || closingType == HtmlTagType.Head
                    || closingType == HtmlTagType.Body)
                {
                    return;
                }

                if (closingType == HtmlTagType.Title)
                {
                    _currentNode = _currentTreeNode;
                    return;
                }

                if (closingType == HtmlTagType.Table)
                {
                    if (_table != null)
                    {
                        // Continue after the table element, then resume the enclosing table (if any).
                        _currentTreeNode = _table.Table.Parent;
                        _currentNode     = _currentTreeNode;
                        _table           = _tableStack.Count != 0 ? _tableStack.Pop() : null;
                    }
                    return;
                }

                if (closingType == HtmlTagType.DL)
                {
                    if (_definitionList != null)
                    {
                        // Continue after the list element, then resume the enclosing list (if any).
                        _currentTreeNode = _definitionList.Parent;
                        _currentNode     = _currentTreeNode;
                        _definitionList  = _definitionListStack.Count != 0 ? _definitionListStack.Pop() : null;
                    }
                    return;
                }

                // Table sub-structure is only tracked while a table is open.
                if (_table != null)
                {
                    if (closingType == HtmlTagType.THead
                        || closingType == HtmlTagType.TBody
                        || closingType == HtmlTagType.TFoot)
                    {
                        _currentTreeNode = _table.Table;
                        _currentNode     = _currentTreeNode;
                        _table.Body      = null;
                        return;
                    }

                    if (closingType == HtmlTagType.ColGroup)
                    {
                        _currentTreeNode = _table.Table;
                        _currentNode     = _currentTreeNode;
                        _table.ColGroup  = null;
                        return;
                    }

                    if (closingType == HtmlTagType.Col)
                    {
                        if (_table.Col != null)
                        {
                            _currentTreeNode = _table.Col.Parent;
                            _currentNode     = _currentTreeNode;
                            _table.Col       = null;
                        }
                        return;
                    }

                    if (closingType == HtmlTagType.TR)
                    {
                        if (_table.Row != null)
                        {
                            _currentTreeNode = _table.Row.Parent;
                            _currentNode     = _currentTreeNode;
                            _table.Row       = null;
                        }
                        return;
                    }

                    if (closingType == HtmlTagType.TH || closingType == HtmlTagType.TD)
                    {
                        if (_table.Data != null)
                        {
                            _currentTreeNode = _table.Data.Parent;
                            _currentNode     = _currentTreeNode;
                            _table.Data      = null;
                        }
                        return;
                    }
                }
            }

            // Generic case: when a matching node is found, continue from its parent;
            // otherwise the tree position is left unchanged.
            XNode matchingNode = GetParentNodeByName(_currentTreeNode, tagName);
            if (matchingNode != null)
            {
                _currentTreeNode = matchingNode.Parent;
            }

            _currentNode = _currentTreeNode;
        }
Example #2
0
        /// <summary>
        /// Reads every node produced by the HTML reader and builds the corresponding XML document.
        /// </summary>
        /// <remarks>
        /// Tag and attribute names are lower-cased with the invariant culture so the generated XML
        /// does not depend on the current thread culture (for example the Turkish dotless i).
        /// Note: HtmlReader_v4 does not manage ReadCommentInText, so that option is applied here.
        /// </remarks>
        /// <returns>The <c>XDocument</c> exposed by the <c>XDocumentCreator</c> created for this run.</returns>
        public XDocument CreateXml()
        {
            _xdCreator = new XDocumentCreator();

            InitXml();

            // Reset the per-run parsing state.
            _tableStack = new Stack<HtmlTable_v3>();
            _table      = null;

            _definitionListStack = new Stack<XElement>();
            _definitionList      = null;

            _noTag = false;
            _body  = false;
            _title = false;

            foreach (HtmlNode htmlNode in _htmlReader.Read())
            {
                if (htmlNode.Type == HtmlNodeType.Text || htmlNode.Type == HtmlNodeType.Comment)
                {
                    if (_generateXmlNodeOnly)
                    {
                        continue;
                    }

                    if (htmlNode.Type == HtmlNodeType.Text)
                    {
                        AddText(_currentNode, ((HtmlNodeText)htmlNode).Text);
                    }
                    else if (_readCommentInText)
                    {
                        // Comments are emitted as plain text when requested.
                        AddText(_currentNode, ((HtmlNodeComment)htmlNode).Comment);
                    }
                    else
                    {
                        string comment = ((HtmlNodeComment)htmlNode).Comment;
                        // NOTE(review): _commentCorrection presumably collapses "--" sequences,
                        // which are not allowed inside an XML comment - confirm against its definition.
                        comment = _commentCorrection.Replace(comment, "-");
                        // Pad a trailing '-' with a space. Ordinal comparison: this is a syntactic
                        // check and must not depend on the current culture.
                        if (comment.EndsWith("-", StringComparison.Ordinal))
                        {
                            comment += " ";
                        }
                        _xdCreator.AddComment(_currentNode, comment);
                    }
                }
                else if (htmlNode.Type == HtmlNodeType.Script)
                {
                    // NOTE(review): script text is added even when _generateXmlNodeOnly is set,
                    // unlike regular text - confirm this is intended.
                    AddText(_currentNode, ((HtmlNodeScript)htmlNode).Script);
                }
                else if (htmlNode.Type == HtmlNodeType.DocumentType)
                {
                    _xdCreator.AddAttribute(_htmlNode, "doctype", ((HtmlNodeDocType)htmlNode).DocType);
                }
                else if (htmlNode.Type == HtmlNodeType.Property)
                {
                    // Attributes are skipped when only nodes are wanted or when the owning tag
                    // produced no element (_noTag).
                    if (_generateXmlNodeOnly || _noTag)
                    {
                        continue;
                    }

                    HtmlNodeProperty htmlNodeProperty = (HtmlNodeProperty)htmlNode;
                    try
                    {
                        string propertyName = _nameCorrection.Replace(htmlNodeProperty.Name, "");
                        propertyName = propertyName.ToLowerInvariant();
                        if (propertyName == "")
                        {
                            propertyName = "__value";
                        }

                        // Change of 28/01/2014: hexadecimal value 0x03 is an invalid XML character.
                        // Found in the "keywords" meta content of
                        // http://www.reseau-gesat.com/Gesat/Yvelines,78/Fontenay-le-Fleury,31443/esat-cotra,e1596/
                        string propertyValue = htmlNodeProperty.Value;
                        if (propertyValue != null)
                        {
                            propertyValue = propertyValue.Replace("\x03", "");
                        }

                        _xdCreator.AddAttribute(_currentNode, propertyName, propertyValue);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a bad attribute is reported and skipped, parsing continues.
                        Trace.WriteLine($"error in HtmlToXml_v2.CreateXml() : line {htmlNode.Line} column {htmlNode.Column}");
                        Trace.WriteLine(ex.Message);
                    }
                }
                else if (htmlNode.Type == HtmlNodeType.OpenTag)
                {
                    HtmlNodeOpenTag htmlNodeOpenTag = (HtmlNodeOpenTag)htmlNode;
                    string          tagName         = htmlNodeOpenTag.Name.ToLowerInvariant();
                    tagName = _nameCorrection.Replace(tagName, "_");
                    if (tagName == "")
                    {
                        tagName = "_";
                    }

                    AddTagBegin(tagName);
                }
                else if (htmlNode.Type == HtmlNodeType.EndTag)
                {
                    HtmlNodeEndTag htmlNodeEndTag = (HtmlNodeEndTag)htmlNode;
                    string         tagName        = htmlNodeEndTag.Name.ToLowerInvariant();
                    tagName = _nameCorrection.Replace(tagName, "_");
                    if (tagName == "")
                    {
                        tagName = "_";
                    }

                    TagEnd(tagName);
                }
            }

            return _xdCreator.XDocument;
        }
Example #3
0
        // bool bTagEnd
        /// <summary>
        /// Handles an HTML open tag: creates the matching XML element (when one is needed) and
        /// updates the current node / current tree node.
        /// </summary>
        /// <remarks>
        /// <c>_noTag</c> is cleared on entry and set again when the tag produces no element of its
        /// own (html, head, body, or a repeated title while <c>_normalizeXml</c> is set).
        /// </remarks>
        /// <param name="tagName">Name of the tag being opened.</param>
        private void AddTagBegin(string tagName)
        {
            _noTag = false;

            HtmlTagType openedType = HtmlTags.GetHtmlTagType(tagName);
            HtmlTag     descriptor = HtmlTags.GetHtmlTag(openedType);

            if (_normalizeXml)
            {
                switch (openedType)
                {
                    case HtmlTagType.Html:
                    case HtmlTagType.Head:
                        _noTag = true;
                        return;

                    case HtmlTagType.Body:
                        _noTag = true;
                        if (!_body)
                        {
                            // First body tag: switch the insertion point to the body node.
                            _body            = true;
                            _currentTreeNode = _bodyNode;
                            _currentNode     = _currentTreeNode;
                        }
                        return;

                    case HtmlTagType.Title:
                        if (_title)
                        {
                            // Only the first title is used; later ones produce nothing.
                            _noTag = true;
                        }
                        else
                        {
                            _title       = true;
                            _currentNode = _titleNode;
                        }
                        return;

                    case HtmlTagType.Table:
                        _currentNode     = _xdCreator.AddElement(_currentTreeNode, tagName);
                        _currentTreeNode = _currentNode;
                        // Remember the enclosing table so it can be resumed when this one closes.
                        if (_table != null)
                        {
                            _tableStack.Push(_table);
                        }
                        _table = new HtmlTable_v3 { Table = _currentNode };
                        return;
                }

                // Table sub-elements are tried before definition lists, in that order.
                if (TagBeginTableCategory(descriptor, tagName))
                {
                    return;
                }

                if (openedType == HtmlTagType.DL)
                {
                    _currentNode     = _xdCreator.AddElement(_currentTreeNode, tagName);
                    _currentTreeNode = _currentNode;
                    // Remember the enclosing list so it can be resumed when this one closes.
                    if (_definitionList != null)
                    {
                        _definitionListStack.Push(_definitionList);
                    }
                    _definitionList = _currentNode;
                    return;
                }

                if (TagBeginDefinitionListCategory(descriptor, tagName))
                {
                    return;
                }

                // TODO (carried over from the original notes, to be re-checked): <p> tags without
                // a closing </p> are not handled here. A previous attempt was disabled because the
                // remembered last <p> node would at least have to be reset whenever one of its
                // parents closes.
            }

            _currentNode = _xdCreator.AddElement(_currentTreeNode, tagName);

            // Only descend into the new element when the tag is allowed to have an end tag.
            bool endTagAllowed = descriptor.EndBoundType != HtmlBoundType.Forbidden;
            if (endTagAllowed)
            {
                _currentTreeNode = _currentNode;
            }
        }