예제 #1
0
        /// <summary>
        /// Builds a new <see cref="HtmlDocument"/> by replaying the node sequence
        /// produced by an external HTML tree walker.
        /// </summary>
        /// <param name="htmlHost">Host handed to the <see cref="HtmlDocument"/> constructor.</param>
        /// <param name="externalTreeWalker">Walker whose <c>GetHtmlNodeIter()</c> sequence is consumed.</param>
        /// <returns>The newly created document.</returns>
        public static HtmlDocument ParseDocument(LayoutFarm.HtmlBoxes.HtmlHost htmlHost, ExternalHtmlTreeWalker externalTreeWalker)
        {
            HtmlDocument document = new HtmlDocument(htmlHost);

            // Element that currently receives new children; starts at the document root.
            HtmlElement currentParent = (HtmlElement)document.RootNode;
            // Parents saved on EnterChildContext and restored on ExitChildContext.
            Stack<HtmlElement> savedParents = new Stack<HtmlElement>();
            // Most recently created element: attributes are attached to it and
            // it becomes the parent when a child context is entered.
            HtmlElement latestElement = null;

            foreach (ExternalHtmlNode node in externalTreeWalker.GetHtmlNodeIter())
            {
                switch (node.HtmlNodeKind)
                {
                    case ExternalHtmlNodeKind.Document:
                        // nothing to build for the document marker itself
                        break;

                    case ExternalHtmlNodeKind.Element:
                        latestElement = (HtmlElement)document.CreateElement(node.HtmlElementName);
                        currentParent.AddChild(latestElement);
                        break;

                    case ExternalHtmlNodeKind.Attribute:
                        {
                            node.GetAttributeNameAndValue(out string name, out string value);
                            DomAttribute attribute = document.CreateAttribute(name, value);
                            latestElement.SetAttribute(attribute);
                            break;
                        }

                    case ExternalHtmlNodeKind.TextNode:
                        {
                            char[] content = node.CurrentTextNodeContent.ToCharArray();
                            DomTextNode text = document.CreateTextNode(content);
                            currentParent.AddChild(text);
                            break;
                        }

                    case ExternalHtmlNodeKind.EnterChildContext:
                        // remember where we were, then descend into the latest element (if any)
                        savedParents.Push(currentParent);
                        if (latestElement != null)
                        {
                            currentParent = latestElement;
                        }
                        break;

                    case ExternalHtmlNodeKind.ExitChildContext:
                        // climb back to the parent saved when the context was entered
                        currentParent = savedParents.Pop();
                        break;
                }
            }

            return document;
        }
예제 #2
0
 /// <summary>
 /// Puts the parser back into its initial state: the result document and the
 /// text snapshot are dropped, the open-element stack is emptied, the current
 /// node / attribute / text-node references are cleared and the parse state
 /// is set to 0.
 /// </summary>
 public override void ResetParser()
 {
     // forget the previously produced document
     _resultHtmlDoc = null;
     // no element is open any more
     openEltStack.Clear();
     // nothing is "current"
     curHtmlNode = null;
     curAttr = null;
     curTextNode = null;
     // back to the initial state
     parseState = 0;
     // release the input text
     textSnapshot = null;
 }
예제 #3
0
 /// <summary>
 /// Resets every piece of per-parse state held by this parser: result document,
 /// open-element stack, current node / attribute / text node, parse state
 /// (back to 0) and the text snapshot.
 /// </summary>
 public override void ResetParser()
 {
     // previously produced document
     this._resultHtmlDoc = null;

     // elements still considered open
     this._openEltStack.Clear();

     // cursor-like references into the tree being built
     this._curHtmlNode = null;
     this._curAttr = null;
     this._curTextNode = null;

     // state machine and its input
     this._parseState = 0;
     this._textSnapshot = null;
 }
예제 #4
0
 /// <summary>
 /// Collects the inner text of this node's children into a writer borrowed
 /// from <c>DomTextWriterPool</c> and returns the accumulated string.
 /// </summary>
 /// <returns>The writer's content after every child has been visited.</returns>
 public virtual string GetInnerText()
 {
     using (DomTextWriterPool.Borrow(out DomTextWriter writer))
     {
         foreach (var child in this.GetChildNodeIterForward())
         {
             // The two checks are deliberately independent (no else), as in the original.
             if (child is HtmlElement element)
             {
                 element.CopyInnerText(writer);
             }

             if (child is DomTextNode text)
             {
                 text.CopyInnerText(writer);
             }
         }

         string innerText = writer.ToString();
         return innerText;
     }
 }
예제 #5
0
        /// <summary>
        /// Handles a single lexer event and updates the DOM being built in
        /// <c>_resultHtmlDoc</c> together with the parser's bookkeeping fields
        /// (<c>curHtmlNode</c>, <c>curAttr</c>, <c>curTextNode</c>, <c>waitingAttrName</c>,
        /// <c>domDocNode</c>, <c>openEltStack</c>, <c>parseState</c>).
        /// </summary>
        /// <remarks>
        /// Values of <c>parseState</c> as used in this method:
        /// 0  - the next name creates a new element;
        /// 1  - names are collected as attribute names of <c>curHtmlNode</c>;
        /// 2  - set on VisitOpenSlashAngle, the next name is matched against the open element;
        /// 3  - set after a close-tag name was matched (no case below handles it);
        /// 4  - set on VisitAttrAssign, the next name is used as the value of <c>curAttr</c>;
        /// 10 - set on VisitOpenAngleExclimation, the next name becomes a document node name;
        /// 11 - following names / literal strings become parameters of <c>domDocNode</c>.
        /// </remarks>
        /// <param name="lexEvent">Kind of event reported by the lexer.</param>
        /// <param name="startIndex">Start offset, inside <c>textSnapshot</c>, of the text the event refers to.</param>
        /// <param name="len">Length of that text.</param>
        void LexStateChanged(HtmlLexerEvent lexEvent, int startIndex, int len)
        {
            switch (lexEvent)
            {
            case HtmlLexerEvent.CommentContent:
            {
                // comment content is currently ignored
                //var commentContent = this.textSnapshot.Copy(startIndex, len);
            }
            break;

            case HtmlLexerEvent.FromContentPart:
            {
                // text content: start a new text node, or extend the one already in progress
                if (curTextNode == null)
                {
                    curTextNode = _resultHtmlDoc.CreateTextNode(
                        HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len));
                    // the new text node is attached only when there is a current element
                    if (curHtmlNode != null)
                    {
                        curHtmlNode.AddChild(curTextNode);
                    }
                }
                else
                {
                    curTextNode.AppendTextContent(HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len));
                }
            }
            break;

            case HtmlLexerEvent.AttributeValueAsLiteralString:
            {
                // assign the value and add it to its parent
                if (parseState == 11)
                {
                    // inside a document node: the literal becomes one of its parameters
                    domDocNode.AddParameter(textSnapshot.Substring(startIndex, len));
                }
                else
                {
                    // NOTE(review): curAttr and curHtmlNode are dereferenced without a null check -
                    // presumably the lexer always reports the attribute first; confirm.
                    curAttr.Value = textSnapshot.Substring(startIndex, len);
                    curHtmlNode.AddAttribute(curAttr);
                }
            }
            break;

            case HtmlLexerEvent.Attribute:
            {
                // create the attribute node and wait for its value
                string nodename = textSnapshot.Substring(startIndex, len);
                curAttr = this._resultHtmlDoc.CreateAttribute(null, nodename);
            }
            break;

            case HtmlLexerEvent.NodeNameOrAttribute:
            {
                // the lexer does not record whether this name is an element name or an
                // attribute name, so parseState is used to decide here


                string name = textSnapshot.Substring(startIndex, len);
                switch (parseState)
                {
                case 0:
                {
                    // create a new element; it becomes a child of the current element (if any),
                    // and the current element is pushed so it can be restored on close
                    DomElement elem = this._resultHtmlDoc.CreateElement(null, name);
                    if (curHtmlNode != null)
                    {
                        curHtmlNode.AddChild(elem);
                        openEltStack.Push(curHtmlNode);
                    }
                    curHtmlNode     = elem;
                    parseState      = 1;           // following names are attributes
                    curTextNode     = null;
                    curAttr         = null;
                    waitingAttrName = null;
                }
                break;

                case 1:
                {
                    // an attribute name; its value (if any) has not been seen yet
                    if (waitingAttrName != null)
                    {
                        // the previously seen name never got a value:
                        // add it to the element with an empty value
                        curAttr       = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName);
                        curAttr.Value = "";
                        curHtmlNode.AddAttribute(curAttr);
                        curAttr = null;
                    }
                    waitingAttrName = name;
                }
                break;

                case 2:
                {
                    //****
                    // node name after an open-slash angle (closing tag)
                    //TODO: review here, avoid direct string comparison
                    // NOTE(review): curHtmlNode is dereferenced without a null check - confirm
                    // that a closing tag cannot arrive before any element was opened.
                    if (curHtmlNode.LocalName == name)
                    {
                        // closing tag matches the current element: go back to its parent;
                        // when the stack is empty curHtmlNode is left unchanged
                        if (openEltStack.Count > 0)
                        {
                            waitingAttrName = null;
                            curTextNode     = null;
                            curAttr         = null;
                            curHtmlNode     = openEltStack.Pop();
                        }
                        parseState = 3;
                    }
                    else
                    {
                        // names differ: check whether the current node is a single tag
                        // (per HtmlTagMatching.IsSingleTag) that needs no close tag
                        if (HtmlTagMatching.IsSingleTag(curHtmlNode.LocalNameIndex))
                        {
                            // step out of the single tag to its parent ...
                            if (openEltStack.Count > 0)
                            {
                                waitingAttrName = null;
                                curHtmlNode     = openEltStack.Pop();
                                curAttr         = null;
                                curTextNode     = null;
                            }
                            // ... and retry the match against that parent
                            if (curHtmlNode.LocalName == name)
                            {
                                if (openEltStack.Count > 0)
                                {
                                    curTextNode     = null;
                                    curAttr         = null;
                                    curHtmlNode     = openEltStack.Pop();
                                    waitingAttrName = null;
                                }
                                parseState = 3;
                            }
                            else
                            {
                                // mismatched closing tag: error handling is not implemented yet
                                throw new NotSupportedException();
                            }
                        }
                        else
                        {
                            // mismatched closing tag: error handling is not implemented yet
                            throw new NotSupportedException();
                        }
                    }
                }
                break;

                case 4:
                {
                    // attribute value given as a bare identifier (after an attribute assignment)
                    if (curAttr != null)
                    {
                        // NOTE(review): the value is stored on curAttr but AddAttribute is not
                        // called here, and parseState goes back to 0 rather than 1 - confirm
                        // this is intended.
                        curAttr.Value   = name;
                        curAttr         = null;
                        parseState      = 0;
                        waitingAttrName = null;
                    }
                    else
                    {
                        // no attribute is pending: the name is dropped
                    }
                }
                break;

                case 10:
                {
                    // document node (e.g. doctype): this name is the document node name

                    parseState = 11;
                    // after the doc node name, following tokens may be parameters of the document node
                    this.domDocNode        = (DomDocumentNode)this._resultHtmlDoc.CreateDocumentNodeElement();
                    domDocNode.DocNodeName = name;
                }
                break;

                case 11:
                {
                    // a further name inside the document node: stored as a parameter
                    domDocNode.AddParameter(name);
                }
                break;

                default:
                {
                    // any other state (e.g. 3): the name is ignored
                }
                break;
                }
            }
            break;

            case HtmlLexerEvent.VisitCloseAngle:
            {
                // close angle of the current new node;
                // from here on we are inside its content

                if (parseState == 11)
                {
                    // the document node (doctype) is complete: add it under the document root
                    this._resultHtmlDoc.RootNode.AddChild(this.domDocNode);
                    domDocNode = null;
                }

                if (waitingAttrName != null)
                {
                    // a trailing attribute name without a value: add it with an empty value
                    curAttr       = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName);
                    curAttr.Value = "";
                    curHtmlNode.AddAttribute(curAttr);
                    curAttr = null;
                }


                // back to the initial state, nothing pending
                waitingAttrName = null;
                parseState      = 0;
                curTextNode     = null;
                curAttr         = null;
            }
            break;

            case HtmlLexerEvent.VisitAttrAssign:
            {
                // attribute assignment seen: the next name is treated as an attribute value
                parseState = 4;
            }
            break;

            case HtmlLexerEvent.VisitOpenSlashAngle:
            {
                // start of a closing tag: the next name is matched against the open element
                parseState = 2;
            }
            break;

            case HtmlLexerEvent.VisitCloseSlashAngle:
            {
                // self-closing end of a tag: go back to the parent element, if there is one
                if (openEltStack.Count > 0)
                {
                    curTextNode     = null;
                    curAttr         = null;
                    waitingAttrName = null;
                    curHtmlNode     = openEltStack.Pop();
                }
                parseState = 0;
            }
            break;

            case HtmlLexerEvent.VisitOpenAngleExclimation:
            {
                // eg. doctype
                parseState = 10;
            }
            break;

            default:
            {
                // remaining events (e.g. visit open angle) need no action
            }
            break;
            }
        }
예제 #6
0
        /// <summary>
        /// Handles a single lexer event and updates the DOM being built in
        /// <c>_resultHtmlDoc</c> together with the parser's bookkeeping fields
        /// (<c>curHtmlNode</c>, <c>curAttr</c>, <c>curTextNode</c>, <c>waitingAttrName</c>,
        /// <c>domDocNode</c>, <c>openEltStack</c>, <c>parseState</c>).
        /// </summary>
        /// <remarks>
        /// Values of <c>parseState</c> as used in this method:
        /// 0  - the next name creates a new element;
        /// 1  - names are collected as attribute names of <c>curHtmlNode</c>;
        /// 2  - set on VisitOpenSlashAngle, the next name is matched against the open element;
        /// 3  - set after a close-tag name was matched (no case below handles it);
        /// 4  - set on VisitAttrAssign, the next name is used as the value of <c>curAttr</c>;
        /// 10 - set on VisitOpenAngleExclimation, the next name becomes a document node name;
        /// 11 - following names become parameters of <c>domDocNode</c>.
        /// </remarks>
        /// <param name="lexEvent">Kind of event reported by the lexer.</param>
        /// <param name="startIndex">Start offset, inside <c>textSnapshot</c>, of the text the event refers to.</param>
        /// <param name="len">Length of that text.</param>
        void LexStateChanged(HtmlLexerEvent lexEvent, int startIndex, int len)
        {
            switch (lexEvent)
            {
                case HtmlLexerEvent.CommentContent:
                    {
                        // comment content is currently ignored
                        //var commentContent = this.textSnapshot.Copy(startIndex, len); 

                    } break;
                case HtmlLexerEvent.FromContentPart:
                    {

                        // text content: start a new text node, or extend the one already in progress
                        if (curTextNode == null)
                        {
                            curTextNode = _resultHtmlDoc.CreateTextNode(
                                HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len));

                            // the new text node is attached only when there is a current element
                            if (curHtmlNode != null)
                            {
                                curHtmlNode.AddChild(curTextNode);
                            }
                        }
                        else
                        {
                            curTextNode.AppendTextContent(HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len));

                        }
                    } break;
                case HtmlLexerEvent.AttributeValueAsLiteralString:
                    {
                        // assign the value and add the attribute to its parent element
                        // NOTE(review): curAttr and curHtmlNode are dereferenced without a null check,
                        // and a literal string seen while parseState == 11 (document node) is not
                        // special-cased here - confirm both are intended.
                        curAttr.Value = textSnapshot.Substring(startIndex, len);
                        curHtmlNode.AddAttribute(curAttr);

                    } break;

                case HtmlLexerEvent.Attribute:
                    {
                        // create the attribute node and wait for its value
                        string nodename = textSnapshot.Substring(startIndex, len);
                        curAttr = this._resultHtmlDoc.CreateAttribute(null, nodename);

                    } break;
                case HtmlLexerEvent.NodeNameOrAttribute:
                    {
                        // the same event is raised for element names and attribute names;
                        // parseState decides how the name is interpreted
                        string name = textSnapshot.Substring(startIndex, len);
                        switch (parseState)
                        {
                            case 0:
                                {
                                    // create a new element; it becomes a child of the current element (if any),
                                    // and the current element is pushed so it can be restored on close
                                    DomElement elem = this._resultHtmlDoc.CreateElement(null, name);
                                    if (curHtmlNode != null)
                                    {
                                        curHtmlNode.AddChild(elem);
                                        openEltStack.Push(curHtmlNode);
                                    }
                                    curHtmlNode = elem;
                                    parseState = 1;// following names are attributes
                                    curTextNode = null;
                                    curAttr = null;
                                    waitingAttrName = null;
                                } break;
                            case 1:
                                {
                                    // an attribute name; its value (if any) has not been seen yet
                                    if (waitingAttrName != null)
                                    {
                                        // the previously seen name never got a value:
                                        // add it to the element with an empty value
                                        curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName);
                                        curAttr.Value = "";
                                        curHtmlNode.AddAttribute(curAttr);
                                        curAttr = null;
                                    }
                                    waitingAttrName = name;
                                } break;
                            case 2:
                                {
                                    //****
                                    // node name after an open-slash angle (closing tag)
                                    //TODO: review here, avoid direct string comparison
                                    // NOTE(review): curHtmlNode is dereferenced without a null check - confirm
                                    // that a closing tag cannot arrive before any element was opened.
                                    if (curHtmlNode.LocalName == name)
                                    {
                                        // closing tag matches the current element: go back to its parent;
                                        // when the stack is empty curHtmlNode is left unchanged
                                        if (openEltStack.Count > 0)
                                        {
                                            waitingAttrName = null;
                                            curTextNode = null;
                                            curAttr = null;
                                            curHtmlNode = openEltStack.Pop();
                                        }
                                        parseState = 3;
                                    }
                                    else
                                    {
                                        // names differ: check whether the current node is a single tag
                                        // (per HtmlDecodeHelper.IsSingleTag) that needs no close tag
                                        if (HtmlDecodeHelper.IsSingleTag(curHtmlNode.LocalNameIndex))
                                        {
                                            // step out of the single tag to its parent ...
                                            if (openEltStack.Count > 0)
                                            {
                                                waitingAttrName = null;
                                                curHtmlNode = openEltStack.Pop();
                                                curAttr = null;
                                                curTextNode = null;
                                            }
                                            // ... and retry the match against that parent
                                            if (curHtmlNode.LocalName == name)
                                            {
                                                if (openEltStack.Count > 0)
                                                {
                                                    curTextNode = null;
                                                    curAttr = null;
                                                    curHtmlNode = openEltStack.Pop();
                                                    waitingAttrName = null;
                                                }
                                                parseState = 3;
                                            }
                                            else
                                            {
                                                // mismatched closing tag: error handling is not implemented yet
                                                throw new NotSupportedException();
                                            }
                                        }
                                        else
                                        {
                                            // mismatched closing tag: error handling is not implemented yet
                                            throw new NotSupportedException();
                                        }
                                    }
                                } break;
                            case 4:
                                {
                                    // attribute value given as a bare identifier (after an attribute assignment)
                                    if (curAttr != null)
                                    {
                                        // NOTE(review): the value is stored on curAttr but AddAttribute is not
                                        // called here, and parseState goes back to 0 rather than 1 - confirm
                                        // this is intended.
                                        curAttr.Value = name;
                                        curAttr = null;
                                        parseState = 0;
                                        waitingAttrName = null;
                                    }
                                    else
                                    {
                                        // no attribute is pending: the name is dropped

                                    }
                                } break;
                            case 10:
                                {
                                    // document node (e.g. doctype): this name is the document node name

                                    parseState = 11;
                                    // after the doc node name, following names may be parameters of the document node
                                    this.domDocNode = (DomDocumentNode)this._resultHtmlDoc.CreateDocumentNodeElement();
                                    domDocNode.DocNodeName = name;
                                } break;
                            case 11:
                                {
                                    // a further name inside the document node: stored as a parameter
                                    domDocNode.AddParameter(name);

                                } break;
                            default:
                                {
                                    // any other state (e.g. 3): the name is ignored
                                } break;
                        }

                    } break;
                case HtmlLexerEvent.VisitCloseAngle:
                    {
                        // close angle of the current new node;
                        // from here on we are inside its content

                        if (parseState == 11)
                        {
                            // the document node (doctype) is complete: add it under the document root
                            this._resultHtmlDoc.RootNode.AddChild(this.domDocNode);
                            domDocNode = null;
                        } 

                        if (waitingAttrName != null)
                        {
                            // a trailing attribute name without a value: add it with an empty value
                            curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName);
                            curAttr.Value = "";
                            curHtmlNode.AddAttribute(curAttr);
                            curAttr = null;
                        }


                        // back to the initial state, nothing pending
                        waitingAttrName = null;
                        parseState = 0;
                        curTextNode = null;
                        curAttr = null;
                    } break;
                case HtmlLexerEvent.VisitAttrAssign:
                    {
                        // attribute assignment seen: the next name is treated as an attribute value
                        parseState = 4;
                    } break;
                case HtmlLexerEvent.VisitOpenSlashAngle:
                    {
                        // start of a closing tag: the next name is matched against the open element
                        parseState = 2;
                    } break;
                case HtmlLexerEvent.VisitCloseSlashAngle:
                    {

                        // self-closing end of a tag: go back to the parent element, if there is one
                        if (openEltStack.Count > 0)
                        {
                            curTextNode = null;
                            curAttr = null;
                            waitingAttrName = null;
                            curHtmlNode = openEltStack.Pop();
                        }
                        parseState = 0;

                    } break;
                case HtmlLexerEvent.VisitOpenAngleExclimation:
                    {
                        // start of a document node (e.g. doctype): the next name is its name
                        parseState = 10;
                    } break;
                default:
                    {
                        // remaining events (e.g. visit open angle) need no action
                    } break;

            }
        }
예제 #7
0
 /// <summary>
 /// Clears all per-parse state so the parser can be used again: result document,
 /// open-element stack, current node / attribute / text node, parse state
 /// (back to 0) and the text snapshot.
 /// </summary>
 public void ResetParser()
 {
     // forget the previously produced document
     _resultHtmlDoc = null;
     // no element is open any more
     openEltStack.Clear();

     // nothing is "current"
     curHtmlNode = null;
     curAttr = null;
     curTextNode = null;

     // initial state, no input text
     parseState = 0;
     textSnapshot = null;
 }
        /// <summary>
        /// Appends <paramref name="text"/> to <paramref name="elem"/> as a new text node
        /// created by the element's owner document.
        /// </summary>
        /// <param name="elem">Element that receives the new text node as a child.</param>
        /// <param name="text">Text whose characters become the content of the new node.</param>
        public static void AddTextContent(this DomElement elem, string text)
        {
            // the owner document acts as the node factory
            var ownerDocument = elem.OwnerDocument;
            char[] characters = text.ToCharArray();
            DomTextNode appended = ownerDocument.CreateTextNode(characters);
            elem.AddChild(appended);
        }