/// <summary>
/// Parses the HTML text in <paramref name="snapSource"/> into <paramref name="htmldoc"/>,
/// handing <paramref name="parentElement"/> to the parser as the starting node.
/// </summary>
/// <param name="snapSource">the html source to parse</param>
/// <param name="htmldoc">
/// target document; it is cast to <c>LayoutFarm.WebDom.Impl.HtmlDocument</c> before parsing
/// </param>
/// <param name="parentElement">node passed to the parser as the current node</param>
/// <exception cref="System.InvalidCastException">
/// <paramref name="htmldoc"/> is not a <c>LayoutFarm.WebDom.Impl.HtmlDocument</c>
/// </exception>
public static void ParseHtmlDom(TextSource snapSource, IHtmlDocument htmldoc, WebDom.DomElement parentElement)
{
    var parser = GetHtmlParser();
    try
    {
        parser.Parse(snapSource, (LayoutFarm.WebDom.Impl.HtmlDocument)htmldoc, parentElement);
    }
    finally
    {
        // Release the parser even when the cast or Parse throws; previously an
        // exception skipped FreeHtmlParser entirely.
        // NOTE(review): presumably FreeHtmlParser makes the parser reusable - confirm it
        // also resets any state left behind by a failed parse.
        FreeHtmlParser(parser);
    }
}
/// <summary>
/// Parses the source html into a newly created <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="snapSource">the html source to parse</param>
/// <returns>the new document; parsing starts at its RootNode</returns>
public static HtmlDocument ParseDocument(TextSource snapSource)
{
    var parser = GetHtmlParser();
    try
    {
        var blankHtmlDoc = new HtmlDocument();
        parser.Parse(snapSource, blankHtmlDoc, blankHtmlDoc.RootNode);
        return blankHtmlDoc;
    }
    finally
    {
        // Release the parser even when Parse throws; previously an exception
        // skipped FreeHtmlParser entirely.
        // NOTE(review): presumably FreeHtmlParser makes the parser reusable - confirm it
        // also resets any state left behind by a failed parse.
        FreeHtmlParser(parser);
    }
}
/// <summary>
/// Click handler used to try out the web parser: parses the text of richTextBox1
/// with the MyHtmlParser engine and displays the resulting nodes in treeView2.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // parse the rich text box content into a fresh document
    var htmlParser = HtmlParser.CreateHtmlParser(ParseEngineKind.MyHtmlParser);
    var document = new LayoutFarm.WebDom.Impl.HtmlDocument();
    var source = new TextSource(this.richTextBox1.Text.ToCharArray());
    htmlParser.Parse(source, document, document.RootNode);

    // replace the tree view content with a description of the parsed document
    this.treeView2.Nodes.Clear();
    var treeRoot = new TreeNode("root");
    DescibeNode(document.RootNode, treeRoot);
    this.treeView2.Nodes.Add(treeRoot);
    this.treeView2.ExpandAll();
}
/// <summary>
/// Tokenizes the snapshot text with <c>HtmlTokenizer</c> and builds nodes through
/// <paramref name="htmldoc"/>, attaching them beneath <paramref name="currentNode"/>.
/// </summary>
/// <param name="textSnapshot">source text; its ActualSnapshot is copied before tokenizing</param>
/// <param name="htmldoc">document used to create text nodes, elements and attributes</param>
/// <param name="currentNode">node that receives children until an open tag descends into a new element</param>
/// <exception cref="NotSupportedException">
/// an end tag does not match the current node and the recovery loop cannot resolve it
/// </exception>
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    this._resultHtmlDoc = htmldoc;

    // The tokenizer reads from a TextReader, so the copied chars are re-encoded
    // as UTF-8 bytes and read back through a StreamReader.
    var snapshot = textSnapshot.ActualSnapshot;
    char[] sourceChars = snapshot.Copy(0, snapshot.Length);
    byte[] utf8Bytes = System.Text.Encoding.UTF8.GetBytes(sourceChars);

    using (var byteStream = new System.IO.MemoryStream(utf8Bytes))
    using (var reader = new System.IO.StreamReader(byteStream))
    {
        var tokenizer = new HtmlTokenizer(reader);
        HtmlToken token;
        while (tokenizer.ReadNextToken(out token))
        {
            HtmlTokenKind kind = token.Kind;

            if (kind == HtmlTokenKind.Data)
            {
                // character data becomes a text node under the current node
                var dataToken = (HtmlDataToken)token;
                var textNode = _resultHtmlDoc.CreateTextNode(dataToken.Data.ToCharArray());
                currentNode.AddChild(textNode);
                continue;
            }

            if (kind != HtmlTokenKind.Tag)
            {
                // comments, doctype and any other token kinds produce nothing
                continue;
            }

            var tag = (HtmlTagToken)token;

            if (!tag.IsEndTag)
            {
                // open tag: create the element, attach it, then copy its attributes
                DomElement element = this._resultHtmlDoc.CreateElement(null, tag.Name);
                currentNode.AddChild(element);
                foreach (var sourceAttr in tag.Attributes)
                {
                    var domAttr = this._resultHtmlDoc.CreateAttribute(null, sourceAttr.Name);
                    if (sourceAttr.Value != null)
                    {
                        domAttr.Value = sourceAttr.Value;
                    }
                    element.AddAttribute(domAttr);
                }

                if (!tag.IsEmptyElement)
                {
                    // descend: remember the parent so the matching end tag can restore it
                    openEltStack.Push(currentNode);
                    currentNode = element;
                }
                continue;
            }

            // end tag: the returned index is not used here, the call itself is kept
            _resultHtmlDoc.AddStringIfNotExists(tag.Name);

            // NOTE(review): this direct match compares Name, while the recovery loop
            // below compares LocalName - confirm that the difference is intended.
            if (currentNode.Name == tag.Name)
            {
                currentNode = openEltStack.Pop();
                continue;
            }

            // Mismatch: make up to 3 attempts to step out of elements that are
            // single tags or can be auto-closed until the end tag's element is found.
            bool matched = false;
            for (int attempt = 0; attempt < 3; attempt++)
            {
                var nameIndex = currentNode.LocalNameIndex;
                if (!HtmlTagMatching.IsSingleTag(nameIndex) && !HtmlTagMatching.CanAutoClose(nameIndex))
                {
                    // implement err handling here!
                    throw new NotSupportedException();
                }

                if (openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                }

                if (currentNode.LocalName == tag.Name && openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                throw new NotSupportedException();
            }
        }
    }
}
/// <summary>
/// Forwards to the Parse overload that takes the snapshot directly, passing the
/// ActualSnapshot of <paramref name="textSnapshot"/> along with the other arguments.
/// </summary>
public void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    var snapshot = textSnapshot.ActualSnapshot;
    this.Parse(snapshot, htmldoc, currentNode);
}
/// <summary>
/// Parses <paramref name="textSnapshot"/> using <paramref name="htmldoc"/> and
/// <paramref name="currentNode"/>; the behavior is supplied by each derived parser.
/// </summary>
/// <param name="textSnapshot">the text source to parse</param>
/// <param name="htmldoc">the document handed to the parser (presumably the one that receives the parsed nodes - confirm against implementations)</param>
/// <param name="currentNode">the node handed to the parser (presumably the element under which parsed content is attached - confirm against implementations)</param>
public abstract void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode);
/// <summary>
/// Reads the snapshot text token by token with <c>HtmlTokenizer</c> and creates the
/// corresponding nodes through <paramref name="htmldoc"/>, starting under
/// <paramref name="currentNode"/>.
/// </summary>
/// <param name="textSnapshot">source text; its ActualSnapshot is copied before tokenizing</param>
/// <param name="htmldoc">document used to create text nodes, elements and attributes</param>
/// <param name="currentNode">node that receives children until an open tag descends into a new element</param>
/// <exception cref="NotSupportedException">
/// an end tag does not match the current node and the recovery loop cannot resolve it
/// </exception>
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    _resultHtmlDoc = htmldoc;
    char[] buffer = textSnapshot.ActualSnapshot.Copy(0, textSnapshot.ActualSnapshot.Length);

    // the tokenizer consumes a TextReader, so the chars go through a UTF-8 byte stream
    using (var stream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(buffer)))
    using (var streamReader = new System.IO.StreamReader(stream))
    {
        HtmlToken current;
        var htmlTokenizer = new HtmlTokenizer(streamReader);
        while (htmlTokenizer.ReadNextToken(out current))
        {
            switch (current.Kind)
            {
                case HtmlTokenKind.Comment:
                case HtmlTokenKind.DocType:
                default:
                    // nothing is built for these token kinds
                    break;

                case HtmlTokenKind.Data:
                    {
                        // character data: append a text node to the current node
                        var data = (HtmlDataToken)current;
                        currentNode.AddChild(_resultHtmlDoc.CreateTextNode(data.Data.ToCharArray()));
                    }
                    break;

                case HtmlTokenKind.Tag:
                    {
                        var tagToken = (HtmlTagToken)current;
                        if (tagToken.IsEndTag)
                        {
                            // end tag: the returned index is not used, only the call is kept
                            _resultHtmlDoc.AddStringIfNotExists(tagToken.Name);

                            if (currentNode.Name == tagToken.Name)
                            {
                                // matching end tag: return to the parent element
                                currentNode = openEltStack.Pop();
                            }
                            else
                            {
                                // Not a match: for at most 3 rounds, leave elements that are
                                // single tags or auto-closable while looking for the element
                                // this end tag belongs to.
                                bool closed = false;
                                for (int remaining = 3; remaining > 0; remaining--)
                                {
                                    bool canLeave = HtmlTagMatching.IsSingleTag(currentNode.LocalNameIndex)
                                        || HtmlTagMatching.CanAutoClose(currentNode.LocalNameIndex);
                                    if (!canLeave)
                                    {
                                        // implement err handling here!
                                        throw new NotSupportedException();
                                    }

                                    if (openEltStack.Count > 0)
                                    {
                                        currentNode = openEltStack.Pop();
                                    }

                                    if (currentNode.LocalName == tagToken.Name)
                                    {
                                        if (openEltStack.Count > 0)
                                        {
                                            currentNode = openEltStack.Pop();
                                            closed = true;
                                            break;
                                        }
                                    }
                                }

                                if (!closed)
                                {
                                    throw new NotSupportedException();
                                }
                            }
                        }
                        else
                        {
                            // open tag: create and attach the element, then its attributes
                            DomElement created = _resultHtmlDoc.CreateElement(null, tagToken.Name);
                            currentNode.AddChild(created);
                            foreach (var tokenAttr in tagToken.Attributes)
                            {
                                var newAttr = _resultHtmlDoc.CreateAttribute(null, tokenAttr.Name);
                                if (tokenAttr.Value != null)
                                {
                                    newAttr.Value = tokenAttr.Value;
                                }
                                created.AddAttribute(newAttr);
                            }

                            if (!tagToken.IsEmptyElement)
                            {
                                // keep the parent on the stack and continue inside the new element
                                openEltStack.Push(currentNode);
                                currentNode = created;
                            }
                        }
                    }
                    break;
            }
        }
    }
}
/// <summary>
/// Override that delegates to the snapshot-based Parse overload, supplying the
/// ActualSnapshot of <paramref name="textSnapshot"/>.
/// </summary>
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    var actualSnapshot = textSnapshot.ActualSnapshot;
    Parse(actualSnapshot, htmldoc, currentNode);
}
/// <summary>
/// Creates a string from the buffer that <c>TextSnapshot.UnsafeGetInternalBuffer</c>
/// returns for the snapshot held by <paramref name="textsource"/>.
/// </summary>
public static string GetInternalText(TextSource textsource)
{
    var internalBuffer = TextSnapshot.UnsafeGetInternalBuffer(textsource.actualSnapshot);
    return new string(internalBuffer);
}
/// <summary>
/// Returns the text of <paramref name="textsource"/> as a string built from the
/// buffer obtained through <c>TextSnapshot.UnsafeGetInternalBuffer</c>.
/// </summary>
public static string GetInternalText(TextSource textsource)
{
    var snapshot = textsource.actualSnapshot;
    return new string(TextSnapshot.UnsafeGetInternalBuffer(snapshot));
}