void LexStateChanged(HtmlLexerEvent lexEvent, int startIndex, int len) { switch (lexEvent) { case HtmlLexerEvent.CommentContent: { //var commentContent = this.textSnapshot.Copy(startIndex, len); } break; case HtmlLexerEvent.FromContentPart: { if (curTextNode == null) { curTextNode = _resultHtmlDoc.CreateTextNode( HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len)); if (curHtmlNode != null) { curHtmlNode.AddChild(curTextNode); } } else { curTextNode.AppendTextContent(HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len)); } } break; case HtmlLexerEvent.AttributeValueAsLiteralString: { //assign value and add to parent if (parseState == 11) { //document node //doc domDocNode.AddParameter(textSnapshot.Substring(startIndex, len)); } else { curAttr.Value = textSnapshot.Substring(startIndex, len); curHtmlNode.AddAttribute(curAttr); } } break; case HtmlLexerEvent.Attribute: { //create attribute node and wait for its value string nodename = textSnapshot.Substring(startIndex, len); curAttr = this._resultHtmlDoc.CreateAttribute(null, nodename); } break; case HtmlLexerEvent.NodeNameOrAttribute: { //the lexer dose not store state of element name or attribute name //so we use parseState to decide here string name = textSnapshot.Substring(startIndex, len); switch (parseState) { case 0: { //create element DomElement elem = this._resultHtmlDoc.CreateElement(null, name); if (curHtmlNode != null) { curHtmlNode.AddChild(elem); openEltStack.Push(curHtmlNode); } curHtmlNode = elem; parseState = 1; //attribute curTextNode = null; curAttr = null; waitingAttrName = null; } break; case 1: { //wait for attr value if (waitingAttrName != null) { //push waiting attr curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName); curAttr.Value = ""; curHtmlNode.AddAttribute(curAttr); curAttr = null; } waitingAttrName = name; } break; case 2: { //**** //node name after open slash //TODO: review here,avoid direct string comparison if (curHtmlNode.LocalName == name) { if (openEltStack.Count > 0) { waitingAttrName = null; curTextNode = null; curAttr = null; curHtmlNode = openEltStack.Pop(); } parseState = 3; } else { //if not equal then check if current node need close tag or not if (HtmlTagMatching.IsSingleTag(curHtmlNode.LocalNameIndex)) { if (openEltStack.Count > 0) { waitingAttrName = null; curHtmlNode = openEltStack.Pop(); curAttr = null; curTextNode = null; } if (curHtmlNode.LocalName == name) { if (openEltStack.Count > 0) { curTextNode = null; curAttr = null; curHtmlNode = openEltStack.Pop(); waitingAttrName = null; } parseState = 3; } else { //implement err handling here! throw new NotSupportedException(); } } else { //implement err handling here! throw new NotSupportedException(); } } } break; case 4: { //attribute value as id if (curAttr != null) { curAttr.Value = name; curAttr = null; parseState = 0; waitingAttrName = null; } else { } } break; case 10: { //document node parseState = 11; //after docnodename , this may be attr of the document node this.domDocNode = (DomDocumentNode)this._resultHtmlDoc.CreateDocumentNodeElement(); domDocNode.DocNodeName = name; } break; case 11: { //doc domDocNode.AddParameter(name); } break; default: { } break; } } break; case HtmlLexerEvent.VisitCloseAngle: { //close angle of current new node //enter into its content if (parseState == 11) { //add doctype to html this._resultHtmlDoc.RootNode.AddChild(this.domDocNode); domDocNode = null; } if (waitingAttrName != null) { curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName); curAttr.Value = ""; curHtmlNode.AddAttribute(curAttr); curAttr = null; } waitingAttrName = null; parseState = 0; curTextNode = null; curAttr = null; } break; case HtmlLexerEvent.VisitAttrAssign: { parseState = 4; } break; case HtmlLexerEvent.VisitOpenSlashAngle: { parseState = 2; } break; case HtmlLexerEvent.VisitCloseSlashAngle: { if (openEltStack.Count > 0) { curTextNode = null; curAttr = null; waitingAttrName = null; curHtmlNode = openEltStack.Pop(); } parseState = 0; } break; case HtmlLexerEvent.VisitOpenAngleExclimation: { //eg. doctype parseState = 10; } break; default: { //1. visit open angle } break; } }
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode) { this._resultHtmlDoc = htmldoc; char[] copyBuffer = textSnapshot.ActualSnapshot.Copy(0, textSnapshot.ActualSnapshot.Length); using (var ms = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(copyBuffer))) using (var textReader = new System.IO.StreamReader(ms)) { var tokenizer = new HtmlTokenizer(textReader); HtmlToken token; while (tokenizer.ReadNextToken(out token)) { switch (token.Kind) { case HtmlTokenKind.Data: { var text = (HtmlDataToken)token; currentNode.AddChild(_resultHtmlDoc.CreateTextNode(text.Data.ToCharArray())); } break; case HtmlTokenKind.Tag: { var tag = (HtmlTagToken)token; if (!tag.IsEndTag) { //open tag DomElement elem = this._resultHtmlDoc.CreateElement(null, tag.Name); currentNode.AddChild(elem); foreach (var attribute in tag.Attributes) { var attr = this._resultHtmlDoc.CreateAttribute(null, attribute.Name); if (attribute.Value != null) { attr.Value = attribute.Value; } elem.AddAttribute(attr); } if (!tag.IsEmptyElement) { openEltStack.Push(currentNode); currentNode = elem; } } else { //this is end tag //check end tag match or not int tagNameIndex = _resultHtmlDoc.AddStringIfNotExists(tag.Name); if (currentNode.Name == tag.Name) { currentNode = openEltStack.Pop(); } else { //if not equal then check if current node need close tag or not int count = 3; //? bool ok = false; while (count > 0) { if (HtmlTagMatching.IsSingleTag(currentNode.LocalNameIndex)) { if (openEltStack.Count > 0) { currentNode = openEltStack.Pop(); } if (currentNode.LocalName == tag.Name) { if (openEltStack.Count > 0) { currentNode = openEltStack.Pop(); ok = true; break; } } } else if (HtmlTagMatching.CanAutoClose(currentNode.LocalNameIndex)) { if (openEltStack.Count > 0) { currentNode = openEltStack.Pop(); } if (currentNode.LocalName == tag.Name) { if (openEltStack.Count > 0) { currentNode = openEltStack.Pop(); ok = true; break; } } } else { //implement err handling here! throw new NotSupportedException(); } count--; } if (!ok) { throw new NotSupportedException(); } } } } break; case HtmlTokenKind.Comment: break; case HtmlTokenKind.DocType: break; default: { } break; } } } }
void LexStateChanged(HtmlLexerEvent lexEvent, int startIndex, int len) { switch (lexEvent) { case HtmlLexerEvent.CommentContent: { //var commentContent = this.textSnapshot.Copy(startIndex, len); } break; case HtmlLexerEvent.FromContentPart: { if (curTextNode == null) { curTextNode = _resultHtmlDoc.CreateTextNode( HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len)); if (curHtmlNode != null) { curHtmlNode.AddChild(curTextNode); } } else { curTextNode.AppendTextContent(HtmlDecodeHelper.DecodeHtml(this.textSnapshot, startIndex, len)); } } break; case HtmlLexerEvent.AttributeValueAsLiteralString: { //assign value and add to parent curAttr.Value = textSnapshot.Substring(startIndex, len); curHtmlNode.AddAttribute(curAttr); } break; case HtmlLexerEvent.Attribute: { string nodename = textSnapshot.Substring(startIndex, len); curAttr = this._resultHtmlDoc.CreateAttribute(null, nodename); } break; case HtmlLexerEvent.NodeNameOrAttribute: { string name = textSnapshot.Substring(startIndex, len); switch (parseState) { case 0: { //create element DomElement elem = this._resultHtmlDoc.CreateElement(null, name); if (curHtmlNode != null) { curHtmlNode.AddChild(elem); openEltStack.Push(curHtmlNode); } curHtmlNode = elem; parseState = 1;//attribute curTextNode = null; curAttr = null; waitingAttrName = null; } break; case 1: { //wait for attr value if (waitingAttrName != null) { //push waiting attr curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName); curAttr.Value = ""; curHtmlNode.AddAttribute(curAttr); curAttr = null; } waitingAttrName = name; } break; case 2: { //**** //node name after open slash //TODO: review here,avoid direct string comparison if (curHtmlNode.LocalName == name) { if (openEltStack.Count > 0) { waitingAttrName = null; curTextNode = null; curAttr = null; curHtmlNode = openEltStack.Pop(); } parseState = 3; } else { //if not equal then check if current node need close tag or not if (HtmlDecodeHelper.IsSingleTag(curHtmlNode.LocalNameIndex)) { if (openEltStack.Count > 0) { waitingAttrName = null; curHtmlNode = openEltStack.Pop(); curAttr = null; curTextNode = null; } if (curHtmlNode.LocalName == name) { if (openEltStack.Count > 0) { curTextNode = null; curAttr = null; curHtmlNode = openEltStack.Pop(); waitingAttrName = null; } parseState = 3; } else { //implement err handling here! throw new NotSupportedException(); } } else { //implement err handling here! throw new NotSupportedException(); } } } break; case 4: { //attribute value as id if (curAttr != null) { curAttr.Value = name; curAttr = null; parseState = 0; waitingAttrName = null; } else { } } break; case 10: { //document node parseState = 11; //after docnodename , this may be attr of the document node this.domDocNode = (DomDocumentNode)this._resultHtmlDoc.CreateDocumentNodeElement(); domDocNode.DocNodeName = name; } break; case 11: { //doc domDocNode.AddParameter(name); } break; default: { } break; } } break; case HtmlLexerEvent.VisitCloseAngle: { //close angle of current new node //enter into its content if (parseState == 11) { //add doctype to html this._resultHtmlDoc.RootNode.AddChild(this.domDocNode); domDocNode = null; } if (waitingAttrName != null) { curAttr = this._resultHtmlDoc.CreateAttribute(null, waitingAttrName); curAttr.Value = ""; curHtmlNode.AddAttribute(curAttr); curAttr = null; } waitingAttrName = null; parseState = 0; curTextNode = null; curAttr = null; } break; case HtmlLexerEvent.VisitAttrAssign: { parseState = 4; } break; case HtmlLexerEvent.VisitOpenSlashAngle: { parseState = 2; } break; case HtmlLexerEvent.VisitCloseSlashAngle: { if (openEltStack.Count > 0) { curTextNode = null; curAttr = null; waitingAttrName = null; curHtmlNode = openEltStack.Pop(); } parseState = 0; } break; case HtmlLexerEvent.VisitOpenAngleExclimation: { parseState = 10; } break; default: { //1. visit open angle } break; } }