/// <summary>
/// Tokenizes the HTML file at <paramref name="path"/>, renders every token as one line of
/// text, and asserts that the rendering equals the contents of the sibling ".tokens" file.
/// </summary>
/// <remarks>
/// When the ".tokens" file does not exist, the rendering is written to it and the final
/// comparison is made against an empty string.
/// </remarks>
/// <param name="path">Path of the HTML input file to tokenize.</param>
static void VerifyHtmlTokenizerOutput(string path)
{
    string baselinePath = Path.ChangeExtension(path, ".tokens");
    string expected = File.Exists(baselinePath) ? File.ReadAllText(baselinePath) : string.Empty;
    var dump = new StringBuilder();

    using (var reader = File.OpenText(path))
    {
        var tokenizer = new HtmlTokenizer(reader);
        HtmlToken token;

        // The tokenizer is asserted to be in the Data state before the first read.
        Assert.AreEqual(HtmlTokenizerState.Data, tokenizer.TokenizerState);

        while (tokenizer.ReadNextToken(out token))
        {
            // Every output line is prefixed with the token kind.
            dump.AppendFormat("{0}: ", token.Kind);

            switch (token.Kind)
            {
                case HtmlTokenKind.Data:
                {
                    var dataToken = (HtmlDataToken)token;

                    // Escape form feed, tab, CR and LF so the token stays on a single line.
                    foreach (char ch in dataToken.Data)
                    {
                        if (ch == '\f')
                            dump.Append("\\f");
                        else if (ch == '\t')
                            dump.Append("\\t");
                        else if (ch == '\r')
                            dump.Append("\\r");
                        else if (ch == '\n')
                            dump.Append("\\n");
                        else
                            dump.Append(ch);
                    }

                    dump.AppendLine();
                    break;
                }

                case HtmlTokenKind.Tag:
                {
                    var tagToken = (HtmlTagToken)token;
                    string slash = tagToken.IsEndTag ? "/" : "";

                    dump.AppendFormat("<{0}{1}", slash, tagToken.Name);

                    foreach (var attribute in tagToken.Attributes)
                    {
                        // Attributes without a value are rendered as the bare name.
                        if (attribute.Value == null)
                            dump.AppendFormat(" {0}", attribute.Name);
                        else
                            dump.AppendFormat(" {0}={1}", attribute.Name, Quote(attribute.Value));
                    }

                    dump.AppendLine(tagToken.IsEmptyElement ? "/>" : ">");
                    break;
                }

                case HtmlTokenKind.Comment:
                {
                    var commentToken = (HtmlCommentToken)token;

                    dump.AppendLine(commentToken.Comment);
                    break;
                }

                case HtmlTokenKind.DocType:
                {
                    var doctypeToken = (HtmlDocTypeToken)token;
                    bool hasPublicId = doctypeToken.PublicIdentifier != null;

                    if (doctypeToken.ForceQuirksMode)
                        dump.Append("<!-- force quirks mode -->");

                    dump.Append("<!DOCTYPE");

                    if (doctypeToken.Name != null)
                        dump.AppendFormat(" {0}", doctypeToken.Name.ToUpperInvariant());

                    if (hasPublicId)
                        dump.AppendFormat(" PUBLIC {0}", Quote(doctypeToken.PublicIdentifier));

                    // The SYSTEM keyword is only emitted when no public identifier precedes it.
                    if (doctypeToken.SystemIdentifier != null)
                        dump.AppendFormat(hasPublicId ? " {0}" : " SYSTEM {0}", Quote(doctypeToken.SystemIdentifier));

                    dump.AppendLine(">");
                    break;
                }

                default:
                    Assert.Fail("Unhandled token type: {0}", token.Kind);
                    break;
            }
        }

        // Once ReadNextToken returns false the tokenizer is asserted to be at end of file.
        Assert.AreEqual(HtmlTokenizerState.EndOfFile, tokenizer.TokenizerState);
    }

    // Record the rendering as the baseline when none exists yet.
    if (!File.Exists(baselinePath))
        File.WriteAllText(baselinePath, dump.ToString());

    Assert.AreEqual(expected, dump.ToString(), "The token stream does not match the expected tokens.");
}
/// <summary>
/// Tokenizes the text held by <paramref name="textSnapshot"/> and adds the resulting text
/// nodes and elements to the tree, starting at <paramref name="currentNode"/>.
/// </summary>
/// <remarks>
/// Data tokens become text nodes and tag tokens become elements with their attributes.
/// Every other token kind (comments and doctypes included) is ignored. Ancestors of the
/// node currently being filled are kept on <c>openEltStack</c>, which is declared outside
/// this method.
/// </remarks>
/// <param name="textSnapshot">Source text to parse.</param>
/// <param name="htmldoc">Document used to create nodes; also stored in <c>_resultHtmlDoc</c>.</param>
/// <param name="currentNode">Node that receives the first parsed children.</param>
/// <exception cref="NotSupportedException">
/// An end tag does not match the current node and either the current node is neither a
/// single tag nor auto-closable, or no matching ancestor was found within three attempts.
/// </exception>
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    this._resultHtmlDoc = htmldoc;

    // The tokenizer consumes a TextReader, so the characters are copied out of the snapshot
    // and exposed through a UTF-8 byte stream wrapped in a StreamReader.
    char[] sourceChars = textSnapshot.ActualSnapshot.Copy(0, textSnapshot.ActualSnapshot.Length);

    using (var byteStream = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(sourceChars)))
    using (var reader = new System.IO.StreamReader(byteStream))
    {
        var tokenizer = new HtmlTokenizer(reader);
        HtmlToken token;

        while (tokenizer.ReadNextToken(out token))
        {
            HtmlTokenKind kind = token.Kind;

            if (kind == HtmlTokenKind.Data)
            {
                var dataToken = (HtmlDataToken)token;

                currentNode.AddChild(_resultHtmlDoc.CreateTextNode(dataToken.Data.ToCharArray()));
                continue;
            }

            if (kind != HtmlTokenKind.Tag)
            {
                // Comments, doctypes and any other token kinds produce no nodes.
                continue;
            }

            var tag = (HtmlTagToken)token;

            if (!tag.IsEndTag)
            {
                // Open tag: create the element, attach it, then copy its attributes over.
                DomElement elem = this._resultHtmlDoc.CreateElement(null, tag.Name);

                currentNode.AddChild(elem);

                foreach (var attribute in tag.Attributes)
                {
                    var attr = this._resultHtmlDoc.CreateAttribute(null, attribute.Name);

                    if (attribute.Value != null)
                    {
                        attr.Value = attribute.Value;
                    }

                    elem.AddAttribute(attr);
                }

                // Only elements not marked as empty become the new insertion point.
                if (!tag.IsEmptyElement)
                {
                    openEltStack.Push(currentNode);
                    currentNode = elem;
                }

                continue;
            }

            // End tag. The call's return value is not used by the code below.
            _resultHtmlDoc.AddStringIfNotExists(tag.Name);

            if (currentNode.Name == tag.Name)
            {
                // Matches the current node: step back up to its parent.
                currentNode = openEltStack.Pop();
                continue;
            }

            // Mismatch: close single/auto-closable nodes and look for the matching
            // element, giving up after three attempts.
            bool matched = false;

            for (int attemptsLeft = 3; attemptsLeft > 0; attemptsLeft--)
            {
                bool closableWithoutEndTag =
                    HtmlTagMatching.IsSingleTag(currentNode.LocalNameIndex) ||
                    HtmlTagMatching.CanAutoClose(currentNode.LocalNameIndex);

                if (!closableWithoutEndTag)
                {
                    // No error recovery is implemented for this case.
                    throw new NotSupportedException();
                }

                if (openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                }

                // A match only counts when a parent is available to step back to.
                if (currentNode.LocalName == tag.Name && openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                throw new NotSupportedException();
            }
        }
    }
}