Example #1
0
        /// <summary>
        /// Tokenizes the HTML file at <paramref name="path"/>, renders every token as one line of
        /// text, and asserts that the rendering equals the contents of the ".tokens" file that
        /// sits next to the input (same path, extension replaced).
        /// </summary>
        /// <remarks>
        /// When the ".tokens" file does not exist, the expected text is the empty string and the
        /// rendering produced by this run is written to that file before the final comparison.
        /// </remarks>
        /// <param name="path">Path of the HTML file to tokenize.</param>
        static void VerifyHtmlTokenizerOutput(string path)
        {
            var tokensPath = Path.ChangeExtension(path, ".tokens");
            var dump = new StringBuilder();
            string expected;

            if (File.Exists(tokensPath))
                expected = File.ReadAllText(tokensPath);
            else
                expected = string.Empty;

            using (var reader = File.OpenText(path))
            {
                var tokenizer = new HtmlTokenizer(reader);
                HtmlToken current;

                // A freshly created tokenizer is expected to start in the Data state.
                Assert.AreEqual(HtmlTokenizerState.Data, tokenizer.TokenizerState);

                while (tokenizer.ReadNextToken(out current))
                {
                    var kind = current.Kind;

                    // Every rendered line starts with the token kind.
                    dump.AppendFormat("{0}: ", kind);

                    switch (kind)
                    {
                        case HtmlTokenKind.Data:
                        {
                            var data = (HtmlDataToken)current;

                            // Make form feeds, tabs and line breaks visible so that each
                            // data token stays on a single line of the rendering.
                            foreach (var ch in data.Data)
                            {
                                if (ch == '\f')
                                    dump.Append("\\f");
                                else if (ch == '\t')
                                    dump.Append("\\t");
                                else if (ch == '\r')
                                    dump.Append("\\r");
                                else if (ch == '\n')
                                    dump.Append("\\n");
                                else
                                    dump.Append(ch);
                            }

                            dump.AppendLine();
                            break;
                        }
                        case HtmlTokenKind.Tag:
                        {
                            var tag = (HtmlTagToken)current;
                            var slash = tag.IsEndTag ? "/" : "";

                            dump.AppendFormat("<{0}{1}", slash, tag.Name);

                            foreach (var attr in tag.Attributes)
                            {
                                // Attributes without a value are rendered as a bare name.
                                if (attr.Value == null)
                                {
                                    dump.AppendFormat(" {0}", attr.Name);
                                    continue;
                                }

                                dump.AppendFormat(" {0}={1}", attr.Name, Quote(attr.Value));
                            }

                            dump.AppendLine(tag.IsEmptyElement ? "/>" : ">");
                            break;
                        }
                        case HtmlTokenKind.Comment:
                        {
                            dump.AppendLine(((HtmlCommentToken)current).Comment);
                            break;
                        }
                        case HtmlTokenKind.DocType:
                        {
                            var doctype = (HtmlDocTypeToken)current;
                            var publicId = doctype.PublicIdentifier;
                            var systemId = doctype.SystemIdentifier;

                            if (doctype.ForceQuirksMode)
                                dump.Append("<!-- force quirks mode -->");

                            dump.Append("<!DOCTYPE");

                            if (doctype.Name != null)
                                dump.AppendFormat(" {0}", doctype.Name.ToUpperInvariant());

                            if (publicId != null)
                                dump.AppendFormat(" PUBLIC {0}", Quote(publicId));

                            // The SYSTEM keyword is only emitted when no public identifier
                            // precedes the system identifier.
                            if (systemId != null)
                                dump.AppendFormat(publicId != null ? " {0}" : " SYSTEM {0}", Quote(systemId));

                            dump.AppendLine(">");
                            break;
                        }
                        default:
                        {
                            Assert.Fail("Unhandled token type: {0}", kind);
                            break;
                        }
                    }
                }

                // Once no more tokens are returned the tokenizer must report end-of-file.
                Assert.AreEqual(HtmlTokenizerState.EndOfFile, tokenizer.TokenizerState);
            }

            var actual = dump.ToString();

            // First run for this input: record the rendering as the ".tokens" file.
            if (!File.Exists(tokensPath))
                File.WriteAllText(tokensPath, actual);

            Assert.AreEqual(expected, actual, "The token stream does not match the expected tokens.");
        }
Example #2
0
        /// <summary>
        /// Tokenizes the characters of <paramref name="textSnapshot"/> and appends the resulting
        /// text nodes and elements beneath <paramref name="currentNode"/>.
        /// </summary>
        /// <remarks>
        /// Start tags create an element (with its attributes) under the current node; unless the
        /// tag is an empty element, the current node is pushed on <c>openEltStack</c> and the new
        /// element becomes the current node. End tags pop back to the parent. Comment and doctype
        /// tokens are ignored.
        /// </remarks>
        /// <param name="textSnapshot">Source whose <c>ActualSnapshot</c> characters are tokenized.</param>
        /// <param name="htmldoc">
        /// Document used to create text nodes, elements and attributes; it is also stored in
        /// <c>_resultHtmlDoc</c>.
        /// </param>
        /// <param name="currentNode">Element that receives the parsed nodes.</param>
        /// <exception cref="NotSupportedException">
        /// An end tag does not match the current element and the mismatch cannot be resolved by
        /// implicitly closing enclosing single-tag / auto-closable elements.
        /// </exception>
        public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
        {
            // Upper bound on how many recovery rounds are attempted while searching for the
            // element that a mismatched end tag belongs to.
            const int MaxImplicitCloseAttempts = 3;

            this._resultHtmlDoc = htmldoc;
            char[] copyBuffer = textSnapshot.ActualSnapshot.Copy(0, textSnapshot.ActualSnapshot.Length);

            // NOTE(review): the characters are encoded to UTF-8 and decoded again only to obtain
            // a TextReader for the tokenizer; a reader over the characters themselves may be
            // sufficient - confirm before changing, as the decode step may normalize input.
            using (var ms = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(copyBuffer)))
            using (var textReader = new System.IO.StreamReader(ms))
            {
                var tokenizer = new HtmlTokenizer(textReader);
                HtmlToken token;
                while (tokenizer.ReadNextToken(out token))
                {
                    switch (token.Kind)
                    {
                        case HtmlTokenKind.Data:
                            {
                                var text = (HtmlDataToken)token;
                                currentNode.AddChild(_resultHtmlDoc.CreateTextNode(text.Data.ToCharArray()));
                            }
                            break;
                        case HtmlTokenKind.Tag:
                            {
                                var tag = (HtmlTagToken)token;
                                if (!tag.IsEndTag)
                                {
                                    // Start tag: create the element and copy its attributes.
                                    DomElement elem = this._resultHtmlDoc.CreateElement(null, tag.Name);
                                    currentNode.AddChild(elem);
                                    foreach (var attribute in tag.Attributes)
                                    {
                                        var attr = this._resultHtmlDoc.CreateAttribute(null, attribute.Name);
                                        if (attribute.Value != null)
                                        {
                                            attr.Value = attribute.Value;
                                        }
                                        elem.AddAttribute(attr);
                                    }

                                    // Only elements that can have content become the new current node.
                                    if (!tag.IsEmptyElement)
                                    {
                                        openEltStack.Push(currentNode);
                                        currentNode = elem;
                                    }
                                    break;
                                }

                                // End tag. The returned index is not needed here; the call is kept
                                // because it may register the name with the document.
                                _resultHtmlDoc.AddStringIfNotExists(tag.Name);

                                // NOTE(review): this comparison uses Name while the recovery loop
                                // below uses LocalName - confirm the difference is intentional.
                                if (currentNode.Name == tag.Name)
                                {
                                    // NOTE(review): no emptiness check here; presumably Pop throws
                                    // when an end tag matches the node Parse was started with.
                                    currentNode = openEltStack.Pop();
                                    break;
                                }

                                // Mismatched end tag: implicitly close enclosing elements that do
                                // not require an explicit end tag until the matching one is found.
                                bool matched = false;
                                int attemptsLeft = MaxImplicitCloseAttempts;
                                while (attemptsLeft > 0)
                                {
                                    if (!(HtmlTagMatching.IsSingleTag(currentNode.LocalNameIndex)
                                        || HtmlTagMatching.CanAutoClose(currentNode.LocalNameIndex)))
                                    {
                                        // The open element needs an explicit end tag, so the
                                        // document cannot be repaired by this simple strategy.
                                        throw new NotSupportedException(
                                            "Unexpected end tag </" + tag.Name + ">: the open element <"
                                            + currentNode.LocalName + "> cannot be closed implicitly.");
                                    }

                                    // Implicitly close the current element.
                                    if (openEltStack.Count > 0)
                                    {
                                        currentNode = openEltStack.Pop();
                                    }

                                    // If the parent is the element being closed, close it as well.
                                    if (currentNode.LocalName == tag.Name && openEltStack.Count > 0)
                                    {
                                        currentNode = openEltStack.Pop();
                                        matched = true;
                                        break;
                                    }

                                    attemptsLeft--;
                                }

                                if (!matched)
                                {
                                    throw new NotSupportedException(
                                        "Could not match end tag </" + tag.Name + "> to an open element.");
                                }
                            }
                            break;
                        case HtmlTokenKind.Comment:
                        case HtmlTokenKind.DocType:
                        default:
                            // Comments, doctypes and any other token kinds are ignored.
                            break;
                    }
                }
            }
        }