/// <summary>
/// Parses the given source code into a list of nodes, following
/// 8.4 Parsing HTML fragments.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <param name="context">[Optional] The context node to use.</param>
/// <param name="configuration">[Optional] Custom options to use for the document generation. Configuration.Default is used when null.</param>
/// <returns>A list of parsed nodes.</returns>
public static NodeList HtmlFragment(String sourceCode, Node context = null, IConfiguration configuration = null)
{
    configuration = configuration ?? Configuration.Default;

    var manager = new SourceManager(sourceCode, configuration.DefaultEncoding());
    var document = new HTMLDocument { Options = configuration };

    // Scripting is disabled for HTML fragments (security reasons).
    // NOTE(review): the flag is written to the configuration instance that was
    // passed in (or to Configuration.Default) - confirm that this is intended.
    configuration.IsScripting = false;

    var builder = new DocumentBuilder(manager, document, configuration);

    // Without a context node the children of the built result are returned directly.
    if (context == null)
    {
        return builder.HtmlResult.ChildNodes;
    }

    // Take over the quirks mode of the context's owner document unless it is switched off.
    if (context.OwnerDocument != null && context.OwnerDocument.QuirksMode != QuirksMode.Off)
    {
        document.QuirksMode = context.OwnerDocument.QuirksMode;
    }

    var fragmentParser = (HtmlParser)builder.parser;
    fragmentParser.SwitchToFragment(context);
    return fragmentParser.Result.DocumentElement.ChildNodes;
}
/// <summary>
/// Tokenizes a DOCTYPE with an internal subset and checks the doctype name,
/// the missing system identifier and the number of DTD entries.
/// </summary>
public void NewspaperDtdComplete()
{
    var markup = @"<!DOCTYPE NEWSPAPER [ <!ELEMENT NEWSPAPER (ARTICLE+)> <!ELEMENT ARTICLE (HEADLINE,BYLINE,LEAD,BODY,NOTES)> <!ELEMENT HEADLINE (#PCDATA)> <!ELEMENT BYLINE (#PCDATA)> <!ELEMENT LEAD (#PCDATA)> <!ELEMENT BODY (#PCDATA)> <!ELEMENT NOTES (#PCDATA)> <!ATTLIST ARTICLE AUTHOR CDATA #REQUIRED> <!ATTLIST ARTICLE EDITOR CDATA #IMPLIED> <!ATTLIST ARTICLE DATE CDATA #IMPLIED> <!ATTLIST ARTICLE EDITION CDATA #IMPLIED> <!ENTITY NEWSPAPER ""Vervet Logic Times""> <!ENTITY PUBLISHER ""Vervet Logic Press""> <!ENTITY COPYRIGHT 'Copyright 1998 Vervet Logic Press'> ]>";
    var source = new SourceManager(markup);
    var tokenizer = new XmlTokenizer(source);
    tokenizer.DTD.Reset();

    var token = tokenizer.Get();
    Assert.AreEqual(XmlTokenType.DOCTYPE, token.Type);

    var doctype = (XmlDoctypeToken)token;
    Assert.IsFalse(doctype.IsNameMissing);
    Assert.AreEqual("NEWSPAPER", doctype.Name);
    Assert.IsTrue(doctype.IsSystemIdentifierMissing);

    // 7 ELEMENT + 4 ATTLIST + 3 ENTITY declarations in the subset above.
    Assert.AreEqual(14, tokenizer.DTD.Count);
}
/// <summary>
/// Checks that the first attribute of the tag token for "&lt;input required&gt;"
/// has the key "required".
/// </summary>
public void TokenizationAttributeNameDetection()
{
    var source = new SourceManager("<input required>");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual("required", tag.Attributes[0].Key);
}
/// <summary>
/// Checks that the first token of an empty XML source equals XmlToken.EOF.
/// </summary>
public void EmptyXmlDocumentTokenization()
{
    var tokenizer = new XmlTokenizer(new SourceManager(""));
    var first = tokenizer.Get();
    Assert.AreEqual(XmlToken.EOF, first);
}
/// <summary>
/// Checks that a start tag written with three attributes yields a tag token
/// holding exactly three attributes.
/// </summary>
public void TokenizationAttributesDetected()
{
    var source = new SourceManager("<a target='_blank' href='http://whatever' title='ho'>");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual(3, tag.Attributes.Count);
}
/// <summary>
/// Creates an XMLDocument from the given source code string.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <returns>The constructed XML document.</returns>
public static XMLDocument Xml(String sourceCode)
{
    // The builder is fed with a fresh source manager and an empty document.
    var builder = new DocumentBuilder(new SourceManager(sourceCode), new XMLDocument());
    return builder.XmlResult;
}
/// <summary>
/// Creates a CSSStyleSheet from the given network stream.
/// </summary>
/// <param name="networkStream">The stream of chars to use as source code.</param>
/// <returns>The constructed CSS stylesheet.</returns>
public static CSSStyleSheet Css(Stream networkStream)
{
    // The builder is fed with a source manager over the stream and an empty sheet.
    var builder = new DocumentBuilder(new SourceManager(networkStream), new CSSStyleSheet());
    return builder.CssResult;
}
/// <summary>
/// Creates a CSSStyleSheet from the given source code string.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <returns>The constructed CSS stylesheet.</returns>
public static CSSStyleSheet Css(String sourceCode)
{
    // The builder is fed with a source manager over the string and an empty sheet.
    var builder = new DocumentBuilder(new SourceManager(sourceCode), new CSSStyleSheet());
    return builder.CssResult;
}
/// <summary>
/// Parses the given source code into a list of nodes, following
/// 8.4 Parsing HTML fragments.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <param name="context">The context node to use.</param>
/// <returns>A list of parsed nodes.</returns>
public static NodeList HtmlFragment(String sourceCode, Node context = null)
{
    var manager = new SourceManager(sourceCode);
    var document = new HTMLDocument();
    var builder = new DocumentBuilder(manager, document);

    // Without a context node the children of the built result are returned directly.
    if (context == null)
    {
        return builder.HtmlResult.ChildNodes;
    }

    // Take over the quirks mode of the context's owner document unless it is switched off.
    if (context.OwnerDocument != null && context.OwnerDocument.QuirksMode != QuirksMode.Off)
    {
        document.QuirksMode = context.OwnerDocument.QuirksMode;
    }

    // Note from the specification: for performance reasons, an implementation that does
    // not report errors and that uses the described state machine directly could use the
    // PLAINTEXT state instead of the RAWTEXT and script data states. Except for the rules
    // regarding parse errors they are equivalent, since there is no appropriate end tag
    // token in the fragment case, yet they involve far fewer state transitions.
    var fragmentParser = (HtmlParser)builder.parser;
    fragmentParser.SwitchToFragment(context);
    return builder.HtmlResult.DocumentElement.ChildNodes;
}
/// <summary>
/// Creates an HTMLDocument from the given network stream.
/// </summary>
/// <param name="networkStream">The stream of chars to use as source code.</param>
/// <returns>The constructed HTML document.</returns>
public static HTMLDocument Html(Stream networkStream)
{
    // The builder is fed with a source manager over the stream and an empty document.
    var builder = new DocumentBuilder(new SourceManager(networkStream), new HTMLDocument());
    return builder.HtmlResult;
}
/// <summary>
/// Creates a CSSStyleSheet from the content behind the given URL.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <returns>The constructed CSS stylesheet.</returns>
public static CSSStyleSheet Css(Uri url)
{
    // The stream for the URL is obtained through Builder.Stream.
    var content = Builder.Stream(url);
    var builder = new DocumentBuilder(new SourceManager(content), new CSSStyleSheet());
    return builder.CssResult;
}
/// <summary>
/// Checks that "&lt;/ &gt;" is tokenized as a comment token whose data is a single space.
/// </summary>
public void TokenizationBogusCommentClosingTag()
{
    var source = new SourceManager("</ >");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(HtmlTokenType.Comment, first.Type);

    var comment = (HtmlCommentToken)first;
    Assert.AreEqual(" ", comment.Data);
}
/// <summary>
/// Checks that an XML declaration carrying only a version is tokenized as a
/// declaration token with version "1.0".
/// </summary>
public void ValidXmlDeclarationOnlyVersion()
{
    var source = new SourceManager("<?xml version=\"1.0\"?>");
    var tokenizer = new XmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(XmlTokenType.Declaration, first.Type);

    var declaration = (XmlDeclarationToken)first;
    Assert.AreEqual("1.0", declaration.Version);
}
/// <summary>
/// Creates an HTMLDocument from the content behind the given URL.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <returns>The constructed HTML document.</returns>
public static HTMLDocument Html(Uri url)
{
    // The stream for the URL is obtained through Builder.Stream.
    var content = Builder.Stream(url);
    var builder = new DocumentBuilder(new SourceManager(content), new HTMLDocument());
    return builder.HtmlResult;
}
/// <summary>
/// Checks that "&lt;!&gt;" is tokenized as a comment token with empty data.
/// </summary>
public void TokenizationBogusCommentEmpty()
{
    var source = new SourceManager("<!>");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(HtmlTokenType.Comment, first.Type);

    var comment = (HtmlCommentToken)first;
    Assert.AreEqual(String.Empty, comment.Data);
}
/// <summary>
/// Checks that "&lt;?&gt;" is tokenized as a comment token whose data is "?".
/// </summary>
public void TokenizationBogusCommentQuestionMark()
{
    var source = new SourceManager("<?>");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(HtmlTokenType.Comment, first.Type);

    var comment = (HtmlCommentToken)first;
    Assert.AreEqual("?", comment.Data);
}
/// <summary>
/// Checks that a single XML comment is tokenized as a comment token
/// carrying the text between the comment delimiters.
/// </summary>
public void OneCommentInXmlDocument()
{
    var text = "My comment";
    var source = new SourceManager("<!--" + text + "-->");
    var tokenizer = new XmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(XmlTokenType.Comment, first.Type);

    var comment = (XmlCommentToken)first;
    Assert.AreEqual(text, comment.Data);
}
/// <summary>
/// Sets up a builder that fills an HTML document from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="document">The document to fill.</param>
/// <param name="options">Options to use for the document generation.</param>
DocumentBuilder(SourceManager source, HTMLDocument document, DocumentOptions options)
{
    document.Options = options;

    parser = new HtmlParser(document, source);
    parser.ErrorOccurred += ParseErrorOccurred;

    // An error handler supplied through the options is subscribed in addition
    // to the builder's own handler.
    if (options.OnError != null)
    {
        parser.ErrorOccurred += options.OnError;
    }
}
/// <summary>
/// Sets up a builder that fills a CSS stylesheet from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="sheet">The stylesheet to fill.</param>
/// <param name="options">Options to use for the document generation.</param>
DocumentBuilder(SourceManager source, CSSStyleSheet sheet, DocumentOptions options)
{
    sheet.Options = options;

    parser = new CssParser(sheet, source);
    parser.ErrorOccurred += ParseErrorOccurred;

    // An error handler supplied through the options is subscribed in addition
    // to the builder's own handler.
    if (options.OnError != null)
    {
        parser.ErrorOccurred += options.OnError;
    }
}
/// <summary>
/// Checks that an XML declaration with version and encoding is tokenized as a
/// declaration token exposing both values.
/// </summary>
public void ValidXmlDeclarationVersionAndEncoding()
{
    var source = new SourceManager("<?xml version=\"1.1\" encoding=\"utf-8\" ?>");
    var tokenizer = new XmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(XmlTokenType.Declaration, first.Type);

    var declaration = (XmlDeclarationToken)first;
    Assert.AreEqual("1.1", declaration.Version);
    Assert.IsFalse(declaration.IsEncodingMissing);
    Assert.AreEqual("utf-8", declaration.Encoding);
}
/// <summary>
/// Checks that a DOCTYPE with a SYSTEM identifier is tokenized as a doctype
/// token exposing the root element name and the system identifier.
/// </summary>
public void OneDoctypeInXmlDocument()
{
    var source = new SourceManager("<!DOCTYPE root_element SYSTEM \"DTD_location\">");
    var tokenizer = new XmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(XmlTokenType.DOCTYPE, first.Type);

    var doctype = (XmlDoctypeToken)first;
    Assert.IsFalse(doctype.IsNameMissing);
    Assert.AreEqual("root_element", doctype.Name);
    Assert.IsFalse(doctype.IsSystemIdentifierMissing);
    Assert.AreEqual("DTD_location", doctype.SystemIdentifier);
}
/// <summary>
/// Checks that an XML declaration with version, encoding and standalone is
/// tokenized as a declaration token exposing all three values.
/// </summary>
public void ValidXmlDeclarationEverything()
{
    var source = new SourceManager("<?xml version='1.0' encoding='ISO-8859-1' standalone=\"yes\" ?>");
    var tokenizer = new XmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(XmlTokenType.Declaration, first.Type);

    var declaration = (XmlDeclarationToken)first;
    Assert.AreEqual("1.0", declaration.Version);
    Assert.IsFalse(declaration.IsEncodingMissing);
    Assert.AreEqual("ISO-8859-1", declaration.Encoding);
    Assert.AreEqual(true, declaration.Standalone);
}
/// <summary>
/// Creates a CSSStyleSheet from the given network stream.
/// </summary>
/// <param name="stream">The stream of chars to use as source code.</param>
/// <param name="configuration">[Optional] Custom options to use for the document generation. Configuration.Default is used when null.</param>
/// <returns>The constructed CSS stylesheet.</returns>
public static CSSStyleSheet Css(Stream stream, IConfiguration configuration = null)
{
    configuration = configuration ?? Configuration.Default;

    // The stream is read with the configuration's default encoding.
    var manager = new SourceManager(stream, configuration.DefaultEncoding());

    var styleSheet = new CSSStyleSheet();
    styleSheet.Options = configuration;

    var builder = new DocumentBuilder(manager, styleSheet, configuration);
    return builder.CssResult;
}
/// <summary>
/// Creates an HTMLDocument from the given source code string.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <param name="configuration">[Optional] Custom options to use for the document generation. Configuration.Default is used when null.</param>
/// <returns>The constructed HTML document.</returns>
public static HTMLDocument Html(String sourceCode, IConfiguration configuration = null)
{
    configuration = configuration ?? Configuration.Default;

    var manager = new SourceManager(sourceCode, configuration.DefaultEncoding());

    var document = new HTMLDocument();
    document.Options = configuration;

    var builder = new DocumentBuilder(manager, document, configuration);
    return builder.HtmlResult;
}
/// <summary>
/// Requests the given URL asynchronously and creates an HTMLDocument from the response.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <param name="cancel">The cancellation token that is handed to the load operation.</param>
/// <param name="configuration">[Optional] Custom options to use for the document generation. Configuration.Default is used when null.</param>
/// <returns>The task that constructs the HTML document.</returns>
public static async Task<HTMLDocument> HtmlAsync(Uri url, CancellationToken cancel, IConfiguration configuration = null)
{
    configuration = configuration ?? Configuration.Default;

    var content = await configuration.LoadAsync(url, cancel, force: true);
    var manager = new SourceManager(content, configuration.DefaultEncoding());

    var document = new HTMLDocument();
    document.Options = configuration;
    document.DocumentUri = url.OriginalString;

    var builder = new DocumentBuilder(manager, document, configuration);

    // Parsing is awaited before the result is handed out.
    await builder.parser.ParseAsync();
    return builder.HtmlResult;
}
/// <summary>
/// Requests the given URL asynchronously and creates a CSSStyleSheet from the response.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <param name="cancel">The cancellation token that is handed to the load operation.</param>
/// <param name="configuration">[Optional] Custom options to use for the document generation. Configuration.Default is used when null.</param>
/// <returns>The task which constructs the CSS stylesheet.</returns>
public static async Task<CSSStyleSheet> CssAsync(Uri url, CancellationToken cancel, IConfiguration configuration = null)
{
    configuration = configuration ?? Configuration.Default;

    var content = await configuration.LoadAsync(url, cancel, force: true);
    var manager = new SourceManager(content, configuration.DefaultEncoding());

    var styleSheet = new CSSStyleSheet();
    styleSheet.Href = url.OriginalString;
    styleSheet.Options = configuration;

    var builder = new DocumentBuilder(manager, styleSheet, configuration);

    // Parsing is awaited before the result is handed out.
    await builder.parser.ParseAsync();
    return builder.CssResult;
}
/// <summary>
/// Checks that, with AcceptsCharacterData enabled, the character tokens emitted
/// for a CDATA section spell out the section's content.
/// </summary>
public void TokenizationCDataCorrectCharacters()
{
    var source = new SourceManager("<![CDATA[hi mum how <!-- are you doing />]]>");
    var tokenizer = new HtmlTokenizer(source);
    tokenizer.AcceptsCharacterData = true;

    var collected = new StringBuilder();
    HtmlToken current;

    // Drain the tokenizer up to and including EOF, keeping only single character tokens.
    do
    {
        current = tokenizer.Get();

        if (current.Type == HtmlTokenType.Character)
        {
            collected.Append(((HtmlCharacterToken)current).Data);
        }
    }
    while (current != HtmlToken.EOF);

    Assert.AreEqual("hi mum how <!-- are you doing />", collected.ToString());
}
/// <summary>
/// Tokenizes a DOCTYPE whose internal subset is built from a separate string and
/// checks the doctype name, the DTD entry count and the reported internal subset.
/// </summary>
public void TVScheduleDtdSubset()
{
    var dtd = @"<!ELEMENT TVSCHEDULE (CHANNEL+)> <!ELEMENT CHANNEL (BANNER,DAY+)> <!ELEMENT BANNER (#PCDATA)> <!ELEMENT DAY (DATE,(HOLIDAY|PROGRAMSLOT+)+)> <!ELEMENT HOLIDAY (#PCDATA)> <!ELEMENT DATE (#PCDATA)> <!ELEMENT PROGRAMSLOT (TIME,TITLE,DESCRIPTION?)> <!ELEMENT TIME (#PCDATA)> <!ELEMENT TITLE (#PCDATA)> <!ELEMENT DESCRIPTION (#PCDATA)> <!ATTLIST TVSCHEDULE NAME CDATA #REQUIRED> <!ATTLIST CHANNEL CHAN CDATA #REQUIRED> <!ATTLIST PROGRAMSLOT VTR CDATA #IMPLIED> <!ATTLIST TITLE RATING CDATA #IMPLIED> <!ATTLIST TITLE LANGUAGE CDATA #IMPLIED>";
    var source = new SourceManager("<!DOCTYPE TVSCHEDULE [" + dtd + "]>");
    var tokenizer = new XmlTokenizer(source);
    tokenizer.DTD.Reset();

    var first = tokenizer.Get();
    Assert.AreEqual(XmlTokenType.DOCTYPE, first.Type);

    var doctype = (XmlDoctypeToken)first;
    Assert.IsFalse(doctype.IsNameMissing);
    Assert.AreEqual("TVSCHEDULE", doctype.Name);
    Assert.IsTrue(doctype.IsSystemIdentifierMissing);

    // 10 ELEMENT + 5 ATTLIST declarations in the subset above.
    Assert.AreEqual(15, tokenizer.DTD.Count);

    // Any \r\n line break in the literal is normalized to \n before comparing.
    // Per the original author's note, the W3C specification requires \n for
    // newlines with \r omitted completely.
    var expectedSubset = dtd.Replace("\r\n", "\n");
    Assert.AreEqual(expectedSubset, doctype.InternalSubset);
}
/// <summary>
/// Checks that the first token produced for a CDATA section has the
/// type HtmlTokenType.Characters.
/// </summary>
public void TokenizationCDataDetected()
{
    var source = new SourceManager("<![CDATA[hi mum how <!-- are you doing />]]>");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();
    Assert.AreEqual(HtmlTokenType.Characters, first.Type);
}
/// <summary>
/// Creates a tokenizer that stores the given source and starts with an
/// empty string buffer.
/// </summary>
/// <param name="source">The source manager to store.</param>
public BaseTokenizer(SourceManager source)
{
    stringBuffer = new StringBuilder();
    src = source;
}
/// <summary>
/// Sets up a builder that fills an XML document from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="document">The document to fill.</param>
DocumentBuilder(SourceManager source, XMLDocument document)
{
    parser = new XmlParser(document, source);

    // Errors raised by the parser are routed to the builder's own handler.
    parser.ErrorOccurred += ParseErrorOccurred;
}
/// <summary>
/// Sets up a builder that fills a CSS stylesheet from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="sheet">The stylesheet to fill.</param>
/// <param name="configuration">Options to use for the document generation.</param>
DocumentBuilder(SourceManager source, CSSStyleSheet sheet, IConfiguration configuration)
{
    sheet.Options = configuration;

    parser = new CssParser(sheet, source);

    // Parse errors are forwarded to the configuration.
    parser.ParseError += (sender, error) => configuration.ReportError(error);
}
/// <summary>
/// Creates a CSSStyleSheet from the given network stream.
/// </summary>
/// <param name="stream">The stream of chars to use as source code.</param>
/// <param name="options">[Optional] Options to use for the document generation. DocumentOptions.Default is used when null.</param>
/// <returns>The constructed CSS stylesheet.</returns>
public static CSSStyleSheet Css(Stream stream, DocumentOptions options = null)
{
    var manager = new SourceManager(stream);
    var styleSheet = new CSSStyleSheet();
    var settings = options ?? DocumentOptions.Default;

    var builder = new DocumentBuilder(manager, styleSheet, settings);
    return builder.CssResult;
}
/// <summary>
/// Parses the given source code into a list of nodes, following
/// 8.4 Parsing HTML fragments.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <param name="context">[Optional] The context node to use.</param>
/// <param name="options">[Optional] Options to use for the document generation. When null, options created with scripting: false are used.</param>
/// <returns>A list of parsed nodes.</returns>
public static NodeList HtmlFragment(String sourceCode, Node context = null, DocumentOptions options = null)
{
    var manager = new SourceManager(sourceCode);
    var document = new HTMLDocument();

    // Scripting is disabled for HTML fragments (security reasons). This only applies
    // to the fallback instance; options supplied by the caller are used as they are.
    if (options == null)
    {
        options = new DocumentOptions(scripting: false);
    }

    var builder = new DocumentBuilder(manager, document, options);

    // Without a context node the children of the built result are returned directly.
    if (context == null)
    {
        return builder.HtmlResult.ChildNodes;
    }

    // Take over the quirks mode of the context's owner document unless it is switched off.
    if (context.OwnerDocument != null && context.OwnerDocument.QuirksMode != QuirksMode.Off)
    {
        document.QuirksMode = context.OwnerDocument.QuirksMode;
    }

    var fragmentParser = (HtmlParser)builder.parser;
    fragmentParser.SwitchToFragment(context);
    return fragmentParser.Result.DocumentElement.ChildNodes;
}
/// <summary>
/// Checks that the tag token for "&lt;InpUT&gt;" reports the name "input".
/// </summary>
public void TokenizationTagMixedCaseHandling()
{
    var source = new SourceManager("<InpUT>");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual("input", tag.Name);
}
/// <summary>
/// Checks that "&lt;p&gt;" is tokenized as a start tag token named "p".
/// </summary>
public void TokenizationStartTagDetection()
{
    var source = new SourceManager("<p>");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();

    Assert.AreEqual(HtmlTokenType.StartTag, first.Type);

    var tag = (HtmlTagToken)first;
    Assert.AreEqual("p", tag.Name);
}
/// <summary>
/// Checks that the first token of an empty HTML source has the type EOF.
/// </summary>
public void TokenizationFinalEOF()
{
    var tokenizer = new HtmlTokenizer(new SourceManager(""));
    var first = tokenizer.Get();
    Assert.AreEqual(HtmlTokenType.EOF, first.Type);
}
/// <summary>
/// Checks that "&lt;!doctype html&gt;" is tokenized as a DOCTYPE token.
/// </summary>
public void TokenizationDoctypeDetected()
{
    var source = new SourceManager("<!doctype html>");
    var tokenizer = new HtmlTokenizer(source);
    var first = tokenizer.Get();
    Assert.AreEqual(HtmlTokenType.DOCTYPE, first.Type);
}
/// <summary>
/// Checks that the named character reference "&amp;notin;" is decoded by the
/// tokenizer, so that the concatenated character data reads "I'm ∉ I tell you".
/// </summary>
public void TokenizationCharacterReferenceNotin()
{
    var str = string.Empty;

    // The input has to contain the character reference itself. With the already
    // decoded character in the input, the test would pass without the tokenizer
    // ever resolving a reference.
    var src = "I'm &notin; I tell you";
    var s = new SourceManager(src);
    var t = new HtmlTokenizer(s);
    HtmlToken token;

    // Drain the tokenizer up to and including EOF, collecting all character data.
    do
    {
        token = t.Get();

        if (token.Type == HtmlTokenType.Character)
        {
            str += ((HtmlCharacterToken)token).Data;
        }
        else if (token.Type == HtmlTokenType.Characters)
        {
            str += new String(((HtmlCharactersToken)token).Data);
        }
    }
    while (token != HtmlToken.EOF);

    Assert.AreEqual("I'm ∉ I tell you", str);
}
/// <summary>
/// Requests the given URL asynchronously and creates an HTMLDocument from the response.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <param name="options">[Optional] Options to use for the document generation. DocumentOptions.Default is used when null.</param>
/// <returns>The task that constructs the HTML document.</returns>
public static async Task<HTMLDocument> HtmlAsync(Uri url, DocumentOptions options = null)
{
    var content = await Builder.GetFromUrl(url);
    var manager = new SourceManager(content);

    var document = new HTMLDocument();
    document.DocumentURI = url.OriginalString;

    var settings = options ?? DocumentOptions.Default;
    var builder = new DocumentBuilder(manager, document, settings);

    // Parsing is awaited before the result is handed out.
    await builder.parser.ParseAsync();
    return builder.HtmlResult;
}
/// <summary>
/// Checks that the tag token for "&lt;img /&gt;" is flagged as self-closing.
/// </summary>
public void TokenizationTagSelfClosingDetected()
{
    var source = new SourceManager("<img />");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual(true, tag.IsSelfClosing);
}
/// <summary>
/// Requests the given URL asynchronously and creates a CSSStyleSheet from the response.
/// </summary>
/// <param name="url">The URL which points to the address containing the source code.</param>
/// <param name="options">[Optional] Options to use for the document generation. DocumentOptions.Default is used when null.</param>
/// <returns>The task which constructs the CSS stylesheet.</returns>
public static async Task<CSSStyleSheet> CssAsync(Uri url, DocumentOptions options = null)
{
    var content = await Builder.GetFromUrl(url);
    var manager = new SourceManager(content);

    var styleSheet = new CSSStyleSheet();
    styleSheet.Href = url.OriginalString;

    var settings = options ?? DocumentOptions.Default;
    var builder = new DocumentBuilder(manager, styleSheet, settings);

    // Parsing is awaited before the result is handed out.
    await builder.parser.ParseAsync();
    return builder.CssResult;
}
/// <summary>
/// Checks that trailing spaces inside the tag are not part of the tag name:
/// "&lt;i &gt;" must yield the name "i".
/// </summary>
public void TokenizationTagSpacesBehind()
{
    var source = new SourceManager("<i >");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual("i", tag.Name);
}
/// <summary>
/// Checks that the tag token for "&lt;span&gt;" reports the name "span".
/// </summary>
public void TokenizationTagNameDetection()
{
    var source = new SourceManager("<span>");
    var tokenizer = new HtmlTokenizer(source);
    var tag = (HtmlTagToken)tokenizer.Get();
    Assert.AreEqual("span", tag.Name);
}
/// <summary>
/// Sets up a builder that fills a CSS stylesheet from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="sheet">The stylesheet to fill.</param>
DocumentBuilder(SourceManager source, CSSStyleSheet sheet)
{
    parser = new CssParser(sheet, source);

    // Errors raised by the parser are routed to the builder's own handler.
    parser.ErrorOccurred += ParseErrorOccurred;
}
/// <summary>
/// Creates an HTMLDocument from the given source code string.
/// </summary>
/// <param name="sourceCode">The string to use as source code.</param>
/// <param name="options">[Optional] Options to use for the document generation. DocumentOptions.Default is used when null.</param>
/// <returns>The constructed HTML document.</returns>
public static HTMLDocument Html(String sourceCode, DocumentOptions options = null)
{
    var manager = new SourceManager(sourceCode);
    var document = new HTMLDocument();
    var settings = options ?? DocumentOptions.Default;

    var builder = new DocumentBuilder(manager, document, settings);
    return builder.HtmlResult;
}
/// <summary>
/// Sets up a builder that fills an HTML document from the given source.
/// </summary>
/// <param name="source">The code manager.</param>
/// <param name="document">The document to fill.</param>
/// <param name="configuration">Options to use for the document generation.</param>
DocumentBuilder(SourceManager source, HTMLDocument document, IConfiguration configuration)
{
    parser = new HtmlParser(document, source);

    // Parse errors are forwarded to the configuration.
    parser.ParseError += (sender, error) => configuration.ReportError(error);
}