/// <summary>
/// Builds an HTML document from the source carried by the creation
/// options, navigates the context to it and parses it asynchronously.
/// An <c>HtmlParseEvent</c> is fired on the context right before parsing
/// (completed: false) and again right after it (completed: true).
/// </summary>
/// <param name="context">The browsing context hosting the document.</param>
/// <param name="options">The creation options supplying the source.</param>
/// <param name="cancelToken">Token forwarded to the asynchronous parse.</param>
/// <returns>A task yielding the parsed document.</returns>
internal static async Task<IDocument> LoadAsync(IBrowsingContext context, CreateDocumentOptions options, CancellationToken cancelToken)
{
    var settings = new HtmlParserOptions
    {
        IsScripting = context.Configuration.IsScripting()
    };
    var htmlDocument = new HtmlDocument(context, options.Source);
    var builder = new HtmlDomBuilder(htmlDocument);

    htmlDocument.Setup(options);
    context.NavigateTo(htmlDocument);

    // Announce the start of the parse; the matching "completed" event is
    // only reached when ParseAsync finishes without throwing.
    context.Fire(new HtmlParseEvent(htmlDocument, completed: false));
    await builder.ParseAsync(settings, cancelToken).ConfigureAwait(false);
    context.Fire(new HtmlParseEvent(htmlDocument, completed: true));

    return htmlDocument;
}
/// <summary>
/// Creates a new document fragment by parsing the given HTML source with
/// the fragment algorithm and adopting the resulting nodes as children.
/// </summary>
/// <param name="context">The context for the fragment mode.</param>
/// <param name="html">The HTML source code to use.</param>
internal DocumentFragment(Element context, String html)
    : this(context.Owner)
{
    var textSource = new TextSource(html);
    var scratchDocument = new HtmlDocument(Owner.Context, textSource);
    var builder = new HtmlDomBuilder(scratchDocument);
    var settings = new HtmlParserOptions
    {
        IsEmbedded = false,
        IsScripting = Owner.Options.IsScripting()
    };
    var container = builder.ParseFragment(settings, context).DocumentElement;

    // Move every parsed node, in order, from the temporary root into this
    // fragment until the root is empty.
    while (container.HasChildNodes)
    {
        var node = container.FirstChild;
        container.RemoveChild(node);
        this.PreInsert(node, null);
    }
}
/// <summary>
/// Builds a document that shows the source of the creation options as
/// plain text: an html / head / body skeleton is created by hand and the
/// fully prefetched source text is placed inside a single pre element.
/// No HTML parser is involved.
/// </summary>
/// <param name="context">The browsing context hosting the document.</param>
/// <param name="options">The creation options supplying the source.</param>
/// <param name="cancelToken">Token forwarded to the source prefetch.</param>
/// <returns>A task yielding the constructed document.</returns>
internal async static Task<IDocument> LoadTextAsync(IBrowsingContext context, CreateDocumentOptions options, CancellationToken cancelToken)
{
    // The former "scripting" / "parserOptions" locals were never read (the
    // text is not parsed), so they have been dropped.
    var document = new HtmlDocument(context, options.Source);
    document.Setup(options);
    context.NavigateTo(document);

    var root = document.CreateElement(TagNames.Html);
    var head = document.CreateElement(TagNames.Head);
    var body = document.CreateElement(TagNames.Body);
    var pre = document.CreateElement(TagNames.Pre);

    document.AppendChild(root);
    root.AppendChild(head);
    root.AppendChild(body);
    body.AppendChild(pre);
    pre.SetAttribute(AttributeNames.Style, "word-wrap: break-word; white-space: pre-wrap;");

    // The complete text has to be available before it can be assigned.
    await options.Source.PrefetchAllAsync(cancelToken).ConfigureAwait(false);
    pre.TextContent = options.Source.Text;

    return document;
}
/// <summary>
/// Parses the string as HTML and returns the resulting nodes. When the
/// given context is an <c>Element</c>, the fragment algorithm is used and
/// the children of the produced root element are returned; otherwise the
/// source is parsed as a full document and the document's child nodes
/// are returned.
/// </summary>
/// <param name="sourceCode">The HTML source code to parse.</param>
/// <param name="context">The optional context element for fragment parsing.</param>
/// <param name="configuration">The optional configuration used to create the browsing context.</param>
/// <returns>The list of parsed nodes.</returns>
public static INodeList ToHtmlFragment(this String sourceCode, IElement context = null, IConfiguration configuration = null)
{
    var ctx = BrowsingContext.New(configuration);
    var source = new TextSource(sourceCode);
    var document = new HtmlDocument(ctx, source);

    // Build on the document prepared above. Previously the builder was
    // created from (sourceCode, configuration), which left ctx / source /
    // document unused and bypassed the freshly created browsing context.
    var parser = new HtmlDomBuilder(document);
    var element = context as Element;

    if (element != null)
    {
        var options = new HtmlParserOptions
        {
            IsEmbedded = false,
            IsScripting = configuration.IsScripting()
        };

        return parser.ParseFragment(options, element).DocumentElement.ChildNodes;
    }

    return parser.Parse(default(HtmlParserOptions)).ChildNodes;
}
/// <summary>
/// Asynchronously tokenizes the document's source and feeds every token
/// into the tree construction until the end-of-file token was consumed.
/// </summary>
/// <param name="options">The options to use for parsing.</param>
/// <param name="cancelToken">The cancellation token handed to the source prefetch.</param>
/// <returns>A task yielding the document that has been populated.</returns>
public async Task<HtmlDocument> ParseAsync(HtmlParserOptions options, CancellationToken cancelToken)
{
    var source = _document.Source;
    _options = options;

    while (true)
    {
        // Request more input (8192) once the gap between the source's
        // index and its length drops below 1024.
        var buffered = source.Length - source.Index;

        if (buffered < 1024)
        {
            await source.Prefetch(8192, cancelToken).ConfigureAwait(false);
        }

        var current = _tokenizer.Get();
        Consume(current);

        // Consuming a token may leave a pending task behind; it is awaited
        // before the next token is requested.
        if (_waiting != null)
        {
            await _waiting.ConfigureAwait(false);
            _waiting = null;
        }

        if (current.Type == HtmlTokenType.EndOfFile)
        {
            break;
        }
    }

    return _document;
}
/// <summary>
/// Initializes a parser that stores the supplied options and browsing
/// context for later use.
/// </summary>
/// <param name="options">The parser options to keep.</param>
/// <param name="context">The browsing context to keep.</param>
public HtmlParser(HtmlParserOptions options, IBrowsingContext context)
{
    _context = context;
    _options = options;
}
/// <summary>
/// Initializes a parser with the supplied options and a browsing context
/// obtained from <c>BrowsingContext.New</c> for the given configuration.
/// </summary>
/// <param name="options">The parser options to keep.</param>
/// <param name="configuration">The configuration the new browsing context is created from.</param>
public HtmlParser(HtmlParserOptions options, IConfiguration configuration)
    : this(options, BrowsingContext.New(configuration))
{
}
/// <summary>
/// Initializes a parser with the supplied options, delegating to the
/// overload that is given <c>Configuration.Default</c>.
/// </summary>
/// <param name="options">The parser options to keep.</param>
public HtmlParser(HtmlParserOptions options)
    : this(options, Configuration.Default)
{
}
// With strict mode switched off, the malformed attribute (missing closing
// quote) must be reported exactly once through the context's ParseError
// event.
public void NormalModeShouldError()
{
    var html = @"<!DOCTYPE html> <title>Test</title> <body> <div myAttribute=""blabla>123</div> </body>";
    var reported = new List<HtmlErrorEvent>();
    var settings = new HtmlParserOptions { IsStrictMode = false };
    var context = BrowsingContext.New(Configuration.Default);

    // Collect every parse error raised while the parser runs.
    context.ParseError += (sender, args) => reported.Add((HtmlErrorEvent)args);

    var parser = new HtmlParser(settings, context);
    parser.Parse(html);

    Assert.AreEqual(1, reported.Count);
}
/// <summary>
/// Creates an HTML document for the given source, fills its metadata
/// from the response and parses the source asynchronously.
/// </summary>
/// <param name="context">The browsing context.</param>
/// <param name="response">The response whose headers and address are applied to the document.</param>
/// <param name="source">The source to parse.</param>
/// <param name="cancelToken">Token for cancellation.</param>
/// <returns>The task that builds the document.</returns>
internal static async Task<HtmlDocument> LoadAsync(IBrowsingContext context, IResponse response, TextSource source, CancellationToken cancelToken)
{
    var document = new HtmlDocument(context, source);

    // The start event is disposed when the block is left, whether parsing
    // succeeds or throws.
    using (var startEvent = new HtmlParseStartEvent(document))
    {
        var configuration = context.Configuration;
        var aggregator = configuration.Events;
        var builder = new HtmlDomBuilder(document);
        var headers = response.Headers;

        document.ContentType = headers.GetOrDefault(HeaderNames.ContentType, MimeTypes.Html);
        document.Referrer = headers.GetOrDefault(HeaderNames.Referer, String.Empty);
        document.DocumentUri = response.Address.Href;
        document.Cookie = headers.GetOrDefault(HeaderNames.SetCookie, String.Empty);
        document.ReadyState = DocumentReadyState.Loading;

        if (aggregator != null)
        {
            aggregator.Publish(startEvent);
        }

        var settings = new HtmlParserOptions
        {
            IsScripting = configuration.IsScripting()
        };

        await builder.ParseAsync(settings, cancelToken).ConfigureAwait(false);
    }

    return document;
}
/// <summary>
/// Creates an HTML document from the source of the creation options,
/// navigates the context to it and parses it asynchronously. A parse
/// start event is published before parsing (if the configuration offers
/// an event aggregator) and its end is fired after parsing returned.
/// </summary>
/// <param name="context">The browsing context.</param>
/// <param name="options">The creation options to consider.</param>
/// <param name="cancelToken">Token for cancellation.</param>
/// <returns>The task that builds the document.</returns>
internal static async Task<IDocument> LoadAsync(IBrowsingContext context, CreateDocumentOptions options, CancellationToken cancelToken)
{
    var htmlDocument = new HtmlDocument(context, options.Source);
    var startEvent = new HtmlParseStartEvent(htmlDocument);
    var configuration = context.Configuration;
    var aggregator = configuration.Events;
    var builder = new HtmlDomBuilder(htmlDocument);
    var settings = new HtmlParserOptions
    {
        IsScripting = configuration.IsScripting()
    };

    htmlDocument.Setup(options);
    context.NavigateTo(htmlDocument);

    if (aggregator != null)
    {
        aggregator.Publish(startEvent);
    }

    await builder.ParseAsync(settings, cancelToken).ConfigureAwait(false);

    // Only reached when the parse completed without throwing.
    startEvent.FireEnd();

    return htmlDocument;
}
/// <summary>
/// Synchronously tokenizes the source and feeds every token into the
/// tree construction until the end-of-file token was consumed. The
/// tokenizer's strict mode is taken from the supplied options.
/// </summary>
/// <param name="options">The options to use for parsing.</param>
/// <returns>The document that has been populated.</returns>
public HtmlDocument Parse(HtmlParserOptions options)
{
    _tokenizer.IsStrictMode = options.IsStrictMode;
    _options = options;

    while (true)
    {
        var current = _tokenizer.Get();
        Consume(current);

        // Consuming a token may leave a pending task behind; block on it
        // before requesting the next token.
        if (_waiting != null)
        {
            _waiting.Wait();
        }

        _waiting = null;

        if (current.Type == HtmlTokenType.EndOfFile)
        {
            break;
        }
    }

    return _document;
}
// With strict mode switched on, parsing the malformed attribute (missing
// closing quote) must raise an HtmlParseException.
public void StrictModeShouldYieldException()
{
    var html = @"<!DOCTYPE html> <title>Test</title> <body> <div myAttribute=""blabla>123</div> </body>";
    var settings = new HtmlParserOptions { IsStrictMode = true };
    var strictParser = new HtmlParser(settings);

    Assert.Catch<HtmlParseException>(() => strictParser.Parse(html));
}
/// <summary>
/// Synchronously tokenizes the source and feeds every token into the
/// tree construction until the end-of-file token was consumed.
/// </summary>
/// <param name="options">The options to use for parsing.</param>
/// <returns>The document that has been populated.</returns>
public HtmlDocument Parse(HtmlParserOptions options)
{
    _options = options;

    while (true)
    {
        var current = _tokenizer.Get();
        Consume(current);

        // Consuming a token may leave a pending task behind; block on it
        // and clear it before requesting the next token.
        var pending = _waiting != null;

        if (pending)
        {
            _waiting.Wait();
            _waiting = null;
        }

        if (current.Type == HtmlTokenType.EndOfFile)
        {
            return _document;
        }
    }
}
/// <summary>
/// Prepares the builder for fragment parsing relative to the given
/// context element and then runs the regular parse.
/// </summary>
/// <param name="options">The options to use for parsing.</param>
/// <param name="context">
/// The context element the fragment algorithm is applied to.
/// </param>
/// <returns>The document that has been populated.</returns>
/// <exception cref="ArgumentNullException">The context is null.</exception>
public HtmlDocument ParseFragment(HtmlParserOptions options, Element context)
{
    if (context == null)
    {
        throw new ArgumentNullException(nameof(context));
    }

    // Pick the initial tokenizer state from the context element's name;
    // names not listed here leave the tokenizer state untouched.
    var name = context.LocalName;

    if (name.IsOneOf(TagNames.Title, TagNames.Textarea))
    {
        _tokenizer.State = HtmlParseMode.RCData;
    }
    else if (name.IsOneOf(TagNames.Style, TagNames.Xmp, TagNames.Iframe, TagNames.NoEmbed, TagNames.NoFrames))
    {
        _tokenizer.State = HtmlParseMode.Rawtext;
    }
    else if (name.Is(TagNames.Script))
    {
        _tokenizer.State = HtmlParseMode.Script;
    }
    else if (name.Is(TagNames.Plaintext))
    {
        _tokenizer.State = HtmlParseMode.Plaintext;
    }
    else if (name.Is(TagNames.NoScript) && options.IsScripting)
    {
        // noscript content is only treated as raw text with scripting on.
        _tokenizer.State = HtmlParseMode.Rawtext;
    }

    // A fresh html element acts as root of the fragment and as the first
    // entry of the open elements.
    var fragmentRoot = new HtmlHtmlElement(_document);
    _document.AddNode(fragmentRoot);
    _openElements.Add(fragmentRoot);

    if (context is HtmlTemplateElement)
    {
        _templateModes.Push(HtmlTreeMode.InTemplate);
    }

    Reset(context);
    _fragmentContext = context;
    _tokenizer.IsAcceptingCharacterData = !AdjustedCurrentNode.Flags.HasFlag(NodeFlags.HtmlMember);

    // The closest form element, starting at the context itself and walking
    // up through its parent elements, becomes the current form element.
    for (var candidate = context; candidate != null; candidate = candidate.ParentElement as Element)
    {
        var form = candidate as HtmlFormElement;

        if (form != null)
        {
            _currentFormElement = form;
            break;
        }
    }

    return Parse(options);
}