/// <summary>
/// Turns the given markdown objects into one flat token list and runs the HTML parser over it.
/// </summary>
/// <param name="markdownObjects">The markdown objects to convert, processed in list order.</param>
/// <returns>One node for every content context found directly under the parsed root.</returns>
private List<Node> Parse(List<MarkdownObject> markdownObjects)
{
    var tokenBuffer = new List<IToken>(markdownObjects.Count);
    var errorListener = new AntlrErrorListener(_logger);

    foreach (MarkdownObject current in markdownObjects)
    {
        var span = current.Span;
        bool isHtml = current is HtmlBlock || current is HtmlInline;

        if (!isHtml)
        {
            // Anything that is not HTML becomes a single markdown token;
            // objects for which ParseMarkdown yields null contribute nothing.
            var markdownNode = ParseMarkdown(current);
            if (markdownNode != null)
            {
                tokenBuffer.Add(new MarkdownToken(_file, tokenBuffer.Count, span.Start, span.End, markdownNode));
            }

            continue;
        }

        // HTML fragments are lexed separately, so the token offsets are relative
        // to the fragment and get shifted by the fragment's start below.
        var htmlLexer = new HtmlLexer(new CaseInsensitiveInputStream(_file.GetSubstring(span.Start, span.Length)));
        htmlLexer.AddErrorListener(errorListener);

        foreach (IToken raw in htmlLexer.GetAllTokens())
        {
            tokenBuffer.Add(new HtmlToken(
                _file,
                raw.Type,
                tokenBuffer.Count,
                raw.StartIndex + span.Start,
                raw.StopIndex + span.Start,
                raw.Text,
                raw.Channel));
        }
    }

    var tokenStream = new CommonTokenStream(new ListTokenSource(tokenBuffer));
    var htmlParser = new HtmlParser(tokenStream, _logger);
    htmlParser.AddErrorListener(errorListener);

    var contents = htmlParser.root().content();
    var result = new List<Node>(contents.Length);
    foreach (var content in contents)
    {
        result.Add(ParseContent(content));
    }

    return result;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    var appendModes = HtmlTreeMode.InHead | HtmlTreeMode.InBody | HtmlTreeMode.InTemplate | HtmlTreeMode.InHeadNoScript;

    if ((mode & appendModes) != 0)
    {
        // Append only - this node must NOT be pushed onto the stack.
        lexer.Push(this, false);
        return true;
    }

    if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    // Close any paragraph in button scope, then append and push this node.
    lexer.CloseParagraphButtonScope();
    lexer.Push(this, true);

    // Everything after this point is lexed in the plaintext state.
    lexer.State = HtmlParseMode.Plaintext;

    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    // With a "ruby" element in scope, implied end tags are generated first - all except "rtc".
    if (lexer.IsInScope("ruby"))
    {
        lexer.GenerateImpliedEndTagsExceptFor("rtc");
    }

    lexer.Push(this, true);
    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    var rcDataModes = HtmlTreeMode.InHead | HtmlTreeMode.InBody | HtmlTreeMode.InTemplate;
    bool handled = true;

    if ((mode & rcDataModes) != 0)
    {
        // Run the generic algorithm with the RCData parse mode.
        lexer.RawTextOrRcDataAlgorithm(this, HtmlParseMode.RCData);
    }
    else if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
    }
    else
    {
        handled = false;
    }

    return handled;
}
/// <summary>
/// Lexes a region of <paramref name="text"/> as HTML and collects the tokens, the lexer errors
/// and the elapsed time into a fresh <c>TokenizeResult</c>.
/// </summary>
/// <param name="text">The text handed to the HTML lexer.</param>
/// <param name="pos">Position passed through to the HtmlLexer constructor.</param>
/// <param name="length">Length passed through to the HtmlLexer constructor.</param>
/// <returns>
/// A result holding the lexer errors and the time spent; tokens are added only when at least one
/// token is not an EOF token.
/// </returns>
private TokenizeResult TokenizeHtml(string text, TextPosition pos, int length)
{
    TokenizeResult result = new TokenizeResult();
    Stopwatch timer = Stopwatch.StartNew();

    using (HtmlLexer htmlLexer = new HtmlLexer(text, pos, length))
    {
        // Materialize once: the sequence is inspected and then handed to AddTokens, and a lazily
        // evaluated sequence would otherwise be lexed a second time.
        // The static type stays IEnumerable<HtmlToken> so the AddTokens call binds as before.
        IEnumerable<HtmlToken> htmlTokens = htmlLexer.Tokenize().ToList();

        if (htmlTokens.FirstOrDefault(token => !token.IsEOF) != null)
        {
            result.AddTokens(htmlTokens);
        }

        // Read the errors only after the tokens have been fully produced.
        result.AddErrors(htmlLexer.LexErrors);
    }

    timer.Stop();
    result.Stats.HtmlDuration += timer.Elapsed;

    return result;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    // Without an "li" in list item scope the close tag is simply dropped.
    if (lexer.IsInListItemScope("li"))
    {
        lexer.GenerateImpliedEndTagsExceptFor("li");
        lexer.CloseInclusive("li");
    }

    return true;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if ((mode & IgnoreClose) != 0)
    {
        // Deliberately nothing to do: the close tag is ignored in these modes.
        return true;
    }

    if (mode == HtmlTreeMode.InSelectInTable)
    {
        // Close down to the select.
        lexer.CloseSelect(false, null, "caption");
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    var rawTextModes = HtmlTreeMode.InHead
        | HtmlTreeMode.InBody
        | HtmlTreeMode.InTemplate
        | HtmlTreeMode.InTable
        | HtmlTreeMode.InHeadNoScript;

    if ((mode & rawTextModes) != 0)
    {
        // The content is added as text using the Rawtext parse mode.
        lexer.RawTextOrRcDataAlgorithm(this, HtmlParseMode.Rawtext);
        return true;
    }

    if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>. Each handled mode forwards to the matching lexer routine for "table".
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.InTable)
    {
        // Only act when a table is in table scope; the tag is ignored otherwise.
        if (lexer.IsInTableScope("table"))
        {
            lexer.CloseInclusive("table");

            // Reset the mode.
            lexer.Reset();
        }

        return true;
    }

    if (mode == HtmlTreeMode.InTableBody)
    {
        // Close to table if in a table body context and reprocess.
        lexer.CloseToTableBodyIfBody(null, "table");
        return true;
    }

    if (mode == HtmlTreeMode.InRow)
    {
        lexer.TableBodyIfTrInScope(null, "table");
        return true;
    }

    if (mode == HtmlTreeMode.InCell)
    {
        lexer.CloseTableZoneInCell("table");
        return true;
    }

    if (mode == HtmlTreeMode.InCaption)
    {
        lexer.CloseCaption(null, "table");
        return true;
    }

    if (mode == HtmlTreeMode.InSelectInTable)
    {
        lexer.CloseSelect(false, null, "table");
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    // Order matters: paragraph closing and formatting reconstruction come before
    // the element is handed to the raw text algorithm.
    lexer.CloseParagraphButtonScope();
    lexer.ReconstructFormatting();
    lexer.FramesetOk = false;
    lexer.RawTextOrRcDataAlgorithm(this, HtmlParseMode.Rawtext);

    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode. Maintains <c>lexer.form</c> as nodes are accepted.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.InTable)
    {
        bool ignored = lexer.TagCurrentlyOpen("template") || lexer.form != null;

        if (!ignored)
        {
            // Add but don't push, and remember this node as the current form.
            lexer.Push(this, false);
            lexer.form = this;
        }

        return true;
    }

    if (mode == HtmlTreeMode.InBody)
    {
        bool templateOpen = lexer.TagCurrentlyOpen("template");

        if (lexer.form != null && !templateOpen)
        {
            // Parse error - the token is ignored.
            return true;
        }

        lexer.CloseParagraphButtonScope();

        // Add the node; the form reference is only set when no template is open.
        lexer.Push(this, true);
        if (!templateOpen)
        {
            lexer.form = this;
        }

        return true;
    }

    return false;
}
// Reads a deliberately messy snippet (repeated attributes, mixed quoting, a comment, stray '<'
// characters, a namespaced tag) and checks the string form of every fragment, one per line.
public void CanParseComplex()
{
    var testHtml = "<b class=test class=\"test2 again\" class='kjflds' disabled data-model=\"model\"><img alt='jfkdsljf' />This <!-- > --is-- --> a < test > <broken test <ns:test ns:attr=\"test\" /></b>";

    var expected = "<b class=\"test\" class=\"test2 again\" class=\"kjflds\" disabled data-model=\"model\">"
        + "\n<img alt=\"jfkdsljf\"/>"
        + "\nThis "
        + "\n<!-- > --is-- -->"
        + "\n a "
        + "\n<"
        + "\n test > "
        + "\n<"
        + "\nbroken test "
        + "\n<ns:test ns:attr=\"test\"/>"
        + "\n</b>";

    var renderedFragments = HtmlLexer.Read(testHtml).Select(fragment => fragment.ToString());
    var result = string.Join("\n", renderedFragments);

    Assert.Equal(expected, result);
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
/// <exception cref="DOMException">
/// Thrown with SYNTAX_ERR when the remembered form is not the lexer's current element.
/// </exception>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    if (lexer.TagCurrentlyOpen("template"))
    {
        // A template is open: close everything up to and including the form.
        lexer.GenerateImpliedEndTags();
        lexer.CloseInclusive("form");
        return true;
    }

    if (!lexer.IsInScope("form"))
    {
        // No form in scope - the close tag is ignored.
        return true;
    }

    // No template - ordinary form. Take the remembered form and clear the reference.
    Element rememberedForm = lexer.form;
    lexer.form = null;

    if (rememberedForm == null || !lexer.IsInScope("form"))
    {
        return true;
    }

    lexer.GenerateImpliedEndTags();

    if (rememberedForm != lexer.CurrentElement)
    {
        // Fatal parse error.
        throw new DOMException(DOMException.SYNTAX_ERR, (ushort)HtmlParseError.FormClosedWrong);
    }

    lexer.CloseCurrentNode();
    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <remarks>
/// The "type" attribute decides two things, following the HTML tree construction rules for an
/// input start tag: in the body, frameset-ok is cleared unless the type is "hidden"; in a table,
/// only a hidden input is appended directly, everything else takes the "anything else" route.
/// The comparison with "hidden" ignores ASCII case.
/// </remarks>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.InBody)
    {
        lexer.ReconstructFormatting();

        // Add but don't push.
        lexer.Push(this, false);

        string type = this["type"];
        bool isHidden = type != null
            && string.Equals(type, "hidden", System.StringComparison.OrdinalIgnoreCase);

        // A missing type, or any type other than "hidden", makes a frameset no longer acceptable.
        if (!isHidden)
        {
            lexer.FramesetOk = false;
        }
    }
    else if (mode == HtmlTreeMode.InTable)
    {
        string type = this["type"];
        bool isHidden = type != null
            && string.Equals(type, "hidden", System.StringComparison.OrdinalIgnoreCase);

        if (isHidden)
        {
            // Hidden inputs are added in place, without being pushed.
            lexer.Push(this, false);
        }
        else
        {
            // Go the anything else route.
            lexer.InTableElse(this, null);
        }
    }
    else if (mode == HtmlTreeMode.InSelect)
    {
        lexer.InputOrTextareaInSelect(this);
    }
    else
    {
        return false;
    }

    return true;
}
// Rewrites every http:// link in a small list to https:// and checks the result. Named "a"
// fragments carrying an href are re-rendered from the fragment; everything else is copied
// verbatim from the input using the fragment's trivia position and length.
public void ExampleCodeWorks()
{
    const string http = "http://";
    const string https = "https://";

    const string exampleHtml = @" <ul> <li><a href=""http://www.example.com/"">Example</a></li> <li><a href=""http://www.google.com/"">Google</a></li> <li><a href=""https://www.yahoo.com/"">Yahoo</a></li> </ul>";
    const string expectedValue = @" <ul> <li><a href=""https://www.example.com/"">Example</a></li> <li><a href=""https://www.google.com/"">Google</a></li> <li><a href=""https://www.yahoo.com/"">Yahoo</a></li> </ul>";

    var rewritten = new StringBuilder(exampleHtml.Length + 10);

    foreach (var fragment in HtmlLexer.Read(exampleHtml))
    {
        bool isLinkWithHref = fragment.IsNamed("a") && fragment.HasAttribute("href");

        if (!isLinkWithHref)
        {
            rewritten.Append(exampleHtml, fragment.Trivia.StartPosition, fragment.Trivia.Length);
            continue;
        }

        var href = fragment["href"];
        if (href.Value != null && href.Value.StartsWith(http))
        {
            href.Value = https + href.Value.Substring(http.Length);
        }

        rewritten.Append(fragment.ToString());
    }

    string actualValue = rewritten.ToString();

    Assert.Equal(expectedValue, actualValue);
}
/// <summary>
/// See 8.2.4.66 After DOCTYPE system identifier state.
/// Skips spaces, then dispatches on the next character.
/// </summary>
/// <param name="lexer">The lexer being read from.</param>
private void SystemIdentifierAfter(HtmlLexer lexer)
{
    PropertyTextReader.SkipSpaces(lexer);

    char next = lexer.Peek();

    switch (next)
    {
        case '>':
            // Consume the '>' and switch the lexer to PCData.
            lexer.Position++;
            lexer.State = HtmlParseMode.PCData;
            break;

        case '\0':
            quirksMode = true;
            break;

        default:
            ParseBroken(lexer);
            break;
    }
}
/// <summary>
/// Sets the document content for a response with the given status code.
/// When the status is not 200 and either the html is null/empty or ErrorHandlers.CatchAll is set,
/// error info is displayed instead of the content.
/// </summary>
/// <param name="html">The content received; also used as the custom text of the error info.</param>
/// <param name="status">The status code that came with the content.</param>
/// <param name="openClose">
/// True to assign the content through innerHTML; false to parse it with a new lexer and then call close().
/// </param>
internal void GotDocumentContent(string html, int status, bool openClose)
{
    bool showError = status != 200 && (string.IsNullOrEmpty(html) || ErrorHandlers.CatchAll);

    if (showError)
    {
        // Build an error message and display it.
        ErrorInfo error = new ErrorInfo();
        error.document = this;
        error.Url = location;
        error.Custom = html;
        error.StatusCode = status;

        ErrorHandlers.Display(error);
    }
    else if (openClose)
    {
        // Full open/close cycle.
        innerHTML = html;
    }
    else
    {
        // Parse right away, then close.
        HtmlLexer lexer = new HtmlLexer(html, this);
        lexer.Parse();
        close();
    }

    bool shouldFireLoad = resourcesLoading <= 0 && readyState != "complete";

    if (shouldFireLoad)
    {
        // Fire onload now!
        ReadyStateChange(2);

        // Dispatch a trusted "load" event.
        PowerUI.UIEvent loadEvent = new PowerUI.UIEvent("load");
        loadEvent.SetTrusted(true);
        dispatchEvent(loadEvent);
    }
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    var appendModes = HtmlTreeMode.InHead | HtmlTreeMode.InBody | HtmlTreeMode.InTemplate | HtmlTreeMode.InHeadNoScript;
    bool handled = true;

    if ((mode & appendModes) != 0)
    {
        // Append only - this node must NOT be pushed onto the stack.
        lexer.Push(this, false);

        // Should check for encoding here.
        // http://w3c.github.io/html/syntax.html#the-in-head-insertion-mode
    }
    else if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
    }
    else
    {
        handled = false;
    }

    return handled;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; only the in-select mode is handled.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InSelect)
    {
        return false;
    }

    // An open "option" sitting directly on top of an "optgroup" is closed first.
    bool optionInsideGroup = lexer.CurrentElement.Tag == "option"
        && lexer.OpenElements[lexer.OpenElements.Count - 2].Tag == "optgroup";

    if (optionInsideGroup)
    {
        lexer.CloseCurrentNode();
    }

    // Then the optgroup itself, if it is now the current element.
    if (lexer.CurrentElement.Tag == "optgroup")
    {
        lexer.CloseCurrentNode();
    }

    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.InBody)
    {
        // The newline skip happens before the element is handed to the RCData algorithm.
        lexer.SkipNewline();
        lexer.FramesetOk = false;
        lexer.RawTextOrRcDataAlgorithm(this, HtmlParseMode.RCData);
        return true;
    }

    if (mode == HtmlTreeMode.InSelect)
    {
        lexer.InputOrTextareaInSelect(this);
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; false when the mode is outside InHeadClose.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if ((mode & InHeadClose) == 0)
    {
        return false;
    }

    // Closing a template tag only does something while a template is actually open.
    if (lexer.TagCurrentlyOpen("template"))
    {
        // Generate implied end tags thoroughly, then close the template.
        lexer.GenerateImpliedEndTagsThorough();
        lexer.CloseTemplate();
    }

    return true;
}
/// <summary>
/// Entry point: for every "*.html" file under "files/", lexes and parses the file, writes the
/// tokens to a text file and the parse result (converted to JSON) through the driver.
/// </summary>
private static void Main()
{
    string[] testFiles = Directory.GetFiles("files/", "*.html");

    Driver driver = new Driver();
    ParseOptions options = new ParseOptions();
    HtmlLexer lexer = new HtmlLexer();
    HtmlParse parser = new HtmlParse();
    JsonConverter converter = new JsonConverter();

    foreach (string fileName in testFiles)
    {
        string htmlContent = driver.LoadFile(fileName);

        // Lex, and keep a copy of the token stream next to the input.
        // (driver.WriteTokensToConsole(tokens) can be used here to dump them to the console.)
        List<Token> tokens = lexer.Lexer(htmlContent, options);
        driver.WriteTokensToTextFile(tokens, fileName);

        // Parse the tokens and emit the tree as JSON.
        HtmlNode tree = parser.Parser(tokens, options);
        string json = converter.ConvertHtml(tree);
        driver.WriteToJSON(json, fileName);
    }

    driver.Finish();
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.BeforeHead)
    {
        // Add the node, remember it as the head and switch to the in-head mode.
        lexer.Push(this, true);
        lexer.head = this;
        lexer.CurrentMode = HtmlTreeMode.InHead;
        return true;
    }

    // In these modes the tag is just ignored, which still counts as handled.
    var ignoredModes = HtmlTreeMode.AfterHead | HtmlTreeMode.InHead | HtmlTreeMode.InHeadNoScript | HtmlTreeMode.InBody;

    return (mode & ignoredModes) != 0;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode == HtmlTreeMode.InHead)
    {
        // Close the head tag and move on to the after-head mode.
        lexer.CloseCurrentNode();
        lexer.CurrentMode = HtmlTreeMode.AfterHead;
        return true;
    }

    if (mode == HtmlTreeMode.BeforeHtml)
    {
        // Allowed to fall through the 'anything else' case.
        lexer.BeforeHtmlElse(null, "head");
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    // The exact-mode checks run before the IgnoreClose mask, in this order.
    if (mode == HtmlTreeMode.InBody)
    {
        // Check if the stack contains elements that aren't allowed to still be open
        // (this throws a fatal error if it does).
        lexer.CheckAfterBodyStack();

        // Note that the spec doesn't actually tell us to close the body element;
        // only the mode changes, to after body.
        lexer.CurrentMode = HtmlTreeMode.AfterBody;
        return true;
    }

    if (mode == HtmlTreeMode.InHead)
    {
        // Use the 'anything else' method.
        lexer.InHeadElse(null, "body");
        return true;
    }

    if (mode == HtmlTreeMode.AfterHead)
    {
        // Use the 'anything else' method.
        lexer.AfterHeadElse(null, "body");
        return true;
    }

    if (mode == HtmlTreeMode.BeforeHtml)
    {
        // Allowed to fall through the 'anything else' case.
        lexer.BeforeHtmlElse(null, "body");
        return true;
    }

    // Modes in IgnoreClose are handled by doing nothing at all.
    return (mode & IgnoreClose) != 0;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; only the in-body mode is handled.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InBody)
    {
        return false;
    }

    if (!lexer.IsInButtonScope("p"))
    {
        // (Parse error) No "p" in button scope:
        // create a p node, but don't push it onto the stack.
        Element paragraph = lexer.CreateTag("p", true);
        lexer.Push(paragraph, false);
        return true;
    }

    lexer.CloseParagraph();
    return true;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    bool opensTemplate = (mode & InHeadOpen) != 0;

    if (opensTemplate)
    {
        // Opening a template: push it, mark the scope and enter the in-template mode,
        // which is also recorded on the template mode stack.
        lexer.Push(this, true);
        lexer.AddScopeMarker();
        lexer.FramesetOk = false;
        lexer.CurrentMode = HtmlTreeMode.InTemplate;
        lexer.TemplateModes.Push(HtmlTreeMode.InTemplate);
        return true;
    }

    if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked once this node exists and is being added to <paramref name="lexer"/>.
/// Counterpart of Element.OnLexerCloseNode.
/// </summary>
/// <param name="lexer">The lexer the node is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the node is being added under.</param>
/// <returns>True if this element handled itself; false if the mode is not handled here.</returns>
public override bool OnLexerAddNode(HtmlLexer lexer, int mode)
{
    var scriptModes = HtmlTreeMode.InHead
        | HtmlTreeMode.InBody
        | HtmlTreeMode.InTemplate
        | HtmlTreeMode.InSelectInTable
        | HtmlTreeMode.InSelect
        | HtmlTreeMode.InTable;

    if ((mode & scriptModes) != 0)
    {
        // Append and push the node.
        lexer.Push(this, true);

        // Remember the current mode, then switch to the text mode with the script parse state.
        lexer.PreviousMode = lexer.CurrentMode;
        lexer.CurrentMode = HtmlTreeMode.Text;
        lexer.State = HtmlParseMode.Script;
        return true;
    }

    if (mode == HtmlTreeMode.AfterHead)
    {
        lexer.AfterHeadHeadTag(this);
        return true;
    }

    return false;
}
/// <summary>
/// Hook invoked when a close tag for this element has been created and is being added to
/// <paramref name="lexer"/>.
/// </summary>
/// <param name="lexer">The lexer the close tag is being added to.</param>
/// <param name="mode">The HtmlTreeMode value the close tag is being added under.</param>
/// <returns>True if this element handled itself; only the in-frameset mode is handled.</returns>
public override bool OnLexerCloseNode(HtmlLexer lexer, int mode)
{
    if (mode != HtmlTreeMode.InFrameset)
    {
        return false;
    }

    if (lexer.CurrentElement.Tag == "html")
    {
        // The close tag is ignored while "html" is the current element.
        return true;
    }

    lexer.CloseCurrentNode();

    // Once the element that is now current is no longer a frameset, move to after-frameset.
    if (lexer.CurrentElement.Tag != "frameset")
    {
        lexer.CurrentMode = HtmlTreeMode.AfterFrameset;
    }

    return true;
}
/// <summary>
/// Reads HTML from <paramref name="content"/>, token by token, and writes it to
/// <paramref name="writer"/>, letting registered transformers take over runs of tokens.
/// </summary>
/// <remarks>
/// Ignorable tokens are skipped. On every tag-begin token the active transformer (if any) is asked
/// whether it accepts the tag; if not, its <c>close()</c> output is written and a transformer is
/// looked up for the new tag. The lookup key is the token text without its first character,
/// lower-cased with the invariant culture so the lookup does not depend on the current culture.
/// Tokens seen while no transformer is active are written out surrounded by the pre/post token
/// separators; otherwise they are passed to the transformer. At the end any active transformer is
/// closed and the writer is flushed. Exceptions propagate to the caller unchanged.
/// </remarks>
/// <param name="content">Reader supplying the HTML to rewrite.</param>
/// <param name="source">Uri used to build the input source for the character producer.</param>
/// <param name="transformers">Transformers keyed by lower-case tag name.</param>
/// <param name="writer">Writer receiving the rewritten output; flushed before returning.</param>
public static void rewrite(java.io.Reader content, Uri source, Dictionary<String, IHtmlTagTransformer> transformers, java.io.Writer writer)
{
    CharProducer producer = CharProducer.Factory.create(content, new InputSource(new java.net.URI(source.ToString())));
    HtmlLexer lexer = new HtmlLexer(producer);

    Token lastToken = null;
    IHtmlTagTransformer currentTransformer = null;

    while (lexer.hasNext())
    {
        // A direct cast fails at the cast itself if the lexer ever yields something that is not a Token.
        Token token = (Token)lexer.next();

        if (token.type == HtmlTokenType.IGNORABLE)
        {
            continue;
        }

        if (token.type == HtmlTokenType.TAGBEGIN)
        {
            // An active transformer that rejects the new tag is closed and dropped.
            if (currentTransformer != null && !currentTransformer.acceptNextTag(token))
            {
                writer.write(currentTransformer.close());
                currentTransformer = null;
            }

            // With no active transformer, look one up for this tag (stays null when none is registered).
            if (currentTransformer == null)
            {
                string tagName = token.toString().Substring(1).ToLowerInvariant();
                transformers.TryGetValue(tagName, out currentTransformer);
            }
        }

        if (currentTransformer == null)
        {
            writer.write(producePreTokenSeparator(token, lastToken));
            writer.write(token.toString());
            writer.write(producePostTokenSeparator(token, lastToken));
        }
        else
        {
            currentTransformer.accept(token, lastToken);
        }

        lastToken = token;
    }

    if (currentTransformer != null)
    {
        writer.write(currentTransformer.close());
    }

    writer.flush();
}