/// <summary>
/// Collects the text of every script-data token found while tokenizing a stream.
/// </summary>
/// <param name="streamReader">The reader that supplies the HTML to tokenize.</param>
/// <returns>The data of all script-data tokens, concatenated in the order they were read.</returns>
public static string GetScriptData(StreamReader streamReader)
{
    var scriptText = new StringBuilder();
    var tokenizer = new HtmlTokenizer(streamReader);
    HtmlToken current;

    while (tokenizer.ReadNextToken(out current))
    {
        // Every token kind other than script data is skipped.
        if (current.Kind != HtmlTokenKind.ScriptData)
        {
            continue;
        }

        var script = (HtmlScriptDataToken)current;
        scriptText.Append(script.Data);
    }

    return scriptText.ToString();
}
/// <summary>
/// Convert the HTML read from <paramref name="reader"/> and write the resulting HTML
/// to <paramref name="writer"/>.
/// </summary>
/// <remarks>
/// <para>The header (when one is set) is written first, then the tokens of the input, then the
/// footer (when one is set). A header or footer whose format is
/// <c>HeaderFooterFormat.Text</c> is passed through a <c>TextToHtml</c> converter producing an
/// HTML fragment; otherwise it is written verbatim.</para>
/// <para>Tags are handed to <c>HtmlTagCallback</c> (or <c>DefaultHtmlTagCallback</c> when none is
/// set). Tokens located inside an element whose inner content is suppressed are not written,
/// comments are dropped when <c>FilterComments</c> is set, and script elements are removed
/// when <c>FilterHtml</c> is set.</para>
/// </remarks>
/// <param name="reader">The text reader.</param>
/// <param name="writer">The text writer.</param>
/// <exception cref="System.ArgumentNullException">
/// <para><paramref name="reader"/> is <c>null</c>.</para>
/// <para>-or-</para>
/// <para><paramref name="writer"/> is <c>null</c>.</para>
/// </exception>
public override void Convert(TextReader reader, TextWriter writer)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (writer == null)
    {
        throw new ArgumentNullException(nameof(writer));
    }

    if (!string.IsNullOrEmpty(Header))
    {
        if (HeaderFormat != HeaderFooterFormat.Text)
        {
            writer.Write(Header);
        }
        else
        {
            var headerConverter = new TextToHtml { OutputHtmlFragment = true };

            using (var headerReader = new StringReader(Header))
            {
                headerConverter.Convert(headerReader, writer);
            }
        }
    }

    using (var htmlWriter = new HtmlWriter(writer))
    {
        var tagCallback = HtmlTagCallback ?? DefaultHtmlTagCallback;
        var openTags = new List<HtmlToHtmlTagContext>();
        var tokenizer = new HtmlTokenizer(reader);
        HtmlToken token;

        while (tokenizer.ReadNextToken(out token))
        {
            var kind = token.Kind;

            if (kind == HtmlTokenKind.Comment)
            {
                if (!FilterComments && !SuppressContent(openTags))
                {
                    htmlWriter.WriteToken(token);
                }

                continue;
            }

            if (kind != HtmlTokenKind.Tag)
            {
                // Every remaining token kind is copied through unless it is suppressed.
                if (!SuppressContent(openTags))
                {
                    htmlWriter.WriteToken(token);
                }

                continue;
            }

            var tag = (HtmlTagToken)token;

            if (tag.IsEndTag)
            {
                // Pop first: suppression is evaluated against the tags that remain open.
                var opened = Pop(openTags, tag.Name);

                if (SuppressContent(openTags))
                {
                    continue;
                }

                if (opened == null)
                {
                    // No matching start tag was recorded; let the callback decide.
                    tagCallback(new HtmlToHtmlTagContext(tag), htmlWriter);
                }
                else if (opened.InvokeCallbackForEndTag)
                {
                    // Carry the decisions made for the start tag over to the end tag context.
                    var closing = new HtmlToHtmlTagContext(tag)
                    {
                        InvokeCallbackForEndTag = opened.InvokeCallbackForEndTag,
                        SuppressInnerContent = opened.SuppressInnerContent,
                        DeleteEndTag = opened.DeleteEndTag,
                        DeleteTag = opened.DeleteTag
                    };

                    tagCallback(closing, htmlWriter);
                }
                else if (!opened.DeleteEndTag)
                {
                    htmlWriter.WriteEndTag(tag.Name);
                }
            }
            else if (tag.IsEmptyElement)
            {
                if (!SuppressContent(openTags))
                {
                    var selfClosing = new HtmlToHtmlTagContext(tag);
                    bool filteredScript = FilterHtml && selfClosing.TagId == HtmlTagId.Script;

                    if (!filteredScript)
                    {
                        tagCallback(selfClosing, htmlWriter);
                    }
                }
            }
            else
            {
                var opening = new HtmlToHtmlTagContext(tag);

                if (FilterHtml && opening.TagId == HtmlTagId.Script)
                {
                    // Filtered scripts lose their tags as well as everything inside them.
                    opening.SuppressInnerContent = true;
                    opening.DeleteEndTag = true;
                    opening.DeleteTag = true;
                }
                else if (!SuppressContent(openTags))
                {
                    tagCallback(opening, htmlWriter);
                }

                openTags.Add(opening);
            }
        }

        htmlWriter.Flush();
    }

    if (!string.IsNullOrEmpty(Footer))
    {
        if (FooterFormat != HeaderFooterFormat.Text)
        {
            writer.Write(Footer);
        }
        else
        {
            var footerConverter = new TextToHtml { OutputHtmlFragment = true };

            using (var footerReader = new StringReader(Footer))
            {
                footerConverter.Convert(footerReader, writer);
            }
        }
    }
}
/// <summary>
/// Tokenizes the text of <paramref name="textSnapshot"/> and attaches the resulting text
/// nodes and elements beneath <paramref name="currentNode"/>.
/// </summary>
/// <remarks>
/// Data tokens become text nodes and tag tokens become elements (with their attributes);
/// all other token kinds, including comments and doctypes, are ignored.
/// </remarks>
/// <param name="textSnapshot">The source whose snapshot is tokenized.</param>
/// <param name="htmldoc">The document used to create nodes; stored in <c>_resultHtmlDoc</c>.</param>
/// <param name="currentNode">The element that receives the first parsed nodes.</param>
/// <exception cref="NotSupportedException">
/// An end tag does not match the current element and the open elements could not be unwound
/// to a matching one.
/// </exception>
public override void Parse(TextSource textSnapshot, WebDocument htmldoc, DomElement currentNode)
{
    this._resultHtmlDoc = htmldoc;

    // The tokenizer is fed through a reader, so the snapshot is round-tripped through UTF-8 bytes.
    char[] copyBuffer = textSnapshot.ActualSnapshot.Copy(0, textSnapshot.ActualSnapshot.Length);
    byte[] encoded = System.Text.Encoding.UTF8.GetBytes(copyBuffer);

    using (var ms = new System.IO.MemoryStream(encoded))
    using (var textReader = new System.IO.StreamReader(ms))
    {
        var tokenizer = new HtmlTokenizer(textReader);
        HtmlToken token;

        while (tokenizer.ReadNextToken(out token))
        {
            var kind = token.Kind;

            if (kind == HtmlTokenKind.Data)
            {
                var text = (HtmlDataToken)token;
                currentNode.AddChild(_resultHtmlDoc.CreateTextNode(text.Data.ToCharArray()));
                continue;
            }

            if (kind != HtmlTokenKind.Tag)
            {
                // Comments, doctypes and any other token kinds produce no nodes.
                continue;
            }

            var tag = (HtmlTagToken)token;

            if (!tag.IsEndTag)
            {
                // Start tag: create the element, copy its attributes, and descend into it
                // unless it is an empty element.
                DomElement elem = this._resultHtmlDoc.CreateElement(null, tag.Name);
                currentNode.AddChild(elem);

                foreach (var attribute in tag.Attributes)
                {
                    var attr = this._resultHtmlDoc.CreateAttribute(null, attribute.Name);

                    if (attribute.Value != null)
                    {
                        attr.Value = attribute.Value;
                    }

                    elem.AddAttribute(attr);
                }

                if (!tag.IsEmptyElement)
                {
                    openEltStack.Push(currentNode);
                    currentNode = elem;
                }

                continue;
            }

            // End tag: the name is registered with the document before matching.
            _resultHtmlDoc.AddStringIfNotExists(tag.Name);

            if (currentNode.Name == tag.Name)
            {
                currentNode = openEltStack.Pop();
                continue;
            }

            // The end tag does not close the current element. Make up to three attempts to
            // step out of elements that are single tags or can be auto-closed.
            bool matched = false;

            for (int attemptsLeft = 3; attemptsLeft > 0; attemptsLeft--)
            {
                bool canUnwind = HtmlTagMatching.IsSingleTag(currentNode.LocalNameIndex)
                    || HtmlTagMatching.CanAutoClose(currentNode.LocalNameIndex);

                if (!canUnwind)
                {
                    // No recovery strategy exists for this kind of mismatch.
                    throw new NotSupportedException();
                }

                if (openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                }

                if (currentNode.LocalName == tag.Name && openEltStack.Count > 0)
                {
                    currentNode = openEltStack.Pop();
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                throw new NotSupportedException();
            }
        }
    }
}
/// <summary>
/// Tokenizes the HTML file at <paramref name="path"/> and asserts that both the token dump and
/// the concatenated token text match the expectation files stored next to it.
/// </summary>
/// <remarks>
/// The expected token dump is read from the sibling <c>.tokens</c> file and the expected output
/// from the sibling <c>.out.html</c> file. A missing expectation file is treated as empty and
/// is written from the actual result before the comparisons run.
/// </remarks>
/// <param name="path">The path of the HTML input file.</param>
static void VerifyHtmlTokenizerOutput(string path)
{
    var outputPath = Path.ChangeExtension(path, ".out.html");
    var tokensPath = Path.ChangeExtension(path, ".tokens");

    string expectedOutput = string.Empty;
    if (File.Exists(outputPath))
    {
        expectedOutput = File.ReadAllText(outputPath);
    }

    string expectedTokens = string.Empty;
    if (File.Exists(tokensPath))
    {
        expectedTokens = File.ReadAllText(tokensPath).Replace("\r\n", "\n");
    }

    var output = new StringBuilder();
    var dump = new StringBuilder();

    using (var textReader = new StreamReader(path, Encoding.GetEncoding(1252)))
    {
        var tokenizer = new HtmlTokenizer(textReader);
        HtmlToken token;

        Assert.AreEqual(HtmlTokenizerState.Data, tokenizer.TokenizerState);

        while (tokenizer.ReadNextToken(out token))
        {
            output.Append(token);
            dump.AppendFormat("{0}: ", token.Kind);

            switch (token.Kind)
            {
                case HtmlTokenKind.ScriptData:
                case HtmlTokenKind.CData:
                case HtmlTokenKind.Data:
                {
                    var text = (HtmlDataToken)token;

                    // Control characters are escaped; carriage returns are dropped entirely.
                    foreach (char c in text.Data)
                    {
                        if (c == '\r')
                        {
                            continue;
                        }

                        if (c == '\f')
                        {
                            dump.Append("\\f");
                        }
                        else if (c == '\t')
                        {
                            dump.Append("\\t");
                        }
                        else if (c == '\n')
                        {
                            dump.Append("\\n");
                        }
                        else
                        {
                            dump.Append(c);
                        }
                    }

                    dump.Append('\n');
                    break;
                }

                case HtmlTokenKind.Tag:
                {
                    var tag = (HtmlTagToken)token;
                    string slash = tag.IsEndTag ? "/" : "";

                    dump.AppendFormat("<{0}{1}", slash, tag.Name);

                    foreach (var attribute in tag.Attributes)
                    {
                        if (attribute.Value == null)
                        {
                            dump.AppendFormat(" {0}", attribute.Name);
                        }
                        else
                        {
                            dump.AppendFormat(" {0}={1}", attribute.Name, Quote(attribute.Value));
                        }
                    }

                    if (tag.IsEmptyElement)
                    {
                        dump.Append("/>");
                    }
                    else
                    {
                        dump.Append(">");
                    }

                    dump.Append('\n');
                    break;
                }

                case HtmlTokenKind.Comment:
                {
                    var comment = (HtmlCommentToken)token;

                    dump.Append(comment.Comment.Replace("\r\n", "\n"));
                    dump.Append('\n');
                    break;
                }

                case HtmlTokenKind.DocType:
                {
                    var doctype = (HtmlDocTypeToken)token;

                    if (doctype.ForceQuirksMode)
                    {
                        dump.Append("<!-- force quirks mode -->");
                    }

                    dump.Append("<!DOCTYPE");

                    if (doctype.Name != null)
                    {
                        dump.AppendFormat(" {0}", doctype.Name.ToUpperInvariant());
                    }

                    if (doctype.PublicIdentifier != null)
                    {
                        dump.AppendFormat(" PUBLIC {0}", Quote(doctype.PublicIdentifier));

                        if (doctype.SystemIdentifier != null)
                        {
                            dump.AppendFormat(" {0}", Quote(doctype.SystemIdentifier));
                        }
                    }
                    else if (doctype.SystemIdentifier != null)
                    {
                        dump.AppendFormat(" SYSTEM {0}", Quote(doctype.SystemIdentifier));
                    }

                    dump.Append(">");
                    dump.Append('\n');
                    break;
                }

                default:
                    Assert.Fail("Unhandled token type: {0}", token.Kind);
                    break;
            }
        }

        Assert.AreEqual(HtmlTokenizerState.EndOfFile, tokenizer.TokenizerState);
    }

    // Seed any missing expectation file with the actual result.
    if (!File.Exists(tokensPath))
    {
        File.WriteAllText(tokensPath, dump.ToString());
    }

    if (!File.Exists(outputPath))
    {
        File.WriteAllText(outputPath, output.ToString());
    }

    Assert.AreEqual(expectedTokens, dump.ToString(), "The token stream does not match the expected tokens.");
    Assert.AreEqual(expectedOutput, output.ToString(), "The output stream does not match the expected output.");
}
/// <summary>
/// Get a text preview of a stream of HTML.
/// </summary>
/// <remarks>
/// Tokenizes the HTML with truncated tags ignored and accumulates preview text from data
/// tokens, image <c>alt</c> attributes and ordered-list numbering. Only content that follows
/// a non-empty <c>body</c> start tag is considered, and reading stops as soon as the preview
/// buffer reports that it is full.
/// </remarks>
/// <param name="reader">The original text stream.</param>
/// <returns>A string representing a shortened preview of the original text.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="reader"/> is <c>null</c>.
/// </exception>
public override string GetPreviewText(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    var tokenizer = new HtmlTokenizer(reader) { IgnoreTruncatedTags = true };
    var buffer = new char[MaximumPreviewLength];
    var openTags = new List<HtmlTagContext>();
    string pending = string.Empty;
    int length = 0;
    bool insideBody = false;
    bool full = false;
    bool lwsp = true;
    HtmlToken token;

    while (!full && tokenizer.ReadNextToken(out token))
    {
        var kind = token.Kind;

        if (kind == HtmlTokenKind.Data)
        {
            if (!insideBody || SuppressContent(openTags))
            {
                continue;
            }

            var data = (HtmlDataToken)token;

            full = Append(buffer, ref length, pending + data.Data, ref lwsp);
            pending = string.Empty;
            continue;
        }

        if (kind != HtmlTokenKind.Tag)
        {
            continue;
        }

        var tag = (HtmlTagToken)token;

        if (tag.IsEndTag)
        {
            if (tag.Id == HtmlTagId.Body)
            {
                // Leaving the body discards every tag that is still open.
                openTags.Clear();
                insideBody = false;
            }
            else
            {
                Pop(openTags, tag.Id);
            }

            continue;
        }

        if (!insideBody)
        {
            if (tag.Id == HtmlTagId.Body && !tag.IsEmptyElement)
            {
                insideBody = true;
            }

            continue;
        }

        switch (tag.Id)
        {
            case HtmlTagId.Image:
            {
                // Images contribute their alternative text, when they have one.
                HtmlAttribute alt = tag.Attributes.FirstOrDefault(a => a.Id == HtmlAttributeId.Alt);

                if (alt != null)
                {
                    full = Append(buffer, ref length, pending + alt.Value, ref lwsp);
                    pending = string.Empty;
                }

                break;
            }

            case HtmlTagId.LI:
            {
                HtmlTagContext list = GetListItemContext(openTags);

                if (list != null)
                {
                    if (list.TagId == HtmlTagId.OL)
                    {
                        // Ordered list items are prefixed with their running number.
                        full = Append(buffer, ref length, $" {++list.ListIndex}. ", ref lwsp);
                        pending = string.Empty;
                    }
                    else
                    {
                        pending = " ";
                    }
                }

                break;
            }

            case HtmlTagId.Br:
            case HtmlTagId.P:
                // A space is queued and only emitted together with the next appended text.
                pending = " ";
                break;
        }

        if (!tag.IsEmptyElement)
        {
            var context = new HtmlTagContext(tag.Id)
            {
                SuppressInnerContent = ShouldSuppressInnerContent(tag.Id)
            };

            openTags.Add(context);
        }
    }

    // NOTE(review): lwsp presumably signals that the last appended character was whitespace,
    // in which case it is left out of the preview; confirm against Append.
    if (lwsp && length > 0)
    {
        length--;
    }

    return new string(buffer, 0, length);
}
/// <summary>
/// Decodes the raw bytes of an HTML body property into a string and reports the encoding used.
/// </summary>
/// <remarks>
/// <para>Trailing NUL bytes of the property value are ignored.</para>
/// <para>The markup is first scanned, decoded as Latin-1, for a
/// <c>&lt;meta http-equiv="Content-Type" content="..."&gt;</c> tag that appears before the
/// <c>body</c> tag or the end of the <c>head</c> element. When that tag names a charset and the
/// bytes decode cleanly with it, that charset is used. In every other case the text is decoded
/// with the encoding for <paramref name="codepage"/>.</para>
/// </remarks>
/// <param name="prop">The property reader positioned on the HTML body value.</param>
/// <param name="codepage">The fallback code page.</param>
/// <param name="encoding">Receives the encoding that was used to decode the returned text.</param>
/// <returns>The decoded HTML text.</returns>
static string GetHtmlBody(TnefPropertyReader prop, int codepage, out Encoding encoding)
{
    var rawValue = prop.ReadValueAsBytes();
    int rawLength = rawValue.Length;

    while (rawLength > 0 && rawValue[rawLength - 1] == 0)
    {
        rawLength--;
    }

    // Try and extract the charset from the HTML meta Content-Type value.
    using (var stream = new MemoryStream(rawValue, 0, rawLength))
    {
        // It should be safe to assume ISO-8859-1 for this purpose. We don't want to risk using
        // UTF-8 (or any other charset) and having it throw an exception...
        using (var reader = new StreamReader(stream, CharsetUtils.Latin1, true))
        {
            var tokenizer = new HtmlTokenizer(reader);
            HtmlToken token;

            while (tokenizer.ReadNextToken(out token))
            {
                if (token.Kind != HtmlTokenKind.Tag)
                {
                    continue;
                }

                var tag = (HtmlTagToken)token;

                // The charset declaration is only looked for ahead of <body> / </head>.
                if (tag.Id == HtmlTagId.Body || (tag.Id == HtmlTagId.Head && tag.IsEndTag))
                {
                    break;
                }

                if (tag.Id != HtmlTagId.Meta || tag.IsEndTag)
                {
                    continue;
                }

                string httpEquiv = null;
                string content = null;

                // The first occurrence of each attribute wins.
                for (int i = 0; i < tag.Attributes.Count; i++)
                {
                    switch (tag.Attributes[i].Id)
                    {
                        case HtmlAttributeId.HttpEquiv:
                            httpEquiv = httpEquiv ?? tag.Attributes[i].Value;
                            break;
                        case HtmlAttributeId.Content:
                            content = content ?? tag.Attributes[i].Value;
                            break;
                    }
                }

                if (httpEquiv == null || !httpEquiv.Equals("Content-Type", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // A Content-Type <meta> without a content value declares no charset. It is
                // treated like an unparsable value (fall back to the codepage) rather than
                // handing null to ContentType.TryParse, which rejects a null argument.
                if (content == null
                    || !ContentType.TryParse(content, out var contentType)
                    || string.IsNullOrEmpty(contentType.Charset))
                {
                    break;
                }

                try
                {
                    encoding = Encoding.GetEncoding(contentType.Charset, EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);

                    // If this converts cleanly, then we're golden.
                    return encoding.GetString(rawValue, 0, rawLength);
                }
                catch
                {
                    // Deliberately best-effort: an unknown charset or undecodable bytes mean
                    // the declared charset cannot be trusted, so fall back to assuming the
                    // TNEF message codepage.
                    break;
                }
            }

            encoding = Encoding.GetEncoding(codepage);

            return encoding.GetString(rawValue, 0, rawLength);
        }
    }
}
/// <summary>
/// Tokenizes the HTML file at <paramref name="path"/> and checks, using debug assertions, that
/// the resulting token dump matches the sibling <c>.tokens</c> file.
/// </summary>
/// <remarks>
/// A missing <c>.tokens</c> file is treated as an empty expectation and is written from the
/// actual dump before the comparison runs.
/// </remarks>
/// <param name="path">The path of the HTML input file.</param>
static void VerifyHtmlTokenizerOutput(string path)
{
    var tokensPath = Path.ChangeExtension(path, ".tokens");

    string expectedTokens = string.Empty;
    if (File.Exists(tokensPath))
    {
        expectedTokens = File.ReadAllText(tokensPath);
    }

    var dump = new StringBuilder();

    using (var textReader = File.OpenText(path))
    {
        var tokenizer = new HtmlTokenizer(textReader);
        HtmlToken token;

        System.Diagnostics.Debug.Assert(HtmlTokenizerState.Data == tokenizer.TokenizerState);

        while (tokenizer.ReadNextToken(out token))
        {
            dump.AppendFormat("{0}: ", token.Kind);

            switch (token.Kind)
            {
                case HtmlTokenKind.Data:
                {
                    var text = (HtmlDataToken)token;

                    // Control characters are written in their escaped form.
                    foreach (char c in text.Data)
                    {
                        if (c == '\f')
                        {
                            dump.Append("\\f");
                        }
                        else if (c == '\t')
                        {
                            dump.Append("\\t");
                        }
                        else if (c == '\r')
                        {
                            dump.Append("\\r");
                        }
                        else if (c == '\n')
                        {
                            dump.Append("\\n");
                        }
                        else
                        {
                            dump.Append(c);
                        }
                    }

                    dump.AppendLine();
                    break;
                }

                case HtmlTokenKind.Tag:
                {
                    var tag = (HtmlTagToken)token;
                    string slash = tag.IsEndTag ? "/" : "";

                    dump.AppendFormat("<{0}{1}", slash, tag.Name);

                    foreach (var attribute in tag.Attributes)
                    {
                        if (attribute.Value == null)
                        {
                            dump.AppendFormat(" {0}", attribute.Name);
                        }
                        else
                        {
                            dump.AppendFormat(" {0}={1}", attribute.Name, Quote(attribute.Value));
                        }
                    }

                    if (tag.IsEmptyElement)
                    {
                        dump.Append("/>");
                    }
                    else
                    {
                        dump.Append(">");
                    }

                    dump.AppendLine();
                    break;
                }

                case HtmlTokenKind.Comment:
                {
                    var comment = (HtmlCommentToken)token;

                    dump.AppendLine(comment.Comment);
                    break;
                }

                case HtmlTokenKind.DocType:
                {
                    var doctype = (HtmlDocTypeToken)token;

                    if (doctype.ForceQuirksMode)
                    {
                        dump.Append("<!-- force quirks mode -->");
                    }

                    dump.Append("<!DOCTYPE");

                    if (doctype.Name != null)
                    {
                        dump.AppendFormat(" {0}", doctype.Name.ToUpperInvariant());
                    }

                    if (doctype.PublicIdentifier != null)
                    {
                        dump.AppendFormat(" PUBLIC {0}", Quote(doctype.PublicIdentifier));

                        if (doctype.SystemIdentifier != null)
                        {
                            dump.AppendFormat(" {0}", Quote(doctype.SystemIdentifier));
                        }
                    }
                    else if (doctype.SystemIdentifier != null)
                    {
                        dump.AppendFormat(" SYSTEM {0}", Quote(doctype.SystemIdentifier));
                    }

                    dump.Append(">");
                    dump.AppendLine();
                    break;
                }

                default:
                    System.Diagnostics.Debug.Fail($"Unhandled token type: {token.Kind}");
                    break;
            }
        }

        System.Diagnostics.Debug.Assert(HtmlTokenizerState.EndOfFile == tokenizer.TokenizerState);
    }

    // Seed a missing expectation file with the actual dump.
    if (!File.Exists(tokensPath))
    {
        File.WriteAllText(tokensPath, dump.ToString());
    }

    System.Diagnostics.Debug.Assert(expectedTokens == dump.ToString(), "The token stream does not match the expected tokens.");
}
/// <summary>
/// Convert the contents of <paramref name="reader"/> from the <see cref="InputFormat"/> to the
/// <see cref="OutputFormat"/> and uses the <paramref name="writer"/> to write the resulting text.
/// </summary>
/// <remarks>
/// <para>The header (when one is set) is written first, then the tokens of the input, then the
/// footer (when one is set). A header or footer whose format is
/// <c>HeaderFooterFormat.Text</c> is passed through a <c>TextToHtml</c> converter; otherwise it
/// is written verbatim.</para>
/// <para>Tags are handed to <c>HtmlTagCallback</c> (or <c>DefaultHtmlTagCallback</c> when none is
/// set). Tokens located inside an element whose inner content is suppressed are not written;
/// this includes comments, which are additionally dropped when <c>StripComments</c> is set.
/// Script elements are removed when <c>FilterHtml</c> is set.</para>
/// </remarks>
/// <param name="reader">The text reader.</param>
/// <param name="writer">The text writer.</param>
/// <exception cref="System.ArgumentNullException">
/// <para><paramref name="reader"/> is <c>null</c>.</para>
/// <para>-or-</para>
/// <para><paramref name="writer"/> is <c>null</c>.</para>
/// </exception>
public override void Convert(TextReader reader, TextWriter writer)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (writer == null)
    {
        throw new ArgumentNullException(nameof(writer));
    }

    if (!string.IsNullOrEmpty(Header))
    {
        if (HeaderFormat == HeaderFooterFormat.Text)
        {
            var converter = new TextToHtml();

            using (var sr = new StringReader(Header))
            {
                converter.Convert(sr, writer);
            }
        }
        else
        {
            writer.Write(Header);
        }
    }

    using (var htmlWriter = new HtmlWriter(writer))
    {
        var callback = HtmlTagCallback ?? DefaultHtmlTagCallback;
        var stack = new List<HtmlToHtmlTagContext>();
        var tokenizer = new HtmlTokenizer(reader);
        HtmlToHtmlTagContext ctx;
        HtmlToken token;

        while (tokenizer.ReadNextToken(out token))
        {
            switch (token.Kind)
            {
                default:
                    if (!SuppressContent(stack))
                    {
                        htmlWriter.WriteToken(token);
                    }
                    break;

                case HtmlTokenKind.Tag:
                    var tag = (HtmlTagToken)token;

                    if (!tag.IsEndTag)
                    {
                        if (!tag.IsEmptyElement)
                        {
                            ctx = new HtmlToHtmlTagContext(tag);

                            if (FilterHtml && ctx.TagId == HtmlTagId.Script)
                            {
                                // Filtered scripts lose their tags as well as everything inside them.
                                ctx.SuppressInnerContent = true;
                                ctx.DeleteEndTag = true;
                                ctx.DeleteTag = true;
                            }
                            else if (!SuppressContent(stack))
                            {
                                callback(ctx, htmlWriter);
                            }

                            stack.Add(ctx);
                        }
                        else if (!SuppressContent(stack))
                        {
                            ctx = new HtmlToHtmlTagContext(tag);

                            if (!FilterHtml || ctx.TagId != HtmlTagId.Script)
                            {
                                callback(ctx, htmlWriter);
                            }
                        }
                    }
                    else
                    {
                        if ((ctx = Pop(stack, tag.Name)) != null)
                        {
                            if (!SuppressContent(stack))
                            {
                                if (ctx.InvokeCallbackForEndTag)
                                {
                                    // Carry the decisions made for the start tag over to the end tag context.
                                    ctx = new HtmlToHtmlTagContext(tag)
                                    {
                                        InvokeCallbackForEndTag = ctx.InvokeCallbackForEndTag,
                                        SuppressInnerContent = ctx.SuppressInnerContent,
                                        DeleteEndTag = ctx.DeleteEndTag,
                                        DeleteTag = ctx.DeleteTag
                                    };
                                    callback(ctx, htmlWriter);
                                }
                                else if (!ctx.DeleteEndTag)
                                {
                                    htmlWriter.WriteEndTag(tag.Name);
                                }
                            }
                        }
                        else if (!SuppressContent(stack))
                        {
                            // No matching start tag was recorded; let the callback decide.
                            ctx = new HtmlToHtmlTagContext(tag);
                            callback(ctx, htmlWriter);
                        }
                    }
                    break;

                case HtmlTokenKind.Comment:
                    // A comment is content too: it must not leak out of an element whose
                    // inner content is being suppressed, exactly like every other token.
                    if (!StripComments && !SuppressContent(stack))
                    {
                        htmlWriter.WriteToken(token);
                    }
                    break;
            }
        }

        htmlWriter.Flush();
    }

    if (!string.IsNullOrEmpty(Footer))
    {
        if (FooterFormat == HeaderFooterFormat.Text)
        {
            var converter = new TextToHtml();

            using (var sr = new StringReader(Footer))
            {
                converter.Convert(sr, writer);
            }
        }
        else
        {
            writer.Write(Footer);
        }
    }
}