/// <summary>
/// Reads the <c>RtfTestData.WordGame</c> sample through <c>RTF</c>, tracing every
/// font-size control token as it is encountered and the collected text at the end.
/// </summary>
public void TestParse()
{
    var collectedText = new StringBuilder();
    var reader = new RTF(ByteUtils.AsAsciiStream(RtfTestData.WordGame));

    // Control tokens: only character-attribute font sizes are reported.
    reader.ClassCallback[TokenClass.Control] = token =>
    {
        if (r_IsFontSize(token.Major == Major.CharAttr && token.Minor == Minor.FontSize))
        {
            var message = string.Format("FontSize {0} ", token.Param);
            Trace.Write(message);
        }
    };

    // Text tokens: accumulate the encoded text so it can be dumped in one piece.
    reader.ClassCallback[TokenClass.Text] = token =>
    {
        collectedText.Append(token.EncodedText);
    };

    reader.Read();

    // Terminate the line of "FontSize ..." fragments before printing the text.
    Trace.WriteLine("");
    Trace.WriteLine(collectedText.ToString());
}

// Identity helper that names the condition checked for control tokens.
private static bool r_IsFontSize(bool isFontSizeToken)
{
    return isFontSizeToken;
}
/// <summary>
/// Imports the <c>RtfTestData.WordGameUmlaut</c> sample into a fresh
/// <c>HtmlDocument</c> via <c>RtfImporter</c> and reports the resulting body.
/// </summary>
public void TestParseHtml()
{
    var document = new HtmlDocument();
    var rtfStream = ByteUtils.AsAsciiStream(RtfTestData.WordGameUmlaut);

    var rtfImporter = new RtfImporter(rtfStream, document);
    rtfImporter.Import();

    ReportDetail(document.Body);
}
/// <summary>
/// Scans the markup in <paramref name="source"/> for title, h1-h4 and body elements
/// and stores the text of the first parsed element with non-blank text as
/// <c>sink.Description</c>.
/// </summary>
/// <remarks>
/// Two normalisations may replace <c>sink.Data</c> (the previous stream is disposed
/// and a new one is created with <c>ByteUtils.AsAsciiStream</c>):
/// characters reported through the parser's <c>NotEncoded</c> callback are rewritten
/// as HTML entities, and markup in which no body element was recognised is wrapped in
/// a minimal html/head/body skeleton and processed again.
/// </remarks>
/// <param name="source">The markup to analyse.</param>
/// <param name="sink">Content whose <c>Data</c> and <c>Description</c> may be updated.</param>
protected virtual void Digg(string source, Content<Stream> sink)
{
    // No try/catch here: the former "catch (Exception e) { throw e; }" only reset the
    // stack trace of whatever was thrown.
    var parser = new TagParser(source);
    var body = new Element { Name = "body", Text = "" };

    // Array order is the priority order used when picking the description below.
    var elements = new Element[]
    {
        new Element { Name = "title", Text = "" },
        new Element { Name = "h1", Text = "" },
        new Element { Name = "h2", Text = "" },
        new Element { Name = "h3", Text = "" },
        new Element { Name = "h4", Text = "" },
        body,
    };

    // Start tracking an element the first time its name is seen.
    parser.DoElement += stuff =>
    {
        // Invariant casing: culture-sensitive ToLower() breaks tag matching in
        // cultures with special casing rules (e.g. "TITLE" under tr-TR).
        var tag = stuff.Element.ToLowerInvariant();
        foreach (var element in elements)
        {
            if (!element.Parsing && !element.Parsed
                && stuff.State == LCHP.State.Name && tag == element.Name)
            {
                element.Starts = stuff.TagPosition;
                element.Parsing = true;
            }
        }
    };

    // Finish an element on its end tag; the body is additionally cut off at the
    // first line-breaking tag.
    parser.DoTag += stuff =>
    {
        var tag = stuff.Tag.ToLowerInvariant();
        var lineend = (tag == "</br>" || tag == "<br>" || tag == "<br/>" || tag == "<br />"
                       || tag == "</div>" || tag == "</p>");
        foreach (var element in elements)
        {
            if (element.Parsing && !element.Parsed
                && stuff.State == LCHP.State.Endtag && tag == element.EndTag)
            {
                element.Ends = stuff.Position;
                element.Parsing = false;
                element.Parsed = true;
            }
        }
        if (body.Parsing && lineend)
        {
            body.Parsing = false;
            body.Parsed = true;
        }
    };

    // Append each text run to every element that is currently open.
    // (The former plain-text accumulation was removed: only commented-out code read it.)
    parser.DoText += stuff =>
    {
        var text = stuff.Text.ToString(stuff.Origin, stuff.Position - stuff.Origin);
        foreach (var element in elements)
        {
            if (element.Parsing)
            {
                element.Text += text;
            }
        }
    };

    // Replace characters the parser reports as not encoded with an HTML entity,
    // editing the parser's text buffer in place.
    var notEncoded = false;
    parser.NotEncoded = stuff =>
    {
        var co = stuff.Text.ToString(stuff.Position, 1);
        var c = System.Net.WebUtility.HtmlEncode(co);
        // HtmlEncode doesn't replace special unicode chars
        if (co == c)
        {
            c = string.Format("&#{0};", (int)co[0]);
        }
        stuff.Text.Remove(stuff.Position, 1);
        stuff.Text.Insert(stuff.Position, c);
        // Leave the position on the last character of the inserted entity.
        stuff.Position += c.Length - 1;
        notEncoded = true;
    };

    parser.Parse();

    if (notEncoded)
    {
        source = parser.Stuff.Text.ToString();
        sink.Data.Dispose();
        sink.Data = ByteUtils.AsAsciiStream(source);
    }

    if (!body.Parsed)
    {
        source = "<html><head></head><body>" + source + "</body></html>";
        sink.Data.Dispose();
        sink.Data = ByteUtils.AsAsciiStream(source);
        Digg(source, sink);
        // The recursive call has already processed the wrapped markup. Continuing
        // here would evaluate this pass's stale element state and could overwrite
        // the description it found with an empty string.
        return;
    }

    string description = null;
    foreach (var element in elements.Where(e => e.Parsed))
    {
        // TODO: replace unresolved unicode chars; see above
        description = System.Net.WebUtility.HtmlDecode(
            element.Text.Replace("\n", " ").Replace("\r", " ").Trim());
        if (!string.IsNullOrWhiteSpace(description))
        {
            break;
        }
    }

    if (description != null)
    {
        sink.Description = description;
    }
}