Пример #1
0
        /// <summary>
        /// Reads the WordGame RTF test data, tracing every font-size control token
        /// as it is encountered and, once reading has finished, the concatenated
        /// encoded text of all text tokens.
        /// </summary>
        public void TestParse()
        {
            var rtf = new RTF(ByteUtils.AsAsciiStream(RtfTestData.WordGame));

            // Control tokens: only character-attribute font sizes are reported.
            rtf.ClassCallback[TokenClass.Control] = r => {
                var isFontSize = r.Major == Major.CharAttr && r.Minor == Minor.FontSize;
                if (isFontSize)
                {
                    Trace.Write(string.Format("FontSize {0} ", r.Param));
                }
            };

            // Text tokens: collect the encoded text in reading order.
            var collected = new StringBuilder();
            rtf.ClassCallback[TokenClass.Text] = r => collected.Append(r.EncodedText);

            rtf.Read();

            // Terminate the line of font-size traces before dumping the text.
            Trace.WriteLine("");
            var output = collected.ToString();
            Trace.WriteLine(output);
        }
Пример #2
0
        /// <summary>
        /// Imports the WordGameUmlaut RTF test data into a fresh
        /// <c>HtmlDocument</c> and reports the resulting document body.
        /// </summary>
        public void TestParseHtml()
        {
            var html   = new HtmlDocument();
            var source = ByteUtils.AsAsciiStream(RtfTestData.WordGameUmlaut);

            new RtfImporter(source, html).Import();

            ReportDetail(html.Body);
        }
Пример #3
0
        /// <summary>
        /// Parses <paramref name="source"/> as markup and derives a description for
        /// <paramref name="sink"/> from the text of the first parsed element
        /// (title, h1, h2, h3, h4, body - in that order) that is not blank.
        /// </summary>
        /// <remarks>
        /// Characters the parser reports through its <c>NotEncoded</c> callback are
        /// replaced in the parsed text by HTML entities; when that happened, the
        /// re-encoded markup replaces <c>sink.Data</c>.
        /// If no body element was recognised, the markup is wrapped in an
        /// html/head/body skeleton, written to <c>sink.Data</c> and digged again.
        /// Exceptions raised while parsing propagate to the caller unchanged.
        /// </remarks>
        /// <param name="source">The markup to analyse.</param>
        /// <param name="sink">Receives the description and, if the markup had to be rewritten, the new data stream.</param>
        protected virtual void Digg(string source, Content <Stream> sink)
        {
            var parser = new TagParser(source);
            var body   = new Element {
                Name = "body", Text = ""
            };
            // Order matters: the description is taken from the first entry with non-blank text.
            var elements = new Element[] {
                new Element {
                    Name = "title", Text = ""
                },
                new Element {
                    Name = "h1", Text = ""
                },
                new Element {
                    Name = "h2", Text = ""
                },
                new Element {
                    Name = "h3", Text = ""
                },
                new Element {
                    Name = "h4", Text = ""
                },
                body,
            };

            // Start collecting for an element the first time its name is reported.
            parser.DoElement += stuff => {
                if (stuff.State != LCHP.State.Name)
                {
                    return;
                }
                // Invariant lowering: tag names are not linguistic text, and a culture-sensitive
                // ToLower() breaks matching under e.g. tr-TR ("TITLE" -> "tıtle").
                var tag = stuff.Element.ToLowerInvariant();
                foreach (var element in elements)
                {
                    if (!element.Parsing && !element.Parsed && tag == element.Name)
                    {
                        element.Starts  = stuff.TagPosition;
                        element.Parsing = true;
                    }
                }
            };

            // Stop collecting at the element's end tag; the body additionally stops at
            // the first line-breaking tag.
            parser.DoTag += stuff => {
                var tag     = stuff.Tag.ToLowerInvariant();
                var lineend = (tag == "</br>" || tag == "<br>" || tag == "<br/>" || tag == "<br />" || tag == "</div>" || tag == "</p>");
                if (stuff.State == LCHP.State.Endtag)
                {
                    foreach (var element in elements)
                    {
                        if (element.Parsing && !element.Parsed && tag == element.EndTag)
                        {
                            element.Ends    = stuff.Position;
                            element.Parsing = false;
                            element.Parsed  = true;
                        }
                    }
                }
                if (body.Parsing && lineend)
                {
                    body.Parsing = false;
                    body.Parsed  = true;
                }
            };

            // Append each reported text run to every element currently being collected.
            parser.DoText += stuff => {
                var text = stuff.Text.ToString(stuff.Origin, stuff.Position - stuff.Origin);
                foreach (var element in elements)
                {
                    if (element.Parsing)
                    {
                        element.Text += text;
                    }
                }
            };

            var notEncoded = false;

            // Replace the reported character in place by its HTML entity.
            parser.NotEncoded = stuff => {
                var co = stuff.Text.ToString(stuff.Position, 1);
                var c  = System.Net.WebUtility.HtmlEncode(co);
                // HtmlEncode doesn't replace special unicode chars
                if (co == c)
                {
                    c = string.Format("&#{0};", (int)co[0]);
                }
                stuff.Text.Remove(stuff.Position, 1);
                stuff.Text.Insert(stuff.Position, c);
                // Skip over the inserted entity (one character was already accounted for).
                stuff.Position += c.Length - 1;
                notEncoded      = true;
            };

            parser.Parse();

            if (notEncoded)
            {
                // The parser's text now holds the entity-encoded markup; publish it.
                source = parser.Stuff.Text.ToString();
                if (sink.Data != null)
                {
                    sink.Data.Dispose();
                }
                sink.Data = ByteUtils.AsAsciiStream(source);
            }

            if (!body.Parsed)
            {
                source = "<html><head></head><body>" + source + "</body></html>";
                if (sink.Data != null)
                {
                    sink.Data.Dispose();
                }
                sink.Data = ByteUtils.AsAsciiStream(source);
                Digg(source, sink);
                // The nested pass has already populated the sink from the wrapped markup.
                // Falling through would overwrite its description with the result of this
                // body-less pass (possibly an empty string).
                // NOTE(review): if the wrapped markup still yields no parsed body this recurses
                // again - confirm TagParser always reports the closing body tag.
                return;
            }

            string description = null;
            foreach (var element in elements.Where(e => e.Parsed))
            {
                // TODO: replace unresolved unicode chars; see above
                description = System.Net.WebUtility.HtmlDecode(element.Text.Replace("\n", " ").Replace("\r", " ").Trim());
                if (!string.IsNullOrWhiteSpace(description))
                {
                    break;
                }
            }
            if (description != null)
            {
                sink.Description = description;
            }
        }