示例#1
0
        /// <summary>
        /// Pushes the description into the filter output and then signals completion
        /// by calling <c>Finished</c>.
        /// </summary>
        /// <remarks>
        /// A plain description is appended unchanged. A rich-text description is run
        /// through <c>FilterHtml.ExtractText</c>, whose callbacks forward to this
        /// filter's own <c>AppendText</c>, <c>AppendWhiteSpace</c>,
        /// <c>AppendStructuralBreak</c>, <c>HotUp</c> and <c>HotDown</c> methods.
        /// </remarks>
        override protected void DoPull()
        {
            if (is_rich_text)
            {
                // Properties reported during extraction are dropped: the callback is a no-op.
                FilterHtml.AddPropertyCallback discard_property = new FilterHtml.AddPropertyCallback(delegate(Beagle.Property p) {});

                FilterHtml extractor = new FilterHtml(false);
                extractor.ExtractText(
                    description,
                    new FilterHtml.AppendTextCallback(AppendText),
                    discard_property,
                    new FilterHtml.AppendSpaceCallback(AppendWhiteSpace),
                    new FilterHtml.AppendSpaceCallback(AppendStructuralBreak),
                    new FilterHtml.HotCallback(HotUp),
                    new FilterHtml.HotCallback(HotDown));
            }
            else
            {
                AppendText(description);
            }

            Finished();
        }
示例#2
0
        /// <summary>
        /// Opens <paramref name="stream"/> with a fresh <c>FilterHtml</c> and returns
        /// the filter's text reader.
        /// </summary>
        /// <param name="stream">HTML content to read; must not be null.</param>
        /// <param name="charset">Value stored on the filter as the unsearched "encoding" property.</param>
        /// <param name="link_handler">Link callback; only installed when ENABLE_RDF_ADAPTER is defined.</param>
        /// <returns>The reader obtained from the filter's <c>GetTextReader</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        /// <exception cref="Exception">The filter's <c>Open</c> call returned false.</exception>
        public static TextReader GetHtmlReader(Stream stream, string charset, AddLinkCallback link_handler)
        {
            if (stream == null)
                throw new ArgumentNullException("stream");

            FilterHtml filter = new FilterHtml();
            filter.SnippetMode = false;
#if ENABLE_RDF_ADAPTER
            filter.SetAddLinkHandler(link_handler);
#endif

            // The filter is given a placeholder indexable before the property is added.
            filter.Indexable = new Indexable();

            string encoding_key = StringFu.UnindexedNamespace + "encoding";
            filter.AddProperty(Property.NewUnsearched(encoding_key, charset));

            bool opened = filter.Open(stream, false);
            if (opened)
                return filter.GetTextReader();

            throw new Exception("Cannot open html");
        }
示例#3
0
		// Wraps `stream` in a new FilterHtml and hands back the filter's text reader.
		//
		// stream       - HTML input; an ArgumentNullException is thrown when it is null.
		// charset      - stored on the filter as the unsearched "encoding" property.
		// link_handler - only installed when ENABLE_RDF_ADAPTER is defined.
		//
		// Throws Exception ("Cannot open html") when the filter's Open call returns false.
		public static TextReader GetHtmlReader (Stream stream, string charset, AddLinkCallback link_handler)
		{
			if (stream == null) {
				throw new ArgumentNullException ("stream");
			}

			FilterHtml filter = new FilterHtml ();
			filter.SnippetMode = false;
#if ENABLE_RDF_ADAPTER
			filter.SetAddLinkHandler (link_handler);
#endif

			// Placeholder indexable assigned before the encoding property is added.
			filter.Indexable = new Indexable ();
			filter.AddProperty (
				Property.NewUnsearched (StringFu.UnindexedNamespace + "encoding", charset));

			bool open_failed = ! filter.Open (stream, false);
			if (open_failed) {
				throw new Exception ("Cannot open html");
			}

			return filter.GetTextReader ();
		}
示例#4
0
		// Emits the description as filter text and then calls Finished ().
		//
		// Plain descriptions go straight to AppendText.  Rich-text descriptions
		// are routed through FilterHtml.ExtractText, with every callback
		// forwarding to the matching method on this filter; the property
		// callback does nothing.
		override protected void DoPull ()
		{
			if (! is_rich_text) {
				AppendText (description);
				Finished ();
				return;
			}

			FilterHtml.AppendTextCallback on_text = new FilterHtml.AppendTextCallback (AppendText);
			FilterHtml.AddPropertyCallback on_property = new FilterHtml.AddPropertyCallback (delegate(Beagle.Property p) {});
			FilterHtml.AppendSpaceCallback on_white_space = new FilterHtml.AppendSpaceCallback (AppendWhiteSpace);
			FilterHtml.AppendSpaceCallback on_break = new FilterHtml.AppendSpaceCallback (AppendStructuralBreak);
			FilterHtml.HotCallback on_hot_up = new FilterHtml.HotCallback (HotUp);
			FilterHtml.HotCallback on_hot_down = new FilterHtml.HotCallback (HotDown);

			FilterHtml extractor = new FilterHtml (false);
			extractor.ExtractText (description, on_text, on_property,
					       on_white_space, on_break,
					       on_hot_up, on_hot_down);

			Finished ();
		}
示例#5
0
			// Stores the parent indexable and the link callback on the new handler.
			public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
			{
				this.link_handler = link_handler;
				indexable = parent_indexable;
			}
示例#6
0
            /// <summary>
            /// Processes one MIME object: recurses into embedded messages and multipart
            /// containers, and turns a leaf part either into this handler's own text
            /// reader or into a child indexable.
            /// </summary>
            /// <param name="mime_part">The MIME object to process.</param>
            /// <exception cref="Exception">
            /// Thrown when <paramref name="mime_part"/> is not a message part, a
            /// multipart or a plain part.
            /// </exception>
            public void OnEachPart(GMime.Object mime_part)
            {
                GMime.Object part = null;
                bool         part_needs_dispose = false;

                ++depth;

                // The finally block keeps the depth counter balanced and releases a
                // subpart picked out of a multipart/alternative even when processing
                // throws part-way through.
                try
                {
                    if (mime_part is GMime.MessagePart)
                    {
                        GMime.MessagePart msg_part = (GMime.MessagePart)mime_part;

                        using (GMime.Message message = msg_part.Message)
                        {
                            using (GMime.Object subpart = message.MimePart)
                                this.OnEachPart(subpart);
                        }
                    }
                    else if (mime_part is GMime.Multipart)
                    {
                        GMime.Multipart multipart = (GMime.Multipart)mime_part;
                        int             num_parts = multipart.Count;

                        // If the mimetype is multipart/alternative, we only want to index
                        // one part -- the richest one we can filter.
                        // Ordinal comparison: MIME tokens must not be subject to
                        // culture-specific casing rules (e.g. the Turkish dotless i).
                        if (String.Equals(mime_part.ContentType.MediaSubtype, "alternative", StringComparison.OrdinalIgnoreCase))
                        {
                            // The richest formats are at the end, so work from there
                            // backward.
                            for (int i = num_parts - 1; i >= 0; i--)
                            {
                                GMime.Object subpart = multipart[i];

                                if (IsMimeTypeHandled(subpart.ContentType.ToString()))
                                {
                                    part = subpart;
                                    part_needs_dispose = true;
                                    break;
                                }
                                else
                                {
                                    subpart.Dispose();
                                }
                            }
                        }

                        // If it's not alternative, or we don't know how to filter any of
                        // the parts, treat them like a bunch of attachments.
                        if (part == null)
                        {
                            for (int i = 0; i < num_parts; i++)
                            {
                                using (GMime.Object subpart = multipart[i])
                                    this.OnEachPart(subpart);
                            }
                        }
                    }
                    else if (mime_part is GMime.Part)
                    {
                        part = mime_part;
                    }
                    else
                    {
                        // Report the object we were actually given; `part` is still
                        // null on this path and must not be dereferenced.
                        throw new Exception(String.Format("Unknown part type: {0}",
                                                          mime_part == null ? "(null)" : mime_part.GetType().ToString()));
                    }

                    if (part != null)
                    {
                        System.IO.Stream stream = null;

                        using (GMime.DataWrapper content_obj = ((GMime.Part)part).ContentObject)
                            stream = content_obj.Stream;

                        // If this is the only part and it's plain text, we
                        // want to just attach it to our filter instead of
                        // creating a child indexable for it.
                        bool no_child_needed = false;

                        // Invariant lower-casing so that the literal comparisons and the
                        // blacklist lookup below behave the same under every locale.
                        string mime_type = part.ContentType.ToString().ToLowerInvariant();

                        if (this.depth == 1 && this.count == 0)
                        {
                            if (mime_type == "text/plain")
                            {
                                no_child_needed = true;

                                this.reader = new StreamReader(stream);
                            }
                            else if (mime_type == "text/html")
                            {
                                no_child_needed = true;
                                html_part       = true;
                                string enc = part.ContentType.GetParameter("charset");
                                // DataWrapper.Stream is a very limited stream
                                // and does not allow Seek or Tell
                                // HtmlFilter requires Stream.Position=0.
                                // Play safe and create a memorystream
                                // for HTML parsing.

                                GMime.StreamMem mem_stream;
                                mem_stream = new GMime.StreamMem();

                                GMime.Stream data_stream;
                                data_stream = ((StreamWrapper)stream).GMimeStream;
                                data_stream.WriteToStream(mem_stream);
                                data_stream.Flush();

                                // The StreamWrapper and hence the memory_stream
                                // will be closed when the reader is closed
                                // after Pull()-ing is done.
                                System.IO.Stream html_stream;
                                html_stream = new StreamWrapper(mem_stream);
                                html_stream.Seek(0, SeekOrigin.Begin);

                                stream.Close();

                                try
                                {
                                    this.reader = FilterHtml.GetHtmlReader(html_stream, enc, link_handler);
                                }
                                catch (Exception e)
                                {
                                    Log.Debug(e, "Exception while filtering HTML email {0}", this.indexable.Uri);
                                    this.reader = null;
                                    html_stream.Close();
                                    html_part = false;
                                }
                            }
                        }

                        if (!no_child_needed)
                        {
                            // Check the mime type against the blacklist and don't index any
                            // parts that are contained within.  That way the user doesn't
                            // get flooded with pointless signatures and vcard and ical
                            // attachments along with (real) attachments.

                            if (Array.IndexOf(blacklisted_mime_types, mime_type) == -1)
                            {
                                string    sub_uri = "#" + this.count;
                                Indexable child;
                                child = new Indexable(UriFu.AddFragment(this.indexable.Uri, sub_uri, true));

                                child.DisplayUri = new Uri(this.indexable.DisplayUri.ToString() + "#" + this.count);

                                // This is a special case.
                                // Even for mails found on disk, MailMessage hitype is set
                                child.HitType  = "MailMessage";
                                child.MimeType = mime_type;

                                // If this is the richest part we found for multipart emails, add its content to textcache
                                child.CacheContent = snippet_attachment ||
                                                     (this.depth == 1 && this.count == 0);

                                string filename = ((GMime.Part)part).Filename;

                                if (!String.IsNullOrEmpty(filename))
                                {
                                    child.AddProperty(Property.NewKeyword("fixme:attachment_title", filename));

                                    foreach (Property prop in Property.StandardFileProperties(filename, false))
                                    {
                                        child.AddProperty(prop);
                                    }
                                }

                                // Store length of attachment
                                long length = stream.Length;
                                if (length != -1)
                                {
                                    child.AddProperty(Property.NewUnsearched("fixme:filesize", length));
                                }

                                if (String.Equals(part.ContentType.MediaType, "text", StringComparison.OrdinalIgnoreCase))
                                {
                                    child.SetTextReader(new StreamReader(stream));
                                }
                                else
                                {
                                    child.SetBinaryStream(stream);
                                }

                                child.SetChildOf(this.indexable);
                                child.StoreStream();
                                child.CloseStreams();
                                this.child_indexables.Add(child);
                            }
                            else
                            {
                                Log.Debug("Skipping attachment {0}#{1} with blacklisted mime type {2}",
                                          this.indexable.Uri, this.count, mime_type);

                                // Nothing takes ownership of the content stream on this
                                // path, so close it here instead of leaving it open.
                                stream.Close();
                            }
                        }

                        this.count++;
                    }
                }
                finally
                {
                    if (part_needs_dispose)
                    {
                        part.Dispose();
                    }

                    --depth;
                }
            }