/// <summary>
/// Pushes <c>description</c> into the filter output and then signals completion.
/// Rich text is run through <c>FilterHtml.ExtractText</c>; anything else is
/// appended verbatim.
/// </summary>
override protected void DoPull()
{
    if (is_rich_text) {
        FilterHtml extractor = new FilterHtml(false);

        // Route everything the HTML filter extracts back into this filter.
        FilterHtml.AppendTextCallback on_text = new FilterHtml.AppendTextCallback(AppendText);
        // Properties discovered inside the description are deliberately ignored.
        FilterHtml.AddPropertyCallback on_property = new FilterHtml.AddPropertyCallback(delegate(Beagle.Property p) {});
        FilterHtml.AppendSpaceCallback on_white_space = new FilterHtml.AppendSpaceCallback(AppendWhiteSpace);
        FilterHtml.AppendSpaceCallback on_structural_break = new FilterHtml.AppendSpaceCallback(AppendStructuralBreak);
        FilterHtml.HotCallback on_hot_up = new FilterHtml.HotCallback(HotUp);
        FilterHtml.HotCallback on_hot_down = new FilterHtml.HotCallback(HotDown);

        extractor.ExtractText(description,
                              on_text,
                              on_property,
                              on_white_space,
                              on_structural_break,
                              on_hot_up,
                              on_hot_down);
    } else {
        AppendText(description);
    }

    Finished();
}
/// <summary>
/// Builds a standalone <c>FilterHtml</c> over <paramref name="stream"/> and
/// returns the filter's text reader.
/// </summary>
/// <param name="stream">HTML input; must not be null.</param>
/// <param name="charset">Value stored in the filter's "encoding" property.</param>
/// <param name="link_handler">Link callback; only installed when ENABLE_RDF_ADAPTER is defined.</param>
/// <returns>The reader obtained from the filter's <c>GetTextReader</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="Exception">The filter's <c>Open</c> call returned false.</exception>
public static TextReader GetHtmlReader(Stream stream, string charset, AddLinkCallback link_handler)
{
    if (stream == null) {
        throw new ArgumentNullException("stream");
    }

    FilterHtml filter = new FilterHtml();
    filter.SnippetMode = false;
#if ENABLE_RDF_ADAPTER
    filter.SetAddLinkHandler(link_handler);
#endif
    // The filter is given a placeholder indexable before the encoding property is added.
    filter.Indexable = new Indexable();

    string encoding_key = StringFu.UnindexedNamespace + "encoding";
    filter.AddProperty(Property.NewUnsearched(encoding_key, charset));

    bool opened = filter.Open(stream, false);
    if (!opened) {
        throw new Exception("Cannot open html");
    }

    return filter.GetTextReader();
}
/// <summary>
/// Creates a throwaway <c>FilterHtml</c> for <paramref name="stream"/>, records the
/// charset as an unsearched "encoding" property, opens the stream and hands back
/// the filter's text reader.
/// </summary>
/// <param name="stream">HTML input; a null value is rejected.</param>
/// <param name="charset">Charset recorded on the filter before it is opened.</param>
/// <param name="link_handler">Only used when ENABLE_RDF_ADAPTER is defined.</param>
/// <returns>The result of the filter's <c>GetTextReader</c>.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is null.</exception>
/// <exception cref="Exception">Thrown when the filter's <c>Open</c> returns false.</exception>
public static TextReader GetHtmlReader (Stream stream, string charset, AddLinkCallback link_handler)
{
	if (stream == null)
		throw new ArgumentNullException ("stream");

	FilterHtml reader_source = new FilterHtml ();
	reader_source.SnippetMode = false;
#if ENABLE_RDF_ADAPTER
	reader_source.SetAddLinkHandler (link_handler);
#endif
	// A fake indexable is assigned before the encoding property is added.
	reader_source.Indexable = new Indexable ();

	Property encoding_prop = Property.NewUnsearched (StringFu.UnindexedNamespace + "encoding", charset);
	reader_source.AddProperty (encoding_prop);

	if (reader_source.Open (stream, false))
		return reader_source.GetTextReader ();

	throw new Exception ("Cannot open html");
}
/// <summary>
/// Emits the text of <c>description</c> and then calls <c>Finished</c>.
/// Plain descriptions are appended as-is; rich-text descriptions are passed
/// through <c>FilterHtml.ExtractText</c> with this filter's own append and
/// hot-text methods as callbacks.
/// </summary>
override protected void DoPull ()
{
	// Plain text: nothing to strip, append it and finish.
	if (! is_rich_text) {
		AppendText (description);
		Finished ();
		return;
	}

	FilterHtml.AppendTextCallback text_cb = new FilterHtml.AppendTextCallback (AppendText);
	// Properties reported for the description are discarded.
	FilterHtml.AddPropertyCallback ignore_prop_cb = new FilterHtml.AddPropertyCallback (delegate(Beagle.Property p) {});
	FilterHtml.AppendSpaceCallback white_cb = new FilterHtml.AppendSpaceCallback (AppendWhiteSpace);
	FilterHtml.AppendSpaceCallback break_cb = new FilterHtml.AppendSpaceCallback (AppendStructuralBreak);
	FilterHtml.HotCallback hot_begin_cb = new FilterHtml.HotCallback (HotUp);
	FilterHtml.HotCallback hot_end_cb = new FilterHtml.HotCallback (HotDown);

	FilterHtml rich_text_filter = new FilterHtml (false);
	rich_text_filter.ExtractText (description,
				      text_cb,
				      ignore_prop_cb,
				      white_cb,
				      break_cb,
				      hot_begin_cb,
				      hot_end_cb);

	Finished ();
}
/// <summary>
/// Creates a part handler bound to a parent indexable.
/// </summary>
/// <param name="parent_indexable">Stored as this handler's <c>indexable</c>.</param>
/// <param name="link_handler">Stored as this handler's <c>link_handler</c>.</param>
public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
{
	this.link_handler = link_handler;
	this.indexable = parent_indexable;
}
/// <summary>
/// Processes one MIME object, recursing into embedded messages and multiparts.
/// A leaf part is either attached directly to this handler's <c>reader</c>
/// (first top-level text/plain or text/html part) or turned into a child
/// indexable that is appended to <c>child_indexables</c>.
/// </summary>
/// <param name="mime_part">The MIME object to process.</param>
/// <exception cref="Exception">
/// <paramref name="mime_part"/> is not a <c>GMime.MessagePart</c>,
/// <c>GMime.Multipart</c> or <c>GMime.Part</c>.
/// </exception>
/// <remarks>
/// <c>depth</c> is incremented on entry and decremented only on normal exit;
/// it is not restored when an exception propagates out of this method.
/// </remarks>
public void OnEachPart(GMime.Object mime_part)
{
    GMime.Object part = null;
    bool part_needs_dispose = false;

    //for (int i = 0; i < this.depth; i++)
    //  Console.Write ("  ");
    //Console.WriteLine ("Content-Type: {0}", mime_part.ContentType);

    ++depth;

    if (mime_part is GMime.MessagePart) {
        GMime.MessagePart msg_part = (GMime.MessagePart)mime_part;

        using (GMime.Message message = msg_part.Message) {
            using (GMime.Object subpart = message.MimePart)
                this.OnEachPart(subpart);
        }
    } else if (mime_part is GMime.Multipart) {
        GMime.Multipart multipart = (GMime.Multipart)mime_part;
        int num_parts = multipart.Count;

        // If the mimetype is multipart/alternative, we only want to index
        // one part -- the richest one we can filter.
        if (mime_part.ContentType.MediaSubtype.ToLower() == "alternative") {
            // The richest formats are at the end, so work from there
            // backward.
            for (int i = num_parts - 1; i >= 0; i--) {
                GMime.Object subpart = multipart[i];

                if (IsMimeTypeHandled(subpart.ContentType.ToString())) {
                    // Keep the chosen subpart alive; it is disposed at the
                    // end of this method via part_needs_dispose.
                    part = subpart;
                    part_needs_dispose = true;
                    break;
                } else {
                    subpart.Dispose();
                }
            }
        }

        // If it's not alternative, or we don't know how to filter any of
        // the parts, treat them like a bunch of attachments.
        if (part == null) {
            for (int i = 0; i < num_parts; i++) {
                using (GMime.Object subpart = multipart[i])
                    this.OnEachPart(subpart);
            }
        }
    } else if (mime_part is GMime.Part) {
        part = mime_part;
    } else {
        // 'part' is always null on this branch, so the message must be built
        // from 'mime_part'; using 'part' here raised NullReferenceException
        // instead of the intended error. A null mime_part also ends up here
        // because every 'is' test above is false for null.
        string unknown_type = (mime_part == null) ? "(null)" : mime_part.GetType().ToString();
        throw new Exception(String.Format("Unknown part type: {0}", unknown_type));
    }

    if (part != null) {
        System.IO.Stream stream = null;

        using (GMime.DataWrapper content_obj = ((GMime.Part)part).ContentObject)
            stream = content_obj.Stream;

        // If this is the only part and it's plain text, we
        // want to just attach it to our filter instead of
        // creating a child indexable for it.
        bool no_child_needed = false;

        string mime_type = part.ContentType.ToString().ToLower();

        if (this.depth == 1 && this.count == 0) {
            if (mime_type == "text/plain") {
                no_child_needed = true;

                this.reader = new StreamReader(stream);
            } else if (mime_type == "text/html") {
                no_child_needed = true;
                html_part = true;
                string enc = part.ContentType.GetParameter("charset");

                // DataWrapper.Stream is a very limited stream
                // and does not allow Seek or Tell
                // HtmlFilter requires Stream.Position=0.
                // Play safe and create a memorystream
                // for HTML parsing.
                GMime.StreamMem mem_stream;
                mem_stream = new GMime.StreamMem();

                GMime.Stream data_stream;
                data_stream = ((StreamWrapper)stream).GMimeStream;
                data_stream.WriteToStream(mem_stream);
                data_stream.Flush();

                // The StreamWrapper and hence the memory_stream
                // will be closed when the reader is closed
                // after Pull()-ing is done.
                System.IO.Stream html_stream;
                html_stream = new StreamWrapper(mem_stream);
                html_stream.Seek(0, SeekOrigin.Begin);

                stream.Close();

                try {
                    this.reader = FilterHtml.GetHtmlReader(html_stream, enc, link_handler);
                } catch (Exception e) {
                    // Best effort: a failed HTML parse leaves the message
                    // without a body reader rather than aborting.
                    Log.Debug(e, "Exception while filtering HTML email {0}", this.indexable.Uri);
                    this.reader = null;
                    html_stream.Close();
                    html_part = false;
                }
            }
        }

        if (!no_child_needed) {
            // Check the mime type against the blacklist and don't index any
            // parts that are contained within.  That way the user doesn't
            // get flooded with pointless signatures and vcard and ical
            // attachments along with (real) attachments.
            if (Array.IndexOf(blacklisted_mime_types, mime_type) == -1) {
                string sub_uri = "#" + this.count;
                Indexable child;
                child = new Indexable(UriFu.AddFragment(this.indexable.Uri, sub_uri, true));

                child.DisplayUri = new Uri(this.indexable.DisplayUri.ToString() + "#" + this.count);

                // This is a special case.
                // Even for mails found on disk, MailMessage hitype is set
                child.HitType = "MailMessage";
                child.MimeType = mime_type;

                // If this is the richest part we found for multipart emails, add its content to textcache
                if (snippet_attachment || (this.depth == 1 && this.count == 0)) {
                    child.CacheContent = true;
                } else {
                    child.CacheContent = false;
                }

                string filename = ((GMime.Part)part).Filename;

                if (!String.IsNullOrEmpty(filename)) {
                    child.AddProperty(Property.NewKeyword("fixme:attachment_title", filename));

                    foreach (Property prop in Property.StandardFileProperties(filename, false)) {
                        child.AddProperty(prop);
                    }
                }

                // Store length of attachment
                long length = stream.Length;
                if (length != -1) {
                    child.AddProperty(Property.NewUnsearched("fixme:filesize", length));
                }

                if (part.ContentType.MediaType.ToLower() == "text") {
                    child.SetTextReader(new StreamReader(stream));
                } else {
                    child.SetBinaryStream(stream);
                }

                child.SetChildOf(this.indexable);
                child.StoreStream();
                child.CloseStreams();
                this.child_indexables.Add(child);
            } else {
                Log.Debug("Skipping attachment {0}#{1} with blacklisted mime type {2}",
                          this.indexable.Uri, this.count, mime_type);
            }
        }

        this.count++;
    }

    if (part_needs_dispose) {
        part.Dispose();
    }

    --depth;
}