// Builds the list of PropertyChange indexables for the children of `parent`.
//
// Parent properties may be stored on children as "parent:" properties, so a
// property change on the parent has to be propagated to each child too.
//
// children_docs: table whose keys are document Uris; may be null.
// parent:        the indexable whose property change is being propagated.
//
// Returns null when children_docs is null; otherwise an ArrayList with one
// Indexable per key that is not the parent's own Uri.
private ArrayList GetChildPropertyChange(Hashtable children_docs, Indexable parent)
{
    // FIXME FIXME FIXME: Post-Child-Indexable-Fix
    if (children_docs == null)
        return null;

    Uri own_uri = parent.Uri;
    ArrayList changes = new ArrayList();

    foreach (Uri doc_uri in children_docs.Keys) {
        // FIXME: Currently, children_docs has both the parent and children docs,
        // so the parent's own entry is filtered out here.
        if (!UriFu.Equals(doc_uri, own_uri)) {
            Indexable change = new Indexable(IndexableType.PropertyChange, doc_uri);
            Log.Debug("Creating property change child indexable for {1} (parent {0})", parent.Uri, doc_uri);

            // Linking to the parent is where the child indexable picks up
            // the new properties from the parent.
            change.SetChildOf(parent);
            changes.Add(change);
        }
    }

    return changes;
}
// Lazily enumerates the child indexables contained in `archive`.
//
// For every archive entry whose name contains no '.', the entry is parsed as
// XML. If the document has a root "/Type" node, one indexable is yielded for
// the type, followed by one indexable for each "Members/Member" node below it.
// Every yielded indexable is made a child of this filter's Indexable and has
// its stream stored and closed before being handed out.
//
// Finished() is called once the enumeration has run to completion.
private IEnumerable GetNextEntryIndexable()
{
    foreach (ZipEntry entry in archive) {
        // Entries with a dot in their name are skipped. The char overload
        // performs an ordinal search.
        if (entry.Name.IndexOf('.') != -1)
            continue;

        XmlDocument document = new XmlDocument();

        // XmlDocument.Load(Stream) does not close the stream it reads from,
        // so release the entry stream as soon as the document is loaded.
        using (System.IO.Stream entry_stream = archive.GetInputStream(entry)) {
            document.Load(entry_stream);
        }

        XmlNode type = document.SelectSingleNode("/Type");
        if (type == null)
            continue;

        Indexable type_indexable = TypeNodeToIndexable(type, Indexable.Uri);
        type_indexable.SetChildOf(this.Indexable);
        type_indexable.StoreStream();
        type_indexable.CloseStreams();
        yield return type_indexable;

        foreach (XmlNode member in type.SelectNodes("Members/Member")) {
            Indexable member_indexable = MemberNodeToIndexable(
                member,
                Indexable.Uri,
                type.Attributes["FullName"].Value);
            member_indexable.SetChildOf(this.Indexable);
            member_indexable.StoreStream();
            member_indexable.CloseStreams();
            yield return member_indexable;
        }
    }

    Finished();
}
// Reads the whole document through `reader` and splits it into entries.
//
// * Elements whose name starts with "sect" or "chapter" and that carry a
//   non-empty "id" attribute open a new DocbookEntry, pushed on entries_stack.
// * "article"/"book" elements provide the document-wide language.
// * "title" and "keyword" elements are resolved by advancing the reader once
//   and taking the value of the node found there.
// * Text goes to the innermost open entry, or to the main indexable when no
//   entry is open.
// * An end element at the depth of the innermost open entry closes it and
//   emits a child indexable for it.
//
// After the loop the document title and language are added to the top-level
// item. The run ends with Error() if no indexable was generated and no title
// was found, and with Finished() otherwise.
override protected void DoPullProperties()
{
    Stopwatch timer = new Stopwatch();
    timer.Start();

    while (reader.Read()) {
        XmlNodeType node_type = reader.NodeType;

        if (node_type == XmlNodeType.Element) {
            string element = reader.Name;

            if (element.StartsWith("sect") || element.StartsWith("chapter")) {
                string section_id = reader.GetAttribute("id");

                // Sections without an id are not tracked as entries.
                if (!String.IsNullOrEmpty(section_id)) {
                    DocbookEntry opened = new DocbookEntry();
                    opened.Id = section_id;
                    opened.Depth = reader.Depth;

                    string section_lang = reader.GetAttribute("lang");
                    if (!String.IsNullOrEmpty(section_lang))
                        opened.Language = section_lang;

                    entries_stack.Push(opened);
                }
            } else if (element == "article" || element == "book") {
                string doc_lang = reader.GetAttribute("lang");
                if (!String.IsNullOrEmpty(doc_lang))
                    base_language = doc_lang;
            } else if (element == "title") {
                reader.Read(); // Go to the text node

                if (entries_stack.Count > 0) {
                    // Only the first title seen inside an entry is kept.
                    DocbookEntry current = (DocbookEntry)entries_stack.Peek();
                    if (current.Title == null)
                        current.Title = reader.Value;
                } else if (base_title == null) {
                    // This is probably the book title
                    base_title = reader.Value;
                }
            } else if (element == "keyword") {
                reader.Read(); // read the text node
                AddProperty(Property.NewKeyword("dc:subject", reader.Value));
            }
        } else if (node_type == XmlNodeType.Text) {
            if (entries_stack.Count > 0) {
                // Append text to the child indexable
                ((DocbookEntry)entries_stack.Peek()).Content.Append(reader.Value);
            } else {
                // Append text to the main indexable
                AppendWord(reader.Value);
            }
        } else if (node_type == XmlNodeType.EndElement) {
            bool closes_entry = entries_stack.Count > 0
                && ((DocbookEntry)entries_stack.Peek()).Depth == reader.Depth;

            if (closes_entry) {
                DocbookEntry closed = (DocbookEntry)entries_stack.Pop();

                DocbookEntry enclosing = null;
                if (entries_stack.Count > 0)
                    enclosing = (DocbookEntry)entries_stack.Peek();

                Indexable section = new Indexable(UriFu.AddFragment(Indexable.Uri, closed.Id, false));
                section.HitType = "DocbookEntry";
                section.MimeType = "text/x-docbook-entry";
                section.AddProperty(Property.NewKeyword("beagle:FileType", "documentation"));
                section.Filtering = IndexableFiltering.AlreadyFiltered;
                section.AddProperty(Property.NewUnsearched("fixme:id", closed.Id));
                section.AddProperty(Property.New("dc:title", closed.Title));

                // Add the docbook book title
                section.AddProperty(Property.NewUnsearched("fixme:base_title", base_title));

                // Add the child language (or docbook language if none is specified)
                string effective_lang = closed.Language != null ? closed.Language : base_language;
                if (effective_lang != null)
                    section.AddProperty(Property.NewUnsearched("fixme:language", effective_lang));

                // Add any parent (as in docbook parent entry, not beagle) data if we have it
                if (enclosing != null) {
                    section.AddProperty(Property.NewUnsearched("fixme:parent_id", enclosing.Id));
                    section.AddProperty(Property.NewUnsearched("fixme:parent_title", enclosing.Title));
                }

                section.SetTextReader(new StringReader(closed.Content.ToString()));
                section.SetChildOf(this.Indexable);
                AddIndexable(section);
            }
        }
    }

    // Add the common properties to the top-level
    // file item such as Title, Language etc.
    AddProperty(Property.New("dc:title", base_title));
    AddProperty(Property.NewUnsearched("fixme:language", base_language));

    timer.Stop();

    // If we've successfully crawled the file but haven't
    // found any indexables, we shouldn't consider it
    // successful at all (unless we have a title, which
    // means that it's actually a docbook file, just without
    // sections).
    if (!HasGeneratedIndexable && base_title == null) {
        Log.Error("Probably not a docbook. Ignoring {0}!", base_path);
        Error();
        return;
    }

    Logger.Log.Debug("Parsed docbook file in {0}", timer);
    Finished();
}
// Produces the next BibTeX entry as a child indexable.
//
// Lines are read from `reader`. The method first skips forward to the next
// header line of the form "@<type> <name>", then consumes the following
// "key=value" lines up to the first empty line (or end of input), converting
// each one into properties through EntryLineToProperty.
//
// child: receives the new indexable on success; null otherwise.
//
// Returns true when an entry was produced, false when InitBibparse() fails
// or no further header line is available.
public override bool GenerateNextIndexable(out Indexable child)
{
    child = null;

    if (bib_process == null && !InitBibparse())
        return false;

    string line = null;
    string type = null, name = null;

    // Find the next "@type name" header line.
    while ((line = reader.ReadLine()) != null) {
        if (line == String.Empty || line[0] != '@')
            continue;

        int i = line.IndexOf(' ');
        // Need a separator and a non-empty name after it.
        if (i == -1 || line.Length == i + 1)
            continue;

        // The type is a machine identifier, so lowercase it independently
        // of the current culture.
        type = line.Substring(1, i - 1).ToLowerInvariant();
        name = line.Substring(i + 1);
        break;
    }

    if (line == null)
        return false;

    child = new Indexable(UriFu.AddFragment(Indexable.Uri, name, false));
    child.CacheContent = false;
    child.MimeType = "text/x-bibtex";
    child.DisplayUri = child.Uri;
    child.NoContent = true;
    child.AddProperty(Property.NewKeyword("bibtex:type", type));

    string key, value;

    // Now fill in properties from the key=value lines
    while ((line = reader.ReadLine()) != null) {
        // Entries are separated by empty lines
        if (line == String.Empty)
            break;

        int i = line.IndexOf('=');

        // Ensure a non-empty key (i >= 1) and a non-empty value ('=' is not
        // the last character). The previous test compared against i + i,
        // which let empty values through and dropped valid lines whose
        // length happened to be exactly twice the key length.
        if (i < 1 || line.Length == i + 1)
            continue;

        key = line.Substring(0, i).ToLowerInvariant();
        value = line.Substring(i + 1);

        foreach (Property prop in EntryLineToProperty(key, value))
            child.AddProperty(prop);
    }

    child.SetChildOf(Indexable);
    return true;
}
// Creates the PropertyChange indexables needed to push a property change on
// `parent` down to its children. This is required because some parent
// properties may be stored in the children as "parent:" properties.
//
// Returns null if children_docs is null, otherwise a list holding one
// Indexable for every key of children_docs other than the parent's Uri.
private ArrayList GetChildPropertyChange (Hashtable children_docs, Indexable parent)
{
	// FIXME FIXME FIXME: Post-Child-Indexable-Fix
	if (children_docs == null)
		return null;

	Uri parent_uri = parent.Uri;
	ArrayList result = new ArrayList ();

	foreach (Uri candidate in children_docs.Keys) {
		// FIXME: Currently, children_docs has both the parent and children docs
		bool is_parent = UriFu.Equals (candidate, parent_uri);
		if (is_parent)
			continue;

		Indexable propagated = new Indexable (IndexableType.PropertyChange, candidate);
		Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, candidate);

		// This is where the child indexable will get the new properties from the parent
		propagated.SetChildOf (parent);

		result.Add (propagated);
	}

	return result;
}
// Pulls properties and child indexables out of the document behind `reader`.
//
// Elements named "sect*" / "chapter*" with a non-empty "id" attribute become
// DocbookEntry objects kept on entries_stack while they are open; text nodes
// are appended to the innermost open entry (or to the main indexable when
// none is open). When an end element is reached at the depth of the innermost
// open entry, that entry is turned into a child indexable.
//
// Ends with Error () when nothing was generated and no title was found,
// otherwise with Finished ().
override protected void DoPullProperties ()
{
	Stopwatch stopwatch = new Stopwatch ();
	stopwatch.Start ();

	while (reader.Read ()) {
		switch (reader.NodeType) {
		case XmlNodeType.Element: {
			string tag = reader.Name;

			if (tag.StartsWith ("sect") || tag.StartsWith ("chapter")) {
				string id = reader.GetAttribute ("id");
				if (String.IsNullOrEmpty (id))
					break;

				DocbookEntry fresh = new DocbookEntry ();
				fresh.Id = id;
				fresh.Depth = reader.Depth;

				string lang = reader.GetAttribute ("lang");
				if (! String.IsNullOrEmpty (lang))
					fresh.Language = lang;

				entries_stack.Push (fresh);
				break;
			}

			if (tag == "article" || tag == "book") {
				string lang = reader.GetAttribute ("lang");
				if (! String.IsNullOrEmpty (lang))
					base_language = lang;
				break;
			}

			if (tag == "title") {
				reader.Read (); // Go to the text node

				if (entries_stack.Count == 0) {
					// This is probably the book title
					if (base_title == null)
						base_title = reader.Value;
				} else {
					// An entry keeps the first title found inside it
					DocbookEntry open_entry = (DocbookEntry) entries_stack.Peek ();
					if (open_entry.Title == null)
						open_entry.Title = reader.Value;
				}
				break;
			}

			if (tag == "keyword") {
				reader.Read (); // read the text node
				AddProperty (Property.NewKeyword ("dc:subject", reader.Value));
			}
			break;
		}

		case XmlNodeType.Text: {
			if (entries_stack.Count == 0) {
				// Append text to the main indexable
				AppendWord (reader.Value);
			} else {
				// Append text to the child indexable
				DocbookEntry target = (DocbookEntry) entries_stack.Peek ();
				target.Content.Append (reader.Value);
			}
			break;
		}

		case XmlNodeType.EndElement: {
			if (entries_stack.Count == 0)
				break;

			// Only the end tag at the depth of the innermost open entry closes it
			if (((DocbookEntry) entries_stack.Peek ()).Depth != reader.Depth)
				break;

			DocbookEntry entry = (DocbookEntry) entries_stack.Pop ();
			DocbookEntry parent_entry = null;
			if (entries_stack.Count > 0)
				parent_entry = (DocbookEntry) entries_stack.Peek ();

			Uri entry_uri = UriFu.AddFragment (Indexable.Uri, entry.Id, false);
			Indexable entry_indexable = new Indexable (entry_uri);
			entry_indexable.HitType = "DocbookEntry";
			entry_indexable.MimeType = "text/x-docbook-entry";
			entry_indexable.AddProperty (Property.NewKeyword ("beagle:FileType", "documentation"));
			entry_indexable.Filtering = IndexableFiltering.AlreadyFiltered;
			entry_indexable.AddProperty (Property.NewUnsearched ("fixme:id", entry.Id));
			entry_indexable.AddProperty (Property.New ("dc:title", entry.Title));

			// Add the docbook book title
			entry_indexable.AddProperty (Property.NewUnsearched ("fixme:base_title", base_title));

			// Add the child language (or docbook language if none is specified)
			if (entry.Language != null) {
				entry_indexable.AddProperty (Property.NewUnsearched ("fixme:language", entry.Language));
			} else if (base_language != null) {
				entry_indexable.AddProperty (Property.NewUnsearched ("fixme:language", base_language));
			}

			// Add any parent (as in docbook parent entry, not beagle) data if we have it
			if (parent_entry != null) {
				entry_indexable.AddProperty (Property.NewUnsearched ("fixme:parent_id", parent_entry.Id));
				entry_indexable.AddProperty (Property.NewUnsearched ("fixme:parent_title", parent_entry.Title));
			}

			string entry_text = entry.Content.ToString ();
			entry_indexable.SetTextReader (new StringReader (entry_text));
			entry_indexable.SetChildOf (this.Indexable);
			AddIndexable (entry_indexable);
			break;
		}
		}
	}

	// Add the common properties to the top-level
	// file item such as Title, Language etc.
	AddProperty (Property.New ("dc:title", base_title));
	AddProperty (Property.NewUnsearched ("fixme:language", base_language));

	stopwatch.Stop ();

	// If we've successfully crawled the file but haven't
	// found any indexables, we shouldn't consider it
	// successful at all (unless we have a title, which
	// means that it's actually a docbook file, just without
	// sections).
	bool looks_like_docbook = HasGeneratedIndexable || base_title != null;
	if (! looks_like_docbook) {
		Log.Error ("Probably not a docbook. Ignoring {0}!", base_path);
		Error ();
		return;
	}

	Logger.Log.Debug ("Parsed docbook file in {0}", stopwatch);
	Finished ();
}
// Produces the next entry of the archive as a child indexable.
//
// child: receives the indexable for the entry, or stays null when the
//        entry was skipped or no entry is left.
//
// Returns false (after Close()) once the child-count limit or the total-size
// limit has been hit, or when DoGetNextEntry() yields nothing more. Returns
// true otherwise. Note that true with a null child is possible: an entry
// without a TempFile only contributes its file name to the parent's text.
public override bool GenerateNextIndexable(out Indexable child)
{
    child = null;

    if (!setup_done)
        SetupArchiveStream();

    // Both limits end the enumeration the same way; only the message differs.
    string limit_message = null;
    if (count >= MAX_CHILDREN)
        limit_message = "Archive {0} contains more than {1} files. Only {1} files indexed.";
    else if (total_size > MAX_ALL_FILES)
        limit_message = "Archive {0} crossed our max uncompressed size threshold. Only {1} files extracted";

    if (limit_message != null) {
        Log.Debug(limit_message, Indexable.DisplayUri.ToString(), count);
        Close();
        return false;
    }

    ArchiveEntry entry = DoGetNextEntry();
    if (entry == null) {
        Close();
        return false;
    }

    // Store file names in the archive
    string file_name = Path.GetFileName(entry.Name);
    AppendText(file_name);
    AppendWhiteSpace();

    // If this is an invalid or oversized entry, skip it.
    if (entry.TempFile == null)
        return true;

    ++count;
    total_size += entry.Size;

    // Add "#<escaped-path-to-entry>" to the end of the Indexable Uri
    // So, file b#c in archive foo.zip becomes file:///foo.zip#b%23c
    // And file c in archive b in archive foo.zip becomes file:///foo.zip#b#c
    Uri child_uri = UriFu.AddFragment(Indexable.Uri, entry.Name, false);
    child = new Indexable(child_uri);
    child.CacheContent = true;
    child.MimeType = entry.MimeType;
    child.DisplayUri = new Uri(Indexable.DisplayUri.ToString() + "#" + entry.Name);
    child.ContentUri = UriFu.PathToFileUri(entry.TempFile);
    child.DeleteContent = true;

    // FIXME Remove fixme:inside_archive during Property Hack Week
    // Replace most flag properties by value properties
    child.AddProperty(Property.NewBool("fixme:inside_archive", true));
    // Use this instead of fixme:inside_archive
    child.AddProperty(Property.NewKeyword("archive:type", archive_type));
    child.AddProperty(Property.NewKeyword("fixme:relativeuri", entry.Name));
    child.AddProperty(Property.New("fixme:comment", entry.Comment));
    child.AddProperty(Property.NewUnsearched("fixme:filesize", entry.Size));

    foreach (Property file_prop in Property.StandardFileProperties(file_name, false))
        child.AddProperty(file_prop);

    child.SetChildOf(Indexable);
    return true;
}
// Processes one MIME part of a message, recursing into nested structures.
//
// * GMime.MessagePart: recurses into the embedded message's MIME part.
// * GMime.Multipart:   for "alternative" multiparts, picks the last subpart
//                      whose content type IsMimeTypeHandled() accepts; if none
//                      is picked (or the multipart is not "alternative"),
//                      recurses into every subpart.
// * GMime.Part:        handled directly.
// * anything else:     throws an Exception naming the unexpected type.
//
// A selected part either becomes the filter's own reader (first part at
// depth 1 when it is text/plain or text/html) or is turned into a child
// indexable appended to child_indexables, unless its mime type is listed
// in blacklisted_mime_types.
//
// `depth` is incremented on entry and decremented on normal exit; `count`
// is incremented once for every part handled directly.
public void OnEachPart(GMime.Object mime_part)
{
    GMime.Object part = null;
    bool part_needs_dispose = false;

    ++depth;

    if (mime_part is GMime.MessagePart) {
        GMime.MessagePart msg_part = (GMime.MessagePart)mime_part;

        using (GMime.Message message = msg_part.Message) {
            using (GMime.Object subpart = message.MimePart)
                this.OnEachPart(subpart);
        }
    } else if (mime_part is GMime.Multipart) {
        GMime.Multipart multipart = (GMime.Multipart)mime_part;
        int num_parts = multipart.Count;

        // If the mimetype is multipart/alternative, we only want to index
        // one part -- the richest one we can filter.
        if (mime_part.ContentType.MediaSubtype.ToLower() == "alternative") {
            // The richest formats are at the end, so work from there
            // backward.
            for (int i = num_parts - 1; i >= 0; i--) {
                GMime.Object subpart = multipart[i];

                if (IsMimeTypeHandled(subpart.ContentType.ToString())) {
                    // Kept alive past this loop; disposed at the end of the method.
                    part = subpart;
                    part_needs_dispose = true;
                    break;
                } else {
                    subpart.Dispose();
                }
            }
        }

        // If it's not alternative, or we don't know how to filter any of
        // the parts, treat them like a bunch of attachments.
        if (part == null) {
            for (int i = 0; i < num_parts; i++) {
                using (GMime.Object subpart = multipart[i])
                    this.OnEachPart(subpart);
            }
        }
    } else if (mime_part is GMime.Part) {
        part = mime_part;
    } else {
        // `part` is still null on this path, so report the type of the
        // object we were actually given.
        throw new Exception(String.Format("Unknown part type: {0}", mime_part.GetType()));
    }

    if (part != null) {
        System.IO.Stream stream = null;

        using (GMime.DataWrapper content_obj = ((GMime.Part)part).ContentObject)
            stream = content_obj.Stream;

        // If this is the only part and it's plain text, we
        // want to just attach it to our filter instead of
        // creating a child indexable for it.
        bool no_child_needed = false;

        string mime_type = part.ContentType.ToString().ToLower();

        if (this.depth == 1 && this.count == 0) {
            if (mime_type == "text/plain") {
                no_child_needed = true;

                this.reader = new StreamReader(stream);
            } else if (mime_type == "text/html") {
                no_child_needed = true;
                html_part = true;
                string enc = part.ContentType.GetParameter("charset");

                // DataWrapper.Stream is a very limited stream
                // and does not allow Seek or Tell
                // HtmlFilter requires Stream.Position=0.
                // Play safe and create a memorystream
                // for HTML parsing.
                GMime.StreamMem mem_stream;
                mem_stream = new GMime.StreamMem();

                GMime.Stream data_stream;
                data_stream = ((StreamWrapper)stream).GMimeStream;
                data_stream.WriteToStream(mem_stream);
                data_stream.Flush();

                // The StreamWrapper and hence the memory_stream
                // will be closed when the reader is closed
                // after Pull()-ing is done.
                System.IO.Stream html_stream;
                html_stream = new StreamWrapper(mem_stream);
                html_stream.Seek(0, SeekOrigin.Begin);

                stream.Close();

                try {
                    this.reader = FilterHtml.GetHtmlReader(html_stream, enc, link_handler);
                } catch (Exception e) {
                    // Filtering failed: drop the reader and release the copy.
                    Log.Debug(e, "Exception while filtering HTML email {0}", this.indexable.Uri);
                    this.reader = null;
                    html_stream.Close();
                    html_part = false;
                }
            }
        }

        if (!no_child_needed) {
            // Check the mime type against the blacklist and don't index any
            // parts that are contained within. That way the user doesn't
            // get flooded with pointless signatures and vcard and ical
            // attachments along with (real) attachments.
            if (Array.IndexOf(blacklisted_mime_types, mime_type) == -1) {
                string sub_uri = "#" + this.count;
                Indexable child;
                child = new Indexable(UriFu.AddFragment(this.indexable.Uri, sub_uri, true));

                child.DisplayUri = new Uri(this.indexable.DisplayUri.ToString() + "#" + this.count);

                // This is a special case.
                // Even for mails found on disk, MailMessage hitype is set
                child.HitType = "MailMessage";
                child.MimeType = mime_type;

                // If this is the richest part we found for multipart emails, add its content to textcache
                if (snippet_attachment || (this.depth == 1 && this.count == 0)) {
                    child.CacheContent = true;
                } else {
                    child.CacheContent = false;
                }

                string filename = ((GMime.Part)part).Filename;

                if (!String.IsNullOrEmpty(filename)) {
                    child.AddProperty(Property.NewKeyword("fixme:attachment_title", filename));

                    foreach (Property prop in Property.StandardFileProperties(filename, false))
                        child.AddProperty(prop);
                }

                // Store length of attachment; a length of -1 is not recorded
                long length = stream.Length;
                if (length != -1)
                    child.AddProperty(Property.NewUnsearched("fixme:filesize", length));

                if (part.ContentType.MediaType.ToLower() == "text") {
                    child.SetTextReader(new StreamReader(stream));
                } else {
                    child.SetBinaryStream(stream);
                }

                child.SetChildOf(this.indexable);
                child.StoreStream();
                child.CloseStreams();
                this.child_indexables.Add(child);
            } else {
                Log.Debug("Skipping attachment {0}#{1} with blacklisted mime type {2}",
                          this.indexable.Uri, this.count, mime_type);
            }
        }

        this.count++;
    }

    if (part_needs_dispose)
        part.Dispose();

    --depth;
}
// Reads the next BibTeX entry from `reader` and returns it as a child
// indexable.
//
// An entry starts at a header line "@<type> <name>" and continues with
// "key=value" lines until an empty line or the end of the input. Each
// key/value pair is converted to properties by EntryLineToProperty ().
//
// child: set to the generated indexable, or null when none was generated.
//
// Returns false when InitBibparse () fails or no header line is left,
// true otherwise.
public override bool GenerateNextIndexable (out Indexable child)
{
	child = null;

	if (bib_process == null && ! InitBibparse ())
		return false;

	string line = null;
	string type = null, name = null;

	// Skip ahead to the next "@type name" header
	while ((line = reader.ReadLine ()) != null) {
		if (line == String.Empty || line [0] != '@')
			continue;

		int i = line.IndexOf (' ');
		// A header needs a space followed by a non-empty name
		if (i == -1 || line.Length == i + 1)
			continue;

		// Entry types are identifiers: lowercase them culture-independently
		type = line.Substring (1, i - 1).ToLowerInvariant ();
		name = line.Substring (i + 1);
		break;
	}

	if (line == null)
		return false;

	child = new Indexable (UriFu.AddFragment (Indexable.Uri, name, false));
	child.CacheContent = false;
	child.MimeType = "text/x-bibtex";
	child.DisplayUri = child.Uri;
	child.NoContent = true;
	child.AddProperty (Property.NewKeyword ("bibtex:type", type));

	string key, value;

	// Now fill in properties from the key=value lines
	while ((line = reader.ReadLine ()) != null) {
		// Entries are separated by empty lines
		if (line == String.Empty)
			break;

		int i = line.IndexOf ('=');

		// Skip lines with an empty key (i < 1) or an empty value
		// ('=' is the last character). This used to compare the
		// length against i + i, which accepted empty values and
		// rejected valid lines exactly twice as long as their key.
		if (i < 1 || line.Length == i + 1)
			continue;

		key = line.Substring (0, i).ToLowerInvariant ();
		value = line.Substring (i + 1);

		foreach (Property prop in EntryLineToProperty (key, value))
			child.AddProperty (prop);
	}

	child.SetChildOf (Indexable);
	return true;
}