// Since some parent properties may be stored in child properties
        // (as "parent:" properties), any property change on the parent
        // should be propagated to all of its children as well.
        // Builds one PropertyChange indexable per key of children_docs,
        // skipping the key equal to the parent's own Uri, and marks each
        // one as a child of parent.  Returns null when children_docs is null.
        private ArrayList GetChildPropertyChange(Hashtable children_docs,
                                                 Indexable parent)
        {
            // FIXME FIXME FIXME: Post-Child-Indexable-Fix
            if (children_docs == null)
            {
                return null;
            }

            Uri       parent_uri       = parent.Uri;
            ArrayList property_changes = new ArrayList();

            foreach (Uri child_uri in children_docs.Keys)
            {
                // FIXME: Currently, children_docs has both the parent and children docs,
                // so the parent's own entry is filtered out here.
                bool is_parent_itself = UriFu.Equals(child_uri, parent_uri);

                if (!is_parent_itself)
                {
                    Indexable change = new Indexable(IndexableType.PropertyChange, child_uri);
                    Log.Debug("Creating property change child indexable for {1} (parent {0})", parent.Uri, child_uri);

                    // This is where the child indexable will have new properties from parent
                    change.SetChildOf(parent);
                    property_changes.Add(change);
                }
            }

            return property_changes;
        }
        // Lazily walks the archive.  Every entry whose name contains no dot
        // is parsed as XML; if it has a root "/Type" node, one child
        // indexable is yielded for the type, followed by one child
        // indexable per "Members/Member" node.  Finished() is called once
        // the whole archive has been enumerated.
        private IEnumerable GetNextEntryIndexable()
        {
            foreach (ZipEntry entry in archive)
            {
                // Only entries without a dot in their name are considered.
                if (entry.Name.IndexOf(".") != -1)
                {
                    continue;
                }

                XmlDocument document = new XmlDocument();

                // The document is fully loaded into memory by Load(), so the
                // entry stream can be released right away instead of being
                // left open for every entry in the archive.
                using (System.IO.Stream entry_stream = archive.GetInputStream(entry))
                {
                    document.Load(entry_stream);
                }

                XmlNode type = document.SelectSingleNode("/Type");

                // Not a type description; nothing to index for this entry.
                if (type == null)
                {
                    continue;
                }

                Indexable type_indexable = TypeNodeToIndexable(type, Indexable.Uri);
                type_indexable.SetChildOf(this.Indexable);
                type_indexable.StoreStream();
                type_indexable.CloseStreams();
                yield return(type_indexable);

                foreach (XmlNode member in type.SelectNodes("Members/Member"))
                {
                    Indexable member_indexable = MemberNodeToIndexable(member,
                                                                       Indexable.Uri,
                                                                       type.Attributes ["FullName"].Value);
                    member_indexable.SetChildOf(this.Indexable);
                    member_indexable.StoreStream();
                    member_indexable.CloseStreams();
                    yield return(member_indexable);
                }
            }

            Finished();
        }
Beispiel #3
0
        // Reads the whole docbook document from `reader`.  Sections and
        // chapters that carry an "id" attribute are tracked on
        // entries_stack and turned into child indexables when their end
        // element is reached; text outside any tracked entry is appended
        // to the main indexable.  Ends with Error() when nothing was
        // generated and no title was found, otherwise with Finished().
        override protected void DoPullProperties()
        {
            Stopwatch timer = new Stopwatch();

            timer.Start();

            while (reader.Read())
            {
                XmlNodeType node_type = reader.NodeType;

                if (node_type == XmlNodeType.Element)
                {
                    string element = reader.Name;

                    if (element.StartsWith("sect") || element.StartsWith("chapter"))
                    {
                        string id = reader.GetAttribute("id");

                        // Only sections with a non-empty id are tracked
                        if (!String.IsNullOrEmpty(id))
                        {
                            DocbookEntry section = new DocbookEntry();
                            section.Id    = id;
                            section.Depth = reader.Depth;

                            string section_language = reader.GetAttribute("lang");

                            if (!String.IsNullOrEmpty(section_language))
                            {
                                section.Language = section_language;
                            }

                            entries_stack.Push(section);
                        }
                    }
                    else if (element == "article" || element == "book")
                    {
                        string document_language = reader.GetAttribute("lang");

                        if (!String.IsNullOrEmpty(document_language))
                        {
                            base_language = document_language;
                        }
                    }
                    else if (element == "title")
                    {
                        // Go to the text node
                        reader.Read();

                        if (entries_stack.Count > 0)
                        {
                            // Only the first title seen inside an entry is kept
                            DocbookEntry current = (DocbookEntry)entries_stack.Peek();

                            if (current.Title == null)
                            {
                                current.Title = reader.Value;
                            }
                        }
                        else if (base_title == null)
                        {
                            // This is probably the book title
                            base_title = reader.Value;
                        }
                    }
                    else if (element == "keyword")
                    {
                        // read the text node
                        reader.Read();
                        AddProperty(Property.NewKeyword("dc:subject", reader.Value));
                    }
                }
                else if (node_type == XmlNodeType.Text)
                {
                    if (entries_stack.Count > 0)
                    {
                        // Append text to the child indexable
                        DocbookEntry current = (DocbookEntry)entries_stack.Peek();
                        current.Content.Append(reader.Value);
                    }
                    else
                    {
                        // Append text to the main indexable
                        AppendWord(reader.Value);
                    }
                }
                else if (node_type == XmlNodeType.EndElement)
                {
                    // An end element at the depth of the innermost tracked
                    // entry closes that entry.
                    if (entries_stack.Count > 0 &&
                        ((DocbookEntry)entries_stack.Peek()).Depth == reader.Depth)
                    {
                        DocbookEntry closed = (DocbookEntry)entries_stack.Pop();
                        DocbookEntry enclosing = null;

                        if (entries_stack.Count > 0)
                        {
                            enclosing = (DocbookEntry)entries_stack.Peek();
                        }

                        Uri       entry_uri   = UriFu.AddFragment(Indexable.Uri, closed.Id, false);
                        Indexable entry_child = new Indexable(entry_uri);
                        entry_child.HitType  = "DocbookEntry";
                        entry_child.MimeType = "text/x-docbook-entry";
                        entry_child.AddProperty(Property.NewKeyword("beagle:FileType", "documentation"));
                        entry_child.Filtering = IndexableFiltering.AlreadyFiltered;

                        entry_child.AddProperty(Property.NewUnsearched("fixme:id", closed.Id));
                        entry_child.AddProperty(Property.New("dc:title", closed.Title));

                        // Add the docbook book title
                        entry_child.AddProperty(Property.NewUnsearched("fixme:base_title", base_title));

                        // Add the child language (or docbook language if none is specified)
                        if (closed.Language != null)
                        {
                            entry_child.AddProperty(Property.NewUnsearched("fixme:language", closed.Language));
                        }
                        else if (base_language != null)
                        {
                            entry_child.AddProperty(Property.NewUnsearched("fixme:language", base_language));
                        }

                        // Add any parent (as in docbook parent entry, not beagle) data if we have it
                        if (enclosing != null)
                        {
                            entry_child.AddProperty(Property.NewUnsearched("fixme:parent_id", enclosing.Id));
                            entry_child.AddProperty(Property.NewUnsearched("fixme:parent_title", enclosing.Title));
                        }

                        entry_child.SetTextReader(new StringReader(closed.Content.ToString()));
                        entry_child.SetChildOf(this.Indexable);

                        AddIndexable(entry_child);
                    }
                }
            }

            // Add the common properties to the top-level
            // file item such as Title, Language etc.
            AddProperty(Property.New("dc:title", base_title));
            AddProperty(Property.NewUnsearched("fixme:language", base_language));

            timer.Stop();

            // If we've successfully crawled the file but haven't
            // found any indexables, we shouldn't consider it
            // successful at all (unless we have a title, which
            // means that it's actually a docbook file, just without
            // sections).
            if (!HasGeneratedIndexable && base_title == null)
            {
                Log.Error("Probably not a docbook. Ignoring {0}!", base_path);
                Error();
                return;
            }

            Logger.Log.Debug("Parsed docbook file in {0}", timer);

            Finished();
        }
Beispiel #4
0
        // Produces the next bibtex entry as a child indexable.
        //
        // The reader is scanned for the next header line of the form
        // "@<type> <name>"; the following "key=value" lines, up to the
        // first empty line, are converted to properties through
        // EntryLineToProperty.
        //
        // child: set to the new indexable, or null when none is produced.
        // Returns false when bibparse could not be initialised or no
        // further entry header is found, true otherwise.
        public override bool GenerateNextIndexable(out Indexable child)
        {
            child = null;

            if (bib_process == null && !InitBibparse())
            {
                return(false);
            }

            string line = null;
            string type = null, name = null;

            // Look for the next "@type name" header line
            while ((line = reader.ReadLine()) != null)
            {
                if (line == String.Empty || line [0] != '@')
                {
                    continue;
                }

                int i = line.IndexOf(' ');
                // Skip headers without a separator or with an empty name
                if (i == -1 || line.Length == i + 1)
                {
                    continue;
                }
                type = line.Substring(1, i - 1).ToLower();
                name = line.Substring(i + 1);
                break;
            }

            // End of input reached without finding another header
            if (line == null)
            {
                return(false);
            }

            child = new Indexable(UriFu.AddFragment(Indexable.Uri, name, false));
            child.CacheContent = false;
            child.MimeType     = "text/x-bibtex";
            child.DisplayUri   = child.Uri;
            child.NoContent    = true;
            child.AddProperty(Property.NewKeyword("bibtex:type", type));

            string key, value;

            // Now fill in properties from the key=value lines
            while ((line = reader.ReadLine()) != null)
            {
                // Entries are separated by empty lines
                if (line == String.Empty)
                {
                    break;
                }

                int i = line.IndexOf('=');
                // Ensure a non-empty key (i < 1 also covers a missing '=')
                // and a non-empty value ('=' must not be the last character).
                // This used to compare against i + i, which wrongly dropped
                // valid lines such as "ab=c" and let empty values through.
                if (i < 1 || line.Length == i + 1)
                {
                    continue;
                }
                key   = line.Substring(0, i).ToLower();
                value = line.Substring(i + 1);
                foreach (Property prop in EntryLineToProperty(key, value))
                {
                    child.AddProperty(prop);
                }
            }

            child.SetChildOf(Indexable);
            return(true);
        }
		// Since some parent properties may be stored in child properties
		// (as "parent:" properties), any property change on the parent
		// should be propagated to all of its children as well.
		// Creates a PropertyChange indexable for each key of children_docs
		// except the one equal to the parent's Uri, attaching every one of
		// them to parent.  The result is null when children_docs is null.
		private ArrayList GetChildPropertyChange (Hashtable children_docs,
							  Indexable parent)
		{
			// FIXME FIXME FIXME: Post-Child-Indexable-Fix
			ArrayList changes = null;

			if (children_docs != null) {
				Uri parent_uri = parent.Uri;
				changes = new ArrayList ();

				foreach (Uri doc_uri in children_docs.Keys) {
					// FIXME: Currently, children_docs has both the parent and children docs
					if (UriFu.Equals (doc_uri, parent_uri))
						continue;

					Indexable change = new Indexable (IndexableType.PropertyChange, doc_uri);
					Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, doc_uri);

					// This is where the change indexable will have new properties from parent
					change.SetChildOf (parent);
					changes.Add (change);
				}
			}

			return changes;
		}
		// Reads the whole docbook document from `reader` in a single pass.
		// Elements whose name starts with "sect" or "chapter" and that carry
		// a non-empty "id" are pushed on entries_stack; when the matching
		// end element is reached (same reader depth) the entry is popped and
		// emitted as a child indexable.  Text found while no entry is open
		// goes to the main indexable.  The method ends with Error () when no
		// indexable was generated and no title was found, and with
		// Finished () otherwise.
		override protected void DoPullProperties ()
		{
			// Measures the parsing time, reported in the debug log below
			Stopwatch watch = new Stopwatch ();

			watch.Start ();

			while (reader.Read ()) {
				switch (reader.NodeType) {
				case XmlNodeType.Element:
					if (reader.Name.StartsWith ("sect") || reader.Name.StartsWith ("chapter")) {
						string id = reader.GetAttribute ("id");

						// Sections without an id are not tracked, so their
						// text ends up in the enclosing entry (or the main
						// indexable when no entry is open).
						if (id != null && id != String.Empty) {
							DocbookEntry entry = new DocbookEntry ();
							entry.Id = id;
							// Remembered so the matching end element can be recognised
							entry.Depth = reader.Depth;

							string language = reader.GetAttribute ("lang");

							if (language != null && language != String.Empty)
								entry.Language = language;

							entries_stack.Push (entry);
						}
					} else if (reader.Name == "article" || reader.Name == "book") {
						// Document-wide language, used as fallback for entries
						string language = reader.GetAttribute ("lang");

						if (language != null && language != String.Empty)
							base_language = language;
					} else if (reader.Name == "title") {
						reader.Read (); // Go to the text node

						if (entries_stack.Count == 0 && base_title == null) {
							// This is probably the book title
							base_title = reader.Value;
						} else if (entries_stack.Count > 0) {
							DocbookEntry entry = (DocbookEntry) entries_stack.Peek ();

							// Only the first title seen inside an entry is kept
							if (entry.Title == null)
								entry.Title = reader.Value;
						}
					} else if (reader.Name == "keyword") {
						reader.Read (); // read the text node
						AddProperty (Property.NewKeyword ("dc:subject", reader.Value));
					}
					break;

				case XmlNodeType.Text:
					// Append text to the child indexable
					if (entries_stack.Count > 0)
						((DocbookEntry) entries_stack.Peek ()).Content.Append (reader.Value);

					// Append text to the main indexable
					else
						AppendWord (reader.Value);
					break;

				case XmlNodeType.EndElement:
					// An end element at the depth of the innermost open
					// entry closes that entry.
					if (entries_stack.Count > 0 &&
					    ((DocbookEntry) entries_stack.Peek ()).Depth == reader.Depth) {
						DocbookEntry entry, parent_entry = null;

						entry = (DocbookEntry) entries_stack.Pop ();

						// Whatever is left on top of the stack encloses this entry
						if (entries_stack.Count > 0)
							parent_entry = (DocbookEntry) entries_stack.Peek ();

						// The child is addressed as <file uri>#<entry id>
						Indexable indexable;
						indexable = new Indexable (UriFu.AddFragment (Indexable.Uri, entry.Id, false));
						indexable.HitType = "DocbookEntry";
						indexable.MimeType = "text/x-docbook-entry";
						indexable.AddProperty (Property.NewKeyword ("beagle:FileType", "documentation"));
						indexable.Filtering = IndexableFiltering.AlreadyFiltered;

						indexable.AddProperty (Property.NewUnsearched ("fixme:id", entry.Id));
						indexable.AddProperty (Property.New ("dc:title", entry.Title));

						// Add the docbook book title
						indexable.AddProperty (Property.NewUnsearched ("fixme:base_title", base_title));

						// Add the child language (or docbook language if none is specified)
						if (entry.Language != null)
							indexable.AddProperty (Property.NewUnsearched ("fixme:language", entry.Language));
						else if (base_language != null)
							indexable.AddProperty (Property.NewUnsearched ("fixme:language", base_language));

						// Add any parent (as in docbook parent entry, not beagle) data if we have it
						if (parent_entry != null) {
							indexable.AddProperty (Property.NewUnsearched ("fixme:parent_id", parent_entry.Id));
							indexable.AddProperty (Property.NewUnsearched ("fixme:parent_title", parent_entry.Title));
						}


						// The text collected for this entry becomes the child's content
						StringReader content_reader = new StringReader (entry.Content.ToString ());
						indexable.SetTextReader (content_reader);
						indexable.SetChildOf (this.Indexable);

						AddIndexable (indexable);
					}
					break;
				}
			}

			// Add the common properties to the top-level
			// file item such as Title, Language etc.
			// NOTE(review): base_title and base_language may still be null
			// here; presumably Property.New* tolerates that -- confirm.

			AddProperty (Property.New ("dc:title", base_title));
			AddProperty (Property.NewUnsearched ("fixme:language", base_language));

			watch.Stop ();

			// If we've successfully crawled the file but haven't
			// found any indexables, we shouldn't consider it
			// successful at all (unless we have a title, which
			// means that it's actually a docbook file, just without
			// sections).
			if (! HasGeneratedIndexable && base_title == null) {
				Log.Error ("Probably not a docbook. Ignoring {0}!", base_path);
				Error ();
				return;
			}

			Logger.Log.Debug ("Parsed docbook file in {0}", watch);

			Finished ();
		}
        // Produces a child indexable for the next entry of the archive.
        //
        // child: set to the new indexable; left null when the entry is
        //        skipped or when there is nothing more to produce.
        // Returns false (after calling Close()) once the child-count limit
        // or the total-size limit is hit or the archive is exhausted.
        // Returns true otherwise, including for a skipped entry, in which
        // case child stays null.
        public override bool GenerateNextIndexable(out Indexable child)
        {
            child = null;

            if (!setup_done)
            {
                SetupArchiveStream();
            }

            // Limit on the number of extracted children
            if (count >= MAX_CHILDREN)
            {
                Log.Debug("Archive {0} contains more than {1} files.  Only {1} files indexed.", Indexable.DisplayUri.ToString(), count);
                Close();
                return false;
            }

            // Limit on the accumulated size of extracted entries
            if (total_size > MAX_ALL_FILES)
            {
                Log.Debug("Archive {0} crossed our max uncompressed size threshold.  Only {1} files extracted", Indexable.DisplayUri.ToString(), count);
                Close();
                return false;
            }

            ArchiveEntry next_entry = DoGetNextEntry();

            if (next_entry == null)
            {
                Close();
                return false;
            }

            string entry_file_name = Path.GetFileName(next_entry.Name);

            // Store file names in the archive
            AppendText(entry_file_name);
            AppendWhiteSpace();

            // If this is an invalid or oversized entry, skip it.
            if (next_entry.TempFile == null)
            {
                return true;
            }

            count++;
            total_size += next_entry.Size;

            // Add "#<escaped-path-to-entry>" to the end of the Indexable Uri
            // So, file b#c in archive foo.zip becomes file:///foo.zip#b%23c
            // And file c in archive b in archive foo.zip becomes file:///foo.zip#b#c
            Uri child_uri = UriFu.AddFragment(Indexable.Uri, next_entry.Name, false);
            child = new Indexable(child_uri);

            child.CacheContent = true;
            child.MimeType     = next_entry.MimeType;

            string display = Indexable.DisplayUri.ToString() + "#" + next_entry.Name;
            child.DisplayUri    = new Uri(display);
            child.ContentUri    = UriFu.PathToFileUri(next_entry.TempFile);
            child.DeleteContent = true;

            // FIXME Remove fixme:inside_archive during Property Hack Week
            // Replace most flag properties by value properties
            child.AddProperty(Property.NewBool("fixme:inside_archive", true));
            // Use this instead of fixme:inside_archive
            child.AddProperty(Property.NewKeyword("archive:type", archive_type));

            child.AddProperty(Property.NewKeyword("fixme:relativeuri", next_entry.Name));
            child.AddProperty(Property.New("fixme:comment", next_entry.Comment));
            child.AddProperty(Property.NewUnsearched("fixme:filesize", next_entry.Size));

            foreach (Property file_prop in Property.StandardFileProperties(Path.GetFileName(next_entry.Name), false))
            {
                child.AddProperty(file_prop);
            }

            child.SetChildOf(Indexable);

            return true;
        }
Beispiel #8
0
            // Visits one MIME part and recurses into nested messages and
            // multiparts.
            //
            // * GMime.MessagePart: recurse into the embedded message.
            // * GMime.Multipart: for multipart/alternative, pick the last
            //   sub-part whose mime type is handled; otherwise (or if none
            //   is handled) recurse into every sub-part.
            // * GMime.Part: handled directly.
            //
            // A handled part either becomes the reader of this filter (a
            // lone top-level text/plain or text/html part) or is stored as
            // a child indexable, unless its mime type is blacklisted.
            //
            // Throws Exception when mime_part is none of the known kinds.
            public void OnEachPart(GMime.Object mime_part)
            {
                GMime.Object part = null;
                bool         part_needs_dispose = false;

                //for (int i = 0; i < this.depth; i++)
                //  Console.Write ("  ");
                //Console.WriteLine ("Content-Type: {0}", mime_part.ContentType);

                ++depth;

                if (mime_part is GMime.MessagePart)
                {
                    GMime.MessagePart msg_part = (GMime.MessagePart)mime_part;

                    using (GMime.Message message = msg_part.Message) {
                        using (GMime.Object subpart = message.MimePart)
                            this.OnEachPart(subpart);
                    }
                }
                else if (mime_part is GMime.Multipart)
                {
                    GMime.Multipart multipart = (GMime.Multipart)mime_part;
                    int             num_parts = multipart.Count;

                    // If the mimetype is multipart/alternative, we only want to index
                    // one part -- the richest one we can filter.
                    if (mime_part.ContentType.MediaSubtype.ToLower() == "alternative")
                    {
                        // The richest formats are at the end, so work from there
                        // backward.
                        for (int i = num_parts - 1; i >= 0; i--)
                        {
                            GMime.Object subpart = multipart[i];

                            if (IsMimeTypeHandled(subpart.ContentType.ToString()))
                            {
                                // Kept alive until the end of this call; disposed below
                                part = subpart;
                                part_needs_dispose = true;
                                break;
                            }
                            else
                            {
                                subpart.Dispose();
                            }
                        }
                    }

                    // If it's not alternative, or we don't know how to filter any of
                    // the parts, treat them like a bunch of attachments.
                    if (part == null)
                    {
                        for (int i = 0; i < num_parts; i++)
                        {
                            using (GMime.Object subpart = multipart[i])
                                this.OnEachPart(subpart);
                        }
                    }
                }
                else if (mime_part is GMime.Part)
                {
                    part = mime_part;
                }
                else
                {
                    // `part` is always null in this branch, so the offending
                    // object to report is mime_part itself.
                    string unknown_type = (mime_part == null) ? "(null)" : mime_part.GetType().ToString();
                    throw new Exception(String.Format("Unknown part type: {0}", unknown_type));
                }

                if (part != null)
                {
                    System.IO.Stream stream = null;

                    using (GMime.DataWrapper content_obj = ((GMime.Part)part).ContentObject)
                        stream = content_obj.Stream;

                    // If this is the only part and it's plain text, we
                    // want to just attach it to our filter instead of
                    // creating a child indexable for it.
                    bool no_child_needed = false;

                    string mime_type = part.ContentType.ToString().ToLower();

                    if (this.depth == 1 && this.count == 0)
                    {
                        if (mime_type == "text/plain")
                        {
                            no_child_needed = true;

                            this.reader = new StreamReader(stream);
                        }
                        else if (mime_type == "text/html")
                        {
                            no_child_needed = true;
                            html_part       = true;
                            string enc = part.ContentType.GetParameter("charset");
                            // DataWrapper.Stream is a very limited stream
                            // and does not allow Seek or Tell
                            // HtmlFilter requires Stream.Position=0.
                            // Play safe and create a memorystream
                            // for HTML parsing.

                            GMime.StreamMem mem_stream;
                            mem_stream = new GMime.StreamMem();

                            GMime.Stream data_stream;
                            data_stream = ((StreamWrapper)stream).GMimeStream;
                            data_stream.WriteToStream(mem_stream);
                            data_stream.Flush();

                            // The StreamWrapper and hence the memory_stream
                            // will be closed when the reader is closed
                            // after Pull()-ing is done.
                            System.IO.Stream html_stream;
                            html_stream = new StreamWrapper(mem_stream);
                            html_stream.Seek(0, SeekOrigin.Begin);

                            stream.Close();

                            try {
                                this.reader = FilterHtml.GetHtmlReader(html_stream, enc, link_handler);
                            } catch (Exception e) {
                                // Best effort: give up on this HTML body but keep going
                                Log.Debug(e, "Exception while filtering HTML email {0}", this.indexable.Uri);
                                this.reader = null;
                                html_stream.Close();
                                html_part = false;
                            }
                        }
                    }

                    if (!no_child_needed)
                    {
                        // Check the mime type against the blacklist and don't index any
                        // parts that are contained within.  That way the user doesn't
                        // get flooded with pointless signatures and vcard and ical
                        // attachments along with (real) attachments.

                        if (Array.IndexOf(blacklisted_mime_types, mime_type) == -1)
                        {
                            // Children are addressed by their running part number
                            string    sub_uri = "#" + this.count;
                            Indexable child;
                            child = new Indexable(UriFu.AddFragment(this.indexable.Uri, sub_uri, true));

                            child.DisplayUri = new Uri(this.indexable.DisplayUri.ToString() + "#" + this.count);

                            // This is a special case.
                            // Even for mails found on disk, MailMessage hitype is set
                            child.HitType  = "MailMessage";
                            child.MimeType = mime_type;

                            // If this is the richest part we found for multipart emails, add its content to textcache
                            if (snippet_attachment ||
                                (this.depth == 1 && this.count == 0))
                            {
                                child.CacheContent = true;
                            }
                            else
                            {
                                child.CacheContent = false;
                            }

                            string filename = ((GMime.Part)part).Filename;

                            if (!String.IsNullOrEmpty(filename))
                            {
                                child.AddProperty(Property.NewKeyword("fixme:attachment_title", filename));

                                foreach (Property prop in Property.StandardFileProperties(filename, false))
                                {
                                    child.AddProperty(prop);
                                }
                            }

                            // Store length of attachment
                            long length = stream.Length;
                            if (length != -1)
                            {
                                child.AddProperty(Property.NewUnsearched("fixme:filesize", length));
                            }

                            if (part.ContentType.MediaType.ToLower() == "text")
                            {
                                child.SetTextReader(new StreamReader(stream));
                            }
                            else
                            {
                                child.SetBinaryStream(stream);
                            }

                            child.SetChildOf(this.indexable);
                            child.StoreStream();
                            child.CloseStreams();
                            this.child_indexables.Add(child);
                        }
                        else
                        {
                            Log.Debug("Skipping attachment {0}#{1} with blacklisted mime type {2}",
                                      this.indexable.Uri, this.count, mime_type);
                        }
                    }

                    // Every handled part consumes a part number, whether or
                    // not a child indexable was created for it.
                    this.count++;
                }

                if (part_needs_dispose)
                {
                    part.Dispose();
                }

                --depth;
            }
Beispiel #9
0
		// Produces the next bibtex entry as a child indexable.
		//
		// The reader is scanned for the next header line of the form
		// "@<type> <name>"; the following "key=value" lines, up to the
		// first empty line, are converted to properties through
		// EntryLineToProperty.
		//
		// child: set to the new indexable, or null when none is produced.
		// Returns false when bibparse could not be initialised or no
		// further entry header is found, true otherwise.
		public override bool GenerateNextIndexable (out Indexable child)
		{
			child = null;

			if (bib_process == null && ! InitBibparse ())
				return false;

			string line = null;
			string type = null, name = null;
			// Look for the next "@type name" header line
			while ((line = reader.ReadLine ()) != null) {
				if (line == String.Empty || line [0] != '@')
					continue;

				int i = line.IndexOf (' ');
				// Skip headers without a separator or with an empty name
				if (i == -1 || line.Length == i + 1)
					continue;
				type = line.Substring (1, i - 1).ToLower ();
				name = line.Substring (i + 1);
				break;
			}

			// End of input reached without finding another header
			if (line == null)
				return false;

			child = new Indexable (UriFu.AddFragment (Indexable.Uri, name, false));
			child.CacheContent = false;
			child.MimeType = "text/x-bibtex";
			child.DisplayUri = child.Uri;
			child.NoContent = true;
			child.AddProperty (Property.NewKeyword ("bibtex:type", type));

			string key, value;
			// Now fill in properties from the key=value lines
			while ((line = reader.ReadLine ()) != null) {
				// Entries are separated by empty lines
				if (line == String.Empty)
					break;

				int i = line.IndexOf ('=');
				// Ensure a non-empty key (i < 1 also covers a missing '=')
				// and a non-empty value ('=' must not be the last character).
				// This used to compare against i + i, which wrongly dropped
				// valid lines such as "ab=c" and let empty values through.
				if (i < 1 || line.Length == i + 1)
					continue;
				key = line.Substring (0, i).ToLower ();
				value = line.Substring (i + 1);
				foreach (Property prop in EntryLineToProperty (key, value))
					child.AddProperty (prop);
			}

			child.SetChildOf (Indexable);
			return true;
		}