Beispiel #1
0
        // This doesn't check if it makes sense to actually
        // merge the two indexables: it just does it.
        // Unconditionally fold the state of `other` into this indexable.
        // No sanity check is performed here: callers decide whether the
        // merge actually makes sense.
        public void Merge(Indexable other)
        {
            // Keep the newer of the two timestamps.
            if (this.Timestamp < other.Timestamp)
                this.Timestamp = other.Timestamp;

            // Pull in every property from the other indexable.
            foreach (Property prop in other.Properties)
                this.AddProperty(prop);

            // Overwrite (or add) each local-state entry from the other side.
            foreach (DictionaryEntry entry in other.local_state)
                this.local_state[entry.Key] = entry.Value;
        }
Beispiel #2
0
        // Parent is a top-level indexable: copy every one of its properties
        // onto this (child) indexable, prefixing the key with "parent:" so
        // matches against the child can be told apart from its own properties.
        private void CopyPropertyParentToChild(Indexable parent)
        {
            foreach (Property prop in parent.Properties)
            {
                Property new_prop = (Property)prop.Clone();
                // Add parent: to property names ONLY IF
                // - not private property (these are not properties of the file content)
                // - property name does not already start with parent:
                // Keys are machine identifiers, so use ordinal comparison; the
                // culture-sensitive StartsWith overload can misclassify prefixes
                // under some locales (e.g. tr-TR).
                if (!new_prop.Key.StartsWith(Property.PrivateNamespace, StringComparison.Ordinal) &&
                    !new_prop.Key.StartsWith("parent:", StringComparison.Ordinal))
                {
                    new_prop.Key = "parent:" + new_prop.Key;
                }

                this.AddProperty(new_prop);
            }
        }
Beispiel #3
0
        // FIXME: Copying the correct properties from parent to child:
        // (This is not perfect yet)
        // It does not make sense to have parent:parent:parent:...:parent:foo
        // for property names of a nested child
        // Moreover, if indexable a.mbox has child b.zip which has child c.zip,
        // then upon matching c.zip, we would like to get the information from
        // a.mbox (i.e. the toplevel indexable) only. Intermediate parent information
        // is not necessary for displaying results; in fact, storing them would cause
        // confusion during display.
        // E.g. storing parent:beagle:filename for all parents
        // would cause, parent:beagle:filename=a.mbox, parent.beagle.filename=b.zip
        // whereas we are only interested in toplevel parent:beagle:filename=a.mbox
        // For indexables which need to store the intermediate/immediate parent info
        // separately, explicitly store them.
        // Another problem is, toplevel indexable might want to store information
        // which should not be matched when searching for its child. Copying those
        // properties in all children will incorrectly match them.
        //

        // If the parent itself is a child, copy only its "parent:" and
        // private-namespace properties as-is; every other property is still
        // cloned and added but marked IsStored = false (same behavior as
        // before, with the duplicated clone-and-add collapsed into one path).
        private void CopyPropertyChildToChild(Indexable parent)
        {
            foreach (Property prop in parent.Properties)
            {
                Property new_prop = (Property)prop.Clone();

                // Ordinal comparison: property keys are machine identifiers,
                // not user-visible text.
                bool keep_stored =
                    new_prop.Key.StartsWith("parent:", StringComparison.Ordinal) ||
                    new_prop.Key.StartsWith(Property.PrivateNamespace, StringComparison.Ordinal);

                if (!keep_stored)
                {
                    new_prop.IsStored = false;
                }

                this.AddProperty(new_prop);
            }
        }
		// Helper: build a keyword property flagged mutable + persistent and
		// attach it to the given indexable.
		private static void AddMutablePersistentKeyword (Indexable indexable, string key, string value)
		{
			Beagle.Property prop = Beagle.Property.NewKeyword (key, value);
			prop.IsMutable = true;
			prop.IsPersistent = true;
			indexable.AddProperty (prop);
		}

		// Sends a property-change update for the given item to the beagle
		// daemon: clears the existing tag properties, re-adds the item's
		// current tags, and refreshes the description.
		public static void SendUpdate (IBrowsableItem item)
		{
			Indexable indexable = new Indexable (item.DefaultVersionUri);
			indexable.Type = IndexableType.PropertyChange;

			// Clear the existing tags
			AddMutablePersistentKeyword (indexable, "fspot:Tag", "");
			AddMutablePersistentKeyword (indexable, "image:Tag", "");

			foreach (Tag t in item.Tags) {
				AddMutablePersistentKeyword (indexable, "fspot:Tag", t.Name);
				AddMutablePersistentKeyword (indexable, "image:Tag", t.Name);
			}

			Beagle.Property prop = Beagle.Property.New ("fspot:Description", item.Description);
			prop.IsMutable = true;
			prop.IsPersistent = true;
			indexable.AddProperty (prop);

			// Create a message to send to the daemon with this information.
			// The source tells it in which index the existing document lives.
			IndexingServiceRequest req = new IndexingServiceRequest ();
			req.Keepalive = false;
			req.Source = "Files";
			req.Add (indexable);

			req.SendAsync ();
		}
Beispiel #5
0
        //////////////////////////

        // Marks this indexable as a child of `parent`, inheriting the
        // parent uri, timestamp, hit type, source and (via the copy
        // helpers) properties where this indexable has none of its own.
        public void SetChildOf(Indexable parent)
        {
            this.IsChild = true;

            // Children of children all point at the top-level parent uri.
            this.ParentUri = parent.IsChild ? parent.ParentUri : parent.Uri;

            if (!this.ValidTimestamp)
                this.Timestamp = parent.Timestamp;

            if (string.IsNullOrEmpty(this.HitType))
                this.HitType = parent.HitType;

            this.Source = parent.Source;

            // FIXME: Set all of the parent's properties on the
            // child so that we get matches against the child
            // that otherwise would match only the parent, at
            // least until we have proper RDF support.

            if (parent.IsChild)
                CopyPropertyChildToChild(parent);
            else
                CopyPropertyParentToChild(parent);
        }
Beispiel #6
0
	// Entry point for beagle-master-delete-button: removes a single uri
	// from the named index, reporting whether anything was deleted.
	static void Main (string[] args)
	{
		if (args.Length != 2) {
			Console.WriteLine ("Usage: beagle-master-delete-button index-name uri-to-delete");
			return;
		}

		string index_name = args [0];

		LuceneQueryingDriver driver = new LuceneQueryingDriver (index_name, -1, true);

		Uri uri = new Uri (args [1], false);
		Uri uri_to_delete = RemapUri (driver, uri);

		LuceneIndexingDriver indexer = new LuceneIndexingDriver (index_name, false);

		Indexable indexable = new Indexable (uri_to_delete);
		indexable.Type = IndexableType.Remove;

		IndexerRequest request = new IndexerRequest ();
		request.Add (indexable);

		IndexerReceipt [] receipts = indexer.Flush (request);

		// No receipts at all, or a receipt reporting zero removals, both
		// mean the uri was not present in the index.
		IndexerRemovedReceipt r = null;
		if (receipts != null && receipts.Length > 0)
			r = receipts [0] as IndexerRemovedReceipt;

		if (r == null || r.NumRemoved == 0) {
			Console.WriteLine ("Uri {0} not found in {1}",
					   uri, index_name);
			return;
		}

		Console.WriteLine ("Uri {0} deleted", uri);
	}
Beispiel #7
0
		/////////////////////////////////////////////////////////////////

		// Queues an indexable on the pending request, then flushes the
		// indexer whenever the batch is full, repeating while the flush
		// reports that it generated further indexables to process.
		// NOTE(review): relies on the statics arg_disable_filtering,
		// arg_tag, arg_source, pending_request, driver and BATCH_SIZE
		// declared elsewhere in this file.
		static void AddToRequest (Indexable indexable)
		{
			if (indexable == null)
				return;

			// Disable filtering and only index file attributes
			if (arg_disable_filtering)
				indexable.Filtering = IndexableFiltering.Never;
					
			// Tag the item for easy identification (for say, removal)
			if (arg_tag != null)
				indexable.AddProperty (Property.NewUnsearched("Tag", arg_tag));

			indexable.Source = arg_source;

			pending_request.Add (indexable);
			bool reschedule = false;

			do {
				if (Shutdown.ShutdownRequested)
					break;

				// First pass: only flush once a full batch has accumulated.
				// Later passes (reschedule == true) keep flushing regardless.
				if (! reschedule && pending_request.Count < BATCH_SIZE)
					break;

				if (reschedule)
					Logger.Log.Debug ("Continuing indexing indexer generated indexables");
				else
					Logger.Log.Debug ("Flushing driver, {0} items in queue", pending_request.Count);

				reschedule = FlushIndexer (driver);

				// Super Lame Hack: gtk-sharp up to 2.10 requires a main loop
				// to dispose of any managed wrappers around GObjects.  Since
				// we don't have one, we'll process all the pending items in
				// a loop here.  This is particularly an issue with maildirs,
				// because we need the loop to clean up after GMime.  Without
				// it, GMime's streams are never completely unref'd, the
				// file descriptors aren't closed, and we run out and crash.
				while (GLib.MainContext.Pending ())
					GLib.MainContext.Iteration ();

			} while (reschedule);
		}
Beispiel #8
0
		// Drives one full indexing pass: walks pending_directories,
		// queuing indexables for every directory and non-ignored file,
		// then prunes index entries for files that no longer exist on
		// disk, flushes the indexer until the request queue drains, and
		// finally optimizes the index.  Checks Shutdown.ShutdownRequested
		// between stages so it can bail out early.
		static void DoIndexing ()
		{
			int count_dirs = 0;
			int count_files = 0;

			Indexable indexable;
			pending_request = new IndexerRequest ();
			Queue modified_directories = new Queue ();
			
			while (pending_directories.Count > 0) {
				DirectoryInfo dir = (DirectoryInfo) pending_directories.Dequeue ();

				AddToRequest (DirectoryToIndexable (dir, modified_directories));

				try {
					if (arg_recursive)
						foreach (DirectoryInfo subdir in DirectoryWalker.GetDirectoryInfos (dir))
							if (!Ignore (subdir)
							    && !FileSystem.IsSpecialFile (subdir.FullName))
								pending_directories.Enqueue (subdir);
				
					foreach (FileInfo file in DirectoryWalker.GetFileInfos (dir))
						if (!Ignore (file)
						    && !FileSystem.IsSpecialFile (file.FullName)) {
							AddToRequest (FileToIndexable (file));
							count_files ++;
						}
				
				// Directory vanished between being queued and being walked;
				// deliberately skip it.
				} catch (DirectoryNotFoundException) {}
			
				if (Shutdown.ShutdownRequested)
					break;
			
				count_dirs++;
			}

			Logger.Log.Debug ("Scanned {0} files and directories in {1} directories", count_dirs + count_files, count_dirs);

			if (Shutdown.ShutdownRequested) {
				backing_fa_store.Flush ();
				return;
			}

			// Time to remove deleted directories from the index and attributes store
			while (modified_directories.Count > 0) {
				DirectoryInfo subdir = (DirectoryInfo) modified_directories.Dequeue ();
				Logger.Log.Debug ("Checking {0} for deleted files and directories", subdir.FullName);

				// Get a list of all documents from lucene index with ParentDirUriPropKey set as that of subdir
				ICollection all_dirent = GetAllItemsInDirectory (subdir);
				foreach (Dirent info in all_dirent) {
					// check if the item exists
					if ((! info.IsDirectory && File.Exists (info.FullName)) || 
					    (info.IsDirectory && Directory.Exists (info.FullName)))
						continue;

					if (info.IsDirectory)
						// Recursively remove deleted subdirectories
						modified_directories.Enqueue (new DirectoryInfo (info.FullName));
					
					// remove
					Uri uri = PathToUri (info.FullName);
					indexable = new Indexable (IndexableType.Remove, uri);
					AddToRequest (indexable);
				}
			}

			bool reschedule = false;
			// Call Flush until our request is empty.  We have to do this in a loop
			// because Flush happens in a batch size and some indexables might generate more indexables
			while (reschedule || pending_request.Count > 0) {
				if (Shutdown.ShutdownRequested)
					break;

				reschedule = FlushIndexer (driver);
			}

			backing_fa_store.Flush ();

			if (Shutdown.ShutdownRequested)
				return;

			Logger.Log.Debug ("Optimizing index");
			driver.OptimizeNow ();
		}
		// Converts an Opera history row into an Indexable ready for the
		// indexer, or returns null for https pages (never indexed).
		private Indexable OperaRowToIndexable (OperaHistory.Row row)
		{
			// It's unsafe to index secure content since it may contain sensitive data
			if (row.Address.Scheme == Uri.UriSchemeHttps)
				return null;

			// Path of the cached copy on disk; compute once and reuse below
			// instead of re-joining it three times.
			string local_path = Path.Combine (cache_dir, row.LocalFileName);

			Indexable indexable = new Indexable (row.Address);

			indexable.HitType = "WebHistory";
			indexable.MimeType = row.MimeType;
			indexable.Timestamp = row.LastVisited;
			indexable.AddProperty (Beagle.Property.New ("fixme:host", row.Address.Host));
			indexable.AddProperty (Beagle.Property.NewUnsearched ("fixme:size", row.Length));
			// hint for the filter about the charset
			indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "charset", row.Encoding.ToString ()));

			// gzip-compressed cache entries are streamed through a decompressor;
			// everything else is handed over by content uri.
			if (row.Compression == "gzip")
				indexable.SetBinaryStream (new GZipInputStream (File.OpenRead (local_path)));
			else
				indexable.ContentUri = new Uri (local_path);

			indexer.AttributeStore.AttachLastWriteTime (local_path, DateTime.UtcNow);

			return indexable;
		}
		// Queues an indexable for addition.  StoreStream() is invoked first
		// (presumably to snapshot the indexable's content stream before it
		// is handed off — confirm against Indexable.StoreStream).
		public void Add (Indexable indexable)
		{
			indexable.StoreStream ();
			to_add.Add (indexable);
		}
Beispiel #11
0
        // Orders indexables by timestamp.  Follows the IComparable
        // contract: any instance compares greater than null, and a
        // non-Indexable argument raises ArgumentException (the previous
        // code threw NullReferenceException / InvalidCastException here).
        public int CompareTo(object obj)
        {
            if (obj == null)
                return 1;

            Indexable other = obj as Indexable;
            if (other == null)
                throw new ArgumentException ("Object is not an Indexable", "obj");

            return DateTime.Compare(this.Timestamp, other.Timestamp);
        }
Beispiel #12
0
		//////////////////////////

		// Attaches this indexable to `parent`, inheriting the parent uri,
		// timestamp, hit type, source and (via the copy helpers) the
		// parent's properties where appropriate.
		public void SetChildOf (Indexable parent)
		{
			this.IsChild = true;

			// Grandchildren point at the ultimate top-level parent uri.
			this.ParentUri = parent.IsChild ? parent.ParentUri : parent.Uri;

			if (! this.ValidTimestamp) {
				this.Timestamp = parent.Timestamp;
			}

			if (string.IsNullOrEmpty (this.HitType)) {
				this.HitType = parent.HitType;
			}

			this.Source = parent.Source;

			// FIXME: Set all of the parent's properties on the
			// child so that we get matches against the child
			// that otherwise would match only the parent, at
			// least until we have proper RDF support.

			if (parent.IsChild) {
				CopyPropertyChildToChild (parent);
			} else {
				CopyPropertyParentToChild (parent);
			}
		}
	// Entry point for the index-url tool: parses --url/--title/--sourcefile/
	// --deletesourcefile options, refuses https/file/mailto uris, builds an
	// Indexable from the source file or standard input, and sends it to the
	// indexing service.  Exits with status 1 on any usage or indexing error.
	static void Main (String[] args)
	{
		string uriStr = null;
		string title = null;
		string sourcefile = null;
		bool deletesourcefile = false;

		if (args.Length == 0 || Array.IndexOf (args, "--help") > -1) {
			PrintUsage ();
			Environment.Exit (1);
		}

		for (int i = 0; i < args.Length; i++) {
			// First switch: validate that options taking a value actually
			// have one (and that it is not another option).
			switch (args [i]) {
			case "--url":
			case "--title":
			case "--sourcefile":
				if (i + 1 >= args.Length ||
				    args [i + 1].StartsWith ("--")) {
					PrintUsage ();
					Environment.Exit (1);
				}
				break;
			}

			// Second switch: actually consume the option (and its value,
			// advancing i past it).
			switch (args [i]) {
			case "--url":
				uriStr = args [++i];
				break;
			case "--title":
				title = args [++i];
				break;
			case "--sourcefile":
				sourcefile = args [++i];
				break;
			case "--deletesourcefile":
				deletesourcefile = true;
				break;
			case "--help":
				PrintUsage ();
				return;
			case "--version":
				VersionFu.PrintVersion ();
				return;
			}
		}

		if (uriStr == null) {
			Logger.Log.Error ("URI not specified!\n");
			PrintUsage ();
			Environment.Exit (1);
		}

		Uri uri = new Uri (uriStr, true);
		if (uri.Scheme == Uri.UriSchemeHttps) {
			// For security/privacy reasons, we don't index any
			// SSL-encrypted pages.
			Logger.Log.Error ("Indexing secure https:// URIs is not secure!");
			Environment.Exit (1);
		}

		// We don't index file: Uris.  Silently exit.
		if (uri.IsFile)
			return;

		// We *definitely* don't index mailto: Uris.  Silently exit.
		if (uri.Scheme == Uri.UriSchemeMailto)
			return;

		Indexable indexable;
		
		indexable = new Indexable (uri);
		indexable.HitType = "WebHistory";
		indexable.MimeType = "text/html";
		indexable.Timestamp = DateTime.Now;

		if (title != null)
			indexable.AddProperty (Property.New ("dc:title", title));

		if (sourcefile != null) {
			
			if (!File.Exists (sourcefile)) {
				Logger.Log.Error ("sourcefile '{0}' does not exist!", sourcefile);
				Environment.Exit (1);
			}

			// The daemon reads the content from the file; DeleteContent
			// makes it clean the file up afterwards when requested.
			indexable.ContentUri = UriFu.PathToFileUri (sourcefile);
			indexable.DeleteContent = deletesourcefile;

		} else {
			// No source file: read the page content from stdin instead.
			Stream stdin = Console.OpenStandardInput ();
			if (stdin == null) {
				Logger.Log.Error ("No sourcefile specified, and no standard input!\n");
				PrintUsage ();
				Environment.Exit (1);
			}

			indexable.SetTextReader (new StreamReader (stdin));
		}

		IndexingServiceRequest req = new IndexingServiceRequest ();
		req.Add (indexable);

		try {
			Logger.Log.Info ("Indexing");
			Logger.Log.Debug ("SendAsync");
			req.SendAsync ();
			Logger.Log.Debug ("Close");
			req.Close ();
			Logger.Log.Debug ("Done");
		} catch (Exception e) {
			Logger.Log.Error ("Indexing failed: {0}", e);

			// Still clean up after ourselves, even if we couldn't
			// index the content.
			if (deletesourcefile)
				File.Delete (sourcefile);

			Environment.Exit (1);
		}
	}
Beispiel #14
0
			// Recursively walks one MIME part of a message: message parts
			// and multiparts recurse into their children; leaf parts either
			// feed the top-level filter directly (single text/plain or
			// text/html body) or become child indexables (attachments),
			// unless their mime type is blacklisted.
			// Bug fix: the "unknown part type" throw below used to call
			// part.GetType() while `part` was provably still null on that
			// path, raising NullReferenceException instead of the intended
			// diagnostic; it now reports mime_part's type.
			public void OnEachPart (GMime.Object mime_part)
			{
				GMime.Object part = null;
				bool part_needs_dispose = false;

				//for (int i = 0; i < this.depth; i++)
				//  Console.Write ("  ");
				//Console.WriteLine ("Content-Type: {0}", mime_part.ContentType);
			
				++depth;

				if (mime_part is GMime.MessagePart) {
					GMime.MessagePart msg_part = (GMime.MessagePart) mime_part;

					using (GMime.Message message = msg_part.Message) {
						using (GMime.Object subpart = message.MimePart)
							this.OnEachPart (subpart);
					}
				} else if (mime_part is GMime.Multipart) {
					GMime.Multipart multipart = (GMime.Multipart) mime_part;
					int num_parts = multipart.Count;

					// If the mimetype is multipart/alternative, we only want to index
					// one part -- the richest one we can filter.
					if (mime_part.ContentType.MediaSubtype.ToLower () == "alternative") {
						// The richest formats are at the end, so work from there
						// backward.
						for (int i = num_parts - 1; i >= 0; i--) {
							GMime.Object subpart = multipart[i];

							if (IsMimeTypeHandled (subpart.ContentType.ToString ())) {
								part = subpart;
								part_needs_dispose = true;
								break;
							} else {
								subpart.Dispose ();
							}
						}
					}

					// If it's not alternative, or we don't know how to filter any of
					// the parts, treat them like a bunch of attachments.
					if (part == null) {
						for (int i = 0; i < num_parts; i++) {
							using (GMime.Object subpart = multipart[i])
								this.OnEachPart (subpart);
						}
					}
				} else if (mime_part is GMime.Part)
					part = mime_part;
				else
					throw new Exception (String.Format ("Unknown part type: {0}", mime_part.GetType ()));

				if (part != null) {
					System.IO.Stream stream = null;
					
					using (GMime.DataWrapper content_obj = ((GMime.Part) part).ContentObject)
						stream = content_obj.Stream;

					// If this is the only part and it's plain text, we
					// want to just attach it to our filter instead of
					// creating a child indexable for it.
					bool no_child_needed = false;

					string mime_type = part.ContentType.ToString ().ToLower ();

					if (this.depth == 1 && this.count == 0) {
						if (mime_type == "text/plain") {
							no_child_needed = true;

							this.reader = new StreamReader (stream);
						} else if (mime_type == "text/html") {
							no_child_needed = true;
							html_part = true;
							string enc = part.ContentType.GetParameter ("charset"); 
							// DataWrapper.Stream is a very limited stream
							// and does not allow Seek or Tell
							// HtmlFilter requires Stream.Position=0.
							// Play safe and create a memorystream
							// for HTML parsing.

							GMime.StreamMem mem_stream;
							mem_stream = new GMime.StreamMem ();

							GMime.Stream data_stream;
							data_stream = ((StreamWrapper) stream).GMimeStream;
							data_stream.WriteToStream (mem_stream);
							data_stream.Flush ();

							// The StreamWrapper and hence the memory_stream
							// will be closed when the reader is closed
							// after Pull()-ing is done.
							System.IO.Stream html_stream; 
							html_stream = new StreamWrapper (mem_stream);
							html_stream.Seek (0, SeekOrigin.Begin);

							stream.Close ();

							try {
								this.reader = FilterHtml.GetHtmlReader (html_stream, enc, link_handler);
							} catch (Exception e) {
								Log.Debug (e, "Exception while filtering HTML email {0}", this.indexable.Uri);
								this.reader = null;
								html_stream.Close ();
								html_part = false;
							}
						}
					}

					if (!no_child_needed) {
						// Check the mime type against the blacklist and don't index any
						// parts that are contained within.  That way the user doesn't
						// get flooded with pointless signatures and vcard and ical
						// attachments along with (real) attachments.

						if (Array.IndexOf (blacklisted_mime_types, mime_type) == -1) {
							string sub_uri = "#" + this.count;
							Indexable child;
							child = new Indexable (UriFu.AddFragment (this.indexable.Uri, sub_uri, true));

							child.DisplayUri = new Uri (this.indexable.DisplayUri.ToString () + "#" + this.count);

							// This is a special case.
							// Even for mails found on disk, MailMessage hitype is set
							child.HitType = "MailMessage";
							child.MimeType = mime_type;

							// If this is the richest part we found for multipart emails, add its content to textcache
							if (snippet_attachment ||
							    (this.depth == 1 && this.count == 0))
								child.CacheContent = true;
							else
								child.CacheContent = false;

							string filename = ((GMime.Part) part).Filename;

							if (! String.IsNullOrEmpty (filename)) {
								child.AddProperty (Property.NewKeyword ("fixme:attachment_title", filename));

								foreach (Property prop in Property.StandardFileProperties (filename, false))
									child.AddProperty (prop);
							}

							// Store length of attachment
							long length = stream.Length;
							if (length != -1)
								child.AddProperty (Property.NewUnsearched ("fixme:filesize", length));

							if (part.ContentType.MediaType.ToLower () == "text")
								child.SetTextReader (new StreamReader (stream));
							else
								child.SetBinaryStream (stream);

							child.SetChildOf (this.indexable);
							child.StoreStream ();
							child.CloseStreams ();
							this.child_indexables.Add (child);
						} else {
							Log.Debug ("Skipping attachment {0}#{1} with blacklisted mime type {2}",
								   this.indexable.Uri, this.count, mime_type);
						}
					}

					this.count++;
				}

				if (part_needs_dispose)
					part.Dispose ();

				--depth;
			}
Beispiel #15
0
		// Produces the next child indexable from the archive, if any.
		// Returns false (and closes the archive) when the entry limit or
		// total-uncompressed-size limit is hit, or when the archive is
		// exhausted.  Returns true with child == null for entries that are
		// skipped (invalid or oversized, i.e. no temp file was extracted).
		public override bool GenerateNextIndexable (out Indexable child)
		{
			ArchiveEntry a_entry;
			child = null;

			if (! setup_done)
				SetupArchiveStream ();

			// Cap on the number of children extracted from one archive.
			if (count >= MAX_CHILDREN) {
				Log.Debug ("Archive {0} contains more than {1} files.  Only {1} files indexed.", Indexable.DisplayUri.ToString (), count);
				Close ();
				return false;
			}

			// Cap on the cumulative uncompressed size.
			if (total_size > MAX_ALL_FILES) {
				Log.Debug ("Archive {0} crossed our max uncompressed size threshold.  Only {1} files extracted", Indexable.DisplayUri.ToString (), count);
				Close ();
				return false;
			}

			a_entry = DoGetNextEntry ();
			if (a_entry == null) {
				Close ();
				return false;
			}

			// Store file names in the archive
			AppendText (Path.GetFileName (a_entry.Name));
			AppendWhiteSpace ();

			// If this is an invalid or oversized entry, skip it.
			if (a_entry.TempFile == null)
				return true;

			++count;
			total_size += a_entry.Size;

			// Add "#<escaped-path-to-entry>" to the end of the Indexable Uri
			// So, file b#c in archive foo.zip becomes file:///foo.zip#b%23c
			// And file c in archive b in archive foo.zip becomes file:///foo.zip#b#c
			child = new Indexable (UriFu.AddFragment (Indexable.Uri, a_entry.Name, false));

			child.CacheContent = true;
			child.MimeType = a_entry.MimeType;

			child.DisplayUri = new Uri (Indexable.DisplayUri.ToString () + "#" + a_entry.Name);
			child.ContentUri = UriFu.PathToFileUri (a_entry.TempFile);
			child.DeleteContent = true;

			// FIXME Remove fixme:inside_archive during Property Hack Week
			// Replace most flag properties by value properties
			child.AddProperty (Property.NewBool ("fixme:inside_archive", true));
			// Use this instead of fixme:inside_archive
			child.AddProperty (Property.NewKeyword ("archive:type", archive_type));

			child.AddProperty (Property.NewKeyword ("fixme:relativeuri", a_entry.Name));
			child.AddProperty (Property.New ("fixme:comment", a_entry.Comment));
			child.AddProperty (Property.NewUnsearched ("fixme:filesize", a_entry.Size));

			foreach (Property prop in Property.StandardFileProperties (Path.GetFileName (a_entry.Name), false))
				child.AddProperty (prop);

			child.SetChildOf (Indexable);

			return true;
		}
Beispiel #16
0
			// Builds a part handler bound to the indexable whose MIME parts
			// it will walk, plus the callback used to report links found
			// while filtering HTML parts.
			public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
			{
				this.indexable = parent_indexable;
				this.link_handler = link_handler;
			}
Beispiel #17
0
		// Parent is a top-level indexable: copy every one of its properties
		// onto this (child) indexable, prefixing the key with "parent:" so
		// matches against the child can be told apart from its own properties.
		private void CopyPropertyParentToChild (Indexable parent)
		{
			foreach (Property prop in parent.Properties) {

				Property new_prop = (Property) prop.Clone ();
				// Add parent: to property names ONLY IF
				// - not private property (these are not properties of the file content)
				// - property name does not already start with parent:
				// Keys are machine identifiers, so use ordinal comparison; the
				// culture-sensitive StartsWith overload can misclassify prefixes
				// under some locales (e.g. tr-TR).
				if (! new_prop.Key.StartsWith (Property.PrivateNamespace, StringComparison.Ordinal) &&
				    ! new_prop.Key.StartsWith ("parent:", StringComparison.Ordinal))
					new_prop.Key = "parent:" + new_prop.Key;

				this.AddProperty (new_prop);
			}
		}
Beispiel #18
0
		// FIXME: Copying the correct properties from parent to child:
		// (This is not perfect yet)
		// It does not make sense to have parent:parent:parent:...:parent:foo
		// for property names of a nested child
		// Moreover, if indexable a.mbox has child b.zip which has child c.zip,
		// then upon matching c.zip, we would like to get the information from
		// a.mbox (i.e. the toplevel indexable) only. Intermediate parent information
		// is not necessary for displaying results; in fact, storing them would cause
		// confusion during display.
		// E.g. storing parent:beagle:filename for all parents
		// would cause, parent:beagle:filename=a.mbox, parent.beagle.filename=b.zip
		// whereas we are only interested in toplevel parent:beagle:filename=a.mbox
		// For indexables which need to store the intermediate/immediate parent info
		// separately, explicitly store them.
		// Another problem is, toplevel indexable might want to store information
		// which should not be matched when searching for its child. Copying those
		// properties in all children will incorrectly match them.
		//

		// If the parent itself is a child, copy only its "parent:" and
		// private-namespace properties as-is; every other property is still
		// cloned and added but marked IsStored = false (same behavior as
		// before, with the duplicated clone-and-add collapsed into one path).
		private void CopyPropertyChildToChild (Indexable parent)
		{
			foreach (Property prop in parent.Properties) {

				Property new_prop = (Property) prop.Clone ();

				// Ordinal comparison: property keys are machine identifiers,
				// not user-visible text.
				bool keep_stored =
					new_prop.Key.StartsWith ("parent:", StringComparison.Ordinal) ||
					new_prop.Key.StartsWith (Property.PrivateNamespace, StringComparison.Ordinal);

				if (! keep_stored)
					new_prop.IsStored = false;

				this.AddProperty (new_prop);
			}
		}
Beispiel #19
0
		// Builds an indexable for a single file, or returns null when the
		// file is gone or its stored attributes show it is already indexed
		// and filtered.
		static Indexable FileToIndexable (FileInfo file)
		{
			if (! file.Exists)
				return null;

			// Skip files whose attribute store entry is still current.
			string path_in_index = PathInIndex (file.FullName);
			if (fa_store.IsUpToDateAndFiltered (path_in_index,
							    FileSystem.GetLastWriteTimeUtc (file.FullName)))
				return null;

			// Create the indexable and add the standard properties we
			// use in the FileSystemQueryable.
			Indexable indexable = new Indexable (PathToUri (file.FullName));
			indexable.Timestamp = file.LastWriteTimeUtc;
			indexable.FlushBufferCache = true;
			indexable.AddProperty (Property.NewUnsearched ("fixme:filesize", file.Length));
			FSQ.AddStandardPropertiesToIndexable (indexable, file.Name, Guid.Empty, false);

			// Store directory name in the index
			indexable.AddProperty (Property.NewUnsearched (Property.ParentDirUriPropKey, PathToUri (file.DirectoryName)));

			// Removable-media indexes carry the volume label and read the
			// content straight from the file.
			if (arg_removable) {
				indexable.AddProperty (Property.NewKeyword ("beagle:RemovableVolume", volume_label));
				indexable.ContentUri = UriFu.PathToFileUri (file.FullName);
			}

			return indexable;
		}
	// Entry point for beagle-extract-content: parses flags, then filters
	// each file argument and prints the extracted properties/text via
	// Display().  Returns 0 on success, -1 if any file fails to filter.
	static int Main (string[] args)
	{
		SystemInformation.SetProcessName ("beagle-extract-content");

		if (args.Length < 1 || Array.IndexOf (args, "--help") != -1) {
			PrintUsage ();
			return 0;
		}

		if (Array.IndexOf (args, "--debug") == -1)
			Log.Disable ();

		if (Array.IndexOf (args, "--version") != -1) {
			VersionFu.PrintVersion ();
			return 0;
		}

		if (Array.IndexOf (args, "--tokenize") != -1)
			tokenize = true;
		
		if (Array.IndexOf (args, "--analyze") != -1)
			analyze = true;
		
		if (Array.IndexOf (args, "--show-generated") != -1 || Array.IndexOf (args, "--show-children") != -1)
			show_generated = true;

		// NOTE(review): if --outfile= appears more than once, the earlier
		// writer/stream are never closed; only the last writer is closed
		// at the end.  Also, the error path below returns without closing
		// the writer.  Worth confirming whether multiple outfiles are
		// supported at all.
		StreamWriter writer = null;
		string outfile = null;
		foreach (string arg in args) {

			// mime-type option
			if (arg.StartsWith ("--mimetype=")) {
				mime_type = arg.Substring (11);    
				continue;
			// output file option
			// we need this in case the output contains different encoding
			// printing to Console might not always display properly
			} else if (arg.StartsWith ("--outfile=")) {
				outfile = arg.Substring (10);    
				Console.WriteLine ("Redirecting output to " + outfile);
				FileStream f = new FileStream (outfile, FileMode.Create);
				writer = new StreamWriter (f, System.Text.Encoding.UTF8);
				continue;
			} else if (arg.StartsWith ("--")) // option, skip it 
				continue;
			
			// Anything that is not an option is a file path to filter.
			Uri uri = UriFu.PathToFileUri (arg);
			Indexable indexable = new Indexable (uri);
			if (mime_type != null)
				indexable.MimeType = mime_type;

			try {
				// Redirect Console output into the outfile while
				// displaying, then restore stdout afterwards.
				if (writer != null) {
					Console.SetOut (writer);
				}

				Display (indexable);
				if (writer != null) {
					writer.Flush ();
				}
				
				if (outfile != null) {
					StreamWriter standardOutput = new StreamWriter(Console.OpenStandardOutput());
					standardOutput.AutoFlush = true;
					Console.SetOut(standardOutput);
				}
				
			} catch (Exception e) {
				Console.WriteLine ("Unable to filter {0}: {1}", uri, e.Message);
				return -1;
			}
			
			// Super Lame Hack: gtk-sharp up to 2.10 requires a main loop
			// to dispose of any managed wrappers around GObjects.  Since
			// we don't have one, we'll process all the pending items in
			// a loop here.  This is particularly an issue with maildirs,
			// because we need the loop to clean up after GMime.  Without
			// it, GMime's streams are never completely unref'd, the
			// file descriptors aren't closed, and we run out and crash.
			while (GLib.MainContext.Pending ())
				GLib.MainContext.Iteration ();
		}
		if (writer != null)
			writer.Close ();

		return 0;
	}
Beispiel #21
0
		// Builds an indexable for a directory, or returns null if the
		// directory is gone or its attribute-store entry is up to date (and
		// deletion support does not require re-indexing it).  Directories
		// whose stored mtime is stale are also enqueued on
		// modified_directories so the caller can scan them for deletions.
		static Indexable DirectoryToIndexable (DirectoryInfo dir, Queue modified_directories)
		{
			if (!dir.Exists)
				return null;

			// Check if the directory information is stored in attributes store
			// And if the mtime of the directory is same as that in the attributes store
			FileAttributes attr = fa_store.Read (PathInIndex (dir.FullName));

			// If the directory exists in the fa store, then it is already indexed.
			if (attr != null) {
				// If we don't care about deleted content then we are fine.
				// If the attributes are up-to-date, then we are fine too.
				if (! arg_delete || FileAttributesStore.IsUpToDate (attr, FileSystem.GetLastWriteTimeUtc (dir.FullName)))
					return null;

				// But the last write time needs to be uptodate to support enable-deletion,
				// so we actually index the directories, even if --disable-directories
				// is set.
				modified_directories.Enqueue (dir);
			}

			// Create the indexable and add the standard properties we
			// use in the FileSystemQueryable.
			Uri uri = PathToUri (dir.FullName);
			Indexable indexable = new Indexable (uri);
			indexable.MimeType = "inode/directory";
			indexable.NoContent = true;
			indexable.Timestamp = dir.LastWriteTimeUtc;

			// Store the directory information in the index anyway, but if --disable-directories
			// was passed, then do not store the names and other standard properties
			// used during searching
			if (! arg_disable_directories)
				FSQ.AddStandardPropertiesToIndexable (indexable, dir.Name, Guid.Empty, false);

			// Add directory name property
			string dirname = dir.Parent.FullName;
			indexable.AddProperty (Property.NewUnsearched (Property.ParentDirUriPropKey, PathToUri (dirname)));

			indexable.AddProperty (Property.NewBool (Property.IsDirectoryPropKey, true));

			if (arg_removable)
				indexable.AddProperty (Property.NewKeyword ("beagle:removable", volume_label));

			return indexable;
		}
	// Filters the given indexable and prints a report to Console: which
	// filter was chosen (and how long that took), the sorted properties,
	// any filter-generated child indexables (recursively displayed when
	// show_generated is set), and the extracted text — raw, or tokenized
	// through the indexing analyzer when `analyze` is set.
	static void Display (Indexable indexable)
	{
		// Separator between consecutive indexables in one run.
		if (!first_indexable) {
			Console.WriteLine ();
			Console.WriteLine ("-----------------------------------------");
			Console.WriteLine ();
		}
		first_indexable = false;

		Console.WriteLine ("Filename: " + indexable.Uri);

		if (indexable.ParentUri != null)
			Console.WriteLine ("Parent: " + indexable.ParentUri);

		Stopwatch watch = new Stopwatch ();

		Filter filter;

		// Time the filter-selection step; on failure, fall back to
		// property-only display (NoContent).
		watch.Start ();
		if (! FilterFactory.FilterIndexable (indexable, out filter)) {
			indexable.Cleanup ();
			indexable.NoContent = true;
			filter = null;
		}
		watch.Stop ();

		Console.WriteLine ("Filter: {0} (determined in {1})", filter, watch);
		Console.WriteLine ("MimeType: {0}", indexable.MimeType);
		Console.WriteLine ();

		ArrayList generated_indexables = new ArrayList ();
		Indexable generated_indexable;

		// List the child indexables the filter generated; keep them for
		// recursive display only when show_generated is set.
		bool first = true;
		if (filter != null && filter.HasGeneratedIndexable) {
			while (filter.GenerateNextIndexable (out generated_indexable)) {
				if (generated_indexable == null)
					continue;

				if (first) {
					Console.WriteLine ("Filter-generated indexables:");
					first = false;
				}
				
				Console.WriteLine ("  {0}", generated_indexable.Uri);

				if (show_generated)
					generated_indexables.Add (generated_indexable);
				else
					generated_indexable.Cleanup ();
			}
		}

		if (! first)
			Console.WriteLine ();

		// Make sure that the properties are sorted.
		ArrayList prop_array = new ArrayList (indexable.Properties);
		prop_array.Sort ();

		Console.WriteLine ("Properties:");

		if (indexable.ValidTimestamp)
			Console.WriteLine ("  Timestamp = {0}", DateTimeUtil.ToString (indexable.Timestamp));

		foreach (Beagle.Property prop in prop_array) {
			if (String.IsNullOrEmpty (prop.Value))
				continue;

			Console.WriteLine ("  {0} = {1}", prop.Key, prop.Value);
		}

		Console.WriteLine ();

		if (indexable.NoContent)
			return;

		// Time the text-extraction step.
		watch.Reset ();
		watch.Start ();

		TextReader reader;
		Analyzer indexing_analyzer = new BeagleAnalyzer ();

		char[] buffer = new char [2048];
		reader = indexable.GetTextReader ();
		// --tokenize prints one token per line instead of space-separated.
		char separater_char = (tokenize ? '\n' : ' ');
		if (reader != null) {
			first = true;

			if (analyze) {
				if (! stats_only)
					Console.WriteLine ("Content:");

				// Run the text through the same analyzer the indexer uses
				// and print the resulting terms.
				TokenStream token_stream = indexing_analyzer.TokenStream ("Text", reader);
				Lucene.Net.Analysis.Token token = token_stream.Next ();
				first = (token == null);

				if (! stats_only)
					for (; token != null; token = token_stream.Next ())
						Console.Write ("{0}{1}", token.TermText (), separater_char);

				token_stream.Close ();
			} else {
#if false
				while (true) {
					int l = reader.Read (buffer, 0, 2048);
					if (l <= 0)
						break;
					if (first)
						first = false;
					if (! stats_only)
						DisplayContent (buffer, l);
				}
#else
				string line;
				first = true;
				while ((line = reader.ReadLine ()) != null) {
					if (first) {
						Console.WriteLine ("Content:");
						first = false;
					}
					if (! stats_only)
						DisplayContent (line);
				}
#endif
			}

			reader.Close ();

			if (first)
				Console.WriteLine ("(no content)");
			else
				Console.WriteLine ('\n');
		}
			
		/*
		reader = indexable.GetHotTextReader ();
		first = true;
		if (reader != null) {
			Console.WriteLine ("HotContent:");

			if (analyze) {
				TokenStream token_stream = indexing_analyzer.TokenStream ("HotText", reader);
				Lucene.Net.Analysis.Token token = token_stream.Next ();
				first = (token == null);

				for (; token != null; token = token_stream.Next ())
					Console.Write ("{0}{1}", token.TermText (), separater_char);

				token_stream.Close ();
			} else {
				while (true) {
					int l = reader.Read (buffer, 0, 2048);
					if (l <= 0)
						break;
					if (first)
						first = false;
					DisplayContent (buffer, l);
				}
			}

			reader.Close ();

			if (first)
				Console.WriteLine ("(no hot content)");
			else
				Console.WriteLine ('\n');
		}
		*/

		watch.Stop ();

		Console.WriteLine ();
		Console.WriteLine ("Text extracted in {0}", watch);

#if ENABLE_RDF_ADAPTER
		IList<string> links = indexable.Links;
		if (links != null && links.Count != 0) {
			Console.WriteLine ("Links:");
			foreach (string link in links)
				Console.WriteLine (link);
			Console.WriteLine ();
		}
#endif

		// Recursively display any retained child indexables.
		foreach (Indexable gi in generated_indexables)
			Display (gi);

		Stream stream = indexable.GetBinaryStream ();
		if (stream != null)
			stream.Close ();

		// Clean up any temporary files associated with filtering this indexable.
		indexable.Cleanup ();
	}
 /// <summary>
 /// Queues an indexable for addition to the index.
 /// </summary>
 /// <param name="indexable">The indexable to queue.</param>
 public void Add(Indexable indexable)
 {
     // NOTE(review): presumably StoreStream() persists any in-memory
     // content stream so it remains readable when the queued indexable
     // is processed later — confirm against Indexable's implementation.
     indexable.StoreStream();
     to_add.Add(indexable);
 }
Beispiel #24
0
		// Unconditionally folds the contents of another indexable into
		// this one. No sanity checking is done: callers are responsible
		// for deciding whether the merge makes sense.
		public void Merge (Indexable other)
		{
			// Keep whichever timestamp is more recent.
			if (this.Timestamp < other.Timestamp) {
				this.Timestamp = other.Timestamp;
			}

			// Pull in every property from the other indexable.
			foreach (Property incoming in other.Properties) {
				this.AddProperty (incoming);
			}

			// Copy the other's local state over ours, overwriting
			// any entries that share a key.
			foreach (DictionaryEntry kv in other.local_state) {
				this.local_state [kv.Key] = kv.Value;
			}
		}