Exemplo n.º 1
0
                //////////////////////////////////////////////////////////////////////////////

                //
                // Dealing with documents
                //

                // Builds the Lucene document(s) that describe an indexable.
                //
                // primary_doc always receives a fresh Document holding the Uri, the
                // optional ParentUri and timestamp fields, the text reader (or a
                // "beagrep:NoContent" marker), the HitType/MimeType/Source/IsChild
                // properties and every property that is not mutable.
                //
                // secondary_doc stays null unless the indexable carries at least one
                // mutable property; it is then created on demand through
                // CreateSecondaryDocument and receives all the mutable properties.
                static protected void BuildDocuments (Indexable indexable,
                                                      out Document primary_doc,
                                                      out Document secondary_doc)
                {
                        primary_doc = new Document ();
                        secondary_doc = null;

                        // At query time only a few fields (such as Uri and Timestamp)
                        // are loaded to decide quickly whether a document is a good
                        // one, and that field-restricted load is faster when the
                        // requested fields sit at the front of the document.  So
                        // "Uri" and "Timestamp" are deliberately added first.
                        primary_doc.Add (new Field ("Uri",
                                                    UriFu.UriToEscapedString (indexable.Uri),
                                                    Field.Store.YES, Field.Index.NO_NORMS));

                        if (indexable.ParentUri != null)
                                primary_doc.Add (new Field ("ParentUri",
                                                            UriFu.UriToEscapedString (indexable.ParentUri),
                                                            Field.Store.YES, Field.Index.NO_NORMS));

                        if (indexable.ValidTimestamp) {
                                // Each timestamp flavour is written twice: once under
                                // its own stored field and once, unstored, under the
                                // date wildcard field so that wildcard date queries
                                // also search the timestamp.
                                string date_wildcard = TypeToWildcardField (PropertyType.Date);

                                // Full-resolution timestamp.
                                string full_stamp = StringFu.DateTimeToString (indexable.Timestamp);
                                primary_doc.Add (new Field ("Timestamp", full_stamp,
                                                            Field.Store.YES, Field.Index.NO_NORMS));
                                primary_doc.Add (new Field (date_wildcard, full_stamp,
                                                            Field.Store.NO, Field.Index.NO_NORMS));

                                // Inverted timestamp, used to sort by timestamp at
                                // search time.  It is zero-padded to 19 digits so that
                                // plain string comparison during term enumeration
                                // orders the values correctly.
                                long stamp_value = Convert.ToInt64 (full_stamp);
                                string inverted_stamp = (Int64.MaxValue - stamp_value).ToString ("d19");
                                primary_doc.Add (new Field ("InvertedTimestamp", inverted_stamp,
                                                            Field.Store.NO, Field.Index.NO_NORMS));

                                // Year-month resolution.
                                string year_month = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
                                primary_doc.Add (new Field ("Timestamp(YM)", year_month,
                                                            Field.Store.YES, Field.Index.NO_NORMS));
                                primary_doc.Add (new Field (date_wildcard + "(YM)", year_month,
                                                            Field.Store.NO, Field.Index.NO_NORMS));

                                // Day resolution.
                                string day = StringFu.DateTimeToDayString (indexable.Timestamp);
                                primary_doc.Add (new Field ("Timestamp(D)", day,
                                                            Field.Store.YES, Field.Index.NO_NORMS));
                                primary_doc.Add (new Field (date_wildcard + "(D)", day,
                                                            Field.Store.NO, Field.Index.NO_NORMS));
                        }

                        if (! indexable.NoContent) {
                                // There may be content: hook up the text reader.
                                // "Text" has to be the first content field; the order
                                // of the content fields must be preserved.
                                TextReader text_reader = indexable.GetTextReader ();
                                if (text_reader != null)
                                        primary_doc.Add (new Field ("Text", text_reader));

                                // FIXME: HotText is ignored for now!
                                // When re-enabled, a "HotText" field built from
                                // indexable.GetHotTextReader () belongs here, after "Text".
                        } else {
                                // No content at all: record that fact in a special property.
                                AddPropertyToDocument (Property.NewBool ("beagrep:NoContent", true),
                                                       primary_doc);
                        }

                        // The hit type, mime type and source live in special,
                        // unsearched properties; each one is optional.
                        if (indexable.HitType != null)
                                AddPropertyToDocument (Property.NewUnsearched ("beagrep:HitType", indexable.HitType),
                                                       primary_doc);

                        if (indexable.MimeType != null)
                                AddPropertyToDocument (Property.NewUnsearched ("beagrep:MimeType", indexable.MimeType),
                                                       primary_doc);

                        if (indexable.Source != null)
                                AddPropertyToDocument (Property.NewUnsearched ("beagrep:Source", indexable.Source),
                                                       primary_doc);

                        // The is-child flag is always written.
                        AddPropertyToDocument (Property.NewBool (Property.IsChildPropKey, indexable.IsChild),
                                               primary_doc);

                        // Finally the indexable's own properties: immutable ones go
                        // to the primary document, mutable ones to the secondary
                        // document, which is only created once it is first needed.
                        foreach (Property prop in indexable.Properties) {
                                if (! prop.IsMutable) {
                                        AddPropertyToDocument (prop, primary_doc);
                                        continue;
                                }

                                if (secondary_doc == null)
                                        secondary_doc = CreateSecondaryDocument (indexable.Uri, indexable.ParentUri);

                                AddPropertyToDocument (prop, secondary_doc);
                        }
                }
Exemplo n.º 2
0
    // Filters a single indexable and dumps what was found to the console:
    // the filter that was chosen (and how long choosing it took), the mime
    // type, any filter-generated indexables, the sorted properties and the
    // extracted text.  Generated indexables that were kept (show_generated)
    // are displayed recursively afterwards, and the indexable is cleaned up
    // at the end.
    //
    // The flags first_indexable, show_generated, tokenize, analyze and
    // stats_only are read from outside this method (declared elsewhere).
    static void Display(Indexable indexable)
    {
        // Print a separator before every indexable except the very first one.
        if (!first_indexable) {
            Console.WriteLine ();
            Console.WriteLine ("-----------------------------------------");
            Console.WriteLine ();
        }
        first_indexable = false;

        Console.WriteLine ("Filename: " + indexable.Uri);

        if (indexable.ParentUri != null)
            Console.WriteLine ("Parent: " + indexable.ParentUri);

        Stopwatch watch = new Stopwatch ();

        Filter filter;

        // Time how long it takes to pick and open a filter.  If filtering
        // fails, the indexable is cleaned up and flagged as having no content.
        watch.Start ();
        if (! FilterFactory.FilterIndexable (indexable, out filter)) {
            indexable.Cleanup ();
            indexable.NoContent = true;
            filter = null;
        }
        watch.Stop ();

        Console.WriteLine ("Filter: {0} (determined in {1})", filter, watch);
        Console.WriteLine ("MimeType: {0}", indexable.MimeType);
        Console.WriteLine ();

        // Indexables produced by the filter itself; only collected when
        // show_generated is set, so that they can be displayed further down.
        ArrayList generated_indexables = new ArrayList ();
        Indexable generated_indexable;

        // "first" is reused several times below as a "nothing printed yet" flag.
        bool first = true;
        if (filter != null && filter.HasGeneratedIndexable) {
            while (filter.GenerateNextIndexable (out generated_indexable)) {
                if (generated_indexable == null)
                    continue;

                // Print the heading lazily, only once something is listed.
                if (first) {
                    Console.WriteLine ("Filter-generated indexables:");
                    first = false;
                }

                Console.WriteLine ("  {0}", generated_indexable.Uri);

                // Either keep it for recursive display or release it right away.
                if (show_generated)
                    generated_indexables.Add (generated_indexable);
                else
                    generated_indexable.Cleanup ();
            }
        }

        // Blank line after the list, if a list was printed at all.
        if (! first)
            Console.WriteLine ();

        // Make sure that the properties are sorted.
        // NOTE(review): Sort () without a comparer presumably relies on
        // Property being IComparable -- confirm.
        ArrayList prop_array = new ArrayList (indexable.Properties);
        prop_array.Sort ();

        Console.WriteLine ("Properties:");

        if (indexable.ValidTimestamp)
            Console.WriteLine ("  Timestamp = {0}", DateTimeUtil.ToString (indexable.Timestamp));

        // Properties with a null or empty value are not printed.
        foreach (Beagle.Property prop in prop_array) {
            if (String.IsNullOrEmpty (prop.Value))
                continue;

            Console.WriteLine ("  {0} = {1}", prop.Key, prop.Value);
        }

        Console.WriteLine ();

        // NOTE(review): this early return also skips the recursive display of
        // generated_indexables and the final indexable.Cleanup () at the end
        // of the method -- confirm that this is intended.
        if (indexable.NoContent)
            return;

        // Reuse the stopwatch to time the text extraction.
        watch.Reset ();
        watch.Start ();

        TextReader reader;
        Analyzer indexing_analyzer = new BeagleAnalyzer ();

        // buffer is only referenced by the compiled-out "#if false" branch and
        // by the commented-out hot-text code below.
        char[] buffer = new char [2048];
        reader = indexable.GetTextReader ();
        // Tokens are written one per line when tokenize is set, otherwise
        // separated by single spaces.
        char separater_char = (tokenize ? '\n' : ' ');
        if (reader != null) {
            // From here on "first" means "no content seen yet".
            first = true;

            if (analyze) {
                if (! stats_only)
                    Console.WriteLine ("Content:");

                // Run the text through the analyzer under the field name "Text".
                TokenStream token_stream = indexing_analyzer.TokenStream ("Text", reader);
                Lucene.Net.Analysis.Token token = token_stream.Next ();
                first = (token == null);

                // NOTE(review): with stats_only set, only the first token is
                // pulled from the stream; the remaining text is never consumed.
                if (! stats_only)
                    for (; token != null; token = token_stream.Next ())
                        Console.Write ("{0}{1}", token.TermText (), separater_char);

                token_stream.Close ();
            } else {
        #if false
                while (true) {
                    int l = reader.Read (buffer, 0, 2048);
                    if (l <= 0)
                        break;
                    if (first)
                        first = false;
                    if (! stats_only)
                        DisplayContent (buffer, l);
                }
        #else
                // Active branch: read the text line by line.  The "Content:"
                // heading is printed on the first line regardless of stats_only;
                // only the line contents themselves are suppressed by it.
                string line;
                first = true;
                while ((line = reader.ReadLine ()) != null) {
                    if (first) {
                        Console.WriteLine ("Content:");
                        first = false;
                    }
                    if (! stats_only)
                        DisplayContent (line);
                }
        #endif
            }

            reader.Close ();

            // Either report that nothing was extracted, or finish the content
            // with a newline character followed by the usual line terminator.
            if (first)
                Console.WriteLine ("(no content)");
            else
                Console.WriteLine ('\n');
        }

        // Display of hot text is currently disabled; the code is kept below
        // for reference.
        /*
        reader = indexable.GetHotTextReader ();
        first = true;
        if (reader != null) {
            Console.WriteLine ("HotContent:");

            if (analyze) {
                TokenStream token_stream = indexing_analyzer.TokenStream ("HotText", reader);
                Lucene.Net.Analysis.Token token = token_stream.Next ();
                first = (token == null);

                for (; token != null; token = token_stream.Next ())
                    Console.Write ("{0}{1}", token.TermText (), separater_char);

                token_stream.Close ();
            } else {
                while (true) {
                    int l = reader.Read (buffer, 0, 2048);
                    if (l <= 0)
                        break;
                    if (first)
                        first = false;
                    DisplayContent (buffer, l);
                }
            }

            reader.Close ();

            if (first)
                Console.WriteLine ("(no hot content)");
            else
                Console.WriteLine ('\n');
        }
        */

        watch.Stop ();

        Console.WriteLine ();
        Console.WriteLine ("Text extracted in {0}", watch);

        // Links are only listed when built with the RDF adapter enabled.
        #if ENABLE_RDF_ADAPTER
        IList<string> links = indexable.Links;
        if (links != null && links.Count != 0) {
            Console.WriteLine ("Links:");
            foreach (string link in links)
                Console.WriteLine (link);
            Console.WriteLine ();
        }
        #endif

        // Recurse into the generated indexables that were kept above
        // (the list is empty unless show_generated is set).
        foreach (Indexable gi in generated_indexables)
            Display (gi);

        // Fetch the binary stream, if there is one, just to close it.
        Stream stream = indexable.GetBinaryStream ();
        if (stream != null)
            stream.Close ();

        // Clean up any temporary files associated with filtering this indexable.
        indexable.Cleanup ();
    }
Exemplo n.º 3
0
        /* Picks a filter for the indexable, opens it on the indexable's content
         * and hooks the filter's text readers up to the indexable.
         *
         * Returns false if content can't/needn't be indexed.
         * If AlreadyFiltered, then we don't return a filter but return true.
         *
         * indexable:  the item to filter.  For file content its MimeType,
         *             NoContent flag and Timestamp may be filled in here; on
         *             success a "beagrep:FileType" property is added and its
         *             text readers are replaced by the filter's.
         * text_cache: may be null.  When given, filters in snippet mode either
         *             mark the item as self-cached or get a snippet writer.
         * filter:     the filter that opened successfully; null in every other
         *             case (including the AlreadyFiltered "true" case).
         */
        static public bool FilterIndexable(Indexable indexable, TextCache text_cache, out Filter filter)
        {
            filter = null;
            ICollection filters = null;

            if (indexable.Filtering == IndexableFiltering.AlreadyFiltered)
            {
                return(true);
            }

            if (!ShouldWeFilterThis(indexable))
            {
                return(false);
            }

            // Stays null for content that is not a local file.
            string path = null;

            // First, figure out which filter we should use to deal with
            // the indexable.

            // If a specific mime type is specified, try to index as that type.
            if (indexable.MimeType != null)
            {
                filters = CreateFiltersFromMimeType(indexable.MimeType);
            }

            if (indexable.ContentUri.IsFile)
            {
                path = indexable.ContentUri.LocalPath;

                // Otherwise, set the mime type for a directory,
                // or sniff it from the file.
                if (indexable.MimeType == null)
                {
                    if (Directory.Exists(path))
                    {
                        indexable.MimeType  = "inode/directory";
                        indexable.NoContent = true;
                    }
                    else if (File.Exists(path))
                    {
                        indexable.MimeType = XdgMime.GetMimeType(path);
                    }
                    else
                    {
                        Log.Warn("Unable to filter {0}.  {1} not found.", indexable.DisplayUri, path);
                        return(false);
                    }
                }

                // Set the timestamp to the last write time, if it isn't
                // set by the backend.
                if (!indexable.ValidTimestamp && indexable.IsNonTransient)
                {
                    indexable.Timestamp = FileSystem.GetLastWriteTimeUtc(path);
                }

                // Check the timestamp to make sure the file hasn't
                // disappeared from underneath us.
                if (!FileSystem.ExistsByDateTime(indexable.Timestamp))
                {
                    Log.Warn("Unable to filter {0}.  {1} appears to have disappeared from underneath us", indexable.DisplayUri, path);
                    return(false);
                }

                // Fall back to choosing filters from the indexable itself when
                // the mime type lookup gave us nothing.
                if (filters == null || filters.Count == 0)
                {
                    filters = CreateFiltersFromIndexable(indexable);
                }
            }

            // We don't know how to filter this, so there is nothing else to do.
            // "filters" is still null when no mime type was given and the content
            // is not a local file (or when the lookup itself returned null), so
            // treat null exactly like an empty collection instead of
            // dereferencing it.
            if (filters == null || filters.Count == 0)
            {
                if (!indexable.NoContent)
                {
                    Logger.Log.Debug("No filter for {0} ({1}) [{2}]", indexable.DisplayUri, path, indexable.MimeType);
                }

                return(false);
            }

            // Try the candidates in order; the first one that opens wins.
            foreach (Filter candidate_filter in filters)
            {
                if (Debug)
                {
                    Logger.Log.Debug("Testing filter: {0}", candidate_filter);
                }

                // Hook up the snippet writer.
                if (candidate_filter.SnippetMode && text_cache != null)
                {
                    if (candidate_filter.OriginalIsText && indexable.IsNonTransient)
                    {
                        text_cache.MarkAsSelfCached(indexable.Uri);
                    }
                    else if (indexable.CacheContent)
                    {
                        TextWriter writer = text_cache.GetWriter(indexable.Uri);
                        candidate_filter.AttachSnippetWriter(writer);
                    }
                }

                // Set the indexable on the filter.
                candidate_filter.Indexable = indexable;

                // Open the filter, copy the file's properties to the indexable,
                // and hook up the TextReaders.

                bool       successful_open = false;
                TextReader text_reader;
                Stream     binary_stream;

                // Source preference: local file path, then the indexable's text
                // reader, then its binary stream.  With none of them available
                // the open counts as failed.
                if (path != null)
                {
                    successful_open = candidate_filter.Open(path);
                }
                else if ((text_reader = indexable.GetTextReader()) != null)
                {
                    successful_open = candidate_filter.Open(text_reader);
                }
                else if ((binary_stream = indexable.GetBinaryStream()) != null)
                {
                    successful_open = candidate_filter.Open(binary_stream);
                }

                if (successful_open)
                {
                    // Set FileType
                    indexable.AddProperty(Property.NewKeyword("beagrep:FileType", candidate_filter.FileType));

                    // From now on the indexable's text comes from the filter.
                    indexable.SetTextReader(candidate_filter.GetTextReader());
                    indexable.SetHotTextReader(candidate_filter.GetHotTextReader());

                    if (Debug)
                    {
                        Logger.Log.Debug("Successfully filtered {0} with {1}", path, candidate_filter);
                    }

                    filter = candidate_filter;
                    return(true);
                }
                else
                {
                    // Release this candidate and fall through to the next one.
                    Log.Warn("Error in filtering {0} with {1}, falling back", path, candidate_filter);
                    candidate_filter.Cleanup();
                }
            }

            if (Debug)
            {
                Logger.Log.Debug("None of the matching filters could process the file: {0}", path);
            }

            return(false);
        }
Exemplo n.º 4
0
		/* Picks a filter for the indexable, opens it on the indexable's content
		 * and hooks the filter's text readers up to the indexable.
		 *
		 * Returns false if content can't/needn't be indexed.
		 * If AlreadyFiltered, then we don't return a filter but return true.
		 *
		 * indexable:  the item to filter.  For file content its MimeType,
		 *             NoContent flag and Timestamp may be filled in here; on
		 *             success a "beagrep:FileType" property is added and its
		 *             text readers are replaced by the filter's.
		 * text_cache: may be null.  When given, filters in snippet mode either
		 *             mark the item as self-cached or get a snippet writer.
		 * filter:     the filter that opened successfully; null in every other
		 *             case (including the AlreadyFiltered "true" case).
		 */
		static public bool FilterIndexable (Indexable indexable, TextCache text_cache, out Filter filter)
		{
			filter = null;
			ICollection filters = null;

			if (indexable.Filtering == IndexableFiltering.AlreadyFiltered)
				return true;

			if (! ShouldWeFilterThis (indexable))
				return false;

			// Stays null for content that is not a local file.
			string path = null;

			// First, figure out which filter we should use to deal with
			// the indexable.

			// If a specific mime type is specified, try to index as that type.
			if (indexable.MimeType != null)
				filters = CreateFiltersFromMimeType (indexable.MimeType);

			if (indexable.ContentUri.IsFile) {
				path = indexable.ContentUri.LocalPath;

				// Otherwise, set the mime type for a directory,
				// or sniff it from the file.
				if (indexable.MimeType == null) {
					if (Directory.Exists (path)) {
						indexable.MimeType = "inode/directory";
						indexable.NoContent = true;
					} else if (File.Exists (path)) {
						indexable.MimeType = XdgMime.GetMimeType (path);
					} else {
						Log.Warn ("Unable to filter {0}.  {1} not found.", indexable.DisplayUri, path);
						return false;
					}
				}

				// Set the timestamp to the last write time, if it isn't
				// set by the backend.
				if (! indexable.ValidTimestamp && indexable.IsNonTransient)
					indexable.Timestamp = FileSystem.GetLastWriteTimeUtc (path);

				// Check the timestamp to make sure the file hasn't
				// disappeared from underneath us.
				if (! FileSystem.ExistsByDateTime (indexable.Timestamp)) {
					Log.Warn ("Unable to filter {0}.  {1} appears to have disappeared from underneath us", indexable.DisplayUri, path);
					return false;
				}

				// Fall back to choosing filters from the indexable itself when
				// the mime type lookup gave us nothing.
				if (filters == null || filters.Count == 0) {
					filters = CreateFiltersFromIndexable (indexable);
				}
			}

			// We don't know how to filter this, so there is nothing else to do.
			// "filters" is still null when no mime type was given and the content
			// is not a local file (or when the lookup itself returned null), so
			// treat null exactly like an empty collection instead of
			// dereferencing it.
			if (filters == null || filters.Count == 0) {
				if (! indexable.NoContent)
					Logger.Log.Debug ("No filter for {0} ({1}) [{2}]", indexable.DisplayUri, path, indexable.MimeType);

				return false;
			}

			// Try the candidates in order; the first one that opens wins.
			foreach (Filter candidate_filter in filters) {
				if (Debug)
					Logger.Log.Debug ("Testing filter: {0}", candidate_filter);

				// Hook up the snippet writer.
				if (candidate_filter.SnippetMode && text_cache != null) {
					if (candidate_filter.OriginalIsText && indexable.IsNonTransient) {
						text_cache.MarkAsSelfCached (indexable.Uri);
					} else if (indexable.CacheContent) {
						TextWriter writer = text_cache.GetWriter (indexable.Uri);
						candidate_filter.AttachSnippetWriter (writer);
					}
				}

				// Set the indexable on the filter.
				candidate_filter.Indexable = indexable;

				// Open the filter, copy the file's properties to the indexable,
				// and hook up the TextReaders.

				bool successful_open = false;
				TextReader text_reader;
				Stream binary_stream;

				// Source preference: local file path, then the indexable's text
				// reader, then its binary stream.  With none of them available
				// the open counts as failed.
				if (path != null)
					successful_open = candidate_filter.Open (path);
				else if ((text_reader = indexable.GetTextReader ()) != null)
					successful_open = candidate_filter.Open (text_reader);
				else if ((binary_stream = indexable.GetBinaryStream ()) != null)
					successful_open = candidate_filter.Open (binary_stream);

				if (successful_open) {
					// Set FileType
					indexable.AddProperty (Property.NewKeyword ("beagrep:FileType", candidate_filter.FileType));

					// From now on the indexable's text comes from the filter.
					indexable.SetTextReader (candidate_filter.GetTextReader ());
					indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());

					if (Debug)
						Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);

					filter = candidate_filter;
					return true;
				} else {
					// Release this candidate and fall through to the next one.
					Log.Warn ("Error in filtering {0} with {1}, falling back", path, candidate_filter);
					candidate_filter.Cleanup ();
				}
			}

			if (Debug)
				Logger.Log.Debug ("None of the matching filters could process the file: {0}", path);

			return false;
		}