//////////////////////////////////////////////////////////////////////////////

//
// Dealing with documents
//

// Builds the Lucene document(s) describing an indexable.
//
// primary_doc always receives a freshly created Document holding the
// Uri, optional ParentUri, timestamp fields, text reader, the special
// beagrep:* properties and every immutable property of the indexable.
//
// secondary_doc is left null unless the indexable carries at least one
// mutable property; in that case it is created on demand through
// CreateSecondaryDocument and receives all mutable properties.
static protected void BuildDocuments (Indexable indexable,
				      out Document primary_doc,
				      out Document secondary_doc)
{
	primary_doc = new Document ();
	secondary_doc = null;

	// At query time a document is loaded with only a few selected fields
	// (such as Uri and Timestamp) to quickly decide whether it is a good
	// one.  That field-selecting load is faster when the requested fields
	// sit at the front of the document, so "Uri" and "Timestamp" are
	// added before everything else.
	primary_doc.Add (new Field ("Uri",
				    UriFu.UriToEscapedString (indexable.Uri),
				    Field.Store.YES,
				    Field.Index.NO_NORMS));

	if (indexable.ParentUri != null)
		primary_doc.Add (new Field ("ParentUri",
					    UriFu.UriToEscapedString (indexable.ParentUri),
					    Field.Store.YES,
					    Field.Index.NO_NORMS));

	if (indexable.ValidTimestamp) {
		// Wildcard date queries must also match the timestamp, so
		// every stored timestamp field is mirrored by an unstored
		// field under the date wildcard name.
		string date_wildcard = TypeToWildcardField (PropertyType.Date);

		string full_stamp = StringFu.DateTimeToString (indexable.Timestamp);
		primary_doc.Add (new Field ("Timestamp", full_stamp, Field.Store.YES, Field.Index.NO_NORMS));
		primary_doc.Add (new Field (date_wildcard, full_stamp, Field.Store.NO, Field.Index.NO_NORMS));

		// The inverted timestamp allows sorting by timestamp at
		// search time.  It is zero-padded to 19 digits so that plain
		// string comparison during term enumeration orders it
		// correctly.
		long stamp_value = Convert.ToInt64 (full_stamp);
		string inverted_stamp = (Int64.MaxValue - stamp_value).ToString ("d19");
		primary_doc.Add (new Field ("InvertedTimestamp", inverted_stamp, Field.Store.NO, Field.Index.NO_NORMS));

		string year_month = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
		primary_doc.Add (new Field ("Timestamp(YM)", year_month, Field.Store.YES, Field.Index.NO_NORMS));
		primary_doc.Add (new Field (date_wildcard + "(YM)", year_month, Field.Store.NO, Field.Index.NO_NORMS));

		string day = StringFu.DateTimeToDayString (indexable.Timestamp);
		primary_doc.Add (new Field ("Timestamp(D)", day, Field.Store.YES, Field.Index.NO_NORMS));
		primary_doc.Add (new Field (date_wildcard + "(D)", day, Field.Store.NO, Field.Index.NO_NORMS));
	}

	if (indexable.NoContent) {
		// Record the absence of content in a special property.
		AddPropertyToDocument (Property.NewBool ("beagrep:NoContent", true), primary_doc);
	} else {
		// There may be content: attach the text reader.  "Text" has
		// to be the first reader-backed field; the order matters.
		TextReader text = indexable.GetTextReader ();
		if (text != null)
			primary_doc.Add (new Field ("Text", text));

		// FIXME: HotText is ignored for now!  When it is re-enabled,
		// a "HotText" field fed by indexable.GetHotTextReader ()
		// belongs here, after "Text".
	}

	// The hit type, mime type and source go into special unsearched
	// properties, each only when it is set.
	if (indexable.HitType != null)
		AddPropertyToDocument (Property.NewUnsearched ("beagrep:HitType", indexable.HitType), primary_doc);

	if (indexable.MimeType != null)
		AddPropertyToDocument (Property.NewUnsearched ("beagrep:MimeType", indexable.MimeType), primary_doc);

	if (indexable.Source != null)
		AddPropertyToDocument (Property.NewUnsearched ("beagrep:Source", indexable.Source), primary_doc);

	// The is-child flag is always recorded.
	AddPropertyToDocument (Property.NewBool (Property.IsChildPropKey, indexable.IsChild), primary_doc);

	// Everything else: immutable properties go to the primary document,
	// mutable ones to the (lazily created) secondary document.
	foreach (Property prop in indexable.Properties) {
		if (! prop.IsMutable) {
			AddPropertyToDocument (prop, primary_doc);
			continue;
		}

		if (secondary_doc == null)
			secondary_doc = CreateSecondaryDocument (indexable.Uri, indexable.ParentUri);

		AddPropertyToDocument (prop, secondary_doc);
	}
}
// Prints a human-readable report for one indexable on the console:
// its URI, the filter chosen for it, any filter-generated indexables,
// its sorted properties and (unless it has no content) its text.
// When show_generated is set, the generated indexables are displayed
// recursively afterwards.  The indexable is cleaned up at the end.
static void Display (Indexable indexable)
{
	// Every report except the very first is preceded by a separator.
	if (! first_indexable) {
		Console.WriteLine ();
		Console.WriteLine ("-----------------------------------------");
		Console.WriteLine ();
	}
	first_indexable = false;

	Console.WriteLine ("Filename: " + indexable.Uri);
	if (indexable.ParentUri != null)
		Console.WriteLine ("Parent: " + indexable.ParentUri);

	// Pick a filter and time how long the decision takes.
	Stopwatch timer = new Stopwatch ();
	Filter filter;

	timer.Start ();
	bool filtered = FilterFactory.FilterIndexable (indexable, out filter);
	if (! filtered) {
		indexable.Cleanup ();
		indexable.NoContent = true;
		filter = null;
	}
	timer.Stop ();

	Console.WriteLine ("Filter: {0} (determined in {1})", filter, timer);
	Console.WriteLine ("MimeType: {0}", indexable.MimeType);
	Console.WriteLine ();

	// List the indexables produced by the filter itself.  They are kept
	// for later display only when show_generated is set; otherwise they
	// are cleaned up right away.
	ArrayList children = new ArrayList ();
	bool header_pending = true;

	if (filter != null && filter.HasGeneratedIndexable) {
		Indexable child;
		while (filter.GenerateNextIndexable (out child)) {
			if (child == null)
				continue;

			if (header_pending) {
				Console.WriteLine ("Filter-generated indexables:");
				header_pending = false;
			}

			Console.WriteLine (" {0}", child.Uri);

			if (show_generated)
				children.Add (child);
			else
				child.Cleanup ();
		}
	}

	if (! header_pending)
		Console.WriteLine ();

	// Properties are shown in sorted order; empty values are skipped.
	ArrayList sorted_props = new ArrayList (indexable.Properties);
	sorted_props.Sort ();

	Console.WriteLine ("Properties:");

	if (indexable.ValidTimestamp)
		Console.WriteLine (" Timestamp = {0}", DateTimeUtil.ToString (indexable.Timestamp));

	foreach (Beagle.Property prop in sorted_props) {
		if (! String.IsNullOrEmpty (prop.Value))
			Console.WriteLine (" {0} = {1}", prop.Key, prop.Value);
	}

	Console.WriteLine ();

	if (indexable.NoContent)
		return;

	// Time the text extraction.
	timer.Reset ();
	timer.Start ();

	Analyzer indexing_analyzer = new BeagleAnalyzer ();
	TextReader reader = indexable.GetTextReader ();
	char token_separator = tokenize ? '\n' : ' ';

	if (reader != null) {
		bool nothing_found = true;

		if (analyze) {
			// Run the text through the indexing analyzer and print
			// the resulting tokens.
			if (! stats_only)
				Console.WriteLine ("Content:");

			TokenStream tokens = indexing_analyzer.TokenStream ("Text", reader);
			Lucene.Net.Analysis.Token token = tokens.Next ();
			nothing_found = (token == null);

			if (! stats_only) {
				while (token != null) {
					Console.Write ("{0}{1}", token.TermText (), token_separator);
					token = tokens.Next ();
				}
			}

			tokens.Close ();
		} else {
			// Print the raw text line by line.
			for (string line = reader.ReadLine (); line != null; line = reader.ReadLine ()) {
				if (nothing_found) {
					Console.WriteLine ("Content:");
					nothing_found = false;
				}

				if (! stats_only)
					DisplayContent (line);
			}
		}

		reader.Close ();

		if (nothing_found)
			Console.WriteLine ("(no content)");
		else
			Console.WriteLine ('\n');
	}

	// Hot text is currently not displayed: the hot text reader of the
	// indexable is not consumed here.

	timer.Stop ();

	Console.WriteLine ();
	Console.WriteLine ("Text extracted in {0}", timer);

#if ENABLE_RDF_ADAPTER
	IList<string> links = indexable.Links;
	if (links != null && links.Count != 0) {
		Console.WriteLine ("Links:");
		foreach (string link in links)
			Console.WriteLine (link);
		Console.WriteLine ();
	}
#endif

	// Recurse into the filter-generated indexables that were kept.
	foreach (Indexable child in children)
		Display (child);

	Stream binary = indexable.GetBinaryStream ();
	if (binary != null)
		binary.Close ();

	// Clean up any temporary files associated with filtering this indexable.
	indexable.Cleanup ();
}
/* Selects and opens a filter for the given indexable.
 *
 * Returns false if content can't/needn't be indexed: ShouldWeFilterThis
 * rejects the indexable, the backing file is missing or has vanished,
 * no candidate filter is known, or none of the candidates could be opened.
 * If AlreadyFiltered, then we don't return a filter but return true.
 *
 * On success the chosen filter is returned through 'filter', a
 * "beagrep:FileType" property is added to the indexable and the
 * indexable's text and hot-text readers are replaced by the filter's.
 * For file-backed indexables the mime type, NoContent flag and timestamp
 * may also be filled in along the way.
 *
 * text_cache may be null; when it is non-null and the candidate filter is
 * in snippet mode, the indexable is either marked as self-cached or a
 * snippet writer is attached to the filter.
 */
static public bool FilterIndexable(Indexable indexable, TextCache text_cache, out Filter filter)
{
	filter = null;
	ICollection filters = null;

	if (indexable.Filtering == IndexableFiltering.AlreadyFiltered) {
		return(true);
	}

	if (!ShouldWeFilterThis(indexable)) {
		return(false);
	}

	string path = null;

	// First, figure out which filter we should use to deal with
	// the indexable.

	// If a specific mime type is specified, try to index as that type.
	if (indexable.MimeType != null) {
		filters = CreateFiltersFromMimeType(indexable.MimeType);
	}

	if (indexable.ContentUri.IsFile) {
		path = indexable.ContentUri.LocalPath;

		// Otherwise, set the mime type for a directory,
		// or sniff it from the file.
		if (indexable.MimeType == null) {
			if (Directory.Exists(path)) {
				indexable.MimeType = "inode/directory";
				indexable.NoContent = true;
			} else if (File.Exists(path)) {
				indexable.MimeType = XdgMime.GetMimeType(path);
			} else {
				Log.Warn("Unable to filter {0}. {1} not found.", indexable.DisplayUri, path);
				return(false);
			}
		}

		// Set the timestamp to the last write time, if it isn't
		// set by the backend.
		if (!indexable.ValidTimestamp && indexable.IsNonTransient) {
			indexable.Timestamp = FileSystem.GetLastWriteTimeUtc(path);
		}

		// Check the timestamp to make sure the file hasn't
		// disappeared from underneath us.
		if (!FileSystem.ExistsByDateTime(indexable.Timestamp)) {
			Log.Warn("Unable to filter {0}. {1} appears to have disappeared from underneath us", indexable.DisplayUri, path);
			return(false);
		}

		if (filters == null || filters.Count == 0) {
			filters = CreateFiltersFromIndexable(indexable);
		}
	}

	// We don't know how to filter this, so there is nothing else to do.
	// 'filters' is still null when no mime type was given and the content
	// is not a file, so it must be null-checked before reading Count.
	if (filters == null || filters.Count == 0) {
		if (!indexable.NoContent) {
			Logger.Log.Debug("No filter for {0} ({1}) [{2}]", indexable.DisplayUri, path, indexable.MimeType);
		}

		return(false);
	}

	// Try the candidates in order; the first one that opens wins.
	foreach (Filter candidate_filter in filters) {
		if (Debug) {
			Logger.Log.Debug("Testing filter: {0}", candidate_filter);
		}

		// Hook up the snippet writer.
		if (candidate_filter.SnippetMode && text_cache != null) {
			if (candidate_filter.OriginalIsText && indexable.IsNonTransient) {
				text_cache.MarkAsSelfCached(indexable.Uri);
			} else if (indexable.CacheContent) {
				TextWriter writer = text_cache.GetWriter(indexable.Uri);
				candidate_filter.AttachSnippetWriter(writer);
			}
		}

		// Set the indexable on the filter.
		candidate_filter.Indexable = indexable;

		// Open the filter, copy the file's properties to the indexable,
		// and hook up the TextReaders.  The source is, in order of
		// preference: the local file, the indexable's text reader,
		// the indexable's binary stream.
		bool successful_open = false;
		TextReader text_reader;
		Stream binary_stream;

		if (path != null) {
			successful_open = candidate_filter.Open(path);
		} else if ((text_reader = indexable.GetTextReader()) != null) {
			successful_open = candidate_filter.Open(text_reader);
		} else if ((binary_stream = indexable.GetBinaryStream()) != null) {
			successful_open = candidate_filter.Open(binary_stream);
		}

		if (successful_open) {
			// Set FileType
			indexable.AddProperty(Property.NewKeyword("beagrep:FileType", candidate_filter.FileType));

			indexable.SetTextReader(candidate_filter.GetTextReader());
			indexable.SetHotTextReader(candidate_filter.GetHotTextReader());

			if (Debug) {
				Logger.Log.Debug("Successfully filtered {0} with {1}", path, candidate_filter);
			}

			filter = candidate_filter;
			return(true);
		} else {
			Log.Warn("Error in filtering {0} with {1}, falling back", path, candidate_filter);
			candidate_filter.Cleanup();
		}
	}

	if (Debug) {
		Logger.Log.Debug("None of the matching filters could process the file: {0}", path);
	}

	return(false);
}
/* Selects and opens a filter for the given indexable.
 *
 * Returns false if content can't/needn't be indexed: ShouldWeFilterThis
 * rejects the indexable, the backing file is missing or has vanished,
 * no candidate filter is known, or none of the candidates could be opened.
 * If AlreadyFiltered, then we don't return a filter but return true.
 *
 * On success the chosen filter is returned through 'filter', a
 * "beagrep:FileType" property is added to the indexable and the
 * indexable's text and hot-text readers are replaced by the filter's.
 * For file-backed indexables the mime type, NoContent flag and timestamp
 * may also be filled in along the way.
 *
 * text_cache may be null; when it is non-null and the candidate filter is
 * in snippet mode, the indexable is either marked as self-cached or a
 * snippet writer is attached to the filter.
 */
static public bool FilterIndexable (Indexable indexable, TextCache text_cache, out Filter filter)
{
	filter = null;
	ICollection filters = null;

	if (indexable.Filtering == IndexableFiltering.AlreadyFiltered)
		return true;

	if (! ShouldWeFilterThis (indexable))
		return false;

	string path = null;

	// First, figure out which filter we should use to deal with
	// the indexable.

	// If a specific mime type is specified, try to index as that type.
	if (indexable.MimeType != null)
		filters = CreateFiltersFromMimeType (indexable.MimeType);

	if (indexable.ContentUri.IsFile) {
		path = indexable.ContentUri.LocalPath;

		// Otherwise, set the mime type for a directory,
		// or sniff it from the file.
		if (indexable.MimeType == null) {
			if (Directory.Exists (path)) {
				indexable.MimeType = "inode/directory";
				indexable.NoContent = true;
			} else if (File.Exists (path)) {
				indexable.MimeType = XdgMime.GetMimeType (path);
			} else {
				Log.Warn ("Unable to filter {0}. {1} not found.", indexable.DisplayUri, path);
				return false;
			}
		}

		// Set the timestamp to the last write time, if it isn't
		// set by the backend.
		if (! indexable.ValidTimestamp && indexable.IsNonTransient)
			indexable.Timestamp = FileSystem.GetLastWriteTimeUtc (path);

		// Check the timestamp to make sure the file hasn't
		// disappeared from underneath us.
		if (! FileSystem.ExistsByDateTime (indexable.Timestamp)) {
			Log.Warn ("Unable to filter {0}. {1} appears to have disappeared from underneath us", indexable.DisplayUri, path);
			return false;
		}

		if (filters == null || filters.Count == 0) {
			filters = CreateFiltersFromIndexable (indexable);
		}
	}

	// We don't know how to filter this, so there is nothing else to do.
	// 'filters' is still null when no mime type was given and the content
	// is not a file, so it must be null-checked before reading Count.
	if (filters == null || filters.Count == 0) {
		if (! indexable.NoContent)
			Logger.Log.Debug ("No filter for {0} ({1}) [{2}]", indexable.DisplayUri, path, indexable.MimeType);

		return false;
	}

	// Try the candidates in order; the first one that opens wins.
	foreach (Filter candidate_filter in filters) {
		if (Debug)
			Logger.Log.Debug ("Testing filter: {0}", candidate_filter);

		// Hook up the snippet writer.
		if (candidate_filter.SnippetMode && text_cache != null) {
			if (candidate_filter.OriginalIsText && indexable.IsNonTransient) {
				text_cache.MarkAsSelfCached (indexable.Uri);
			} else if (indexable.CacheContent) {
				TextWriter writer = text_cache.GetWriter (indexable.Uri);
				candidate_filter.AttachSnippetWriter (writer);
			}
		}

		// Set the indexable on the filter.
		candidate_filter.Indexable = indexable;

		// Open the filter, copy the file's properties to the indexable,
		// and hook up the TextReaders.  The source is, in order of
		// preference: the local file, the indexable's text reader,
		// the indexable's binary stream.
		bool successful_open = false;
		TextReader text_reader;
		Stream binary_stream;

		if (path != null)
			successful_open = candidate_filter.Open (path);
		else if ((text_reader = indexable.GetTextReader ()) != null)
			successful_open = candidate_filter.Open (text_reader);
		else if ((binary_stream = indexable.GetBinaryStream ()) != null)
			successful_open = candidate_filter.Open (binary_stream);

		if (successful_open) {
			// Set FileType
			indexable.AddProperty (Property.NewKeyword ("beagrep:FileType", candidate_filter.FileType));

			indexable.SetTextReader (candidate_filter.GetTextReader ());
			indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());

			if (Debug)
				Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);

			filter = candidate_filter;
			return true;
		} else {
			Log.Warn ("Error in filtering {0} with {1}, falling back", path, candidate_filter);
			candidate_filter.Cleanup ();
		}
	}

	if (Debug)
		Logger.Log.Debug ("None of the matching filters could process the file: {0}", path);

	return false;
}