public StaticQueryable (string index_name, string index_path, bool read_only_mode) : base (index_path, read_only_mode)
{
	Logger.Log.Debug ("Initializing static queryable: {0}", index_path);

	if (Directory.Exists (Path.Combine (index_path, "TextCache"))) {
		try {
			text_cache = new TextCache (index_path, true);
		} catch (UnauthorizedAccessException) {
			Logger.Log.Warn ("Unable to purge static queryable text cache in {0}. Will run without it.", index_path);
		}
	}
}
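// Illustrative sketch (assumed caller, not part of this file): a static
// queryable is constructed over a prebuilt on-disk index directory, and
// read_only_mode = true keeps the underlying Lucene index unmodified.
// The index name and path below are hypothetical.
//
//     StaticQueryable queryable =
//             new StaticQueryable ("manpages", "/var/cache/beagle/manpages-index", true);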
private IEnumerable GetTextLinks (Uri uri, TextCache text_cache)
{
	if (text_cache == null)
		yield break;

	IList<string> links = text_cache.GetLinks (uri);
	if (links == null)
		yield break;

	foreach (string link in links)
		yield return Property.NewKeyword ("TextLinks", link);
}
private ICollection DoLowLevelRDFQuery (Query query, PropertyType pred_type, string predicate, string field_value, TextCache text_cache)
{
	Stopwatch total, a, b, c, d, e, f;

	total = new Stopwatch ();
	a = new Stopwatch ();
	b = new Stopwatch ();
	c = new Stopwatch ();
	d = new Stopwatch ();
	e = new Stopwatch ();
	f = new Stopwatch ();

	total.Start ();
	a.Start ();

	// Assemble all of the parts into a bunch of Lucene queries

	ArrayList primary_required_part_queries;
	ArrayList secondary_required_part_queries;

	LNS.BooleanQuery primary_prohibited_part_query;
	LNS.BooleanQuery secondary_prohibited_part_query;

	AndHitFilter all_hit_filters;

	ArrayList term_list;

	term_list = AssembleQuery (query,
			null,
			null,
			out primary_required_part_queries,
			out secondary_required_part_queries,
			out primary_prohibited_part_query,
			out secondary_prohibited_part_query,
			out all_hit_filters);

	a.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

	// If we have no required parts, give up.
	if (primary_required_part_queries == null)
		return null;

	b.Start ();

	//
	// Now that we have all of these nice queries, let's execute them!
	//

	// Create the searchers that we will need.

	IndexReader primary_reader;
	LNS.IndexSearcher primary_searcher;
	IndexReader secondary_reader;
	LNS.IndexSearcher secondary_searcher;

	if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
		return null;

	b.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

	// Build whitelists and blacklists for search subsets.
	c.Start ();

	// Possibly create our whitelists from the search subset.
	LuceneBitArray primary_whitelist, secondary_whitelist;
	CreateQueryWhitelists (null,
			primary_searcher,
			secondary_searcher,
			primary_prohibited_part_query,
			secondary_prohibited_part_query,
			out primary_whitelist,
			out secondary_whitelist);

	c.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

	// Now run the low level queries against our indexes.
	d.Start ();

	BetterBitArray primary_matches = null;

	if (primary_required_part_queries != null) {
		if (secondary_searcher != null)
			primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
					secondary_searcher,
					primary_required_part_queries,
					secondary_required_part_queries,
					primary_whitelist,
					secondary_whitelist);
		else
			primary_matches = DoRequiredQueries (primary_searcher,
					primary_required_part_queries,
					primary_whitelist);
	}

	d.Stop ();
	if (Debug)
		Logger.Log.Debug ("###### {0}: Low-level queries finished in {1} and returned {2} matches", IndexName, d, primary_matches.TrueCount);

	e.Start ();

	int count = 0;
	Document doc;

	ArrayList hits = new ArrayList (primary_matches.TrueCount);

	TermDocs secondary_term_docs = null;
	if (secondary_searcher != null)
		secondary_term_docs = secondary_searcher.Reader.TermDocs ();

	FieldSelector fields = null;
	if (predicate != null)
		fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", PropertyToFieldName (pred_type, predicate) });

	for (int match_index = primary_matches.GetNextTrueIndex (0);
	     match_index < primary_matches.Count;
	     match_index = primary_matches.GetNextTrueIndex (++ match_index)) {

		count++;

		// If we have a HitFilter, apply it.
		// RDF FIXME: Ignore Hit Filter for now

		// If predicate was not specified but object was specified,
		// then figure out the right predicate
		if (predicate == null && field_value != null) {
			Hit hit = new Hit ();

			doc = primary_searcher.Doc (match_index);
			hit.Uri = GetUriFromDocument (doc);
			hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

			bool found_matching_predicate = false;

			foreach (Field field in doc.Fields ()) {
				if (! FieldIsPredicate (field, field_value))
					continue;

				Property prop = new Property ();
				prop.Type = pred_type;
				prop.Key = predicate;
				prop.Value = field_value;
				hit.AddProperty (prop);

				found_matching_predicate = true;
			}

			// Now get the matching predicate from the secondary index
			if (secondary_searcher == null) {
				doc = null;
			} else {
				Term term = new Term ("Uri", doc.Get ("Uri"));
				secondary_term_docs.Seek (term);
				if (secondary_term_docs.Next ())
					doc = secondary_searcher.Doc (secondary_term_docs.Doc ());
			}

			if (doc != null) {
				foreach (Field field in doc.Fields ()) {
					if (! FieldIsPredicate (field, field_value))
						continue;

					Property prop = new Property ();
					prop.Type = pred_type;
					prop.Key = predicate;
					prop.Value = field_value;
					hit.AddProperty (prop);

					found_matching_predicate = true;
				}
			}

			if (! found_matching_predicate) {
				// No matching predicate found.
				// This means some unstored field matched the query.
				// FIXME: Add a synthetic property #text
				hit.AddProperty (Property.New ("#text", field_value));
			}

			hits.Add (hit);
		} else if (predicate == "TextLinks") {
			// Special treatment: TextLinks is not stored but can be queried
			doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
			Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);

			if (field_value != null) {
				hit.AddProperty (Property.New ("TextLinks", field_value));
			} else {
				foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
					hit.AddProperty (text_link_property);
			}

			hits.Add (hit);
		} else {
			doc = primary_searcher.Doc (match_index, fields);
			Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);

			foreach (Property prop in hit.Properties) {
				if (prop.Key == predicate)
					prop.Value = field_value;
			}

			hits.Add (hit);
		}
	}

	e.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

	//
	// Finally, we clean up after ourselves.
	//

	f.Start ();
	CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);
	f.Stop ();
	if (Debug)
		Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

	total.Stop ();

	if (Debug) {
		Log.Debug ("###### {0}: Query time breakdown:", IndexName);
		Log.Debug ("###### {0}:   Build queries  {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Got readers    {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Whitelists     {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Queries        {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Gen'd Results  {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
		Log.Debug ("###### {0}:   TOTAL          {1,6}", IndexName, total);

		Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
	}

	return hits;
}
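// A minimal sketch of the match-iteration idiom used above, assuming only
// the BetterBitArray behavior visible in this method: GetNextTrueIndex
// returns the index of the first set bit at or after its argument, so the
// loop visits each matching Lucene document number exactly once.
//
//     for (int i = matches.GetNextTrueIndex (0);
//          i < matches.Count;
//          i = matches.GetNextTrueIndex (++ i)) {
//             // i is the document number of a primary-index match
//     }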
///////// RDF fu ///////////////////////////////////////////////

// Returns a collection of Uris
// HitFilter and UriFilter are ignored for now.
// They will come into play in the final FetchDocument part.
// FIXME: Should RDFQuery do any query mapping using backend_query_part_hook?
// I think it should not. QueryPart hooks are for human beings; RDF is for software.
public ICollection DoRDFQuery (Query _query, TextCache text_cache)
{
	RDFQuery query = (RDFQuery) _query;

	string subject, predicate, _object;
	PropertyType pred_type;

	subject = query.SubjectString;
	predicate = query.Predicate;
	pred_type = query.PredicateType;
	_object = query.Object;

	if (Debug)
		Logger.Log.Debug ("###### {0}: Starting low-level queries '{1}' : '{4}:{2}' = '{3}'", IndexName, subject, predicate, _object, pred_type);

	// ******** 8 cases **********

	// Return all uris
	if (subject == String.Empty && predicate == String.Empty && _object == String.Empty) {
		ICollection hits = GetAllHitsByUri ().Values;
		foreach (Hit hit in hits)
			foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
				hit.AddProperty (text_link_property);
		return hits;
	}

	// Normal query
	if (subject == String.Empty && predicate == String.Empty && _object != String.Empty) {
		QueryPart_Text part = new QueryPart_Text ();
		part.Text = _object;
		part.SearchFullText = false; // We only search properties in RDF query

		query.AddPart (part);
		return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
	}

	// Return uris for all documents with this property
	if (subject == String.Empty && predicate != String.Empty && _object == String.Empty) {
		string field_name = PropertyToFieldName (pred_type, predicate);

		QueryPart_Property part = new QueryPart_Property ();
		part.Type = PropertyType.Internal;
		part.Key = "Properties";
		part.Value = field_name;

		query.AddPart (part);
		return DoLowLevelRDFQuery (query, pred_type, predicate, null, text_cache);
	}

	// Property query
	if (subject == String.Empty && predicate != String.Empty && _object != String.Empty) {
		QueryPart_Property part = new QueryPart_Property ();
		part.Type = pred_type;
		part.Key = predicate;
		part.Value = _object;

		query.AddPart (part);
		return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
	}

	// Return if the URI exists
	if (subject != String.Empty && predicate == String.Empty && _object == String.Empty) {
		QueryPart_Uri part = new QueryPart_Uri ();
		part.Uri = new Uri (subject, true); // better be URI!

		query.AddPart (part);
		// FIXME: Which properties to return in the hit? All or none?
		return DoLowLevelRDFQuery (query, pred_type, predicate, null, text_cache);
	}

	// Normal query in the document with this URI
	if (subject != String.Empty && predicate == String.Empty && _object != String.Empty) {
		QueryPart_Uri uri_part = new QueryPart_Uri ();
		uri_part.Uri = new Uri (subject, true); // better be URI!
		query.AddPart (uri_part);

		QueryPart_Text part = new QueryPart_Text ();
		part.Text = _object;
		part.SearchFullText = false; // We only search properties in RDF query

		query.AddPart (part);
		return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
	}

	// Return URI if the document with this URI contains this property
	if (subject != String.Empty && predicate != String.Empty && _object == String.Empty) {
		ArrayList returned_uris = new ArrayList (1);

		ArrayList uri_list = new ArrayList (1);
		uri_list.Add (new Uri (subject, true));

		string field_name = PropertyToFieldName (pred_type, predicate);
		FieldSelector fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });

		ICollection hits = GetHitsForUris (uri_list, fields);
		if (predicate == "TextLinks") {
			foreach (Hit hit in hits)
				foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
					hit.AddProperty (text_link_property);
		}

		return hits;
	}

	// Property query in the document with this URI
	if (subject != String.Empty && predicate != String.Empty && _object != String.Empty) {
		QueryPart_Uri uri_part = new QueryPart_Uri ();
		uri_part.Uri = new Uri (subject, true); // better be URI!
		query.AddPart (uri_part);

		QueryPart_Property part = new QueryPart_Property ();
		part.Type = pred_type;
		part.Key = predicate;
		part.Value = _object;

		query.AddPart (part);
		return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
	}

	throw new Exception ("Never reached");
}
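// Summary of the eight (subject, predicate, object) cases dispatched above,
// where "-" marks an empty component:
//
//     s  p  o    behavior
//     -  -  -    return every hit in the index
//     -  -  o    text query over properties for 'o'
//     -  p  -    return hits carrying property 'p'
//     -  p  o    property query 'p' = 'o'
//     s  -  -    return the hit for URI 's', if it exists
//     s  -  o    text query for 'o' restricted to document 's'
//     s  p  -    return document 's' with the values of property 'p'
//     s  p  o    property query 'p' = 'o' restricted to document 's'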
/* Returns false if content can't/needn't be indexed.
 * If AlreadyFiltered, then we don't return a filter but return true.
 */
static public bool FilterIndexable (Indexable indexable, TextCache text_cache, out Filter filter)
{
	filter = null;
	ICollection filters = null;

	if (indexable.Filtering == IndexableFiltering.AlreadyFiltered)
		return true;

	if (! ShouldWeFilterThis (indexable))
		return false;

	string path = null;

	// First, figure out which filter we should use to deal with
	// the indexable.

	// If a specific mime type is specified, try to index as that type.
	if (indexable.MimeType != null)
		filters = CreateFiltersFromMimeType (indexable.MimeType);

	if (indexable.ContentUri.IsFile) {
		path = indexable.ContentUri.LocalPath;

		// Otherwise, set the mime type for a directory,
		// or sniff it from the file.
		if (indexable.MimeType == null) {
			if (Directory.Exists (path)) {
				indexable.MimeType = "inode/directory";
				indexable.NoContent = true;
			} else if (File.Exists (path)) {
				indexable.MimeType = XdgMime.GetMimeType (path);
			} else {
				Log.Warn ("Unable to filter {0}. {1} not found.", indexable.DisplayUri, path);
				return false;
			}
		}

		// Set the timestamp to the last write time, if it isn't
		// set by the backend.
		if (! indexable.ValidTimestamp && indexable.IsNonTransient)
			indexable.Timestamp = FileSystem.GetLastWriteTimeUtc (path);

		// Check the timestamp to make sure the file hasn't
		// disappeared from underneath us.
		if (! FileSystem.ExistsByDateTime (indexable.Timestamp)) {
			Log.Warn ("Unable to filter {0}. {1} appears to have disappeared from underneath us", indexable.DisplayUri, path);
			return false;
		}

		if (filters == null || filters.Count == 0)
			filters = CreateFiltersFromIndexable (indexable);
	}

	// We don't know how to filter this, so there is nothing else to do.
	// (filters can still be null here if the indexable is not a file and
	// no mime type was given, so guard against that as well.)
	if (filters == null || filters.Count == 0) {
		if (! indexable.NoContent)
			Logger.Log.Debug ("No filter for {0} ({1}) [{2}]", indexable.DisplayUri, path, indexable.MimeType);
		return false;
	}

	foreach (Filter candidate_filter in filters) {
		if (Debug)
			Logger.Log.Debug ("Testing filter: {0}", candidate_filter);

		// Hook up the snippet writer.
		if (candidate_filter.SnippetMode && text_cache != null) {
			if (candidate_filter.OriginalIsText && indexable.IsNonTransient) {
				text_cache.MarkAsSelfCached (indexable.Uri);
			} else if (indexable.CacheContent) {
				TextWriter writer = text_cache.GetWriter (indexable.Uri);
				candidate_filter.AttachSnippetWriter (writer);
			}
		}

		// Set the indexable on the filter.
		candidate_filter.Indexable = indexable;

		// Open the filter, copy the file's properties to the indexable,
		// and hook up the TextReaders.
		bool successful_open = false;
		TextReader text_reader;
		Stream binary_stream;

		if (path != null)
			successful_open = candidate_filter.Open (path);
		else if ((text_reader = indexable.GetTextReader ()) != null)
			successful_open = candidate_filter.Open (text_reader);
		else if ((binary_stream = indexable.GetBinaryStream ()) != null)
			successful_open = candidate_filter.Open (binary_stream);

		if (successful_open) {
			// Set FileType
			indexable.AddProperty (Property.NewKeyword ("beagle:FileType", candidate_filter.FileType));

			indexable.SetTextReader (candidate_filter.GetTextReader ());
			indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());
#if ENABLE_RDF_ADAPTER
			indexable.Links = candidate_filter.Links;
#endif

			if (Debug)
				Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);

			filter = candidate_filter;
			return true;
		} else {
			Log.Warn ("Error in filtering {0} with {1}, falling back", path, candidate_filter);
			candidate_filter.Cleanup ();
		}
	}

	if (Debug)
		Logger.Log.Debug ("None of the matching filters could process the file: {0}", path);

	return false;
}
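// Typical call pattern (an illustrative sketch; the indexing loop that
// drives this method is assumed, not shown in this file): on success the
// caller reads the filtered content through the indexable's readers, and
// the returned filter is presumed to need a Cleanup () call once this
// indexable has been processed.
//
//     Filter filter;
//     if (FilterIndexable (indexable, text_cache, out filter)) {
//             TextReader reader = indexable.GetTextReader ();
//             // ... index the filtered content ...
//             // filter.Cleanup (); // assumed caller responsibility
//     }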