/// <summary>
/// This constructor is used while retrieving the hit from the dump
/// </summary>
/// <param name="ixr">The dump indexer this Wiki topic belongs to</param>
/// <param name="hit">The Lucene Hit object</param>
public PageInfo(Indexer ixr, Hit hit)
{
    indexer = ixr;

    Document doc = hit.GetDocument();

    TopicId = Convert.ToInt64(doc.GetField("topicid").StringValue());
    Name = doc.GetField("title").StringValue();

    Beginnings = new long[doc.GetFields("beginning").Length];
    Ends = new long[doc.GetFields("end").Length];

    int i = 0;
    foreach (byte[] binVal in doc.GetBinaryValues("beginning"))
    {
        Beginnings[i] = BitConverter.ToInt64(binVal, 0);
        i++;
    }

    i = 0;
    foreach (byte[] binVal in doc.GetBinaryValues("end"))
    {
        Ends[i] = BitConverter.ToInt64(binVal, 0);
        i++;
    }

    Array.Sort(Beginnings);
    Array.Sort(Ends);
}
public List<IndexPageResult> GetDocumentPagesWithQuery(string query)
{
    List<IndexPageResult> results = new List<IndexPageResult>();
    Dictionary<string, IndexPageResult> fingerprints_already_seen = new Dictionary<string, IndexPageResult>();

    try
    {
        using (IndexReader index_reader = IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
        {
            using (IndexSearcher index_searcher = new IndexSearcher(index_reader))
            {
                QueryParser query_parser = new QueryParser(Version.LUCENE_29, "content", analyzer);
                Lucene.Net.Search.Query query_object = query_parser.Parse(query);
                Lucene.Net.Search.Hits hits = index_searcher.Search(query_object);

                var i = hits.Iterator();
                while (i.MoveNext())
                {
                    Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                    string fingerprint = hit.Get("fingerprint");
                    int page = Convert.ToInt32(hit.Get("page"));
                    double score = hit.GetScore();

                    // If this is the first time we have seen this fingerprint, make the top-level record
                    if (!fingerprints_already_seen.ContainsKey(fingerprint))
                    {
                        IndexPageResult result = new IndexPageResult();
                        result.fingerprint = fingerprint;
                        result.score = score;

                        // Add to our structures
                        results.Add(result);
                        fingerprints_already_seen[fingerprint] = result;
                    }

                    // And add the page record
                    {
                        IndexPageResult result = fingerprints_already_seen[fingerprint];
                        result.page_results.Add(new PageResult { page = page, score = score });
                    }
                }

                // Close the index
                index_searcher.Close();
            }

            index_reader.Close();
        }
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, $"GetDocumentPagesWithQuery: There was a problem opening the index file for searching (path: '{LIBRARY_INDEX_BASE_PATH}', query: '{query}')");
    }

    return results;
}
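// A minimal usage sketch, not from the original source: it assumes the method
// above lives on an instance named "index", and relies only on the
// IndexPageResult/PageResult members actually used above. The query string
// is an arbitrary example.
List<IndexPageResult> results = index.GetDocumentPagesWithQuery("information retrieval");
foreach (IndexPageResult result in results)
{
    Console.WriteLine("Document {0} (score {1:0.000})", result.fingerprint, result.score);
    foreach (PageResult page_result in result.page_results)
    {
        Console.WriteLine("  page {0}: score {1:0.000}", page_result.page, page_result.score);
    }
}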
public HashSet<string> GetDocumentsWithWord(string keyword)
{
    HashSet<string> fingerprints = new HashSet<string>();

    try
    {
        keyword = ReasonableWord.MakeReasonableWord(keyword);
        if (null != keyword)
        {
            ////Do a quick check for whether there are actually any segments files, otherwise we throw many exceptions in the IndexReader.Open in a very tight loop.
            ////Added by Nik to cope with some exception...will uncomment this when I know what the problem is...
            //var segments_files = Directory.GetFiles(LIBRARY_INDEX_BASE_PATH, "segments*", SearchOption.AllDirectories);
            //if (segments_files.Length <= 0)
            //{
            //    Logging.Debug("No index segments files found");
            //    return fingerprints;
            //}

            using (IndexReader index_reader = IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
            {
                using (IndexSearcher index_searcher = new IndexSearcher(index_reader))
                {
                    // An exact term query: the keyword is matched verbatim against the "content" field.
                    Lucene.Net.Search.TermQuery term_query = new Lucene.Net.Search.TermQuery(new Term("content", keyword));
                    Lucene.Net.Search.Hits hits = index_searcher.Search(term_query);

                    var i = hits.Iterator();
                    while (i.MoveNext())
                    {
                        Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                        string fingerprint = hit.Get("fingerprint");
                        fingerprints.Add(fingerprint);
                    }

                    // Close the index
                    index_searcher.Close();
                }

                index_reader.Close();
            }
        }
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, $"GetDocumentsWithWord: There was a problem opening the index file for searching (path: '{LIBRARY_INDEX_BASE_PATH}', keyword: '{keyword}')");
    }

    return fingerprints;
}
/// <summary>
/// Understands the Lucene query syntax.
/// </summary>
public List<Utilities.Language.TextIndexing.IndexResult> GetDocumentsWithQuery(string query)
{
    List<Utilities.Language.TextIndexing.IndexResult> fingerprints = new List<Utilities.Language.TextIndexing.IndexResult>();
    HashSet<string> fingerprints_already_seen = new HashSet<string>();

    try
    {
        using (Lucene.Net.Index.IndexReader index_reader = Lucene.Net.Index.IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
        {
            using (Lucene.Net.Search.IndexSearcher index_searcher = new Lucene.Net.Search.IndexSearcher(index_reader))
            {
                Lucene.Net.QueryParsers.QueryParser query_parser = new Lucene.Net.QueryParsers.QueryParser(Version.LUCENE_29, "content", analyzer);
                Lucene.Net.Search.Query query_object = query_parser.Parse(query);
                Lucene.Net.Search.Hits hits = index_searcher.Search(query_object);

                var i = hits.Iterator();
                while (i.MoveNext())
                {
                    Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                    string fingerprint = hit.Get("fingerprint");
                    string page = hit.Get("page"); // currently unused: only one record per fingerprint is kept

                    if (!fingerprints_already_seen.Contains(fingerprint))
                    {
                        fingerprints_already_seen.Add(fingerprint);

                        IndexResult index_result = new IndexResult { fingerprint = fingerprint, score = hit.GetScore() };
                        fingerprints.Add(index_result);
                    }
                }

                // Close the index
                index_searcher.Close();
            }

            index_reader.Close();
        }
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, "GetDocumentsWithQuery: There was a problem opening the index file for searching.");
    }

    return fingerprints;
}
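// Because the query string goes through QueryParser, callers get the standard
// Lucene query syntax against the default "content" field. Illustrative calls
// only (the "index" instance and query strings are examples, not from the
// original source):
var r1 = index.GetDocumentsWithQuery("information retrieval");     // terms OR'd together by default
var r2 = index.GetDocumentsWithQuery("\"information retrieval\""); // exact phrase
var r3 = index.GetDocumentsWithQuery("retrieval AND NOT ranking"); // boolean operators
var r4 = index.GetDocumentsWithQuery("retriev*");                  // prefix wildcard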
static protected Hit DocumentToHit (Document doc)
{
    Hit hit;
    hit = new Hit ();

    hit.Uri = GetUriFromDocument (doc);

    string str;
    str = doc.Get ("ParentUri");
    if (str != null)
        hit.ParentUri = UriFu.EscapedStringToUri (str);

    hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

    AddPropertiesToHit (hit, doc, true);

    return hit;
}
public bool HitFilter (Hit hit)
{
    // First, check the Timestamp
    if (Key == QueryPart_DateRange.AllPropertiesKey
        || Key == QueryPart_DateRange.TimestampKey) {
        DateTime dt;
        dt = hit.Timestamp;
        if (StartDate <= dt && dt <= EndDate)
            return true;
        if (Key == QueryPart_DateRange.TimestampKey)
            return false;
    }

    if (Key == QueryPart_DateRange.AllPropertiesKey) {
        // Walk through all of the properties, and see if any
        // date properties fall inside the range.
        foreach (Property prop in hit.Properties) {
            if (prop.Type == PropertyType.Date) {
                DateTime dt;
                dt = StringFu.StringToDateTime (prop.Value);
                if (StartDate <= dt && dt <= EndDate)
                    return true;
            }
        }
        return false;
    } else {
        // Walk through all of the properties with the given key,
        // and see if any of them fall inside of the range.
        string[] values;
        values = hit.GetProperties (Key);
        foreach (string v in values) {
            DateTime dt;
            dt = StringFu.StringToDateTime (v);
            if (StartDate <= dt && dt <= EndDate)
                return true;
        }
        return false;
    }
}
public bool HitFilter (Hit hit)
{
    // Negation: passes only the hits the wrapped filter rejects.
    return ! original (hit);
}
public bool HitFilter (Hit hit)
{
    // Conjunction: every filter in 'all' must accept the hit.
    foreach (HitFilter hit_filter in all)
        if (! hit_filter (hit))
            return false;
    return true;
}
public bool HitFilter (Hit hit)
{
    // Disjunction: any filter in 'all' may accept the hit.
    // contains_known_true short-circuits the scan when one of
    // the filters is known to always return true.
    if (contains_known_true)
        return true;
    foreach (HitFilter hit_filter in all)
        if (hit_filter (hit))
            return true;
    return false;
}
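// The three methods above behave like boolean NOT/AND/OR over HitFilter
// delegates. A self-contained sketch of the same combinator pattern; the
// names here (BoolFilter, the string stand-in for Hit) are illustrative,
// not Beagle's actual types.
delegate bool HitFilter (string hit);

static class BoolFilter
{
    public static HitFilter Not (HitFilter original)
    {
        return hit => ! original (hit);
    }

    public static HitFilter And (params HitFilter [] all)
    {
        return hit => {
            foreach (HitFilter f in all)
                if (! f (hit))
                    return false;
            return true;
        };
    }

    public static HitFilter Or (params HitFilter [] all)
    {
        return hit => {
            foreach (HitFilter f in all)
                if (f (hit))
                    return true;
            return false;
        };
    }
}

// Example composition: accept hits mentioning "wiki" but not "draft".
HitFilter filter = BoolFilter.And (h => h.Contains ("wiki"),
                                   BoolFilter.Not (h => h.Contains ("draft")));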
//////////////////////////////////////////////////////////////////////////////
//
// Special Hit Filtering classes
//

static private bool TrueHitFilter (Hit hit)
{
    return true;
}
static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
{
    Property prop;
    foreach (Field f in doc.Fields ()) {
        prop = GetPropertyFromDocument (f, doc, from_primary_index);
        if (prop != null)
            hit.AddProperty (prop);
    }
}
///////////////////////////////////////////////////////////////////////////////////
//
// Various ways to grab lots of hits at once.
// These should never be used for querying, only for utility
// functions.
//

public int GetBlockOfHits (int cookie, Hit [] block_of_hits)
{
    IndexReader primary_reader;
    IndexReader secondary_reader;
    primary_reader = GetReader (PrimaryStore);
    secondary_reader = GetReader (SecondaryStore);

    int request_size;
    request_size = block_of_hits.Length;
    if (request_size > primary_reader.NumDocs ())
        request_size = primary_reader.NumDocs ();

    int max_doc;
    max_doc = primary_reader.MaxDoc ();

    if (cookie < 0) {
        Random random;
        random = new Random ();
        cookie = random.Next (max_doc);
    }

    int original_cookie;
    original_cookie = cookie;

    Hashtable primary_docs, secondary_docs;
    primary_docs = UriFu.NewHashtable ();
    secondary_docs = UriFu.NewHashtable ();

    // Load the primary documents
    for (int i = 0; i < request_size; ++i) {
        if (! primary_reader.IsDeleted (cookie)) {
            Document doc;
            doc = primary_reader.Document (cookie);
            primary_docs [GetUriFromDocument (doc)] = doc;
        }

        ++cookie;
        if (cookie >= max_doc) // wrap around
            cookie = 0;

        // If we somehow end up back where we started,
        // give up.
        if (cookie == original_cookie)
            break;
    }

    // If necessary, load the secondary documents
    if (secondary_reader != null) {
        LNS.IndexSearcher searcher;
        searcher = new LNS.IndexSearcher (secondary_reader);

        LNS.Query uri_query;
        uri_query = UriQuery ("Uri", primary_docs.Keys);

        LNS.Hits hits;
        hits = searcher.Search (uri_query);
        for (int i = 0; i < hits.Length (); ++i) {
            Document doc;
            doc = hits.Doc (i);
            secondary_docs [GetUriFromDocument (doc)] = doc;
        }

        searcher.Close ();
    }

    ReleaseReader (primary_reader);
    ReleaseReader (secondary_reader);

    // Now assemble the hits
    int j = 0;
    foreach (Uri uri in primary_docs.Keys) {
        Document primary_doc, secondary_doc;
        primary_doc = primary_docs [uri] as Document;
        secondary_doc = secondary_docs [uri] as Document;

        Hit hit;
        hit = DocumentToHit (primary_doc);
        if (secondary_doc != null)
            AddPropertiesToHit (hit, secondary_doc, false);

        block_of_hits [j] = hit;
        ++j;
    }

    // null-pad the array, if necessary
    for (; j < block_of_hits.Length; ++j)
        block_of_hits [j] = null;

    // Return the new cookie
    return cookie;
}
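// A minimal consumption sketch, not from the original source: the "driver"
// instance and 100-hit block size are assumptions. A negative cookie asks
// for a random starting point; the returned cookie continues the scan.
Hit [] block = new Hit [100];

int cookie = driver.GetBlockOfHits (-1, block);

foreach (Hit hit in block) {
    if (hit == null) // trailing nulls are padding
        break;
    Console.WriteLine (hit.Uri);
}

// Passing the cookie back in continues the wrap-around scan:
cookie = driver.GetBlockOfHits (cookie, block);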
/// <summary>
/// This constructor is used while retrieving the hit from the dump
/// </summary>
/// <param name="ixr">The dump indexer this Wiki topic belongs to</param>
/// <param name="hit">The Lucene Hit object</param>
public PageInfo(Indexer ixr, Hit hit)
{
    TreatRedirectException = false;
    Indexer = ixr;
    // The Decoder setter sorts Beginnings and Ends.
    _decoder = ixr;
    Score = hit.GetScore();

    Document doc = hit.GetDocument();

    TopicId = Convert.ToInt64(doc.GetField("topicid").StringValue());
    Name = doc.GetField("title").StringValue();

    Beginnings = new long[doc.GetFields("beginning").Length];
    Ends = new long[doc.GetFields("end").Length];

    int i = 0;
    foreach (byte[] binVal in doc.GetBinaryValues("beginning"))
    {
        Beginnings[i] = BitConverter.ToInt64(binVal, 0);
        i++;
    }

    i = 0;
    foreach (byte[] binVal in doc.GetBinaryValues("end"))
    {
        Ends[i] = BitConverter.ToInt64(binVal, 0);
        i++;
    }

    Array.Sort(Beginnings);
    Array.Sort(Ends);
}
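// Both PageInfo constructors decode the multi-valued binary fields
// "beginning" and "end" with BitConverter. A sketch of what the writer side
// would look like under that layout (assuming Lucene.Net 2.9; the field
// layout is inferred from the reading code above, and topicId, title,
// blockBeginnings, blockEnds, and writer are hypothetical locals, not taken
// from the original indexer).
Document doc = new Document();
doc.Add(new Field("topicid", topicId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));

// One binary "beginning"/"end" value per block the topic spans; the readers
// above recover them with BitConverter.ToInt64 and then sort.
foreach (long beginning in blockBeginnings)
    doc.Add(new Field("beginning", BitConverter.GetBytes(beginning), Field.Store.YES));
foreach (long end in blockEnds)
    doc.Add(new Field("end", BitConverter.GetBytes(end), Field.Store.YES));

writer.AddDocument(doc);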
private ICollection DoLowLevelRDFQuery (Query query,
                                        PropertyType pred_type,
                                        string predicate,
                                        string field_value,
                                        TextCache text_cache)
{
    Stopwatch total, a, b, c, d, e, f;
    total = new Stopwatch ();
    a = new Stopwatch ();
    b = new Stopwatch ();
    c = new Stopwatch ();
    d = new Stopwatch ();
    e = new Stopwatch ();
    f = new Stopwatch ();

    total.Start ();

    a.Start ();

    // Assemble all of the parts into a bunch of Lucene queries
    ArrayList primary_required_part_queries;
    ArrayList secondary_required_part_queries;

    LNS.BooleanQuery primary_prohibited_part_query;
    LNS.BooleanQuery secondary_prohibited_part_query;

    AndHitFilter all_hit_filters;

    ArrayList term_list;

    term_list = AssembleQuery (query,
                               null,
                               null,
                               out primary_required_part_queries,
                               out secondary_required_part_queries,
                               out primary_prohibited_part_query,
                               out secondary_prohibited_part_query,
                               out all_hit_filters);

    a.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

    // If we have no required parts, give up.
    if (primary_required_part_queries == null)
        return null;

    b.Start ();

    //
    // Now that we have all of these nice queries, let's execute them!
    //

    // Create the searchers that we will need.
    IndexReader primary_reader;
    LNS.IndexSearcher primary_searcher;
    IndexReader secondary_reader;
    LNS.IndexSearcher secondary_searcher;

    if (! BuildSearchers (out primary_reader, out primary_searcher,
                          out secondary_reader, out secondary_searcher))
        return null;

    b.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

    // Build whitelists and blacklists for search subsets.
    c.Start ();

    // Possibly create our whitelists from the search subset.
    LuceneBitArray primary_whitelist, secondary_whitelist;
    CreateQueryWhitelists (null,
                           primary_searcher,
                           secondary_searcher,
                           primary_prohibited_part_query,
                           secondary_prohibited_part_query,
                           out primary_whitelist,
                           out secondary_whitelist);

    c.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

    // Now run the low level queries against our indexes.
    d.Start ();

    BetterBitArray primary_matches = null;

    if (primary_required_part_queries != null) {
        if (secondary_searcher != null)
            primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
                                                          secondary_searcher,
                                                          primary_required_part_queries,
                                                          secondary_required_part_queries,
                                                          primary_whitelist,
                                                          secondary_whitelist);
        else
            primary_matches = DoRequiredQueries (primary_searcher,
                                                 primary_required_part_queries,
                                                 primary_whitelist);
    }

    d.Stop ();
    if (Debug)
        Logger.Log.Debug ("###### {0}: Low-level queries finished in {1} and returned {2} matches",
                          IndexName, d, primary_matches.TrueCount);

    e.Start ();

    int count = 0;
    Document doc;

    ArrayList hits = new ArrayList (primary_matches.TrueCount);

    TermDocs secondary_term_docs = null;
    if (secondary_searcher != null)
        secondary_term_docs = secondary_searcher.Reader.TermDocs ();

    FieldSelector fields = null;
    if (predicate != null)
        fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", PropertyToFieldName (pred_type, predicate) });

    for (int match_index = primary_matches.GetNextTrueIndex (0);
         match_index < primary_matches.Count;
         match_index = primary_matches.GetNextTrueIndex (++ match_index)) {

        count++;

        // If we have a HitFilter, apply it.
        // RDF FIXME: Ignore Hit Filter for now

        // If predicate was not specified but object was specified,
        // then figure out the right predicate
        if (predicate == null && field_value != null) {
            Hit hit = new Hit ();
            doc = primary_searcher.Doc (match_index);
            hit.Uri = GetUriFromDocument (doc);
            hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

            bool found_matching_predicate = false;

            foreach (Field field in doc.Fields ()) {
                if (! FieldIsPredicate (field, field_value))
                    continue;

                Property prop = new Property ();
                prop.Type = pred_type;
                prop.Key = predicate;
                prop.Value = field_value;
                hit.AddProperty (prop);

                found_matching_predicate = true;
            }

            // Now get the matching predicate from the secondary index
            if (secondary_searcher == null) {
                doc = null;
            } else {
                Term term = new Term ("Uri", doc.Get ("Uri"));
                secondary_term_docs.Seek (term);
                if (secondary_term_docs.Next ())
                    doc = secondary_searcher.Doc (secondary_term_docs.Doc ());
            }

            if (doc != null) {
                foreach (Field field in doc.Fields ()) {
                    if (! FieldIsPredicate (field, field_value))
                        continue;

                    Property prop = new Property ();
                    prop.Type = pred_type;
                    prop.Key = predicate;
                    prop.Value = field_value;
                    hit.AddProperty (prop);

                    found_matching_predicate = true;
                }
            }

            if (! found_matching_predicate) {
                // No matching predicate found
                // This means some unstored field matched the query
                // FIXME: Add a synthetic property #text
                hit.AddProperty (Property.New ("#text", field_value));
            }

            hits.Add (hit);
        } else if (predicate == "TextLinks") {
            // Special treatment: TextLinks is not stored but can be queried
            doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
            Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
            if (field_value != null)
                hit.AddProperty (Property.New ("TextLinks", field_value));
            else {
                foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
                    hit.AddProperty (text_link_property);
            }
            hits.Add (hit);
        } else {
            doc = primary_searcher.Doc (match_index, fields);
            Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
            foreach (Property prop in hit.Properties) {
                if (prop.Key == predicate)
                    prop.Value = field_value;
            }

            hits.Add (hit);
        }
    }

    e.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

    //
    // Finally, we clean up after ourselves.
    //

    f.Start ();
    CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);
    f.Stop ();
    if (Debug)
        Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

    total.Stop ();
    if (Debug) {
        Log.Debug ("###### {0}: Query time breakdown:", IndexName);
        Log.Debug ("###### {0}:   Build queries  {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   Got readers    {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   Whitelists     {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   Queries        {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   Gen'd Results  {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
        Log.Debug ("###### {0}:   TOTAL          {1,6}", IndexName, total);

        Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
    }

    return hits;
}