Пример #1
0
        /// <summary>
        /// This constructor is used while retrieving the hit from the dump
        /// </summary>
        /// <param name="ixr">The dump indexer this Wiki topic belongs to</param>
        /// <param name="hit">The Lucene Hit object</param>
        public PageInfo(Indexer ixr, Hit hit)
        {
            indexer = ixr;

            Document doc = hit.GetDocument();

            TopicId = Convert.ToInt64(doc.GetField("topicid").StringValue());

            Name = doc.GetField("title").StringValue();

            Beginnings = new long[doc.GetFields("beginning").Length];
            Ends = new long[doc.GetFields("end").Length];

            int i = 0;

            foreach (byte[] binVal in doc.GetBinaryValues("beginning"))
            {
                Beginnings[i] = BitConverter.ToInt64(binVal, 0);

                i++;
            }

            i = 0;

            foreach (byte[] binVal in doc.GetBinaryValues("end"))
            {
                Ends[i] = BitConverter.ToInt64(binVal, 0);

                i++;
            }

            Array.Sort(Beginnings);
            Array.Sort(Ends);
        }
Пример #2
0
        public List <IndexPageResult> GetDocumentPagesWithQuery(string query)
        {
            List <IndexPageResult> results = new List <IndexPageResult>();
            Dictionary <string, IndexPageResult> fingerprints_already_seen = new Dictionary <string, IndexPageResult>();

            try
            {
                using (IndexReader index_reader = IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
                {
                    using (IndexSearcher index_searcher = new IndexSearcher(index_reader))
                    {
                        QueryParser query_parser = new QueryParser(Version.LUCENE_29, "content", analyzer);

                        Lucene.Net.Search.Query query_object = query_parser.Parse(query);
                        Lucene.Net.Search.Hits  hits         = index_searcher.Search(query_object);

                        var i = hits.Iterator();
                        while (i.MoveNext())
                        {
                            Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                            string fingerprint        = hit.Get("fingerprint");
                            int    page  = Convert.ToInt32(hit.Get("page"));
                            double score = hit.GetScore();

                            // If this is the first time we have seen this fingerprint, make the top-level record
                            if (!fingerprints_already_seen.ContainsKey(fingerprint))
                            {
                                IndexPageResult result = new IndexPageResult();
                                result.fingerprint = fingerprint;
                                result.score       = score;

                                // Add to our structures
                                results.Add(result);
                                fingerprints_already_seen[fingerprint] = result;
                            }

                            // And add the page record
                            {
                                IndexPageResult result = fingerprints_already_seen[fingerprint];
                                result.page_results.Add(new PageResult {
                                    page = page, score = score
                                });
                            }
                        }

                        // Close the index
                        index_searcher.Close();
                    }
                    index_reader.Close();
                }
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, $"GetDocumentPagesWithQuery: There was a problem opening the index file for searching (path: '{LIBRARY_INDEX_BASE_PATH}', query: '{query}')");
            }

            return(results);
        }
Пример #3
0
        public HashSet <string> GetDocumentsWithWord(string keyword)
        {
            HashSet <string> fingerprints = new HashSet <string>();

            try
            {
                keyword = ReasonableWord.MakeReasonableWord(keyword);
                if (null != keyword)
                {
                    ////Do a quick check for whether there are actually any segments files, otherwise we throw many exceptions in the IndexReader.Open in a very tight loop.
                    ////Added by Nik to cope with some exception...will uncomment this when i know what the problem is...
                    //var segments_files = Directory.GetFiles(LIBRARY_INDEX_BASE_PATH, "segments*", SearchOption.AllDirectories);
                    //if (segments_files.Length <= 0)
                    //{
                    //    Logging.Debug("No index segments files found");
                    //    return fingerprints;
                    //}

                    using (IndexReader index_reader = IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
                    {
                        using (IndexSearcher index_searcher = new IndexSearcher(index_reader))
                        {
                            Lucene.Net.Search.TermQuery term_query = new Lucene.Net.Search.TermQuery(new Term("content", keyword));
                            Lucene.Net.Search.Hits      hits       = index_searcher.Search(term_query);

                            var i = hits.Iterator();
                            while (i.MoveNext())
                            {
                                Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                                string fingerprint        = hit.Get("fingerprint");
                                fingerprints.Add(fingerprint);
                            }

                            // Close the index
                            index_searcher.Close();
                        }
                        index_reader.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, $"GetDocumentsWithWord: There was a problem opening the index file for searching (path: '{LIBRARY_INDEX_BASE_PATH}', keyword: '{keyword}')");
            }

            return(fingerprints);
        }
Пример #4
0
        /***
         * Understands the lucene query syntax
         */
        public List <Utilities.Language.TextIndexing.IndexResult> GetDocumentsWithQuery(string query)
        {
            List <Utilities.Language.TextIndexing.IndexResult> fingerprints = new List <Utilities.Language.TextIndexing.IndexResult>();
            HashSet <string> fingerprints_already_seen = new HashSet <string>();

            try
            {
                using (Lucene.Net.Index.IndexReader index_reader = Lucene.Net.Index.IndexReader.Open(LIBRARY_INDEX_BASE_PATH, true))
                {
                    using (Lucene.Net.Search.IndexSearcher index_searcher = new Lucene.Net.Search.IndexSearcher(index_reader))
                    {
                        Lucene.Net.QueryParsers.QueryParser query_parser = new Lucene.Net.QueryParsers.QueryParser(Version.LUCENE_29, "content", analyzer);

                        Lucene.Net.Search.Query query_object = query_parser.Parse(query);
                        Lucene.Net.Search.Hits  hits         = index_searcher.Search(query_object);

                        var i = hits.Iterator();
                        while (i.MoveNext())
                        {
                            Lucene.Net.Search.Hit hit = (Lucene.Net.Search.Hit)i.Current;
                            string fingerprint        = hit.Get("fingerprint");
                            string page = hit.Get("page");

                            if (!fingerprints_already_seen.Contains(fingerprint))
                            {
                                fingerprints_already_seen.Add(fingerprint);

                                IndexResult index_result = new IndexResult {
                                    fingerprint = fingerprint, score = hit.GetScore()
                                };
                                fingerprints.Add(index_result);
                            }
                        }

                        // Close the index
                        index_searcher.Close();
                    }
                    index_reader.Close();
                }
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "GetDocumentsWithQuery: There was a problem opening the index file for searching.");
            }

            return(fingerprints);
        }
Пример #5
0
                static protected Hit DocumentToHit (Document doc)
                {
                        Hit hit;
                        hit = new Hit ();

                        hit.Uri = GetUriFromDocument (doc);

                        string str;
                        str = doc.Get ("ParentUri");
                        if (str != null)
                                hit.ParentUri = UriFu.EscapedStringToUri (str);

                        hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

                        AddPropertiesToHit (hit, doc, true);

                        return hit;
                }
Пример #6
0
                        public bool HitFilter (Hit hit)
                        {
                                // First, check the Timestamp
                                if (Key == QueryPart_DateRange.AllPropertiesKey
                                    || Key == QueryPart_DateRange.TimestampKey) {
                                        DateTime dt;
                                        dt = hit.Timestamp;
                                        if (StartDate <= dt && dt <= EndDate)
                                                return true;
                                        if (Key == QueryPart_DateRange.TimestampKey)
                                                return false;
                                }

                                if (Key == QueryPart_DateRange.AllPropertiesKey) {
                                        // Walk through all of the properties, and see if any
                                        // date properties fall inside the range.
                                        foreach (Property prop in hit.Properties) {
                                                if (prop.Type == PropertyType.Date) {
                                                        DateTime dt;
                                                        dt = StringFu.StringToDateTime (prop.Value);
                                                        if (StartDate <= dt && dt <= EndDate)
                                                                return true;
                                                }
                                        }
                                        return false;
                                } else {
                                        // Walk through all of the properties with the given key,
                                        // and see if any of them fall inside of the range.
                                        string[] values;
                                        values = hit.GetProperties (Key);
                                        foreach (string v in values) {
                                                DateTime dt;
                                                dt = StringFu.StringToDateTime (v);
                                                if (StartDate <= dt && dt <= EndDate)
                                                        return true;
                                        }
                                        return false;
                                }
                        }
Пример #7
0
 public bool HitFilter (Hit hit)
 {
         return ! original (hit);
 }
Пример #8
0
 public bool HitFilter (Hit hit)
 {
         foreach (HitFilter hit_filter in all)
                 if (! hit_filter (hit))
                         return false;
         return true;
 }
Пример #9
0
 public bool HitFilter (Hit hit)
 {
         if (contains_known_true)
                 return true;
         foreach (HitFilter hit_filter in all)
                 if (hit_filter (hit))
                         return true;
         return false;
 }
Пример #10
0
                //////////////////////////////////////////////////////////////////////////////

                //
                // Special Hit Filtering classes
                //

                static private bool TrueHitFilter (Hit hit)
                {
                        return true;
                }
Пример #11
0
 static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
 {
         Property prop;
         foreach (Field f in doc.Fields ()) {
                 prop = GetPropertyFromDocument (f, doc, from_primary_index);
                 if (prop != null)
                         hit.AddProperty (prop);
         }
 }
Пример #12
0
		///////////////////////////////////////////////////////////////////////////////////

		//
		// Various ways to grab lots of hits at once.
		// These should never be used for querying, only for utility
		// functions.
		//

		public int GetBlockOfHits (int cookie,
					   Hit [] block_of_hits)
		{
			IndexReader primary_reader;
			IndexReader secondary_reader;
			primary_reader = GetReader (PrimaryStore);
			secondary_reader = GetReader (SecondaryStore);

			int request_size;
			request_size = block_of_hits.Length;
			if (request_size > primary_reader.NumDocs ())
				request_size = primary_reader.NumDocs ();

			int max_doc;
			max_doc = primary_reader.MaxDoc ();

			if (cookie < 0) {
				Random random;
				random = new Random ();
				cookie = random.Next (max_doc);
			}

			int original_cookie;
			original_cookie = cookie;

			Hashtable primary_docs, secondary_docs;
			primary_docs = UriFu.NewHashtable ();
			secondary_docs = UriFu.NewHashtable ();

			// Load the primary documents
			for (int i = 0; i < request_size; ++i) {
				
				if (! primary_reader.IsDeleted (cookie)) {
					Document doc;
					doc = primary_reader.Document (cookie);
					primary_docs [GetUriFromDocument (doc)] = doc;
				}
				
				++cookie;
				if (cookie >= max_doc) // wrap around
					cookie = 0;

				// If we somehow end up back where we started,
				// give up.
				if (cookie == original_cookie)
					break;
			}

			// If necessary, load the secondary documents
			if (secondary_reader != null) {
				LNS.IndexSearcher searcher;
				searcher = new LNS.IndexSearcher (secondary_reader);
				
				LNS.Query uri_query;
				uri_query = UriQuery ("Uri", primary_docs.Keys);
				
				LNS.Hits hits;
				hits = searcher.Search (uri_query);
				for (int i = 0; i < hits.Length (); ++i) {
					Document doc;
					doc = hits.Doc (i);
					secondary_docs [GetUriFromDocument (doc)] = doc;
				}
				
				searcher.Close ();
			}

			ReleaseReader (primary_reader);
			ReleaseReader (secondary_reader);

			// Now assemble the hits
			int j = 0;
			foreach (Uri uri in primary_docs.Keys) {
				Document primary_doc, secondary_doc;
				primary_doc = primary_docs [uri] as Document;
				secondary_doc = secondary_docs [uri] as Document;

				Hit hit;
				hit = DocumentToHit (primary_doc);
				if (secondary_doc != null)
					AddPropertiesToHit (hit, secondary_doc, false);
				
				block_of_hits [j] = hit;
				++j;
			}

			// null-pad the array, if necessary
			for (; j < block_of_hits.Length; ++j)
				block_of_hits [j] = null;


			// Return the new cookie
			return cookie;
		}
Пример #13
0
        /// <summary>
        /// This constructor is used while retrieving the hit from the dump
        /// </summary>
        /// <param name="ltask">The dump indexer this Wiki topic belongs to</param>
        /// <param name="hit">The Lucene Hit object</param>
        public PageInfo(Indexer ixr, Hit hit)
        {
            TreatRedirectException = false;
            Indexer = ixr;

            // Decoder setter sort Beginnings and Ends.
            _decoder = ixr;

            Score = hit.GetScore();

            Document doc = hit.GetDocument();

            TopicId = Convert.ToInt64(doc.GetField("topicid").StringValue());

            Name = doc.GetField("title").StringValue();

            Beginnings = new long[doc.GetFields("beginning").Length];
            Ends = new long[doc.GetFields("end").Length];

            int i = 0;

            foreach (byte[] binVal in doc.GetBinaryValues("beginning"))
            {
                Beginnings[i] = BitConverter.ToInt64(binVal, 0);

                i++;
            }

            i = 0;

            foreach (byte[] binVal in doc.GetBinaryValues("end"))
            {
                Ends[i] = BitConverter.ToInt64(binVal, 0);

                i++;
            }

            Array.Sort(Beginnings);
            Array.Sort(Ends);
        }
Пример #14
0
		private ICollection DoLowLevelRDFQuery (Query query,
							PropertyType pred_type,
							string predicate,
							string field_value,
							TextCache text_cache)
		{

			Stopwatch total, a, b, c, d, e, f;

			total = new Stopwatch ();
			a = new Stopwatch ();
			b = new Stopwatch ();
			c = new Stopwatch ();
			d = new Stopwatch ();
			e = new Stopwatch ();
			f = new Stopwatch ();

			total.Start ();
			a.Start ();

			// Assemble all of the parts into a bunch of Lucene queries

			ArrayList primary_required_part_queries;
			ArrayList secondary_required_part_queries;

			LNS.BooleanQuery primary_prohibited_part_query;
			LNS.BooleanQuery secondary_prohibited_part_query;

			AndHitFilter all_hit_filters;

			ArrayList term_list;

			// Assemble all of the parts into a bunch of Lucene queries

			term_list = AssembleQuery (query,
				null,
				null,
				out primary_required_part_queries,
				out secondary_required_part_queries,
				out primary_prohibited_part_query,
				out secondary_prohibited_part_query,
				out all_hit_filters);

			a.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Building queries took {1}", IndexName, a);

			// If we have no required parts, give up.
			if (primary_required_part_queries == null)
				return null;

			b.Start ();
			
			//
			// Now that we have all of these nice queries, let's execute them!
			//

			// Create the searchers that we will need.

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;
			IndexReader secondary_reader;
			LNS.IndexSearcher secondary_searcher;

			// Create the searchers that we will need.
			if (! BuildSearchers (out primary_reader, out primary_searcher, out secondary_reader, out secondary_searcher))
				return null;

			b.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Readers/searchers built in {1}", IndexName, b);

			// Build whitelists and blacklists for search subsets.
			c.Start ();
			
			// Possibly create our whitelists from the search subset.
			LuceneBitArray primary_whitelist, secondary_whitelist;
			CreateQueryWhitelists (null,
				primary_searcher,
				secondary_searcher,
				primary_prohibited_part_query,
				secondary_prohibited_part_query,
				out primary_whitelist,
				out secondary_whitelist);

			c.Stop ();
			if (Debug)
				Log.Debug ("###### {0}: Whitelists and blacklists built in {1}", IndexName, c);

			// Now run the low level queries against our indexes.
			d.Start ();

			BetterBitArray primary_matches = null;

			if (primary_required_part_queries != null) {

				if (secondary_searcher != null)
					primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
										      secondary_searcher,
										      primary_required_part_queries,
										      secondary_required_part_queries,
										      primary_whitelist,
										      secondary_whitelist);
				else
					primary_matches = DoRequiredQueries (primary_searcher,
									     primary_required_part_queries,
									     primary_whitelist);

			} 

			d.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### {0}: Low-level queries finished in {1} and returned {2} matches", IndexName, d, primary_matches.TrueCount);

			e.Start ();

			int count = 0;
			Document doc;
			ArrayList hits = new ArrayList (primary_matches.TrueCount);

			TermDocs secondary_term_docs = null;
			if (secondary_searcher != null)
				secondary_term_docs = secondary_searcher.Reader.TermDocs ();
		
			FieldSelector fields = null;
			if (predicate != null)
				fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", PropertyToFieldName (pred_type, predicate)});

			for (int match_index = primary_matches.GetNextTrueIndex (0);
			     match_index < primary_matches.Count; 
			     match_index = primary_matches.GetNextTrueIndex (++ match_index)) {

				count++;

				// If we have a HitFilter, apply it.
				// RDF FIXME: Ignore Hit Filter for now

				// If predicate was not specified but object was specified,
				// then figure out the right predicate
				if (predicate == null && field_value != null) {
					Hit hit = new Hit ();
					doc = primary_searcher.Doc (match_index);
					hit.Uri = GetUriFromDocument (doc);
					hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

					bool found_matching_predicate = false;

					foreach (Field field in doc.Fields ()) {
						if (! FieldIsPredicate (field, field_value))
							continue;

						Property prop = new Property ();
						prop.Type = pred_type;
						prop.Key = predicate;
						prop.Value = field_value;
						hit.AddProperty (prop);

						found_matching_predicate = true;
					}

					// Now get the matching predicate from the secondary index
					if (secondary_searcher == null) {
						doc = null;
					} else {
						Term term = new Term ("Uri", doc.Get ("Uri"));
						secondary_term_docs.Seek (term);
						if (secondary_term_docs.Next ())
							doc = secondary_searcher.Doc (secondary_term_docs.Doc ());
					}

					if (doc != null) {
						foreach (Field field in doc.Fields ()) {
							if (! FieldIsPredicate (field, field_value))
								continue;

							Property prop = new Property ();
							prop.Type = pred_type;
							prop.Key = predicate;
							prop.Value = field_value;
							hit.AddProperty (prop);

							found_matching_predicate = true;
						}
					}

					if (! found_matching_predicate) {
						// No matching predicate found
						// This means some unstored field matched the query
						// FIXME: Add a synthetic property #text
						hit.AddProperty (Property.New ("#text", field_value));
					}
					
					hits.Add (hit);
				} else if (predicate == "TextLinks") {
					// Special treatment: TextLinks is not stored but can be queried
					doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
					Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
					if (field_value != null)
						hit.AddProperty (Property.New ("TextLinks", field_value));
					else {
						foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
							hit.AddProperty (text_link_property);
					}
					hits.Add (hit);
				} else {
					doc = primary_searcher.Doc (match_index, fields);
					Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
					foreach (Property prop in hit.Properties) {
						if (prop.Key == predicate)
							prop.Value = field_value;
					}

					hits.Add (hit);
				}
			}

			e.Stop ();

			if (Debug)
				Log.Debug ("###### {0}: Query results generated in {1}", IndexName, e);

			//
			// Finally, we clean up after ourselves.
			//

			f.Start ();
			CloseSearchers (primary_reader, primary_searcher, secondary_reader, secondary_searcher);
			f.Stop ();
			
			if (Debug)
				Log.Debug ("###### {0}: Readers/searchers released in {1}", IndexName, f);

			total.Stop ();
			if (Debug) {
				Log.Debug ("###### {0}: Query time breakdown:", IndexName);
				Log.Debug ("###### {0}:    Build queries {1,6} ({2:0.0}%)", IndexName, a, 100 * a.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:      Got readers {1,6} ({2:0.0}%)", IndexName, b, 100 * b.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:       Whitelists {1,6} ({2:0.0}%)", IndexName, c, 100 * c.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:          Queries {1,6} ({2:0.0}%)", IndexName, d, 100 * d.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:    Gen'd Results {1,6} ({2:0.0}%)", IndexName, e, 100 * e.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:   Reader cleanup {1,6} ({2:0.0}%)", IndexName, f, 100 * f.ElapsedTime / total.ElapsedTime);
				Log.Debug ("###### {0}:            TOTAL {1,6}", IndexName, total);

				Logger.Log.Debug ("###### {0}: Total query run in {1}", IndexName, total);
			}

			return hits;
		}