Example #1
        public virtual void  TestSetBufferSize()
        {
            System.IO.FileInfo indexDir = new System.IO.FileInfo(System.IO.Path.Combine(SupportClass.AppSettings.Get("tempDir", ""), "testSetBufferSize"));
            MockFSDirectory    dir      = new MockFSDirectory(indexDir, NewRandom());

            try
            {
                IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                writer.SetUseCompoundFile(false);
                for (int i = 0; i < 37; i++)
                {
                    Document doc = new Document();
                    doc.Add(new Field("content", "aaa bbb ccc ddd" + i, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("id", "" + i, Field.Store.YES, Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                }
                writer.Close();

                dir.allIndexInputs.Clear();

                IndexReader reader = IndexReader.Open(dir);
                Term        aaa    = new Term("content", "aaa");
                Term        bbb    = new Term("content", "bbb");
                Term        ccc    = new Term("content", "ccc");
                Assert.AreEqual(37, reader.DocFreq(ccc));
                reader.DeleteDocument(0);
                Assert.AreEqual(37, reader.DocFreq(aaa));
                dir.tweakBufferSizes();
                reader.DeleteDocument(4);
                Assert.AreEqual(37, reader.DocFreq(bbb));
                dir.tweakBufferSizes();

                IndexSearcher searcher = new IndexSearcher(reader);
                ScoreDoc[]    hits     = searcher.Search(new TermQuery(bbb), null, 1000).scoreDocs;
                dir.tweakBufferSizes();
                Assert.AreEqual(35, hits.Length);
                dir.tweakBufferSizes();
                hits = searcher.Search(new TermQuery(new Term("id", "33")), null, 1000).scoreDocs;
                dir.tweakBufferSizes();
                Assert.AreEqual(1, hits.Length);
                hits = searcher.Search(new TermQuery(aaa), null, 1000).scoreDocs;
                dir.tweakBufferSizes();
                Assert.AreEqual(35, hits.Length);
                searcher.Close();
                reader.Close();
            }
            finally
            {
                _TestUtil.RmDir(indexDir);
            }
        }
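
The point of the assertions above is that IndexReader.DocFreq reports the raw document frequency from the term dictionary and ignores deletions: after the two DeleteDocument calls the doc-freq values stay at 37 while the searcher only returns 35 live hits. A minimal sketch of that behaviour, assuming 'dir' is an already populated Directory with a "content" field (the names are illustrative, not taken from the example):

        // Sketch only: DocFreq counts deleted documents, live search hits do not.
        IndexReader reader = IndexReader.Open(dir);
        Term term = new Term("content", "aaa");
        int dfBefore = reader.DocFreq(term);              // e.g. 37
        reader.DeleteDocument(0);                         // mark one document as deleted
        int dfAfter = reader.DocFreq(term);               // still 37 -- deletions are not reflected
        IndexSearcher searcher = new IndexSearcher(reader);
        int liveHits = searcher.Search(new TermQuery(term), null, 1000).scoreDocs.Length; // one less than dfAfter
        searcher.Close();
        reader.Close();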
Example #2
            public override Query Rewrite(IndexReader reader, MultiTermQuery query)
            {
                // Get the enum and start visiting terms.  If we
                // exhaust the enum before hitting either of the
                // cutoffs, we use ConstantBooleanQueryRewrite; else,
                // ConstantFilterRewrite:
                System.Collections.ArrayList pendingTerms = new System.Collections.ArrayList();
                int docCountCutoff = (int)((docCountPercent / 100.0) * reader.MaxDoc());
                int termCountLimit = System.Math.Min(BooleanQuery.GetMaxClauseCount(), termCountCutoff);
                int docVisitCount  = 0;

                FilteredTermEnum enumerator = query.GetEnum(reader);

                try
                {
                    while (true)
                    {
                        Term t = enumerator.Term();
                        if (t != null)
                        {
                            pendingTerms.Add(t);
                            // Loading the TermInfo from the terms dict here
                            // should not be costly, because 1) the
                            // query/filter will load the TermInfo when it
                            // runs, and 2) the terms dict has a cache:
                            docVisitCount += reader.DocFreq(t);
                        }

                        if (pendingTerms.Count >= termCountLimit || docVisitCount >= docCountCutoff)
                        {
                            // Too many terms -- make a filter.
                            Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
                            result.SetBoost(query.GetBoost());
                            return(result);
                        }
                        else if (!enumerator.Next())
                        {
                            // Enumeration is done, and we hit a small
                            // enough number of terms & docs -- just make a
                            // BooleanQuery, now
                            System.Collections.IEnumerator it = pendingTerms.GetEnumerator();
                            BooleanQuery bq = new BooleanQuery(true);
                            while (it.MoveNext())
                            {
                                TermQuery tq = new TermQuery((Term)it.Current);
                                bq.Add(tq, BooleanClause.Occur.SHOULD);
                            }
                            // Strip scores
                            Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
                            result.SetBoost(query.GetBoost());
                            query.IncTotalNumberOfTerms(pendingTerms.Count);
                            return(result);
                        }
                    }
                }
                finally
                {
                    enumerator.Close();
                }
            }
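
This Rewrite implementation is the "auto" constant-score strategy: it walks the term enum, and once either the term-count limit or the document-count cutoff is exceeded it falls back to a filter; otherwise it builds a BooleanQuery of SHOULD clauses with scoring stripped. A hedged sketch of how a MultiTermQuery is normally pointed at a rewrite method in this API generation (the SetRewriteMethod name and the CONSTANT_SCORE_AUTO_REWRITE_DEFAULT constant are assumptions based on the Java original):

                // Sketch only: choose a constant-score rewrite for a wildcard query.
                WildcardQuery wq = new WildcardQuery(new Term("content", "aa*"));
                wq.SetRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); // assumed constant name
                Query rewritten = wq.Rewrite(reader); // 'reader' is an open IndexReader (assumption)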
Example #3
 /// <summary> Check whether the word exists in the index.</summary>
 /// <param name="word">String
 /// </param>
 /// <throws>  IOException </throws>
 /// <returns> true iff the word exists in the index
 /// </returns>
 public virtual bool Exist(System.String word)
 {
     if (reader == null)
     {
         reader = IndexReader.Open(spellindex);
     }
     return(reader.DocFreq(new Term(F_WORD, word)) > 0);
 }
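
From the caller's side, Exist is just a DocFreq membership test against the spell index. A minimal usage sketch, assuming the contrib SpellChecker class (namespace SpellChecker.Net.Search.Spell in this port) built over an existing spell-index Directory named spellDirectory (both names are assumptions):

     // Sketch only: check a word before asking for suggestions.
     var spell = new SpellChecker.Net.Search.Spell.SpellChecker(spellDirectory); // spellDirectory: assumption
     if (!spell.Exist("recieve"))
     {
         System.String[] suggestions = spell.SuggestSimilar("recieve", 5);
         foreach (System.String s in suggestions)
         {
             System.Console.WriteLine(s);
         }
     }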
		/// <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
		/// 
		/// </summary>
		/// <param name="query">     Query to extract term texts from
		/// </param>
		/// <param name="reader">used to compute IDF, which can be used to a) score selected fragments better
		/// b) use graded highlights, e.g. changing intensity of font color
		/// </param>
		/// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
		/// </param>
		/// <returns> an array of the terms used in a query, plus their weights.
		/// </returns>
		public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, System.String fieldName)
		{
			WeightedTerm[] terms = GetTerms(query, false, fieldName);
			int totalNumDocs = reader.NumDocs();
			for (int i = 0; i < terms.Length; i++)
			{
				try
				{
					int docFreq = reader.DocFreq(new Term(fieldName, terms[i].term));
					//IDF algorithm taken from DefaultSimilarity class
					float idf = (float) (System.Math.Log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
					terms[i].weight *= idf;
				}
				catch (System.IO.IOException e)
				{
					//ignore 
				}
			}
			return terms;
		}
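
GetIdfWeightedTerms is typically used to drive highlighting so that rarer terms receive higher weights. A hedged usage sketch, assuming the method lives on QueryTermExtractor as in the Java original and that WeightedTerm exposes GetTerm()/GetWeight() accessors (both are assumptions):

		// Sketch only: extract IDF-weighted terms from a query and inspect their weights.
		WeightedTerm[] weighted = QueryTermExtractor.GetIdfWeightedTerms(query, reader, "content");
		foreach (WeightedTerm wt in weighted)
		{
			System.Console.WriteLine(wt.GetTerm() + " -> " + wt.GetWeight());
		}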
        /// <summary> Extracts all terms texts of a given Query into an array of WeightedTerms
        ///
        /// </summary>
        /// <param name="query">     Query to extract term texts from
        /// </param>
        /// <param name="reader">used to compute IDF, which can be used to a) score selected fragments better
        /// b) use graded highlights, e.g. changing intensity of font color
        /// </param>
        /// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
        /// </param>
        /// <returns> an array of the terms used in a query, plus their weights.
        /// </returns>
        public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, System.String fieldName)
        {
            WeightedTerm[] terms        = GetTerms(query, false, fieldName);
            int            totalNumDocs = reader.NumDocs();

            for (int i = 0; i < terms.Length; i++)
            {
                try
                {
                    int docFreq = reader.DocFreq(new Term(fieldName, terms[i].term));
                    //IDF algorithm taken from DefaultSimilarity class
                    float idf = (float)(System.Math.Log((float)totalNumDocs / (double)(docFreq + 1)) + 1.0);
                    terms[i].weight *= idf;
                }
                catch (System.IO.IOException e)
                {
                    //ignore
                }
            }
            return(terms);
        }
        public virtual Explanation Explain(IndexReader reader, int doc)
        {
            Explanation result = new Explanation();
            result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
            System.String field = ((SpanQuery) GetQuery()).GetField();

            System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
            System.Collections.IEnumerator i = terms.GetEnumerator();
            bool first = true;
            while (i.MoveNext())
            {
                System.Collections.DictionaryEntry tmp = (System.Collections.DictionaryEntry) i.Current;
                Term term = (Term) tmp.Key;
                if (!first)
                {
                    // separate entries with a space without advancing the enumerator a second time
                    docFreqs.Append(" ");
                }
                first = false;
                docFreqs.Append(term.Text());
                docFreqs.Append("=");
                docFreqs.Append(reader.DocFreq(term));
            }

            Explanation idfExpl = new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");

            // explain query weight
            Explanation queryExpl = new Explanation();
            queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

            Explanation boostExpl = new Explanation(GetQuery().GetBoost(), "boost");
            if (GetQuery().GetBoost() != 1.0f)
                queryExpl.AddDetail(boostExpl);
            queryExpl.AddDetail(idfExpl);

            Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
            queryExpl.AddDetail(queryNormExpl);

            queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

            result.AddDetail(queryExpl);

            // explain field weight
            Explanation fieldExpl = new Explanation();
            fieldExpl.SetDescription("fieldWeight(" + field + ":" + query.ToString(field) + " in " + doc + "), product of:");

            Explanation tfExpl = Scorer(reader).Explain(doc);
            fieldExpl.AddDetail(tfExpl);
            fieldExpl.AddDetail(idfExpl);

            Explanation fieldNormExpl = new Explanation();
            byte[] fieldNorms = reader.Norms(field);
            float fieldNorm = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;
            fieldNormExpl.SetValue(fieldNorm);
            fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
            fieldExpl.AddDetail(fieldNormExpl);

            fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

            result.AddDetail(fieldExpl);

            // combine them
            result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

            if (queryExpl.GetValue() == 1.0f)
                return fieldExpl;

            return result;
        }
Example #7
            public virtual Explanation Explain(IndexReader reader, int doc)
            {
                ComplexExplanation result = new ComplexExplanation();

                result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

                Explanation idfExpl = new Explanation(idf, "idf(docFreq=" + reader.DocFreq(Enclosing_Instance.term) + ", numDocs=" + reader.NumDocs() + ")");

                // explain query weight
                Explanation queryExpl = new Explanation();

                queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

                Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");

                if (Enclosing_Instance.GetBoost() != 1.0f)
                {
                    queryExpl.AddDetail(boostExpl);
                }
                queryExpl.AddDetail(idfExpl);

                Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");

                queryExpl.AddDetail(queryNormExpl);

                queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

                result.AddDetail(queryExpl);

                // explain field weight
                System.String      field     = Enclosing_Instance.term.Field();
                ComplexExplanation fieldExpl = new ComplexExplanation();

                fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.term + " in " + doc + "), product of:");

                Explanation tfExpl = Scorer(reader).Explain(doc);

                fieldExpl.AddDetail(tfExpl);
                fieldExpl.AddDetail(idfExpl);

                Explanation fieldNormExpl = new Explanation();

                byte[] fieldNorms = reader.Norms(field);
                float  fieldNorm  = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;

                fieldNormExpl.SetValue(fieldNorm);
                fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
                fieldExpl.AddDetail(fieldNormExpl);

                fieldExpl.SetMatch(tfExpl.IsMatch());
                fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

                result.AddDetail(fieldExpl);
                System.Boolean tempAux = fieldExpl.GetMatch();
                result.SetMatch(tempAux);

                // combine them
                result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

                if (queryExpl.GetValue() == 1.0f)
                {
                    return(fieldExpl);
                }

                return(result);
            }
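
These Weight.Explain implementations are rarely invoked directly; the usual entry point is IndexSearcher.Explain, which builds the Weight, normalizes it, and delegates to code like the above. A brief sketch, assuming an open IndexSearcher and a document id taken from a prior search (field and term values are illustrative):

                // Sketch only: ask the searcher to explain the score of the top hit.
                TermQuery q = new TermQuery(new Term("content", "aaa"));
                TopDocs top = searcher.Search(q, null, 10);
                if (top.scoreDocs.Length > 0)
                {
                    Explanation expl = searcher.Explain(q, top.scoreDocs[0].doc);
                    System.Console.WriteLine(expl.ToString());
                }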
Example #8
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="num_sug">int the number of suggested words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null; see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggested words that are more frequent than the searched word
        /// (only in restricted mode, i.e. indexReader != null and field != null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of suggested words, ordered by these two criteria:
        /// first, the edit distance; second (only in restricted mode), the popularity
        /// of the suggested word in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
        {
            float            min = this.minScore;
            TRStringDistance sd  = new TRStringDistance(word);
            int lengthWord       = word.Length;

            int goalFreq = (morePopular && ir != null) ? ir.DocFreq(new Term(field, word)) : 0;

            if (!morePopular && goalFreq > 0)
            {
                return(new System.String[] { word }); // return the word if it exists in the index and we don't want a more popular word
            }

            BooleanQuery query = new BooleanQuery();

            System.String[] grams;
            System.String   key;

            for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
            {
                key = "gram" + ng;           // form key

                grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                if (grams.Length == 0)
                {
                    continue; // hmm
                }

                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                for (int i = 0; i < grams.Length; i++)
                {
                    Add(query, key, grams[i]);
                }
            }

            IndexSearcher    searcher = new IndexSearcher(this.spellindex);
            Hits             hits     = searcher.Search(query);
            SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

            int         stop    = Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
            SuggestWord sugword = new SuggestWord();

            for (int i = 0; i < stop; i++)
            {
                sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word

                if (sugword.string_Renamed.Equals(word))
                {
                    continue; // don't suggest a word for itself, that would be silly
                }

                //edit distance/normalize with the min word length
                sugword.score = 1.0f - ((float)sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
                if (sugword.score < min)
                {
                    continue;
                }

                if (ir != null)
                {
                    // use the user index
                    sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                    if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                    {
                        // don't suggest a word that is not present in the field
                        continue;
                    }
                }
                sugqueue.Insert(sugword);
                if (sugqueue.Size() == num_sug)
                {
                    // if queue full, maintain the min score
                    min = ((SuggestWord)sugqueue.Top()).score;
                }
                sugword = new SuggestWord();
            }

            // convert to array string
            System.String[] list = new System.String[sugqueue.Size()];
            for (int i = sugqueue.Size() - 1; i >= 0; i--)
            {
                list[i] = ((SuggestWord)sugqueue.Pop()).string_Renamed;
            }

            searcher.Close();
            return(list);
        }
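
A hedged end-to-end sketch of calling this overload in restricted mode, so that suggestions are limited to words that actually occur in a field of the user index (spell, userIndexDirectory, and the "content" field are illustrative assumptions):

            // Sketch only: restricted-mode suggestions drawn from the "content" field of a user index.
            IndexReader userReader = IndexReader.Open(userIndexDirectory);
            try
            {
                System.String[] similar = spell.SuggestSimilar("recieve", 5, userReader, "content", true);
                foreach (System.String s in similar)
                {
                    System.Console.WriteLine(s);
                }
            }
            finally
            {
                userReader.Close();
            }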
			public override Query Rewrite(IndexReader reader, MultiTermQuery query)
			{
				// Get the enum and start visiting terms.  If we
				// exhaust the enum before hitting either of the
				// cutoffs, we use ConstantBooleanQueryRewrite; else,
				// ConstantFilterRewrite:
				System.Collections.ArrayList pendingTerms = new System.Collections.ArrayList();
				int docCountCutoff = (int) ((docCountPercent / 100.0) * reader.MaxDoc());
				int termCountLimit = System.Math.Min(BooleanQuery.GetMaxClauseCount(), termCountCutoff);
				int docVisitCount = 0;
				
				FilteredTermEnum enumerator = query.GetEnum(reader);
				try
				{
					while (true)
					{
						Term t = enumerator.Term();
						if (t != null)
						{
							pendingTerms.Add(t);
							// Loading the TermInfo from the terms dict here
							// should not be costly, because 1) the
							// query/filter will load the TermInfo when it
							// runs, and 2) the terms dict has a cache:
							docVisitCount += reader.DocFreq(t);
						}
						
						if (pendingTerms.Count >= termCountLimit || docVisitCount >= docCountCutoff)
						{
							// Too many terms -- make a filter.
							Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
							result.SetBoost(query.GetBoost());
							return result;
						}
						else if (!enumerator.Next())
						{
							// Enumeration is done, and we hit a small
							// enough number of terms & docs -- just make a
							// BooleanQuery, now
							System.Collections.IEnumerator it = pendingTerms.GetEnumerator();
							BooleanQuery bq = new BooleanQuery(true);
							while (it.MoveNext())
							{
								TermQuery tq = new TermQuery((Term) it.Current);
								bq.Add(tq, BooleanClause.Occur.SHOULD);
							}
							// Strip scores
							Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
							result.SetBoost(query.GetBoost());
							query.IncTotalNumberOfTerms(pendingTerms.Count);
							return result;
						}
					}
				}
				finally
				{
					enumerator.Close();
				}
			}
Example #10
        public virtual Explanation Explain(IndexReader reader, int doc)
        {
            Explanation result = new Explanation();

            result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
            System.String field = ((SpanQuery)GetQuery()).GetField();

            System.Text.StringBuilder      docFreqs = new System.Text.StringBuilder();
            System.Collections.IEnumerator i = terms.GetEnumerator();
            bool first = true;
            while (i.MoveNext())
            {
                Term term = (Term)i.Current;
                if (!first)
                {
                    // separate entries with a space without advancing the enumerator a second time
                    docFreqs.Append(" ");
                }
                first = false;
                docFreqs.Append(term.Text());
                docFreqs.Append("=");
                docFreqs.Append(reader.DocFreq(term));
            }

            Explanation idfExpl = new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");

            // explain query weight
            Explanation queryExpl = new Explanation();

            queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

            Explanation boostExpl = new Explanation(GetQuery().GetBoost(), "boost");

            if (GetQuery().GetBoost() != 1.0f)
            {
                queryExpl.AddDetail(boostExpl);
            }
            queryExpl.AddDetail(idfExpl);

            Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");

            queryExpl.AddDetail(queryNormExpl);

            queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

            result.AddDetail(queryExpl);

            // explain field weight
            Explanation fieldExpl = new Explanation();

            fieldExpl.SetDescription("fieldWeight(" + field + ":" + query.ToString(field) + " in " + doc + "), product of:");

            Explanation tfExpl = Scorer(reader).Explain(doc);

            fieldExpl.AddDetail(tfExpl);
            fieldExpl.AddDetail(idfExpl);

            Explanation fieldNormExpl = new Explanation();

            byte[] fieldNorms = reader.Norms(field);
            float  fieldNorm  = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;

            fieldNormExpl.SetValue(fieldNorm);
            fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
            fieldExpl.AddDetail(fieldNormExpl);

            fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

            result.AddDetail(fieldExpl);

            // combine them
            result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

            if (queryExpl.GetValue() == 1.0f)
            {
                return(fieldExpl);
            }

            return(result);
        }
Example #11
 // inherit javadoc
 public override int DocFreq(Term term)
 {
     return(reader.DocFreq(term));
 }
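
This Searcher-level passthrough is what the scoring code relies on: the document frequency it returns feeds the IDF formula shown in the Explain methods above. A small sketch of that relationship, assuming DefaultSimilarity exposes Idf(int docFreq, int numDocs) and the searcher exposes MaxDoc(), as in the Java original (both are assumptions):

 // Sketch only: recompute the idf value that an explanation would report for a term.
 Term term = new Term("content", "aaa");
 int docFreq = searcher.DocFreq(term);   // delegates to reader.DocFreq(term), as above
 int numDocs = searcher.MaxDoc();        // document count used by the weight computation (assumed accessor)
 float idf = new DefaultSimilarity().Idf(docFreq, numDocs); // log(numDocs / (docFreq + 1)) + 1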
Example #12
            public virtual Explanation Explain(IndexReader reader, int doc)
            {
                Explanation result = new Explanation();

                result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

                System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
                System.Text.StringBuilder query    = new System.Text.StringBuilder();
                query.Append('\"');
                for (int i = 0; i < Enclosing_Instance.terms.Count; i++)
                {
                    if (i != 0)
                    {
                        docFreqs.Append(" ");
                        query.Append(" ");
                    }

                    Term term = (Term)Enclosing_Instance.terms[i];

                    docFreqs.Append(term.Text());
                    docFreqs.Append("=");
                    docFreqs.Append(reader.DocFreq(term));

                    query.Append(term.Text());
                }
                query.Append('\"');

                Explanation idfExpl = new Explanation(idf, "idf(" + Enclosing_Instance.field + ": " + docFreqs + ")");

                // explain query weight
                Explanation queryExpl = new Explanation();

                queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

                Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");

                if (Enclosing_Instance.GetBoost() != 1.0f)
                {
                    queryExpl.AddDetail(boostExpl);
                }
                queryExpl.AddDetail(idfExpl);

                Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");

                queryExpl.AddDetail(queryNormExpl);

                queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

                result.AddDetail(queryExpl);

                // explain field weight
                Explanation fieldExpl = new Explanation();

                fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.field + ":" + query + " in " + doc + "), product of:");

                Explanation tfExpl = Scorer(reader).Explain(doc);

                fieldExpl.AddDetail(tfExpl);
                fieldExpl.AddDetail(idfExpl);

                Explanation fieldNormExpl = new Explanation();

                byte[] fieldNorms = reader.Norms(Enclosing_Instance.field);
                float  fieldNorm  = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;

                fieldNormExpl.SetValue(fieldNorm);
                fieldNormExpl.SetDescription("fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")");
                fieldExpl.AddDetail(fieldNormExpl);

                fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

                result.AddDetail(fieldExpl);

                // combine them
                result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

                if (queryExpl.GetValue() == 1.0f)
                {
                    return(fieldExpl);
                }

                return(result);
            }
Example #13
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="numSug">int the number of suggested words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null; see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggested words that are more frequent than the searched word
        /// (only in restricted mode, i.e. indexReader != null and field != null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of suggested words, ordered by these two criteria:
        /// first, the edit distance; second (only in restricted mode), the popularity
        /// of the suggested word in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
        {
            // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();
            try
            {
                float min = this.minScore;
                int lengthWord = word.Length;

                int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (!morePopular && freq > 0)
                {
                    return new String[] { word };
                }

                var query = new BooleanQuery();
                String[] grams;
                String key;

                var alreadySeen = new HashSet<string>();
                for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {
                    key = "gram" + ng; // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0)
                    { // should we boost prefixes?
                        Add(query, "start" + ng, grams[0], bStart); // matches start of word

                    }
                    if (bEnd > 0)
                    { // should we boost suffixes
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word

                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int stop = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {
                    sugWord.termString = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if (sugWord.termString.Equals(word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.termString);
                    if (sugWord.score < min)
                    {
                        continue;
                    }

                    if (ir != null && field != null)
                    { // use the user index
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }

                    if (alreadySeen.Add(sugWord.termString) == false) // we've already seen this word; no point returning it twice
                        continue;

                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        min = ((SuggestWord)sugQueue.Top()).score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                String[] list = new String[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = ((SuggestWord)sugQueue.Pop()).termString;
                }

                return list;
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }
Example #14
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="num_sug">int the number of suggested words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null; see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggested words that are more frequent than the searched word
        /// (only in restricted mode, i.e. indexReader != null and field != null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of suggested words, ordered by these two criteria:
        /// first, the edit distance; second (only in restricted mode), the popularity
        /// of the suggested word in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
        {
            float min = this.minScore;
            TRStringDistance sd = new TRStringDistance(word);
            int lengthWord = word.Length;
			
            int goalFreq = (morePopular && ir != null) ? ir.DocFreq(new Term(field, word)) : 0;
            if (!morePopular && goalFreq > 0)
            {
                return new System.String[]{word}; // return the word if it exists in the index and we don't want a more popular word
            }
			
            BooleanQuery query = new BooleanQuery();
            System.String[] grams;
            System.String key;
			
            for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
            {
				
                key = "gram" + ng; // form key
				
                grams = FormGrams(word, ng); // form word into ngrams (allow dups too)
				
                if (grams.Length == 0)
                {
                    continue; // hmm
                }
				
                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                for (int i = 0; i < grams.Length; i++)
                {
                    Add(query, key, grams[i]);
                }
            }
			
            IndexSearcher searcher = new IndexSearcher(this.spellindex);
            Hits hits = searcher.Search(query);
            SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
			
            int stop = Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
            SuggestWord sugword = new SuggestWord();
            for (int i = 0; i < stop; i++)
            {
				
                sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word
				
                if (sugword.string_Renamed.Equals(word))
                {
                    continue; // don't suggest a word for itself, that would be silly
                }
				
                //edit distance/normalize with the min word length
                sugword.score = 1.0f - ((float) sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
                if (sugword.score < min)
                {
                    continue;
                }
				
                if (ir != null)
                {
                    // use the user index
                    sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                    if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                    {
                        // don't suggest a word that is not present in the field
                        continue;
                    }
                }
                sugqueue.Insert(sugword);
                if (sugqueue.Size() == num_sug)
                {
                    // if queue full, maintain the min score
                    min = ((SuggestWord) sugqueue.Top()).score;
                }
                sugword = new SuggestWord();
            }
			
            // convert to array string
            System.String[] list = new System.String[sugqueue.Size()];
            for (int i = sugqueue.Size() - 1; i >= 0; i--)
            {
                list[i] = ((SuggestWord) sugqueue.Pop()).string_Renamed;
            }
			
            searcher.Close();
            return list;
        }
			public virtual Explanation Explain(IndexReader reader, int doc)
			{
				
				Explanation result = new Explanation();
				result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
				
				System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
				System.Text.StringBuilder query = new System.Text.StringBuilder();
				query.Append('\"');
				for (int i = 0; i < Enclosing_Instance.terms.Count; i++)
				{
					if (i != 0)
					{
						docFreqs.Append(" ");
						query.Append(" ");
					}
					
					Term term = (Term) Enclosing_Instance.terms[i];
					
					docFreqs.Append(term.Text());
					docFreqs.Append("=");
					docFreqs.Append(reader.DocFreq(term));
					
					query.Append(term.Text());
				}
				query.Append('\"');
				
				Explanation idfExpl = new Explanation(idf, "idf(" + Enclosing_Instance.field + ": " + docFreqs + ")");
				
				// explain query weight
				Explanation queryExpl = new Explanation();
				queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");
				
				Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
				if (Enclosing_Instance.GetBoost() != 1.0f)
					queryExpl.AddDetail(boostExpl);
				queryExpl.AddDetail(idfExpl);
				
				Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
				queryExpl.AddDetail(queryNormExpl);
				
				queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());
				
				result.AddDetail(queryExpl);
				
				// explain field weight
				Explanation fieldExpl = new Explanation();
				fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.field + ":" + query + " in " + doc + "), product of:");
				
				Explanation tfExpl = Scorer(reader).Explain(doc);
				fieldExpl.AddDetail(tfExpl);
				fieldExpl.AddDetail(idfExpl);
				
				Explanation fieldNormExpl = new Explanation();
				byte[] fieldNorms = reader.Norms(Enclosing_Instance.field);
				float fieldNorm = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;
				fieldNormExpl.SetValue(fieldNorm);
				fieldNormExpl.SetDescription("fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")");
				fieldExpl.AddDetail(fieldNormExpl);
				
				fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());
				
				result.AddDetail(fieldExpl);
				
				// combine them
				result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());
				
				if (queryExpl.GetValue() == 1.0f)
					return fieldExpl;
				
				return result;
			}
Example #16
			public virtual Explanation Explain(IndexReader reader, int doc)
			{
				
				ComplexExplanation result = new ComplexExplanation();
				result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
				
				Explanation idfExpl = new Explanation(idf, "idf(docFreq=" + reader.DocFreq(Enclosing_Instance.term) + ", numDocs=" + reader.NumDocs() + ")");
				
				// explain query weight
				Explanation queryExpl = new Explanation();
				queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");
				
				Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
				if (Enclosing_Instance.GetBoost() != 1.0f)
					queryExpl.AddDetail(boostExpl);
				queryExpl.AddDetail(idfExpl);
				
				Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
				queryExpl.AddDetail(queryNormExpl);
				
				queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());
				
				result.AddDetail(queryExpl);
				
				// explain field weight
				System.String field = Enclosing_Instance.term.Field();
				ComplexExplanation fieldExpl = new ComplexExplanation();
				fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.term + " in " + doc + "), product of:");
				
				Explanation tfExpl = Scorer(reader).Explain(doc);
				fieldExpl.AddDetail(tfExpl);
				fieldExpl.AddDetail(idfExpl);
				
				Explanation fieldNormExpl = new Explanation();
				byte[] fieldNorms = reader.Norms(field);
				float fieldNorm = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;
				fieldNormExpl.SetValue(fieldNorm);
				fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
				fieldExpl.AddDetail(fieldNormExpl);
				
				fieldExpl.SetMatch(tfExpl.IsMatch());
				fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());
				
				result.AddDetail(fieldExpl);
				System.Boolean tempAux = fieldExpl.GetMatch();
				result.SetMatch(tempAux);
				
				// combine them
				result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());
				
				if (queryExpl.GetValue() == 1.0f)
					return fieldExpl;
				
				return result;
			}
Example #17
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="numSug">int the number of suggested words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null; see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggested words that are more frequent than the searched word
        /// (only in restricted mode, i.e. indexReader != null and field != null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of suggested words, ordered by these two criteria:
        /// first, the edit distance; second (only in restricted mode), the popularity
        /// of the suggested word in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
        {    // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();

            try
            {
                float min        = this.minScore;
                int   lengthWord = word.Length;

                int freq     = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (!morePopular && freq > 0)
                {
                    return(new String[] { word });
                }

                BooleanQuery query = new BooleanQuery();
                String[]     grams;
                String       key;

                for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {
                    key = "gram" + ng;           // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0)
                    {                                               // should we boost prefixes?
                        Add(query, "start" + ng, grams[0], bStart); // matches start of word
                    }
                    if (bEnd > 0)
                    {                                                          // should we boost suffixes
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).scoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int         stop    = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {
                    sugWord.string_Renamed = indexSearcher.Doc(hits[i].doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if (sugWord.string_Renamed.Equals(word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.string_Renamed);
                    if (sugWord.score < min)
                    {
                        continue;
                    }

                    if (ir != null && field != null)
                    {                                                                       // use the user index
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.string_Renamed)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }
                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        min = ((SuggestWord)sugQueue.Top()).score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                String[] list = new String[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = ((SuggestWord)sugQueue.Pop()).string_Renamed;
                }

                return(list);
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }