// Builds a 37-document index in a MockFSDirectory, then reads, deletes and
// searches it while calling dir.tweakBufferSizes() between the steps; the
// expected document frequencies and hit counts must hold throughout.
public virtual void TestSetBufferSize()
{
    System.IO.FileInfo indexDir = new System.IO.FileInfo(System.IO.Path.Combine(SupportClass.AppSettings.Get("tempDir", ""), "testSetBufferSize"));
    MockFSDirectory dir = new MockFSDirectory(indexDir, NewRandom());
    try
    {
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        writer.SetUseCompoundFile(false);
        for (int i = 0; i < 37; i++)
        {
            Document doc = new Document();
            doc.Add(new Field("content", "aaa bbb ccc ddd" + i, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("id", "" + i, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }
        writer.Close();

        dir.allIndexInputs.Clear();

        IndexReader reader = IndexReader.Open(dir);
        try
        {
            Term aaa = new Term("content", "aaa");
            Term bbb = new Term("content", "bbb");
            Term ccc = new Term("content", "ccc");

            // The test expects DocFreq to keep reporting 37 after the deletions below.
            Assert.AreEqual(37, reader.DocFreq(ccc));
            reader.DeleteDocument(0);
            Assert.AreEqual(37, reader.DocFreq(aaa));
            dir.tweakBufferSizes();
            reader.DeleteDocument(4);
            // Expected value goes first so a failure message reads correctly.
            Assert.AreEqual(37, reader.DocFreq(bbb));
            dir.tweakBufferSizes();

            // Two of the 37 documents were deleted, so searches see 35.
            IndexSearcher searcher = new IndexSearcher(reader);
            ScoreDoc[] hits = searcher.Search(new TermQuery(bbb), null, 1000).scoreDocs;
            dir.tweakBufferSizes();
            Assert.AreEqual(35, hits.Length);
            dir.tweakBufferSizes();
            hits = searcher.Search(new TermQuery(new Term("id", "33")), null, 1000).scoreDocs;
            dir.tweakBufferSizes();
            Assert.AreEqual(1, hits.Length);
            hits = searcher.Search(new TermQuery(aaa), null, 1000).scoreDocs;
            dir.tweakBufferSizes();
            Assert.AreEqual(35, hits.Length);
            searcher.Close();
        }
        finally
        {
            // Close the reader even when an assertion fails, so the index
            // directory is not removed while the reader is still open.
            reader.Close();
        }
    }
    finally
    {
        _TestUtil.RmDir(indexDir);
    }
}
/// <summary> Rewrites <paramref name="query"/> into a constant-score query. Terms are
/// enumerated until either a cutoff is reached (a filter-based query is returned) or the
/// enumeration ends (a BooleanQuery over the collected terms is wrapped and returned). </summary>
/// <param name="reader">reader used for <c>MaxDoc()</c> and per-term <c>DocFreq</c> lookups</param>
/// <param name="query">the multi-term query being rewritten; its boost is copied to the result</param>
/// <returns> a <c>ConstantScoreQuery</c> built with one of the two strategies </returns>
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
    System.Collections.ArrayList collected = new System.Collections.ArrayList();

    // Document cutoff is docCountPercent percent of reader.MaxDoc(); term cutoff is the
    // smaller of BooleanQuery's max clause count and termCountCutoff.
    int maxDocsToVisit = (int) ((docCountPercent / 100.0) * reader.MaxDoc());
    int maxTerms = System.Math.Min(BooleanQuery.GetMaxClauseCount(), termCountCutoff);
    int docsVisited = 0;

    FilteredTermEnum termEnum = query.GetEnum(reader);
    try
    {
        for (;;)
        {
            Term current = termEnum.Term();
            if (current != null)
            {
                collected.Add(current);
                // Loading the TermInfo from the terms dict here
                // should not be costly, because 1) the
                // query/filter will load the TermInfo when it
                // runs, and 2) the terms dict has a cache:
                docsVisited += reader.DocFreq(current);
            }

            bool cutoffReached = collected.Count >= maxTerms || docsVisited >= maxDocsToVisit;
            if (cutoffReached)
            {
                // Too many terms -- make a filter.
                Query filterBased = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
                filterBased.SetBoost(query.GetBoost());
                return filterBased;
            }

            if (termEnum.Next())
            {
                continue;
            }

            // Enumeration is done and both counts stayed below their cutoffs:
            // build a BooleanQuery from the collected terms.
            BooleanQuery disjunction = new BooleanQuery(true);
            foreach (Term term in collected)
            {
                disjunction.Add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }

            // Strip scores
            Query constantScore = new ConstantScoreQuery(new QueryWrapperFilter(disjunction));
            constantScore.SetBoost(query.GetBoost());
            query.IncTotalNumberOfTerms(collected.Count);
            return constantScore;
        }
    }
    finally
    {
        termEnum.Close();
    }
}
/// <summary> Tells whether <paramref name="word"/> is present in the spell index.
/// The reader is opened from <c>spellindex</c> on first use and kept in <c>reader</c>.</summary>
/// <param name="word">the word to look up under the <c>F_WORD</c> field
/// </param>
/// <throws> IOException </throws>
/// <returns> true iff the reader reports a document frequency above zero for the word
/// </returns>
public virtual bool Exist(System.String word)
{
    if (reader == null)
    {
        reader = IndexReader.Open(spellindex);
    }

    Term wordTerm = new Term(F_WORD, word);
    int occurrences = reader.DocFreq(wordTerm);
    return occurrences > 0;
}
/// <summary> Extracts the term texts of a Query into an array of WeightedTerms and scales
/// each term's weight by an IDF factor computed from <paramref name="reader"/>.
/// </summary>
/// <param name="query"> Query to extract term texts from
/// </param>
/// <param name="reader">used to compute IDF which can be used to a) score selected fragments better
/// b) use graded highlights eg changing intensity of font color
/// </param>
/// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
/// </param>
/// <returns> an array of the terms used in a query, plus their weights.
/// </returns>
public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, System.String fieldName)
{
    WeightedTerm[] weighted = GetTerms(query, false, fieldName);
    int numDocs = reader.NumDocs();

    int index = 0;
    while (index < weighted.Length)
    {
        try
        {
            Term lookup = new Term(fieldName, weighted[index].term);
            int df = reader.DocFreq(lookup);
            //IDF algorithm taken from DefaultSimilarity class
            double ratio = (float) numDocs / (double) (df + 1);
            float idf = (float) (System.Math.Log(ratio) + 1.0);
            weighted[index].weight *= idf;
        }
        catch (System.IO.IOException)
        {
            //ignore: the term keeps its unscaled weight
        }
        index++;
    }

    return weighted;
}
/// <summary> Collects the terms of a Query as WeightedTerms and multiplies every weight
/// by an inverse-document-frequency factor.
/// </summary>
/// <param name="query"> Query to extract term texts from
/// </param>
/// <param name="reader">supplies <c>NumDocs()</c> and per-term <c>DocFreq</c> for the IDF factor
/// </param>
/// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based
/// </param>
/// <returns> an array of the terms used in a query, plus their weights.
/// </returns>
public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, System.String fieldName)
{
    WeightedTerm[] result = GetTerms(query, false, fieldName);
    int documentTotal = reader.NumDocs();

    for (int pos = 0; pos < result.Length; pos++)
    {
        try
        {
            int matchingDocs = reader.DocFreq(new Term(fieldName, result[pos].term));
            //IDF algorithm taken from DefaultSimilarity class
            double logArgument = (float) documentTotal / (double) (matchingDocs + 1);
            double rawIdf = System.Math.Log(logArgument) + 1.0;
            float idf = (float) rawIdf;
            result[pos].weight *= idf;
        }
        catch (System.IO.IOException)
        {
            //ignore: leave this term's weight as extracted
        }
    }

    return result;
}
/// <summary> Builds the score explanation for document <paramref name="doc"/>: a query-weight
/// part (boost, idf, queryNorm) multiplied by a field-weight part (scorer explanation,
/// idf, fieldNorm). </summary>
/// <param name="reader">reader supplying document frequencies and field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or only the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    Explanation result = new Explanation();
    result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
    System.String field = ((SpanQuery) GetQuery()).GetField();

    // Build "text=docFreq" pairs separated by single spaces. The separator is written
    // before every entry except the first; calling MoveNext() a second time inside the
    // loop (as a stand-in for Java's hasNext()) would advance the enumerator and skip
    // every other term.
    System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
    bool firstTerm = true;
    System.Collections.IEnumerator i = terms.GetEnumerator();
    while (i.MoveNext())
    {
        if (!firstTerm)
        {
            docFreqs.Append(" ");
        }
        firstTerm = false;

        System.Collections.DictionaryEntry tmp = (System.Collections.DictionaryEntry) i.Current;
        Term term = (Term) tmp.Key;
        docFreqs.Append(term.Text());
        docFreqs.Append("=");
        docFreqs.Append(reader.DocFreq(term));
    }

    Explanation idfExpl = new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");

    // explain query weight
    Explanation queryExpl = new Explanation();
    queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(GetQuery().GetBoost(), "boost");
    if (GetQuery().GetBoost() != 1.0f)
    {
        // a boost of exactly 1.0 is left out of the details
        queryExpl.AddDetail(boostExpl);
    }
    queryExpl.AddDetail(idfExpl);

    Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
    queryExpl.AddDetail(queryNormExpl);

    queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

    result.AddDetail(queryExpl);

    // explain field weight
    Explanation fieldExpl = new Explanation();
    fieldExpl.SetDescription("fieldWeight(" + field + ":" + query.ToString(field) + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldExpl.AddDetail(tfExpl);
    fieldExpl.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] fieldNorms = reader.Norms(field);
    // no norms for the field => norm factor of 0
    float fieldNorm = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;
    fieldNormExpl.SetValue(fieldNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
    fieldExpl.AddDetail(fieldNormExpl);

    fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

    result.AddDetail(fieldExpl);

    // combine them
    result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

    if (queryExpl.GetValue() == 1.0f)
    {
        return fieldExpl;
    }

    return result;
}
/// <summary> Explains the score of document <paramref name="doc"/> for the enclosing term
/// query as the product of a query-weight part and a field-weight part. </summary>
/// <param name="reader">reader supplying DocFreq, NumDocs and the field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or just the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    ComplexExplanation combined = new ComplexExplanation();
    combined.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

    Explanation idfExpl = new Explanation(idf, "idf(docFreq=" + reader.DocFreq(Enclosing_Instance.term) + ", numDocs=" + reader.NumDocs() + ")");

    // ---- query weight: boost * idf * queryNorm ----
    Explanation queryPart = new Explanation();
    queryPart.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
    bool boostIsNeutral = Enclosing_Instance.GetBoost() == 1.0f;
    if (!boostIsNeutral)
    {
        queryPart.AddDetail(boostExpl);
    }
    queryPart.AddDetail(idfExpl);

    Explanation normExpl = new Explanation(queryNorm, "queryNorm");
    queryPart.AddDetail(normExpl);

    queryPart.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * normExpl.GetValue());
    combined.AddDetail(queryPart);

    // ---- field weight: scorer explanation * idf * fieldNorm ----
    System.String field = Enclosing_Instance.term.Field();
    ComplexExplanation fieldPart = new ComplexExplanation();
    fieldPart.SetDescription("fieldWeight(" + Enclosing_Instance.term + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldPart.AddDetail(tfExpl);
    fieldPart.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] norms = reader.Norms(field);
    float decodedNorm = 0.0f;
    if (norms != null)
    {
        decodedNorm = Similarity.DecodeNorm(norms[doc]);
    }
    fieldNormExpl.SetValue(decodedNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
    fieldPart.AddDetail(fieldNormExpl);

    fieldPart.SetMatch(tfExpl.IsMatch());
    fieldPart.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

    combined.AddDetail(fieldPart);
    System.Boolean fieldMatched = fieldPart.GetMatch();
    combined.SetMatch(fieldMatched);

    // ---- combine ----
    combined.SetValue(queryPart.GetValue() * fieldPart.GetValue());

    if (queryPart.GetValue() == 1.0f)
    {
        return fieldPart;
    }
    return combined;
}
/// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
/// <param name="word">String the word you want a spell check done on
/// </param>
/// <param name="num_sug">int the number of suggest words
/// </param>
/// <param name="ir">the indexReader of the user index (can be null see field param)
/// </param>
/// <param name="field">String the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field.
/// </param>
/// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
/// (only if restricted mode = (indexReader!=null and field!=null)
/// </param>
/// <throws> IOException </throws>
/// <returns> String[] the sorted list of the suggest words with this 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index.
/// When <paramref name="morePopular"/> is false and the word itself occurs in the given
/// field of the user index, the word alone is returned.
/// </returns>
public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
{
    float min = this.minScore;
    TRStringDistance sd = new TRStringDistance(word);
    int lengthWord = word.Length;

    // Restricted mode needs both a user index and a field; without the field there is
    // nothing to look terms up in.
    bool restricted = ir != null && field != null;

    // Look the word's frequency up regardless of morePopular: tying the lookup to
    // morePopular made the early return below unreachable.
    int freq = restricted ? ir.DocFreq(new Term(field, word)) : 0;
    int goalFreq = morePopular ? freq : 0;
    if (!morePopular && freq > 0)
    {
        return new System.String[] { word }; // return the word if it exist in the index and i don't want a more popular word
    }

    BooleanQuery query = new BooleanQuery();
    System.String[] grams;
    System.String key;

    for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
    {
        key = "gram" + ng; // form key

        grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

        if (grams.Length == 0)
        {
            continue; // hmm
        }

        if (bStart > 0)
        {
            // should we boost prefixes?
            Add(query, "start" + ng, grams[0], bStart); // matches start of word
        }
        if (bEnd > 0)
        {
            // should we boost suffixes
            Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
        }
        for (int i = 0; i < grams.Length; i++)
        {
            Add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    try
    {
        Hits hits = searcher.Search(query);
        SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

        int stop = System.Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
        SuggestWord sugword = new SuggestWord();
        for (int i = 0; i < stop; i++)
        {
            sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word

            if (sugword.string_Renamed.Equals(word))
            {
                continue; // don't suggest a word for itself, that would be silly
            }

            //edit distance/normalize with the min word length
            sugword.score = 1.0f - ((float) sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
            if (sugword.score < min)
            {
                continue;
            }

            if (restricted)
            {
                // use the user index
                sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                {
                    // don't suggest a word that is not present in the field
                    continue;
                }
            }

            sugqueue.Insert(sugword);
            if (sugqueue.Size() == num_sug)
            {
                //if queue full , maintain the min score
                min = ((SuggestWord) sugqueue.Top()).score;
            }
            // the inserted instance now belongs to the queue; start a fresh one
            sugword = new SuggestWord();
        }

        // convert to array string
        System.String[] list = new System.String[sugqueue.Size()];
        for (int i = sugqueue.Size() - 1; i >= 0; i--)
        {
            list[i] = ((SuggestWord) sugqueue.Pop()).string_Renamed;
        }

        return list;
    }
    finally
    {
        // close the searcher on every path, including exceptions
        searcher.Close();
    }
}
/// <summary> Chooses between two constant-score rewrites of <paramref name="query"/> by
/// visiting its terms: if the term-count or visited-document cutoff is reached a
/// filter-based query is returned, otherwise a BooleanQuery over the visited terms is
/// wrapped in a constant-score query. </summary>
/// <param name="reader">reader used for <c>MaxDoc()</c> and <c>DocFreq</c></param>
/// <param name="query">the multi-term query to rewrite</param>
/// <returns> the rewritten query carrying the boost of <paramref name="query"/> </returns>
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
    System.Collections.ArrayList seenTerms = new System.Collections.ArrayList();
    int docLimit = (int) ((docCountPercent / 100.0) * reader.MaxDoc());
    int termLimit = System.Math.Min(BooleanQuery.GetMaxClauseCount(), termCountCutoff);
    int visitedDocs = 0;

    FilteredTermEnum terms = query.GetEnum(reader);
    try
    {
        while (true)
        {
            Term t = terms.Term();
            if (t != null)
            {
                seenTerms.Add(t);
                // Loading the TermInfo from the terms dict here
                // should not be costly, because 1) the
                // query/filter will load the TermInfo when it
                // runs, and 2) the terms dict has a cache:
                visitedDocs += reader.DocFreq(t);
            }

            bool belowCutoffs = seenTerms.Count < termLimit && visitedDocs < docLimit;
            if (belowCutoffs && terms.Next())
            {
                // more terms to visit and no cutoff reached yet
                continue;
            }

            Query rewritten;
            if (!belowCutoffs)
            {
                // Too many terms -- make a filter.
                rewritten = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
                rewritten.SetBoost(query.GetBoost());
            }
            else
            {
                // Enumeration is done, and we hit a small
                // enough number of terms & docs -- just make a
                // BooleanQuery, now
                BooleanQuery bq = new BooleanQuery(true);
                for (int k = 0; k < seenTerms.Count; k++)
                {
                    bq.Add(new TermQuery((Term) seenTerms[k]), BooleanClause.Occur.SHOULD);
                }
                // Strip scores
                rewritten = new ConstantScoreQuery(new QueryWrapperFilter(bq));
                rewritten.SetBoost(query.GetBoost());
                query.IncTotalNumberOfTerms(seenTerms.Count);
            }
            return rewritten;
        }
    }
    finally
    {
        terms.Close();
    }
}
/// <summary> Builds the score explanation for document <paramref name="doc"/>: a query-weight
/// part (boost, idf, queryNorm) multiplied by a field-weight part (scorer explanation,
/// idf, fieldNorm). </summary>
/// <param name="reader">reader supplying document frequencies and field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or only the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    Explanation result = new Explanation();
    result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");
    System.String field = ((SpanQuery) GetQuery()).GetField();

    // Build "text=docFreq" pairs separated by single spaces. The separator is written
    // before every term except the first; calling MoveNext() a second time inside the
    // loop (as a stand-in for Java's hasNext()) would advance the enumerator and skip
    // every other term.
    System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
    bool firstTerm = true;
    System.Collections.IEnumerator i = terms.GetEnumerator();
    while (i.MoveNext())
    {
        if (!firstTerm)
        {
            docFreqs.Append(" ");
        }
        firstTerm = false;

        Term term = (Term) i.Current;
        docFreqs.Append(term.Text());
        docFreqs.Append("=");
        docFreqs.Append(reader.DocFreq(term));
    }

    Explanation idfExpl = new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");

    // explain query weight
    Explanation queryExpl = new Explanation();
    queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(GetQuery().GetBoost(), "boost");
    if (GetQuery().GetBoost() != 1.0f)
    {
        // a boost of exactly 1.0 is left out of the details
        queryExpl.AddDetail(boostExpl);
    }
    queryExpl.AddDetail(idfExpl);

    Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
    queryExpl.AddDetail(queryNormExpl);

    queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue());

    result.AddDetail(queryExpl);

    // explain field weight
    Explanation fieldExpl = new Explanation();
    fieldExpl.SetDescription("fieldWeight(" + field + ":" + query.ToString(field) + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldExpl.AddDetail(tfExpl);
    fieldExpl.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] fieldNorms = reader.Norms(field);
    // no norms for the field => norm factor of 0
    float fieldNorm = fieldNorms != null ? Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f;
    fieldNormExpl.SetValue(fieldNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
    fieldExpl.AddDetail(fieldNormExpl);

    fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());

    result.AddDetail(fieldExpl);

    // combine them
    result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

    if (queryExpl.GetValue() == 1.0f)
    {
        return fieldExpl;
    }

    return result;
}
// inherit javadoc
// Forwards the document-frequency lookup for the given term to the wrapped reader.
public override int DocFreq(Term term)
{
    int frequency = reader.DocFreq(term);
    return frequency;
}
/// <summary> Explains the score of document <paramref name="doc"/> for the enclosing phrase
/// query as the product of a query-weight part and a field-weight part. </summary>
/// <param name="reader">reader supplying per-term DocFreq and the field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or just the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    Explanation combined = new Explanation();
    combined.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

    // docFreqs collects "text=docFreq" pairs; phraseText collects the quoted phrase.
    // Both use a single space between consecutive terms.
    System.Text.StringBuilder docFreqs = new System.Text.StringBuilder();
    System.Text.StringBuilder phraseText = new System.Text.StringBuilder();
    phraseText.Append('\"');
    System.String separator = "";
    for (int idx = 0; idx < Enclosing_Instance.terms.Count; idx++)
    {
        docFreqs.Append(separator);
        phraseText.Append(separator);
        separator = " ";

        Term term = (Term) Enclosing_Instance.terms[idx];
        System.String text = term.Text();
        docFreqs.Append(text);
        docFreqs.Append("=");
        docFreqs.Append(reader.DocFreq(term));

        phraseText.Append(term.Text());
    }
    phraseText.Append('\"');

    Explanation idfExpl = new Explanation(idf, "idf(" + Enclosing_Instance.field + ": " + docFreqs + ")");

    // ---- query weight: boost * idf * queryNorm ----
    Explanation queryPart = new Explanation();
    queryPart.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
    bool boostIsNeutral = Enclosing_Instance.GetBoost() == 1.0f;
    if (!boostIsNeutral)
    {
        queryPart.AddDetail(boostExpl);
    }
    queryPart.AddDetail(idfExpl);

    Explanation normExpl = new Explanation(queryNorm, "queryNorm");
    queryPart.AddDetail(normExpl);

    queryPart.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * normExpl.GetValue());
    combined.AddDetail(queryPart);

    // ---- field weight: scorer explanation * idf * fieldNorm ----
    Explanation fieldPart = new Explanation();
    fieldPart.SetDescription("fieldWeight(" + Enclosing_Instance.field + ":" + phraseText + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldPart.AddDetail(tfExpl);
    fieldPart.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] norms = reader.Norms(Enclosing_Instance.field);
    float decodedNorm = 0.0f;
    if (norms != null)
    {
        decodedNorm = Similarity.DecodeNorm(norms[doc]);
    }
    fieldNormExpl.SetValue(decodedNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")");
    fieldPart.AddDetail(fieldNormExpl);

    fieldPart.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue());
    combined.AddDetail(fieldPart);

    // ---- combine ----
    combined.SetValue(queryPart.GetValue() * fieldPart.GetValue());

    return queryPart.GetValue() == 1.0f ? fieldPart : combined;
}
/// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
/// <param name="word">String the word you want a spell check done on
/// </param>
/// <param name="numSug">int the number of suggest words
/// </param>
/// <param name="ir">the indexReader of the user index (can be null see field param)
/// </param>
/// <param name="field">String the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field.
/// </param>
/// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
/// (only if restricted mode = (indexReader!=null and field!=null)
/// </param>
/// <throws> IOException </throws>
/// <returns> String[] the sorted list of the suggest words with this 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index
/// </returns>
public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
{
    // obtainSearcher calls ensureOpen
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        float scoreFloor = this.minScore;
        int lengthWord = word.Length;

        // "restricted mode": a user index and a field were both supplied
        bool restricted = ir != null && field != null;

        int wordFreq = 0;
        if (restricted)
        {
            wordFreq = ir.DocFreq(new Term(field, word));
        }
        // wordFreq is already 0 outside restricted mode
        int goalFreq = morePopular ? wordFreq : 0;

        // if the word exists in the real index and we don't care for word frequency, return the word itself
        if (wordFreq > 0 && !morePopular)
        {
            return new String[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        HashSet<string> alreadySeen = new HashSet<string>();

        for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            String key = "gram" + ng; // form key
            String[] grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length > 0)
            {
                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                foreach (String gram in grams)
                {
                    Add(query, key, gram);
                }
            }
        }

        int maxHits = 10 * numSug;
        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

        // go thru more than 'maxr' matches in case the distance filter triggers
        int stop = System.Math.Min(hits.Length, maxHits);

        // One SuggestWord is reused across rejected candidates and replaced only after
        // it has been handed to the queue.
        SuggestWord candidate = new SuggestWord();
        for (int h = 0; h < stop; h++)
        {
            candidate.termString = indexSearcher.Doc(hits[h].Doc).Get(F_WORD); // get orig word

            // don't suggest a word for itself, that would be silly
            if (candidate.termString.Equals(word))
            {
                continue;
            }

            // edit distance
            candidate.score = sd.GetDistance(word, candidate.termString);
            if (candidate.score < scoreFloor)
            {
                continue;
            }

            if (restricted)
            {
                // use the user index
                candidate.freq = ir.DocFreq(new Term(field, candidate.termString)); // freq in the index
                // don't suggest a word that is not present in the field
                bool lessPopularThanGoal = morePopular && goalFreq > candidate.freq;
                if (lessPopularThanGoal || candidate.freq < 1)
                {
                    continue;
                }
            }

            if (!alreadySeen.Add(candidate.termString))
            {
                // we already seen this word, no point returning it twice
                continue;
            }

            sugQueue.InsertWithOverflow(candidate);
            if (sugQueue.Size() == numSug)
            {
                // if queue full, maintain the minScore score
                scoreFloor = ((SuggestWord) sugQueue.Top()).score;
            }
            candidate = new SuggestWord();
        }

        // convert to array string, filling from the last slot towards the first
        String[] list = new String[sugQueue.Size()];
        int slot = list.Length;
        while (slot > 0)
        {
            slot--;
            list[slot] = ((SuggestWord) sugQueue.Pop()).termString;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}
/// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
/// <param name="word">String the word you want a spell check done on
/// </param>
/// <param name="num_sug">int the number of suggest words
/// </param>
/// <param name="ir">the indexReader of the user index (can be null see field param)
/// </param>
/// <param name="field">String the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field.
/// </param>
/// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
/// (only if restricted mode = (indexReader!=null and field!=null)
/// </param>
/// <throws> IOException </throws>
/// <returns> String[] the sorted list of the suggest words with this 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index.
/// When <paramref name="morePopular"/> is false and the word itself occurs in the given
/// field of the user index, the word alone is returned.
/// </returns>
public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
{
    float min = this.minScore;
    TRStringDistance sd = new TRStringDistance(word);
    int lengthWord = word.Length;

    // Restricted mode requires a user index AND a field to look terms up in.
    bool restricted = ir != null && field != null;

    // The word's own frequency is needed even when morePopular is false; computing it
    // only when morePopular was true left the early return below dead.
    int freq = restricted ? ir.DocFreq(new Term(field, word)) : 0;
    int goalFreq = morePopular ? freq : 0;
    if (!morePopular && freq > 0)
    {
        return new System.String[] { word }; // return the word if it exist in the index and i don't want a more popular word
    }

    BooleanQuery query = new BooleanQuery();
    System.String[] grams;
    System.String key;

    for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
    {
        key = "gram" + ng; // form key

        grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

        if (grams.Length == 0)
        {
            continue; // hmm
        }

        if (bStart > 0)
        {
            // should we boost prefixes?
            Add(query, "start" + ng, grams[0], bStart); // matches start of word
        }
        if (bEnd > 0)
        {
            // should we boost suffixes
            Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
        }
        for (int i = 0; i < grams.Length; i++)
        {
            Add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    try
    {
        Hits hits = searcher.Search(query);
        SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

        int stop = System.Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
        SuggestWord sugword = new SuggestWord();
        for (int i = 0; i < stop; i++)
        {
            sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word

            if (sugword.string_Renamed.Equals(word))
            {
                continue; // don't suggest a word for itself, that would be silly
            }

            //edit distance/normalize with the min word length
            sugword.score = 1.0f - ((float) sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
            if (sugword.score < min)
            {
                continue;
            }

            if (restricted)
            {
                // use the user index
                sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                {
                    // don't suggest a word that is not present in the field
                    continue;
                }
            }

            sugqueue.Insert(sugword);
            if (sugqueue.Size() == num_sug)
            {
                //if queue full , maintain the min score
                min = ((SuggestWord) sugqueue.Top()).score;
            }
            // the queue owns the inserted instance; continue with a new one
            sugword = new SuggestWord();
        }

        // convert to array string
        System.String[] list = new System.String[sugqueue.Size()];
        for (int i = sugqueue.Size() - 1; i >= 0; i--)
        {
            list[i] = ((SuggestWord) sugqueue.Pop()).string_Renamed;
        }

        return list;
    }
    finally
    {
        // release the searcher whether or not the search succeeded
        searcher.Close();
    }
}
/// <summary> Produces the scoring explanation of the enclosing phrase query for document
/// <paramref name="doc"/>: query weight (boost, idf, queryNorm) times field weight
/// (scorer explanation, idf, fieldNorm). </summary>
/// <param name="reader">reader supplying per-term DocFreq and the field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or just the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    Explanation result = new Explanation();
    result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

    // Assemble the "text=docFreq" list and the quoted phrase text in one pass.
    System.Text.StringBuilder freqList = new System.Text.StringBuilder();
    System.Text.StringBuilder quotedPhrase = new System.Text.StringBuilder();
    quotedPhrase.Append('\"');
    int position = 0;
    while (position < Enclosing_Instance.terms.Count)
    {
        bool needsSeparator = position > 0;
        if (needsSeparator)
        {
            freqList.Append(" ");
            quotedPhrase.Append(" ");
        }

        Term term = (Term) Enclosing_Instance.terms[position];

        freqList.Append(term.Text());
        freqList.Append("=");
        freqList.Append(reader.DocFreq(term));

        quotedPhrase.Append(term.Text());
        position++;
    }
    quotedPhrase.Append('\"');

    Explanation idfExpl = new Explanation(idf, "idf(" + Enclosing_Instance.field + ": " + freqList + ")");

    // explain query weight
    Explanation queryExpl = new Explanation();
    queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
    if (!(Enclosing_Instance.GetBoost() == 1.0f))
    {
        queryExpl.AddDetail(boostExpl);
    }
    queryExpl.AddDetail(idfExpl);

    Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
    queryExpl.AddDetail(queryNormExpl);

    float queryWeight = boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue();
    queryExpl.SetValue(queryWeight);

    result.AddDetail(queryExpl);

    // explain field weight
    Explanation fieldExpl = new Explanation();
    fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.field + ":" + quotedPhrase + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldExpl.AddDetail(tfExpl);
    fieldExpl.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] fieldNorms = reader.Norms(Enclosing_Instance.field);
    float fieldNorm;
    if (fieldNorms == null)
    {
        fieldNorm = 0.0f;
    }
    else
    {
        fieldNorm = Similarity.DecodeNorm(fieldNorms[doc]);
    }
    fieldNormExpl.SetValue(fieldNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")");
    fieldExpl.AddDetail(fieldNormExpl);

    float fieldWeight = tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue();
    fieldExpl.SetValue(fieldWeight);

    result.AddDetail(fieldExpl);

    // combine them
    result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

    if (queryExpl.GetValue() == 1.0f)
    {
        return fieldExpl;
    }
    return result;
}
/// <summary> Produces the scoring explanation of the enclosing term query for document
/// <paramref name="doc"/>: query weight (boost, idf, queryNorm) times field weight
/// (scorer explanation, idf, fieldNorm). </summary>
/// <param name="reader">reader supplying DocFreq, NumDocs and the field norms</param>
/// <param name="doc">number of the document being explained</param>
/// <returns> the combined explanation, or just the field-weight explanation when the
/// query weight equals 1.0 </returns>
public virtual Explanation Explain(IndexReader reader, int doc)
{
    ComplexExplanation result = new ComplexExplanation();
    result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:");

    Explanation idfExpl = new Explanation(idf, "idf(docFreq=" + reader.DocFreq(Enclosing_Instance.term) + ", numDocs=" + reader.NumDocs() + ")");

    // explain query weight
    Explanation queryExpl = new Explanation();
    queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:");

    Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost");
    if (!(Enclosing_Instance.GetBoost() == 1.0f))
    {
        queryExpl.AddDetail(boostExpl);
    }
    queryExpl.AddDetail(idfExpl);

    Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
    queryExpl.AddDetail(queryNormExpl);

    float queryWeight = boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue();
    queryExpl.SetValue(queryWeight);

    result.AddDetail(queryExpl);

    // explain field weight
    System.String field = Enclosing_Instance.term.Field();
    ComplexExplanation fieldExpl = new ComplexExplanation();
    fieldExpl.SetDescription("fieldWeight(" + Enclosing_Instance.term + " in " + doc + "), product of:");

    Explanation tfExpl = Scorer(reader).Explain(doc);
    fieldExpl.AddDetail(tfExpl);
    fieldExpl.AddDetail(idfExpl);

    Explanation fieldNormExpl = new Explanation();
    byte[] fieldNorms = reader.Norms(field);
    float fieldNorm;
    if (fieldNorms == null)
    {
        fieldNorm = 0.0f;
    }
    else
    {
        fieldNorm = Similarity.DecodeNorm(fieldNorms[doc]);
    }
    fieldNormExpl.SetValue(fieldNorm);
    fieldNormExpl.SetDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
    fieldExpl.AddDetail(fieldNormExpl);

    fieldExpl.SetMatch(tfExpl.IsMatch());
    float fieldWeight = tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue();
    fieldExpl.SetValue(fieldWeight);

    result.AddDetail(fieldExpl);
    // the overall match flag is taken from the field explanation
    System.Boolean matchFlag = fieldExpl.GetMatch();
    result.SetMatch(matchFlag);

    // combine them
    result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue());

    bool queryWeightIsOne = queryExpl.GetValue() == 1.0f;
    if (queryWeightIsOne)
    {
        return fieldExpl;
    }
    return result;
}
/// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
/// <param name="word">String the word you want a spell check done on
/// </param>
/// <param name="numSug">int the number of suggest words
/// </param>
/// <param name="ir">the indexReader of the user index (can be null see field param)
/// </param>
/// <param name="field">String the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field.
/// </param>
/// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
/// (only if restricted mode = (indexReader!=null and field!=null)
/// </param>
/// <throws> IOException </throws>
/// <returns> String[] the sorted list of the suggest words with this 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index
/// </returns>
public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
{
    // obtainSearcher calls ensureOpen
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        float scoreFloor = this.minScore;
        int lengthWord = word.Length;

        // "restricted mode": a user index and a field were both supplied
        bool restricted = ir != null && field != null;

        int wordFreq = 0;
        if (restricted)
        {
            wordFreq = ir.DocFreq(new Term(field, word));
        }
        // wordFreq is already 0 outside restricted mode
        int goalFreq = morePopular ? wordFreq : 0;

        // if the word exists in the real index and we don't care for word frequency, return the word itself
        if (wordFreq > 0 && !morePopular)
        {
            return new String[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            String key = "gram" + ng; // form key
            String[] grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length > 0)
            {
                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                foreach (String gram in grams)
                {
                    Add(query, key, gram);
                }
            }
        }

        int maxHits = 10 * numSug;
        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).scoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

        // go thru more than 'maxr' matches in case the distance filter triggers
        int stop = System.Math.Min(hits.Length, maxHits);

        // One SuggestWord is reused across rejected candidates and replaced only after
        // it has been handed to the queue.
        SuggestWord candidate = new SuggestWord();
        for (int h = 0; h < stop; h++)
        {
            candidate.string_Renamed = indexSearcher.Doc(hits[h].doc).Get(F_WORD); // get orig word

            // don't suggest a word for itself, that would be silly
            if (candidate.string_Renamed.Equals(word))
            {
                continue;
            }

            // edit distance
            candidate.score = sd.GetDistance(word, candidate.string_Renamed);
            if (candidate.score < scoreFloor)
            {
                continue;
            }

            if (restricted)
            {
                // use the user index
                candidate.freq = ir.DocFreq(new Term(field, candidate.string_Renamed)); // freq in the index
                // don't suggest a word that is not present in the field
                bool lessPopularThanGoal = morePopular && goalFreq > candidate.freq;
                if (lessPopularThanGoal || candidate.freq < 1)
                {
                    continue;
                }
            }

            sugQueue.InsertWithOverflow(candidate);
            if (sugQueue.Size() == numSug)
            {
                // if queue full, maintain the minScore score
                scoreFloor = ((SuggestWord) sugQueue.Top()).score;
            }
            candidate = new SuggestWord();
        }

        // convert to array string, filling from the last slot towards the first
        String[] list = new String[sugQueue.Size()];
        int slot = list.Length;
        while (slot > 0)
        {
            slot--;
            list[slot] = ((SuggestWord) sugQueue.Pop()).string_Renamed;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}