public virtual void TestSetBufferSize()
{
    System.IO.DirectoryInfo indexDir = new System.IO.DirectoryInfo(System.IO.Path.Combine(AppSettings.Get("tempDir", ""), "testSetBufferSize"));
    MockFSDirectory dir = new MockFSDirectory(indexDir, NewRandom());
    try
    {
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED, null);
        writer.UseCompoundFile = false;
        for (int i = 0; i < 37; i++)
        {
            Document doc = new Document();
            doc.Add(new Field("content", "aaa bbb ccc ddd" + i, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("id", "" + i, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc, null);
        }
        writer.Close();

        dir.allIndexInputs.Clear();

        IndexReader reader = IndexReader.Open((Directory)dir, false, null);
        Term aaa = new Term("content", "aaa");
        Term bbb = new Term("content", "bbb");
        Term ccc = new Term("content", "ccc");
        Assert.AreEqual(37, reader.DocFreq(ccc, null));
        reader.DeleteDocument(0, null);
        Assert.AreEqual(37, reader.DocFreq(aaa, null));
        dir.tweakBufferSizes();
        reader.DeleteDocument(4, null);
        Assert.AreEqual(37, reader.DocFreq(bbb, null));
        dir.tweakBufferSizes();

        IndexSearcher searcher = new IndexSearcher(reader);
        ScoreDoc[] hits = searcher.Search(new TermQuery(bbb), null, 1000, null).ScoreDocs;
        dir.tweakBufferSizes();
        Assert.AreEqual(35, hits.Length);
        dir.tweakBufferSizes();
        hits = searcher.Search(new TermQuery(new Term("id", "33")), null, 1000, null).ScoreDocs;
        dir.tweakBufferSizes();
        Assert.AreEqual(1, hits.Length);
        hits = searcher.Search(new TermQuery(aaa), null, 1000, null).ScoreDocs;
        dir.tweakBufferSizes();
        Assert.AreEqual(35, hits.Length);
        searcher.Close();
        reader.Close();
    }
    finally
    {
        _TestUtil.RmDir(indexDir);
    }
}
/// <summary>
/// Extracts all term texts of a given Query into an array of WeightedTerms.
/// </summary>
/// <param name="query">Query to extract term texts from</param>
/// <param name="reader">used to compute IDF, which can be used to a) score selected fragments better,
/// b) use graded highlights, e.g. changing the intensity of the font color</param>
/// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based</param>
/// <returns>an array of the terms used in a query, plus their weights.</returns>
public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, string fieldName)
{
    WeightedTerm[] terms = GetTerms(query, false, fieldName);
    int totalNumDocs = reader.NumDocs();
    foreach (WeightedTerm t in terms)
    {
        try
        {
            int docFreq = reader.DocFreq(new Term(fieldName, t.Term));
            // docFreq counts deleted documents, so it can exceed NumDocs(); clamp it
            if (totalNumDocs < docFreq)
            {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from the DefaultSimilarity class
            var idf = (float)(Math.Log((float)totalNumDocs / (double)(docFreq + 1)) + 1.0);
            t.Weight *= idf;
        }
        catch (IOException)
        {
            // ignore -- leave the term's weight unchanged
        }
    }
    return terms;
}
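// A minimal usage sketch for the method above, assuming it lives in Lucene.Net's
// QueryTermExtractor (highlighter package) and that `reader` is an already-open
// IndexReader over an index with a "content" field; the query and field name here
// are made-up illustrations, not part of the snippet above.
Query userQuery = new TermQuery(new Term("content", "lucene"));
WeightedTerm[] weighted = QueryTermExtractor.GetIdfWeightedTerms(userQuery, reader, "content");
foreach (WeightedTerm wt in weighted)
{
    // rarer terms (lower docFreq) come back with proportionally larger weights
    Console.WriteLine(wt.Term + " => " + wt.Weight);
}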
// inherit javadoc
public override int DocFreq(Term term, IState state)
{
    return reader.DocFreq(term, state);
}
private int DocFreq(IndexReader r, string term)
{
    return r.DocFreq(new Term(FIELD, term));
}
////////////////////////////////////////////////////////////////

static private void ScoreHits (Dictionary<int, Hit> hits_by_id, IndexReader reader, ICollection term_list)
{
    LNS.Similarity similarity;
    similarity = LNS.Similarity.GetDefault ();

    TermDocs term_docs = reader.TermDocs ();
    Hit hit;

    foreach (Term term in term_list) {
        double idf;
        idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());

        int hit_count;
        hit_count = hits_by_id.Count;

        term_docs.Seek (term);
        while (term_docs.Next () && hit_count > 0) {
            int id;
            id = term_docs.Doc ();

            if (hits_by_id.TryGetValue (id, out hit)) {
                double tf;
                tf = similarity.Tf (term_docs.Freq ());
                hit.Score += tf * idf;
                --hit_count;
            }
        }
    }

    term_docs.Close ();
}
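// For reference: with Lucene's classic DefaultSimilarity (what GetDefault()
// returns unless the application has overridden the default), the contribution
// accumulated per matching term above works out to
//
//     score += sqrt(freq) * (log(maxDoc / (docFreq + 1)) + 1)
//
// so repeated occurrences of a term give diminishing returns (the sqrt), and
// terms that appear in fewer documents are weighted more heavily (the idf).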
public virtual void TestAddIndexOnDiskFull()
{
    // MemoryCodec, since it uses FST, is not necessarily
    // "additive", ie if you add up N small FSTs, then merge
    // them, the merged result can easily be larger than the
    // sum because the merged FST may use array encoding for
    // some arcs (which uses more space):
    string idFormat = TestUtil.GetPostingsFormat("id");
    string contentFormat = TestUtil.GetPostingsFormat("content");
    AssumeFalse("this test cannot run with Memory codec",
        idFormat.Equals("Memory", StringComparison.Ordinal) || contentFormat.Equals("Memory", StringComparison.Ordinal));

    int START_COUNT = 57;
    int NUM_DIR = TestNightly ? 50 : 5;
    int END_COUNT = START_COUNT + NUM_DIR * (TestNightly ? 25 : 5);

    // Build up a bunch of dirs that have indexes which we
    // will then merge together by calling addIndexes(*):
    Directory[] dirs = new Directory[NUM_DIR];
    long inputDiskUsage = 0;
    for (int i = 0; i < NUM_DIR; i++)
    {
        dirs[i] = NewDirectory();
        IndexWriter writer = new IndexWriter(dirs[i], NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
        for (int j = 0; j < 25; j++)
        {
            AddDocWithIndex(writer, 25 * i + j);
        }
        writer.Dispose();
        string[] files = dirs[i].ListAll();
        for (int j = 0; j < files.Length; j++)
        {
            inputDiskUsage += dirs[i].FileLength(files[j]);
        }
    }

    // Now, build a starting index that has START_COUNT docs. We
    // will then try to addIndexes into a copy of this:
    MockDirectoryWrapper startDir = NewMockDirectory();
    IndexWriter indWriter = new IndexWriter(startDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    for (int j = 0; j < START_COUNT; j++)
    {
        AddDocWithIndex(indWriter, j);
    }
    indWriter.Dispose();

    // Make sure starting index seems to be working properly:
    Term searchTerm = new Term("content", "aaa");
    IndexReader reader = DirectoryReader.Open(startDir);
    Assert.AreEqual(57, reader.DocFreq(searchTerm), "first docFreq");

    IndexSearcher searcher = NewSearcher(reader);
    ScoreDoc[] hits = searcher.Search(new TermQuery(searchTerm), null, 1000).ScoreDocs;
    Assert.AreEqual(57, hits.Length, "first number of hits");
    reader.Dispose();

    // Iterate with larger and larger amounts of free
    // disk space. With little free disk space,
    // addIndexes will certainly run out of space &
    // fail. Verify that when this happens, the index is
    // not corrupt and in fact has added no documents.
    // Then, we increase the free disk space each
    // iteration. At some point there is enough free
    // disk space; addIndexes should then succeed and
    // the index should show that all documents were
    // added.
    // String[] files = startDir.ListAll();
    long diskUsage = startDir.GetSizeInBytes();

    long startDiskUsage = 0;
    string[] files_ = startDir.ListAll();
    for (int i = 0; i < files_.Length; i++)
    {
        startDiskUsage += startDir.FileLength(files_[i]);
    }

    for (int iter = 0; iter < 3; iter++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: iter=" + iter);
        }

        // Start with a bit more free space than we are currently using:
        long diskFree = diskUsage + TestUtil.NextInt32(Random, 50, 200);

        int method = iter;

        bool success = false;
        bool done = false;

        string methodName;
        if (0 == method)
        {
            methodName = "addIndexes(Directory[]) + forceMerge(1)";
        }
        else if (1 == method)
        {
            methodName = "addIndexes(IndexReader[])";
        }
        else
        {
            methodName = "addIndexes(Directory[])";
        }

        while (!done)
        {
            if (Verbose)
            {
                Console.WriteLine("TEST: cycle...");
            }

            // Make a new dir that will enforce disk usage:
            MockDirectoryWrapper dir = new MockDirectoryWrapper(Random, new RAMDirectory(startDir, NewIOContext(Random)));
            indWriter = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.APPEND).SetMergePolicy(NewLogMergePolicy(false)));
            IOException err = null;

            IMergeScheduler ms = indWriter.Config.MergeScheduler;
            for (int x = 0; x < 2; x++)
            {
                if (ms is IConcurrentMergeScheduler)
                {
                    // this test intentionally produces exceptions
                    // in the threads that CMS launches; we don't
                    // want to pollute test output with these.
                    if (0 == x)
                    {
                        ((IConcurrentMergeScheduler)ms).SetSuppressExceptions();
                    }
                    else
                    {
                        ((IConcurrentMergeScheduler)ms).ClearSuppressExceptions();
                    }
                }

                // Two loops: first time, limit disk space &
                // throw random IOExceptions; second time, no
                // disk space limit:
                double rate = 0.05;
                double diskRatio = ((double)diskFree) / diskUsage;
                long thisDiskFree;

                string testName = null;

                if (0 == x)
                {
                    dir.RandomIOExceptionRateOnOpen = Random.NextDouble() * 0.01;
                    thisDiskFree = diskFree;
                    if (diskRatio >= 2.0)
                    {
                        rate /= 2;
                    }
                    if (diskRatio >= 4.0)
                    {
                        rate /= 2;
                    }
                    if (diskRatio >= 6.0)
                    {
                        rate = 0.0;
                    }
                    if (Verbose)
                    {
                        testName = "disk full test " + methodName + " with disk full at " + diskFree + " bytes";
                    }
                }
                else
                {
                    dir.RandomIOExceptionRateOnOpen = 0.0;
                    thisDiskFree = 0;
                    rate = 0.0;
                    if (Verbose)
                    {
                        testName = "disk full test " + methodName + " with unlimited disk space";
                    }
                }

                if (Verbose)
                {
                    Console.WriteLine("\ncycle: " + testName);
                }

                dir.TrackDiskUsage = true;
                dir.MaxSizeInBytes = thisDiskFree;
                dir.RandomIOExceptionRate = rate;

                try
                {
                    if (0 == method)
                    {
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: now addIndexes count=" + dirs.Length);
                        }
                        indWriter.AddIndexes(dirs);
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: now forceMerge");
                        }
                        indWriter.ForceMerge(1);
                    }
                    else if (1 == method)
                    {
                        IndexReader[] readers = new IndexReader[dirs.Length];
                        for (int i = 0; i < dirs.Length; i++)
                        {
                            readers[i] = DirectoryReader.Open(dirs[i]);
                        }
                        try
                        {
                            indWriter.AddIndexes(readers);
                        }
                        finally
                        {
                            for (int i = 0; i < dirs.Length; i++)
                            {
                                readers[i].Dispose();
                            }
                        }
                    }
                    else
                    {
                        indWriter.AddIndexes(dirs);
                    }

                    success = true;
                    if (Verbose)
                    {
                        Console.WriteLine("  success!");
                    }

                    if (0 == x)
                    {
                        done = true;
                    }
                }
                catch (IOException e)
                {
                    success = false;
                    err = e;
                    if (Verbose)
                    {
                        Console.WriteLine("  hit IOException: " + e);
                        Console.WriteLine(e.StackTrace);
                    }

                    if (1 == x)
                    {
                        Console.WriteLine(e.StackTrace);
                        Assert.Fail(methodName + " hit IOException after disk space was freed up");
                    }
                }

                // Make sure all threads from
                // ConcurrentMergeScheduler are done
                TestUtil.SyncConcurrentMerges(indWriter);

                if (Verbose)
                {
                    Console.WriteLine("  now test readers");
                }

                // Finally, verify index is not corrupt, and, if
                // we succeeded, we see all docs added, and if we
                // failed, we see either all docs or no docs added
                // (transactional semantics):
                dir.RandomIOExceptionRateOnOpen = 0.0;
                try
                {
                    reader = DirectoryReader.Open(dir);
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.StackTrace);
                    Assert.Fail(testName + ": exception when creating IndexReader: " + e);
                }
                int result = reader.DocFreq(searchTerm);
                if (success)
                {
                    if (result != START_COUNT)
                    {
                        Assert.Fail(testName + ": method did not throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT);
                    }
                }
                else
                {
                    // On hitting exception we still may have added
                    // all docs:
                    if (result != START_COUNT && result != END_COUNT)
                    {
                        Console.WriteLine(err.StackTrace);
                        Assert.Fail(testName + ": method did throw exception but docFreq('aaa') is " + result + " instead of expected " + START_COUNT + " or " + END_COUNT);
                    }
                }

                searcher = NewSearcher(reader);
                try
                {
                    hits = searcher.Search(new TermQuery(searchTerm), null, END_COUNT).ScoreDocs;
                }
                catch (IOException e)
                {
                    Console.WriteLine(e.StackTrace);
                    Assert.Fail(testName + ": exception when searching: " + e);
                }
                int result2 = hits.Length;
                if (success)
                {
                    if (result2 != result)
                    {
                        Assert.Fail(testName + ": method did not throw exception but hits.Length for search on term 'aaa' is " + result2 + " instead of expected " + result);
                    }
                }
                else
                {
                    // On hitting exception we still may have added
                    // all docs:
                    if (result2 != result)
                    {
                        Console.WriteLine(err.StackTrace);
                        Assert.Fail(testName + ": method did throw exception but hits.Length for search on term 'aaa' is " + result2 + " instead of expected " + result);
                    }
                }

                reader.Dispose();
                if (Verbose)
                {
                    Console.WriteLine("  count is " + result);
                }

                if (done || result == END_COUNT)
                {
                    break;
                }
            }

            if (Verbose)
            {
                Console.WriteLine("  start disk = " + startDiskUsage + "; input disk = " + inputDiskUsage + "; max used = " + dir.MaxUsedSizeInBytes);
            }

            if (done)
            {
                // Javadocs state that temp free Directory space
                // required is at most 2X total input size of
                // indices so let's make sure:
                Assert.IsTrue((dir.MaxUsedSizeInBytes - startDiskUsage) < 2 * (startDiskUsage + inputDiskUsage),
                    "max free Directory space required exceeded 2X the total input index sizes during " + methodName
                    + ": max temp usage = " + (dir.MaxUsedSizeInBytes - startDiskUsage) + " bytes vs limit=" + (2 * (startDiskUsage + inputDiskUsage))
                    + "; starting disk usage = " + startDiskUsage + " bytes; "
                    + "input index disk usage = " + inputDiskUsage + " bytes");
            }

            // Make sure we don't hit disk full during close below:
            dir.MaxSizeInBytes = 0;
            dir.RandomIOExceptionRate = 0.0;
            dir.RandomIOExceptionRateOnOpen = 0.0;

            indWriter.Dispose();

            // Wait for all BG threads to finish, else
            // dir.Dispose() will throw IOException because
            // there are still open files
            TestUtil.SyncConcurrentMerges(ms);

            dir.Dispose();

            // Try again with more free space:
            diskFree += TestNightly ? TestUtil.NextInt32(Random, 4000, 8000) : TestUtil.NextInt32(Random, 40000, 80000);
        }
    }

    startDir.Dispose();
    foreach (Directory dir in dirs)
    {
        dir.Dispose();
    }
}
public override int DocFreq(Term t)
{
    EnsureOpen();
    return in_Renamed.DocFreq(t);
}
public override int DocFreq(Term term)
{
    IndexReader reader = (IndexReader)fieldToReader[term.Field()];
    return reader == null ? 0 : reader.DocFreq(term);
}
/// <summary>
/// Suggest similar words (optionally restricted to a field of an index).
///
/// <para>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
/// is not the same as the edit distance strategy used to calculate the best
/// matching spell-checked word from the hits that Lucene found, one usually has
/// to retrieve a couple of numSug's in order to get the true best match.</para>
///
/// <para>I.e. if numSug == 1, don't count on that suggestion being the best one.
/// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.</para>
/// </summary>
/// <param name="word"> the word you want a spell check done on </param>
/// <param name="numSug"> the number of suggested words </param>
/// <param name="ir"> the indexReader of the user index (can be null; see the field parameter) </param>
/// <param name="field"> the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field. </param>
/// <param name="suggestMode">
/// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
/// <param name="accuracy"> the minimum score a suggestion must have in order to qualify for inclusion in the results </param>
/// <exception cref="IOException"> if the underlying index throws an <seealso cref="IOException"/> </exception>
/// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
/// <returns> String[] the sorted list of the suggested words with these two criteria:
/// first criterion: the edit distance; second criterion (only if restricted mode): the popularity
/// of the suggested words in the field of the user index
/// </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
{
    // obtainSearcher calls ensureOpen
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        if (ir == null || field == null)
        {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
        }
        if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
        {
            ir = null;
            field = null;
        }

        int lengthWord = word.Length;

        int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
        int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
        // if the word exists in the real index and we don't care about word frequency, return the word itself
        if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
        {
            return new string[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        string[] grams;
        string key;

        for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            key = "gram" + ng; // form key

            grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length == 0)
            {
                continue; // hmm
            }

            if (bStart > 0) // should we boost prefixes?
{ Add(query, "start" + ng, grams[0], bStart); // matches start of word } if (bEnd > 0) // should we boost suffixes { Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word } for (int i = 0; i < grams.Length; i++) { Add(query, key, grams[i]); } } int maxHits = 10 * numSug; // System.out.println("Q: " + query); ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs; // System.out.println("HITS: " + hits.length()); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.Min(hits.Length, maxHits); SuggestWord sugWord = new SuggestWord(); for (int i = 0; i < stop; i++) { sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word // don't suggest a word for itself, that would be silly if ([email protected](word)) { continue; } // edit distance sugWord.score = sd.GetDistance(word, sugWord.@string); if (sugWord.score < accuracy) { continue; } if (ir != null && field != null) // use the user index { sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index // don't suggest a word that is not present in the field if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) { continue; } } sugQueue.InsertWithOverflow(sugWord); if (sugQueue.Size() == numSug) { // if queue full, maintain the minScore score accuracy = sugQueue.Top().score; } sugWord = new SuggestWord(); } // convert to array string string[] list = new string[sugQueue.Size()]; for (int i = sugQueue.Size() - 1; i >= 0; i--) { list[i] = sugQueue.Pop().@string; } return list; } finally { ReleaseSearcher(indexSearcher); } }
// inherit javadoc
public override int DocFreq(Term term)
{
    return reader.DocFreq(term);
}
public override int DocFreq(Term t)
{
    return in_Renamed.DocFreq(t);
}
public virtual void TestRandomExceptionsThreads()
{
    MockRAMDirectory dir = new MockRAMDirectory();
    MockIndexWriter writer = new MockIndexWriter(this, dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    ((ConcurrentMergeScheduler)writer.GetMergeScheduler()).SetSuppressExceptions();
    //writer.setMaxBufferedDocs(10);
    writer.SetRAMBufferSizeMB(0.2);

    if (DEBUG)
    {
        System.IO.StreamWriter temp_writer;
        temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
        temp_writer.AutoFlush = true;
        writer.SetInfoStream(temp_writer);
    }

    int NUM_THREADS = 4;

    IndexerThread[] threads = new IndexerThread[NUM_THREADS];
    for (int i = 0; i < NUM_THREADS; i++)
    {
        threads[i] = new IndexerThread(this, i, writer);
        threads[i].Start();
    }

    for (int i = 0; i < NUM_THREADS; i++)
    {
        threads[i].Join();
    }

    for (int i = 0; i < NUM_THREADS; i++)
    {
        if (threads[i].failure != null)
        {
            Assert.Fail("thread " + threads[i].Name + ": hit unexpected failure");
        }
    }

    writer.Commit();

    try
    {
        writer.Close();
    }
    catch (System.Exception t)
    {
        System.Console.Out.WriteLine("exception during close:");
        System.Console.Out.WriteLine(t.StackTrace);
        writer.Rollback();
    }

    // Confirm that when a doc hits an exception partway through tokenization, it's deleted:
    IndexReader r2 = IndexReader.Open(dir);
    int count = r2.DocFreq(new Term("content4", "aaa"));
    int count2 = r2.DocFreq(new Term("content4", "ddd"));
    Assert.AreEqual(count, count2);
    r2.Close();

    _TestUtil.CheckIndex(dir);
}