/// <summary>
/// Called only from <see cref="DoOpenIfChanged()"/>. If the taxonomy has been
/// recreated, you should pass <c>null</c> as the caches and parent/children
/// arrays.
/// </summary>
internal DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
    LRUHashMap<FacetLabel, IntClass> ordinalCache, LRUHashMap<int, FacetLabel> categoryCache,
    TaxonomyIndexArrays taxoArrays)
{
    this.indexReader = indexReader;
    this.taxoWriter = taxoWriter;
    this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.TaxonomyEpoch;

    // use the same instance of the cache, note the protective code in GetOrdinal and GetPath
    this.ordinalCache = ordinalCache == null ? new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE) : ordinalCache;
    this.categoryCache = categoryCache == null ? new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE) : categoryCache;

    this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
}
public virtual void TestNonIndexedFields()
{
    Directory dir = NewDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    Document doc = new Document();
    doc.Add(new StoredField("bogusbytes", "bogus"));
    doc.Add(new StoredField("bogusshorts", "bogus"));
    doc.Add(new StoredField("bogusints", "bogus"));
    doc.Add(new StoredField("boguslongs", "bogus"));
    doc.Add(new StoredField("bogusfloats", "bogus"));
    doc.Add(new StoredField("bogusdoubles", "bogus"));
    doc.Add(new StoredField("bogusterms", "bogus"));
    doc.Add(new StoredField("bogustermsindex", "bogus"));
    doc.Add(new StoredField("bogusmultivalued", "bogus"));
    doc.Add(new StoredField("bogusbits", "bogus"));
    iw.AddDocument(doc);
    DirectoryReader ir = iw.GetReader();
    iw.Dispose();

    AtomicReader ar = GetOnlySegmentReader(ir);

    IFieldCache cache = FieldCache.DEFAULT;
    cache.PurgeAllCaches();
    Assert.AreEqual(0, cache.GetCacheEntries().Length);

#pragma warning disable 612, 618
    Bytes bytes = cache.GetBytes(ar, "bogusbytes", true);
    Assert.AreEqual((byte)0, bytes.Get(0));

    Int16s shorts = cache.GetInt16s(ar, "bogusshorts", true);
    Assert.AreEqual(0, shorts.Get(0));
#pragma warning restore 612, 618

    Int32s ints = cache.GetInt32s(ar, "bogusints", true);
    Assert.AreEqual(0, ints.Get(0));

    Int64s longs = cache.GetInt64s(ar, "boguslongs", true);
    Assert.AreEqual(0, longs.Get(0));

    Singles floats = cache.GetSingles(ar, "bogusfloats", true);
    Assert.AreEqual(0, floats.Get(0), 0.0f);

    Doubles doubles = cache.GetDoubles(ar, "bogusdoubles", true);
    Assert.AreEqual(0, doubles.Get(0), 0.0D);

    BytesRef scratch = new BytesRef();
    BinaryDocValues binaries = cache.GetTerms(ar, "bogusterms", true);
    binaries.Get(0, scratch);
    Assert.AreEqual(0, scratch.Length);

    SortedDocValues sorted = cache.GetTermsIndex(ar, "bogustermsindex");
    Assert.AreEqual(-1, sorted.GetOrd(0));
    sorted.Get(0, scratch);
    Assert.AreEqual(0, scratch.Length);

    SortedSetDocValues sortedSet = cache.GetDocTermOrds(ar, "bogusmultivalued");
    sortedSet.SetDocument(0);
    Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, sortedSet.NextOrd());

    IBits bits = cache.GetDocsWithField(ar, "bogusbits");
    Assert.IsFalse(bits.Get(0));

    // check that we cached nothing
    Assert.AreEqual(0, cache.GetCacheEntries().Length);
    ir.Dispose();
    dir.Dispose();
}
public ThreadAnonymousClass(TestCachedOrdinalsReader outerInstance, string threadName, DirectoryReader reader, CachedOrdinalsReader ordsReader)
    : base(threadName)
{
    this.outerInstance = outerInstance;
    this.reader = reader;
    this.ordsReader = ordsReader;
}
/// <summary>
/// Open the <see cref="DirectoryReader"/> from this <see cref="IndexWriter"/>.
/// </summary>
protected virtual DirectoryReader OpenIndexReader(IndexWriter writer)
{
    return DirectoryReader.Open(writer, false);
}
/// <summary>
/// Implements the opening of a new <see cref="DirectoryTaxonomyReader"/> instance if
/// the taxonomy has changed.
///
/// <para>
/// <b>NOTE:</b> the returned <see cref="DirectoryTaxonomyReader"/> shares the
/// ordinal and category caches with this reader. This is not expected to cause
/// any issues, unless the two instances continue to live. The reader
/// guarantees that the two instances cannot affect each other in terms of
/// correctness of the caches, however if the size of the cache is changed
/// through <see cref="SetCacheSize(int)"/>, it will affect both reader instances.
/// </para>
/// </summary>
protected override TaxonomyReader DoOpenIfChanged()
{
    EnsureOpen();

    // This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
    var r2 = DirectoryReader.OpenIfChanged(indexReader);
    if (r2 == null)
    {
        return null; // no changes, nothing to do
    }

    // check if the taxonomy was recreated
    bool success = false;
    try
    {
        bool recreated = false;
        if (taxoWriter == null)
        {
            // not NRT, check epoch from commit data
            string t1 = indexReader.IndexCommit.UserData[DirectoryTaxonomyWriter.INDEX_EPOCH];
            string t2 = r2.IndexCommit.UserData[DirectoryTaxonomyWriter.INDEX_EPOCH];
            if (t1 == null)
            {
                if (t2 != null)
                {
                    recreated = true;
                }
            }
            else if (!t1.Equals(t2))
            {
                // t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
                // it's ok to use String.equals because we require the two epoch values to be the same.
                recreated = true;
            }
        }
        else
        {
            // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
            if (taxoEpoch != taxoWriter.TaxonomyEpoch)
            {
                recreated = true;
            }
        }

        DirectoryTaxonomyReader newtr;
        if (recreated)
        {
            // if recreated, do not reuse anything from this instance. the information
            // will be lazily computed by the new instance when needed.
            newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
        }
        else
        {
            newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
        }

        success = true;
        return newtr;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(r2);
        }
    }
}
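For context, a minimal caller-side sketch of how this method is driven (the `taxoReader` variable is hypothetical): `TaxonomyReader.OpenIfChanged` invokes `DoOpenIfChanged()` internally and returns `null` when the taxonomy is unchanged, so the caller swaps readers only on a non-null result.

```csharp
// Hypothetical usage sketch: refresh a DirectoryTaxonomyReader via the public API.
var newTaxoReader = TaxonomyReader.OpenIfChanged(taxoReader); // calls DoOpenIfChanged() internally
if (newTaxoReader != null)
{
    taxoReader.Dispose();       // release the old instance once the new one takes over
    taxoReader = newTaxoReader; // shares the ordinal/category caches unless the taxonomy was recreated
}
```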
private bool IndexExists(string appname)
{
    return DirectoryReader.IndexExists(GetDirectory(appname));
}
private void DoTest(DocValuesType type)
{
    Directory d = NewDirectory();
    IndexWriterConfig iwConfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    int nDocs = AtLeast(50);
    Field id = new NumericDocValuesField("id", 0);
    Field f;
    switch (type)
    {
        case DocValuesType.BINARY:
            f = new BinaryDocValuesField("dv", new BytesRef());
            break;
        case DocValuesType.SORTED:
            f = new SortedDocValuesField("dv", new BytesRef());
            break;
        case DocValuesType.NUMERIC:
            f = new NumericDocValuesField("dv", 0);
            break;
        default:
            throw AssertionError.Create();
    }
    Document document = new Document();
    document.Add(id);
    document.Add(f);

    object[] vals = new object[nDocs];

    RandomIndexWriter iw = new RandomIndexWriter(Random, d, iwConfig);
    for (int i = 0; i < nDocs; ++i)
    {
        id.SetInt64Value(i);
        switch (type)
        {
            case DocValuesType.SORTED:
            case DocValuesType.BINARY:
                do
                {
                    vals[i] = TestUtil.RandomSimpleString(Random, 20);
                } while (((string)vals[i]).Length == 0);
                f.SetBytesValue(new BytesRef((string)vals[i]));
                break;
            case DocValuesType.NUMERIC:
                int bitsPerValue = RandomInts.RandomInt32Between(Random, 1, 31); // keep it an int
                vals[i] = (long)Random.Next((int)PackedInt32s.MaxValue(bitsPerValue));
                f.SetInt64Value((long)vals[i]);
                break;
        }
        iw.AddDocument(document);
        if (Random.NextBoolean() && i % 10 == 9)
        {
            iw.Commit();
        }
    }
    iw.Dispose();

    DirectoryReader rd = DirectoryReader.Open(d);
    foreach (AtomicReaderContext leave in rd.Leaves)
    {
        FunctionValues ids = (new Int64FieldSource("id")).GetValues(null, leave);
        ValueSource vs;
        switch (type)
        {
            case DocValuesType.BINARY:
            case DocValuesType.SORTED:
                vs = new BytesRefFieldSource("dv");
                break;
            case DocValuesType.NUMERIC:
                vs = new Int64FieldSource("dv");
                break;
            default:
                throw AssertionError.Create();
        }
        FunctionValues values = vs.GetValues(null, leave);
        BytesRef bytes = new BytesRef();
        for (int i = 0; i < leave.AtomicReader.MaxDoc; ++i)
        {
            assertTrue(values.Exists(i));
            if (vs is BytesRefFieldSource)
            {
                assertTrue(values.ObjectVal(i) is string);
            }
            else if (vs is Int64FieldSource)
            {
                assertTrue(values.ObjectVal(i) is J2N.Numerics.Int64);
                assertTrue(values.BytesVal(i, bytes));
            }
            else
            {
                throw AssertionError.Create();
            }

            object expected = vals[ids.Int32Val(i)];
            switch (type)
            {
                case DocValuesType.SORTED:
                    values.OrdVal(i); // no exception
                    assertTrue(values.NumOrd >= 1);
                    goto case DocValuesType.BINARY;
                case DocValuesType.BINARY:
                    assertEquals(expected, values.ObjectVal(i));
                    assertEquals(expected, values.StrVal(i));
                    assertEquals(expected, values.ObjectVal(i));
                    assertEquals(expected, values.StrVal(i));
                    assertTrue(values.BytesVal(i, bytes));
                    assertEquals(new BytesRef((string)expected), bytes);
                    break;
                case DocValuesType.NUMERIC:
                    assertEquals(Convert.ToInt64(expected, CultureInfo.InvariantCulture), values.Int64Val(i));
                    break;
            }
        }
    }
    rd.Dispose();
    d.Dispose();
}
protected override DirectoryReader DoWrapDirectoryReader(DirectoryReader @in)
{
    return new AssertingDirectoryReader(@in);
}
public override void BeforeClass()
{
    base.BeforeClass();

    Directory = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random, Directory,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NewLogMergePolicy()));
    for (int i = 0; i < DocFields.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField(field, DocFields[i], Field.Store.NO));
        writer.AddDocument(doc);
    }
    writer.Dispose();
    LittleReader = DirectoryReader.Open(Directory);
    Searcher = NewSearcher(LittleReader);
    // this is intentionally using the baseline sim, because it compares against bigSearcher (which uses a random one)
    Searcher.Similarity = new DefaultSimilarity();

    // Make big index
    Dir2 = new MockDirectoryWrapper(Random, new RAMDirectory(Directory, IOContext.DEFAULT));

    // First multiply small test index:
    MulFactor = 1;
    int docCount = 0;
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: now copy index...");
    }
    do
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: cycle...");
        }
        Directory copy = new MockDirectoryWrapper(Random, new RAMDirectory(Dir2, IOContext.DEFAULT));
        RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, Dir2);
        w.AddIndexes(copy);
        docCount = w.MaxDoc;
        w.Dispose();
        MulFactor *= 2;
    } while (docCount < 3000);

    RandomIndexWriter riw = new RandomIndexWriter(Random, Dir2,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMaxBufferedDocs(TestUtil.NextInt32(Random, 50, 1000)));
    Document doc_ = new Document();
    doc_.Add(NewTextField("field2", "xxx", Field.Store.NO));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++)
    {
        riw.AddDocument(doc_);
    }
    doc_ = new Document();
    doc_.Add(NewTextField("field2", "big bad bug", Field.Store.NO));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++)
    {
        riw.AddDocument(doc_);
    }
    Reader = riw.GetReader();
    BigSearcher = NewSearcher(Reader);
    riw.Dispose();
}
public override int DoLogic()
{
    int res = 0;

    // open reader or use existing one
    IndexSearcher searcher = RunData.GetIndexSearcher();

    IndexReader reader;
    bool closeSearcher;
    if (searcher == null)
    {
        // open our own reader
        Directory dir = RunData.Directory;
        reader = DirectoryReader.Open(dir);
        searcher = new IndexSearcher(reader);
        closeSearcher = true;
    }
    else
    {
        // use existing one; this passes +1 ref to us
        reader = searcher.IndexReader;
        closeSearcher = false;
    }

    // optionally warm and add num docs traversed to count
    if (WithWarm)
    {
        Document doc = null;
        IBits liveDocs = MultiFields.GetLiveDocs(reader);
        for (int m = 0; m < reader.MaxDoc; m++)
        {
            if (null == liveDocs || liveDocs.Get(m))
            {
                doc = reader.Document(m);
                res += (doc == null ? 0 : 1);
            }
        }
    }

    if (WithSearch)
    {
        res++;
        Query q = queryMaker.MakeQuery();
        Sort sort = Sort;
        TopDocs hits = null;
        int numHits = NumHits;
        if (numHits > 0)
        {
            if (WithCollector == false)
            {
                if (sort != null)
                {
                    // TODO: instead of always passing false we
                    // should detect based on the query; if we make
                    // the IndexSearcher search methods that take
                    // Weight public again, we can go back to
                    // pulling the Weight ourselves:
                    TopFieldCollector collector = TopFieldCollector.Create(sort, numHits, true, WithScore, WithMaxScore, false);
                    searcher.Search(q, null, collector);
                    hits = collector.GetTopDocs();
                }
                else
                {
                    hits = searcher.Search(q, numHits);
                }
            }
            else
            {
                ICollector collector = CreateCollector();
                searcher.Search(q, null, collector);
                //hits = collector.topDocs();
            }

            string printHitsField = RunData.Config.Get("print.hits.field", null);
            if (hits != null && printHitsField != null && printHitsField.Length > 0)
            {
                Console.WriteLine("totalHits = " + hits.TotalHits);
                Console.WriteLine("maxDoc() = " + reader.MaxDoc);
                Console.WriteLine("numDocs() = " + reader.NumDocs);
                for (int i = 0; i < hits.ScoreDocs.Length; i++)
                {
                    int docID = hits.ScoreDocs[i].Doc;
                    Document doc = reader.Document(docID);
                    Console.WriteLine(" " + i + ": doc=" + docID + " score=" + hits.ScoreDocs[i].Score + " " + printHitsField + " =" + doc.Get(printHitsField));
                }
            }

            if (WithTraverse)
            {
                ScoreDoc[] scoreDocs = hits.ScoreDocs;
                int traversalSize = Math.Min(scoreDocs.Length, TraversalSize);

                if (traversalSize > 0)
                {
                    bool retrieve = WithRetrieve;
                    int numHighlight = Math.Min(NumToHighlight, scoreDocs.Length);
                    Analyzer analyzer = RunData.Analyzer;
                    BenchmarkHighlighter highlighter = null;
                    if (numHighlight > 0)
                    {
                        highlighter = GetBenchmarkHighlighter(q);
                    }
                    for (int m = 0; m < traversalSize; m++)
                    {
                        int id = scoreDocs[m].Doc;
                        res++;
                        if (retrieve)
                        {
                            Document document = RetrieveDoc(reader, id);
                            res += document != null ? 1 : 0;
                            if (numHighlight > 0 && m < numHighlight)
                            {
                                ICollection<string> fieldsToHighlight = GetFieldsToHighlight(document);
                                foreach (string field in fieldsToHighlight)
                                {
                                    string text = document.Get(field);
                                    res += highlighter.DoHighlight(reader, id, field, document, analyzer, text);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (closeSearcher)
    {
        reader.Dispose();
    }
    else
    {
        // Release our +1 ref from above
        reader.DecRef();
    }
    return res;
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // pick a unique temp sub-directory for the throw-away index
    DirectoryInfo tempIndexPath = null;
    Random random = new Random();
    while (true)
    {
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
        if (!System.IO.Directory.Exists(tempIndexPath.FullName))
        {
            tempIndexPath.Create();
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);
    iwc.OpenMode = IndexWriterConfig.OpenMode_e.CREATE;
    iwc.RAMBufferSizeMB = ramBufferSizeMB;
    IndexWriter writer = new IndexWriter(dir, iwc);

    var ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
    ft.OmitNorms = true;
    ft.Freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.Add(field);

    totTokens = 0;
    IndexReader reader = null;

    bool success = false;
    count = 0;
    try
    {
        while (true)
        {
            BytesRef surfaceForm = iterator.Next();
            if (surfaceForm == null)
            {
                break;
            }
            field.StringValue = surfaceForm.Utf8ToString();
            writer.AddDocument(doc);
            count++;
        }
        reader = DirectoryReader.Open(writer, false);

        Terms terms = MultiFields.GetTerms(reader, "body");
        if (terms == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }

        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.Iterator(null);
        Outputs<long?> outputs = PositiveIntOutputs.Singleton;
        Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

        IntsRef scratchInts = new IntsRef();
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }
            int ngramCount = CountGrams(term);
            if (ngramCount > grams)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            if (ngramCount == 1)
            {
                totTokens += termsEnum.TotalTermFreq();
            }

            builder.Add(Lucene.Net.Util.Fst.Util.ToIntsRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq()));
        }

        fst = builder.Finish();
        if (fst == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }
        //System.out.println("FST: " + fst.getNodeCount() + " nodes");

        /*
         * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
         * Util.toDot(fst, pw, true, true);
         * pw.close();
         */

        success = true;
    }
    finally
    {
        try
        {
            if (success)
            {
                IOUtils.Close(writer, reader);
            }
            else
            {
                IOUtils.CloseWhileHandlingException(writer, reader);
            }
        }
        finally
        {
            // dispose the Lucene Directory before deleting its files from disk
            dir.Dispose();
            try
            {
                System.IO.Directory.Delete(tempIndexPath.FullName, true);
            }
            catch (Exception e)
            {
                throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
            }
        }
    }
}
public void TestClose()
{
    using (IndexReader r = DirectoryReader.Open(userindex))
    {
        spellChecker.ClearIndex();
        string field = "field1";
        Addwords(r, spellChecker, "field1");
        int num_field1 = this.NumDoc();
        Addwords(r, spellChecker, "field2");
        int num_field2 = this.NumDoc();
        assertEquals(num_field2, num_field1 + 1);
        CheckCommonSuggestions(r);
        AssertLastSearcherOpen(4);
        spellChecker.Dispose();
        AssertSearchersClosed();

        // LUCENENET NOTE: Per MSDN, calling Dispose() multiple times
        // should be a safe operation. http://stackoverflow.com/a/5306896/181087
        // Certainly, there shouldn't be a problem with calling Dispose() within
        // a using block if you decide to free up resources early.
        //try
        //{
        //    spellChecker.Dispose();
        //    fail("spellchecker was already closed");
        //}
        //catch (ObjectDisposedException e)
        //{
        //    // expected
        //}

        try
        {
            CheckCommonSuggestions(r);
            fail("spellchecker was already closed");
        }
        catch (ObjectDisposedException /*e*/)
        {
            // expected
        }

        try
        {
            spellChecker.ClearIndex();
            fail("spellchecker was already closed");
        }
        catch (ObjectDisposedException /*e*/)
        {
            // expected
        }

        try
        {
            spellChecker.IndexDictionary(new LuceneDictionary(r, field), NewIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
            fail("spellchecker was already closed");
        }
        catch (ObjectDisposedException /*e*/)
        {
            // expected
        }

        try
        {
            spellChecker.SetSpellIndex(spellindex);
            fail("spellchecker was already closed");
        }
        catch (ObjectDisposedException /*e*/)
        {
            // expected
        }
        assertEquals(4, searchers.Count);
        AssertSearchersClosed();
    }
}
public void TestSuggestModes()
{
    using (IndexReader r = DirectoryReader.Open(userindex))
    {
        spellChecker.ClearIndex();
        Addwords(r, spellChecker, "field1");

        {
            string[] similar = spellChecker.SuggestSimilar("eighty", 2, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
            assertEquals(1, similar.Length);
            assertEquals("eighty", similar[0]);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("eight", 2, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
            assertEquals(1, similar.Length);
            assertEquals("eight", similar[0]);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("eighty", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR);
            assertEquals(5, similar.Length);
            assertEquals("eight", similar[0]);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("twenty", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR);
            assertEquals(1, similar.Length);
            assertEquals("twenty-one", similar[0]);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("eight", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR);
            assertEquals(0, similar.Length);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("eighty", 5, r, "field1", SuggestMode.SUGGEST_ALWAYS);
            assertEquals(5, similar.Length);
            assertEquals("eight", similar[0]);
        }
        {
            string[] similar = spellChecker.SuggestSimilar("eight", 5, r, "field1", SuggestMode.SUGGEST_ALWAYS);
            assertEquals(5, similar.Length);
            assertEquals("eighty", similar[0]);
        }
    }
}
/// <summary>
/// Creates a new read-only IndexSearcher </summary>
/// <param name="dir"> the directory used to open the searcher </param>
/// <returns> a new read-only IndexSearcher </returns>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
// for testing purposes
internal virtual IndexSearcher CreateSearcher(Directory dir)
{
    return new IndexSearcher(DirectoryReader.Open(dir));
}
public virtual void TestInfiniteValues()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    Document doc = new Document();
    doc.Add(new SingleField("float", float.NegativeInfinity, Field.Store.NO));
    doc.Add(new Int32Field("int", int.MinValue, Field.Store.NO));
    writer.AddDocument(doc);

    doc = new Document();
    doc.Add(new SingleField("float", float.PositiveInfinity, Field.Store.NO));
    doc.Add(new Int32Field("int", int.MaxValue, Field.Store.NO));
    writer.AddDocument(doc);

    doc = new Document();
    doc.Add(new SingleField("float", 0.0f, Field.Store.NO));
    doc.Add(new Int32Field("int", 0, Field.Store.NO));
    writer.AddDocument(doc);

    foreach (float f in TestNumericUtils.FLOAT_NANs)
    {
        doc = new Document();
        doc.Add(new SingleField("float", f, Field.Store.NO));
        writer.AddDocument(doc);
    }

    writer.Dispose();

    IndexReader r = DirectoryReader.Open(dir);
    IndexSearcher s = NewSearcher(r);

    Query q = NumericRangeQuery.NewInt32Range("int", null, null, true, true);
    TopDocs topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewInt32Range("int", null, null, false, false);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewInt32Range("int", int.MinValue, int.MaxValue, true, true);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewInt32Range("int", int.MinValue, int.MaxValue, false, false);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(1, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewSingleRange("float", null, null, true, true);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewSingleRange("float", null, null, false, false);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewSingleRange("float", float.NegativeInfinity, float.PositiveInfinity, true, true);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(3, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewSingleRange("float", float.NegativeInfinity, float.PositiveInfinity, false, false);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(1, topDocs.ScoreDocs.Length, "Score doc count");

    q = NumericRangeQuery.NewSingleRange("float", float.NaN, float.NaN, true, true);
    topDocs = s.Search(q, 10);
    Assert.AreEqual(TestNumericUtils.FLOAT_NANs.Length, topDocs.ScoreDocs.Length, "Score doc count");

    r.Dispose();
    dir.Dispose();
}
public static bool IndexExists(string luceneIndex)
{
    luceneIndex.RequireNotNullOrEmpty(nameof(luceneIndex));
    return DirectoryReader.IndexExists(FSDirectory.Open(luceneIndex));
}
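If it helps to see the guard in use, here is a minimal sketch; the index path is hypothetical:

```csharp
// Hypothetical usage: check for a committed index before opening a reader.
string indexPath = @"C:\indexes\catalog"; // assumed location
if (IndexExists(indexPath))
{
    using var reader = DirectoryReader.Open(FSDirectory.Open(indexPath));
    var searcher = new IndexSearcher(reader);
    // ... run queries against searcher ...
}
```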
public virtual void TestParsingAndSearching()
{
    string field = "content";
    string[] docs = new string[] { "\\ abcdefg1", "\\x00079 hijklmn1", "\\\\ opqrstu1" };

    // queries that should find all docs
    Query[] matchAll = new Query[]
    {
        new WildcardQuery(new Term(field, "*")),
        new WildcardQuery(new Term(field, "*1")),
        new WildcardQuery(new Term(field, "**1")),
        new WildcardQuery(new Term(field, "*?")),
        new WildcardQuery(new Term(field, "*?1")),
        new WildcardQuery(new Term(field, "?*1")),
        new WildcardQuery(new Term(field, "**")),
        new WildcardQuery(new Term(field, "***")),
        new WildcardQuery(new Term(field, "\\\\*"))
    };

    // queries that should find no docs
    Query[] matchNone = new Query[]
    {
        new WildcardQuery(new Term(field, "a*h")),
        new WildcardQuery(new Term(field, "a?h")),
        new WildcardQuery(new Term(field, "*a*h")),
        new WildcardQuery(new Term(field, "?a")),
        new WildcardQuery(new Term(field, "a?"))
    };

    PrefixQuery[][] matchOneDocPrefix = new PrefixQuery[][]
    {
        new PrefixQuery[] { new PrefixQuery(new Term(field, "a")), new PrefixQuery(new Term(field, "ab")), new PrefixQuery(new Term(field, "abc")) },
        new PrefixQuery[] { new PrefixQuery(new Term(field, "h")), new PrefixQuery(new Term(field, "hi")), new PrefixQuery(new Term(field, "hij")), new PrefixQuery(new Term(field, "\\x0007")) },
        new PrefixQuery[] { new PrefixQuery(new Term(field, "o")), new PrefixQuery(new Term(field, "op")), new PrefixQuery(new Term(field, "opq")), new PrefixQuery(new Term(field, "\\\\")) }
    };

    WildcardQuery[][] matchOneDocWild = new WildcardQuery[][]
    {
        new WildcardQuery[] { new WildcardQuery(new Term(field, "*a*")), new WildcardQuery(new Term(field, "*ab*")), new WildcardQuery(new Term(field, "*abc**")), new WildcardQuery(new Term(field, "ab*e*")), new WildcardQuery(new Term(field, "*g?")), new WildcardQuery(new Term(field, "*f?1")) },
        new WildcardQuery[] { new WildcardQuery(new Term(field, "*h*")), new WildcardQuery(new Term(field, "*hi*")), new WildcardQuery(new Term(field, "*hij**")), new WildcardQuery(new Term(field, "hi*k*")), new WildcardQuery(new Term(field, "*n?")), new WildcardQuery(new Term(field, "*m?1")), new WildcardQuery(new Term(field, "hij**")) },
        new WildcardQuery[] { new WildcardQuery(new Term(field, "*o*")), new WildcardQuery(new Term(field, "*op*")), new WildcardQuery(new Term(field, "*opq**")), new WildcardQuery(new Term(field, "op*q*")), new WildcardQuery(new Term(field, "*u?")), new WildcardQuery(new Term(field, "*t?1")), new WildcardQuery(new Term(field, "opq**")) }
    };

    // prepare the index
    Directory dir = NewDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
    for (int i = 0; i < docs.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField(field, docs[i], Field.Store.NO));
        iw.AddDocument(doc);
    }
    iw.Dispose();

    IndexReader reader = DirectoryReader.Open(dir);
    IndexSearcher searcher = NewSearcher(reader);

    // test queries that must find all
    foreach (Query q in matchAll)
    {
        if (VERBOSE)
        {
            Console.WriteLine("matchAll: q=" + q + " " + q.GetType().Name);
        }
        ScoreDoc[] hits = searcher.Search(q, null, 1000).ScoreDocs;
        Assert.AreEqual(docs.Length, hits.Length);
    }

    // test queries that must find none
    foreach (Query q in matchNone)
    {
        if (VERBOSE)
        {
            Console.WriteLine("matchNone: q=" + q + " " + q.GetType().Name);
        }
        ScoreDoc[] hits = searcher.Search(q, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);
    }

    // test the prefix queries find only one doc
    for (int i = 0; i < matchOneDocPrefix.Length; i++)
    {
        for (int j = 0; j < matchOneDocPrefix[i].Length; j++)
        {
            Query q = matchOneDocPrefix[i][j];
            if (VERBOSE)
            {
                Console.WriteLine("match 1 prefix: doc=" + docs[i] + " q=" + q + " " + q.GetType().Name);
            }
            ScoreDoc[] hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual(i, hits[0].Doc);
        }
    }

    // test the wildcard queries find only one doc
    for (int i = 0; i < matchOneDocWild.Length; i++)
    {
        for (int j = 0; j < matchOneDocWild[i].Length; j++)
        {
            Query q = matchOneDocWild[i][j];
            if (VERBOSE)
            {
                Console.WriteLine("match 1 wild: doc=" + docs[i] + " q=" + q + " " + q.GetType().Name);
            }
            ScoreDoc[] hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual(i, hits[0].Doc);
        }
    }

    reader.Dispose();
    dir.Dispose();
}
// we need to guarantee that if several threads call this concurrently, only
// one executes it, and after it returns, the cache is updated and is either
// complete or not.
private void PerhapsFillCache()
{
    lock (this)
    {
        if (cacheMisses.Get() < cacheMissesUntilFill)
        {
            return;
        }

        if (!shouldFillCache)
        {
            // we already filled the cache once, there's no need to re-fill it
            return;
        }
        shouldFillCache = false;

        InitReaderManager();

        bool aborted = false;
        DirectoryReader reader = readerManager.Acquire();
        try
        {
            TermsEnum termsEnum = null;
            DocsEnum docsEnum = null;
            foreach (AtomicReaderContext ctx in reader.Leaves)
            {
                Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
                if (terms != null) // cannot really happen, but be on the safe side
                {
                    termsEnum = terms.Iterator(termsEnum);
                    while (termsEnum.Next() != null)
                    {
                        if (!cache.Full)
                        {
                            BytesRef t = termsEnum.Term();
                            // Since we guarantee uniqueness of categories, each term has exactly
                            // one document. Also, since we do not allow removing categories (and
                            // hence documents), there are no deletions in the index. Therefore, it
                            // is sufficient to call next(), and then doc(), exactly once with no
                            // 'validation' checks.
                            FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
                            docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
                            bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
                            Debug.Assert(!res, "entries should not have been evicted from the cache");
                        }
                        else
                        {
                            // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
                            aborted = true;
                            break;
                        }
                    }
                }
                if (aborted)
                {
                    break;
                }
            }
        }
        finally
        {
            readerManager.Release(reader);
        }

        cacheIsComplete = !aborted;
        if (cacheIsComplete)
        {
            lock (this)
            {
                // everything is in the cache, so no need to keep readerManager open.
                // this block is executed in a sync block so that it works well with
                // initReaderManager called in parallel.
                readerManager.Dispose();
                readerManager = null;
                initializedReaderManager = false;
            }
        }
    }
}
/// <summary>Simple command-line based search demo.</summary>
public static void Main(string[] args)
{
    // The <CONSOLE_APP_NAME> should be the assembly name of the application
    // this code is compiled into. In .NET Framework, it is the name of the EXE file.
    // In .NET Core, you have the option of compiling this into either a DLL or an EXE
    // (see https://docs.microsoft.com/en-us/dotnet/core/deploying/index).
    // In the first case, the <CONSOLE_APP_NAME> will be "dotnet <DLL_NAME>.dll".
    string usage = "Usage: <CONSOLE_APP_NAME> <INDEX_DIRECTORY> [-f|--field <FIELD>] " +
        "[-r|--repeat <NUMBER>] [-qf|--queries-file <PATH>] [-q|--query <QUERY>] " +
        "[--raw] [-p|--page-size <NUMBER>]\n\n" +
        "Use no --query or --queries-file option for interactive mode.\n\n" +
        "See http://lucene.apache.org/core/4_8_0/demo/ for details.";
    if (args.Length < 1 || (args.Length > 0 &&
        ("?".Equals(args[0], StringComparison.Ordinal) || "-h".Equals(args[0], StringComparison.Ordinal) || "--help".Equals(args[0], StringComparison.Ordinal))))
    {
        Console.WriteLine(usage);
        Environment.Exit(0);
    }

    string index = args[0];
    string field = "contents";
    string queries = null;
    int repeat = 0;
    bool raw = false;
    string queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.Length; i++)
    {
        // accept both the short and the long form advertised in the usage string
        if ("-f".Equals(args[i], StringComparison.Ordinal) || "-field".Equals(args[i], StringComparison.Ordinal) || "--field".Equals(args[i], StringComparison.Ordinal))
        {
            field = args[i + 1];
            i++;
        }
        else if ("-qf".Equals(args[i], StringComparison.Ordinal) || "--queries-file".Equals(args[i], StringComparison.Ordinal))
        {
            queries = args[i + 1];
            i++;
        }
        else if ("-q".Equals(args[i], StringComparison.Ordinal) || "--query".Equals(args[i], StringComparison.Ordinal))
        {
            queryString = args[i + 1];
            i++;
        }
        else if ("-r".Equals(args[i], StringComparison.Ordinal) || "--repeat".Equals(args[i], StringComparison.Ordinal))
        {
            repeat = int.Parse(args[i + 1], CultureInfo.InvariantCulture);
            i++;
        }
        else if ("--raw".Equals(args[i], StringComparison.Ordinal))
        {
            raw = true;
        }
        else if ("-p".Equals(args[i], StringComparison.Ordinal) || "--paging".Equals(args[i], StringComparison.Ordinal) || "--page-size".Equals(args[i], StringComparison.Ordinal))
        {
            hitsPerPage = int.Parse(args[i + 1], CultureInfo.InvariantCulture);
            if (hitsPerPage <= 0)
            {
                Console.WriteLine("There must be at least 1 hit per page.");
                Environment.Exit(1);
            }
            i++;
        }
    }

    using IndexReader reader = DirectoryReader.Open(FSDirectory.Open(index));
    IndexSearcher searcher = new IndexSearcher(reader);
    // :Post-Release-Update-Version.LUCENE_XY:
    Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);

    TextReader input = null;
    if (queries != null)
    {
        input = new StreamReader(new FileStream(queries, FileMode.Open, FileAccess.Read), Encoding.UTF8);
    }
    else
    {
        input = Console.In;
    }
    // :Post-Release-Update-Version.LUCENE_XY:
    QueryParser parser = new QueryParser(LuceneVersion.LUCENE_48, field, analyzer);
    while (true)
    {
        if (queries is null && queryString is null)
        {
            // prompt the user
            Console.WriteLine("Enter query (or press Enter to exit): ");
        }

        string line = queryString ?? input.ReadLine();

        if (line is null || line.Length == 0)
        {
            break;
        }

        line = line.Trim();
        if (line.Length == 0)
        {
            break;
        }

        Query query = parser.Parse(line);
        Console.WriteLine("Searching for: " + query.ToString(field));

        if (repeat > 0) // repeat & time as benchmark
        {
            DateTime start = DateTime.UtcNow;
            for (int i = 0; i < repeat; i++)
            {
                searcher.Search(query, null, 100);
            }
            DateTime end = DateTime.UtcNow;
            Console.WriteLine("Time: " + (end - start).TotalMilliseconds + "ms");
        }

        DoPagingSearch(searcher, query, hitsPerPage, raw, queries is null && queryString is null);

        if (queryString != null)
        {
            break;
        }
    }
}
/// <summary>
/// An <see cref="IndexReader"/> is an instance of the index at a given point in time.
/// We need to update this reader by reopening the underlying <see cref="IndexReader"/>.
/// Maybe change this method later?
/// </summary>
/// <param name="appName"></param>
private void UpdateReader(string appName)
{
    _readers[appName] = DirectoryReader.OpenIfChanged(_readers[appName] as DirectoryReader) ?? _readers[appName];
}
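Note that `DirectoryReader.OpenIfChanged` returns `null` when the index is unchanged, and when it does return a new reader the old instance is not closed for you, so the method above leaves the stale reader undisposed. A minimal sketch of the usual refresh-and-dispose pattern (a hypothetical variant of the method above; disposing immediately is only safe if no other thread is still searching the old reader):

```csharp
private void UpdateReaderAndDisposeOld(string appName)
{
    var oldReader = (DirectoryReader)_readers[appName];
    var newReader = DirectoryReader.OpenIfChanged(oldReader); // null => index unchanged
    if (newReader != null)
    {
        _readers[appName] = newReader;
        oldReader.Dispose(); // release the stale reader (assumes no concurrent searches on it)
    }
}
```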
public void Index()
{
    var indexConfig = LuceneIndexDefaults.CreateStandardIndexWriterConfig();
    long readCount = 0;

    // Read All lines in the file (IEnumerable, yield)
    // And group them by QCode.
    var subjectGroups = FileHelper.GetInputLines(InputFilename).GroupBySubject();

    using var luceneDirectory = FSDirectory.Open(EntitiesIndexPath);
    using var luceneDirectoryReader = DirectoryReader.Open(luceneDirectory);

    var docCount = luceneDirectoryReader.MaxDoc;
    for (var i = 0; i < docCount; i++)
    {
        var doc = luceneDirectoryReader.Document(i);
        var entity = doc.MapEntity();

        var reverseProperties = entity.ReverseProperties.Select(x => x.Id.ToInt()).ToList();
        var properties = entity.Properties.Select(x => x.Id.ToInt()).ToList();
        //TODO: Use constant:
        var otherProperties = properties.Where(x => !x.Equals(31)).ToList();
        var types = entity.ParentTypes.Select(x => x.ToInt()).ToList();
        var isType = entity.IsType;

        //Range
        //TODO: Use constant:
        //if (isType) RangeDictionary.AddSafe(31, types);
        foreach (var reversePropertyId in reverseProperties)
        {
            RangeDictionary.AddSafe(reversePropertyId, types);
        }

        //Domain
        DomainDictionary.AddSafe(31, types);
        foreach (var propertyId in otherProperties)
        {
            DomainDictionary.AddSafe(propertyId, types);
        }

        //Frequency
        foreach (var propertyIntId in properties)
        {
            if (!FrequencyHashTable.ContainsKey(propertyIntId))
            {
                FrequencyHashTable.Add(propertyIntId, 0);
            }
            FrequencyHashTable[propertyIntId] = (int)FrequencyHashTable[propertyIntId] + 1;
        }

        LogMessage(readCount++, "Frequency, Domain, Range", false);
    }
    LogMessage(readCount, "Frequency, Domain, Range");

    readCount = 0;
    using (var indexDirectory = FSDirectory.Open(OutputDirectory.GetOrCreateDirectory()))
    {
        using var writer = new IndexWriter(indexDirectory, indexConfig);
        foreach (var subjectGroup in subjectGroups.Where(FilterGroups))
        {
            var document = new Document();

            foreach (var field in FrequencyGetField(subjectGroup))
            {
                document.Add(field);
            }
            foreach (var field in DomainGetField(subjectGroup))
            {
                document.Add(field);
            }
            foreach (var field in RangeGetField(subjectGroup))
            {
                document.Add(field);
            }

            var boostField = document.Fields.FirstOrDefault(x => x.Name.Equals(Labels.Rank.ToString()));
            var boost = 0.0;
            if (boostField != null)
            {
                boost = (double)boostField.GetDoubleValue();
            }
            foreach (var fieldIndexer in FieldIndexers)
            {
                fieldIndexer.Boost = boost;
            }

            foreach (var fieldIndexer in FieldIndexers)
            {
                foreach (var field in fieldIndexer.GetField(subjectGroup))
                {
                    document.Add(field);
                }
            }

            LogProgress(readCount++);
            writer.AddDocument(document);
        }
    }
    LogProgress(readCount, true);
}
/// <summary>
/// Performs the search.
/// </summary>
/// <param name="options">search options</param>
/// <param name="safeSearch">enable safe search</param>
/// <returns></returns>
private ILuceneSearchResultCollection PerformSearch(SearchOptions options, bool safeSearch)
{
    // result collection
    ILuceneSearchResultCollection results = new LuceneSearchResultCollection();
    using var reader = DirectoryReader.Open(_directory);
    var searcher = new IndexSearcher(reader);
    Query query;

    // safe search enabled: escape the keywords first
    if (safeSearch)
    {
        options.Keywords = QueryParserBase.Escape(options.Keywords);
    }

    if (options.Fields.Count == 1)
    {
        // single-field search
        var queryParser = new QueryParser(Lucene.Net.Util.LuceneVersion.LUCENE_48, options.Fields[0], _analyzer);
        query = queryParser.Parse(options.Keywords);
    }
    else
    {
        // multi-field search
        var multiFieldQueryParser = new MultiFieldQueryParser(Lucene.Net.Util.LuceneVersion.LUCENE_48, options.Fields.ToArray(), _analyzer, options.Boosts);
        query = GetFuzzyquery(multiFieldQueryParser, options.Keywords);
    }

    var sortFields = new List<SortField> { SortField.FIELD_SCORE };
    sortFields.AddRange(options.OrderBy.Select(sortField => new SortField(sortField, SortFieldType.STRING)));

    // apply the sort rules
    var sort = new Sort(sortFields.ToArray());
    Expression<Func<ScoreDoc, bool>> where = m => m.Score >= options.Score;
    if (options.Type != null)
    {
        // filter by the type that was set on the options
        @where = @where.And(m => options.Type.AssemblyQualifiedName == searcher.Doc(m.Doc).Get("Type"));
    }
    var matches = searcher.Search(query, null, options.MaximumNumberOfHits, sort, true, true).ScoreDocs.Where(@where.Compile());

    results.TotalHits = matches.Count();

    // paging
    if (options.Skip.HasValue)
    {
        matches = matches.Skip(options.Skip.Value);
    }
    if (options.Take.HasValue)
    {
        matches = matches.Take(options.Take.Value);
    }

    var docs = matches.ToList();
    // build the result collection
    foreach (var match in docs)
    {
        var doc = searcher.Doc(match.Doc);
        results.Results.Add(new LuceneSearchResult()
        {
            Score = match.Score,
            Document = doc
        });
    }
    return results;
}
public virtual void TestNonIndexedFields()
{
    Directory dir = NewDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
    Document doc = new Document();
    doc.Add(new StoredField("bogusbytes", "bogus"));
    doc.Add(new StoredField("bogusshorts", "bogus"));
    doc.Add(new StoredField("bogusints", "bogus"));
    doc.Add(new StoredField("boguslongs", "bogus"));
    doc.Add(new StoredField("bogusfloats", "bogus"));
    doc.Add(new StoredField("bogusdoubles", "bogus"));
    doc.Add(new StoredField("bogusterms", "bogus"));
    doc.Add(new StoredField("bogustermsindex", "bogus"));
    doc.Add(new StoredField("bogusmultivalued", "bogus"));
    doc.Add(new StoredField("bogusbits", "bogus"));
    iw.AddDocument(doc);
    DirectoryReader ir = iw.Reader;
    iw.Dispose();

    AtomicReader ar = GetOnlySegmentReader(ir);

    IFieldCache cache = FieldCache.DEFAULT;
    cache.PurgeAllCaches();
    Assert.AreEqual(0, cache.CacheEntries.Length);

    Bytes bytes = cache.GetBytes(ar, "bogusbytes", true);
    Assert.AreEqual(0, bytes.Get(0));

    Shorts shorts = cache.GetShorts(ar, "bogusshorts", true);
    Assert.AreEqual(0, shorts.Get(0));

    Ints ints = cache.GetInts(ar, "bogusints", true);
    Assert.AreEqual(0, ints.Get(0));

    Longs longs = cache.GetLongs(ar, "boguslongs", true);
    Assert.AreEqual(0, longs.Get(0));

    Floats floats = cache.GetFloats(ar, "bogusfloats", true);
    Assert.AreEqual(0, floats.Get(0), 0.0f);

    Doubles doubles = cache.GetDoubles(ar, "bogusdoubles", true);
    Assert.AreEqual(0, doubles.Get(0), 0.0D);

    BytesRef scratch = new BytesRef();
    BinaryDocValues binaries = cache.GetTerms(ar, "bogusterms", true);
    binaries.Get(0, scratch);
    Assert.AreEqual(0, scratch.Length);

    SortedDocValues sorted = cache.GetTermsIndex(ar, "bogustermsindex");
    Assert.AreEqual(-1, sorted.GetOrd(0));
    sorted.Get(0, scratch);
    Assert.AreEqual(0, scratch.Length);

    SortedSetDocValues sortedSet = cache.GetDocTermOrds(ar, "bogusmultivalued");
    sortedSet.Document = 0;
    Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, sortedSet.NextOrd());

    Bits bits = cache.GetDocsWithField(ar, "bogusbits");
    Assert.IsFalse(bits.Get(0));

    // check that we cached nothing
    Assert.AreEqual(0, cache.CacheEntries.Length);
    ir.Dispose();
    dir.Dispose();
}
/// <summary>
/// Open the <see cref="DirectoryReader"/> from this <see cref="Directory"/>.
/// </summary>
protected virtual DirectoryReader OpenIndexReader(Directory directory)
{
    return DirectoryReader.Open(directory);
}
public void TestRandomDiscreteMultiValueHighlighting()
{
    String[] randomValues = new String[3 + Random().nextInt(10 * RANDOM_MULTIPLIER)];
    for (int i = 0; i < randomValues.Length; i++)
    {
        String randomValue;
        do
        {
            randomValue = TestUtil.RandomSimpleString(Random());
        } while ("".equals(randomValue));
        randomValues[i] = randomValue;
    }

    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
        Random(),
        dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorOffsets = true;
    customType.StoreTermVectorPositions = true;

    int numDocs = randomValues.Length * 5;
    int numFields = 2 + Random().nextInt(5);
    int numTerms = 2 + Random().nextInt(3);
    List<Doc> docs = new List<Doc>(numDocs);
    List<Document> documents = new List<Document>(numDocs);
    IDictionary<String, ISet<int>> valueToDocId = new HashMap<String, ISet<int>>();
    for (int i = 0; i < numDocs; i++)
    {
        Document document = new Document();
        String[][] fields = RectangularArrays.ReturnRectangularArray<string>(numFields, numTerms); //new String[numFields][numTerms];
        for (int j = 0; j < numFields; j++)
        {
            String[] fieldValues = new String[numTerms];
            fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
            StringBuilder builder = new StringBuilder(fieldValues[0]);
            for (int k = 1; k < numTerms; k++)
            {
                fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
                builder.append(' ').append(fieldValues[k]);
            }
            document.Add(new Field(F, builder.toString(), customType));
            fields[j] = fieldValues;
        }
        docs.Add(new Doc(fields));
        documents.Add(document);
    }
    writer.AddDocuments(documents);
    writer.Dispose();
    IndexReader reader = DirectoryReader.Open(dir);

    try
    {
        int highlightIters = 1 + Random().nextInt(120 * RANDOM_MULTIPLIER);
        for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++)
        {
            String queryTerm = randomValues[Random().nextInt(randomValues.Length)];
            var iter = valueToDocId[queryTerm].GetEnumerator();
            iter.MoveNext();
            int randomHit = iter.Current;
            List<StringBuilder> builders = new List<StringBuilder>();
            foreach (String[] fieldValues in docs[randomHit].fieldValues)
            {
                StringBuilder builder = new StringBuilder();
                bool hit = false;
                for (int i = 0; i < fieldValues.Length; i++)
                {
                    if (queryTerm.equals(fieldValues[i]))
                    {
                        builder.append("<b>").append(queryTerm).append("</b>");
                        hit = true;
                    }
                    else
                    {
                        builder.append(fieldValues[i]);
                    }
                    if (i != fieldValues.Length - 1)
                    {
                        builder.append(' ');
                    }
                }
                if (hit)
                {
                    builders.Add(builder);
                }
            }

            FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
            FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);

            FieldPhraseList fpl = new FieldPhraseList(stack, fq);
            SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
            FieldFragList ffl = sflb.CreateFieldFragList(fpl, 300);

            SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
            sfb.IsDiscreteMultiValueHighlighting = true;
            String[] actualFragments = sfb.CreateFragments(reader, randomHit, F, ffl, numFields);
            assertEquals(builders.size(), actualFragments.Length);
            for (int i = 0; i < actualFragments.Length; i++)
            {
                assertEquals(builders[i].toString(), actualFragments[i]);
            }
        }
    }
    finally
    {
        reader.Dispose();
        dir.Dispose();
    }
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // LUCENENET specific - using GetRandomFileName() instead of picking a random int
    DirectoryInfo tempIndexPath = null;
    while (true)
    {
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
        tempIndexPath.Create();
        if (System.IO.Directory.Exists(tempIndexPath.FullName))
        {
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);
    try
    {
#pragma warning disable 612, 618
        IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
        iwc.SetOpenMode(OpenMode.CREATE);
        iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
        IndexWriter writer = new IndexWriter(dir, iwc);

        var ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
        ft.OmitNorms = true;
        ft.Freeze();

        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.Add(field);

        totTokens = 0;
        IndexReader reader = null;

        bool success = false;
        count = 0;
        try
        {
            while (true)
            {
                BytesRef surfaceForm = iterator.Next();
                if (surfaceForm == null)
                {
                    break;
                }
                field.SetStringValue(surfaceForm.Utf8ToString());
                writer.AddDocument(doc);
                count++;
            }
            reader = DirectoryReader.Open(writer, false);

            Terms terms = MultiFields.GetTerms(reader, "body");
            if (terms == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }

            // Move all ngrams into an FST:
            TermsEnum termsEnum = terms.GetIterator(null);
            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

            Int32sRef scratchInts = new Int32sRef();
            while (true)
            {
                BytesRef term = termsEnum.Next();
                if (term == null)
                {
                    break;
                }
                int ngramCount = CountGrams(term);
                if (ngramCount > grams)
                {
                    throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                }
                if (ngramCount == 1)
                {
                    totTokens += termsEnum.TotalTermFreq;
                }

                builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
            }

            fst = builder.Finish();
            if (fst == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }
            //System.out.println("FST: " + fst.getNodeCount() + " nodes");

            /*
             * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
             * Util.toDot(fst, pw, true, true);
             * pw.close();
             */

            success = true;
        }
        finally
        {
            if (success)
            {
                IOUtils.Dispose(writer, reader);
            }
            else
            {
                IOUtils.DisposeWhileHandlingException(writer, reader);
            }
        }
    }
    finally
    {
        try
        {
            IOUtils.Dispose(dir);
        }
        finally
        {
            // LUCENENET specific - since we are removing the entire directory anyway,
            // it doesn't make sense to first do a loop in order remove the files.
            // Let the System.IO.Directory.Delete() method handle that.
            // We also need to dispose the Directory instance first before deleting from disk.
            try
            {
                System.IO.Directory.Delete(tempIndexPath.FullName, true);
            }
            catch (Exception e)
            {
                throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
            }
        }
    }
}
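As a usage note, the n-gram and separator-byte checks above suggest this `Build` overload belongs to `FreeTextSuggester`. Assuming that, a minimal sketch of feeding it a dictionary with a temporary RAM budget; the `reader` variable and the `"body"` field name are hypothetical:

```csharp
// Hypothetical usage sketch for the Build(IInputIterator, double) overload above.
var suggester = new FreeTextSuggester(new StandardAnalyzer(LuceneVersion.LUCENE_48));
suggester.Build(new LuceneDictionary(reader, "body").GetEntryIterator(), 256.0); // up to ~256 MB temp RAM
var top = suggester.DoLookup("lucene i", false, 5); // top-5 continuations of the typed prefix
```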
public void TestRandom()
{
    int numberOfRuns = TestUtil.NextInt32(Random, 3, 6);
    for (int iter = 0; iter < numberOfRuns; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine(string.Format("TEST: iter={0} total={1}", iter, numberOfRuns));
        }

        int numDocs = TestUtil.NextInt32(Random, 100, 1000) * RANDOM_MULTIPLIER;
        int numGroups = TestUtil.NextInt32(Random, 1, numDocs);

        if (VERBOSE)
        {
            Console.WriteLine("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
        }

        List<BytesRef> groups = new List<BytesRef>();
        for (int i = 0; i < numGroups; i++)
        {
            string randomValue;
            do
            {
                // B/c of DV based impl we can't see the difference between an empty string and a null value.
                // For that reason we don't generate empty string groups.
                randomValue = TestUtil.RandomRealisticUnicodeString(Random);
            } while ("".Equals(randomValue, StringComparison.Ordinal));
            groups.Add(new BytesRef(randomValue));
        }

        string[] contentStrings = new string[TestUtil.NextInt32(Random, 2, 20)];
        if (VERBOSE)
        {
            Console.WriteLine("TEST: create fake content");
        }
        for (int contentIDX = 0; contentIDX < contentStrings.Length; contentIDX++)
        {
            StringBuilder sb = new StringBuilder();
            sb.append("real").append(Random.nextInt(3)).append(' ');
            int fakeCount = Random.nextInt(10);
            for (int fakeIDX = 0; fakeIDX < fakeCount; fakeIDX++)
            {
                sb.append("fake ");
            }
            contentStrings[contentIDX] = sb.toString();
            if (VERBOSE)
            {
                Console.WriteLine(" content=" + sb.toString());
            }
        }

        Directory dir = NewDirectory();
        RandomIndexWriter w = new RandomIndexWriter(
            Random,
            dir,
            NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
        bool preFlex = "Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal);
        bool canUseIDV = !preFlex;
        DocValuesType valueType = vts[Random.nextInt(vts.Length)];

        Document doc = new Document();
        Document docNoGroup = new Document();
        Field group = NewStringField("group", "", Field.Store.NO);
        doc.Add(group);
        Field valuesField = null;
        if (canUseIDV)
        {
            switch (valueType)
            {
                case DocValuesType.BINARY:
                    valuesField = new BinaryDocValuesField("group_dv", new BytesRef());
                    break;
                case DocValuesType.SORTED:
                    valuesField = new SortedDocValuesField("group_dv", new BytesRef());
                    break;
                default:
                    fail("unhandled type");
                    break;
            }
            doc.Add(valuesField);
        }
        Field sort1 = NewStringField("sort1", "", Field.Store.NO);
        doc.Add(sort1);
        docNoGroup.Add(sort1);
        Field sort2 = NewStringField("sort2", "", Field.Store.NO);
        doc.Add(sort2);
        docNoGroup.Add(sort2);
        Field sort3 = NewStringField("sort3", "", Field.Store.NO);
        doc.Add(sort3);
        docNoGroup.Add(sort3);
        Field content = NewTextField("content", "", Field.Store.NO);
        doc.Add(content);
        docNoGroup.Add(content);
        Int32Field id = new Int32Field("id", 0, Field.Store.NO);
        doc.Add(id);
        docNoGroup.Add(id);
        GroupDoc[] groupDocs = new GroupDoc[numDocs];
        for (int i = 0; i < numDocs; i++)
        {
            BytesRef groupValue;
            if (Random.nextInt(24) == 17)
            {
                // So we test the "doc doesn't have the group'd
                // field" case:
                groupValue = null;
            }
            else
            {
                groupValue = groups[Random.nextInt(groups.size())];
            }

            GroupDoc groupDoc = new GroupDoc(
                i,
                groupValue,
                groups[Random.nextInt(groups.size())],
                groups[Random.nextInt(groups.size())],
                new BytesRef(string.Format(CultureInfo.InvariantCulture, "{0:D5}", i)),
                contentStrings[Random.nextInt(contentStrings.Length)]
            );
            if (VERBOSE)
            {
                Console.WriteLine(" doc content=" + groupDoc.content + " id=" + i + " group=" + (groupDoc.group == null ? "null" : groupDoc.group.Utf8ToString()) + " sort1=" + groupDoc.sort1.Utf8ToString() + " sort2=" + groupDoc.sort2.Utf8ToString() + " sort3=" + groupDoc.sort3.Utf8ToString());
            }

            groupDocs[i] = groupDoc;
            if (groupDoc.group != null)
            {
                group.SetStringValue(groupDoc.group.Utf8ToString());
                if (canUseIDV)
                {
                    valuesField.SetBytesValue(new BytesRef(groupDoc.group.Utf8ToString()));
                }
            }
            sort1.SetStringValue(groupDoc.sort1.Utf8ToString());
            sort2.SetStringValue(groupDoc.sort2.Utf8ToString());
            sort3.SetStringValue(groupDoc.sort3.Utf8ToString());
            content.SetStringValue(groupDoc.content);
            id.SetInt32Value(groupDoc.id);
            if (groupDoc.group == null)
            {
                w.AddDocument(docNoGroup);
            }
            else
            {
                w.AddDocument(doc);
            }
        }

        DirectoryReader r = w.GetReader();
        w.Dispose();

        // NOTE: intentional but temporary field cache insanity!
        FieldCache.Int32s docIdToFieldId = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false);
        int[] fieldIdToDocID = new int[numDocs];
        for (int i = 0; i < numDocs; i++)
        {
            int fieldId = docIdToFieldId.Get(i);
            fieldIdToDocID[fieldId] = i;
        }

        try
        {
            IndexSearcher s = NewSearcher(r);
            if (typeof(SlowCompositeReaderWrapper).GetTypeInfo().IsAssignableFrom(s.IndexReader.GetType()))
            {
                canUseIDV = false;
            }
            else
            {
                canUseIDV = !preFlex;
            }

            for (int contentID = 0; contentID < 3; contentID++)
            {
                ScoreDoc[] hits = s.Search(new TermQuery(new Term("content", "real" + contentID)), numDocs).ScoreDocs;
                foreach (ScoreDoc hit in hits)
                {
                    GroupDoc gd = groupDocs[docIdToFieldId.Get(hit.Doc)];
                    assertTrue(gd.score == 0.0);
                    gd.score = hit.Score;
                    int docId = gd.id;
                    assertEquals(docId, docIdToFieldId.Get(hit.Doc));
                }
            }

            foreach (GroupDoc gd in groupDocs)
            {
                assertTrue(gd.score != 0.0);
            }

            for (int searchIter = 0; searchIter < 100; searchIter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: searchIter=" + searchIter);
                }

                string searchTerm = "real" + Random.nextInt(3);
                bool sortByScoreOnly = Random.nextBoolean();
                Sort sortWithinGroup = GetRandomSort(sortByScoreOnly);
                AbstractAllGroupHeadsCollector allGroupHeadsCollector = CreateRandomCollector("group", sortWithinGroup, canUseIDV, valueType);
                s.Search(new TermQuery(new Term("content", searchTerm)), allGroupHeadsCollector);
                int[] expectedGroupHeads = CreateExpectedGroupHeads(searchTerm, groupDocs, sortWithinGroup, sortByScoreOnly, fieldIdToDocID);
                int[] actualGroupHeads = allGroupHeadsCollector.RetrieveGroupHeads();
                // The actual group heads contains Lucene ids. Need to change them into our id value.
                for (int i = 0; i < actualGroupHeads.Length; i++)
                {
                    actualGroupHeads[i] = docIdToFieldId.Get(actualGroupHeads[i]);
                }
                // Allows us the easily iterate and assert the actual and expected results.
                Array.Sort(expectedGroupHeads);
                Array.Sort(actualGroupHeads);

                if (VERBOSE)
                {
                    Console.WriteLine("Collector: " + allGroupHeadsCollector.GetType().Name);
                    Console.WriteLine("Sort within group: " + sortWithinGroup);
                    Console.WriteLine("Num group: " + numGroups);
                    Console.WriteLine("Num doc: " + numDocs);
                    Console.WriteLine("\n=== Expected: \n");
                    foreach (int expectedDocId in expectedGroupHeads)
                    {
                        GroupDoc expectedGroupDoc = groupDocs[expectedDocId];
                        string expectedGroup = expectedGroupDoc.group == null ? null : expectedGroupDoc.group.Utf8ToString();
                        Console.WriteLine(
                            string.Format(CultureInfo.InvariantCulture,
                                "Group:{0,10} score{1:0.0#######,5} Sort1:{2,10} Sort2:{3,10} Sort3:{4,10} doc:{5,10}",
                                expectedGroup, expectedGroupDoc.score, expectedGroupDoc.sort1.Utf8ToString(),
                                expectedGroupDoc.sort2.Utf8ToString(), expectedGroupDoc.sort3.Utf8ToString(), expectedDocId)
                        );
                    }
                    Console.WriteLine("\n=== Actual: \n");
                    foreach (int actualDocId in actualGroupHeads)
                    {
                        GroupDoc actualGroupDoc = groupDocs[actualDocId];
                        string actualGroup = actualGroupDoc.group == null ? null : actualGroupDoc.group.Utf8ToString();
                        Console.WriteLine(
                            string.Format(CultureInfo.InvariantCulture,
                                "Group:{0,10} score{1:0.0#######,5} Sort1:{2,10} Sort2:{3,10} Sort3:{4,10} doc:{5,10}",
                                actualGroup, actualGroupDoc.score, actualGroupDoc.sort1.Utf8ToString(),
                                actualGroupDoc.sort2.Utf8ToString(), actualGroupDoc.sort3.Utf8ToString(), actualDocId)
                        );
                    }
                    Console.WriteLine("\n===================================================================================");
                }

                assertArrayEquals(expectedGroupHeads, actualGroupHeads);
            }
        }
        finally
        {
            QueryUtils.PurgeFieldCache(r);
        }

        r.Dispose();
        dir.Dispose();
    }
}
public virtual void TestDocValuesIntegration()
{
    AssumeTrue("3.x does not support docvalues", DefaultCodecSupportsDocValues);
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, null);
    RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
    Document doc = new Document();
    doc.Add(new BinaryDocValuesField("binary", new BytesRef("binary value")));
    doc.Add(new SortedDocValuesField("sorted", new BytesRef("sorted value")));
    doc.Add(new NumericDocValuesField("numeric", 42));
    if (DefaultCodecSupportsSortedSet)
    {
        doc.Add(new SortedSetDocValuesField("sortedset", new BytesRef("sortedset value1")));
        doc.Add(new SortedSetDocValuesField("sortedset", new BytesRef("sortedset value2")));
    }
    iw.AddDocument(doc);
    DirectoryReader ir = iw.GetReader();
    iw.Dispose();
    AtomicReader ar = GetOnlySegmentReader(ir);
    BytesRef scratch = new BytesRef();

    // Binary type: can be retrieved via getTerms()
    try { FieldCache.DEFAULT.GetInt32s(ar, "binary", false); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    BinaryDocValues binary = FieldCache.DEFAULT.GetTerms(ar, "binary", true);
    binary.Get(0, scratch);
    Assert.AreEqual("binary value", scratch.Utf8ToString());

    try { FieldCache.DEFAULT.GetTermsIndex(ar, "binary"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { FieldCache.DEFAULT.GetDocTermOrds(ar, "binary"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { new DocTermOrds(ar, null, "binary"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    IBits bits = FieldCache.DEFAULT.GetDocsWithField(ar, "binary");
    Assert.IsTrue(bits.Get(0));

    // Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds()
    try { FieldCache.DEFAULT.GetInt32s(ar, "sorted", false); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { new DocTermOrds(ar, null, "sorted"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    binary = FieldCache.DEFAULT.GetTerms(ar, "sorted", true);
    binary.Get(0, scratch);
    Assert.AreEqual("sorted value", scratch.Utf8ToString());

    SortedDocValues sorted = FieldCache.DEFAULT.GetTermsIndex(ar, "sorted");
    Assert.AreEqual(0, sorted.GetOrd(0));
    Assert.AreEqual(1, sorted.ValueCount);
    sorted.Get(0, scratch);
    Assert.AreEqual("sorted value", scratch.Utf8ToString());

    SortedSetDocValues sortedSet = FieldCache.DEFAULT.GetDocTermOrds(ar, "sorted");
    sortedSet.SetDocument(0);
    Assert.AreEqual(0, sortedSet.NextOrd());
    Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, sortedSet.NextOrd());
    Assert.AreEqual(1, sortedSet.ValueCount);

    bits = FieldCache.DEFAULT.GetDocsWithField(ar, "sorted");
    Assert.IsTrue(bits.Get(0));

    // Numeric type: can be retrieved via getInts() and so on
    Int32s numeric = FieldCache.DEFAULT.GetInt32s(ar, "numeric", false);
    Assert.AreEqual(42, numeric.Get(0));

    try { FieldCache.DEFAULT.GetTerms(ar, "numeric", true); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { FieldCache.DEFAULT.GetTermsIndex(ar, "numeric"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { FieldCache.DEFAULT.GetDocTermOrds(ar, "numeric"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    try { new DocTermOrds(ar, null, "numeric"); Assert.Fail(); }
#pragma warning disable 168
    catch (InvalidOperationException expected) { }
#pragma warning restore 168

    bits = FieldCache.DEFAULT.GetDocsWithField(ar, "numeric");
    Assert.IsTrue(bits.Get(0));

    // SortedSet type: can be retrieved via getDocTermOrds()
    if (DefaultCodecSupportsSortedSet)
    {
        try { FieldCache.DEFAULT.GetInt32s(ar, "sortedset", false); Assert.Fail(); }
#pragma warning disable 168
        catch (InvalidOperationException expected) { }
#pragma warning restore 168

        try { FieldCache.DEFAULT.GetTerms(ar, "sortedset", true); Assert.Fail(); }
#pragma warning disable 168
        catch (InvalidOperationException expected) { }
#pragma warning restore 168

        try { FieldCache.DEFAULT.GetTermsIndex(ar, "sortedset"); Assert.Fail(); }
#pragma warning disable 168
        catch (InvalidOperationException expected) { }
#pragma warning restore 168

        try { new DocTermOrds(ar, null, "sortedset"); Assert.Fail(); }
#pragma warning disable 168
        catch (InvalidOperationException expected) { }
#pragma warning restore 168

        sortedSet = FieldCache.DEFAULT.GetDocTermOrds(ar, "sortedset");
        sortedSet.SetDocument(0);
        Assert.AreEqual(0, sortedSet.NextOrd());
        Assert.AreEqual(1, sortedSet.NextOrd());
        Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, sortedSet.NextOrd());
        Assert.AreEqual(2, sortedSet.ValueCount);

        bits = FieldCache.DEFAULT.GetDocsWithField(ar, "sortedset");
        Assert.IsTrue(bits.Get(0));
    }

    ir.Dispose();
    dir.Dispose();
}
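// A minimal consumer-side sketch of the SetDocument()/NextOrd() protocol the
// test above exercises for SORTED_SET fields. It is illustrative only: the
// method name and parameters are assumptions, not part of the test; any
// AtomicReader with a multi-valued field would do.
private static void DumpSortedSetValues(AtomicReader ar, string field, int docId)
{
    SortedSetDocValues dv = FieldCache.DEFAULT.GetDocTermOrds(ar, field);
    BytesRef scratch = new BytesRef();
    dv.SetDocument(docId); // position the iterator on one document
    long ord;
    while ((ord = dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
    {
        dv.LookupOrd(ord, scratch); // resolve the ordinal back to its term bytes
        Console.WriteLine(scratch.Utf8ToString());
    }
}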
public override void BeforeClass()
{
    base.BeforeClass();
    dir = NewDirectory();
    sdir1 = NewDirectory();
    sdir2 = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, new MockAnalyzer(Random));
    RandomIndexWriter swriter1 = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, sdir1, new MockAnalyzer(Random));
    RandomIndexWriter swriter2 = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, sdir2, new MockAnalyzer(Random));

    for (int i = 0; i < 10; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("data", Convert.ToString(i), Field.Store.NO));
        writer.AddDocument(doc);
        ((i % 2 == 0) ? swriter1 : swriter2).AddDocument(doc);
    }
    writer.ForceMerge(1);
    swriter1.ForceMerge(1);
    swriter2.ForceMerge(1);
    writer.Dispose();
    swriter1.Dispose();
    swriter2.Dispose();

    reader = DirectoryReader.Open(dir);
    searcher = NewSearcher(reader);
    multiReader = new MultiReader(new IndexReader[] { DirectoryReader.Open(sdir1), DirectoryReader.Open(sdir2) }, true);
    multiSearcher = NewSearcher(multiReader);
    multiReaderDupls = new MultiReader(new IndexReader[] { DirectoryReader.Open(sdir1), DirectoryReader.Open(dir) }, true);
    multiSearcherDupls = NewSearcher(multiReaderDupls);
}
public void TestLazy()
{
    int id = Random.nextInt(NUM_DOCS);
    IndexReader reader = DirectoryReader.Open(dir);
    try
    {
        Query q = new TermQuery(new Term("docid", "" + id));
        IndexSearcher searcher = NewSearcher(reader);
        ScoreDoc[] hits = searcher.Search(q, 100).ScoreDocs;
        assertEquals("Too many docs", 1, hits.Length);
        LazyTestingStoredFieldVisitor visitor = new LazyTestingStoredFieldVisitor(new LazyDocument(reader, hits[0].Doc), FIELDS);
        reader.Document(hits[0].Doc, visitor);
        Document d = visitor.doc;

        int numFieldValues = 0;
        IDictionary<string, int> fieldValueCounts = new JCG.Dictionary<string, int>();

        // at this point, all FIELDS should be lazy and unrealized
        foreach (IIndexableField f in d)
        {
            numFieldValues++;
            if (f.Name.Equals("never_load", StringComparison.Ordinal))
            {
                fail("never_load was loaded");
            }
            if (f.Name.Equals("load_later", StringComparison.Ordinal))
            {
                fail("load_later was loaded on first pass");
            }
            if (f.Name.Equals("docid", StringComparison.Ordinal))
            {
                assertFalse(f.Name, f is LazyDocument.LazyField);
            }
            else
            {
                if (!fieldValueCounts.TryGetValue(f.Name, out int count))
                {
                    count = 0;
                }
                count++;
                fieldValueCounts.Put(f.Name, count);
                assertTrue(f.Name + " is " + f.GetType(), f is LazyDocument.LazyField);
                LazyDocument.LazyField lf = (LazyDocument.LazyField)f;
                assertFalse(f.Name + " is loaded", lf.HasBeenLoaded);
            }
        }
        Console.WriteLine("numFieldValues == " + numFieldValues);
        assertEquals("numFieldValues", 1 + (NUM_VALUES * FIELDS.Length), numFieldValues);

        foreach (string field in fieldValueCounts.Keys)
        {
            assertEquals("fieldName count: " + field, NUM_VALUES, fieldValueCounts[field]);
        }

        // pick a single field name to load a single value
        string fieldName = FIELDS[Random.nextInt(FIELDS.Length)];
        IIndexableField[] fieldValues = d.GetFields(fieldName);
        assertEquals("#vals in field: " + fieldName, NUM_VALUES, fieldValues.Length);
        int valNum = Random.nextInt(fieldValues.Length);
        assertEquals(id + "_" + fieldName + "_" + valNum, fieldValues[valNum].GetStringValue());

        // now every value of fieldName should be loaded
        foreach (IIndexableField f in d)
        {
            if (f.Name.Equals("never_load", StringComparison.Ordinal))
            {
                fail("never_load was loaded");
            }
            if (f.Name.Equals("load_later", StringComparison.Ordinal))
            {
                fail("load_later was loaded too soon");
            }
            if (f.Name.Equals("docid", StringComparison.Ordinal))
            {
                assertFalse(f.Name, f is LazyDocument.LazyField);
            }
            else
            {
                assertTrue(f.Name + " is " + f.GetType(), f is LazyDocument.LazyField);
                LazyDocument.LazyField lf = (LazyDocument.LazyField)f;
                assertEquals(f.Name + " is loaded?", lf.Name.Equals(fieldName, StringComparison.Ordinal), lf.HasBeenLoaded);
            }
        }

        // use the same LazyDoc to ask for one more lazy field
        visitor = new LazyTestingStoredFieldVisitor(new LazyDocument(reader, hits[0].Doc), "load_later");
        reader.Document(hits[0].Doc, visitor);
        d = visitor.doc;

        // ensure we have all the values we expect now, and that adding one more
        // lazy field didn't "unload" the existing LazyFields we already loaded
        foreach (IIndexableField f in d)
        {
            if (f.Name.Equals("never_load", StringComparison.Ordinal))
            {
                fail("never_load was loaded");
            }
            if (f.Name.Equals("docid", StringComparison.Ordinal))
            {
                assertFalse(f.Name, f is LazyDocument.LazyField);
            }
            else
            {
                assertTrue(f.Name + " is " + f.GetType(), f is LazyDocument.LazyField);
                LazyDocument.LazyField lf = (LazyDocument.LazyField)f;
                assertEquals(f.Name + " is loaded?", lf.Name.Equals(fieldName, StringComparison.Ordinal), lf.HasBeenLoaded);
            }
        }

        // even the underlying doc shouldn't have never_load
        assertNull("never_load was loaded in wrapped doc", visitor.lazyDoc.GetDocument().GetField("never_load"));
    }
    finally
    {
        reader.Dispose();
    }
}
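// The LazyTestingStoredFieldVisitor used above is test-local and not shown
// here; the sketch below is an assumption-labeled illustration of the same
// idea, not the shipped class. It loads "docid" eagerly and registers the
// requested fields as LazyDocument.LazyField placeholders, so their bytes are
// only read from the index on first access.
internal class SketchLazyFieldVisitor : StoredFieldVisitor
{
    internal readonly Document doc = new Document();
    private readonly LazyDocument lazyDoc;
    private readonly ISet<string> lazyFieldNames;

    internal SketchLazyFieldVisitor(LazyDocument lazyDoc, params string[] fields)
    {
        this.lazyDoc = lazyDoc;
        this.lazyFieldNames = new JCG.HashSet<string>(fields);
    }

    public override Status NeedsField(FieldInfo fieldInfo)
    {
        if (fieldInfo.Name.Equals("docid", StringComparison.Ordinal))
        {
            return Status.YES; // the StringField override below stores it immediately
        }
        if (lazyFieldNames.Contains(fieldInfo.Name))
        {
            // register a lazy placeholder; the value is read only on first access
            doc.Add(lazyDoc.GetField(fieldInfo));
        }
        return Status.NO; // never materialize anything else (e.g. "never_load")
    }

    public override void StringField(FieldInfo fieldInfo, string value)
    {
        doc.Add(new StringField(fieldInfo.Name, value, Field.Store.YES));
    }
}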
/// <summary>
/// Construct a Taxonomy writer.
/// </summary>
/// <param name="directory">
///          The <seealso cref="Directory"/> in which to store the taxonomy. Note that
///          the taxonomy is written directly to that directory (not to a
///          subdirectory of it). </param>
/// <param name="openMode">
///          Specifies how to open a taxonomy for writing: <code>APPEND</code>
///          means open an existing index for append (failing if the index does
///          not yet exist). <code>CREATE</code> means create a new index (first
///          deleting the old one if it already existed).
///          <code>CREATE_OR_APPEND</code> appends to an existing index if there
///          is one, otherwise it creates a new index. </param>
/// <param name="cache">
///          A <seealso cref="TaxonomyWriterCache"/> implementation which determines
///          the in-memory caching policy. See for example
///          <seealso cref="LruTaxonomyWriterCache"/> and <seealso cref="Cl2oTaxonomyWriterCache"/>.
///          If null or missing, <seealso cref="#defaultTaxonomyWriterCache()"/> is used. </param>
/// <exception cref="CorruptIndexException">
///           if the taxonomy is corrupted. </exception>
/// <exception cref="LockObtainFailedException">
///           if the taxonomy is locked by another writer. If it is known
///           that no other concurrent writer is active, the lock might
///           have been left around by an old dead process, and should be
///           removed using <seealso cref="#unlock(Directory)"/>. </exception>
/// <exception cref="IOException">
///           if another error occurred. </exception>
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode, TaxonomyWriterCache cache)
{
    dir = directory;
    IndexWriterConfig config = CreateIndexWriterConfig(openMode);
    indexWriter = OpenIndexWriter(dir, config);

    // verify (to some extent) that the merge policy in effect would preserve category docids
    if (indexWriter != null)
    {
        Debug.Assert(!(indexWriter.Config.MergePolicy is TieredMergePolicy), "for preserving category docids, merging non-adjacent segments is not allowed");
    }

    // after we opened the writer, and the index is locked, it's safe to check
    // the commit data and read the index epoch
    openMode = config.OpenMode.HasValue ? config.OpenMode.Value : OpenMode.CREATE_OR_APPEND;
    if (!DirectoryReader.IndexExists(directory))
    {
        indexEpoch = 1;
    }
    else
    {
        string epochStr = null;
        IDictionary<string, string> commitData = ReadCommitData(directory);
        if (commitData != null && commitData.ContainsKey(INDEX_EPOCH))
        {
            epochStr = commitData[INDEX_EPOCH];
        }
        // no commit data, or no epoch in it, means an old taxonomy, so set its
        // epoch to 1, for lack of a better value.
        indexEpoch = epochStr == null ? 1 : Convert.ToInt64(epochStr, 16);
    }

    if (openMode == OpenMode.CREATE)
    {
        ++indexEpoch;
    }

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.OmitNorms = true;
    parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
    fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);

    if (indexWriter == null)
    {
        return;
    }

    nextID = indexWriter.MaxDoc;

    if (cache == null)
    {
        cache = DefaultTaxonomyWriterCache();
    }
    this.cache = cache;

    if (nextID == 0)
    {
        cacheIsComplete = true;
        // Make sure that the taxonomy always contains the root category
        // with category id 0.
        AddCategory(new FacetLabel());
    }
    else
    {
        // There are some categories on the disk, which we have not yet
        // read into the cache, and therefore the cache is incomplete.
        // We choose not to read all the categories into the cache now,
        // to avoid terrible performance when a taxonomy index is opened
        // to add just a single category. We will do it later, after we
        // notice a few cache misses.
        cacheIsComplete = false;
    }
}
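// A hedged usage sketch for the constructor above. The index path and the
// LRU cache capacity are illustrative assumptions; CREATE_OR_APPEND gives the
// documented append-if-exists-otherwise-create behavior.
public static DirectoryTaxonomyWriter OpenTaxonomyWriter(string taxoPath)
{
    Directory taxoDir = FSDirectory.Open(taxoPath); // the taxonomy is written directly here
    return new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE_OR_APPEND,
        new LruTaxonomyWriterCache(4096)); // pass null instead to get DefaultTaxonomyWriterCache()
}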
/// <summary>
/// Opens a <seealso cref="DirectoryTaxonomyReader"/> over the given
/// <seealso cref="DirectoryTaxonomyWriter"/> (for NRT).
/// </summary>
/// <param name="taxoWriter">
///          The <seealso cref="DirectoryTaxonomyWriter"/> from which to obtain newly
///          added categories, in real-time. </param>
public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter)
{
    this.taxoWriter = taxoWriter;
    taxoEpoch = taxoWriter.TaxonomyEpoch;
    indexReader = OpenIndexReader(taxoWriter.InternalIndexWriter);

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
}
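// A minimal NRT refresh sketch built on the constructor above; the helper
// name is an assumption. OpenIfChanged returns null when the writer has no
// new categories, in which case the existing reader remains valid.
public static DirectoryTaxonomyReader Refresh(DirectoryTaxonomyReader current)
{
    DirectoryTaxonomyReader newest = TaxonomyReader.OpenIfChanged(current);
    if (newest == null)
    {
        return current; // nothing changed; keep using the old snapshot
    }
    current.Dispose(); // release the stale snapshot
    return newest;
}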
/// <summary>
/// Look up the given category in the cache and/or the on-disk storage,
/// returning the category's ordinal, or a negative number in case the
/// category does not yet exist in the taxonomy.
/// </summary>
protected virtual int FindCategory(FacetLabel categoryPath)
{
    lock (this)
    {
        // If we can find the category in the cache, or we know the cache is
        // complete, we can return the response directly from it
        int res = cache.Get(categoryPath);
        if (res >= 0 || cacheIsComplete)
        {
            return res;
        }

        cacheMisses.IncrementAndGet();
        // After a few cache misses, it makes sense to read all the categories
        // from disk and into the cache. The reason not to do this on the first
        // cache miss (or even when opening the writer) is that it will
        // significantly slow down the case when a taxonomy is opened just to
        // add one category. The idea of only spending a long time on reading
        // after enough time was spent on cache misses is known as an "online
        // algorithm".
        PerhapsFillCache();
        res = cache.Get(categoryPath);
        if (res >= 0 || cacheIsComplete)
        {
            // if, after filling the cache from the info on disk, the category is in it,
            // or the cache is complete, return whatever cache.Get returned.
            return res;
        }

        // if we get here, the category is not in the cache and the cache is not
        // complete, so we must look for the category on disk.

        // We need to get an answer from the on-disk index.
        InitReaderManager();

        int doc = -1;
        DirectoryReader reader = readerManager.Acquire();
        try
        {
            BytesRef catTerm = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
            TermsEnum termsEnum = null; // reuse
            DocsEnum docs = null; // reuse
            foreach (AtomicReaderContext ctx in reader.Leaves)
            {
                Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
                if (terms != null)
                {
                    termsEnum = terms.Iterator(termsEnum);
                    if (termsEnum.SeekExact(catTerm))
                    {
                        // liveDocs=null because the taxonomy has no deletes
                        docs = termsEnum.Docs(null, docs, 0); // freqs not required
                        // if the term was found, we know it has exactly one document.
                        doc = docs.NextDoc() + ctx.DocBase;
                        break;
                    }
                }
            }
        }
        finally
        {
            readerManager.Release(reader);
        }
        if (doc > 0)
        {
            AddToCache(categoryPath, doc);
        }
        return doc;
    }
}
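// A sketch of the miss-count gate behind PerhapsFillCache(), to make the
// "online algorithm" comment above concrete. The field names and the
// threshold value are illustrative assumptions, not the shipped
// implementation.
private int missCountSketch;
private const int MissesUntilFillSketch = 11;

private void PerhapsFillCacheSketch()
{
    if (++missCountSketch < MissesUntilFillSketch)
    {
        return; // too few misses so far; a full read would not pay for itself yet
    }
    // otherwise read every category from the on-disk index into the cache,
    // then record that lookups no longer need to touch the index:
    cacheIsComplete = true;
}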
/// <summary>
/// Open for reading a taxonomy stored in a given <seealso cref="Directory"/>.
/// </summary>
/// <param name="directory">
///          The <seealso cref="Directory"/> in which the taxonomy resides. </param>
/// <exception cref="CorruptIndexException">
///           if the taxonomy is corrupted. </exception>
/// <exception cref="IOException">
///           if another error occurred. </exception>
public DirectoryTaxonomyReader(Directory directory)
{
    indexReader = OpenIndexReader(directory);
    taxoWriter = null;
    taxoEpoch = -1;

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
}
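// A hedged usage sketch for the read-only constructor above; the path and
// the category components are illustrative assumptions.
public static int LookupOrdinal(string taxoPath)
{
    Directory taxoDir = FSDirectory.Open(taxoPath);
    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    try
    {
        // returns TaxonomyReader.INVALID_ORDINAL if the category is not in the taxonomy
        return taxoReader.GetOrdinal(new FacetLabel("Author", "Mark Twain"));
    }
    finally
    {
        taxoReader.Dispose();
        taxoDir.Dispose();
    }
}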
public AssertingDirectoryReader(DirectoryReader @in) : base(@in, new AssertingSubReaderWrapper()) { }