/// <summary>
/// Takes the categories from the given taxonomy directory, and adds the
/// missing ones to this taxonomy. Additionally, it fills the given
/// <see cref="IOrdinalMap"/> with a mapping from the original ordinal to the new
/// ordinal.
/// </summary>
/// <param name="taxoDir">Directory holding the taxonomy index whose categories are read.</param>
/// <param name="map">
/// Sized with the source reader's document count, then given one mapping per
/// category term that is read, and finally told that adding is done.
/// </param>
public virtual void AddTaxonomy(Directory taxoDir, IOrdinalMap map)
{
    EnsureOpen();
    using (DirectoryReader r = DirectoryReader.Open(taxoDir))
    {
        map.SetSize(r.NumDocs);

        // Sum of MaxDoc over the segments already visited; turns a
        // segment-local doc id into an ordinal of the source taxonomy.
        int @base = 0;
        TermsEnum te = null;
        DocsEnum docs = null;
        foreach (AtomicReaderContext ctx in r.Leaves)
        {
            AtomicReader ar = ctx.AtomicReader;
            Terms terms = ar.GetTerms(Consts.FULL);
            if (terms != null) // cannot really happen, but be on the safe side
            {
                te = terms.GetIterator(te);
                while (te.Next() != null)
                {
                    FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(te.Term.Utf8ToString()));
                    int ordinal = AddCategory(cp);
                    docs = te.Docs(null, docs, DocsFlags.NONE);
                    map.AddMapping(docs.NextDoc() + @base, ordinal);
                }
            }

            // A segment without terms is skipped above, but its documents
            // must still advance the offset for the segments that follow.
            @base += ar.MaxDoc; // no deletions, so we're ok
        }
        map.AddDone();
    }
}
/// <summary>
/// Looks up the category path for <paramref name="ordinal"/>, consulting the
/// category cache before reading the stored document.
/// </summary>
/// <returns>The path, or <c>null</c> if the ordinal is outside this reader's range.</returns>
public override FacetLabel GetPath(int ordinal)
{
    EnsureOpen();

    // The cache is shared with DTR instances allocated from doOpenIfChanged,
    // so an ordinal that this instance does not recognize must be rejected
    // before the cache is consulted.
    bool inRange = ordinal >= 0 && ordinal < indexReader.MaxDoc;
    if (!inRange)
    {
        return null;
    }

    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
    // wrapped as LRU?
    // LUCENENET NOTE: We don't need to convert ordinal from int to int here as was done in Java.
    // LUCENENET: No lock is taken around the cache because the underlying cache
    // is thread-safe, and removing the lock seems to make the performance better.
    if (!categoryCache.TryGetValue(ordinal, out FacetLabel label))
    {
        string fullPath = indexReader.Document(ordinal).Get(Consts.FULL);
        label = new FacetLabel(FacetsConfig.StringToPath(fullPath));
        categoryCache.Put(ordinal, label);
    }

    return label;
}
/// <summary>
/// Builds the <see cref="FacetResult"/> for one dimension from the per-ordinal
/// entries of <c>counts</c>.
/// </summary>
/// <param name="dim">Dimension name reported in the result.</param>
/// <param name="ordRange">Ordinal range to scan; both ends are inclusive.</param>
/// <param name="topN">Size passed to the queue that collects the largest counts.</param>
/// <returns>
/// The result for the dimension, or <c>null</c> when no ordinal in the range
/// has a positive count large enough to create the queue.
/// </returns>
private FacetResult GetDim(string dim, OrdRange ordRange, int topN)
{
    TopOrdAndIntQueue q = null;
    int bottomCount = 0;
    int dimCount = 0;
    int childCount = 0;
    TopOrdAndIntQueue.OrdAndValue reuse = null;

    for (int ord = ordRange.Start; ord <= ordRange.End; ord++)
    {
        int count = counts[ord];
        if (count <= 0)
        {
            continue;
        }

        dimCount += count;
        childCount++;

        // Not above the current threshold: it cannot enter the queue.
        if (count <= bottomCount)
        {
            continue;
        }

        if (reuse == null)
        {
            reuse = new TopOrdAndIntQueue.OrdAndValue();
        }
        reuse.Ord = ord;
        reuse.Value = count;

        if (q == null)
        {
            // Lazy init, so we don't create this for the
            // sparse case unnecessarily
            q = new TopOrdAndIntQueue(topN);
        }

        // Whatever the queue hands back is recycled as the next scratch entry.
        reuse = q.InsertWithOverflow(reuse);
        if (q.Size() == topN)
        {
            bottomCount = q.Top().Value;
        }
    }

    if (q == null)
    {
        return null;
    }

    // One scratch buffer for all lookups: each term is converted to a string
    // right after LookupOrd fills it, so nothing retains the previous contents.
    var scratch = new BytesRef();
    LabelAndValue[] labelValues = new LabelAndValue[q.Size()];

    // Filled back-to-front as entries are popped.
    // NOTE(review): presumably the queue pops its smallest entry first, which
    // would leave the array ordered largest-first - confirm against TopOrdAndIntQueue.
    for (int i = labelValues.Length - 1; i >= 0; i--)
    {
        TopOrdAndIntQueue.OrdAndValue ordAndValue = q.Pop();
        dv.LookupOrd(ordAndValue.Ord, scratch);
        string[] parts = FacetsConfig.StringToPath(scratch.Utf8ToString());
        labelValues[i] = new LabelAndValue(parts[1], ordAndValue.Value);
    }

    return new FacetResult(dim, new string[0], dimCount, labelValues, childCount);
}
/// <summary>
/// Creates this, pulling doc values from the specified field, and records
/// the ordinal range covered by each dimension.
/// </summary>
public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
{
    this.field = field;
    this.origReader = reader;

    // We need this to create thread-safe MultiSortedSetDV
    // per collector:
    topReader = SlowCompositeReaderWrapper.Wrap(reader);

    SortedSetDocValues dv = topReader.GetSortedSetDocValues(field);
    if (dv is null)
    {
        throw new ArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
    }
    if (dv.ValueCount > int.MaxValue)
    {
        throw new ArgumentException("can only handle valueCount < System.Int32.MaxValue; got " + dv.ValueCount);
    }
    valueCount = (int)dv.ValueCount;

    // TODO: we can make this more efficient if eg we can be
    // "involved" when IOrdinalMap is being created?  Ie see
    // each term/ord it's assigning as it goes...
    string currentDim = null;
    int rangeStart = -1;
    BytesRef scratch = new BytesRef();

    // TODO: this approach can work for full hierarchy?;
    // TaxoReader can't do this since ords are not in
    // "sorted order" ... but we should generalize this to
    // support arbitrary hierarchy:
    for (int ord = 0; ord < valueCount; ord++)
    {
        dv.LookupOrd(ord, scratch);
        string[] components = FacetsConfig.StringToPath(scratch.Utf8ToString());
        if (components.Length != 2)
        {
            throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + scratch.Utf8ToString());
        }

        string dimOfOrd = components[0];
        if (dimOfOrd.Equals(currentDim, StringComparison.Ordinal))
        {
            // Still inside the run of the current dimension.
            continue;
        }

        // A new dimension starts here; close the range of the previous one.
        if (currentDim != null)
        {
            prefixToOrdRange[currentDim] = new OrdRange(rangeStart, ord - 1);
        }
        rangeStart = ord;
        currentDim = dimOfOrd;
    }

    // Close the range of the final dimension, if any term was seen at all.
    if (currentDim != null)
    {
        prefixToOrdRange[currentDim] = new OrdRange(rangeStart, valueCount - 1);
    }
}
/// <summary>
/// Assembles the <see cref="FacetResult"/> of one dimension by scanning the
/// entries of <c>counts</c> that fall inside <paramref name="ordRange"/>.
/// Returns <c>null</c> when the queue was never created.
/// </summary>
private FacetResult GetDim(string dim, OrdRange ordRange, int topN)
{
    TopOrdAndInt32Queue queue = null;
    int threshold = 0;
    int total = 0;
    int children = 0;

    //System.out.println("getDim : " + ordRange.start + " - " + ordRange.end);
    for (int ord = ordRange.Start; ord <= ordRange.End; ord++)
    {
        //System.out.println("  ord=" + ord + " count=" + counts[ord]);
        int count = counts[ord];
        if (count <= 0)
        {
            continue;
        }

        total += count;
        children++;

        if (count <= threshold)
        {
            continue;
        }

        if (queue == null)
        {
            // Lazy init, so we don't create this for the
            // sparse case unnecessarily
            queue = new TopOrdAndInt32Queue(topN);
        }

        // LUCENENET specific - use struct instead of reusing class instance for better performance
        queue.Insert(new OrdAndValue<int>(ord, count));
        if (queue.Count == topN)
        {
            threshold = queue.Top.Value;
        }
    }

    if (queue == null)
    {
        return null;
    }

    var scratch = new BytesRef();
    LabelAndValue[] labelValues = new LabelAndValue[queue.Count];

    // Fill from the last slot down to the first, one popped entry per slot.
    int slot = labelValues.Length;
    while (--slot >= 0)
    {
        var entry = queue.Pop();
        dv.LookupOrd(entry.Ord, scratch);
        string[] parts = FacetsConfig.StringToPath(scratch.Utf8ToString());
        labelValues[slot] = new LabelAndValue(parts[1], entry.Value);
    }

    return new FacetResult(dim, Arrays.Empty<string>(), total, labelValues, children);
}
/// <summary>
/// Resolves <paramref name="ordinal"/> to its category path. The cache is read
/// under the read lock; on a miss the path is loaded from the stored document
/// and written back under the write lock.
/// </summary>
/// <returns>The path, or <c>null</c> if the ordinal is outside this reader's range.</returns>
public override FacetLabel GetPath(int ordinal)
{
    EnsureOpen();

    // The cache is shared with DTR instances allocated from doOpenIfChanged,
    // so make sure the ordinal is one this instance recognizes before the
    // cache is touched.
    if (!(ordinal >= 0 && ordinal < indexReader.MaxDoc))
    {
        return null;
    }

    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
    // wrapped as LRU?
    // LUCENENET NOTE: We don't need to convert ordinal from int to int here as was done in Java.
    // LUCENENET: Despite LRUHashMap being thread-safe, we get much better performance
    // if reads are separated from writes.
    FacetLabel cached;
    bool hit;
    categoryCacheLock.EnterReadLock();
    try
    {
        hit = categoryCache.TryGetValue(ordinal, out cached);
    }
    finally
    {
        categoryCacheLock.ExitReadLock();
    }

    if (hit)
    {
        return cached;
    }

    // Cache miss: load outside of any lock, then publish the result.
    string fullPath = indexReader.Document(ordinal).Get(Consts.FULL);
    var loaded = new FacetLabel(FacetsConfig.StringToPath(fullPath));

    categoryCacheLock.EnterWriteLock();
    try
    {
        categoryCache[ordinal] = loaded;
    }
    finally
    {
        categoryCacheLock.ExitWriteLock();
    }

    return loaded;
}
/// <summary>
/// Returns the category path stored for <paramref name="ordinal"/>. The cache
/// is consulted first; on a miss the path is read from the stored document
/// and put into the cache. Both cache accesses are guarded by a lock on the
/// cache object, while the document read happens outside of it.
/// </summary>
/// <param name="ordinal">Category ordinal to resolve.</param>
/// <returns>The path, or <c>null</c> if the ordinal is outside this reader's range.</returns>
public override FacetLabel GetPath(int ordinal)
{
    EnsureOpen();

    // Since the cache is shared with DTR instances allocated from
    // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
    // instance recognizes. Therefore we do this check up front, before we hit
    // the cache.
    if (ordinal < 0 || ordinal >= indexReader.MaxDoc)
    {
        return null;
    }

    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
    // wrapped as LRU?
    // The ordinal is already an int, so it is used as the cache key directly;
    // no conversion step is needed.
    lock (categoryCache)
    {
        // NOTE(review): the meaning of the 'false' argument is defined by the
        // cache type, which is not visible here - confirm before changing it.
        var cached = categoryCache.Get(ordinal, false);
        if (cached != null)
        {
            return cached;
        }
    }

    Document doc = indexReader.Document(ordinal);
    FacetLabel ret = new FacetLabel(FacetsConfig.StringToPath(doc.Get(Consts.FULL)));

    lock (categoryCache)
    {
        categoryCache.Put(ordinal, ret);
    }

    return ret;
}
/// <summary>
/// Runs several indexing threads against a shared index writer and taxonomy
/// writer, then verifies that every recorded category is present in the
/// taxonomy and that each path component has the expected parent ordinal.
/// </summary>
public virtual void TestConcurrency()
{
    AtomicInt32 numDocs = new AtomicInt32(AtLeast(10000));
    Directory indexDir = NewDirectory();
    Directory taxoDir = NewDirectory();
    var values = new ConcurrentDictionary<string, string>();
    var iw = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
    var tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, NewTaxoWriterCache(numDocs));
    var indexThreads = new ThreadJob[AtLeast(4)];

    var config = new FacetsConfig();
    for (int dimIndex = 0; dimIndex < 10; dimIndex++)
    {
        string dimName = "l1." + dimIndex;
        config.SetHierarchical(dimName, true);
        config.SetMultiValued(dimName, true);
    }

    // Create every worker first, then start them all, then wait for them all.
    for (int slot = 0; slot < indexThreads.Length; slot++)
    {
        indexThreads[slot] = new ThreadAnonymousInnerClassHelper(this, numDocs, values, iw, tw, config);
    }
    foreach (ThreadJob worker in indexThreads)
    {
        worker.Start();
    }
    foreach (ThreadJob worker in indexThreads)
    {
        worker.Join();
    }

    var tr = new DirectoryTaxonomyReader(tw);

    // +1 for root category
    if (values.Count + 1 != tr.Count)
    {
        // Report each recorded path the reader does not know before failing.
        foreach (string value in values.Keys)
        {
            var label = new FacetLabel(FacetsConfig.StringToPath(value));
            if (tr.GetOrdinal(label) == -1)
            {
                Console.WriteLine("FAIL: path=" + label + " not recognized");
            }
        }
        fail("mismatch number of categories");
    }

    int[] parents = tr.ParallelTaxonomyArrays.Parents;
    foreach (string cat in values.Keys)
    {
        var cp = new FacetLabel(FacetsConfig.StringToPath(cat));
        Assert.IsTrue(tr.GetOrdinal(cp) > 0, "category not found " + cp);

        int level = cp.Length;
        int parentOrd = 0; // for root, parent is always virtual ROOT (ord=0)
        for (int depth = 1; depth <= level; depth++)
        {
            FacetLabel path = cp.Subpath(depth);
            int ord = tr.GetOrdinal(path);
            Assert.AreEqual(parentOrd, parents[ord], "invalid parent for cp=" + path);
            parentOrd = ord; // next level should have this parent
        }
    }

    IOUtils.Dispose(tw, iw, tr, taxoDir, indexDir);
}
/// <summary>
/// Adds many categories to a taxonomy writer from several threads, using a
/// randomly chosen writer cache, then reopens the taxonomy and verifies that
/// every recorded category exists and has the expected parent chain.
/// </summary>
public virtual void TestConcurrency()
{
    int ncats = AtLeast(100000); // add many categories
    int range = ncats * 3; // affects the categories selection
    var numCats = new AtomicInteger(ncats);
    Directory dir = NewDirectory();
    var values = new ConcurrentDictionary<string, string>();

    double d = Random().NextDouble();
    bool pickNoOpCache = TEST_NIGHTLY && d > 0.98;
    TaxonomyWriterCache cache;
    if (d < 0.7)
    {
        // this is the fastest, yet most memory consuming
        cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
    }
    else if (pickNoOpCache)
    {
        // this is the slowest, but tests the writer concurrency when no caching is done.
        // only pick it during NIGHTLY tests, and even then, with very low chances.
        cache = NO_OP_CACHE;
    }
    else
    {
        // this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
        cache = new LruTaxonomyWriterCache(ncats / 10);
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: use cache=" + cache);
    }

    var tw = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);
    var addThreads = new ThreadClass[AtLeast(4)];

    // Create every worker first, then start them all, then wait for them all.
    for (int slot = 0; slot < addThreads.Length; slot++)
    {
        addThreads[slot] = new ThreadAnonymousInnerClassHelper(this, range, numCats, values, tw);
    }
    foreach (var worker in addThreads)
    {
        worker.Start();
    }
    foreach (var worker in addThreads)
    {
        worker.Join();
    }
    tw.Dispose();

    var dtr = new DirectoryTaxonomyReader(dir);

    // +1 for root category
    if (values.Count + 1 != dtr.Size)
    {
        // Report each recorded path the reader does not know before failing.
        foreach (string value in values.Keys)
        {
            var label = new FacetLabel(FacetsConfig.StringToPath(value));
            if (dtr.GetOrdinal(label) == -1)
            {
                Console.WriteLine("FAIL: path=" + label + " not recognized");
            }
        }
        Fail("mismatch number of categories");
    }

    int[] parents = dtr.ParallelTaxonomyArrays.Parents();
    foreach (string cat in values.Keys)
    {
        var cp = new FacetLabel(FacetsConfig.StringToPath(cat));
        Assert.True(dtr.GetOrdinal(cp) > 0, "category not found " + cp);

        int level = cp.Length;
        int parentOrd = 0; // for root, parent is always virtual ROOT (ord=0)
        FacetLabel path = new FacetLabel();
        for (int depth = 1; depth <= level; depth++)
        {
            path = cp.Subpath(depth);
            int ord = dtr.GetOrdinal(path);
            Assert.AreEqual(parentOrd, parents[ord], "invalid parent for cp=" + path);
            parentOrd = ord; // next level should have this parent
        }
    }

    IOUtils.Close(dtr, dir);
}
/// <summary>
/// Fills the category cache from the index once enough cache misses have
/// accumulated.
/// <para/>
/// We need to guarantee that if several threads call this concurrently, only
/// one executes it, and after it returns, the cache is updated and is either
/// complete or not. The whole method therefore runs while holding the lock on
/// this instance.
/// </summary>
private void PerhapsFillCache()
{
    // NOTE(review): the lock object is deliberately left as 'this'; the release
    // step below is meant to work with InitReaderManager being called in
    // parallel, which presumably synchronizes on the same monitor - confirm
    // before switching to a private lock object.
    lock (this)
    {
        if (cacheMisses < cacheMissesUntilFill)
        {
            return;
        }

        if (!shouldFillCache)
        {
            // we already filled the cache once, there's no need to re-fill it
            return;
        }
        shouldFillCache = false;

        InitReaderManager();

        bool aborted = false;
        DirectoryReader reader = readerManager.Acquire();
        try
        {
            TermsEnum termsEnum = null;
            DocsEnum docsEnum = null;
            foreach (AtomicReaderContext ctx in reader.Leaves)
            {
                Terms terms = ctx.AtomicReader.GetTerms(Consts.FULL);
                if (terms == null) // cannot really happen, but be on the safe side
                {
                    continue;
                }

                termsEnum = terms.GetIterator(termsEnum);
                while (termsEnum.Next() != null)
                {
                    if (cache.IsFull)
                    {
                        // the cache is full and the next put() will evict entries from it,
                        // therefore abort the iteration.
                        aborted = true;
                        break;
                    }

                    BytesRef t = termsEnum.Term;
                    // Since we guarantee uniqueness of categories, each term has exactly
                    // one document. Also, since we do not allow removing categories (and
                    // hence documents), there are no deletions in the index. Therefore, it
                    // is sufficient to call next(), and then doc(), exactly once with no
                    // 'validation' checks.
                    FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
                    docsEnum = termsEnum.Docs(null, docsEnum, DocsFlags.NONE);

                    // Kept as a separate statement so the Put still runs when
                    // Debug.Assert is compiled out.
                    bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
                    Debug.Assert(!res, "entries should not have been evicted from the cache");
                }

                if (aborted)
                {
                    break;
                }
            }
        }
        finally
        {
            readerManager.Release(reader);
        }

        cacheIsComplete = !aborted;
        if (cacheIsComplete)
        {
            // everything is in the cache, so no need to keep readerManager open.
            // The lock on this instance is already held here, so no second
            // lock statement is needed around these assignments.
            readerManager.Dispose();
            readerManager = null;
            initializedReaderManager = false;
        }
    }
}