예제 #1
0
        /// <summary>
        /// Takes the categories from the given taxonomy directory, and adds the
        /// missing ones to this taxonomy. Additionally, it fills the given
        /// <see cref="IOrdinalMap"/> with a mapping from the original ordinal to the new
        /// ordinal.
        /// </summary>
        /// <param name="taxoDir">Directory holding the taxonomy index whose categories are read.</param>
        /// <param name="map">
        /// Sized to the source reader's document count, then given one
        /// (source ordinal, ordinal in this taxonomy) pair per category term, and
        /// finally notified through <c>AddDone</c>.
        /// </param>
        public virtual void AddTaxonomy(Directory taxoDir, IOrdinalMap map)
        {
            EnsureOpen();

            // The using statement disposes the reader on every exit path,
            // including when an exception escapes the loop below.
            using (DirectoryReader r = DirectoryReader.Open(taxoDir))
            {
                map.SetSize(r.NumDocs);

                int @base = 0;        // doc id offset of the current segment within the source reader
                TermsEnum te = null;  // reused across segments
                DocsEnum docs = null; // reused across terms
                foreach (AtomicReaderContext ctx in r.Leaves)
                {
                    AtomicReader ar = ctx.AtomicReader;
                    Terms terms = ar.GetTerms(Consts.FULL);

                    // Guard against a segment that has no terms for the field, so it
                    // contributes no mappings instead of failing with a null dereference.
                    if (terms != null)
                    {
                        te = terms.GetIterator(te);
                        while (te.Next() != null)
                        {
                            FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(te.Term.Utf8ToString()));
                            int ordinal = AddCategory(cp);
                            docs = te.Docs(null, docs, DocsFlags.NONE);

                            // The first doc of the term, shifted by the segment offset,
                            // is used as the category's ordinal in the source taxonomy.
                            map.AddMapping(docs.NextDoc() + @base, ordinal);
                        }
                    }

                    @base += ar.MaxDoc; // no deletions, so we're ok
                }

                map.AddDone();
            }
        }
        /// <summary>
        /// Returns the category path stored for <paramref name="ordinal"/>, or
        /// <c>null</c> when the ordinal is negative or not below this reader's
        /// <c>MaxDoc</c>.
        /// </summary>
        public override FacetLabel GetPath(int ordinal)
        {
            EnsureOpen();

            // The cache is shared with reader instances allocated from
            // doOpenIfChanged, so the range check must come before any cache
            // lookup: an ordinal cached by another instance may be one this
            // instance does not recognize.
            bool recognized = ordinal >= 0 && ordinal < indexReader.MaxDoc;
            if (!recognized)
            {
                return null;
            }

            // TODO: can we use an int-based hash impl, such as IntToObjectMap,
            // wrapped as LRU?

            // LUCENENET NOTE: No int-to-int conversion of the ordinal is needed here, unlike in Java.
            // LUCENENET: No lock is taken around the cache because the underlying cache is
            // thread-safe, and going lock-free seems to make the performance better.
            if (!categoryCache.TryGetValue(ordinal, out FacetLabel label))
            {
                // Cache miss: rebuild the label from the stored document and remember it.
                Document stored = indexReader.Document(ordinal);
                string fullPath = stored.Get(Consts.FULL);
                label = new FacetLabel(FacetsConfig.StringToPath(fullPath));
                categoryCache.Put(ordinal, label);
            }

            return label;
        }
예제 #3
0
        /// <summary>
        /// Builds the facet result for one dimension by scanning the counts of every
        /// ordinal in <paramref name="ordRange"/> (both ends inclusive) and keeping
        /// at most <paramref name="topN"/> entries in a bounded queue.
        /// </summary>
        /// <param name="dim">Dimension name passed through to the returned <see cref="FacetResult"/>.</param>
        /// <param name="ordRange">Range of ordinals belonging to the dimension.</param>
        /// <param name="topN">Capacity of the queue used to select the labels.</param>
        /// <returns>
        /// The result for the dimension, or <c>null</c> when no ordinal in the range
        /// has a positive count.
        /// </returns>
        private FacetResult GetDim(string dim, OrdRange ordRange, int topN)
        {
            TopOrdAndIntQueue q = null;

            // Once the queue holds topN entries, only counts above this value can enter it.
            int bottomCount = 0;

            int dimCount = 0;   // sum of all positive counts in the range
            int childCount = 0; // number of ordinals with a positive count

            TopOrdAndIntQueue.OrdAndValue reuse = null;
            for (int ord = ordRange.Start; ord <= ordRange.End; ord++)
            {
                if (counts[ord] > 0)
                {
                    dimCount += counts[ord];
                    childCount++;
                    if (counts[ord] > bottomCount)
                    {
                        if (reuse == null)
                        {
                            reuse = new TopOrdAndIntQueue.OrdAndValue();
                        }
                        reuse.Ord = ord;
                        reuse.Value = counts[ord];
                        if (q == null)
                        {
                            // Lazy init, so we don't create this for the
                            // sparse case unnecessarily
                            q = new TopOrdAndIntQueue(topN);
                        }

                        // Whatever InsertWithOverflow hands back is recycled as the
                        // next candidate entry (a new one is allocated if it is null).
                        reuse = q.InsertWithOverflow(reuse);
                        if (q.Size() == topN)
                        {
                            bottomCount = q.Top().Value;
                        }
                    }
                }
            }

            if (q == null)
            {
                return(null);
            }

            // One scratch buffer serves every lookup: each term is converted to a
            // string right after LookupOrd fills it, so nothing is carried over
            // between iterations and a per-iteration allocation is unnecessary.
            var scratch = new BytesRef();

            // The array is filled from the last slot to the first, in pop order.
            LabelAndValue[] labelValues = new LabelAndValue[q.Size()];
            for (int i = labelValues.Length - 1; i >= 0; i--)
            {
                TopOrdAndIntQueue.OrdAndValue ordAndValue = q.Pop();
                dv.LookupOrd(ordAndValue.Ord, scratch);
                string[] parts = FacetsConfig.StringToPath(scratch.Utf8ToString());

                // parts[1] is used as the label; assumes every term is a two-component dim/value path.
                labelValues[i] = new LabelAndValue(parts[1], ordAndValue.Value);
            }

            return(new FacetResult(dim, new string[0], dimCount, labelValues, childCount));
        }
예제 #4
0
        /// <summary>
        /// Creates the reader state from the <see cref="SortedSetDocValues"/> of
        /// <paramref name="field"/>, recording for every dimension the inclusive
        /// range of ordinals whose terms start with that dimension.
        /// </summary>
        /// <param name="reader">Reader whose doc values are inspected.</param>
        /// <param name="field">Name of the doc values field to pull from.</param>
        /// <exception cref="ArgumentException">
        /// The field has no sorted-set doc values, has more values than fit in an
        /// <see cref="int"/>, or contains a term that is not a two-component path.
        /// </exception>
        public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
        {
            this.origReader = reader;
            this.field = field;

            // We need this to create thread-safe MultiSortedSetDV
            // per collector:
            topReader = SlowCompositeReaderWrapper.Wrap(reader);

            SortedSetDocValues docValues = topReader.GetSortedSetDocValues(field);
            if (docValues is null)
            {
                throw new ArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
            }
            if (docValues.ValueCount > int.MaxValue)
            {
                throw new ArgumentException("can only handle valueCount < System.Int32.MaxValue; got " + docValues.ValueCount);
            }
            valueCount = (int)docValues.ValueCount;

            // TODO: we can make this more efficient if eg we can be
            // "involved" when IOrdinalMap is being created?  Ie see
            // each term/ord it's assigning as it goes...

            // TODO: this approach can work for full hierarchy?;
            // TaxoReader can't do this since ords are not in
            // "sorted order" ... but we should generalize this to
            // support arbitrary hierarchy:
            BytesRef termBytes = new BytesRef();
            string currentDim = null; // dimension of the run being scanned
            int runStart = -1;        // first ordinal of that run
            for (int ord = 0; ord < valueCount; ord++)
            {
                docValues.LookupOrd(ord, termBytes);
                string[] components = FacetsConfig.StringToPath(termBytes.Utf8ToString());
                if (components.Length != 2)
                {
                    throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + termBytes.Utf8ToString());
                }

                string dimension = components[0];
                if (dimension.Equals(currentDim, StringComparison.Ordinal))
                {
                    // Still inside the same dimension's run.
                    continue;
                }

                // A new dimension begins here: close the previous run, if there was one.
                if (currentDim != null)
                {
                    prefixToOrdRange[currentDim] = new OrdRange(runStart, ord - 1);
                }
                runStart = ord;
                currentDim = dimension;
            }

            // Close the final run, which extends to the last ordinal.
            if (currentDim != null)
            {
                prefixToOrdRange[currentDim] = new OrdRange(runStart, valueCount - 1);
            }
        }
예제 #5
0
        /// <summary>
        /// Computes the facet result for <paramref name="dim"/> from the counts of the
        /// ordinals in <paramref name="ordRange"/> (both ends inclusive), keeping at
        /// most <paramref name="topN"/> entries in a bounded queue.
        /// </summary>
        /// <returns>
        /// The result for the dimension, or <c>null</c> when no ordinal in the range
        /// has a positive count.
        /// </returns>
        private FacetResult GetDim(string dim, OrdRange ordRange, int topN)
        {
            TopOrdAndInt32Queue queue = null;
            int threshold = 0;  // a count must exceed this to be offered to the queue
            int total = 0;      // sum of positive counts in the range
            int nonZero = 0;    // number of ordinals with a positive count

            for (int ord = ordRange.Start; ord <= ordRange.End; ord++)
            {
                int count = counts[ord];
                if (count <= 0)
                {
                    continue;
                }

                total += count;
                nonZero++;

                if (count <= threshold)
                {
                    continue;
                }

                // Lazy init, so the queue is not created needlessly for the sparse case.
                if (queue == null)
                {
                    queue = new TopOrdAndInt32Queue(topN);
                }

                // LUCENENET specific - use struct instead of reusing class instance for better performance
                queue.Insert(new OrdAndValue <int>(ord, count));
                if (queue.Count == topN)
                {
                    threshold = queue.Top.Value;
                }
            }

            if (queue == null)
            {
                return null;
            }

            // One scratch buffer is reused for every ordinal lookup.
            var termBytes = new BytesRef();
            var labelValues = new LabelAndValue[queue.Count];

            // Fill the array from the last slot to the first, in pop order.
            int slot = labelValues.Length;
            while (--slot >= 0)
            {
                var entry = queue.Pop();
                dv.LookupOrd(entry.Ord, termBytes);
                string[] components = FacetsConfig.StringToPath(termBytes.Utf8ToString());
                labelValues[slot] = new LabelAndValue(components[1], entry.Value);
            }

            return new FacetResult(dim, Arrays.Empty <string>(), total, labelValues, nonZero);
        }
예제 #6
0
        /// <summary>
        /// Returns the category path stored for <paramref name="ordinal"/>, or
        /// <c>null</c> when the ordinal is negative or not below this reader's
        /// <c>MaxDoc</c>. Cache reads and writes are guarded by the read and write
        /// sides of <c>categoryCacheLock</c> respectively.
        /// </summary>
        public override FacetLabel GetPath(int ordinal)
        {
            EnsureOpen();

            // The cache is shared with reader instances allocated from
            // doOpenIfChanged, so the range check must come before any cache
            // lookup: an ordinal cached by another instance may be one this
            // instance does not recognize.
            bool recognized = ordinal >= 0 && ordinal < indexReader.MaxDoc;
            if (!recognized)
            {
                return null;
            }

            // TODO: can we use an int-based hash impl, such as IntToObjectMap,
            // wrapped as LRU?

            // LUCENENET NOTE: No int-to-int conversion of the ordinal is needed here, unlike in Java.
            // LUCENENET: Despite LRUHashMap being thread-safe, we get much better performance
            // if reads are separated from writes.
            FacetLabel cached;
            bool found;
            categoryCacheLock.EnterReadLock();
            try
            {
                found = categoryCache.TryGetValue(ordinal, out cached);
            }
            finally
            {
                categoryCacheLock.ExitReadLock();
            }

            if (found)
            {
                return cached;
            }

            // Cache miss: load the stored document outside of either lock.
            Document stored = indexReader.Document(ordinal);
            string fullPath = stored.Get(Consts.FULL);
            var label = new FacetLabel(FacetsConfig.StringToPath(fullPath));

            categoryCacheLock.EnterWriteLock();
            try
            {
                categoryCache[ordinal] = label;
            }
            finally
            {
                categoryCacheLock.ExitWriteLock();
            }

            return label;
        }
예제 #7
0
        /// <summary>
        /// Returns the category path stored for <paramref name="ordinal"/>, or
        /// <c>null</c> when the ordinal is negative or not below this reader's
        /// <c>MaxDoc</c>. Every cache access is done while holding the monitor of
        /// <c>categoryCache</c>.
        /// </summary>
        /// <param name="ordinal">Ordinal of the category to resolve.</param>
        /// <returns>The category path, or <c>null</c> for an out-of-range ordinal.</returns>
        public override FacetLabel GetPath(int ordinal)
        {
            EnsureOpen();

            // Since the cache is shared with DTR instances allocated from
            // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
            // instance recognizes. Therefore we do this check up front, before we hit
            // the cache.
            if (ordinal < 0 || ordinal >= indexReader.MaxDoc)
            {
                return(null);
            }

            // TODO: can we use an int-based hash impl, such as IntToObjectMap,
            // wrapped as LRU?

            // The ordinal is already an int, so it is used as the cache key directly;
            // the Java original had to box it into an Integer first.
            lock (categoryCache)
            {
                // NOTE(review): the meaning of the second argument is not visible
                // from here; it is passed through unchanged.
                var res = categoryCache.Get(ordinal, false);
                if (res != null)
                {
                    return(res);
                }
            }

            // Cache miss: the document is loaded outside the lock so other callers
            // are not blocked during the read.
            Document   doc = indexReader.Document(ordinal);
            FacetLabel ret = new FacetLabel(FacetsConfig.StringToPath(doc.Get(Consts.FULL)));

            lock (categoryCache)
            {
                categoryCache.Put(ordinal, ret);
            }

            return(ret);
        }
예제 #8
0
        /// <summary>
        /// Runs several indexing threads against a shared index writer and taxonomy
        /// writer, then checks that every recorded category is present in the
        /// taxonomy and that each path component has the expected parent ordinal.
        /// </summary>
        public virtual void TestConcurrency()
        {
            var numDocs = new AtomicInt32(AtLeast(10000));
            Directory indexDir = NewDirectory();
            Directory taxoDir = NewDirectory();
            var values = new ConcurrentDictionary <string, string>();
            var iw = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
            var tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, NewTaxoWriterCache(numDocs));

            var indexThreads = new ThreadJob[AtLeast(4)];
            var config = new FacetsConfig();

            // Dimensions l1.0 .. l1.9 are all hierarchical and multi-valued.
            for (int i = 0; i < 10; i++)
            {
                string dimension = "l1." + i;
                config.SetHierarchical(dimension, true);
                config.SetMultiValued(dimension, true);
            }

            for (int i = 0; i < indexThreads.Length; i++)
            {
                indexThreads[i] = new ThreadAnonymousInnerClassHelper(this, numDocs, values, iw, tw, config);
            }

            // Start every worker before waiting on any of them.
            for (int i = 0; i < indexThreads.Length; i++)
            {
                indexThreads[i].Start();
            }
            for (int i = 0; i < indexThreads.Length; i++)
            {
                indexThreads[i].Join();
            }

            var tr = new DirectoryTaxonomyReader(tw);

            // +1 for root category
            bool sizesMatch = values.Count + 1 == tr.Count;
            if (!sizesMatch)
            {
                // Report every path the reader does not know before failing.
                foreach (string value in values.Keys)
                {
                    var label = new FacetLabel(FacetsConfig.StringToPath(value));
                    if (tr.GetOrdinal(label) == -1)
                    {
                        Console.WriteLine("FAIL: path=" + label + " not recognized");
                    }
                }
                fail("mismatch number of categories");
            }

            int[] parents = tr.ParallelTaxonomyArrays.Parents;
            foreach (string cat in values.Keys)
            {
                var cp = new FacetLabel(FacetsConfig.StringToPath(cat));
                Assert.IsTrue(tr.GetOrdinal(cp) > 0, "category not found " + cp);

                int depthLimit = cp.Length;
                int expectedParent = 0; // for root, parent is always virtual ROOT (ord=0)
                for (int depth = 1; depth <= depthLimit; depth++)
                {
                    FacetLabel prefix = cp.Subpath(depth);
                    int ord = tr.GetOrdinal(prefix);
                    Assert.AreEqual(expectedParent, parents[ord], "invalid parent for cp=" + prefix);
                    expectedParent = ord; // next level should have this parent
                }
            }

            IOUtils.Dispose(tw, iw, tr, taxoDir, indexDir);
        }
        /// <summary>
        /// Adds many categories to a taxonomy writer from several threads, using a
        /// randomly selected writer cache, then reopens the taxonomy and checks that
        /// every recorded category exists and that each path component has the
        /// expected parent ordinal.
        /// </summary>
        public virtual void TestConcurrency()
        {
            int ncats = AtLeast(100000); // add many categories
            int range = ncats * 3;       // affects the categories selection
            var numCats = new AtomicInteger(ncats);
            Directory dir = NewDirectory();
            var values = new ConcurrentDictionary <string, string>();
            double d = Random().NextDouble();

            TaxonomyWriterCache cache;
            if (d < 0.7)
            {
                // this is the fastest, yet most memory consuming
                cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
            }
            else if (TEST_NIGHTLY && d > 0.98)
            {
                // this is the slowest, but tests the writer concurrency when no caching is done.
                // only pick it during NIGHTLY tests, and even then, with very low chances.
                cache = NO_OP_CACHE;
            }
            else
            {
                // this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
                cache = new LruTaxonomyWriterCache(ncats / 10);
            }

            if (VERBOSE)
            {
                Console.WriteLine("TEST: use cache=" + cache);
            }

            var tw = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);

            var addThreads = new ThreadClass[AtLeast(4)];
            for (int z = 0; z < addThreads.Length; z++)
            {
                addThreads[z] = new ThreadAnonymousInnerClassHelper(this, range, numCats, values, tw);
            }

            // Start every worker before waiting on any of them.
            for (int z = 0; z < addThreads.Length; z++)
            {
                addThreads[z].Start();
            }
            for (int z = 0; z < addThreads.Length; z++)
            {
                addThreads[z].Join();
            }
            tw.Dispose();

            var dtr = new DirectoryTaxonomyReader(dir);

            // +1 for root category
            bool sizesMatch = values.Count + 1 == dtr.Size;
            if (!sizesMatch)
            {
                // Report every path the reader does not know before failing.
                foreach (string value in values.Keys)
                {
                    var label = new FacetLabel(FacetsConfig.StringToPath(value));
                    if (dtr.GetOrdinal(label) == -1)
                    {
                        Console.WriteLine("FAIL: path=" + label + " not recognized");
                    }
                }
                Fail("mismatch number of categories");
            }

            int[] parents = dtr.ParallelTaxonomyArrays.Parents();
            foreach (string cat in values.Keys)
            {
                var cp = new FacetLabel(FacetsConfig.StringToPath(cat));
                Assert.True(dtr.GetOrdinal(cp) > 0, "category not found " + cp);

                int depthLimit = cp.Length;
                int expectedParent = 0; // for root, parent is always virtual ROOT (ord=0)
                FacetLabel prefix = new FacetLabel();
                for (int depth = 1; depth <= depthLimit; depth++)
                {
                    prefix = cp.Subpath(depth);
                    int ord = dtr.GetOrdinal(prefix);
                    Assert.AreEqual(expectedParent, parents[ord], "invalid parent for cp=" + prefix);
                    expectedParent = ord; // next level should have this parent
                }
            }

            IOUtils.Close(dtr, dir);
        }
예제 #10
0
        // We need to guarantee that if several threads call this concurrently, only
        // one executes it, and after it returns, the cache is updated and is either
        // complete or not. The whole body therefore runs under lock (this).
        //
        // The fill is attempted at most once, and only after cacheMisses has reached
        // cacheMissesUntilFill. It walks every term of the Consts.FULL field and puts
        // (category path, doc id) into the cache until the cache reports it is full.
        private void PerhapsFillCache()
        {
            lock (this)
            {
                if (cacheMisses < cacheMissesUntilFill)
                {
                    return;
                }

                if (!shouldFillCache)
                {
                    // we already filled the cache once, there's no need to re-fill it
                    return;
                }
                shouldFillCache = false;

                InitReaderManager();

                bool            aborted = false;
                DirectoryReader reader  = readerManager.Acquire();
                try
                {
                    TermsEnum termsEnum = null;
                    DocsEnum  docsEnum  = null;
                    foreach (AtomicReaderContext ctx in reader.Leaves)
                    {
                        Terms terms = ctx.AtomicReader.GetTerms(Consts.FULL);
                        if (terms != null) // cannot really happen, but be on the safe side
                        {
                            termsEnum = terms.GetIterator(termsEnum);
                            while (termsEnum.Next() != null)
                            {
                                if (!cache.IsFull)
                                {
                                    BytesRef t = termsEnum.Term;
                                    // Since we guarantee uniqueness of categories, each term has exactly
                                    // one document. Also, since we do not allow removing categories (and
                                    // hence documents), there are no deletions in the index. Therefore, it
                                    // is sufficient to call next(), and then doc(), exactly once with no
                                    // 'validation' checks.
                                    FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
                                    docsEnum = termsEnum.Docs(null, docsEnum, DocsFlags.NONE);
                                    bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
                                    Debug.Assert(!res, "entries should not have been evicted from the cache");
                                }
                                else
                                {
                                    // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
                                    aborted = true;
                                    break;
                                }
                            }
                        }
                        if (aborted)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    // Always hand the acquired reader back, even if the scan threw.
                    readerManager.Release(reader);
                }

                cacheIsComplete = !aborted;
                if (cacheIsComplete)
                {
                    // Everything is in the cache, so there is no need to keep readerManager open.
                    // The enclosing lock (this) is still held here, so no additional
                    // re-entrant lock is taken. NOTE(review): this presumably keeps the
                    // teardown safe against InitReaderManager being called in parallel --
                    // confirm that it synchronizes on the same object.
                    readerManager.Dispose();
                    readerManager            = null;
                    initializedReaderManager = false;
                }
            }
        }