/// <summary>
        /// Compares a <see cref="FacetLabel"/> with a label stored in a
        /// <see cref="CharBlockArray"/>, beginning at <paramref name="offset"/>.
        /// </summary>
        /// <param name="cp">The label to compare.</param>
        /// <param name="charBlockArray">The array to read from: one char holding the component
        /// count, then for each component one char holding its length followed by its chars.</param>
        /// <param name="offset">Index of the component-count char.</param>
        /// <returns><c>true</c> when the count and every component match; otherwise <c>false</c>.</returns>
        public static bool EqualsToSerialized(FacetLabel cp, CharBlockArray charBlockArray, int offset)
        {
            int position = offset;

            int storedCount = charBlockArray.CharAt(position++);
            if (storedCount != cp.Length)
            {
                return false;
            }

            // An empty label never enters the loop and therefore compares equal.
            for (int index = 0; index < cp.Length; index++)
            {
                int storedLength = charBlockArray.CharAt(position++);
                var component = cp.Components[index];

                // Compare lengths first so the content is only read when it can match.
                if (storedLength != component.Length
                    || !component.Equals(charBlockArray.SubSequence(position, position + storedLength), StringComparison.Ordinal))
                {
                    return false;
                }

                position += storedLength;
            }

            return true;
        }
Exemple #2
0
 /// <summary>
 /// Creates a <see cref="Facet"/> from its group, key, count and labels.
 /// </summary>
 /// <param name="group">Value assigned to <c>Group</c>.</param>
 /// <param name="key">Value assigned to <c>Key</c>.</param>
 /// <param name="count">Value assigned to <c>Count</c>.</param>
 /// <param name="labels">Value assigned to <c>Labels</c>.</param>
 public Facet(FacetGroup group, string key, int count, FacetLabel[] labels)
 {
     this.Group = group;
     this.Key = key;
     this.Labels = labels;
     this.Count = count;
 }
 /// <summary>
 /// Verifies that a <see cref="FacetLabel"/> built with the parameterless constructor
 /// is empty and prints as an empty label.
 /// </summary>
 public virtual void TestDefaultConstructor()
 {
     // The parameterless constructor is expected to yield a label with
     // zero components. If that default ever changes, update this test.
     var emptyLabel = new FacetLabel();

     Assert.AreEqual(0, emptyLabel.Length);
     Assert.AreEqual("FacetLabel: []", emptyLabel.ToString());
 }
 /// <summary>
 /// Returns what <c>cache.GetOrdinal</c> reports for <paramref name="categoryPath"/>,
 /// holding the reader lock for the duration of the lookup.
 /// </summary>
 /// <param name="categoryPath">The category to look up.</param>
 public virtual int Get(FacetLabel categoryPath)
 {
     @lock.AcquireReaderLock(LockTimeOut);
     try
     {
         int ordinal = cache.GetOrdinal(categoryPath);
         return ordinal;
     }
     finally
     {
         // Always give the reader lock back, even if the lookup throws.
         @lock.ReleaseReaderLock();
     }
 }
 /// <summary>
 /// Appends <paramref name="cp"/> to <paramref name="charBlockArray"/>: first the component
 /// count as a single char, then each component as a length char followed by its text.
 /// </summary>
 /// <param name="cp">The label to write.</param>
 /// <param name="charBlockArray">The array that receives the serialized label.</param>
 public static void Serialize(FacetLabel cp, CharBlockArray charBlockArray)
 {
     charBlockArray.Append((char)cp.Length);

     // For an empty label only the count char is written; the loop body never runs.
     for (int index = 0; index < cp.Length; index++)
     {
         var component = cp.Components[index];
         charBlockArray.Append((char)component.Length);
         charBlockArray.Append(component);
     }
 }
 /// <summary>
 /// Checks the sign of <c>FacetLabel.CompareTo</c> for equal, empty, shorter and
 /// differing labels, in both directions.
 /// </summary>
 public virtual void TestCompareTo()
 {
     var reference = new FacetLabel("a", "b", "c", "d");

     // Same components: equal both ways.
     var other = new FacetLabel("a", "b", "c", "d");
     Assert.AreEqual(0, other.CompareTo(reference));
     Assert.AreEqual(0, reference.CompareTo(other));

     // Empty label sorts before the reference.
     other = new FacetLabel();
     Assert.True(other.CompareTo(reference) < 0);
     Assert.True(reference.CompareTo(other) > 0);

     // "b_" in place of "b" sorts after the reference.
     other = new FacetLabel("a", "b_", "c", "d");
     Assert.True(other.CompareTo(reference) > 0);
     Assert.True(reference.CompareTo(other) < 0);

     // A proper prefix sorts before the reference.
     other = new FacetLabel("a", "b", "c");
     Assert.True(other.CompareTo(reference) < 0);
     Assert.True(reference.CompareTo(other) > 0);

     // "e" in place of the final "d" sorts after the reference.
     other = new FacetLabel("a", "b", "c", "e");
     Assert.True(other.CompareTo(reference) > 0);
     Assert.True(reference.CompareTo(other) < 0);
 }
        /// <summary>
        /// Runs several worker threads (<c>ThreadAnonymousInnerClassHelper</c>, defined elsewhere)
        /// against one <see cref="DirectoryTaxonomyWriter"/>, then checks through a
        /// <see cref="DirectoryTaxonomyReader"/> that every key recorded in <c>values</c> is a known
        /// category and that each level of its path has the expected parent ordinal.
        /// </summary>
        public virtual void TestConcurrency()
        {
            int categoryCount = AtLeast(100000); // add many categories
            int selectionRange = categoryCount * 3; // affects the categories selection
            var numCats = new AtomicInteger(categoryCount);
            Directory dir = NewDirectory();
            var values = new ConcurrentDictionary<string, string>();
            double cacheChoice = Random().NextDouble();

            ITaxonomyWriterCache cache;
            if (cacheChoice < 0.7)
            {
                // this is the fastest, yet most memory consuming
                cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
            }
            else if (TEST_NIGHTLY && cacheChoice > 0.98)
            {
                // this is the slowest, but tests the writer concurrency when no caching is done.
                // only pick it during NIGHTLY tests, and even then, with very low chances.
                cache = NO_OP_CACHE;
            }
            else
            {
                // this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
                cache = new LruTaxonomyWriterCache(categoryCount / 10);
            }

            if (VERBOSE)
            {
                Console.WriteLine("TEST: use cache=" + cache);
            }

            var taxoWriter = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);
            ThreadClass[] workers = new ThreadClass[AtLeast(4)];
            for (int slot = 0; slot < workers.Length; slot++)
            {
                workers[slot] = new ThreadAnonymousInnerClassHelper(this, selectionRange, numCats, values, taxoWriter);
            }

            // Start every worker before waiting on any of them.
            foreach (var worker in workers)
            {
                worker.Start();
            }
            foreach (var worker in workers)
            {
                worker.Join();
            }
            taxoWriter.Dispose();

            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);

            // +1 for root category
            int expectedCount = values.Count + 1;
            if (expectedCount != taxoReader.Count)
            {
                // Report each missing path before failing the test.
                foreach (string value in values.Keys)
                {
                    FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(value));
                    if (taxoReader.GetOrdinal(label) == -1)
                    {
                        Console.WriteLine("FAIL: path=" + label + " not recognized");
                    }
                }
                Fail("mismatch number of categories");
            }

            int[] parents = taxoReader.ParallelTaxonomyArrays.Parents;
            foreach (string cat in values.Keys)
            {
                FacetLabel category = new FacetLabel(FacetsConfig.StringToPath(cat));
                Assert.True(taxoReader.GetOrdinal(category) > 0, "category not found " + category);

                int depthLimit = category.Length;
                int expectedParent = 0; // for root, parent is always virtual ROOT (ord=0)
                for (int depth = 1; depth <= depthLimit; depth++)
                {
                    FacetLabel prefix = category.Subpath(depth);
                    int ord = taxoReader.GetOrdinal(prefix);
                    Assert.AreEqual(expectedParent, parents[ord], "invalid parent for cp=" + prefix);
                    expectedParent = ord; // next level should have this parent
                }
            }

            IOUtils.Close(taxoReader, dir);
        }
        /// <summary>
        /// Repeatedly recreates a taxonomy in the same directory with a random number of
        /// categories and checks that a reader reports the expected size afterwards.
        /// </summary>
        /// <param name="random">Source of the random category count for each round.</param>
        /// <param name="closeReader">When <c>true</c> the reader is disposed and reopened from the
        /// directory each round; otherwise it is refreshed via <c>TaxonomyReader.OpenIfChanged</c>.</param>
        private void doTestReadRecreatedTaxonomy(Random random, bool closeReader)
        {
            const int categoryCount = 10;

            Directory dir = null;
            TaxonomyWriter writer = null;
            TaxonomyReader reader = null;

            // prepare a few categories
            FacetLabel[] categories = new FacetLabel[categoryCount];
            for (int index = 0; index < categoryCount; index++)
            {
                categories[index] = new FacetLabel("a", Convert.ToString(index));
            }

            try
            {
                dir = NewDirectory();

                writer = new DirectoryTaxonomyWriter(dir);
                writer.AddCategory(new FacetLabel("a"));
                writer.Dispose();

                reader = new DirectoryTaxonomyReader(dir);
                int baseNumCategories = reader.Size;

                for (int round = 0; round < categoryCount; round++)
                {
                    int lastIndex = random.Next(categoryCount);

                    // Recreate the taxonomy from scratch with categories[0..lastIndex].
                    writer = new DirectoryTaxonomyWriter(dir, IndexWriterConfig.OpenMode_e.CREATE);
                    for (int added = 0; added <= lastIndex; added++)
                    {
                        writer.AddCategory(categories[added]);
                    }
                    writer.Dispose();

                    if (!closeReader)
                    {
                        var refreshed = TaxonomyReader.OpenIfChanged(reader);
                        Assert.NotNull(refreshed);
                        reader.Dispose(true);
                        reader = refreshed;
                    }
                    else
                    {
                        reader.Dispose(true);
                        reader = new DirectoryTaxonomyReader(dir);
                    }

                    Assert.AreEqual(baseNumCategories + 1 + lastIndex, reader.Size, "Wrong #categories in taxonomy (i=" + round + ", k=" + lastIndex + ")");
                }
            }
            finally
            {
                IOUtils.Close(reader as DirectoryTaxonomyReader, writer, dir);
            }
        }
        /// <summary>
        /// Checks that a reader obtained through <c>TaxonomyReader.OpenIfChanged</c> sees a newly
        /// added category while the original reader does not. Runs once with committed changes
        /// and once with a reader opened on the writer.
        /// </summary>
        public virtual void TestOpenIfChangedReuse()
        {
            // test the reuse of data from the old DTR instance
            foreach (bool nrt in new bool[] { false, true })
            {
                Directory dir = NewDirectory();
                var writer = new DirectoryTaxonomyWriter(dir);

                var labelA = new FacetLabel("a");
                writer.AddCategory(labelA);
                if (!nrt)
                {
                    writer.Commit();
                }

                DirectoryTaxonomyReader oldReader;
                if (nrt)
                {
                    oldReader = new DirectoryTaxonomyReader(writer);
                }
                else
                {
                    oldReader = new DirectoryTaxonomyReader(dir);
                }

                // fill the old reader's caches
                Assert.AreEqual(1, oldReader.GetOrdinal(labelA));
                Assert.AreEqual(labelA, oldReader.GetPath(1));

                var labelB = new FacetLabel("b");
                writer.AddCategory(labelB);
                if (!nrt)
                {
                    writer.Commit();
                }

                DirectoryTaxonomyReader newReader = TaxonomyReader.OpenIfChanged(oldReader);
                Assert.NotNull(newReader);

                // add the new reader's categories to the caches
                Assert.AreEqual(2, newReader.GetOrdinal(labelB));
                Assert.AreEqual(labelB, newReader.GetPath(2));

                // the old reader must not see the category added after it was opened
                Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, oldReader.GetOrdinal(labelB));
                Assert.Null(oldReader.GetPath(2));

                oldReader.Dispose();
                newReader.Dispose();
                writer.Dispose();
                dir.Dispose();
            }
        }
        /// <summary>
        /// Runs several worker threads (<c>ThreadAnonymousInnerClassHelper</c>, defined elsewhere)
        /// that share an <see cref="IndexWriter"/> and a <see cref="DirectoryTaxonomyWriter"/>, then
        /// checks through a reader opened on the taxonomy writer that every key recorded in
        /// <c>values</c> is a known category with the expected parent at each path level.
        /// </summary>
        public virtual void TestConcurrency()
        {
            var numDocs = new AtomicInt32(AtLeast(10000));
            Directory indexDir = NewDirectory();
            Directory taxoDir = NewDirectory();
            var values = new ConcurrentDictionary<string, string>();
            var indexWriter = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
            var taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, NewTaxoWriterCache(numDocs));

            ThreadJob[] workers = new ThreadJob[AtLeast(4)];
            var config = new FacetsConfig();

            // Dimensions "l1.0" .. "l1.9" are both hierarchical and multi-valued.
            for (int dim = 0; dim < 10; dim++)
            {
                config.SetHierarchical("l1." + dim, true);
                config.SetMultiValued("l1." + dim, true);
            }

            for (int slot = 0; slot < workers.Length; slot++)
            {
                workers[slot] = new ThreadAnonymousInnerClassHelper(this, numDocs, values, indexWriter, taxoWriter, config);
            }

            // Start every worker before waiting on any of them.
            foreach (ThreadJob worker in workers)
            {
                worker.Start();
            }
            foreach (ThreadJob worker in workers)
            {
                worker.Join();
            }

            var taxoReader = new DirectoryTaxonomyReader(taxoWriter);

            // +1 for root category
            int expectedCount = values.Count + 1;
            if (expectedCount != taxoReader.Count)
            {
                // Report each missing path before failing the test.
                foreach (string value in values.Keys)
                {
                    FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(value));
                    if (taxoReader.GetOrdinal(label) == -1)
                    {
                        Console.WriteLine("FAIL: path=" + label + " not recognized");
                    }
                }
                fail("mismatch number of categories");
            }

            int[] parents = taxoReader.ParallelTaxonomyArrays.Parents;
            foreach (string cat in values.Keys)
            {
                FacetLabel category = new FacetLabel(FacetsConfig.StringToPath(cat));
                Assert.True(taxoReader.GetOrdinal(category) > 0, "category not found " + category);

                int depthLimit = category.Length;
                int expectedParent = 0; // for root, parent is always virtual ROOT (ord=0)
                for (int depth = 1; depth <= depthLimit; depth++)
                {
                    FacetLabel prefix = category.Subpath(depth);
                    int ord = taxoReader.GetOrdinal(prefix);
                    Assert.AreEqual(expectedParent, parents[ord], "invalid parent for cp=" + prefix);
                    expectedParent = ord; // next level should have this parent
                }
            }

            IOUtils.Dispose(taxoWriter, indexWriter, taxoReader, taxoDir, indexDir);
        }
Exemple #11
0
 /// <summary>
 /// Stores <paramref name="val"/> in the cache under the key derived from <paramref name="name"/>.
 /// </summary>
 /// <returns>The value of <c>IsCacheFull</c> after the entry was stored; <c>true</c> means
 /// some room needs to be made.</returns>
 internal virtual bool Put(FacetLabel name, int val)
 {
     var cacheKey = Key(name);
     cache[cacheKey] = val;
     return IsCacheFull;
 }
 /// <summary>
 /// Produces the cache key for <paramref name="name"/>; this implementation uses the label itself.
 /// Subclasses can override this to cache by something else, e.g. a hash of the string.
 /// </summary>
 internal virtual object Key(FacetLabel name) => name;
        // Loads categories from the on-disk index into the cache, at most once, and only
        // after cacheMisses has reached cacheMissesUntilFill.
        //
        // We need to guarantee that if several threads call this concurrently, only
        // one executes it, and after it returns, the cache is updated and is either
        // complete or not. The whole body therefore runs under lock (this).
        private void PerhapsFillCache()
        {
            lock (this)
            {
                // Not enough cache misses yet to justify reading everything from disk.
                if (cacheMisses.Get() < cacheMissesUntilFill)
                {
                    return;
                }

                if (!shouldFillCache)
                {
                    // we already filled the cache once, there's no need to re-fill it
                    return;
                }
                // Cleared before the fill starts, so a later call returns at the check above.
                shouldFillCache = false;

                InitReaderManager();

                // Set when the cache fills up before every term has been visited.
                bool aborted = false;
                DirectoryReader reader = readerManager.Acquire();
                try
                {
                    // Both enumerators are passed back in on each iteration so they can be reused.
                    TermsEnum termsEnum = null;
                    DocsEnum docsEnum = null;
                    foreach (AtomicReaderContext ctx in reader.Leaves)
                    {
                        Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
                        if (terms != null) // cannot really happen, but be on the safe side
                        {
                            termsEnum = terms.Iterator(termsEnum);
                            while (termsEnum.Next() != null)
                            {
                                if (!cache.Full)
                                {
                                    BytesRef t = termsEnum.Term();
                                    // Since we guarantee uniqueness of categories, each term has exactly
                                    // one document. Also, since we do not allow removing categories (and
                                    // hence documents), there are no deletions in the index. Therefore, it
                                    // is sufficient to call next(), and then doc(), exactly once with no
                                    // 'validation' checks.
                                    FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
                                    docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
                                    // The cached value is the leaf-local doc id shifted by the leaf's DocBase.
                                    bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
                                    Debug.Assert(!res, "entries should not have been evicted from the cache");
                                }
                                else
                                {
                                    // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
                                    aborted = true;
                                    break;
                                }
                            }
                        }
                        // The inner break only leaves the while loop; stop visiting further leaves too.
                        if (aborted)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    // Hand the reader back whether or not the fill succeeded.
                    readerManager.Release(reader);
                }

                // The cache is complete only if every term was visited without filling it up.
                cacheIsComplete = !aborted;
                if (cacheIsComplete)
                {
                    // NOTE(review): this locks the same object as the enclosing lock (this),
                    // which is already held at this point.
                    lock (this)
                    {
                        // everything is in the cache, so no need to keep readerManager open.
                        // this block is executed in a sync block so that it works well with
                        // initReaderManager called in parallel.
                        readerManager.Dispose();
                        readerManager = null;
                        initializedReaderManager = false;
                    }
                }
            }
        }
Exemple #14
0
        /// <summary>
        /// Resolves <paramref name="categoryPath"/> to its ordinal, consulting the cache first
        /// and falling back to the on-disk index when the cache cannot answer.
        /// </summary>
        /// <param name="categoryPath">The category to look up.</param>
        /// <returns>The category's ordinal, or a negative number if the category does not yet
        /// exist in the taxonomy.</returns>
        protected virtual int FindCategory(FacetLabel categoryPath)
        {
            lock (this)
            {
                // A hit, or a miss on a cache known to be complete, can be returned as is.
                int cached = cache.Get(categoryPath);
                if (cached >= 0 || cacheIsComplete)
                {
                    return cached;
                }

                // After a few cache misses, it makes sense to read all the categories
                // from disk and into the cache. The reason not to do this on the first
                // cache miss (or even when opening the writer) is that it will
                // significantly slow down the case when a taxonomy is opened just to
                // add one category. Spending a long time on reading only after enough
                // time was spent on cache misses is known as an "online algorithm".
                cacheMisses.IncrementAndGet();
                PerhapsFillCache();

                // Ask again: the fill may have added the category or completed the cache.
                cached = cache.Get(categoryPath);
                if (cached >= 0 || cacheIsComplete)
                {
                    return cached;
                }

                // Neither in the cache nor is the cache complete: the answer has to
                // come from the on-disk index.
                InitReaderManager();

                int foundDoc = -1;
                DirectoryReader reader = readerManager.Acquire();
                try
                {
                    var term = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
                    TermsEnum termsEnum = null; // reused across leaves
                    DocsEnum docsEnum = null; // reused across leaves
                    foreach (AtomicReaderContext leaf in reader.Leaves)
                    {
                        Terms terms = leaf.AtomicReader.Terms(Consts.FULL);
                        if (terms == null)
                        {
                            continue;
                        }

                        termsEnum = terms.Iterator(termsEnum);
                        if (!termsEnum.SeekExact(term))
                        {
                            continue;
                        }

                        // liveDocs=null because the taxonomy has no deletes; freqs not required.
                        docsEnum = termsEnum.Docs(null, docsEnum, 0);
                        // if the term was found, we know it has exactly one document.
                        foundDoc = docsEnum.NextDoc() + leaf.DocBase;
                        break;
                    }
                }
                finally
                {
                    readerManager.Release(reader);
                }

                if (foundDoc > 0)
                {
                    AddToCache(categoryPath, foundDoc);
                }

                return foundDoc;
            }
        }
Exemple #15
0
 /// <summary>
 /// Returns the ordinal assigned to the given label,
 /// or <see cref="INVALID_ORDINAL"/> if the label cannot be found in this table.
 /// </summary>
 /// <param name="label">The label to look up.</param>
 /// <returns>The label's ordinal, or <see cref="INVALID_ORDINAL"/> when the label is not in this table.</returns>
 public abstract int GetOrdinal(FacetLabel label);
Exemple #16
0
 /// <summary>
 /// Adds a new label if it is not yet in the table.
 /// Throws an <see cref="ArgumentException"/> if the same label with
 /// a different ordinal was previously added to this table.
 /// </summary>
 /// <param name="label">The label to add.</param>
 /// <param name="ordinal">The ordinal to associate with <paramref name="label"/>.</param>
 public abstract void AddLabel(FacetLabel label, int ordinal);
 /// <summary>
 /// Produces the cache key for the first <paramref name="prefixLen"/> components of
 /// <paramref name="name"/>: the <c>LongHashCode</c> of that sub-path, boxed.
 /// </summary>
 internal override object Key(FacetLabel name, int prefixLen)
 {
     var prefix = name.Subpath(prefixLen);
     long? hash = prefix.LongHashCode();
     return hash;
 }
 /// <summary>
 /// Produces the cache key for <paramref name="name"/>: its <c>LongHashCode</c>, boxed.
 /// </summary>
 internal override object Key(FacetLabel name)
 {
     long? hash = name.LongHashCode();
     return hash;
 }
Exemple #19
0
 /// <summary>
 /// Produces the cache key for the first <paramref name="prefixLen"/> components of
 /// <paramref name="name"/>; this implementation uses the sub-path itself.
 /// </summary>
 internal virtual object Key(FacetLabel name, int prefixLen) => name.Subpath(prefixLen);
Exemple #20
0
 /// <summary>
 /// Produces the cache key for <paramref name="name"/>; this implementation uses the label itself.
 /// Subclasses can override this to cache by something else, e.g. a hash of the string.
 /// </summary>
 internal virtual object Key(FacetLabel name) => name;
Exemple #21
0
 /// <summary>
 /// Stores <paramref name="val"/> under the key derived from the first
 /// <paramref name="prefixLen"/> components of <paramref name="name"/>.
 /// </summary>
 /// <returns>The value of <c>CacheFull</c> after the entry was stored.</returns>
 internal virtual bool Put(FacetLabel name, int prefixLen, int? val)
 {
     var cacheKey = Key(name, prefixLen);
     cache[cacheKey] = val;
     return CacheFull;
 }
        /// <summary>
        /// Writes one document for <paramref name="categoryPath"/> to the index writer, records
        /// its parent in the taxonomy arrays and adds it to the cache.
        /// <para/>
        /// Note that the methods calling AddCategoryDocument() are synchronized, so
        /// this method is effectively synchronized as well.
        /// </summary>
        /// <param name="categoryPath">The category being added.</param>
        /// <param name="parent">Ordinal of the category's parent.</param>
        /// <returns>The id taken from <c>nextID</c> for the new category.</returns>
        private int AddCategoryDocument(FacetLabel categoryPath, int parent)
        {
            // Before Lucene 2.9, position increments >=0 were supported, so we
            // added 1 to parent to allow the parent -1 (the parent of the root).
            // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
            // no longer enough, since 0 is not encoded consistently either (see
            // comment in SinglePositionTokenStream). But because we must be
            // backward-compatible with existing indexes, we can't just fix what
            // we write here (e.g., to write parent+2), and need to do a workaround
            // in the reader (which knows that anyway only category 0 has a parent
            // -1).
            parentStream.Set(Math.Max(parent + 1, 1));
            Document d = new Document();
            d.Add(parentStreamField);

            // The shared fullPathField is given this category's path string before being added.
            fullPathField.StringValue = FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length);
            d.Add(fullPathField);

            // Note that we do not pass an Analyzer here because the fields that are
            // added to the Document are untokenized or contain their own TokenStream.
            // Therefore the IndexWriter's Analyzer has no effect.
            indexWriter.AddDocument(d);
            // The new category takes the current nextID, which is then advanced.
            int id = nextID++;

            // added a category document, mark that ReaderManager is not up-to-date
            shouldRefreshReaderManager = true;

            // also add to the parent array
            taxoArrays = TaxoArrays.Add(id, parent);

            // NOTE: this line must be executed last, or else the cache gets updated
            // before the parents array (LUCENE-4596)
            AddToCache(categoryPath, id);

            return id;
        }
Exemple #23
0
        // Loads categories from the on-disk index into the cache, at most once, and only
        // after cacheMisses has reached cacheMissesUntilFill.
        //
        // We need to guarantee that if several threads call this concurrently, only
        // one executes it, and after it returns, the cache is updated and is either
        // complete or not. The whole body therefore runs under lock (this).
        private void PerhapsFillCache()
        {
            lock (this)
            {
                // Not enough cache misses yet to justify reading everything from disk.
                if (cacheMisses.Get() < cacheMissesUntilFill)
                {
                    return;
                }

                if (!shouldFillCache)
                {
                    // we already filled the cache once, there's no need to re-fill it
                    return;
                }
                // Cleared before the fill starts, so a later call returns at the check above.
                shouldFillCache = false;

                InitReaderManager();

                // "aborted" is set when the cache fills up before every term has been visited.
                bool            aborted = false;
                DirectoryReader reader  = readerManager.Acquire();
                try
                {
                    // Both enumerators are passed back in on each iteration so they can be reused.
                    TermsEnum termsEnum = null;
                    DocsEnum  docsEnum  = null;
                    foreach (AtomicReaderContext ctx in reader.Leaves)
                    {
                        Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
                        if (terms != null) // cannot really happen, but be on the safe side
                        {
                            termsEnum = terms.Iterator(termsEnum);
                            while (termsEnum.Next() != null)
                            {
                                if (!cache.Full)
                                {
                                    BytesRef t = termsEnum.Term();
                                    // Since we guarantee uniqueness of categories, each term has exactly
                                    // one document. Also, since we do not allow removing categories (and
                                    // hence documents), there are no deletions in the index. Therefore, it
                                    // is sufficient to call next(), and then doc(), exactly once with no
                                    // 'validation' checks.
                                    FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
                                    docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
                                    // The cached value is the leaf-local doc id shifted by the leaf's DocBase.
                                    bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
                                    Debug.Assert(!res, "entries should not have been evicted from the cache");
                                }
                                else
                                {
                                    // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
                                    aborted = true;
                                    break;
                                }
                            }
                        }
                        // The inner break only leaves the while loop; stop visiting further leaves too.
                        if (aborted)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    // Hand the reader back whether or not the fill succeeded.
                    readerManager.Release(reader);
                }

                // The cache is complete only if every term was visited without filling it up.
                cacheIsComplete = !aborted;
                if (cacheIsComplete)
                {
                    // NOTE(review): this locks the same object as the enclosing lock (this),
                    // which is already held at this point.
                    lock (this)
                    {
                        // everything is in the cache, so no need to keep readerManager open.
                        // this block is executed in a sync block so that it works well with
                        // initReaderManager called in parallel.
                        readerManager.Dispose();
                        readerManager            = null;
                        initializedReaderManager = false;
                    }
                }
            }
        }
        /// <summary>
        /// Returns the ordinal of <paramref name="cp"/>, answering from the ordinal cache when
        /// possible and otherwise looking the category up in the index.
        /// </summary>
        /// <param name="cp">The category to look up; an empty label maps to <c>ROOT_ORDINAL</c>.</param>
        /// <returns>The ordinal, or <c>TaxonomyReader.INVALID_ORDINAL</c> when this reader does
        /// not know the category.</returns>
        public override int GetOrdinal(FacetLabel cp)
        {
            EnsureOpen();
            if (cp.Length == 0)
            {
                return ROOT_ORDINAL;
            }

            // First try to find the answer in the LRU cache:
            lock (ordinalCache)
            {
                IntClass cached = ordinalCache.Get(cp);
                if (cached != null && cached.IntItem != null)
                {
                    int cachedOrdinal = (int)cached.IntItem.Value;

                    // Since the cache is shared with DTR instances allocated from
                    // doOpenIfChanged, the ordinal must be one that this DTR instance
                    // recognizes. If it is not, the category was found in the cache but
                    // is unknown to this reader, so searching on disk would not find it
                    // either.
                    return cachedOrdinal < indexReader.MaxDoc
                        ? cachedOrdinal
                        : TaxonomyReader.INVALID_ORDINAL;
                }
            }

            // Cache miss: fetch the value from disk, and then also put it in the cache.
            var term = new BytesRef(FacetsConfig.PathToString(cp.Components, cp.Length));
            DocsEnum docs = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, term, 0);
            if (docs == null || docs.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
            {
                return TaxonomyReader.INVALID_ORDINAL;
            }

            int ordinal = docs.DocID();

            // we only store the fact that a category exists, not its inexistence.
            // This is required because the caches are shared with new DTR instances
            // that are allocated from doOpenIfChanged. Therefore, if we only store
            // information about found categories, we cannot accidentally tell a new
            // generation of DTR that a category does not exist.
            lock (ordinalCache)
            {
                ordinalCache.Put(cp, new IntClass { IntItem = Convert.ToInt32(ordinal) });
            }

            return ordinal;
        }
Exemple #25
0
        // LUCENENET: No need for Key() functions, they are passed in as delegates through the constructor.

        /// <summary>
        /// Stores <paramref name="val"/> in the cache under the key that <c>getKey</c> produces
        /// for <paramref name="name"/>.
        /// </summary>
        /// <returns>The value of <c>IsCacheFull</c> after the entry was stored; <c>true</c> means
        /// some room needs to be made.</returns>
        bool IInternalNameInt32CacheLru.Put(FacetLabel name, int val)
        {
            var cacheKey = getKey(name);
            cache[cacheKey] = val;
            return IsCacheFull;
        }
 /// <summary>
 /// Stores <paramref name="val"/> under the key derived from the first
 /// <paramref name="prefixLen"/> components of <paramref name="name"/>.
 /// </summary>
 /// <returns>The value of <c>CacheFull</c> after the entry was stored.</returns>
 internal virtual bool Put(FacetLabel name, int prefixLen, int? val)
 {
     var cacheKey = Key(name, prefixLen);
     cache[cacheKey] = val;
     return CacheFull;
 }
Exemple #27
0
 /// <summary>
 /// Stores <paramref name="val"/> under the key that <c>getKeyWithPrefixLength</c> produces
 /// for <paramref name="name"/> and <paramref name="prefixLen"/>.
 /// </summary>
 /// <returns>The value of <c>IsCacheFull</c> after the entry was stored.</returns>
 bool IInternalNameInt32CacheLru.Put(FacetLabel name, int prefixLen, int val)
 {
     var cacheKey = getKeyWithPrefixLength(name, prefixLen);
     cache[cacheKey] = val;
     return IsCacheFull;
 }
 /// <summary>
 /// Stores nothing and always returns <c>true</c>.
 /// </summary>
 public virtual bool Put(FacetLabel categoryPath, int ordinal) => true;
Exemple #29
0
 /// <inheritdoc/>
 bool IInternalNameInt32CacheLru.Put(FacetLabel name, int prefixLen, int val)
 {
     // Pure delegation to the wrapped cache.
     return cache.Put(name, prefixLen, val);
 }
        /// <summary>
        /// Checks <c>TaxonomyReader.OpenIfChanged</c> after <c>ReplaceTaxonomy</c>: the refreshed
        /// reader must see only the replacement category and the original reader only the
        /// original one. Runs once with committed changes and once with a reader opened on the writer.
        /// </summary>
        public virtual void TestOpenIfChangedReplaceTaxonomy()
        {
            // test openIfChanged when replaceTaxonomy is called, which is equivalent to recreate
            // only can work with NRT as well
            Directory src = NewDirectory();
            var sourceWriter = new DirectoryTaxonomyWriter(src);
            var labelB = new FacetLabel("b");
            sourceWriter.AddCategory(labelB);
            sourceWriter.Dispose();

            foreach (bool nrt in new bool[] { false, true })
            {
                Directory dir = NewDirectory();
                var writer = new DirectoryTaxonomyWriter(dir);

                var labelA = new FacetLabel("a");
                writer.AddCategory(labelA);
                if (!nrt)
                {
                    writer.Commit();
                }

                DirectoryTaxonomyReader oldReader;
                if (nrt)
                {
                    oldReader = new DirectoryTaxonomyReader(writer);
                }
                else
                {
                    oldReader = new DirectoryTaxonomyReader(dir);
                }

                // fill the old reader's caches
                Assert.AreEqual(1, oldReader.GetOrdinal(labelA));
                Assert.AreEqual(labelA, oldReader.GetPath(1));

                // now replace taxonomy
                writer.ReplaceTaxonomy(src);
                if (!nrt)
                {
                    writer.Commit();
                }

                DirectoryTaxonomyReader newReader = TaxonomyReader.OpenIfChanged(oldReader);
                Assert.NotNull(newReader);

                // fill the new reader's caches
                Assert.AreEqual(1, newReader.GetOrdinal(labelB));
                Assert.AreEqual(labelB, newReader.GetPath(1));

                // the old reader must not see the replacement category
                Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, oldReader.GetOrdinal(labelB));
                Assert.AreEqual(labelA, oldReader.GetPath(1));

                // the new reader must not see the original category
                Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, newReader.GetOrdinal(labelA));
                Assert.AreEqual(labelB, newReader.GetPath(1));

                newReader.Dispose();
                oldReader.Dispose();
                writer.Dispose();
                dir.Dispose();
            }

            src.Dispose();
        }
Exemple #31
0
 /// <inheritdoc/>
 bool IInternalNameInt32CacheLru.TryGetValue(FacetLabel name, out int value)
 {
     // Pure forwarding: the wrapped cache performs the lookup and sets the out value.
     return cache.TryGetValue(name, out value);
 }
        public virtual void TestOpenIfChangedReuseAfterRecreate()
        {
            // When a taxonomy is recreated, a reader obtained through OpenIfChanged
            // must not reuse any data from the previous taxonomy.
            Directory dir = NewDirectory();

            var initialWriter = new DirectoryTaxonomyWriter(dir);
            var labelA = new FacetLabel("a");
            initialWriter.AddCategory(labelA);
            initialWriter.Dispose();

            var oldReader = new DirectoryTaxonomyReader(dir);
            // Populate the old reader's caches.
            Assert.AreEqual(1, oldReader.GetOrdinal(labelA));
            Assert.AreEqual(labelA, oldReader.GetPath(1));

            // Recreate the taxonomy in the same directory with a different category.
            var recreatingWriter = new DirectoryTaxonomyWriter(dir, IndexWriterConfig.OpenMode_e.CREATE);
            var labelB = new FacetLabel("b");
            recreatingWriter.AddCategory(labelB);
            recreatingWriter.Dispose();

            DirectoryTaxonomyReader newReader = TaxonomyReader.OpenIfChanged(oldReader);
            Assert.NotNull(newReader);

            // Populate the new reader's caches.
            Assert.AreEqual(1, newReader.GetOrdinal(labelB));
            Assert.AreEqual(labelB, newReader.GetPath(1));

            // The old reader must still see only "a" ...
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, oldReader.GetOrdinal(labelB));
            Assert.AreEqual(labelA, oldReader.GetPath(1));

            // ... and the new reader must see only "b".
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, newReader.GetOrdinal(labelA));
            Assert.AreEqual(labelB, newReader.GetPath(1));

            newReader.Dispose();
            oldReader.Dispose();
            dir.Dispose();
        }
        public virtual void TestHugeLabel()
        {
            Directory indexDir = NewDirectory();
            Directory taxoDir = NewDirectory();
            var analyzer = new MockAnalyzer(Random());
            var indexWriter = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            var writerCache = new Cl2oTaxonomyWriterCache(2, 1f, 1);
            var taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, writerCache);
            var config = new FacetsConfig();

            // One huge label; 4 chars are reserved for the dimension and separator.
            int len = FacetLabel.MAX_CATEGORY_PATH_LENGTH - 4;
            string bigs = TestUtil.RandomSimpleString(Random(), len, len);
            var hugeField = new FacetField("dim", bigs);
            var hugeLabel = new FacetLabel("dim", bigs);
            int ordinal = taxoWriter.AddCategory(hugeLabel);
            var hugeDoc = new Document();
            hugeDoc.Add(hugeField);
            indexWriter.AddDocument(config.Build(taxoWriter, hugeDoc));

            // A few tiny labels, added to cause a re-hash.
            for (int added = 0; added < 3; added++)
            {
                string tiny = TestUtil.RandomSimpleString(Random(), 1, 10);
                taxoWriter.AddCategory(new FacetLabel("dim", tiny));
                var tinyDoc = new Document();
                tinyDoc.Add(new FacetField("dim", tiny));
                indexWriter.AddDocument(config.Build(taxoWriter, tinyDoc));
            }

            // When too-large components were allowed to be added, re-adding the huge
            // label produced a new category instead of returning the existing ordinal.
            Assert.AreEqual(ordinal, taxoWriter.AddCategory(hugeLabel));

            IOUtils.Close(indexWriter, taxoWriter);

            DirectoryReader indexReader = DirectoryReader.Open(indexDir);
            var taxoReader = new DirectoryTaxonomyReader(taxoDir);
            var searcher = new IndexSearcher(indexReader);
            var drillDown = new DrillDownQuery(new FacetsConfig());
            drillDown.Add("dim", bigs);
            Assert.AreEqual(1, searcher.Search(drillDown, 10).TotalHits);

            IOUtils.Close(indexReader, taxoReader, indexDir, taxoDir);
        }
        /// <summary>
        /// Adds <paramref name="cp"/> to the taxonomy, sets a small commit-data map
        /// on the writer and commits.
        /// </summary>
        private void TouchTaxo(DirectoryTaxonomyWriter taxoWriter, FacetLabel cp)
        {
            taxoWriter.AddCategory(cp);

            var commitData = new Dictionary<string, string>();
            commitData["just"] = "data";
            taxoWriter.CommitData = commitData;

            taxoWriter.Commit();
        }
 /// <summary>
 /// Stores nothing: both arguments are ignored and the method always returns <c>true</c>.
 /// </summary>
 public virtual bool Put(FacetLabel categoryPath, int ordinal) => true;
 /// <summary>
 /// Repeatedly adds a random four-component category to <c>tw</c> until the
 /// shared <c>numCats</c> counter is exhausted, recording every prefix path
 /// (lengths 1 to 4) of each added category in <c>values</c>.
 /// </summary>
 public override void Run()
 {
     Random rnd = Random();
     while (numCats.DecrementAndGet() > 0)
     {
         try
         {
             int value = rnd.Next(range);
             FacetLabel cp = new FacetLabel(
                 Convert.ToString(value / 1000),
                 Convert.ToString(value / 10000),
                 Convert.ToString(value / 100000),
                 Convert.ToString(value));

             int ord = tw.AddCategory(cp);
             Assert.True(tw.GetParent(ord) != -1, "invalid parent for ordinal " + ord + ", category " + cp);

             // Record each prefix of the path, keyed by (and mapped to) its string form.
             for (int depth = 1; depth <= 4; depth++)
             {
                 string prefix = FacetsConfig.PathToString(cp.Components, depth);
                 values[prefix] = prefix;
             }
         }
         catch (IOException e)
         {
             throw new Exception(e.Message, e);
         }
     }
 }
        /// <summary>
        /// Returns the ordinal of <paramref name="cp"/>, consulting the ordinal cache
        /// first and falling back to the index on a cache miss.
        /// </summary>
        /// <param name="cp">The category path to look up.</param>
        /// <returns>
        /// <c>ROOT_ORDINAL</c> for an empty path, the category's ordinal when this
        /// reader knows it, otherwise <see cref="TaxonomyReader.INVALID_ORDINAL"/>.
        /// </returns>
        public override int GetOrdinal(FacetLabel cp)
        {
            EnsureOpen();

            if (cp.Length == 0)
            {
                return ROOT_ORDINAL;
            }

            // Step 1: LRU cache.
            // LUCENENET: Despite LRUHashMap being thread-safe, we get much better
            // performance if reads are separated from writes.
            ordinalCacheLock.EnterReadLock();
            try
            {
                if (ordinalCache.TryGetValue(cp, out Int32Class cached))
                {
                    // The cache is shared with DTR instances allocated from
                    // doOpenIfChanged, so a cached ordinal is only valid here if
                    // this instance's index actually contains it.
                    if (cached < indexReader.MaxDoc)
                    {
                        return cached;
                    }

                    // Cached but unknown to this instance: the path won't be on
                    // disk for this reader either, so skip the disk lookup.
                    return TaxonomyReader.INVALID_ORDINAL;
                }
            }
            finally
            {
                ordinalCacheLock.ExitReadLock();
            }

            // Step 2: cache miss - look the full path up in the index.
            string fullPath = FacetsConfig.PathToString(cp.Components, cp.Length);
            DocsEnum postings = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(fullPath), 0);
            if (postings == null || postings.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
            {
                // Only the existence of a category is ever cached, never its absence:
                // the caches are shared with newer DTR generations, which must not be
                // told that a category they may contain does not exist.
                return TaxonomyReader.INVALID_ORDINAL;
            }

            int ordinal = postings.DocID;

            ordinalCacheLock.EnterWriteLock();
            try
            {
                ordinalCache[cp] = ordinal;
            }
            finally
            {
                ordinalCacheLock.ExitWriteLock();
            }

            return ordinal;
        }
 /// <summary>
 /// Looks nothing up: the argument is ignored and the method always returns <c>-1</c>.
 /// </summary>
 public virtual int Get(FacetLabel categoryPath) => -1;
        /// <summary>
        /// Collects the <paramref name="topN"/> highest-valued direct children of the
        /// category identified by <paramref name="dim"/> and <paramref name="path"/>.
        /// </summary>
        /// <param name="topN">Maximum number of children to return; must be positive.</param>
        /// <param name="dim">The dimension name.</param>
        /// <param name="path">Path components below the dimension.</param>
        /// <returns>
        /// The facet result, or <c>null</c> when the category has no ordinal in the
        /// taxonomy or the sum of its positive child values is zero.
        /// </returns>
        /// <exception cref="System.ArgumentException">When <paramref name="topN"/> is not positive.</exception>
        public override FacetResult GetTopChildren(int topN, string dim, params string[] path)
        {
            if (topN <= 0)
            {
                throw new System.ArgumentException("topN must be > 0 (got: " + topN + ")");
            }

            FacetsConfig.DimConfig dimConfig = VerifyDim(dim);
            FacetLabel cp = new FacetLabel(dim, path);
            int dimOrd = taxoReader.GetOrdinal(cp);
            if (dimOrd == -1)
            {
                return null;
            }

            var queue = new TopOrdAndFloatQueue(Math.Min(taxoReader.Count, topN));
            float threshold = 0;   // smallest value in the queue once it holds topN entries
            float sumValues = 0;
            int childCount = 0;
            TopOrdAndFloatQueue.OrdAndValue spare = null;

            // Walk the sibling chain that starts at the first child of dimOrd.
            for (int ord = children[dimOrd]; ord != TaxonomyReader.INVALID_ORDINAL; ord = siblings[ord])
            {
                var current = values[ord];
                if (current > 0)
                {
                    sumValues += current;
                    childCount++;

                    if (current > threshold)
                    {
                        if (spare == null)
                        {
                            spare = new TopOrdAndFloatQueue.OrdAndValue();
                        }
                        spare.Ord = ord;
                        spare.Value = current;

                        // Whatever the queue hands back is recycled for the next candidate.
                        spare = queue.InsertWithOverflow(spare);
                        if (queue.Size() == topN)
                        {
                            threshold = queue.Top().Value;
                        }
                    }
                }
            }

            if (sumValues == 0)
            {
                return null;
            }

            // For single-valued dimensions the accumulated sum is kept as is. For
            // multi-valued ones it is replaced by the dimension's own value when
            // RequireDimCount is set, and by -1 otherwise because the sum is not
            // correct in general.
            if (dimConfig.MultiValued)
            {
                sumValues = dimConfig.RequireDimCount ? values[dimOrd] : -1;
            }

            // Fill the result array from the last slot backwards, one Pop() per slot.
            LabelAndValue[] labelValues = new LabelAndValue[queue.Size()];
            int slot = labelValues.Length;
            while (--slot >= 0)
            {
                TopOrdAndFloatQueue.OrdAndValue entry = queue.Pop();
                FacetLabel child = taxoReader.GetPath(entry.Ord);
                labelValues[slot] = new LabelAndValue(child.Components[cp.Length], entry.Value);
            }

            return new FacetResult(dim, path, sumValues, labelValues, childCount);
        }
 /// <summary>
 /// Returns the ordinal of <paramref name="categoryPath"/>, adding the category
 /// (via <c>InternalAddCategory</c>) when neither the cache nor
 /// <c>FindCategory</c> knows it.
 /// </summary>
 /// <param name="categoryPath">The category to look up or add.</param>
 /// <returns>The category's ordinal.</returns>
 public virtual int AddCategory(FacetLabel categoryPath)
 {
     EnsureOpen();

     // Fast path: consult the cache without taking the lock, which gives better
     // concurrency when the category is already there.
     int cachedOrdinal = cache.Get(categoryPath);
     if (cachedOrdinal >= 0)
     {
         return cachedOrdinal;
     }

     // Slow path: not in the cache - the following cannot be executed in parallel.
     lock (this)
     {
         int found = FindCategory(categoryPath);
         if (found >= 0)
         {
             return found;
         }

         // A genuinely new category. InternalAddCategory also adds any missing
         // ancestors first, preserving the invariant that a parent is always
         // added to the taxonomy before its child.
         return InternalAddCategory(categoryPath);
     }
 }
Exemple #41
0
 /// <summary>
 /// Returns the value that <c>TryGetValue</c> writes to its out parameter for
 /// <paramref name="name"/>.
 /// </summary>
 internal virtual int Get(FacetLabel name)
 {
     // NOTE(review): the boolean result of TryGetValue is not inspected, so on a
     // miss this returns whatever TryGetValue assigned to the out parameter -
     // confirm that value is the intended "not found" marker.
     int ordinal;
     TryGetValue(name, out ordinal);
     return ordinal;
 }
        /// <summary>
        /// Look up the given category in the cache and/or the on-disk storage,
        /// returning the category's ordinal, or a negative number in case the
        /// category does not yet exist in the taxonomy.
        /// </summary>
        /// <param name="categoryPath">The category to look up.</param>
        /// <returns>
        /// The value returned by the cache when the category is cached or the cache
        /// is complete; otherwise the document number found in the index, or
        /// <c>-1</c> when no segment contains the category's term.
        /// </returns>
        protected virtual int FindCategory(FacetLabel categoryPath)
        {
            lock (this)
            {
                // If we can find the category in the cache, or we know the cache is
                // complete, we can return the response directly from it
                int res = cache.Get(categoryPath);
                if (res >= 0 || cacheIsComplete)
                {
                    return res;
                }

                cacheMisses.IncrementAndGet();
                // After a few cache misses, it makes sense to read all the categories
                // from disk and into the cache. The reason not to do this on the first
                // cache miss (or even when opening the writer) is that it will
                // significantly slow down the case when a taxonomy is opened just to
                // add one category. The idea only spending a long time on reading
                // after enough time was spent on cache misses is known as an "online
                // algorithm".
                PerhapsFillCache();
                res = cache.Get(categoryPath);
                if (res >= 0 || cacheIsComplete)
                {
                    // if after filling the cache from the info on disk, the category is in it
                    // or the cache is complete, return whatever cache.get returned.
                    return res;
                }

                // if we get here, it means the category is not in the cache, and it is not
                // complete, and therefore we must look for the category on disk.

                // We need to get an answer from the on-disk index.
                InitReaderManager();

                // -1 means "not found"; it is overwritten only when a segment contains the term.
                int doc = -1;
                DirectoryReader reader = readerManager.Acquire();
                try
                {
                    BytesRef catTerm = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
                    TermsEnum termsEnum = null; // reuse
                    DocsEnum docs = null; // reuse
                    foreach (AtomicReaderContext ctx in reader.Leaves)
                    {
                        Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
                        // A segment without the FULL field yields null terms and is skipped.
                        if (terms != null)
                        {
                            termsEnum = terms.Iterator(termsEnum);
                            if (termsEnum.SeekExact(catTerm))
                            {
                                // liveDocs=null because the taxonomy has no deletes
                                docs = termsEnum.Docs(null, docs, 0); // freqs not required
                                // if the term was found, we know it has exactly one document.
                                // Adding DocBase turns the segment-local doc ID into a top-level one.
                                doc = docs.NextDoc() + ctx.DocBase;
                                break;
                            }
                        }
                    }
                }
                finally
                {
                    // Always hand the acquired reader back, even if the lookup threw.
                    readerManager.Release(reader);
                }
                // Only strictly positive results are cached; 0 and the -1 "not found"
                // value are returned without touching the cache.
                if (doc > 0)
                {
                    AddToCache(categoryPath, doc);
                }
                return doc;
            }
        }
        public virtual void TestGetChildren()
        {
            Directory dir = NewDirectory();
            var taxoWriter = new DirectoryTaxonomyWriter(dir);
            int numCategories = AtLeast(10);
            int numA = 0;
            int numB = 0;
            Random random = Random;

            // The two parents are added up front so the asserts below stay simple.
            taxoWriter.AddCategory(new FacetLabel("a"));
            taxoWriter.AddCategory(new FacetLabel("b"));
            for (int i = 0; i < numCategories; i++)
            {
                // Each child is randomly placed under "a" or "b".
                bool underA = random.NextBoolean();
                taxoWriter.AddCategory(new FacetLabel(underA ? "a" : "b", Convert.ToString(i)));
                if (underA)
                {
                    numA++;
                }
                else
                {
                    numB++;
                }
            }
            // A top-level category that has no children.
            taxoWriter.AddCategory(new FacetLabel("c"));
            taxoWriter.Dispose();

            var taxoReader = new DirectoryTaxonomyReader(dir);

            // Category that does not exist.
            TaxonomyReader.ChildrenIterator it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("invalid")));
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, it.Next());

            // Category without children.
            it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("c")));
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, it.Next());

            // Arbitrary negative ordinal.
            it = taxoReader.GetChildren(-2);
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, it.Next());

            // Children of the root: exactly "a", "b" and "c", in any order.
            var roots = new JCG.HashSet<string> { "a", "b", "c" };
            it = taxoReader.GetChildren(TaxonomyReader.ROOT_ORDINAL);
            while (roots.Count > 0)
            {
                FacetLabel root = taxoReader.GetPath(it.Next());
                Assert.AreEqual(1, root.Length);
                Assert.True(roots.Remove(root.Components[0]));
            }
            Assert.AreEqual(TaxonomyReader.INVALID_ORDINAL, it.Next());

            // Children of "a" and of "b".
            string[] parents = { "a", "b" };
            int[] expectedCounts = { numA, numB };
            for (int p = 0; p < parents.Length; p++)
            {
                int parentOrdinal = taxoReader.GetOrdinal(new FacetLabel(parents[p]));
                it = taxoReader.GetChildren(parentOrdinal);

                int numChildren = 0;
                for (int child = it.Next(); child != TaxonomyReader.INVALID_ORDINAL; child = it.Next())
                {
                    FacetLabel path = taxoReader.GetPath(child);
                    Assert.AreEqual(2, path.Length);
                    Assert.AreEqual(path.Components[0], parents[p]);
                    ++numChildren;
                }

                Assert.AreEqual(expectedCounts[p], numChildren, "invalid num children");
            }
            taxoReader.Dispose();

            dir.Dispose();
        }
        /// <summary>
        /// Adds <paramref name="cp"/> to the index (and the cache) and returns the
        /// ordinal assigned to it.
        /// <para>
        /// Missing ancestors are added first, by recursion, so that a parent is
        /// always added to the taxonomy before its child.
        /// </para>
        /// </summary>
        /// <param name="cp">The category to add.</param>
        /// <returns>The value returned by <c>AddCategoryDocument</c> for the new category.</returns>
        private int InternalAddCategory(FacetLabel cp)
        {
            // Resolve the parent's ordinal first; it is passed along to
            // AddCategoryDocument together with the category itself.
            int depth = cp.Length;
            int parentOrdinal;

            if (depth == 1)
            {
                // Top-level categories hang directly off the root.
                parentOrdinal = TaxonomyReader.ROOT_ORDINAL;
            }
            else if (depth > 1)
            {
                FacetLabel parentPath = cp.Subpath(depth - 1);
                parentOrdinal = FindCategory(parentPath);
                if (parentOrdinal < 0)
                {
                    // The parent is missing too: add it (and its ancestors) recursively.
                    parentOrdinal = InternalAddCategory(parentPath);
                }
            }
            else
            {
                // An empty path has no parent.
                parentOrdinal = TaxonomyReader.INVALID_ORDINAL;
            }

            return AddCategoryDocument(cp, parentOrdinal);
        }
Exemple #45
0
        /// <summary>
        /// For every index field in <paramref name="byField"/>, adds each facet
        /// field's category to <paramref name="taxoWriter"/>, adds one drill-down
        /// <c>StringField</c> per path prefix to <paramref name="doc"/>, and finally
        /// adds a <c>BinaryDocValuesField</c> holding the de-duplicated, encoded ordinals.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// When a facet field has more than one path component but its dimension is
        /// not configured as hierarchical.
        /// </exception>
        private void ProcessFacetFields(ITaxonomyWriter taxoWriter, IDictionary <string, IList <FacetField> > byField, Document doc)
        {
            foreach (KeyValuePair<string, IList<FacetField>> entry in byField)
            {
                string indexFieldName = entry.Key;
                var ordinals = new Int32sRef(32);

                foreach (FacetField facetField in entry.Value)
                {
                    FacetsConfig.DimConfig dimConfig = GetDimConfig(facetField.Dim);
                    if (facetField.Path.Length > 1 && !dimConfig.IsHierarchical)
                    {
                        throw new ArgumentException("dimension \"" + facetField.Dim + "\" is not hierarchical yet has " + facetField.Path.Length + " components");
                    }

                    var cp = new FacetLabel(facetField.Dim, facetField.Path);

                    CheckTaxoWriter(taxoWriter);
                    int ordinal = taxoWriter.AddCategory(cp);

                    // Append the category's own ordinal, growing the buffer when it is full.
                    if (ordinals.Length == ordinals.Int32s.Length)
                    {
                        ordinals.Grow(ordinals.Length + 1);
                    }
                    ordinals.Int32s[ordinals.Length] = ordinal;
                    ordinals.Length++;

                    if (dimConfig.IsMultiValued && (dimConfig.IsHierarchical || dimConfig.RequireDimCount))
                    {
                        // Append every ancestor with a positive ordinal as well.
                        for (int parent = taxoWriter.GetParent(ordinal); parent > 0; parent = taxoWriter.GetParent(parent))
                        {
                            if (ordinals.Int32s.Length == ordinals.Length)
                            {
                                ordinals.Grow(ordinals.Length + 1);
                            }
                            ordinals.Int32s[ordinals.Length] = parent;
                            ordinals.Length++;
                        }

                        if (!dimConfig.RequireDimCount)
                        {
                            // Remove last (dimension) ord:
                            ordinals.Length--;
                        }
                    }

                    // Drill down: one unstored term per prefix of the path.
                    for (int prefixLen = 1; prefixLen <= cp.Length; prefixLen++)
                    {
                        string prefix = PathToString(cp.Components, prefixLen);
                        doc.Add(new StringField(indexFieldName, prefix, Field.Store.NO));
                    }
                }

                // Facet counts:
                // DocValues are considered stored fields:
                doc.Add(new BinaryDocValuesField(indexFieldName, DedupAndEncode(ordinals)));
            }
        }
 /// <summary>
 /// Puts the (<paramref name="categoryPath"/>, <paramref name="id"/>) pair into
 /// the cache; when the cache reports <c>true</c>, the reader manager is
 /// refreshed and the cache is marked as no longer complete.
 /// </summary>
 private void AddToCache(FacetLabel categoryPath, int id)
 {
     bool evicted = cache.Put(categoryPath, id);
     if (!evicted)
     {
         return;
     }

     // cache.Put() returning true means the size-limited cache became full and
     // parts of it had to be evicted. A relatively new category that is not yet
     // visible to our reader may have been among them, so the reader must be
     // refreshed now.
     RefreshReaderManager();
     cacheIsComplete = false;
 }
Exemple #47
0
        public virtual void TestL2OBasic()
        {
            // Compares CompactLabelToOrdinal against the reference LabelToOrdinalMap,
            // periodically flushing the compact map to a file and reopening it.
            LabelToOrdinal reference = new LabelToOrdinalMap();
            CompactLabelToOrdinal compact = new CompactLabelToOrdinal(200, 0.15f, 3);

            const int iterations = 50;

            string[] uniqueValues = new string[]
            {
                //@"�",
                //@"�r�G��F�\u0382�7\u0019�h�\u0015���#\u001d3\r{��q�_���Ԃ������",
                "foo bar one",
                //new string(new char[] { (char)65533, (char)65533, (char)65, (char)65533, (char)45, (char)106, (char)40, (char)643, (char)65533, (char)11, (char)65533, (char)88, (char)65533, (char)78, (char)126, (char)56, (char)12, (char)71 }),
                //"foo bar two",
                //"foo bar three",
                //"foo bar four",
                //"foo bar five",
                //"foo bar six",
                //"foo bar seven",
                //"foo bar eight",
                //"foo bar nine",
                //"foo bar ten",
                //"foo/bar/one",
                //"foo/bar/two",
                //"foo/bar/three",
                //"foo/bar/four",
                //"foo/bar/five",
                //"foo/bar/six",
                //"foo/bar/seven",
                //"foo/bar/eight",
                //"foo/bar/nine",
                //"foo/bar/ten",
                //""
            };

            // An empty string maps to the empty label; anything else is split on '/'
            // with empty components dropped.
            Func<string, FacetLabel> toLabel = s =>
                s.Length == 0
                    ? new FacetLabel()
                    : new FacetLabel(s.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries));

            var tmpDir = CreateTempDir("testLableToOrdinal");
            var snapshot = new FileInfo(Path.Combine(tmpDir.FullName, "CompactLabelToOrdinalTest.tmp"));
            int flushInterval = 10;

            for (int round = 0; round < iterations; round++)
            {
                if (round > 0 && round % flushInterval == 0)
                {
                    // Round-trip the compact map through the snapshot file.
                    using (var fileStream = new FileStream(snapshot.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite))
                    {
                        compact.Flush(fileStream);
                    }
                    compact = CompactLabelToOrdinal.Open(snapshot, 0.15f, 3);
                    snapshot.Delete();
                    assertFalse(File.Exists(snapshot.FullName));
                    if (flushInterval < (iterations / 10))
                    {
                        flushInterval *= 10;
                    }
                }

                foreach (string s in uniqueValues)
                {
                    FacetLabel label = toLabel(s);

                    int expectedOrd = reference.GetOrdinal(label);
                    int actualOrd = compact.GetOrdinal(label);

                    if (Verbose)
                    {
                        Console.WriteLine("Testing label: " + label.ToString());
                    }

                    assertEquals(expectedOrd, actualOrd);

                    // Unknown label: register it in both maps under the same new ordinal.
                    if (expectedOrd == LabelToOrdinal.INVALID_ORDINAL)
                    {
                        expectedOrd = compact.GetNextOrdinal();
                        reference.AddLabel(label, expectedOrd);
                        compact.AddLabel(label, expectedOrd);
                    }
                }
            }

            // Final pass: both maps must agree on every label.
            foreach (string s in uniqueValues)
            {
                FacetLabel label = toLabel(s);
                int expectedOrd = reference.GetOrdinal(label);
                int actualOrd = compact.GetOrdinal(label);

                if (Verbose)
                {
                    Console.WriteLine("Testing label 2: " + label.ToString());
                }

                assertEquals(expectedOrd, actualOrd);
            }
        }
 /// <summary>
 /// Takes the categories from the given taxonomy directory, and adds the
 /// missing ones to this taxonomy. Additionally, it fills the given
 /// <see cref="OrdinalMap"/> with a mapping from the original ordinal to the new
 /// ordinal.
 /// </summary>
 /// <param name="taxoDir">The directory holding the taxonomy to merge in.</param>
 /// <param name="map">
 /// Receives the size of the source taxonomy, one mapping per source category,
 /// and a final <c>AddDone()</c> call.
 /// </param>
 public virtual void AddTaxonomy(Directory taxoDir, OrdinalMap map)
 {
     EnsureOpen();
     using (DirectoryReader r = DirectoryReader.Open(taxoDir))
     {
         map.Size = r.NumDocs;

         int @base = 0;        // doc base of the current segment
         TermsEnum te = null;  // reused across segments
         DocsEnum docs = null; // reused across terms
         foreach (AtomicReaderContext ctx in r.Leaves)
         {
             AtomicReader ar = ctx.AtomicReader;
             Terms terms = ar.Terms(Consts.FULL);

             // A segment without the FULL field yields null terms; skip its terms
             // (as FindCategory does) instead of dereferencing null.
             if (terms != null)
             {
                 te = terms.Iterator(te);
                 while (te.Next() != null)
                 {
                     FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(te.Term().Utf8ToString()));
                     int ordinal = AddCategory(cp);
                     docs = te.Docs(null, docs, DocsEnum.FLAG_NONE);
                     // Segment-local doc ID + segment base = ordinal in the source taxonomy.
                     map.AddMapping(docs.NextDoc() + @base, ordinal);
                 }
             }

             // Always advance the base, even for a skipped segment.
             @base += ar.MaxDoc; // no deletions, so we're ok
         }
         map.AddDone();
     }
 }
Exemple #49
0
 /// <summary>
 /// Associates <paramref name="label"/> with <paramref name="ordinal"/> in the
 /// backing map, replacing any value already stored for that label.
 /// </summary>
 public override void AddLabel(FacetLabel label, int ordinal) => map[label] = ordinal;
        /// <summary>
        /// Returns the category path stored for <paramref name="ordinal"/>, using the
        /// category cache when possible and loading the document from the index otherwise.
        /// </summary>
        /// <param name="ordinal">The ordinal to resolve.</param>
        /// <returns>
        /// The category's path, or <c>null</c> when <paramref name="ordinal"/> is
        /// negative or not below the index reader's <c>MaxDoc</c>.
        /// </returns>
        public override FacetLabel GetPath(int ordinal)
        {
            EnsureOpen();

            // The cache is shared with DTR instances allocated from doOpenIfChanged,
            // so the ordinal is validated against this instance's index before the
            // cache is consulted.
            bool unknownToThisReader = ordinal < 0 || ordinal >= indexReader.MaxDoc;
            if (unknownToThisReader)
            {
                return null;
            }

            // TODO: can we use an int-based hash impl, such as IntToObjectMap,
            // wrapped as LRU?
            int cacheKey = ordinal;
            lock (categoryCache)
            {
                var cached = categoryCache.Get(cacheKey, false);
                if (cached != null)
                {
                    return cached;
                }
            }

            // Cache miss: rebuild the label from the stored full-path field.
            Document stored = indexReader.Document(ordinal);
            string fullPath = stored.Get(Consts.FULL);
            FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(fullPath));

            lock (categoryCache)
            {
                categoryCache.Put(cacheKey, label);
            }

            return label;
        }
Exemple #51
0
        /// <summary>
        /// Cross-checks <see cref="CompactLabelToOrdinal"/> against a reference
        /// <see cref="LabelToOrdinalMap"/>: random labels are looked up in both, the
        /// returned ordinals must agree, and unknown labels are added to both with the
        /// same ordinal. The compact map is periodically flushed to a temporary file
        /// and re-opened from it, so the round trip through disk is exercised as well.
        /// </summary>
        public virtual void TestL2O()
        {
            LabelToOrdinal map = new LabelToOrdinalMap();

            CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3);

            int       n = AtLeast(10 * 1000);
            const int numUniqueValues = 50 * 1000;

            string[] uniqueValues = new string[numUniqueValues];
            byte[]   buffer       = new byte[50];

            Random random = Random;

            // UTF-8 decoder that substitutes "?" for malformed or unmappable input.
            // This is the equivalent of Java's
            //   StandardCharsets.UTF_8.newDecoder()
            //       .onUnmappableCharacter(CodingErrorAction.REPLACE)
            //       .onMalformedInput(CodingErrorAction.REPLACE)
            // It does not depend on the loop below, so it is built once instead of
            // once per generated value.
            Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage,
                                                    new EncoderReplacementFallback("?"),
                                                    new DecoderReplacementFallback("?"));

            // The index only advances when a usable value was produced, so values
            // containing the terminator char are regenerated in place.
            for (int i = 0; i < numUniqueValues;)
            {
                random.NextBytes(buffer);
                int size = 1 + random.Next(buffer.Length);

                // This test is turning random bytes into a string,
                // this is asking for trouble.
                string value = decoder.GetString(buffer, 0, size);

                // we cannot have empty path components, so eliminate all prefix as well
                // as middle consecutive delimiter chars.
                value = Regex.Replace(value, "/+", "/");
                if (value.StartsWith("/", StringComparison.Ordinal))
                {
                    value = value.Substring(1);
                }

                uniqueValues[i] = value;
                if (value.IndexOf(CompactLabelToOrdinal.TERMINATOR_CHAR) == -1)
                {
                    i++;
                }
            }

            var tmpDir        = CreateTempDir("testLableToOrdinal");
            var f             = new FileInfo(Path.Combine(tmpDir.FullName, "CompactLabelToOrdinalTest.tmp"));
            int flushInterval = 10;

            for (int i = 0; i < n; i++)
            {
                // Periodically round-trip the compact map through the temp file.
                if (i > 0 && i % flushInterval == 0)
                {
                    using (var fileStream = new FileStream(f.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite))
                    {
                        compact.Flush(fileStream);
                    }
                    compact = CompactLabelToOrdinal.Open(f, 0.15f, 3);
                    //assertTrue(f.Delete());
                    f.Delete();
                    assertFalse(File.Exists(f.FullName));

                    // Flush less and less often as the run progresses.
                    if (flushInterval < (n / 10))
                    {
                        flushInterval *= 10;
                    }
                }

                int        index = random.Next(numUniqueValues);
                FacetLabel label = CreateLabelFromPathString(uniqueValues[index]);

                int ord1 = map.GetOrdinal(label);
                int ord2 = compact.GetOrdinal(label);

                if (Verbose)
                {
                    Console.WriteLine("Testing label: " + label.ToString());
                }

                assertEquals(ord1, ord2);

                // Unknown to both maps: register it in both under the same new ordinal.
                if (ord1 == LabelToOrdinal.INVALID_ORDINAL)
                {
                    ord1 = compact.GetNextOrdinal();
                    map.AddLabel(label, ord1);
                    compact.AddLabel(label, ord1);
                }
            }

            // Final pass: every generated value must resolve identically in both maps.
            for (int i = 0; i < numUniqueValues; i++)
            {
                FacetLabel label = CreateLabelFromPathString(uniqueValues[i]);

                int ord1 = map.GetOrdinal(label);
                int ord2 = compact.GetOrdinal(label);

                if (Verbose)
                {
                    Console.WriteLine("Testing label 2: " + label.ToString());
                }

                assertEquals(ord1, ord2);
            }
        }

        /// <summary>
        /// Builds a <see cref="FacetLabel"/> from a '/'-delimited string: an empty
        /// string yields an empty label, otherwise the string is split on '/' with
        /// empty entries removed.
        /// </summary>
        private static FacetLabel CreateLabelFromPathString(string path)
        {
            if (path.Length == 0)
            {
                return new FacetLabel();
            }

            return new FacetLabel(path.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries));
        }
 /// <summary>
 /// Produces the cache key for the first <paramref name="prefixLen"/> components
 /// of <paramref name="name"/>, which is the result of
 /// <c>name.Subpath(prefixLen)</c>.
 /// </summary>
 /// <param name="name">The label to derive the key from.</param>
 /// <param name="prefixLen">The prefix length passed to <c>Subpath</c>.</param>
 /// <returns>The sub-path, typed as <see cref="object"/>.</returns>
 internal virtual object Key(FacetLabel name, int prefixLen)
 {
     object prefixKey = name.Subpath(prefixLen);
     return prefixKey;
 }
        /// <summary>
        /// Adds categories to a <see cref="DirectoryTaxonomyWriter"/> from several
        /// threads at once, using a randomly chosen writer cache. It then re-opens
        /// the taxonomy and verifies that every recorded category exists and that
        /// each path component has the expected parent ordinal.
        /// </summary>
        public virtual void TestConcurrency()
        {
            int                  ncats   = AtLeast(100000); // add many categories
            int                  range   = ncats * 3;       // affects the categories selection
            AtomicInt32          numCats = new AtomicInt32(ncats);
            Directory            dir     = NewDirectory();
            var                  values  = new ConcurrentDictionary <string, string>();
            double               d       = Random.NextDouble();
            ITaxonomyWriterCache cache;

            if (d < 0.7)
            {
                // this is the fastest, yet most memory consuming
                cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
            }
            else if (TEST_NIGHTLY && d > 0.98)
            {
                // this is the slowest, but tests the writer concurrency when no caching is done.
                // only pick it during NIGHTLY tests, and even then, with very low chances.
                cache = NO_OP_CACHE;
            }
            else
            {
                // this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
                cache = new LruTaxonomyWriterCache(ncats / 10);
            }
            if (VERBOSE)
            {
                Console.WriteLine("TEST: use cache=" + cache);
            }
            var tw = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);

            // Start all the adder threads, then wait for every one of them to finish
            // before the writer is disposed.
            ThreadClass[] addThreads = new ThreadClass[AtLeast(4)];
            for (int z = 0; z < addThreads.Length; z++)
            {
                addThreads[z] = new ThreadAnonymousInnerClassHelper(this, range, numCats, values, tw);
            }

            foreach (var t in addThreads)
            {
                t.Start();
            }
            foreach (var t in addThreads)
            {
                t.Join();
            }
            tw.Dispose();

            DirectoryTaxonomyReader dtr = new DirectoryTaxonomyReader(dir);

            // +1 for root category
            if (values.Count + 1 != dtr.Count)
            {
                // Report every missing path before failing, to ease debugging.
                foreach (string value in values.Keys)
                {
                    FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(value));
                    if (dtr.GetOrdinal(label) == -1)
                    {
                        Console.WriteLine("FAIL: path=" + label + " not recognized");
                    }
                }
                Fail("mismatch number of categories");
            }

            int[] parents = dtr.ParallelTaxonomyArrays.Parents;
            foreach (string cat in values.Keys)
            {
                FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(cat));
                Assert.True(dtr.GetOrdinal(cp) > 0, "category not found " + cp);
                int level     = cp.Length;
                int parentOrd = 0; // for root, parent is always virtual ROOT (ord=0)

                // Walk the path one component at a time; each prefix must have the
                // previous prefix as its parent.
                for (int i = 0; i < level; i++)
                {
                    // Declared here rather than pre-initialized with a throwaway
                    // empty FacetLabel that was never read.
                    FacetLabel path = cp.Subpath(i + 1);
                    int ord = dtr.GetOrdinal(path);
                    Assert.AreEqual(parentOrd, parents[ord], "invalid parent for cp=" + path);
                    parentOrd = ord; // next level should have this parent
                }
            }

            IOUtils.Dispose(dtr, dir);
        }
 /// <summary>
 /// Looks up <paramref name="name"/> in <c>cache</c> using the key produced by
 /// <c>Key(name)</c>, and updates the hit/miss counters accordingly.
 /// </summary>
 /// <param name="name">The label to look up.</param>
 /// <returns>The value read from the cache; <c>null</c> counts as a miss.</returns>
 internal virtual int? Get(FacetLabel name)
 {
     // NOTE(review): this relies on the cache indexer yielding null for an
     // absent key rather than throwing - confirm against the type of cache.
     int? ordinal = cache[Key(name)];
     if (ordinal.HasValue)
     {
         nHits++;
     }
     else
     {
         nMisses++;
     }
     return ordinal;
 }
 /// <summary>
 /// Always returns <c>-1</c>; <paramref name="categoryPath"/> is not inspected.
 /// </summary>
 /// <param name="categoryPath">Ignored by this implementation.</param>
 /// <returns><c>-1</c> for every input.</returns>
 public virtual int Get(FacetLabel categoryPath)
 {
     // Presumably -1 signals "not cached" to the caller - confirm against the interface contract.
     return -1;
 }