Beispiel #1
0
        public virtual void TestTermOrd()
        {
            Directory   d = NewDirectory();
            IndexWriter w = new IndexWriter(d, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetCodec(TestUtil.AlwaysPostingsFormat(new Lucene41PostingsFormat())));

            Documents.Document doc = new Documents.Document();
            doc.Add(NewTextField("f", "a b c", Field.Store.NO));
            w.AddDocument(doc);
            w.ForceMerge(1);
            DirectoryReader r     = w.Reader;
            TermsEnum       terms = GetOnlySegmentReader(r).Fields.Terms("f").Iterator(null);

            Assert.IsTrue(terms.Next() != null);
            try
            {
                Assert.AreEqual(0, terms.Ord());
            }
            catch (System.NotSupportedException uoe)
            {
                // ok -- codec is not required to support this op
            }
            r.Dispose();
            w.Dispose();
            d.Dispose();
        }
Beispiel #2
0
        public virtual void TestPrevTermAtEnd()
        {
            IndexWriter writer = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetCodec(TestUtil.AlwaysPostingsFormat(new Lucene41PostingsFormat())));

            AddDoc(writer, "aaa bbb");
            writer.Dispose();
            SegmentReader reader = GetOnlySegmentReader(DirectoryReader.Open(Dir));
            TermsEnum     terms  = reader.Fields.Terms("content").Iterator(null);

            Assert.IsNotNull(terms.Next());
            Assert.AreEqual("aaa", terms.Term().Utf8ToString());
            Assert.IsNotNull(terms.Next());
            long ordB;

            try
            {
                ordB = terms.Ord();
            }
            catch (System.NotSupportedException uoe)
            {
                // ok -- codec is not required to support ord
                reader.Dispose();
                return;
            }
            Assert.AreEqual("bbb", terms.Term().Utf8ToString());
            Assert.IsNull(terms.Next());

            terms.SeekExact(ordB);
            Assert.AreEqual("bbb", terms.Term().Utf8ToString());
            reader.Dispose();
        }
Beispiel #3
0
 public override long LookupTerm(BytesRef key)
 {
     try
     {
         if (Te.SeekCeil(key) == SeekStatus.FOUND)
         {
             return(Te.Ord());
         }
         else
         {
             return(-Te.Ord() - 1);
         }
     }
     catch (System.IO.IOException e)
     {
         throw new Exception(e.Message, e);
     }
 }
Beispiel #4
0
        /// <summary>
        /// Call this only once (if you subclass!) </summary>
        protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
        {
            FieldInfo info = reader.FieldInfos.FieldInfo(Field);

            if (info != null && info.HasDocValues())
            {
                throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
            }
            //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
            long startTime = DateTime.Now.Millisecond;

            Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

            int maxDoc = reader.MaxDoc;

            int[]     index    = new int[maxDoc];     // immediate term numbers, or the index into the byte[] representing the last number
            int[]     lastTerm = new int[maxDoc];     // last term we saw for this document
            sbyte[][] bytes    = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

            Fields fields = reader.Fields;

            if (fields == null)
            {
                // No terms
                return;
            }
            Terms terms = fields.Terms(Field);

            if (terms == null)
            {
                // No terms
                return;
            }

            TermsEnum te        = terms.Iterator(null);
            BytesRef  seekStart = termPrefix != null ? termPrefix : new BytesRef();

            //System.out.println("seekStart=" + seekStart.utf8ToString());
            if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
            {
                // No terms match
                return;
            }

            // If we need our "term index wrapper", these will be
            // init'd below:
            IList <BytesRef> indexedTerms      = null;
            PagedBytes       indexedTermsBytes = null;

            bool testedOrd = false;

            // we need a minimum of 9 bytes, but round up to 12 since the space would
            // be wasted with most allocators anyway.
            sbyte[] tempArr = new sbyte[12];

            //
            // enumerate all terms, and build an intermediate form of the un-inverted field.
            //
            // During this intermediate form, every document has a (potential) byte[]
            // and the int[maxDoc()] array either contains the termNumber list directly
            // or the *end* offset of the termNumber list in it's byte array (for faster
            // appending and faster creation of the final form).
            //
            // idea... if things are too large while building, we could do a range of docs
            // at a time (but it would be a fair amount slower to build)
            // could also do ranges in parallel to take advantage of multiple CPUs

            // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
            // values.  this requires going over the field first to find the most
            // frequent terms ahead of time.

            int termNum = 0;

            DocsEnum = null;

            // Loop begins with te positioned to first term (we call
            // seek above):
            for (; ;)
            {
                BytesRef t = te.Term();
                if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
                {
                    break;
                }
                //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

                if (!testedOrd)
                {
                    try
                    {
                        OrdBase = (int)te.Ord();
                        //System.out.println("got ordBase=" + ordBase);
                    }
                    catch (System.NotSupportedException uoe)
                    {
                        // Reader cannot provide ord support, so we wrap
                        // our own support by creating our own terms index:
                        indexedTerms      = new List <BytesRef>();
                        indexedTermsBytes = new PagedBytes(15);
                        //System.out.println("NO ORDS");
                    }
                    testedOrd = true;
                }

                VisitTerm(te, termNum);

                if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
                {
                    // Index this term
                    SizeOfIndexedStrings += t.Length;
                    BytesRef indexedTerm = new BytesRef();
                    indexedTermsBytes.Copy(t, indexedTerm);
                    // TODO: really should 1) strip off useless suffix,
                    // and 2) use FST not array/PagedBytes
                    indexedTerms.Add(indexedTerm);
                }

                int df = te.DocFreq();
                if (df <= MaxTermDocFreq)
                {
                    DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);

                    // dF, but takes deletions into account
                    int actualDF = 0;

                    for (; ;)
                    {
                        int doc = DocsEnum.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        //System.out.println("  chunk=" + chunk + " docs");

                        actualDF++;
                        TermInstances++;

                        //System.out.println("    docID=" + doc);
                        // add TNUM_OFFSET to the term number to make room for special reserved values:
                        // 0 (end term) and 1 (index into byte array follows)
                        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                        lastTerm[doc] = termNum;
                        int val = index[doc];

                        if ((val & 0xff) == 1)
                        {
                            // index into byte array (actually the end of
                            // the doc-specific byte[] when building)
                            int     pos    = (int)((uint)val >> 8);
                            int     ilen   = VIntSize(delta);
                            sbyte[] arr    = bytes[doc];
                            int     newend = pos + ilen;
                            if (newend > arr.Length)
                            {
                                // We avoid a doubling strategy to lower memory usage.
                                // this faceting method isn't for docs with many terms.
                                // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                                // TODO: figure out what array lengths we can round up to w/o actually using more memory
                                // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                                // It should be safe to round up to the nearest 32 bits in any case.
                                int     newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment
                                sbyte[] newarr = new sbyte[newLen];
                                Array.Copy(arr, 0, newarr, 0, pos);
                                arr        = newarr;
                                bytes[doc] = newarr;
                            }
                            pos        = WriteInt(delta, arr, pos);
                            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                        }
                        else
                        {
                            // OK, this int has data in it... find the end (a zero starting byte - not
                            // part of another number, hence not following a byte with the high bit set).
                            int ipos;
                            if (val == 0)
                            {
                                ipos = 0;
                            }
                            else if ((val & 0x0000ff80) == 0)
                            {
                                ipos = 1;
                            }
                            else if ((val & 0x00ff8000) == 0)
                            {
                                ipos = 2;
                            }
                            else if ((val & 0xff800000) == 0)
                            {
                                ipos = 3;
                            }
                            else
                            {
                                ipos = 4;
                            }

                            //System.out.println("      ipos=" + ipos);

                            int endPos = WriteInt(delta, tempArr, ipos);
                            //System.out.println("      endpos=" + endPos);
                            if (endPos <= 4)
                            {
                                //System.out.println("      fits!");
                                // value will fit in the integer... move bytes back
                                for (int j = ipos; j < endPos; j++)
                                {
                                    val |= (tempArr[j] & 0xff) << (j << 3);
                                }
                                index[doc] = val;
                            }
                            else
                            {
                                // value won't fit... move integer into byte[]
                                for (int j = 0; j < ipos; j++)
                                {
                                    tempArr[j] = (sbyte)val;
                                    val        = (int)((uint)val >> 8);
                                }
                                // point at the end index in the byte[]
                                index[doc] = (endPos << 8) | 1;
                                bytes[doc] = tempArr;
                                tempArr    = new sbyte[12];
                            }
                        }
                    }
                    SetActualDocFreq(termNum, actualDF);
                }

                termNum++;
                if (te.Next() == null)
                {
                    break;
                }
            }

            NumTermsInField = termNum;

            long midPoint = DateTime.Now.Millisecond;

            if (TermInstances == 0)
            {
                // we didn't invert anything
                // lower memory consumption.
                Tnums = null;
            }
            else
            {
                this.Index = index;

                //
                // transform intermediate form into the final form, building a single byte[]
                // at a time, and releasing the intermediate byte[]s as we go to avoid
                // increasing the memory footprint.
                //

                for (int pass = 0; pass < 256; pass++)
                {
                    sbyte[] target = Tnums[pass];
                    int     pos    = 0; // end in target;
                    if (target != null)
                    {
                        pos = target.Length;
                    }
                    else
                    {
                        target = new sbyte[4096];
                    }

                    // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
                    // where pp is the pass (which array we are building), and xx is all values.
                    // each pass shares the same byte[] for termNumber lists.
                    for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
                    {
                        int lim = Math.Min(docbase + (1 << 16), maxDoc);
                        for (int doc = docbase; doc < lim; doc++)
                        {
                            //System.out.println("  pass="******" process docID=" + doc);
                            int val = index[doc];
                            if ((val & 0xff) == 1)
                            {
                                int len = (int)((uint)val >> 8);
                                //System.out.println("    ptr pos=" + pos);
                                index[doc] = (pos << 8) | 1; // change index to point to start of array
                                if ((pos & 0xff000000) != 0)
                                {
                                    // we only have 24 bits for the array index
                                    throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                                }
                                sbyte[] arr = bytes[doc];

                                /*
                                 * for(byte b : arr) {
                                 * //System.out.println("      b=" + Integer.toHexString((int) b));
                                 * }
                                 */
                                bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                                if (target.Length <= pos + len)
                                {
                                    int newlen = target.Length;
                                    /// <summary>
                                    ///* we don't have to worry about the array getting too large
                                    /// since the "pos" param will overflow first (only 24 bits available)
                                    /// if ((newlen<<1) <= 0) {
                                    ///  // overflow...
                                    ///  newlen = Integer.MAX_VALUE;
                                    ///  if (newlen <= pos + len) {
                                    ///    throw new SolrException(400,"Too many terms to uninvert field!");
                                    ///  }
                                    /// } else {
                                    ///  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                                    /// }
                                    /// ***
                                    /// </summary>
                                    while (newlen <= pos + len) // doubling strategy
                                    {
                                        newlen <<= 1;
                                    }
                                    sbyte[] newtarget = new sbyte[newlen];
                                    Array.Copy(target, 0, newtarget, 0, pos);
                                    target = newtarget;
                                }
                                Array.Copy(arr, 0, target, pos, len);
                                pos += len + 1; // skip single byte at end and leave it 0 for terminator
                            }
                        }
                    }

                    // shrink array
                    if (pos < target.Length)
                    {
                        sbyte[] newtarget = new sbyte[pos];
                        Array.Copy(target, 0, newtarget, 0, pos);
                        target = newtarget;
                    }

                    Tnums[pass] = target;

                    if ((pass << 16) > maxDoc)
                    {
                        break;
                    }
                }
            }
            if (indexedTerms != null)
            {
                IndexedTermsArray = indexedTerms.ToArray();
            }

            long endTime = DateTime.Now.Millisecond;

            Total_time  = (int)(endTime - startTime);
            Phase1_time = (int)(midPoint - startTime);
        }
Beispiel #5
0
 public override long Ord()
 {
     return(Tenum.Ord());
 }
Beispiel #6
0
        public virtual void TestSortedTermsEnum()
        {
            Directory         directory = NewDirectory();
            Analyzer          analyzer  = new MockAnalyzer(Random());
            IndexWriterConfig iwconfig  = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);

            iwconfig.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter iwriter = new RandomIndexWriter(Random(), directory, iwconfig);

            Document doc = new Document();

            doc.Add(new StringField("field", "hello", Field.Store.NO));
            iwriter.AddDocument(doc);

            doc = new Document();
            doc.Add(new StringField("field", "world", Field.Store.NO));
            iwriter.AddDocument(doc);

            doc = new Document();
            doc.Add(new StringField("field", "beer", Field.Store.NO));
            iwriter.AddDocument(doc);
            iwriter.ForceMerge(1);

            DirectoryReader ireader = iwriter.Reader;

            iwriter.Dispose();

            AtomicReader       ar = GetOnlySegmentReader(ireader);
            SortedSetDocValues dv = FieldCache.DEFAULT.GetDocTermOrds(ar, "field");

            Assert.AreEqual(3, dv.ValueCount);

            TermsEnum termsEnum = dv.TermsEnum();

            // next()
            Assert.AreEqual("beer", termsEnum.Next().Utf8ToString());
            Assert.AreEqual(0, termsEnum.Ord());
            Assert.AreEqual("hello", termsEnum.Next().Utf8ToString());
            Assert.AreEqual(1, termsEnum.Ord());
            Assert.AreEqual("world", termsEnum.Next().Utf8ToString());
            Assert.AreEqual(2, termsEnum.Ord());

            // seekCeil()
            Assert.AreEqual(SeekStatus.NOT_FOUND, termsEnum.SeekCeil(new BytesRef("ha!")));
            Assert.AreEqual("hello", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(1, termsEnum.Ord());
            Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef("beer")));
            Assert.AreEqual("beer", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(0, termsEnum.Ord());
            Assert.AreEqual(SeekStatus.END, termsEnum.SeekCeil(new BytesRef("zzz")));

            // seekExact()
            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("beer")));
            Assert.AreEqual("beer", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(0, termsEnum.Ord());
            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("hello")));
            Assert.AreEqual("hello", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(1, termsEnum.Ord());
            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("world")));
            Assert.AreEqual("world", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(2, termsEnum.Ord());
            Assert.IsFalse(termsEnum.SeekExact(new BytesRef("bogus")));

            // seek(ord)
            termsEnum.SeekExact(0);
            Assert.AreEqual("beer", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(0, termsEnum.Ord());
            termsEnum.SeekExact(1);
            Assert.AreEqual("hello", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(1, termsEnum.Ord());
            termsEnum.SeekExact(2);
            Assert.AreEqual("world", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(2, termsEnum.Ord());
            ireader.Dispose();
            directory.Dispose();
        }
Beispiel #7
0
        private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
        {
            DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt(Random(), 2, 10));

            FieldCache.Ints docIDToID = FieldCache.DEFAULT.GetInts(r, "id", false);

            /*
             * for(int docID=0;docID<subR.MaxDoc;docID++) {
             * System.out.println("  docID=" + docID + " id=" + docIDToID[docID]);
             * }
             */

            if (VERBOSE)
            {
                Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString()));
                Console.WriteLine("TEST: all TERMS:");
                TermsEnum allTE = MultiFields.GetTerms(r, "field").Iterator(null);
                int       ord   = 0;
                while (allTE.Next() != null)
                {
                    Console.WriteLine("  ord=" + (ord++) + " term=" + allTE.Term().Utf8ToString());
                }
            }

            //final TermsEnum te = subR.Fields.Terms("field").iterator();
            TermsEnum te = dto.GetOrdTermsEnum(r);

            if (dto.NumTerms() == 0)
            {
                if (prefixRef == null)
                {
                    Assert.IsNull(MultiFields.GetTerms(r, "field"));
                }
                else
                {
                    Terms terms = MultiFields.GetTerms(r, "field");
                    if (terms != null)
                    {
                        TermsEnum            termsEnum = terms.Iterator(null);
                        TermsEnum.SeekStatus result    = termsEnum.SeekCeil(prefixRef);
                        if (result != TermsEnum.SeekStatus.END)
                        {
                            Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term(), prefixRef), "term=" + termsEnum.Term().Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString());
                        }
                        else
                        {
                            // ok
                        }
                    }
                    else
                    {
                        // ok
                    }
                }
                return;
            }

            if (VERBOSE)
            {
                Console.WriteLine("TEST: TERMS:");
                te.SeekExact(0);
                while (true)
                {
                    Console.WriteLine("  ord=" + te.Ord() + " term=" + te.Term().Utf8ToString());
                    if (te.Next() == null)
                    {
                        break;
                    }
                }
            }

            SortedSetDocValues iter = dto.GetIterator(r);

            for (int docID = 0; docID < r.MaxDoc; docID++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc + " (id=" + docIDToID.Get(docID) + ")");
                }
                iter.Document = docID;
                int[] answers = idToOrds[docIDToID.Get(docID)];
                int   upto    = 0;
                long  ord;
                while ((ord = iter.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
                {
                    te.SeekExact(ord);
                    BytesRef expected = termsArray[answers[upto++]];
                    if (VERBOSE)
                    {
                        Console.WriteLine("  exp=" + expected.Utf8ToString() + " actual=" + te.Term().Utf8ToString());
                    }
                    Assert.AreEqual(expected, te.Term(), "expected=" + expected.Utf8ToString() + " actual=" + te.Term().Utf8ToString() + " ord=" + ord);
                }
                Assert.AreEqual(answers.Length, upto);
            }
        }
Beispiel #8
0
 public override long Ord()
 {
     return(@in.Ord());
 }
        private void AssertEquals(long numOrds, TermsEnum expected, TermsEnum actual)
        {
            BytesRef @ref;

            // sequential next() through all terms
            while ((@ref = expected.Next()) != null)
            {
                Assert.AreEqual(@ref, actual.Next());
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }
            Assert.IsNull(actual.Next());

            // sequential seekExact(ord) through all terms
            for (long i = 0; i < numOrds; i++)
            {
                expected.SeekExact(i);
                actual.SeekExact(i);
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }

            // sequential seekExact(BytesRef) through all terms
            for (long i = 0; i < numOrds; i++)
            {
                expected.SeekExact(i);
                Assert.IsTrue(actual.SeekExact(expected.Term()));
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }

            // sequential seekCeil(BytesRef) through all terms
            for (long i = 0; i < numOrds; i++)
            {
                expected.SeekExact(i);
                Assert.AreEqual(SeekStatus.FOUND, actual.SeekCeil(expected.Term()));
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }

            // random seekExact(ord)
            for (long i = 0; i < numOrds; i++)
            {
                long randomOrd = TestUtil.NextLong(Random(), 0, numOrds - 1);
                expected.SeekExact(randomOrd);
                actual.SeekExact(randomOrd);
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }

            // random seekExact(BytesRef)
            for (long i = 0; i < numOrds; i++)
            {
                long randomOrd = TestUtil.NextLong(Random(), 0, numOrds - 1);
                expected.SeekExact(randomOrd);
                actual.SeekExact(expected.Term());
                Assert.AreEqual(expected.Ord(), actual.Ord());
                Assert.AreEqual(expected.Term(), actual.Term());
            }

            // random seekCeil(BytesRef)
            for (long i = 0; i < numOrds; i++)
            {
                BytesRef target = new BytesRef(TestUtil.RandomUnicodeString(Random()));
                SeekStatus expectedStatus = expected.SeekCeil(target);
                Assert.AreEqual(expectedStatus, actual.SeekCeil(target));
                if (expectedStatus != SeekStatus.END)
                {
                    Assert.AreEqual(expected.Ord(), actual.Ord());
                    Assert.AreEqual(expected.Term(), actual.Term());
                }
            }
        }