Example #1
        public override sealed int DocFreq(Term term)
        {
            Fields fields = Fields;

            if (fields == null)
            {
                return 0;
            }
            Terms terms = fields.Terms(term.Field);

            if (terms == null)
            {
                return 0;
            }
            TermsEnum termsEnum = terms.Iterator(null);

            if (termsEnum.SeekExact(term.Bytes))
            {
                return termsEnum.DocFreq();
            }
            else
            {
                return 0;
            }
        }
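The Fields → Terms → TermsEnum cascade above is the standard defensive lookup idiom in this API: every level can legitimately be missing (or the seek can fail) when a segment, field, or term is absent. Below is a minimal sketch of the same pattern as a reusable helper; the CountDocs name and the AtomicReader parameter are assumptions for illustration, not part of the original class:

        private static int CountDocs(AtomicReader reader, Term term)
        {
            Fields fields = reader.Fields;                // null when the reader has no postings at all
            Terms terms = fields == null ? null : fields.Terms(term.Field);
            if (terms == null)
            {
                return 0;                                 // an unknown field behaves like a zero-frequency term
            }
            TermsEnum termsEnum = terms.Iterator(null);   // null: do not reuse a prior enum
            return termsEnum.SeekExact(term.Bytes) ? termsEnum.DocFreq() : 0;
        }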
Example #2
        public virtual void TestDocsEnum()
        {
            TermVectorsReader reader = Codec.Default.TermVectorsFormat().VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));

            for (int j = 0; j < 5; j++)
            {
                Terms vector = reader.Get(j).Terms(TestFields[0]);
                Assert.IsNotNull(vector);
                Assert.AreEqual(TestTerms.Length, vector.Size());
                TermsEnum termsEnum = vector.Iterator(null);
                DocsEnum  docsEnum  = null;
                for (int i = 0; i < TestTerms.Length; i++)
                {
                    BytesRef text = termsEnum.Next();
                    Assert.IsNotNull(text);
                    string term = text.Utf8ToString();
                    //System.out.println("Term: " + term);
                    Assert.AreEqual(TestTerms[i], term);

                    docsEnum = TestUtil.Docs(Random(), termsEnum, null, docsEnum, DocsEnum.FLAG_NONE);
                    Assert.IsNotNull(docsEnum);
                    int doc = docsEnum.DocID();
                    Assert.AreEqual(-1, doc);
                    Assert.IsTrue(docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }
                Assert.IsNull(termsEnum.Next());
            }
            reader.Dispose();
        }
Example #3
        public virtual void TestTerms()
        {
            Fields fields = MultiFields.GetFields(Reader);

            foreach (string field in fields)
            {
                Terms terms = fields.Terms(field);
                Assert.IsNotNull(terms);
                TermsEnum termsEnum = terms.Iterator(null);
                while (termsEnum.Next() != null)
                {
                    BytesRef term = termsEnum.Term();
                    Assert.IsTrue(term != null);
                    string fieldValue = (string)DocHelper.NameValues[field];
                    Assert.IsTrue(fieldValue.IndexOf(term.Utf8ToString()) != -1);
                }
            }

            DocsEnum termDocs = TestUtil.Docs(Random(), Reader, DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"), MultiFields.GetLiveDocs(Reader), null, 0);

            Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

            termDocs = TestUtil.Docs(Random(), Reader, DocHelper.NO_NORMS_KEY, new BytesRef(DocHelper.NO_NORMS_TEXT), MultiFields.GetLiveDocs(Reader), null, 0);

            Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

            DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(Reader, MultiFields.GetLiveDocs(Reader), DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"));

            // NOTE: prior rev of this test was failing to first
            // call next here:
            Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.IsTrue(positions.DocID() == 0);
            Assert.IsTrue(positions.NextPosition() >= 0);
        }
Example #4
 /// <summary>
 /// Returns a TermsEnum that implements ord.  If the
 ///  provided reader supports ord, we just return its
 ///  TermsEnum; if it does not, we build a "private" terms
 ///  index internally (WARNING: consumes RAM) and use that
 ///  index to implement ord.  This also enables ord on top
 ///  of a composite reader.  The returned TermsEnum is
 ///  unpositioned.  This method returns null if there are no terms.
 ///
 ///  <p><b>NOTE</b>: you must pass the same reader that was
 ///  used when creating this class.</p>
 /// </summary>
 public virtual TermsEnum GetOrdTermsEnum(AtomicReader reader)
 {
     if (IndexedTermsArray == null)
     {
         //System.out.println("GET normal enum");
         Fields fields = reader.Fields;
         if (fields == null)
         {
             return null;
         }
         Terms terms = fields.Terms(Field);
         if (terms == null)
         {
             return null;
         }
         else
         {
             return terms.Iterator(null);
         }
     }
     else
     {
         //System.out.println("GET wrapped enum ordBase=" + ordBase);
         return new OrdWrappedTermsEnum(this, reader);
     }
 }
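A brief usage sketch for the method above: whichever path is taken, the caller may rely on Ord() afterwards. The docTermOrds and atomicReader names are assumptions; they stand for an instance of this class and the same reader it was built from:

     TermsEnum te = docTermOrds.GetOrdTermsEnum(atomicReader);
     if (te != null) // null: the reader has no terms for this field
     {
         while (te.Next() != null)
         {
             // Ord() is safe even when the codec lacks native ord support,
             // because the wrapped enum emulates it from the private index:
             long ord = te.Ord();
         }
     }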
Example #5
        private void AssertSumDocFreq(IndexReader ir)
        {
            // compute sumDocFreq across all fields
            Fields fields = MultiFields.GetFields(ir);

            foreach (string f in fields)
            {
                Terms terms      = fields.Terms(f);
                long  sumDocFreq = terms.SumDocFreq;
                if (sumDocFreq == -1)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("skipping field: " + f + ", codec does not support sumDocFreq");
                    }
                    continue;
                }

                long      computedSumDocFreq = 0;
                TermsEnum termsEnum          = terms.Iterator(null);
                while (termsEnum.Next() != null)
                {
                    computedSumDocFreq += termsEnum.DocFreq();
                }
                Assert.AreEqual(computedSumDocFreq, sumDocFreq);
            }
        }
Example #6
        /// <summary>
        /// Returns the total number of occurrences of the term
        /// <code>t</code> across all documents.  This method returns 0
        /// if the term or field does not exist.  It does not take into
        /// account deleted documents that have not yet been merged
        /// away.
        /// </summary>
        public override sealed long TotalTermFreq(Term term)
        {
            Fields fields = Fields;

            if (fields == null)
            {
                return 0;
            }
            Terms terms = fields.Terms(term.Field());

            if (terms == null)
            {
                return 0;
            }
            TermsEnum termsEnum = terms.Iterator(null);

            if (termsEnum.SeekExact(term.Bytes()))
            {
                return termsEnum.TotalTermFreq();
            }
            else
            {
                return 0;
            }
        }
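The contrast with DocFreq from Example #1 is worth spelling out. For a hypothetical index holding a single document whose field "f" contains "apple apple banana":

        // DocFreq counts matching documents; TotalTermFreq counts occurrences:
        //   reader.DocFreq(new Term("f", "apple"))         == 1   (one matching document)
        //   reader.TotalTermFreq(new Term("f", "apple"))   == 2   (two occurrences in total)
        //   reader.TotalTermFreq(new Term("f", "missing")) == 0   (absent terms report zero)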
Example #7
        /// <summary>
        /// Creates a <seealso cref="TermContext"/> from a top-level <seealso cref="IndexReaderContext"/> and the
        /// given <seealso cref="Term"/>. This method looks up the given term in all of the context's leaf readers
        /// and registers each reader containing the term in the returned <seealso cref="TermContext"/>
        /// using the leaf reader's ordinal.
        /// <p>
        /// Note: the given context must be a top-level context.
        /// </summary>
        public static TermContext Build(IndexReaderContext context, Term term)
        {
            Debug.Assert(context != null && context.IsTopLevel);
            string      field = term.Field();
            BytesRef    bytes = term.Bytes();
            TermContext perReaderTermState = new TermContext(context);

            //if (DEBUG) System.out.println("prts.build term=" + term);
            foreach (AtomicReaderContext ctx in context.Leaves)
            {
                //if (DEBUG) System.out.println("  r=" + leaves[i].reader);
                Fields fields = ctx.AtomicReader.Fields;
                if (fields != null)
                {
                    Terms terms = fields.Terms(field);
                    if (terms != null)
                    {
                        TermsEnum termsEnum = terms.Iterator(null);
                        if (termsEnum.SeekExact(bytes))
                        {
                            TermState termState = termsEnum.TermState();
                            //if (DEBUG) System.out.println("    found");
                            perReaderTermState.Register(termState, ctx.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                        }
                    }
                }
            }
            return perReaderTermState;
        }
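A hedged usage sketch for Build: it is typically invoked once per query term so that later per-segment work can reuse the cached TermState instead of re-seeking each leaf. The reader variable and the DocFreq accessor are assumptions about the surrounding port:

        TermContext termContext = TermContext.Build(reader.Context, new Term("body", "lucene"));
        // Aggregated document frequency across every leaf that contained the term;
        // assumes the port exposes Java's docFreq() as a DocFreq property:
        int df = termContext.DocFreq;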
Example #8
        private void VerifyCount(IndexReader ir)
        {
            Fields fields = MultiFields.GetFields(ir);

            if (fields == null)
            {
                return;
            }
            foreach (string field in fields)
            {
                Terms terms = fields.Terms(field);
                if (terms == null)
                {
                    continue;
                }
                int         docCount = terms.DocCount;
                FixedBitSet visited  = new FixedBitSet(ir.MaxDoc);
                TermsEnum   te       = terms.Iterator(null);
                while (te.Next() != null)
                {
                    DocsEnum de = TestUtil.Docs(Random(), te, null, null, DocsEnum.FLAG_NONE);
                    while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        visited.Set(de.DocID());
                    }
                }
                Assert.AreEqual(visited.Cardinality(), docCount);
            }
        }
Example #9
        public virtual void TestZeroTerms()
        {
            var d = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(Random(), d, Similarity, TimeZone);
            Document          doc = new Document();

            doc.Add(NewTextField("field", "one two three", Field.Store.NO));
            doc = new Document();
            doc.Add(NewTextField("field2", "one two three", Field.Store.NO));
            w.AddDocument(doc);
            w.Commit();
            w.DeleteDocuments(new Term("field", "one"));
            w.ForceMerge(1);
            IndexReader r = w.Reader;

            w.Dispose();
            Assert.AreEqual(1, r.NumDocs);
            Assert.AreEqual(1, r.MaxDoc);
            Terms terms = MultiFields.GetTerms(r, "field");

            if (terms != null)
            {
                Assert.IsNull(terms.Iterator(null).Next());
            }
            r.Dispose();
            d.Dispose();
        }
Example #10
        public override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
        {
            if (_terms.Size() == 0)
            {
                return TermsEnum.EMPTY;
            }

            return new SeekingTermSetTermsEnum(terms.Iterator(null), _terms, _ords);
        }
Example #11
        public virtual int[] ToDocsArray(Term term, Bits bits, IndexReader reader)
        {
            Fields    fields     = MultiFields.GetFields(reader);
            Terms     cterms     = fields.Terms(term.Field);
            TermsEnum ctermsEnum = cterms.Iterator(null);

            if (ctermsEnum.SeekExact(new BytesRef(term.Text())))
            {
                DocsEnum docsEnum = TestUtil.Docs(Random(), ctermsEnum, bits, null, DocsEnum.FLAG_NONE);
                return ToArray(docsEnum);
            }
            return null;
        }
Example #12
        public virtual void TestMixupDocs()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer     = new RandomIndexWriter(Random(), dir, iwc);
            Document          doc        = new Document();
            FieldType         customType = new FieldType(TextField.TYPE_NOT_STORED);

            customType.StoreTermVectors         = true;
            customType.StoreTermVectorPositions = true;
            customType.StoreTermVectorPayloads  = true;
            customType.StoreTermVectorOffsets   = Random().NextBoolean();
            Field       field = new Field("field", "", customType);
            TokenStream ts    = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);

            Assert.IsFalse(ts.HasAttribute <IPayloadAttribute>());
            field.TokenStream = ts;
            doc.Add(field);
            writer.AddDocument(doc);

            Token withPayload = new Token("withPayload", 0, 11);

            withPayload.Payload = new BytesRef("test");
            ts = new CannedTokenStream(withPayload);
            Assert.IsTrue(ts.HasAttribute <IPayloadAttribute>());
            field.TokenStream = ts;
            writer.AddDocument(doc);

            ts = new MockTokenizer(new StringReader("another"), MockTokenizer.WHITESPACE, true);
            Assert.IsFalse(ts.HasAttribute <IPayloadAttribute>());
            field.TokenStream = ts;
            writer.AddDocument(doc);

            DirectoryReader reader = writer.Reader;
            Terms           terms  = reader.GetTermVector(1, "field");

            Debug.Assert(terms != null);
            TermsEnum termsEnum = terms.Iterator(null);

            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
            DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);

            Assert.AreEqual(0, de.NextDoc());
            Assert.AreEqual(0, de.NextPosition());
            Assert.AreEqual(new BytesRef("test"), de.Payload);
            writer.Dispose();
            reader.Dispose();
            dir.Dispose();
        }
Example #13
        public virtual DocsAndPositionsEnum GetDocsAndPositions(AtomicReader reader, BytesRef bytes, Bits liveDocs)
        {
            Terms terms = reader.Terms(FieldName);

            if (terms != null)
            {
                TermsEnum te = terms.Iterator(null);
                if (te.SeekExact(bytes))
                {
                    return te.DocsAndPositions(liveDocs, null);
                }
            }
            return null;
        }
Example #14
        public virtual void TestMixupMultiValued()
        {
            Directory         dir        = NewDirectory();
            RandomIndexWriter writer     = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
            Document          doc        = new Document();
            FieldType         customType = new FieldType(TextField.TYPE_NOT_STORED);

            customType.StoreTermVectors         = true;
            customType.StoreTermVectorPositions = true;
            customType.StoreTermVectorPayloads  = true;
            customType.StoreTermVectorOffsets   = Random().NextBoolean();
            Field       field = new Field("field", "", customType);
            TokenStream ts    = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);

            Assert.IsFalse(ts.HasAttribute <IPayloadAttribute>());
            field.TokenStream = ts;
            doc.Add(field);
            Field field2      = new Field("field", "", customType);
            Token withPayload = new Token("withPayload", 0, 11);

            withPayload.Payload = new BytesRef("test");
            ts = new CannedTokenStream(withPayload);
            Assert.IsTrue(ts.HasAttribute <IPayloadAttribute>());
            field2.TokenStream = ts;
            doc.Add(field2);
            Field field3 = new Field("field", "", customType);

            ts = new MockTokenizer(new StringReader("nopayload"), MockTokenizer.WHITESPACE, true);
            Assert.IsFalse(ts.HasAttribute <IPayloadAttribute>());
            field3.TokenStream = ts;
            doc.Add(field3);
            writer.AddDocument(doc);
            DirectoryReader reader = writer.Reader;
            Terms           terms  = reader.GetTermVector(0, "field");

            Debug.Assert(terms != null);
            TermsEnum termsEnum = terms.Iterator(null);

            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
            DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);

            Assert.AreEqual(0, de.NextDoc());
            Assert.AreEqual(3, de.NextPosition());
            Assert.AreEqual(new BytesRef("test"), de.Payload);
            writer.Dispose();
            reader.Dispose();
            dir.Dispose();
        }
Example #15
        /// <summary>
        /// Returns <seealso cref="DocsAndPositionsEnum"/> for the specified
        ///  field and term, with control over whether offsets and payloads are
        ///  required.  Some codecs may be able to optimize
        ///  their implementation when offsets and/or payloads are not
        ///  required.  This method returns null if the field or term does not
        ///  exist or positions were not indexed.  See
        ///  <seealso cref="TermsEnum.DocsAndPositions(Bits, DocsAndPositionsEnum, int)"/>.
        /// </summary>
        public static DocsAndPositionsEnum GetTermPositionsEnum(IndexReader r, Bits liveDocs, string field, BytesRef term, int flags)
        {
            Debug.Assert(field != null);
            Debug.Assert(term != null);
            Terms terms = GetTerms(r, field);

            if (terms != null)
            {
                TermsEnum termsEnum = terms.Iterator(null);
                if (termsEnum.SeekExact(term))
                {
                    return termsEnum.DocsAndPositions(liveDocs, null, flags);
                }
            }
            return null;
        }
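A sketch of the usual consumption loop for the returned enum; the reader variable, field name, term, and the 0 flags value (requesting neither offsets nor payloads) are assumptions for illustration:

        DocsAndPositionsEnum dpe = MultiFields.GetTermPositionsEnum(
            reader, MultiFields.GetLiveDocs(reader), "body", new BytesRef("lucene"), 0);
        if (dpe != null) // null: field or term missing, or positions not indexed
        {
            while (dpe.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                for (int i = 0; i < dpe.Freq(); i++)
                {
                    int pos = dpe.NextPosition(); // must be called exactly Freq() times per doc
                }
            }
        }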
Example #16
        private void CheckTerms(Terms terms, Bits liveDocs, params string[] termsList)
        {
            Assert.IsNotNull(terms);
            TermsEnum te = terms.Iterator(null);

            foreach (string t in termsList)
            {
                BytesRef b = te.Next();
                Assert.IsNotNull(b);
                Assert.AreEqual(t, b.Utf8ToString());
                DocsEnum td = TestUtil.Docs(Random(), te, liveDocs, null, DocsEnum.FLAG_NONE);
                Assert.IsTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                Assert.AreEqual(0, td.DocID());
                Assert.AreEqual(td.NextDoc(), DocIdSetIterator.NO_MORE_DOCS);
            }
            Assert.IsNull(te.Next());
        }
Example #18
        public virtual void TestOffsetReader()
        {
            TermVectorsReader reader = Codec.Default.TermVectorsFormat().VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));
            Terms             vector = reader.Get(0).Terms(TestFields[0]);

            Assert.IsNotNull(vector);
            TermsEnum termsEnum = vector.Iterator(null);

            Assert.IsNotNull(termsEnum);
            Assert.AreEqual(TestTerms.Length, vector.Size());
            DocsAndPositionsEnum dpEnum = null;

            for (int i = 0; i < TestTerms.Length; i++)
            {
                BytesRef text = termsEnum.Next();
                Assert.IsNotNull(text);
                string term = text.Utf8ToString();
                Assert.AreEqual(TestTerms[i], term);

                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                Assert.IsNotNull(dpEnum);
                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                Assert.AreEqual(dpEnum.Freq(), Positions[i].Length);
                for (int j = 0; j < Positions[i].Length; j++)
                {
                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
                }
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                Assert.IsNotNull(dpEnum);
                Assert.AreEqual(dpEnum.Freq(), Positions[i].Length);
                for (int j = 0; j < Positions[i].Length; j++)
                {
                    Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
                    Assert.AreEqual(j * 10, dpEnum.StartOffset());
                    Assert.AreEqual(j * 10 + TestTerms[i].Length, dpEnum.EndOffset());
                }
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
            }
            reader.Dispose();
        }
Example #19
        /// <summary>
        /// Returns <seealso cref="DocsAndPositionsEnum"/> for the specified
        ///  term.  This method returns null if the
        ///  field or term does not exist or positions were not indexed. </summary>
        ///  <seealso cref="TermsEnum.DocsAndPositions(Bits, DocsAndPositionsEnum)"/>
        public DocsAndPositionsEnum TermPositionsEnum(Term term)
        {
            Debug.Assert(term.Field() != null);
            Debug.Assert(term.Bytes() != null);
            Fields fields = Fields;

            if (fields != null)
            {
                Terms terms = fields.Terms(term.Field());
                if (terms != null)
                {
                    TermsEnum termsEnum = terms.Iterator(null);
                    if (termsEnum.SeekExact(term.Bytes()))
                    {
                        return termsEnum.DocsAndPositions(LiveDocs, null);
                    }
                }
            }
            return null;
        }
Example #20
        public virtual void TestTermVectors()
        {
            Terms result = Reader.GetTermVectors(0).Terms(DocHelper.TEXT_FIELD_2_KEY);

            Assert.IsNotNull(result);
            Assert.AreEqual(3, result.Size());
            TermsEnum termsEnum = result.Iterator(null);

            while (termsEnum.Next() != null)
            {
                string term = termsEnum.Term().Utf8ToString();
                int    freq = (int)termsEnum.TotalTermFreq();
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term) != -1);
                Assert.IsTrue(freq > 0);
            }

            Fields results = Reader.GetTermVectors(0);

            Assert.IsTrue(results != null);
            Assert.AreEqual(3, results.Size, "We do not have 3 term freq vectors");
        }
Example #21
        private void PrintSegment(StreamWriter @out, SegmentCommitInfo si)
        {
            SegmentReader reader = new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random()));

            for (int i = 0; i < reader.NumDocs; i++)
            {
                @out.WriteLine(reader.Document(i));
            }

            Fields fields = reader.Fields;

            foreach (string field in fields)
            {
                Terms terms = fields.Terms(field);
                Assert.IsNotNull(terms);
                TermsEnum tis = terms.Iterator(null);
                while (tis.Next() != null)
                {
                    @out.Write("  term=" + field + ":" + tis.Term());
                    @out.WriteLine("    DF=" + tis.DocFreq());

                    DocsAndPositionsEnum positions = tis.DocsAndPositions(reader.LiveDocs, null);

                    while (positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        @out.Write(" doc=" + positions.DocID());
                        @out.Write(" TF=" + positions.Freq());
                        @out.Write(" pos=");
                        @out.Write(positions.NextPosition());
                        for (int j = 1; j < positions.Freq(); j++)
                        {
                            @out.Write("," + positions.NextPosition());
                        }
                        @out.WriteLine("");
                    }
                }
            }
            reader.Dispose();
        }
Example #22
        public virtual void TestReader()
        {
            TermVectorsReader reader = Codec.Default.TermVectorsFormat().VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random()));

            for (int j = 0; j < 5; j++)
            {
                Terms vector = reader.Get(j).Terms(TestFields[0]);
                Assert.IsNotNull(vector);
                Assert.AreEqual(TestTerms.Length, vector.Size());
                TermsEnum termsEnum = vector.Iterator(null);
                for (int i = 0; i < TestTerms.Length; i++)
                {
                    BytesRef text = termsEnum.Next();
                    Assert.IsNotNull(text);
                    string term = text.Utf8ToString();
                    //System.out.println("Term: " + term);
                    Assert.AreEqual(TestTerms[i], term);
                }
                Assert.IsNull(termsEnum.Next());
            }
            reader.Dispose();
        }
Example #23
        /// <summary>
        /// Call this only once (if you subclass!) </summary>
        protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
        {
            FieldInfo info = reader.FieldInfos.FieldInfo(Field);

            if (info != null && info.HasDocValues())
            {
                throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
            }
            //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
            long startTime = Environment.TickCount; // was DateTime.Now.Millisecond, which only yields the 0-999 ms component

            Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

            int maxDoc = reader.MaxDoc;

            int[]     index    = new int[maxDoc];     // immediate term numbers, or the index into the byte[] representing the last number
            int[]     lastTerm = new int[maxDoc];     // last term we saw for this document
            sbyte[][] bytes    = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

            Fields fields = reader.Fields;

            if (fields == null)
            {
                // No terms
                return;
            }
            Terms terms = fields.Terms(Field);

            if (terms == null)
            {
                // No terms
                return;
            }

            TermsEnum te        = terms.Iterator(null);
            BytesRef  seekStart = termPrefix != null ? termPrefix : new BytesRef();

            //System.out.println("seekStart=" + seekStart.utf8ToString());
            if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
            {
                // No terms match
                return;
            }

            // If we need our "term index wrapper", these will be
            // init'd below:
            IList <BytesRef> indexedTerms      = null;
            PagedBytes       indexedTermsBytes = null;

            bool testedOrd = false;

            // we need a minimum of 9 bytes, but round up to 12 since the space would
            // be wasted with most allocators anyway.
            sbyte[] tempArr = new sbyte[12];

            //
            // enumerate all terms, and build an intermediate form of the un-inverted field.
            //
            // During this intermediate form, every document has a (potential) byte[]
            // and the int[maxDoc()] array either contains the termNumber list directly
            // or the *end* offset of the termNumber list in its byte array (for faster
            // appending and faster creation of the final form).
            //
            // idea... if things are too large while building, we could do a range of docs
            // at a time (but it would be a fair amount slower to build)
            // could also do ranges in parallel to take advantage of multiple CPUs

            // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
            // values.  this requires going over the field first to find the most
            // frequent terms ahead of time.

            int termNum = 0;

            DocsEnum = null;

            // Loop begins with te positioned to first term (we call
            // seek above):
            for (; ;)
            {
                BytesRef t = te.Term();
                if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
                {
                    break;
                }
                //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

                if (!testedOrd)
                {
                    try
                    {
                        OrdBase = (int)te.Ord();
                        //System.out.println("got ordBase=" + ordBase);
                    }
                    catch (System.NotSupportedException)
                    {
                        // Reader cannot provide ord support, so we wrap
                        // our own support by creating our own terms index:
                        indexedTerms      = new List <BytesRef>();
                        indexedTermsBytes = new PagedBytes(15);
                        //System.out.println("NO ORDS");
                    }
                    testedOrd = true;
                }

                VisitTerm(te, termNum);

                if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
                {
                    // Index this term
                    SizeOfIndexedStrings += t.Length;
                    BytesRef indexedTerm = new BytesRef();
                    indexedTermsBytes.Copy(t, indexedTerm);
                    // TODO: really should 1) strip off useless suffix,
                    // and 2) use FST not array/PagedBytes
                    indexedTerms.Add(indexedTerm);
                }

                int df = te.DocFreq();
                if (df <= MaxTermDocFreq)
                {
                    DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);

                    // dF, but takes deletions into account
                    int actualDF = 0;

                    for (; ;)
                    {
                        int doc = DocsEnum.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        //System.out.println("  chunk=" + chunk + " docs");

                        actualDF++;
                        TermInstances++;

                        //System.out.println("    docID=" + doc);
                        // add TNUM_OFFSET to the term number to make room for special reserved values:
                        // 0 (end term) and 1 (index into byte array follows)
                        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                        lastTerm[doc] = termNum;
                        int val = index[doc];

                        if ((val & 0xff) == 1)
                        {
                            // index into byte array (actually the end of
                            // the doc-specific byte[] when building)
                            int     pos    = (int)((uint)val >> 8);
                            int     ilen   = VIntSize(delta);
                            sbyte[] arr    = bytes[doc];
                            int     newend = pos + ilen;
                            if (newend > arr.Length)
                            {
                                // We avoid a doubling strategy to lower memory usage.
                                // this faceting method isn't for docs with many terms.
                                // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                                // TODO: figure out what array lengths we can round up to w/o actually using more memory
                                // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?)
                                // It should be safe to round up to the nearest 32 bits in any case.
                                int     newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment
                                sbyte[] newarr = new sbyte[newLen];
                                Array.Copy(arr, 0, newarr, 0, pos);
                                arr        = newarr;
                                bytes[doc] = newarr;
                            }
                            pos        = WriteInt(delta, arr, pos);
                            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                        }
                        else
                        {
                            // OK, this int has data in it... find the end (a zero starting byte - not
                            // part of another number, hence not following a byte with the high bit set).
                            int ipos;
                            if (val == 0)
                            {
                                ipos = 0;
                            }
                            else if ((val & 0x0000ff80) == 0)
                            {
                                ipos = 1;
                            }
                            else if ((val & 0x00ff8000) == 0)
                            {
                                ipos = 2;
                            }
                            else if ((val & 0xff800000) == 0)
                            {
                                ipos = 3;
                            }
                            else
                            {
                                ipos = 4;
                            }

                            //System.out.println("      ipos=" + ipos);

                            int endPos = WriteInt(delta, tempArr, ipos);
                            //System.out.println("      endpos=" + endPos);
                            if (endPos <= 4)
                            {
                                //System.out.println("      fits!");
                                // value will fit in the integer... move bytes back
                                for (int j = ipos; j < endPos; j++)
                                {
                                    val |= (tempArr[j] & 0xff) << (j << 3);
                                }
                                index[doc] = val;
                            }
                            else
                            {
                                // value won't fit... move integer into byte[]
                                for (int j = 0; j < ipos; j++)
                                {
                                    tempArr[j] = (sbyte)val;
                                    val        = (int)((uint)val >> 8);
                                }
                                // point at the end index in the byte[]
                                index[doc] = (endPos << 8) | 1;
                                bytes[doc] = tempArr;
                                tempArr    = new sbyte[12];
                            }
                        }
                    }
                    SetActualDocFreq(termNum, actualDF);
                }

                termNum++;
                if (te.Next() == null)
                {
                    break;
                }
            }

            NumTermsInField = termNum;

            long midPoint = Environment.TickCount;

            if (TermInstances == 0)
            {
                // we didn't invert anything
                // lower memory consumption.
                Tnums = null;
            }
            else
            {
                this.Index = index;

                //
                // transform intermediate form into the final form, building a single byte[]
                // at a time, and releasing the intermediate byte[]s as we go to avoid
                // increasing the memory footprint.
                //

                for (int pass = 0; pass < 256; pass++)
                {
                    sbyte[] target = Tnums[pass];
                    int     pos    = 0; // end in target;
                    if (target != null)
                    {
                        pos = target.Length;
                    }
                    else
                    {
                        target = new sbyte[4096];
                    }

                    // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
                    // where pp is the pass (which array we are building), and xx is all values.
                    // each pass shares the same byte[] for termNumber lists.
                    for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
                    {
                        int lim = Math.Min(docbase + (1 << 16), maxDoc);
                        for (int doc = docbase; doc < lim; doc++)
                        {
                            //System.out.println("  pass="******" process docID=" + doc);
                            int val = index[doc];
                            if ((val & 0xff) == 1)
                            {
                                int len = (int)((uint)val >> 8);
                                //System.out.println("    ptr pos=" + pos);
                                index[doc] = (pos << 8) | 1; // change index to point to start of array
                                if ((pos & 0xff000000) != 0)
                                {
                                    // we only have 24 bits for the array index
                                    throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                                }
                                sbyte[] arr = bytes[doc];

                                /*
                                 * for(byte b : arr) {
                                 * //System.out.println("      b=" + Integer.toHexString((int) b));
                                 * }
                                 */
                                bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                                if (target.Length <= pos + len)
                                {
                                    int newlen = target.Length;
                                    /*
                                     * We don't have to worry about the array getting too large
                                     * since the "pos" param will overflow first (only 24 bits available).
                                     * if ((newlen<<1) <= 0) {
                                     *   // overflow...
                                     *   newlen = Integer.MAX_VALUE;
                                     *   if (newlen <= pos + len) {
                                     *     throw new SolrException(400,"Too many terms to uninvert field!");
                                     *   }
                                     * } else {
                                     *   while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                                     * }
                                     */
                                    while (newlen <= pos + len) // doubling strategy
                                    {
                                        newlen <<= 1;
                                    }
                                    sbyte[] newtarget = new sbyte[newlen];
                                    Array.Copy(target, 0, newtarget, 0, pos);
                                    target = newtarget;
                                }
                                Array.Copy(arr, 0, target, pos, len);
                                pos += len + 1; // skip single byte at end and leave it 0 for terminator
                            }
                        }
                    }

                    // shrink array
                    if (pos < target.Length)
                    {
                        sbyte[] newtarget = new sbyte[pos];
                        Array.Copy(target, 0, newtarget, 0, pos);
                        target = newtarget;
                    }

                    Tnums[pass] = target;

                    if ((pass << 16) > maxDoc)
                    {
                        break;
                    }
                }
            }
            if (indexedTerms != null)
            {
                IndexedTermsArray = indexedTerms.ToArray();
            }

            long endTime = Environment.TickCount;

            Total_time  = (int)(endTime - startTime);
            Phase1_time = (int)(midPoint - startTime);
        }
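The uninvert loop above leans on two helpers that are not shown, VIntSize and WriteInt. Below is a sketch of plausible implementations consistent with the in-code comments (most-significant 7-bit groups first, the high bit set on every byte except the last, so a zero byte that does not follow a continuation byte can serve as a list terminator); treat these as assumptions, not the original helpers:

        // Number of bytes the variable-length encoding of x will occupy.
        private static int VIntSize(int x)
        {
            if ((x & ~0x7f) == 0)      return 1; // fits in 7 bits
            if ((x & ~0x3fff) == 0)    return 2; // 14 bits
            if ((x & ~0x1fffff) == 0)  return 3; // 21 bits
            if ((x & ~0xfffffff) == 0) return 4; // 28 bits
            return 5;
        }

        // Writes x at arr[pos] and returns the new end position.
        private static int WriteInt(int x, sbyte[] arr, int pos)
        {
            // Emit the higher 7-bit groups with the continuation bit (0x80) set...
            for (int shift = 28; shift > 0; shift -= 7)
            {
                int a = (int)((uint)x >> shift);
                if (a != 0)
                {
                    arr[pos++] = (sbyte)(a | 0x80);
                }
            }
            // ...and the lowest 7 bits with the high bit clear.
            arr[pos++] = (sbyte)(x & 0x7f);
            return pos;
        }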
Example #24
        protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
        {
            Assert.AreEqual(1, terms.DocCount);
            int termCount = (new HashSet<string>(Arrays.AsList(tk.Terms))).Count;
            Assert.AreEqual(termCount, terms.Size());
            Assert.AreEqual(termCount, terms.SumDocFreq);
            Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
            Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
            Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());
            HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();
            foreach (string term in tk.Freqs.Keys)
            {
                uniqueTerms.Add(new BytesRef(term));
            }
            BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
            Array.Sort(sortedTerms, terms.Comparator);
            TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);
            this.termsEnum.Value = termsEnum;
            for (int i = 0; i < sortedTerms.Length; ++i)
            {
                BytesRef nextTerm = termsEnum.Next();
                Assert.AreEqual(sortedTerms[i], nextTerm);
                Assert.AreEqual(sortedTerms[i], termsEnum.Term());
                Assert.AreEqual(1, termsEnum.DocFreq());

                FixedBitSet bits = new FixedBitSet(1);
                DocsEnum docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                bits.Set(0);

                docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
                Assert.IsNotNull(docsEnum);
                Assert.AreEqual(0, docsEnum.NextDoc());
                Assert.AreEqual(0, docsEnum.DocID());
                Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                this.docsEnum.Value = docsEnum;

                bits.Clear(0);
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (docsAndPositionsEnum != null)
                {
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                bits.Set(0);

                docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (terms.HasPositions() || terms.HasOffsets())
                {
                    Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
                    int freq = docsAndPositionsEnum.Freq();
                    Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
                    if (docsAndPositionsEnum != null)
                    {
                        for (int k = 0; k < freq; ++k)
                        {
                            int position = docsAndPositionsEnum.NextPosition();
                            ISet<int?> indexes;
                            if (terms.HasPositions())
                            {
                                indexes = tk.PositionToTerms[position];
                                Assert.IsNotNull(indexes);
                            }
                            else
                            {
                                indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                                Assert.IsNotNull(indexes);
                            }
                            if (terms.HasPositions())
                            {
                                bool foundPosition = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                                    {
                                        foundPosition = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPosition);
                            }
                            if (terms.HasOffsets())
                            {
                                bool foundOffset = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                                    {
                                        foundOffset = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundOffset);
                            }
                            if (terms.HasPayloads())
                            {
                                bool foundPayload = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                                    {
                                        foundPayload = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPayload);
                            }
                        }
                        try
                        {
                            docsAndPositionsEnum.NextPosition();
                            Assert.Fail();
                        }
                        catch (Exception)
                        {
                            // ok
                        }
                    }
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
            }
            Assert.IsNull(termsEnum.Next());
            for (int i = 0; i < 5; ++i)
            {
                if (Random().NextBoolean())
                {
                    Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
                else
                {
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
            }
        }
Example #25
        public virtual void Test()
        {
            IList <string> postingsList   = new List <string>();
            int            numTerms       = AtLeast(300);
            int            maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20);
            bool           isSimpleText   = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"));

            IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

            if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1))
            {
                // Otherwise test can take way too long (> 2 hours)
                numTerms /= 2;
            }
            if (VERBOSE)
            {
                Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
                Console.WriteLine("numTerms=" + numTerms);
            }
            for (int i = 0; i < numTerms; i++)
            {
                string term = Convert.ToString(i);
                for (int j = 0; j < i; j++)
                {
                    postingsList.Add(term);
                }
            }

            postingsList = CollectionsHelper.Shuffle(postingsList);

            ConcurrentQueue <string> postings = new ConcurrentQueue <string>(postingsList);

            Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName()));

            RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc);

            int threadCount = TestUtil.NextInt(Random(), 1, 5);

            if (VERBOSE)
            {
                Console.WriteLine("config: " + iw.w.Config);
                Console.WriteLine("threadCount=" + threadCount);
            }

            Field     prototype = NewTextField("field", "", Field.Store.NO);
            FieldType fieldType = new FieldType((FieldType)prototype.FieldType);

            if (Random().NextBoolean())
            {
                fieldType.OmitNorms = true;
            }
            int options = Random().Next(3);

            if (options == 0)
            {
                fieldType.IndexOptions     = FieldInfo.IndexOptions.DOCS_AND_FREQS; // we don't actually need positions
                fieldType.StoreTermVectors = true;                                  // but enforce term vectors when we do this, so we check SOMETHING
            }
            else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field")))
            {
                fieldType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            }
            // else just positions

            ThreadClass[]  threads     = new ThreadClass[threadCount];
            CountdownEvent startingGun = new CountdownEvent(1);

            for (int threadID = 0; threadID < threadCount; threadID++)
            {
                Random   threadRandom = new Random(Random().Next());
                Document document     = new Document();
                Field    field        = new Field("field", "", fieldType);
                document.Add(field);
                threads[threadID] = new ThreadAnonymousInnerClassHelper(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field);
                threads[threadID].Start();
            }
            startingGun.Signal();
            foreach (ThreadClass t in threads)
            {
                t.Join();
            }

            iw.ForceMerge(1);
            DirectoryReader ir = iw.Reader;

            Assert.AreEqual(1, ir.Leaves.Count);
            AtomicReader air   = (AtomicReader)ir.Leaves[0].Reader;
            Terms        terms = air.Terms("field");

            // numTerms-1 because there cannot be a term 0 with 0 postings:
            Assert.AreEqual(numTerms - 1, terms.Size());
            TermsEnum termsEnum = terms.Iterator(null);
            BytesRef  termBR;

            while ((termBR = termsEnum.Next()) != null)
            {
                int value = Convert.ToInt32(termBR.Utf8ToString());
                Assert.AreEqual(value, termsEnum.TotalTermFreq());
                // don't really need to check more than this, as CheckIndex
                // will verify that totalTermFreq == total number of positions seen
                // from a docsAndPositionsEnum.
            }
            ir.Dispose();
            iw.Dispose();
            dir.Dispose();
        }
Example #26
 public override void Run()
 {
     if (VERBOSE)
     {
         Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread");
     }
     while (DateTime.UtcNow < StopTime)
     {
         try
         {
             IndexSearcher s = OuterInstance.CurrentSearcher;
             try
             {
                 // Verify 1) IW is correctly setting
                 // diagnostics, and 2) segment warming for
                 // merged segments is actually happening:
                 foreach (AtomicReaderContext sub in s.IndexReader.Leaves)
                 {
                     SegmentReader segReader = (SegmentReader)sub.Reader;
                     IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
                     Assert.IsNotNull(diagnostics);
                     string source = diagnostics["source"];
                     Assert.IsNotNull(source);
                     if (source.Equals("merge"))
                     {
                         Assert.IsTrue(!OuterInstance.AssertMergedSegmentsWarmed || OuterInstance.Warmed.ContainsKey((SegmentCoreReaders)segReader.CoreCacheKey), "sub reader " + sub + " wasn't warmed: warmed=" + OuterInstance.Warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo);
                     }
                 }
                 if (s.IndexReader.NumDocs > 0)
                 {
                     OuterInstance.SmokeTestSearcher(s);
                     Fields fields = MultiFields.GetFields(s.IndexReader);
                     if (fields == null)
                     {
                         continue;
                     }
                     Terms terms = fields.Terms("body");
                     if (terms == null)
                     {
                         continue;
                     }
                     TermsEnum termsEnum     = terms.Iterator(null);
                     int       seenTermCount = 0;
                     int       shift;
                     int       trigger;
                     if (TotTermCount.Get() < 30)
                     {
                         shift   = 0;
                         trigger = 1;
                     }
                     else
                     {
                         trigger = TotTermCount.Get() / 30;
                         shift   = Random().Next(trigger);
                     }
                     while (DateTime.UtcNow < StopTime)
                     {
                         BytesRef term = termsEnum.Next();
                         if (term == null)
                         {
                             TotTermCount.Set(seenTermCount);
                             break;
                         }
                         seenTermCount++;
                         // search 30 terms
                         if ((seenTermCount + shift) % trigger == 0)
                         {
                             //if (VERBOSE) {
                             //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString());
                             //}
                             TotHits.AddAndGet(OuterInstance.RunQuery(s, new TermQuery(new Term("body", term))));
                         }
                     }
                     //if (VERBOSE) {
                     //System.out.println(Thread.currentThread().getName() + ": search done");
                     //}
                 }
             }
             finally
             {
                 OuterInstance.ReleaseSearcher(s);
             }
         }
         catch (Exception t)
         {
             Console.WriteLine(Thread.CurrentThread.Name + ": hit exc");
             OuterInstance.Failed.Set(true);
             Console.WriteLine(t.StackTrace);
             throw new Exception(t.Message, t);
         }
     }
 }
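
The loop above is the standard postings-iteration pattern: obtain a merged Fields view over the reader, pull a TermsEnum for one field, and walk terms until Next() returns null. A minimal standalone sketch of that pattern, assuming `reader` is an open IndexReader and `searcher` an IndexSearcher over it (both names are placeholders, not part of the test above):

        Fields fields = MultiFields.GetFields(reader);
        Terms terms = fields == null ? null : fields.Terms("body");
        if (terms != null)
        {
            TermsEnum termsEnum = terms.Iterator(null);
            BytesRef term;
            while ((term = termsEnum.Next()) != null)
            {
                // The returned BytesRef is only valid until the next call to
                // Next(), so copy it before handing it to a query:
                TopDocs hits = searcher.Search(new TermQuery(new Term("body", BytesRef.DeepCopyOf(term))), 10);
            }
        }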
Example #27
        /// <summary>
        /// Terms api equivalency
        /// </summary>
        public void AssertTermsEquals(string info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, bool deep)
        {
            if (leftTerms == null || rightTerms == null)
            {
                Assert.IsNull(leftTerms, info);
                Assert.IsNull(rightTerms, info);
                return;
            }
            AssertTermsStatisticsEquals(info, leftTerms, rightTerms);
            Assert.AreEqual(leftTerms.HasOffsets(), rightTerms.HasOffsets(), info);
            Assert.AreEqual(leftTerms.HasPositions(), rightTerms.HasPositions(), info);
            Assert.AreEqual(leftTerms.HasPayloads(), rightTerms.HasPayloads(), info);

            TermsEnum leftTermsEnum = leftTerms.Iterator(null);
            TermsEnum rightTermsEnum = rightTerms.Iterator(null);
            AssertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true);

            AssertTermsSeekingEquals(info, leftTerms, rightTerms);

            if (deep)
            {
                int numIntersections = AtLeast(3);
                for (int i = 0; i < numIntersections; i++)
                {
                    string re = AutomatonTestUtil.RandomRegexp(Random());
                    CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton());
                    if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL)
                    {
                        // TODO: test start term too
                        TermsEnum leftIntersection = leftTerms.Intersect(automaton, null);
                        TermsEnum rightIntersection = rightTerms.Intersect(automaton, null);
                        AssertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, Rarely());
                    }
                }
            }
        }
Example #28
 public override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
 {
     if (!termLongEnough)
     {  // can only match if it's exact
         return new SingleTermsEnum(terms.Iterator(null), term.Bytes);
     }
     return new SlowFuzzyTermsEnum(terms, atts, Term, minimumSimilarity, prefixLength);
 }
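
The termLongEnough guard mirrors FuzzyQuery's short-term optimization: when the query term is shorter than the edit budget implied by minimumSimilarity, only an exact match can ever qualify, so the expensive fuzzy enumeration collapses to a single exact seek via SingleTermsEnum. A hedged usage sketch (the field name and values are illustrative only):

     // With a high similarity threshold and a two-character term, no edits fit
     // the budget, so GetTermsEnum above degenerates to an exact single-term seek.
     var q = new SlowFuzzyQuery(new Term("body", "ab"), 0.9f, 0);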
Example #29
        public virtual void TestDoubleOffsetCounting()
        {
            Directory   dir        = NewDirectory();
            IndexWriter w          = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
            Document    doc        = new Document();
            FieldType   customType = new FieldType(StringField.TYPE_NOT_STORED);

            customType.StoreTermVectors         = true;
            customType.StoreTermVectorPositions = true;
            customType.StoreTermVectorOffsets   = true;
            Field f = NewField("field", "abcd", customType);

            doc.Add(f);
            doc.Add(f);
            Field f2 = NewField("field", "", customType);

            doc.Add(f2);
            doc.Add(f);
            w.AddDocument(doc);
            w.Dispose();

            IndexReader r      = DirectoryReader.Open(dir);
            Terms       vector = r.GetTermVectors(0).Terms("field");

            Assert.IsNotNull(vector);
            TermsEnum termsEnum = vector.Iterator(null);

            Assert.IsNotNull(termsEnum.Next());
            Assert.AreEqual("", termsEnum.Term().Utf8ToString());

            // Token "" occurred once
            Assert.AreEqual(1, termsEnum.TotalTermFreq());

            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);

            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            dpEnum.NextPosition();
            Assert.AreEqual(8, dpEnum.StartOffset());
            Assert.AreEqual(8, dpEnum.EndOffset());
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

            // Token "abcd" occurred three times
            Assert.AreEqual(new BytesRef("abcd"), termsEnum.Next());
            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
            Assert.AreEqual(3, termsEnum.TotalTermFreq());

            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            dpEnum.NextPosition();
            Assert.AreEqual(0, dpEnum.StartOffset());
            Assert.AreEqual(4, dpEnum.EndOffset());

            dpEnum.NextPosition();
            Assert.AreEqual(4, dpEnum.StartOffset());
            Assert.AreEqual(8, dpEnum.EndOffset());

            dpEnum.NextPosition();
            Assert.AreEqual(8, dpEnum.StartOffset());
            Assert.AreEqual(12, dpEnum.EndOffset());

            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
            Assert.IsNull(termsEnum.Next());
            r.Dispose();
            dir.Dispose();
        }
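
The per-position assertions above follow the generic recipe for dumping a term vector: iterate the terms, fetch a DocsAndPositionsEnum, advance to the (single) document, then read Freq() positions along with their offsets. A condensed sketch under the same assumptions, where `vector` is the Terms instance for one document's field:

            TermsEnum te = vector.Iterator(null);
            BytesRef t;
            while ((t = te.Next()) != null)
            {
                DocsAndPositionsEnum dp = te.DocsAndPositions(null, null);
                if (dp != null && dp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    for (int i = 0; i < dp.Freq(); i++)
                    {
                        int pos = dp.NextPosition();
                        Console.WriteLine(t.Utf8ToString() + " pos=" + pos + " offsets=[" + dp.StartOffset() + "," + dp.EndOffset() + ")");
                    }
                }
            }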
Example #30
        public virtual void TestArbitraryFields()
        {
            Directory         dir = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(Random(), dir);

            int NUM_DOCS = AtLeast(27);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: " + NUM_DOCS + " docs");
            }
            int[] fieldsPerDoc = new int[NUM_DOCS];
            int   baseCount    = 0;

            for (int docCount = 0; docCount < NUM_DOCS; docCount++)
            {
                int fieldCount = TestUtil.NextInt(Random(), 1, 17);
                fieldsPerDoc[docCount] = fieldCount - 1;

                int finalDocCount = docCount;
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: " + fieldCount + " fields in doc " + docCount);
                }

                int finalBaseCount = baseCount;
                baseCount += fieldCount - 1;

                w.AddDocument(new IterableAnonymousInnerClassHelper(this, fieldCount, finalDocCount, finalBaseCount));
            }

            IndexReader r = w.Reader;

            w.Dispose();

            IndexSearcher s       = NewSearcher(r);
            int           counter = 0;

            for (int id = 0; id < NUM_DOCS; id++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: verify doc id=" + id + " (" + fieldsPerDoc[id] + " fields) counter=" + counter);
                }
                TopDocs hits = s.Search(new TermQuery(new Term("id", "" + id)), 1);
                Assert.AreEqual(1, hits.TotalHits);
                int      docID      = hits.ScoreDocs[0].Doc;
                Document doc        = s.Doc(docID);
                int      endCounter = counter + fieldsPerDoc[id];
                while (counter < endCounter)
                {
                    string name    = "f" + counter;
                    int    fieldID = counter % 10;

                    bool stored  = (counter & 1) == 0 || fieldID == 3;
                    bool binary  = fieldID == 3;
                    bool indexed = fieldID != 3;

                    string stringValue;
                    if (fieldID != 3 && fieldID != 9)
                    {
                        stringValue = "text " + counter;
                    }
                    else
                    {
                        stringValue = null;
                    }

                    // stored:
                    if (stored)
                    {
                        IndexableField f = doc.GetField(name);
                        Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                        if (binary)
                        {
                            Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                            BytesRef b = f.BinaryValue();
                            Assert.IsNotNull(b);
                            Assert.AreEqual(10, b.Length);
                            for (int idx = 0; idx < 10; idx++)
                            {
                                Assert.AreEqual((byte)(idx + counter), b.Bytes[b.Offset + idx]);
                            }
                        }
                        else
                        {
                            Debug.Assert(stringValue != null);
                            Assert.AreEqual(stringValue, f.StringValue);
                        }
                    }

                    if (indexed)
                    {
                        bool tv = counter % 2 == 1 && fieldID != 9;
                        if (tv)
                        {
                            Terms tfv = r.GetTermVectors(docID).Terms(name);
                            Assert.IsNotNull(tfv);
                            TermsEnum termsEnum = tfv.Iterator(null);
                            Assert.AreEqual(new BytesRef("" + counter), termsEnum.Next());
                            Assert.AreEqual(1, termsEnum.TotalTermFreq());
                            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            Assert.AreEqual(1, dpEnum.Freq());
                            Assert.AreEqual(1, dpEnum.NextPosition());

                            Assert.AreEqual(new BytesRef("text"), termsEnum.Next());
                            Assert.AreEqual(1, termsEnum.TotalTermFreq());
                            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            Assert.AreEqual(1, dpEnum.Freq());
                            Assert.AreEqual(0, dpEnum.NextPosition());

                            Assert.IsNull(termsEnum.Next());

                            // TODO: offsets
                        }
                        else
                        {
                            Fields vectors = r.GetTermVectors(docID);
                            Assert.IsTrue(vectors == null || vectors.Terms(name) == null);
                        }

                        BooleanQuery bq = new BooleanQuery();
                        bq.Add(new TermQuery(new Term("id", "" + id)), BooleanClause.Occur.MUST);
                        bq.Add(new TermQuery(new Term(name, "text")), BooleanClause.Occur.MUST);
                        TopDocs hits2 = s.Search(bq, 1);
                        Assert.AreEqual(1, hits2.TotalHits);
                        Assert.AreEqual(docID, hits2.ScoreDocs[0].Doc);

                        bq = new BooleanQuery();
                        bq.Add(new TermQuery(new Term("id", "" + id)), BooleanClause.Occur.MUST);
                        bq.Add(new TermQuery(new Term(name, "" + counter)), BooleanClause.Occur.MUST);
                        TopDocs hits3 = s.Search(bq, 1);
                        Assert.AreEqual(1, hits3.TotalHits);
                        Assert.AreEqual(docID, hits3.ScoreDocs[0].Doc);
                    }

                    counter++;
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Example #31
        public virtual void Test()
        {
            IList<string> postingsList  = new List<string>();
            int           numTerms       = AtLeast(300);
            int           maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20);

            bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"));

            IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

            if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1))
            {
                // Otherwise test can take way too long (> 2 hours)
                numTerms /= 2;
            }

            if (VERBOSE)
            {
                Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
                Console.WriteLine("numTerms=" + numTerms);
            }

            for (int i = 0; i < numTerms; i++)
            {
                string term = Convert.ToString(i);
                for (int j = 0; j < i; j++)
                {
                    postingsList.Add(term);
                }
            }
            postingsList = CollectionsHelper.Shuffle(postingsList);

            ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

            Directory         dir = NewFSDirectory(CreateTempDir("bagofpostings"));
            RandomIndexWriter iw  = new RandomIndexWriter(Random(), dir, iwc);

            int threadCount = TestUtil.NextInt(Random(), 1, 5);

            if (VERBOSE)
            {
                Console.WriteLine("config: " + iw.w.Config);
                Console.WriteLine("threadCount=" + threadCount);
            }

            ThreadClass[]  threads     = new ThreadClass[threadCount];
            CountdownEvent startingGun = new CountdownEvent(1);

            for (int threadID = 0; threadID < threadCount; threadID++)
            {
                threads[threadID] = new ThreadAnonymousInnerClassHelper(this, maxTermsPerDoc, postings, iw, startingGun);
                threads[threadID].Start();
            }
            startingGun.Signal();
            foreach (ThreadClass t in threads)
            {
                t.Join();
            }

            iw.ForceMerge(1);
            DirectoryReader ir = iw.Reader;

            Assert.AreEqual(1, ir.Leaves.Count);
            AtomicReader air   = (AtomicReader)ir.Leaves[0].Reader;
            Terms        terms = air.Terms("field");

            // numTerms-1 because there cannot be a term 0 with 0 postings:
            Assert.AreEqual(numTerms - 1, air.Fields.UniqueTermCount);
            if (!(iwc.Codec is Lucene3xCodec))
            {
                Assert.AreEqual(numTerms - 1, terms.Size());
            }
            TermsEnum termsEnum = terms.Iterator(null);
            BytesRef  term_;

            while ((term_ = termsEnum.Next()) != null)
            {
                int value = Convert.ToInt32(term_.Utf8ToString());
                Assert.AreEqual(value, termsEnum.DocFreq());
                // don't really need to check more than this, as CheckIndex
                // will verify that docFreq == actual number of documents seen
                // from a docsAndPositionsEnum.
            }
            ir.Dispose();
            iw.Dispose();
            dir.Dispose();
        }
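
The CountdownEvent here acts as a starting gun so all indexing threads begin consuming the shared postings queue at the same moment; each thread is expected to Wait() on the event before doing any work. A minimal sketch of that coordination using plain System.Threading types (the work body is a placeholder):

            var startingGun = new CountdownEvent(1);
            var workers = new Thread[threadCount];
            for (int i = 0; i < workers.Length; i++)
            {
                workers[i] = new Thread(() =>
                {
                    startingGun.Wait();   // block until the gun fires
                    // ... dequeue postings and index documents here ...
                });
                workers[i].Start();
            }
            startingGun.Signal();         // fires once: releases every waiting thread
            foreach (Thread w in workers) w.Join();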
Example #32
        // DocValues updates
        private void ApplyDocValuesUpdates<T1>(IEnumerable<T1> updates, ReadersAndUpdates rld, SegmentReader reader, DocValuesFieldUpdates.Container dvUpdatesContainer) where T1 : DocValuesUpdate
        {
            lock (this)
            {
                Fields fields = reader.Fields;
                if (fields == null)
                {
                    // this reader has no postings
                    return;
                }

                // TODO: we can process the updates per DV field, from last to first so that
                // if multiple terms affect same document for the same field, we add an update
                // only once (that of the last term). To do that, we can keep a bitset which
                // marks which documents have already been updated. So e.g. if term T1
                // updates doc 7, and then we process term T2 and it updates doc 7 as well,
                // we don't apply the update since we know T1 came last and therefore wins
                // the update.
                // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
                // that these documents aren't even returned.

                string    currentField = null;
                TermsEnum termsEnum    = null;
                DocsEnum  docs         = null;

                //System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
                foreach (DocValuesUpdate update in updates)
                {
                    Term term  = update.Term;
                    int  limit = update.DocIDUpto;

                    // TODO: we traverse the terms in update order (not term order) so that we
                    // apply the updates in the correct order, i.e. if two terms update the
                    // same document, the last one that came in wins, irrespective of the
                    // terms lexical order.
                    // we can apply the updates in terms order if we keep an updatesGen (and
                    // increment it with every update) and attach it to each NumericUpdate. Note
                    // that we cannot rely only on docIDUpto because an app may send two updates
                    // which will get same docIDUpto, yet will still need to respect the order
                    // those updates arrived.

                    if (!term.Field().Equals(currentField))
                    {
                        // if we change the code to process updates in terms order, enable this assert
                        //        assert currentField == null || currentField.compareTo(term.field()) < 0;
                        currentField = term.Field();
                        Terms terms = fields.Terms(currentField);
                        if (terms != null)
                        {
                            termsEnum = terms.Iterator(termsEnum);
                        }
                        else
                        {
                            termsEnum = null;
                            continue; // no terms in that field
                        }
                    }

                    if (termsEnum == null)
                    {
                        continue;
                    }
                    // System.out.println("  term=" + term);

                    if (termsEnum.SeekExact(term.Bytes()))
                    {
                        // we don't need term frequencies for this
                        DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsEnum.FLAG_NONE);

                        //System.out.println("BDS: got docsEnum=" + docsEnum);

                        DocValuesFieldUpdates dvUpdates = dvUpdatesContainer.GetUpdates(update.Field, update.Type);
                        if (dvUpdates == null)
                        {
                            dvUpdates = dvUpdatesContainer.NewUpdates(update.Field, update.Type, reader.MaxDoc);
                        }
                        int doc;
                        while ((doc = docsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            //System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term + " doc=" + docID);
                            if (doc >= limit)
                            {
                                break; // no more docs that can be updated for this term
                            }
                            dvUpdates.Add(doc, update.Value);
                        }
                    }
                }
            }
        }
Example #33
        // Delete by Term
        private long ApplyTermDeletes(IEnumerable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader)
        {
            lock (this)
            {
                long   delCount = 0;
                Fields fields   = reader.Fields;
                if (fields == null)
                {
                    // this reader has no postings
                    return(0);
                }

                TermsEnum termsEnum = null;

                string   currentField = null;
                DocsEnum docs         = null;

                Debug.Assert(CheckDeleteTerm(null));

                bool any = false;

                //System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
                foreach (Term term in termsIter)
                {
                    // Since we visit terms sorted, we gain performance
                    // by re-using the same TermsEnum and seeking only
                    // forwards
                    if (!term.Field().Equals(currentField))
                    {
                        Debug.Assert(currentField == null || currentField.CompareTo(term.Field()) < 0);
                        currentField = term.Field();
                        Terms terms = fields.Terms(currentField);
                        if (terms != null)
                        {
                            termsEnum = terms.Iterator(termsEnum);
                        }
                        else
                        {
                            termsEnum = null;
                        }
                    }

                    if (termsEnum == null)
                    {
                        continue;
                    }
                    Debug.Assert(CheckDeleteTerm(term));

                    // System.out.println("  term=" + term);

                    if (termsEnum.SeekExact(term.Bytes()))
                    {
                        // we don't need term frequencies for this
                        DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsEnum.FLAG_NONE);
                        //System.out.println("BDS: got docsEnum=" + docsEnum);

                        if (docsEnum != null)
                        {
                            while (true)
                            {
                                int docID = docsEnum.NextDoc();
                                //System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + docID);
                                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    break;
                                }
                                if (!any)
                                {
                                    rld.InitWritableLiveDocs();
                                    any = true;
                                }
                                // NOTE: there is no limit check on the docID
                                // when deleting by Term (unlike by Query)
                                // because on flush we apply all Term deletes to
                                // each segment.  So all Term deleting here is
                                // against prior segments:
                                if (rld.Delete(docID))
                                {
                                    delCount++;
                                }
                            }
                        }
                    }
                }

                return(delCount);
            }
        }
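
The last two methods share the same enum-reuse idiom: because the incoming terms arrive grouped by field, one TermsEnum is kept alive and re-acquired only when the field changes, and the old instance is passed back into Iterator() so the codec may reuse its internal state. Stripped of the update/delete bookkeeping, the idiom looks like this (`termsSortedByField` is a placeholder source of terms):

                string    currentField = null;
                TermsEnum termsEnum    = null;
                foreach (Term t in termsSortedByField)
                {
                    if (!t.Field().Equals(currentField))
                    {
                        currentField = t.Field();
                        Terms fieldTerms = fields.Terms(currentField);
                        // hand the old enum back so the codec may reuse it
                        termsEnum = fieldTerms == null ? null : fieldTerms.Iterator(termsEnum);
                    }
                    if (termsEnum != null && termsEnum.SeekExact(t.Bytes()))
                    {
                        // postings for t are now positioned; apply the update or delete here
                    }
                }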
Example #34
 public TermsEnum TermsEnum(Terms terms)
 {
     return terms.Iterator(null);
 }
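
This hook simply hands back the raw iterator. Example #40 at the end of this listing wraps the same call in NumericUtils.FilterPrefixCodedLongs, because numeric fields index each value at several precisions and only the full-precision terms should be visited. A small sketch of consuming such a filtered enum, assuming `terms` comes from a field indexed as a long:

     TermsEnum longsOnly = NumericUtils.FilterPrefixCodedLongs(terms.Iterator(null));
     BytesRef b;
     while ((b = longsOnly.Next()) != null)
     {
         long value = NumericUtils.PrefixCodedToLong(b); // decode the full-precision value
     }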
Example #35
        public virtual void TestFixedPostings()
        {
            const int NUM_TERMS = 100;

            TermData[] terms = new TermData[NUM_TERMS];
            for (int i = 0; i < NUM_TERMS; i++)
            {
                int[]  docs = new int[] { i };
                string text = Convert.ToString(i);
                terms[i] = new TermData(this, text, docs, null);
            }

            FieldInfos.Builder builder = new FieldInfos.Builder();

            FieldData field = new FieldData(this, "field", builder, terms, true, false);

            FieldData[] fields     = new FieldData[] { field };
            FieldInfos  fieldInfos = builder.Finish();
            Directory   dir        = NewDirectory();

            this.Write(fieldInfos, dir, fields, true);
            Codec       codec = Codec.Default;
            SegmentInfo si    = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);

            FieldsProducer reader = codec.PostingsFormat().FieldsProducer(new SegmentReadState(dir, si, fieldInfos, NewIOContext(Random()), DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR));

            IEnumerator<string> fieldsEnum = reader.GetEnumerator();

            fieldsEnum.MoveNext();
            string fieldName = fieldsEnum.Current;

            Assert.IsNotNull(fieldName);
            Terms terms2 = reader.Terms(fieldName);

            Assert.IsNotNull(terms2);

            TermsEnum termsEnum = terms2.Iterator(null);

            DocsEnum docsEnum = null;

            for (int i = 0; i < NUM_TERMS; i++)
            {
                BytesRef term = termsEnum.Next();
                Assert.IsNotNull(term);
                Assert.AreEqual(terms[i].Text2, term.Utf8ToString());

                // do this twice to stress test the codec's reuse, ie,
                // make sure it properly fully resets (rewinds) its
                // internal state:
                for (int iter = 0; iter < 2; iter++)
                {
                    docsEnum = TestUtil.Docs(Random(), termsEnum, null, docsEnum, DocsEnum.FLAG_NONE);
                    Assert.AreEqual(terms[i].Docs[0], docsEnum.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }
            }
            Assert.IsNull(termsEnum.Next());

            for (int i = 0; i < NUM_TERMS; i++)
            {
                Assert.AreEqual(termsEnum.SeekCeil(new BytesRef(terms[i].Text2)), TermsEnum.SeekStatus.FOUND);
            }

            Assert.IsFalse(fieldsEnum.MoveNext());
            reader.Dispose();
            dir.Dispose();
        }
        // following code is almost an exact dup of code from TestDuelingCodecs: sorry!

Example #36

        public virtual void AssertTerms(Terms leftTerms, Terms rightTerms, bool deep)
        {
            if (leftTerms == null || rightTerms == null)
            {
                Assert.IsNull(leftTerms);
                Assert.IsNull(rightTerms);
                return;
            }
            AssertTermsStatistics(leftTerms, rightTerms);

            // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different

            TermsEnum leftTermsEnum = leftTerms.Iterator(null);
            TermsEnum rightTermsEnum = rightTerms.Iterator(null);
            AssertTermsEnum(leftTermsEnum, rightTermsEnum, true);

            AssertTermsSeeking(leftTerms, rightTerms);

            if (deep)
            {
                int numIntersections = AtLeast(3);
                for (int i = 0; i < numIntersections; i++)
                {
                    string re = AutomatonTestUtil.RandomRegexp(Random());
                    CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton());
                    if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL)
                    {
                        // TODO: test start term too
                        TermsEnum leftIntersection = leftTerms.Intersect(automaton, null);
                        TermsEnum rightIntersection = rightTerms.Intersect(automaton, null);
                        AssertTermsEnum(leftIntersection, rightIntersection, Rarely());
                    }
                }
            }
        }
Example #37
        private void AssertTermsSeekingEquals(string info, Terms leftTerms, Terms rightTerms)
        {
            TermsEnum leftEnum = null;
            TermsEnum rightEnum = null;

            // just an upper bound
            int numTests = AtLeast(20);
            Random random = Random();

            // collect this number of terms from the left side
            HashSet<BytesRef> tests = new HashSet<BytesRef>();
            int numPasses = 0;
            while (numPasses < 10 && tests.Count < numTests)
            {
                leftEnum = leftTerms.Iterator(leftEnum);
                BytesRef term = null;
                while ((term = leftEnum.Next()) != null)
                {
                    int code = random.Next(10);
                    if (code == 0)
                    {
                        // the term
                        tests.Add(BytesRef.DeepCopyOf(term));
                    }
                    else if (code == 1)
                    {
                        // truncated subsequence of term
                        term = BytesRef.DeepCopyOf(term);
                        if (term.Length > 0)
                        {
                            // truncate it
                            term.Length = random.Next(term.Length);
                        }
                        tests.Add(term); // seek target: a truncated prefix of a real term
                    }
                    else if (code == 2)
                    {
                        // term, but ensure a non-zero offset
                        var newbytes = new byte[term.Length + 5];
                        Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length);
                        tests.Add(new BytesRef(newbytes, 5, term.Length));
                    }
                    else if (code == 3)
                    {
                        switch (Random().Next(3))
                        {
                            case 0:
                                tests.Add(new BytesRef()); // before the first term
                                break;

                            case 1:
                                tests.Add(new BytesRef(new byte[] { unchecked((byte)0xFF), unchecked((byte)0xFF) })); // past the last term
                                break;

                            case 2:
                                tests.Add(new BytesRef(TestUtil.RandomSimpleString(Random()))); // random term
                                break;

                            default:
                                throw new InvalidOperationException();
                        }
                    }
                }
                numPasses++;
            }

            rightEnum = rightTerms.Iterator(rightEnum);

            IList<BytesRef> shuffledTests = new List<BytesRef>(tests);
            shuffledTests = CollectionsHelper.Shuffle(shuffledTests);

            foreach (BytesRef b in shuffledTests)
            {
                if (Rarely())
                {
                    // reuse the enums
                    leftEnum = leftTerms.Iterator(leftEnum);
                    rightEnum = rightTerms.Iterator(rightEnum);
                }

                bool seekExact = Random().NextBoolean();

                if (seekExact)
                {
                    Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b), info);
                }
                else
                {
                    TermsEnum.SeekStatus leftStatus = leftEnum.SeekCeil(b);
                    TermsEnum.SeekStatus rightStatus = rightEnum.SeekCeil(b);
                    Assert.AreEqual(leftStatus, rightStatus, info);
                    if (leftStatus != TermsEnum.SeekStatus.END)
                    {
                        Assert.AreEqual(leftEnum.Term(), rightEnum.Term(), info);
                        AssertTermStatsEquals(info, leftEnum, rightEnum);
                    }
                }
            }
        }
Example #38

        private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms)
        {
            TermsEnum leftEnum = null;
            TermsEnum rightEnum = null;

            // just an upper bound
            int numTests = AtLeast(20);
            Random random = Random();

            // collect this number of terms from the left side
            HashSet<BytesRef> tests = new HashSet<BytesRef>();
            int numPasses = 0;
            while (numPasses < 10 && tests.Count < numTests)
            {
                leftEnum = leftTerms.Iterator(leftEnum);
                BytesRef term = null;
                while ((term = leftEnum.Next()) != null)
                {
                    int code = random.Next(10);
                    if (code == 0)
                    {
                        // the term
                        tests.Add(BytesRef.DeepCopyOf(term));
                    }
                    else if (code == 1)
                    {
                        // truncated subsequence of term
                        term = BytesRef.DeepCopyOf(term);
                        if (term.Length > 0)
                        {
                            // truncate it
                            term.Length = random.Next(term.Length);
                        }
                        tests.Add(term); // seek target: a truncated prefix of a real term
                    }
                    else if (code == 2)
                    {
                        // term, but ensure a non-zero offset
                        var newbytes = new byte[term.Length + 5];
                        Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length);
                        tests.Add(new BytesRef(newbytes, 5, term.Length));
                    }
                }
                numPasses++;
            }

            List<BytesRef> shuffledTests = new List<BytesRef>(tests);
            shuffledTests = (List<BytesRef>)CollectionsHelper.Shuffle(shuffledTests);

            foreach (BytesRef b in shuffledTests)
            {
                leftEnum = leftTerms.Iterator(leftEnum);
                rightEnum = rightTerms.Iterator(rightEnum);

                Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));
                Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));

                SeekStatus leftStatus;
                SeekStatus rightStatus;

                leftStatus = leftEnum.SeekCeil(b);
                rightStatus = rightEnum.SeekCeil(b);
                Assert.AreEqual(leftStatus, rightStatus);
                if (leftStatus != SeekStatus.END)
                {
                    Assert.AreEqual(leftEnum.Term(), rightEnum.Term());
                }

                leftStatus = leftEnum.SeekCeil(b);
                rightStatus = rightEnum.SeekCeil(b);
                Assert.AreEqual(leftStatus, rightStatus);
                if (leftStatus != SeekStatus.END)
                {
                    Assert.AreEqual(leftEnum.Term(), rightEnum.Term());
                }
            }
        }
Example #39

        protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
        {
            Assert.AreEqual(1, terms.DocCount);
            int termCount = new HashSet<string>(Arrays.AsList(tk.Terms)).Count;

            Assert.AreEqual(termCount, terms.Size());
            Assert.AreEqual(termCount, terms.SumDocFreq);
            Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
            Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
            Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());
            HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();

            foreach (string term in tk.Freqs.Keys)
            {
                uniqueTerms.Add(new BytesRef(term));
            }
            BytesRef[] sortedTerms = uniqueTerms.ToArray();
            Array.Sort(sortedTerms, terms.Comparator);
            TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);

            this.termsEnum.Value = termsEnum;
            for (int i = 0; i < sortedTerms.Length; ++i)
            {
                BytesRef nextTerm = termsEnum.Next();
                Assert.AreEqual(sortedTerms[i], nextTerm);
                Assert.AreEqual(sortedTerms[i], termsEnum.Term());
                Assert.AreEqual(1, termsEnum.DocFreq());

                FixedBitSet bits     = new FixedBitSet(1);
                DocsEnum    docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                bits.Set(0);

                docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
                Assert.IsNotNull(docsEnum);
                Assert.AreEqual(0, docsEnum.NextDoc());
                Assert.AreEqual(0, docsEnum.DocID());
                Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                this.docsEnum.Value = docsEnum;

                bits.Clear(0);
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (docsAndPositionsEnum != null)
                {
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                bits.Set(0);

                docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (terms.HasPositions() || terms.HasOffsets())
                {
                    Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
                    int freq = docsAndPositionsEnum.Freq();
                    Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
                    if (docsAndPositionsEnum != null)
                    {
                        for (int k = 0; k < freq; ++k)
                        {
                            int         position = docsAndPositionsEnum.NextPosition();
                            ISet<int?> indexes;
                            if (terms.HasPositions())
                            {
                                indexes = tk.PositionToTerms[position];
                                Assert.IsNotNull(indexes);
                            }
                            else
                            {
                                indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                                Assert.IsNotNull(indexes);
                            }
                            if (terms.HasPositions())
                            {
                                bool foundPosition = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                                    {
                                        foundPosition = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPosition);
                            }
                            if (terms.HasOffsets())
                            {
                                bool foundOffset = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                                    {
                                        foundOffset = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundOffset);
                            }
                            if (terms.HasPayloads())
                            {
                                bool foundPayload = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                                    {
                                        foundPayload = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPayload);
                            }
                        }
                        try
                        {
                            docsAndPositionsEnum.NextPosition();
                            Assert.Fail();
                        }
                        catch (Exception)
                        {
                            // ok: reading past the last position is expected to throw
                        }
                    }
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
            }
            Assert.IsNull(termsEnum.Next());
            for (int i = 0; i < 5; ++i)
            {
                if (Random().NextBoolean())
                {
                    Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
                else
                {
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
            }
        }
Example #40
 public TermsEnum TermsEnum(Terms terms)
 {
     return NumericUtils.FilterPrefixCodedLongs(terms.Iterator(null));
 }