Ejemplo n.º 1
0
        /// <summary>
        /// Buffers <paramref name="value"/> for <paramref name="docID"/>, padding any
        /// skipped documents with <c>MISSING</c> first.
        /// </summary>
        /// <param name="docID">Document the value belongs to; must not be lower than the number of values already buffered.</param>
        /// <param name="value">Value appended to the pending buffer.</param>
        /// <exception cref="System.ArgumentException">
        /// Thrown when <paramref name="docID"/> is lower than the current pending size.
        /// </exception>
        public virtual void AddValue(int docID, long value)
        {
            long buffered = Pending.Size();
            if (docID < buffered)
            {
                throw new System.ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" appears more than once in this document (only one value is allowed per field)");
            }

            // Documents between the last buffered one and docID carry no value:
            // pad them so the buffer position keeps matching the document number.
            int nextSlot = (int)buffered;
            while (nextSlot < docID)
            {
                Pending.Add(MISSING);
                nextSlot++;
            }

            Pending.Add(value);

            // The bit set is only maintained when it has been allocated.
            if (DocsWithField != null)
            {
                DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, docID);
                DocsWithField.Set(docID);
            }

            UpdateBytesUsed();
        }
Ejemplo n.º 2
0
		/// <summary>
		/// Checks that two equally sized bit sets stay consistent under
		/// <c>Equals</c> and <c>GetHashCode</c> while the same randomly chosen
		/// bits are set first in one and then in the other.
		/// </summary>
		public void testHashCodeEquals()
		{
			// This test can't handle numBits==0:
			int numBits = rnd.Next(2000) + 1;
			FixedBitSet b1 = new FixedBitSet(numBits);
			FixedBitSet b2 = new FixedBitSet(numBits);
			Assert.IsTrue(b1.Equals(b2));
			Assert.IsTrue(b2.Equals(b1));

			// Sample the iteration count once. Evaluating rnd.Next(500) inside the
			// loop condition would draw a new bound on every pass, so the loop
			// length would no longer follow the intended distribution.
			int iterations = 10 * rnd.Next(500);
			for (int iter = 0; iter < iterations; iter++)
			{
				int idx = rnd.Next(numBits);
				if (!b1.Get(idx))
				{
					// Sets differ by exactly one bit here.
					b1.Set(idx);
					Assert.IsFalse(b1.Equals(b2));
					Assert.AreNotEqual(b1.GetHashCode(), b2.GetHashCode());

					// Sets are identical again.
					b2.Set(idx);
					Assert.AreEqual(b1, b2);
					Assert.AreEqual(b1.GetHashCode(), b2.GetHashCode());
				}
			}
		}
 /// <summary>
 /// Appends every doc/value entry held by <paramref name="other"/> to this instance.
 /// </summary>
 /// <param name="other">Updates to append; cast to <c>BinaryDocValuesFieldUpdates</c>.</param>
 /// <exception cref="InvalidOperationException">
 /// Thrown when the combined number of entries would exceed <c>int.MaxValue</c>.
 /// </exception>
 public override void Merge(DocValuesFieldUpdates other)
 {
     BinaryDocValuesFieldUpdates otherUpdates = (BinaryDocValuesFieldUpdates)other;

     // Add in 64 bits: a 32-bit sum wraps around on overflow and can never
     // compare greater than int.MaxValue, which would make this guard dead code.
     long combinedSize = (long)Size + otherUpdates.Size;
     if (combinedSize > int.MaxValue)
     {
         throw new InvalidOperationException("cannot support more than Integer.MAX_VALUE doc/value entries; size=" + Size + " other.size=" + otherUpdates.Size);
     }
     int newSize = (int)combinedSize;

     Docs = Docs.Grow(newSize);
     Offsets = Offsets.Grow(newSize);
     Lengths = Lengths.Grow(newSize);
     DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, (int)Docs.Size());
     for (int i = 0; i < otherUpdates.Size; i++)
     {
         int doc = (int)otherUpdates.Docs.Get(i);
         if (otherUpdates.DocsWithField.Get(i))
         {
             DocsWithField.Set(Size);
         }
         Docs.Set(Size, doc);
         // The other instance's offsets are relative to its own Values buffer,
         // which is appended after ours below, so shift them by our current length.
         Offsets.Set(Size, Values.Length + otherUpdates.Offsets.Get(i));
         Lengths.Set(Size, otherUpdates.Lengths.Get(i));
         ++Size;
     }
     Values.Append(otherUpdates.Values);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the <see cref="DocIdSet"/> that results from combining the added
        /// filter clauses: SHOULD clauses are OR-ed together, MUST_NOT clauses are
        /// then removed from that result, and MUST clauses are finally intersected.
        /// </summary>
        /// <param name="context">Reader context the clause filters are evaluated against.</param>
        /// <param name="acceptDocs">Passed to <c>BitsFilteredDocIdSet.Wrap</c> together with the combined bits.</param>
        /// <returns>
        /// The wrapped combination, or <c>null</c> when SHOULD clauses exist but none
        /// produced an iterator, or when any MUST clause produced no iterator.
        /// </returns>
        public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
        {
            AtomicReader segmentReader = context.AtomicReader;
            FixedBitSet bits = null;
            bool sawShould = false;

            // Pass 1: union of all SHOULD clauses.
            foreach (FilterClause clause in clauses)
            {
                if (clause.Occur != BooleanClause.Occur.SHOULD)
                {
                    continue;
                }

                sawShould = true;
                DocIdSetIterator iterator = GetDISI(clause.Filter, context);
                if (iterator != null)
                {
                    if (bits == null)
                    {
                        bits = new FixedBitSet(segmentReader.MaxDoc);
                    }
                    bits.Or(iterator);
                }
            }

            // SHOULD clauses were present but none of them produced an iterator.
            if (sawShould && bits == null)
            {
                return null;
            }

            // Pass 2: subtract every MUST_NOT clause.
            foreach (FilterClause clause in clauses)
            {
                if (clause.Occur != BooleanClause.Occur.MUST_NOT)
                {
                    continue;
                }

                if (bits == null)
                {
                    // Nothing accumulated yet: start from "all documents".
                    Debug.Assert(!sawShould);
                    bits = new FixedBitSet(segmentReader.MaxDoc);
                    bits.Set(0, segmentReader.MaxDoc); // NOTE: may set bits on deleted docs
                }

                DocIdSetIterator iterator = GetDISI(clause.Filter, context);
                if (iterator != null)
                {
                    bits.AndNot(iterator);
                }
            }

            // Pass 3: intersect with every MUST clause.
            foreach (FilterClause clause in clauses)
            {
                if (clause.Occur != BooleanClause.Occur.MUST)
                {
                    continue;
                }

                DocIdSetIterator iterator = GetDISI(clause.Filter, context);
                if (iterator == null)
                {
                    return null; // no documents can match
                }

                if (bits != null)
                {
                    bits.And(iterator);
                }
                else
                {
                    // First contribution of any kind: seed the result with this clause.
                    bits = new FixedBitSet(segmentReader.MaxDoc);
                    bits.Or(iterator);
                }
            }

            return BitsFilteredDocIdSet.Wrap(bits, acceptDocs);
        }
Ejemplo n.º 5
0
            /// <summary>
            /// Reads this field's lines from a clone of the outer reader's input,
            /// starting at <c>_termsStart</c>, and builds the FST that maps each term to
            /// (docs start pointer, (docFreq, totalTermFreq)). While scanning it also
            /// accumulates <c>_sumDocFreq</c>, <c>_sumTotalTermFreq</c>, <c>_termCount</c>
            /// and <c>_docCount</c>.
            /// </summary>
            private void LoadTerms()
            {
                var posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner  = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                var outputs       = new PairOutputs <long?, PairOutputs <long?, long?> .Pair>(posIntOutputs, outputsInner);

                // FST output per term: Pair(docsStart, Pair(docFreq, totalTermFreq)).
                var b     = new Builder <PairOutputs <long?, PairOutputs <long?, long?> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput)_outerInstance._input.Clone();

                input.Seek(_termsStart);

                var  lastTerm      = new BytesRef(10);
                long lastDocsStart = -1; // -1 until the first TERM line has been seen
                int  docFreq       = 0;
                long totalTermFreq = 0;
                var  visitedDocs   = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        // End of this field: flush the term that was still being accumulated.
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                                                _scratchUtf16);
                        int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                                                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        // A new term starts: emit the previous one before resetting the counters.
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.GetFilePointer();
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        // Honor _scratch.Offset like the DOC and FREQ branches do, so the
                        // term bytes are read from the right position even if the scratch
                        // buffer does not start at index 0.
                        Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length    = len;
                        docFreq            = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq      = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst      = b.Finish();
            }
Ejemplo n.º 6
0
 /// <summary>
 /// Marks the document in <c>actualResult</c>, shifted by <c>_docBase</c>, and then
 /// forwards the unshifted id to <c>topScoreDocCollector</c>.
 /// </summary>
 /// <param name="doc">Document id as received by this collector.</param>
 public virtual void Collect(int doc)
 {
     var shiftedDoc = doc + _docBase;
     actualResult.Set(shiftedDoc);

     topScoreDocCollector.Collect(doc);
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Sets, in <paramref name="bits"/>, the bit of every document that the reader's
 /// term-docs enumeration returns for the current term text in <c>fieldName</c>.
 /// </summary>
 /// <param name="bits">Bit set that receives one set bit per enumerated document.</param>
 public void Docs(FixedBitSet bits)
 {
     // Release the enumeration once iteration completes or fails, instead of
     // leaving it open until finalization.
     using (var termDocs = reader.TermDocs(new Term(fieldName, Term().Text)))
     {
         while (termDocs.Next())
         {
             bits.Set(termDocs.Doc);
         }
     }
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Indexes documents that each contain one of two random terms, then randomly
        /// interleaves <c>NextDoc()</c> and <c>Advance()</c> calls on each term's postings
        /// and checks every returned doc id (and occasionally the frequency) against an
        /// in-memory bit set that records which term each document received.
        /// </summary>
        /// <param name="options">Index options applied to the indexed field. With DOCS_ONLY no frequencies are requested or checked.</param>
        public virtual void DoTestLongPostingsNoPositions(FieldInfo.IndexOptions options)
        {
            // Don't use TestUtil.getTempDir so that we own the
            // randomness (ie same seed will point to same dir):
            Directory dir = NewFSDirectory(CreateTempDir("longpostings" + "." + Random().NextLong()));

            int NUM_DOCS = AtLeast(2000);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS);
            }

            string s1 = GetRandomTerm(null);
            string s2 = GetRandomTerm(s1);

            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: s1=" + s1 + " s2=" + s2);
            }

            // Bit idx is set when document idx is indexed with s1, clear when with s2.
            FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
            for (int idx = 0; idx < NUM_DOCS; idx++)
            {
                if (Random().NextBoolean())
                {
                    isS1.Set(idx);
                }
            }

            // Always build a fresh index (the former "if (true) ... else" toggle left an
            // unreachable branch that reopened an existing directory).
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE).SetMergePolicy(NewLogMergePolicy());
            iwc.SetRAMBufferSizeMB(16.0 + 16.0 * Random().NextDouble());
            iwc.SetMaxBufferedDocs(-1);
            RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, iwc);

            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
            ft.IndexOptions = options;
            for (int idx = 0; idx < NUM_DOCS; idx++)
            {
                Document doc = new Document();
                string s = isS1.Get(idx) ? s1 : s2;
                Field f = NewField("field", s, ft);
                // The same field instance is added count times; the freq assertions
                // below rely on the bounds passed to NextInt here.
                int count = TestUtil.NextInt(Random(), 1, 4);
                for (int ct = 0; ct < count; ct++)
                {
                    doc.Add(f);
                }
                riw.AddDocument(doc);
            }

            IndexReader r = riw.Reader;
            riw.Dispose();

            Assert.AreEqual(NUM_DOCS, r.NumDocs);
            Assert.IsTrue(r.DocFreq(new Term("field", s1)) > 0);
            Assert.IsTrue(r.DocFreq(new Term("field", s2)) > 0);

            int num = AtLeast(1000);
            for (int iter = 0; iter < num; iter++)
            {
                string term;
                bool doS1;
                if (Random().NextBoolean())
                {
                    term = s1;
                    doS1 = true;
                }
                else
                {
                    term = s2;
                    doS1 = false;
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term);
                }

                DocsEnum docs;
                DocsEnum postings;

                if (options == FieldInfo.IndexOptions.DOCS_ONLY)
                {
                    // No frequencies indexed: postings stays null so freq checks are skipped.
                    docs = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_NONE);
                    postings = null;
                }
                else
                {
                    docs = postings = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_FREQS);
                    Debug.Assert(postings != null);
                }
                Debug.Assert(docs != null);

                int docID = -1;
                while (docID < DocIdSetIterator.NO_MORE_DOCS)
                {
                    int what = Random().Next(3);
                    if (what == 0)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do next()");
                        }
                        // nextDoc: the expected result is the first doc after docID that
                        // holds the chosen term, or int.MaxValue when none is left.
                        int expected = docID + 1;
                        while (true)
                        {
                            if (expected == NUM_DOCS)
                            {
                                expected = int.MaxValue;
                                break;
                            }
                            else if (isS1.Get(expected) == doS1)
                            {
                                break;
                            }
                            else
                            {
                                expected++;
                            }
                        }
                        docID = docs.NextDoc();
                        if (VERBOSE)
                        {
                            Console.WriteLine("  got docID=" + docID);
                        }
                        Assert.AreEqual(expected, docID);
                        if (docID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }

                        if (Random().Next(6) == 3 && postings != null)
                        {
                            int freq = postings.Freq();
                            Assert.IsTrue(freq >= 1 && freq <= 4, "got invalid freq=" + freq);
                        }
                    }
                    else
                    {
                        // advance: pick a target beyond the current position (or anywhere,
                        // including NUM_DOCS, when iteration has not started yet).
                        int targetDocID;
                        if (docID == -1)
                        {
                            targetDocID = Random().Next(NUM_DOCS + 1);
                        }
                        else
                        {
                            targetDocID = docID + TestUtil.NextInt(Random(), 1, NUM_DOCS - docID);
                        }
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
                        }
                        // The expected result is the first doc at or after the target that
                        // holds the chosen term, or int.MaxValue when none is left.
                        int expected = targetDocID;
                        while (true)
                        {
                            if (expected == NUM_DOCS)
                            {
                                expected = int.MaxValue;
                                break;
                            }
                            else if (isS1.Get(expected) == doS1)
                            {
                                break;
                            }
                            else
                            {
                                expected++;
                            }
                        }

                        docID = docs.Advance(targetDocID);
                        if (VERBOSE)
                        {
                            Console.WriteLine("  got docID=" + docID);
                        }
                        Assert.AreEqual(expected, docID);
                        if (docID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }

                        if (Random().Next(6) == 3 && postings != null)
                        {
                            int freq = postings.Freq();
                            Assert.IsTrue(freq >= 1 && freq <= 4, "got invalid freq=" + freq);
                        }
                    }
                }
            }
            r.Dispose();
            dir.Dispose();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Indexes documents that each contain one of two random terms, then randomly
        /// interleaves <c>NextDoc()</c> and <c>Advance()</c> calls on each term's
        /// positions enum. Every returned doc id is checked against an in-memory bit
        /// set, and occasionally the frequency and positions are verified and the
        /// payload is fetched.
        /// </summary>
        public virtual void TestLongPostings_Mem()
        {
            // Don't use TestUtil.getTempDir so that we own the
            // randomness (ie same seed will point to same dir):
            Directory dir = NewFSDirectory(CreateTempDir("longpostings" + "." + Random.NextInt64()));

            int NUM_DOCS = AtLeast(2000);
            if (Verbose)
            {
                Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS);
            }

            string s1 = GetRandomTerm(null);
            string s2 = GetRandomTerm(s1);
            if (Verbose)
            {
                Console.WriteLine("\nTEST: s1=" + s1 + " s2=" + s2);
            }

            // Bit n is set when document n is indexed with s1, clear when with s2.
            FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
            for (int docIndex = 0; docIndex < NUM_DOCS; docIndex++)
            {
                if (Random.NextBoolean())
                {
                    isS1.Set(docIndex);
                }
            }

            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.CREATE).SetMergePolicy(NewLogMergePolicy());
            iwc.SetRAMBufferSizeMB(16.0 + 16.0 * Random.NextDouble());
            iwc.SetMaxBufferedDocs(-1);
            RandomIndexWriter riw = new RandomIndexWriter(Random, dir, iwc);

            for (int docIndex = 0; docIndex < NUM_DOCS; docIndex++)
            {
                Document doc = new Document();
                Field f = NewTextField("field", isS1.Get(docIndex) ? s1 : s2, Field.Store.NO);
                // The same field instance is added several times; the freq assertion
                // further down relies on the bounds passed to NextInt32 here.
                int repeats = TestUtil.NextInt32(Random, 1, 4);
                while (repeats-- > 0)
                {
                    doc.Add(f);
                }
                riw.AddDocument(doc);
            }

            IndexReader r = riw.GetReader();
            riw.Dispose();

            Assert.AreEqual(NUM_DOCS, r.NumDocs);
            Assert.IsTrue(r.DocFreq(new Term("field", s1)) > 0);
            Assert.IsTrue(r.DocFreq(new Term("field", s2)) > 0);

            int num = AtLeast(1000);
            for (int iter = 0; iter < num; iter++)
            {
                bool doS1 = Random.NextBoolean();
                string term = doS1 ? s1 : s2;

                if (Verbose)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " doS1=" + doS1);
                }

                DocsAndPositionsEnum postings = MultiFields.GetTermPositionsEnum(r, null, "field", new BytesRef(term));

                int docID = -1;
                while (docID < DocIdSetIterator.NO_MORE_DOCS)
                {
                    // One time in three step with NextDoc(), otherwise jump with Advance().
                    bool stepWithNextDoc = Random.Next(3) == 0;
                    int targetDocID = -1; // only used on the Advance() path
                    int expected;

                    if (stepWithNextDoc)
                    {
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do next()");
                        }
                        expected = docID + 1;
                    }
                    else
                    {
                        // Target lies beyond the current position (or anywhere, including
                        // NUM_DOCS, when iteration has not started yet).
                        targetDocID = docID == -1
                            ? Random.Next(NUM_DOCS + 1)
                            : docID + TestUtil.NextInt32(Random, 1, NUM_DOCS - docID);
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
                        }
                        expected = targetDocID;
                    }

                    // Move forward to the first document holding the chosen term;
                    // reaching NUM_DOCS means the enum must report exhaustion.
                    while (expected != NUM_DOCS && isS1.Get(expected) != doS1)
                    {
                        expected++;
                    }
                    if (expected == NUM_DOCS)
                    {
                        expected = int.MaxValue;
                    }

                    docID = stepWithNextDoc ? postings.NextDoc() : postings.Advance(targetDocID);
                    if (Verbose)
                    {
                        Console.WriteLine("  got docID=" + docID);
                    }
                    Assert.AreEqual(expected, docID);
                    if (docID == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }

                    // Occasionally verify frequency and positions, and fetch the payload.
                    if (Random.Next(6) != 3)
                    {
                        continue;
                    }

                    int freq = postings.Freq;
                    Assert.IsTrue(freq >= 1 && freq <= 4);
                    for (int pos = 0; pos < freq; pos++)
                    {
                        Assert.AreEqual(pos, postings.NextPosition());
                        if (Random.NextBoolean())
                        {
                            postings.GetPayload();
                            if (Random.NextBoolean())
                            {
                                postings.GetPayload(); // get it again
                            }
                        }
                    }
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Builds a <c>MergeState.DocMap</c> from a random live-docs bit set and checks
        /// its counts, that each live document maps to its index minus the number of
        /// preceding deleted documents, and that deleted documents map to -1.
        /// </summary>
        public virtual void TestBuildDocMap()
        {
            int maxDoc = TestUtil.NextInt(Random(), 1, 128);
            int numDocs = TestUtil.NextInt(Random(), 0, maxDoc);
            FixedBitSet liveDocs = new FixedBitSet(maxDoc);

            // Mark numDocs distinct documents as live: redraw until an unset bit is hit.
            for (int marked = 0; marked < numDocs; marked++)
            {
                int candidate;
                do
                {
                    candidate = Random().Next(maxDoc);
                }
                while (liveDocs.Get(candidate));
                liveDocs.Set(candidate);
            }

            MergeState.DocMap docMap = MergeState.DocMap.Build(maxDoc, liveDocs);

            Assert.AreEqual(maxDoc, docMap.MaxDoc);
            Assert.AreEqual(numDocs, docMap.NumDocs);
            Assert.AreEqual(maxDoc - numDocs, docMap.NumDeletedDocs);

            // assert the mapping is compact
            int deletedSoFar = 0;
            for (int docID = 0; docID < maxDoc; docID++)
            {
                if (liveDocs.Get(docID))
                {
                    Assert.AreEqual(docID - deletedSoFar, docMap.Get(docID));
                }
                else
                {
                    Assert.AreEqual(-1, docMap.Get(docID));
                    deletedSoFar++;
                }
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// checks Fields api is consistent with itself.
        /// searcher is optional, to verify with queries. Can be null.
        /// </summary>
        private static Status.TermIndexStatus CheckFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, bool doPrint, bool isVectors, StreamWriter infoStream, bool verbose)
        {
            // TODO: we should probably return our own stats thing...?!

            Status.TermIndexStatus status = new Status.TermIndexStatus();
            int computedFieldCount = 0;

            if (fields == null)
            {
                Msg(infoStream, "OK [no fields/terms]");
                return status;
            }

            DocsEnum docs = null;
            DocsEnum docsAndFreqs = null;
            DocsAndPositionsEnum postings = null;

            string lastField = null;
            foreach (string field in fields)
            {
                // MultiFieldsEnum relies upon this order...
                if (lastField != null && field.CompareTo(lastField) <= 0)
                {
                    throw new Exception("fields out of order: lastField=" + lastField + " field=" + field);
                }
                lastField = field;

                // check that the field is in fieldinfos, and is indexed.
                // TODO: add a separate test to check this for different reader impls
                FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
                if (fieldInfo == null)
                {
                    throw new Exception("fieldsEnum inconsistent with fieldInfos, no fieldInfos for: " + field);
                }
                if (!fieldInfo.Indexed)
                {
                    throw new Exception("fieldsEnum inconsistent with fieldInfos, isIndexed == false for: " + field);
                }

                // TODO: really the codec should not return a field
                // from FieldsEnum if it has no Terms... but we do
                // this today:
                // assert fields.terms(field) != null;
                computedFieldCount++;

                Terms terms = fields.Terms(field);
                if (terms == null)
                {
                    continue;
                }

                bool hasFreqs = terms.HasFreqs();
                bool hasPositions = terms.HasPositions();
                bool hasPayloads = terms.HasPayloads();
                bool hasOffsets = terms.HasOffsets();

                // term vectors cannot omit TF:
                bool expectedHasFreqs = (isVectors || fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS);

                if (hasFreqs != expectedHasFreqs)
                {
                    throw new Exception("field \"" + field + "\" should have hasFreqs=" + expectedHasFreqs + " but got " + hasFreqs);
                }

                if (hasFreqs == false)
                {
                    if (terms.SumTotalTermFreq != -1)
                    {
                        throw new Exception("field \"" + field + "\" hasFreqs is false, but Terms.getSumTotalTermFreq()=" + terms.SumTotalTermFreq + " (should be -1)");
                    }
                }

                if (!isVectors)
                {
                    bool expectedHasPositions = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
                    if (hasPositions != expectedHasPositions)
                    {
                        throw new Exception("field \"" + field + "\" should have hasPositions=" + expectedHasPositions + " but got " + hasPositions);
                    }

                    bool expectedHasPayloads = fieldInfo.HasPayloads();
                    if (hasPayloads != expectedHasPayloads)
                    {
                        throw new Exception("field \"" + field + "\" should have hasPayloads=" + expectedHasPayloads + " but got " + hasPayloads);
                    }

                    bool expectedHasOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    if (hasOffsets != expectedHasOffsets)
                    {
                        throw new Exception("field \"" + field + "\" should have hasOffsets=" + expectedHasOffsets + " but got " + hasOffsets);
                    }
                }

                TermsEnum termsEnum = terms.Iterator(null);

                bool hasOrd = true;
                long termCountStart = status.DelTermCount + status.TermCount;

                BytesRef lastTerm = null;

                IComparer<BytesRef> termComp = terms.Comparator;

                long sumTotalTermFreq = 0;
                long sumDocFreq = 0;
                FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
                while (true)
                {
                    BytesRef term = termsEnum.Next();
                    if (term == null)
                    {
                        break;
                    }

                    Debug.Assert(term.Valid);

                    // make sure terms arrive in order according to
                    // the comp
                    if (lastTerm == null)
                    {
                        lastTerm = BytesRef.DeepCopyOf(term);
                    }
                    else
                    {
                        if (termComp.Compare(lastTerm, term) >= 0)
                        {
                            throw new Exception("terms out of order: lastTerm=" + lastTerm + " term=" + term);
                        }
                        lastTerm.CopyBytes(term);
                    }

                    int docFreq = termsEnum.DocFreq();
                    if (docFreq <= 0)
                    {
                        throw new Exception("docfreq: " + docFreq + " is out of bounds");
                    }
                    sumDocFreq += docFreq;

                    docs = termsEnum.Docs(liveDocs, docs);
                    postings = termsEnum.DocsAndPositions(liveDocs, postings);

                    if (hasFreqs == false)
                    {
                        if (termsEnum.TotalTermFreq() != -1)
                        {
                            throw new Exception("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.TotalTermFreq() + " (should be -1)");
                        }
                    }

                    if (hasOrd)
                    {
                        long ord = -1;
                        try
                        {
                            ord = termsEnum.Ord();
                        }
                        catch (System.NotSupportedException uoe)
                        {
                            hasOrd = false;
                        }

                        if (hasOrd)
                        {
                            long ordExpected = status.DelTermCount + status.TermCount - termCountStart;
                            if (ord != ordExpected)
                            {
                                throw new Exception("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected);
                            }
                        }
                    }

                    DocsEnum docs2;
                    if (postings != null)
                    {
                        docs2 = postings;
                    }
                    else
                    {
                        docs2 = docs;
                    }

                    int lastDoc = -1;
                    int docCount = 0;
                    long totalTermFreq = 0;
                    while (true)
                    {
                        int doc = docs2.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        status.TotFreq++;
                        visitedDocs.Set(doc);
                        int freq = -1;
                        if (hasFreqs)
                        {
                            freq = docs2.Freq();
                            if (freq <= 0)
                            {
                                throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                            }
                            status.TotPos += freq;
                            totalTermFreq += freq;
                        }
                        else
                        {
                            // When a field didn't index freq, it must
                            // consistently "lie" and pretend that freq was
                            // 1:
                            if (docs2.Freq() != 1)
                            {
                                throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " != 1 when Terms.hasFreqs() is false");
                            }
                        }
                        docCount++;

                        if (doc <= lastDoc)
                        {
                            throw new Exception("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
                        }
                        if (doc >= maxDoc)
                        {
                            throw new Exception("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
                        }

                        lastDoc = doc;

                        int lastPos = -1;
                        int lastOffset = 0;
                        if (hasPositions)
                        {
                            for (int j = 0; j < freq; j++)
                            {
                                int pos = postings.NextPosition();

                                if (pos < 0)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                                }
                                if (pos < lastPos)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                                }
                                lastPos = pos;
                                BytesRef payload = postings.Payload;
                                if (payload != null)
                                {
                                    Debug.Assert(payload.Valid);
                                }
                                if (payload != null && payload.Length < 1)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.Length);
                                }
                                if (hasOffsets)
                                {
                                    int startOffset = postings.StartOffset();
                                    int endOffset = postings.EndOffset();
                                    // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                                    // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                                    if (!isVectors)
                                    {
                                        if (startOffset < 0)
                                        {
                                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                        }
                                        if (startOffset < lastOffset)
                                        {
                                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                        }
                                        if (endOffset < 0)
                                        {
                                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                        }
                                        if (endOffset < startOffset)
                                        {
                                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                        }
                                    }
                                    lastOffset = startOffset;
                                }
                            }
                        }
                    }

                    if (docCount != 0)
                    {
                        status.TermCount++;
                    }
                    else
                    {
                        status.DelTermCount++;
                    }

                    long totalTermFreq2 = termsEnum.TotalTermFreq();
                    bool hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1;

                    // Re-count if there are deleted docs:
                    if (liveDocs != null)
                    {
                        if (hasFreqs)
                        {
                            DocsEnum docsNoDel = termsEnum.Docs(null, docsAndFreqs);
                            docCount = 0;
                            totalTermFreq = 0;
                            while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                            {
                                visitedDocs.Set(docsNoDel.DocID());
                                docCount++;
                                totalTermFreq += docsNoDel.Freq();
                            }
                        }
                        else
                        {
                            DocsEnum docsNoDel = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                            docCount = 0;
                            totalTermFreq = -1;
                            while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                            {
                                visitedDocs.Set(docsNoDel.DocID());
                                docCount++;
                            }
                        }
                    }

                    if (docCount != docFreq)
                    {
                        throw new Exception("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
                    }
                    if (hasTotalTermFreq)
                    {
                        if (totalTermFreq2 <= 0)
                        {
                            throw new Exception("totalTermFreq: " + totalTermFreq2 + " is out of bounds");
                        }
                        sumTotalTermFreq += totalTermFreq;
                        if (totalTermFreq != totalTermFreq2)
                        {
                            throw new Exception("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
                        }
                    }

                    // Test skipping
                    if (hasPositions)
                    {
                        for (int idx = 0; idx < 7; idx++)
                        {
                            int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                            postings = termsEnum.DocsAndPositions(liveDocs, postings);
                            int docID = postings.Advance(skipDocID);
                            if (docID == DocIdSetIterator.NO_MORE_DOCS)
                            {
                                break;
                            }
                            else
                            {
                                if (docID < skipDocID)
                                {
                                    throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                                }
                                int freq = postings.Freq();
                                if (freq <= 0)
                                {
                                    throw new Exception("termFreq " + freq + " is out of bounds");
                                }
                                int lastPosition = -1;
                                int lastOffset = 0;
                                for (int posUpto = 0; posUpto < freq; posUpto++)
                                {
                                    int pos = postings.NextPosition();

                                    if (pos < 0)
                                    {
                                        throw new Exception("position " + pos + " is out of bounds");
                                    }
                                    if (pos < lastPosition)
                                    {
                                        throw new Exception("position " + pos + " is < lastPosition " + lastPosition);
                                    }
                                    lastPosition = pos;
                                    if (hasOffsets)
                                    {
                                        int startOffset = postings.StartOffset();
                                        int endOffset = postings.EndOffset();
                                        // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                                        // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                                        if (!isVectors)
                                        {
                                            if (startOffset < 0)
                                            {
                                                throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                            }
                                            if (startOffset < lastOffset)
                                            {
                                                throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                            }
                                            if (endOffset < 0)
                                            {
                                                throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                            }
                                            if (endOffset < startOffset)
                                            {
                                                throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                            }
                                        }
                                        lastOffset = startOffset;
                                    }
                                }

                                int nextDocID = postings.NextDoc();
                                if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    break;
                                }
                                if (nextDocID <= docID)
                                {
                                    throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                                }
                            }
                        }
                    }
                    else
                    {
                        for (int idx = 0; idx < 7; idx++)
                        {
                            int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                            docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                            int docID = docs.Advance(skipDocID);
                            if (docID == DocIdSetIterator.NO_MORE_DOCS)
                            {
                                break;
                            }
                            else
                            {
                                if (docID < skipDocID)
                                {
                                    throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                                }
                                int nextDocID = docs.NextDoc();
                                if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    break;
                                }
                                if (nextDocID <= docID)
                                {
                                    throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                                }
                            }
                        }
                    }
                }

                Terms fieldTerms = fields.Terms(field);
                if (fieldTerms == null)
                {
                    // Unusual: the FieldsEnum returned a field but
                    // the Terms for that field is null; this should
                    // only happen if it's a ghost field (field with
                    // no terms, eg there used to be terms but all
                    // docs got deleted and then merged away):
                }
                else
                {
                    if (fieldTerms is BlockTreeTermsReader.FieldReader)
                    {
                        BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader)fieldTerms).ComputeStats();
                        Debug.Assert(stats != null);
                        if (status.BlockTreeStats == null)
                        {
                            status.BlockTreeStats = new Dictionary<string, BlockTreeTermsReader.Stats>();
                        }
                        status.BlockTreeStats[field] = stats;
                    }

                    if (sumTotalTermFreq != 0)
                    {
                        long v = fields.Terms(field).SumTotalTermFreq;
                        if (v != -1 && sumTotalTermFreq != v)
                        {
                            throw new Exception("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
                        }
                    }

                    if (sumDocFreq != 0)
                    {
                        long v = fields.Terms(field).SumDocFreq;
                        if (v != -1 && sumDocFreq != v)
                        {
                            throw new Exception("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
                        }
                    }

                    if (fieldTerms != null)
                    {
                        int v = fieldTerms.DocCount;
                        if (v != -1 && visitedDocs.Cardinality() != v)
                        {
                            throw new Exception("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.Cardinality());
                        }
                    }

                    // Test seek to last term:
                    if (lastTerm != null)
                    {
                        if (termsEnum.SeekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND)
                        {
                            throw new Exception("seek to last term " + lastTerm + " failed");
                        }

                        int expectedDocFreq = termsEnum.DocFreq();
                        DocsEnum d = termsEnum.Docs(null, null, DocsEnum.FLAG_NONE);
                        int docFreq = 0;
                        while (d.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            docFreq++;
                        }
                        if (docFreq != expectedDocFreq)
                        {
                            throw new Exception("docFreq for last term " + lastTerm + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
                        }
                    }

                    // check unique term count
                    long termCount = -1;

                    if ((status.DelTermCount + status.TermCount) - termCountStart > 0)
                    {
                        termCount = fields.Terms(field).Size();

                        if (termCount != -1 && termCount != status.DelTermCount + status.TermCount - termCountStart)
                        {
                            throw new Exception("termCount mismatch " + (status.DelTermCount + termCount) + " vs " + (status.TermCount - termCountStart));
                        }
                    }

                    // Test seeking by ord
                    if (hasOrd && status.TermCount - termCountStart > 0)
                    {
                        int seekCount = (int)Math.Min(10000L, termCount);
                        if (seekCount > 0)
                        {
                            BytesRef[] seekTerms = new BytesRef[seekCount];

                            // Seek by ord
                            for (int i = seekCount - 1; i >= 0; i--)
                            {
                                long ord = i * (termCount / seekCount);
                                termsEnum.SeekExact(ord);
                                seekTerms[i] = BytesRef.DeepCopyOf(termsEnum.Term());
                            }

                            // Seek by term
                            long totDocCount = 0;
                            for (int i = seekCount - 1; i >= 0; i--)
                            {
                                if (termsEnum.SeekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND)
                                {
                                    throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                                }

                                docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                                if (docs == null)
                                {
                                    throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                                }

                                while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    totDocCount++;
                                }
                            }

                            long totDocCountNoDeletes = 0;
                            long totDocFreq = 0;
                            for (int i = 0; i < seekCount; i++)
                            {
                                if (!termsEnum.SeekExact(seekTerms[i]))
                                {
                                    throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                                }

                                totDocFreq += termsEnum.DocFreq();
                                docs = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                                if (docs == null)
                                {
                                    throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                                }

                                while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    totDocCountNoDeletes++;
                                }
                            }

                            if (totDocCount > totDocCountNoDeletes)
                            {
                                throw new Exception("more postings with deletes=" + totDocCount + " than without=" + totDocCountNoDeletes);
                            }

                            if (totDocCountNoDeletes != totDocFreq)
                            {
                                throw new Exception("docfreqs=" + totDocFreq + " != recomputed docfreqs=" + totDocCountNoDeletes);
                            }
                        }
                    }
                }
            }

            int fieldCount = fields.Size;

            if (fieldCount != -1)
            {
                if (fieldCount < 0)
                {
                    throw new Exception("invalid fieldCount: " + fieldCount);
                }
                if (fieldCount != computedFieldCount)
                {
                    throw new Exception("fieldCount mismatch " + fieldCount + " vs recomputed field count " + computedFieldCount);
                }
            }

            // for most implementations, this is boring (just the sum across all fields)
            // but codecs that don't work per-field like preflex actually implement this,
            // but don't implement it on Terms, so the check isn't redundant.
            long uniqueTermCountAllFields = fields.UniqueTermCount;

            if (uniqueTermCountAllFields != -1 && status.TermCount + status.DelTermCount != uniqueTermCountAllFields)
            {
                throw new Exception("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.TermCount + status.DelTermCount));
            }

            if (doPrint)
            {
                Msg(infoStream, "OK [" + status.TermCount + " terms; " + status.TotFreq + " terms/docs pairs; " + status.TotPos + " tokens]");
            }

            if (verbose && status.BlockTreeStats != null && infoStream != null && status.TermCount > 0)
            {
                foreach (KeyValuePair<string, BlockTreeTermsReader.Stats> ent in status.BlockTreeStats)
                {
                    infoStream.WriteLine("      field \"" + ent.Key + "\":");
                    infoStream.WriteLine("      " + ent.Value.ToString().Replace("\n", "\n      "));
                }
            }

            return status;
        }
Ejemplo n.º 12
0
 /// <summary>
 /// Verifies the internal consistency of the sorted doc values of one field.
 /// <para>
 /// After delegating to <c>CheckBinaryDocValues</c>, this walks every document
 /// in <c>[0, reader.MaxDoc)</c> and checks that each ordinal is either -1 (and the
 /// document is not flagged in <paramref name="docsWithField"/>) or lies within
 /// <c>[0, dv.ValueCount - 1]</c> (and the document is flagged). It then checks that
 /// the highest ordinal seen equals <c>dv.ValueCount - 1</c>, that the number of
 /// distinct ordinals seen equals <c>dv.ValueCount</c>, and that the values
 /// looked up by ascending ordinal are strictly increasing according to
 /// <c>BytesRef.CompareTo</c>.
 /// </para>
 /// </summary>
 /// <param name="fieldName">Field name, used only in failure messages and passed to <c>CheckBinaryDocValues</c>.</param>
 /// <param name="reader">Reader whose <c>MaxDoc</c> bounds the document loop.</param>
 /// <param name="dv">The sorted doc values being validated.</param>
 /// <param name="docsWithField">Bits consulted per document to decide whether a value is expected.</param>
 /// <exception cref="Exception">Thrown as soon as any of the checks above fails.</exception>
 private static void CheckSortedDocValues(string fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField)
 {
     // Run the binary-level checks first.
     CheckBinaryDocValues(fieldName, reader, dv, docsWithField);

     int declaredMaxOrd = dv.ValueCount - 1;
     FixedBitSet usedOrds = new FixedBitSet(dv.ValueCount);
     int highestOrdSeen = -1;

     for (int docID = 0; docID < reader.MaxDoc; docID++)
     {
         int ord = dv.GetOrd(docID);

         if (ord == -1)
         {
             // An ord of -1 is only acceptable when docsWithField does not flag the document.
             if (docsWithField.Get(docID))
             {
                 throw new Exception("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + docID);
             }
             continue;
         }

         if (ord < -1 || ord > declaredMaxOrd)
         {
             throw new Exception("ord out of bounds: " + ord);
         }

         // A real ord requires the document to be flagged in docsWithField.
         if (!docsWithField.Get(docID))
         {
             throw new Exception("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + docID);
         }

         if (ord > highestOrdSeen)
         {
             highestOrdSeen = ord;
         }
         usedOrds.Set(ord);
     }

     if (declaredMaxOrd != highestOrdSeen)
     {
         throw new Exception("dv for field: " + fieldName + " reports wrong maxOrd=" + declaredMaxOrd + " but this is not the case: " + highestOrdSeen);
     }

     // Every ordinal in [0, ValueCount) must have been used by at least one document.
     if (usedOrds.Cardinality() != dv.ValueCount)
     {
         throw new Exception("dv for field: " + fieldName + " has holes in its ords, valueCount=" + dv.ValueCount + " but only used: " + usedOrds.Cardinality());
     }

     // Values must be strictly increasing when visited in ordinal order.
     BytesRef previousValue = null;
     BytesRef currentValue = new BytesRef();
     int currentOrd = 0;
     while (currentOrd <= declaredMaxOrd)
     {
         dv.LookupOrd(currentOrd, currentValue);
         Debug.Assert(currentValue.Valid);
         if (previousValue != null && currentValue.CompareTo(previousValue) <= 0)
         {
             throw new Exception("dv for field: " + fieldName + " has ords out of order: " + previousValue + " >=" + currentValue);
         }
         // LookupOrd refills the same instance each pass, so keep an independent copy for the next comparison.
         previousValue = BytesRef.DeepCopyOf(currentValue);
         currentOrd++;
     }
 }
Ejemplo n.º 13
0
        protected virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
        {
            Assert.AreEqual(1, terms.DocCount);
            int termCount = new JCG.HashSet <string>(tk.terms).Count;

            Assert.AreEqual((long)termCount, terms.Count);      // LUCENENET specific - cast required because types don't match (xUnit checks this)
            Assert.AreEqual((long)termCount, terms.SumDocFreq); // LUCENENET specific - cast required because types don't match (xUnit checks this)
            Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions);
            Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets);
            Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads);
            ISet <BytesRef> uniqueTerms = new JCG.HashSet <BytesRef>();

            foreach (string term in tk.freqs.Keys)
            {
                uniqueTerms.Add(new BytesRef(term));
            }
            BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
            Array.Sort(sortedTerms, terms.Comparer);
            TermsEnum termsEnum = terms.GetEnumerator(Random.NextBoolean() ? null : this.termsEnum.Value);

            this.termsEnum.Value = termsEnum;
            for (int i = 0; i < sortedTerms.Length; ++i)
            {
                Assert.IsTrue(termsEnum.MoveNext());
                Assert.AreEqual(sortedTerms[i], termsEnum.Term);
                Assert.AreEqual(1, termsEnum.DocFreq);

                FixedBitSet bits     = new FixedBitSet(1);
                DocsEnum    docsEnum = termsEnum.Docs(bits, Random.NextBoolean() ? null : this.docsEnum.Value);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                bits.Set(0);

                docsEnum = termsEnum.Docs(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : docsEnum);
                Assert.IsNotNull(docsEnum);
                Assert.AreEqual(0, docsEnum.NextDoc());
                Assert.AreEqual(0, docsEnum.DocID);
                Assert.AreEqual(tk.freqs[termsEnum.Term.Utf8ToString()], docsEnum.Freq);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                this.docsEnum.Value = docsEnum;

                bits.Clear(0);
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random.NextBoolean() ? null : this.docsAndPositionsEnum.Value);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (docsAndPositionsEnum != null)
                {
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                bits.Set(0);

                docsAndPositionsEnum = termsEnum.DocsAndPositions(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : docsAndPositionsEnum);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (terms.HasPositions || terms.HasOffsets)
                {
                    Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
                    int freq = docsAndPositionsEnum.Freq;
                    Assert.AreEqual(tk.freqs[termsEnum.Term.Utf8ToString()], freq);
                    if (docsAndPositionsEnum != null)
                    {
                        for (int k = 0; k < freq; ++k)
                        {
                            int        position = docsAndPositionsEnum.NextPosition();
                            ISet <int> indexes;
                            if (terms.HasPositions)
                            {
                                indexes = tk.positionToTerms[position];
                                Assert.IsNotNull(indexes);
                            }
                            else
                            {
                                indexes = tk.startOffsetToTerms[docsAndPositionsEnum.StartOffset];
                                Assert.IsNotNull(indexes);
                            }
                            if (terms.HasPositions)
                            {
                                bool foundPosition = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.termBytes[index].Equals(termsEnum.Term) && tk.positions[index] == position)
                                    {
                                        foundPosition = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPosition);
                            }
                            if (terms.HasOffsets)
                            {
                                bool foundOffset = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.termBytes[index].Equals(termsEnum.Term) && tk.startOffsets[index] == docsAndPositionsEnum.StartOffset && tk.endOffsets[index] == docsAndPositionsEnum.EndOffset)
                                    {
                                        foundOffset = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundOffset);
                            }
                            if (terms.HasPayloads)
                            {
                                bool foundPayload = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.termBytes[index].Equals(termsEnum.Term) && Equals(tk.payloads[index], docsAndPositionsEnum.GetPayload()))
                                    {
                                        foundPayload = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPayload);
                            }
                        }
                        try
                        {
                            docsAndPositionsEnum.NextPosition();
                            Assert.Fail();
                        }
                        catch (Exception e) when(e.IsException())
                        {
                            // ok
                        }
                        catch (Exception e) when(e.IsAssertionError())
                        {
                            // ok
                        }
                    }
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
            }
            Assert.IsFalse(termsEnum.MoveNext());
            for (int i = 0; i < 5; ++i)
            {
                if (Random.NextBoolean())
                {
                    Assert.IsTrue(termsEnum.SeekExact(RandomPicks.RandomFrom(Random, tk.termBytes)));
                }
                else
                {
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomPicks.RandomFrom(Random, tk.termBytes)));
                }
            }
        }
Ejemplo n.º 14
0
		/// <summary>
		/// Creates a <see cref="FixedBitSet"/> sized to <paramref name="numBits"/> and
		/// sets the bit at every index contained in <paramref name="a"/>.
		/// </summary>
		/// <param name="a">Indexes of the bits to set.</param>
		/// <param name="numBits">Size passed to the <see cref="FixedBitSet"/> constructor.</param>
		/// <returns>The populated bit set.</returns>
		private FixedBitSet makeFixedBitSet(int[] a, int numBits)
		{
			var result = new FixedBitSet(numBits);
			for (int i = 0; i < a.Length; i++)
			{
				result.Set(a[i]);
			}
			return result;
		}
Ejemplo n.º 15
0
            /// <summary>
            /// Reads this field's postings text from a clone of the outer instance's input,
            /// starting at <c>termsStart</c>, and builds the term FST from it.
            /// <para/>
            /// Each term is mapped to a pair of (file pointer right after its TERM line,
            /// (docFreq, totalTermFreq)). While scanning, the method also accumulates
            /// <c>sumDocFreq</c>, <c>sumTotalTermFreq</c> and <c>termCount</c>, and finally sets
            /// <c>docCount</c> to the number of distinct doc IDs seen and <c>fst</c> to the
            /// finished builder output.
            /// </summary>
            internal virtual void LoadTerms()
            {
                PositiveIntOutputs posIntOutputs = PositiveIntOutputs.Singleton;
                PairOutputs <long?, long?> outputsInner = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                PairOutputs <long?, PairOutputs.Pair <long?, long?> > outputs =
                    new PairOutputs <long?, PairOutputs.Pair <long?, long?> >(posIntOutputs, outputsInner);

                // C# has no Java-style diamond operator ("new Builder<>(...)"), so the type
                // arguments must be spelled out on the constructor call.
                Builder <PairOutputs.Pair <long?, PairOutputs.Pair <long?, long?> > > b =
                    new Builder <PairOutputs.Pair <long?, PairOutputs.Pair <long?, long?> > >(FST.INPUT_TYPE.BYTE1, outputs);

                // Work on a clone so the seek below does not move the shared input.
                IndexInput @in = (IndexInput)outerInstance._input.Clone();

                @in.Seek(termsStart);

                BytesRef    lastTerm      = new BytesRef(10);
                long        lastDocsStart = -1; // -1 means "no term seen yet"
                int         docFreq       = 0;
                long        totalTermFreq = 0;
                FixedBitSet visitedDocs   = new FixedBitSet(maxDoc);

                IntsRef scratchIntsRef = new IntsRef();

                // NOTE(review): DOC/FREQ/TERM and scratchUTF16 are accessed through a lowercase
                // "length" member below while scratch uses "Length"; verify against the
                // declaring types.
                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(END) || StringHelper.StartsWith(scratch, FIELD))
                    {
                        // End of this field's section: flush the pending term, if any.
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + DOC.length, scratch.Length - DOC.length,
                                                scratchUTF16);
                        int docID = ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + FREQ.length,
                                                scratch.Length - FREQ.length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.length);
                    }
                    else if (StringHelper.StartsWith(scratch, TERM))
                    {
                        // A new term starts: emit the previous one before resetting the counters.
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.FilePointer;
                        int len = scratch.Length - TERM.length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        // NOTE(review): unlike the DOC/FREQ branches this copy does not add
                        // scratch.Offset to the source index; presumably the offset is always 0
                        // here - confirm.
                        Array.Copy(scratch.Bytes, TERM.length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality();
                fst      = b.Finish();
            }
        /// <summary>
        /// Writes the test postings held in <c>Fields</c> into <paramref name="dir"/> as a single
        /// segment named "_0" using the current codec's postings format, then opens and returns
        /// a <see cref="FieldsProducer"/> over what was written.
        /// </summary>
        /// <param name="dir">Directory the segment is written to and read back from.</param>
        /// <param name="maxAllowed">
        /// The "highest" index option we may index with; unless <paramref name="alwaysTestMax"/>
        /// is set, each field is still randomly indexed at this or a lower option.
        /// </param>
        /// <param name="allowPayloads">Whether payloads may be written for fields that index positions.</param>
        /// <param name="alwaysTestMax">When true, every field uses its maximum permitted index option instead of a random one.</param>
        /// <returns>A producer obtained from the codec's postings format for the new segment.</returns>
        private FieldsProducer BuildIndex(Directory dir, FieldInfo.IndexOptions maxAllowed, bool allowPayloads, bool alwaysTestMax)
        {
            Codec codec = Codec;
            SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", MaxDoc, false, codec, null);

            // Position of maxAllowed in the enum's declaration order; used as an upper bound below.
            int maxIndexOption = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(maxAllowed);
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: now build index");
            }

            // Upper bound applied to postings formats listed in DoesntSupportOffsets.
            int maxIndexOptionNoOffsets = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

            // TODO use allowPayloads

            // Build a fresh FieldInfo per field, carrying the index options chosen for this run.
            var newFieldInfoArray = new FieldInfo[Fields.Count];
            for (int fieldUpto = 0; fieldUpto < Fields.Count; fieldUpto++)
            {
                FieldInfo oldFieldInfo = FieldInfos.FieldInfo(fieldUpto);

                string pf = TestUtil.GetPostingsFormat(codec, oldFieldInfo.Name);
                int fieldMaxIndexOption;
                if (DoesntSupportOffsets.Contains(pf))
                {
                    fieldMaxIndexOption = Math.Min(maxIndexOptionNoOffsets, maxIndexOption);
                }
                else
                {
                    fieldMaxIndexOption = maxIndexOption;
                }

                // Randomly picked the IndexOptions to index this
                // field with:
                FieldInfo.IndexOptions indexOptions = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToArray()[alwaysTestMax ? fieldMaxIndexOption : Random().Next(1 + fieldMaxIndexOption)];
                bool doPayloads = indexOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;

                newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.Name, true, fieldUpto, false, false, doPayloads, indexOptions, null, DocValuesType.NUMERIC, null);
            }

            FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);

            // Estimate that flushed segment size will be 25% of
            // what we use in RAM:
            // NOTE(review): the 25% figure is not visible in the formula below - confirm the intent.
            long bytes = TotalPostings * 8 + TotalPayloadBytes;

            SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, 32, null, new IOContext(new FlushInfo(MaxDoc, bytes)));
            FieldsConsumer fieldsConsumer = codec.PostingsFormat().FieldsConsumer(writeState);

            // Write every field: one TermsConsumer per field, one PostingsConsumer per term.
            foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEnt in Fields)
            {
                string field = fieldEnt.Key;
                IDictionary<BytesRef, long> terms = fieldEnt.Value;

                FieldInfo fieldInfo = newFieldInfos.FieldInfo(field);

                FieldInfo.IndexOptions? indexOptions = fieldInfo.FieldIndexOptions;

                if (VERBOSE)
                {
                    Console.WriteLine("field=" + field + " indexOtions=" + indexOptions);
                }

                // Derive what has to be written from the options chosen for this field.
                bool doFreq = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
                bool doPos = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
                bool doPayloads = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && allowPayloads;
                bool doOffsets = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

                TermsConsumer termsConsumer = fieldsConsumer.AddField(fieldInfo);
                long sumTotalTF = 0;
                long sumDF = 0;
                // Tracks which doc IDs were written for this field; its cardinality is
                // reported to Finish() below.
                FixedBitSet seenDocs = new FixedBitSet(MaxDoc);
                foreach (KeyValuePair<BytesRef, long> termEnt in terms)
                {
                    BytesRef term = termEnt.Key;
                    // The dictionary value is passed to GetSeedPostings (logged as "seed" below).
                    SeedPostings postings = GetSeedPostings(term.Utf8ToString(), termEnt.Value, false, maxAllowed);
                    if (VERBOSE)
                    {
                        Console.WriteLine("  term=" + field + ":" + term.Utf8ToString() + " docFreq=" + postings.DocFreq + " seed=" + termEnt.Value);
                    }

                    PostingsConsumer postingsConsumer = termsConsumer.StartTerm(term);
                    long totalTF = 0;
                    int docID = 0;
                    while ((docID = postings.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                    {
                        int freq = postings.Freq();
                        if (VERBOSE)
                        {
                            Console.WriteLine("    " + postings.Upto + ": docID=" + docID + " freq=" + postings.Freq_Renamed);
                        }
                        // A frequency of -1 is passed when the field does not index freqs.
                        postingsConsumer.StartDoc(docID, doFreq ? postings.Freq_Renamed : -1);
                        seenDocs.Set(docID);
                        if (doPos)
                        {
                            totalTF += postings.Freq_Renamed;
                            for (int posUpto = 0; posUpto < freq; posUpto++)
                            {
                                int pos = postings.NextPosition();
                                BytesRef payload = postings.Payload;

                                if (VERBOSE)
                                {
                                    if (doPayloads)
                                    {
                                        Console.WriteLine("      pos=" + pos + " payload=" + (payload == null ? "null" : payload.Length + " bytes"));
                                    }
                                    else
                                    {
                                        Console.WriteLine("      pos=" + pos);
                                    }
                                }
                                // Payload and offsets are suppressed (null / -1) when not indexed.
                                postingsConsumer.AddPosition(pos, doPayloads ? payload : null, doOffsets ? postings.StartOffset() : -1, doOffsets ? postings.EndOffset() : -1);
                            }
                        }
                        else if (doFreq)
                        {
                            totalTF += freq;
                        }
                        else
                        {
                            // Docs-only: count one occurrence per document.
                            totalTF++;
                        }
                        postingsConsumer.FinishDoc();
                    }
                    termsConsumer.FinishTerm(term, new TermStats(postings.DocFreq, doFreq ? totalTF : -1));
                    sumTotalTF += totalTF;
                    sumDF += postings.DocFreq;
                }

                termsConsumer.Finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.Cardinality());
            }

            fieldsConsumer.Dispose();

            if (VERBOSE)
            {
                Console.WriteLine("TEST: after indexing: files=");
                foreach (string file in dir.ListAll())
                {
                    Console.WriteLine("  " + file + ": " + dir.FileLength(file) + " bytes");
                }
            }

            // Remember the field infos that were actually used to write the segment.
            CurrentFieldInfos = newFieldInfos;

            SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ, 1);

            return codec.PostingsFormat().FieldsProducer(readState);
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Sets the bit at <c>baseDoc + i</c> in the collector's bit set.
 /// </summary>
 /// <param name="i">Value added to <c>baseDoc</c> to form the bit index.</param>
 public virtual void Collect(int i)
 {
     var bitIndex = this.baseDoc + i;
     bitset.Set(bitIndex);
 }
 /// <summary>
 /// Creates a bit set of <paramref name="maxDoc"/> bits in which each bit is set
 /// whenever the next random double is less than or equal to <paramref name="pctLive"/>.
 /// </summary>
 /// <param name="maxDoc">Number of bits to allocate and visit.</param>
 /// <param name="pctLive">Threshold compared against <see cref="Random.NextDouble"/>.</param>
 /// <param name="random">Source of randomness; consulted exactly once per bit.</param>
 internal RandomBits(int maxDoc, double pctLive, Random random)
 {
     Bits = new FixedBitSet(maxDoc);

     int doc = 0;
     while (doc < maxDoc)
     {
         bool live = random.NextDouble() <= pctLive;
         if (live)
         {
             Bits.Set(doc);
         }
         doc++;
     }
 }
Ejemplo n.º 19
0
 /// <summary>
 /// Records <c>doc + _docBase</c> in <c>ActualResult</c> and then forwards
 /// <paramref name="doc"/> to <c>TopScoreDocCollector</c>.
 /// </summary>
 /// <param name="doc">Document number as received by this collector.</param>
 public override void Collect(int doc)
 {
     var resultIndex = doc + _docBase;
     ActualResult.Set(resultIndex);

     TopScoreDocCollector.Collect(doc);
 }
Ejemplo n.º 20
0
        /// <summary>
        /// Default merge implementation: replays every document produced by
        /// <paramref name="postings"/> into this consumer via StartDoc / AddPosition / FinishDoc
        /// and marks each document in <paramref name="visitedDocs"/>.
        /// <para/>
        /// How much is replayed per document depends on <paramref name="indexOptions"/>:
        /// docs only, docs + freqs, docs + freqs + positions (offsets passed as -1), or
        /// docs + freqs + positions + offsets. For the two positional modes
        /// <paramref name="postings"/> is cast to <see cref="DocsAndPositionsEnum"/>.
        /// <paramref name="mergeState"/> is not used by this implementation.
        /// </summary>
        /// <returns>
        /// The number of documents replayed and the summed frequencies
        /// (-1 for <c>DOCS_ONLY</c>).
        /// </returns>
        public virtual TermStats Merge(MergeState mergeState, IndexOptions indexOptions, DocsEnum postings, FixedBitSet visitedDocs)
        {
            int  docCount      = 0;
            long totalTermFreq = 0;

            if (indexOptions == IndexOptions.DOCS_ONLY)
            {
                int docId;
                while ((docId = postings.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docId);
                    this.StartDoc(docId, -1);
                    this.FinishDoc();
                    docCount++;
                }
                // No frequencies exist in this mode.
                totalTermFreq = -1;
            }
            else if (indexOptions == IndexOptions.DOCS_AND_FREQS)
            {
                int docId;
                while ((docId = postings.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docId);
                    int termFreq = postings.Freq;
                    this.StartDoc(docId, termFreq);
                    this.FinishDoc();
                    docCount++;
                    totalTermFreq += termFreq;
                }
            }
            else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
            {
                var positionsEnum = (DocsAndPositionsEnum)postings;
                int docId;
                while ((docId = positionsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docId);
                    int termFreq = positionsEnum.Freq;
                    this.StartDoc(docId, termFreq);
                    totalTermFreq += termFreq;
                    for (int occurrence = 0; occurrence < termFreq; occurrence++)
                    {
                        int      pos     = positionsEnum.NextPosition();
                        BytesRef payload = positionsEnum.GetPayload();
                        // Offsets are not indexed in this mode.
                        this.AddPosition(pos, payload, -1, -1);
                    }
                    this.FinishDoc();
                    docCount++;
                }
            }
            else
            {
                Debug.Assert(indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                var positionsEnum = (DocsAndPositionsEnum)postings;
                int docId;
                while ((docId = positionsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docId);
                    int termFreq = positionsEnum.Freq;
                    this.StartDoc(docId, termFreq);
                    totalTermFreq += termFreq;
                    for (int occurrence = 0; occurrence < termFreq; occurrence++)
                    {
                        int      pos     = positionsEnum.NextPosition();
                        BytesRef payload = positionsEnum.GetPayload();
                        this.AddPosition(pos, payload, positionsEnum.StartOffset, positionsEnum.EndOffset);
                    }
                    this.FinishDoc();
                    docCount++;
                }
            }

            bool docsOnly = indexOptions == IndexOptions.DOCS_ONLY;
            return new TermStats(docCount, docsOnly ? -1 : totalTermFreq);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Buffers <paramref name="value"/> as the value of document <paramref name="docID"/>.
        /// Documents skipped since the previous call are padded with zero-length entries,
        /// the value's bytes are appended to <c>BytesOut</c>, and <paramref name="docID"/>
        /// is flagged in <c>DocsWithField</c>.
        /// </summary>
        /// <param name="docID">Document the value belongs to; must not be lower than the number of values already added.</param>
        /// <param name="value">Bytes to store; must be non-null and no longer than <c>MAX_LENGTH</c>.</param>
        /// <exception cref="ArgumentException">
        /// The document already has a value, <paramref name="value"/> is null, or it exceeds <c>MAX_LENGTH</c>.
        /// </exception>
        public virtual void AddValue(int docID, BytesRef value)
        {
            // Validate in order: duplicate document, null value, oversized value.
            if (docID < AddedValues)
            {
                throw new ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" appears more than once in this document (only one value is allowed per field)");
            }
            else if (value == null)
            {
                throw new ArgumentException("field=\"" + FieldInfo.Name + "\": null value not allowed");
            }
            else if (value.Length > MAX_LENGTH)
            {
                throw new ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" is too large, must be <= " + MAX_LENGTH);
            }

            // Pad every document skipped since the last call with an empty entry.
            while (docID > AddedValues)
            {
                ++AddedValues;
                Lengths.Add(0);
            }

            // Record the entry for docID itself.
            ++AddedValues;
            Lengths.Add(value.Length);
            try
            {
                BytesOut.WriteBytes(value.Bytes, value.Offset, value.Length);
            }
            catch (System.IO.IOException e)
            {
                // Should never happen!
                throw new Exception(e.Message, e);
            }

            DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, docID);
            DocsWithField.Set(docID);

            UpdateBytesUsed();
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Asserts that <paramref name="terms"/> matches the tokens of <paramref name="tk"/>
        /// as indexed with field type <paramref name="ft"/>: term count, per-term document
        /// frequency and frequency, and - when stored - positions, offsets and payloads.
        /// The checks assume a single document with doc ID 0 (DocCount is asserted to be 1).
        /// </summary>
        /// <param name="tk">Token stream providing the expected terms, frequencies, positions, offsets and payloads.</param>
        /// <param name="ft">Field type whose StoreTermVector* flags determine what <paramref name="terms"/> must expose.</param>
        /// <param name="terms">The terms instance under test.</param>
        protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
        {
            Assert.AreEqual(1, terms.DocCount);
            int termCount = (new HashSet <string>(Arrays.AsList(tk.Terms))).Count;

            Assert.AreEqual(termCount, terms.Size());
            Assert.AreEqual(termCount, terms.SumDocFreq);
            Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
            Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
            Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());
            HashSet <BytesRef> uniqueTerms = new HashSet <BytesRef>();

            foreach (string term in tk.Freqs.Keys)
            {
                uniqueTerms.Add(new BytesRef(term));
            }
            // Expected iteration order: the unique terms sorted with the comparator of the terms under test.
            BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
            Array.Sort(sortedTerms, terms.Comparator);
            // Randomly pass either null or the previously stored enum as the reuse argument.
            TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);

            this.termsEnum.Value = termsEnum;
            for (int i = 0; i < sortedTerms.Length; ++i)
            {
                BytesRef nextTerm = termsEnum.Next();
                Assert.AreEqual(sortedTerms[i], nextTerm);
                Assert.AreEqual(sortedTerms[i], termsEnum.Term());
                Assert.AreEqual(1, termsEnum.DocFreq());

                // With bit 0 cleared in the bits argument, no document is expected.
                FixedBitSet bits     = new FixedBitSet(1);
                DocsEnum    docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                bits.Set(0);

                // With bit 0 set (or no bits at all), exactly doc 0 is expected.
                docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
                Assert.IsNotNull(docsEnum);
                Assert.AreEqual(0, docsEnum.NextDoc());
                Assert.AreEqual(0, docsEnum.DocID());
                Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                this.docsEnum.Value = docsEnum;

                // Same two-step check for the positions enum, which must be non-null
                // exactly when offsets or positions are stored.
                bits.Clear(0);
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (docsAndPositionsEnum != null)
                {
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                bits.Set(0);

                docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (terms.HasPositions() || terms.HasOffsets())
                {
                    Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
                    int freq = docsAndPositionsEnum.Freq();
                    Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
                    // NOTE(review): this null check comes after the enum was already dereferenced above.
                    if (docsAndPositionsEnum != null)
                    {
                        for (int k = 0; k < freq; ++k)
                        {
                            int         position = docsAndPositionsEnum.NextPosition();
                            // Candidate token indexes, looked up by position when positions
                            // are available and by start offset otherwise.
                            ISet <int?> indexes;
                            if (terms.HasPositions())
                            {
                                indexes = tk.PositionToTerms[position];
                                Assert.IsNotNull(indexes);
                            }
                            else
                            {
                                indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                                Assert.IsNotNull(indexes);
                            }
                            // Each stored attribute must match at least one candidate token of this term.
                            if (terms.HasPositions())
                            {
                                bool foundPosition = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                                    {
                                        foundPosition = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPosition);
                            }
                            if (terms.HasOffsets())
                            {
                                bool foundOffset = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                                    {
                                        foundOffset = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundOffset);
                            }
                            if (terms.HasPayloads())
                            {
                                bool foundPayload = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                                    {
                                        foundPayload = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPayload);
                            }
                        }
                        // Reading one position past freq is expected to throw.
                        // NOTE(review): if Assert.Fail() reports failure through an Exception-derived
                        // type (as NUnit does), the catch below swallows it too, so a missing
                        // exception would go unnoticed - confirm this leniency is intended.
                        try
                        {
                            docsAndPositionsEnum.NextPosition();
                            Assert.Fail();
                        }
                        catch (Exception e)
                        {
                            // ok
                        }
                    }
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
            }
            // The enum must be exhausted once all expected terms were seen.
            Assert.IsNull(termsEnum.Next());
            // Seeking to randomly chosen existing terms must succeed with either seek method.
            for (int i = 0; i < 5; ++i)
            {
                if (Random().NextBoolean())
                {
                    Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
                else
                {
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
            }
        }
Ejemplo n.º 23
0
            /// <summary>
            /// Sanity-checks the input of a sorted doc-values field and then forwards it to
            /// the wrapped consumer (<c>@in</c>).
            /// <para/>
            /// When assertions are enabled this verifies that every value is non-null, valid
            /// and strictly greater than its predecessor, that there are at most <c>maxDoc</c>
            /// values, that <paramref name="docToOrd"/> yields exactly <c>maxDoc</c> non-null
            /// ordinals in the range [-1, valueCount), and that every ordinal is referenced
            /// at least once. Both sequences are enumerated more than once.
            /// </summary>
            public override void AddSortedField(FieldInfo field, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrd)
            {
                // Pass 1: count the values and check their ordering.
                int      numValues = 0;
                BytesRef previous  = null;

                foreach (BytesRef current in values)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(current != null);
                        Debugging.Assert(current.IsValid());
                    }
                    if (numValues > 0 && Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(current.CompareTo(previous) > 0);
                    }
                    previous = BytesRef.DeepCopyOf(current);
                    numValues++;
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(numValues <= maxDoc);
                }

                // Pass 2: walk the per-document ordinals and remember which ones occur.
                FixedBitSet usedOrds = new FixedBitSet(numValues);
                int         numDocs  = 0;

                foreach (long? boxedOrd in docToOrd)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(boxedOrd != null);
                    }
                    int ord = (int)boxedOrd.Value;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(ord >= -1 && ord < numValues);
                    }
                    // Negative ordinals are not recorded.
                    if (ord >= 0)
                    {
                        usedOrds.Set(ord);
                    }
                    numDocs++;
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(numDocs == maxDoc);
                    Debugging.Assert(usedOrds.Cardinality == numValues);
                }

                CheckIterator(values.GetEnumerator(), numValues, false);
                CheckIterator(docToOrd.GetEnumerator(), maxDoc, false);
                @in.AddSortedField(field, values, docToOrd);
            }
Ejemplo n.º 24
0
        /// <summary>
        /// Writes this field's buffered postings to <paramref name="consumer"/>.
        /// For every term (in the order returned by <c>SortPostings</c>) the in-RAM doc/freq stream and,
        /// when recorded, the position/payload/offset stream are replayed into the
        /// <see cref="PostingsConsumer"/> obtained from the field's <see cref="TermsConsumer"/>.
        /// While replaying, documents covered by a buffered term delete are cleared in
        /// <c>state.LiveDocs</c> and counted in <c>state.DelCountOnFlush</c>.
        /// </summary>
        /// <param name="fieldName">Field name used to build the <see cref="Term"/> that is looked up in the buffered deletes.</param>
        /// <param name="consumer">Codec consumer; <c>AddField(fieldInfo)</c> is called on it to obtain the terms consumer.</param>
        /// <param name="state">Segment write state supplying the doc count, buffered term deletes and the live-docs bits.</param>
        internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
        {
            if (!fieldInfo.IsIndexed)
            {
                return; // nothing to flush, don't bother the codec with the unindexed field
            }

            TermsConsumer        termsConsumer = consumer.AddField(fieldInfo);
            IComparer <BytesRef> termComp      = termsConsumer.Comparer;

            // CONFUSING: this.indexOptions holds the index options
            // that were current when we first saw this field.  But
            // it's possible this has changed, eg when other
            // documents are indexed that cause a "downgrade" of the
            // IndexOptions.  So we must decode the in-RAM buffer
            // according to this.indexOptions, but then write the
            // new segment to the directory according to
            // currentFieldIndexOptions:
            IndexOptions currentFieldIndexOptions = fieldInfo.IndexOptions;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(currentFieldIndexOptions != IndexOptions.NONE);
            }

            // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
            bool writeTermFreq  = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;
            bool writePositions = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
            bool writeOffsets   = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

            // The read* flags describe what was recorded in the RAM buffer;
            // the write* flags above describe what is emitted to the segment.
            bool readTermFreq  = this.hasFreq;
            bool readPositions = this.hasProx;
            bool readOffsets   = this.hasOffsets;

            //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

            // Make sure FieldInfo.update is working correctly!:
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(!writeTermFreq || readTermFreq);
                Debugging.Assert(!writePositions || readPositions);
                Debugging.Assert(!writeOffsets || readOffsets);

                Debugging.Assert(!writeOffsets || writePositions);
            }

            // Buffered delete-by-term entries, or null when there are none to apply.
            IDictionary <Term, int?> segDeletes;

            if (state.SegUpdates != null && state.SegUpdates.terms.Count > 0)
            {
                segDeletes = state.SegUpdates.terms;
            }
            else
            {
                segDeletes = null;
            }

            int[]    termIDs  = termsHashPerField.SortPostings(termComp);
            int      numTerms = termsHashPerField.bytesHash.Count;
            BytesRef text     = new BytesRef();
            FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
            ByteSliceReader       freq     = new ByteSliceReader();
            ByteSliceReader       prox     = new ByteSliceReader();

            FixedBitSet visitedDocs      = new FixedBitSet(state.SegmentInfo.DocCount);
            long        sumTotalTermFreq = 0;
            long        sumDocFreq       = 0;

            // Reused for every delete lookup; only its Bytes are swapped per term.
            Term protoTerm = new Term(fieldName);

            for (int i = 0; i < numTerms; i++)
            {
                int termID = termIDs[i];
                // Get BytesRef
                int textStart = postings.textStarts[termID];
                termsHashPerField.bytePool.SetBytesRef(text, textStart);

                // Stream 0 carries docs/freqs, stream 1 carries positions/payloads/offsets.
                termsHashPerField.InitReader(freq, termID, 0);
                if (readPositions || readOffsets)
                {
                    termsHashPerField.InitReader(prox, termID, 1);
                }

                // TODO: really TermsHashPerField should take over most
                // of this loop, including merge sort of terms from
                // multiple threads and interacting with the
                // TermsConsumer, only calling out to us (passing us the
                // DocsConsumer) to handle delivery of docs/positions

                PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

                // Docs with docID below delDocLimit are marked deleted for this term;
                // 0 means no buffered delete applies to it.
                int?delDocLimit;
                if (segDeletes != null)
                {
                    protoTerm.Bytes = text;
                    if (segDeletes.TryGetValue(protoTerm, out int?docIDUpto) && docIDUpto != null)
                    {
                        delDocLimit = docIDUpto;
                    }
                    else
                    {
                        delDocLimit = 0;
                    }
                }
                else
                {
                    delDocLimit = 0;
                }

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                int  docFreq       = 0;
                long totalTermFreq = 0;
                int  docID         = 0;

                while (true)
                {
                    //System.out.println("  cycle");
                    int termFreq;
                    if (freq.Eof())
                    {
                        // The byte stream is exhausted; the term's final doc is still held in
                        // lastDocIDs/termFreqs, and lastDocCodes == -1 marks it as already emitted.
                        if (postings.lastDocCodes[termID] != -1)
                        {
                            // Return last doc
                            docID = postings.lastDocIDs[termID];
                            if (readTermFreq)
                            {
                                termFreq = postings.termFreqs[termID];
                            }
                            else
                            {
                                termFreq = -1;
                            }
                            postings.lastDocCodes[termID] = -1;
                        }
                        else
                        {
                            // EOF
                            break;
                        }
                    }
                    else
                    {
                        int code = freq.ReadVInt32();
                        if (!readTermFreq)
                        {
                            // No freqs recorded: the code is the plain docID delta.
                            docID   += code;
                            termFreq = -1;
                        }
                        else
                        {
                            // Upper bits are the docID delta; a set low bit means freq == 1,
                            // otherwise the freq follows as its own VInt.
                            docID += code.TripleShift(1);
                            if ((code & 1) != 0)
                            {
                                termFreq = 1;
                            }
                            else
                            {
                                termFreq = freq.ReadVInt32();
                            }
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(docID != postings.lastDocIDs[termID]);
                        }
                    }

                    docFreq++;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(docID < state.SegmentInfo.DocCount, "doc={0} maxDoc={1}", docID, state.SegmentInfo.DocCount);
                    }

                    // NOTE: we could check here if the docID was
                    // deleted, and skip it.  However, this is somewhat
                    // dangerous because it can yield non-deterministic
                    // behavior since we may see the docID before we see
                    // the term that caused it to be deleted.  this
                    // would mean some (but not all) of its postings may
                    // make it into the index, which'd alter the docFreq
                    // for those terms.  We could fix this by doing two
                    // passes, ie first sweep marks all del docs, and
                    // 2nd sweep does the real flush, but I suspect
                    // that'd add too much time to flush.
                    visitedDocs.Set(docID);
                    postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
                    if (docID < delDocLimit)
                    {
                        // Mark it deleted.  TODO: we could also skip
                        // writing its postings; this would be
                        // deterministic (just for this Term's docs).

                        // TODO: can we do this reach-around in a cleaner way????
                        if (state.LiveDocs == null)
                        {
                            state.LiveDocs = docState.docWriter.codec.LiveDocsFormat.NewLiveDocs(state.SegmentInfo.DocCount);
                        }
                        // Only count the delete once, even if several terms hit the same doc.
                        if (state.LiveDocs.Get(docID))
                        {
                            state.DelCountOnFlush++;
                            state.LiveDocs.Clear(docID);
                        }
                    }

                    totalTermFreq += termFreq;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.

                    if (readPositions || readOffsets)
                    {
                        // we did record positions (& maybe payload) and/or offsets
                        int position = 0;
                        int offset   = 0;
                        for (int j = 0; j < termFreq; j++)
                        {
                            BytesRef thisPayload;

                            if (readPositions)
                            {
                                // Upper bits are the position delta; a set low bit means a payload follows.
                                int code = prox.ReadVInt32();
                                position += code.TripleShift(1);

                                if ((code & 1) != 0)
                                {
                                    // this position has a payload
                                    int payloadLength = prox.ReadVInt32();

                                    // The payload buffer is allocated lazily and reused across positions.
                                    if (payload == null)
                                    {
                                        payload       = new BytesRef();
                                        payload.Bytes = new byte[payloadLength];
                                    }
                                    else if (payload.Bytes.Length < payloadLength)
                                    {
                                        payload.Grow(payloadLength);
                                    }

                                    prox.ReadBytes(payload.Bytes, 0, payloadLength);
                                    payload.Length = payloadLength;
                                    thisPayload    = payload;
                                }
                                else
                                {
                                    thisPayload = null;
                                }

                                if (readOffsets)
                                {
                                    // Start offset is a delta from the previous start; end offset is a delta from this start.
                                    int startOffset = offset + prox.ReadVInt32();
                                    int endOffset   = startOffset + prox.ReadVInt32();
                                    if (writePositions)
                                    {
                                        if (writeOffsets)
                                        {
                                            if (Debugging.AssertsEnabled)
                                            {
                                                Debugging.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset={0},endOffset={1},offset={2}", startOffset, endOffset, offset);
                                            }
                                            postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                        }
                                        else
                                        {
                                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                        }
                                    }
                                    offset = startOffset;
                                }
                                else if (writePositions)
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                        }
                    }
                    postingsConsumer.FinishDoc();
                }
                termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
                sumTotalTermFreq += totalTermFreq;
                sumDocFreq       += docFreq;
            }

            termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Re-projects the bits of this set into the first size from <c>UsableBitSetSizes</c> whose
        /// resulting saturation does not exceed <paramref name="targetMaxSaturation"/>.
        /// </summary>
        /// <param name="targetMaxSaturation">
        /// A number between 0 and 1 describing the fraction of bits that would ideally be set in the result.
        /// Lower values have better accuracy but require more space.
        /// </param>
        /// <returns>
        /// A smaller FuzzySet, or null when the selected size is not smaller than the current one.
        /// </returns>
        public FuzzySet Downsize(float targetMaxSaturation)
        {
            var setBitCount = _filter.Cardinality();

            // Take the first candidate size that keeps saturation within the target;
            // fall back to the current size when none qualifies.
            var targetSize = _bloomSize;
            foreach (var candidateSize in UsableBitSetSizes)
            {
                var saturation = setBitCount / (float) candidateSize;
                if (saturation <= targetMaxSaturation)
                {
                    targetSize = candidateSize;
                    break;
                }
            }

            // Nothing to gain: no re-projection is performed.
            if (targetSize >= _bloomSize)
            {
                return null;
            }

            // Map every set bit of the large filter into the smaller one. The target size
            // is used as a mask, which acts as a cheap modulo.
            var downsized = new FixedBitSet(targetSize + 1);
            var index = _filter.NextSetBit(0);
            while (index >= 0)
            {
                downsized.Set(index & targetSize);
                index++;
                if (index > _bloomSize)
                {
                    break;
                }
                index = _filter.NextSetBit(index);
            }

            return new FuzzySet(downsized, targetSize, _hashFunction);
        }
Ejemplo n.º 26
0
 /// <summary>
 /// Asserts that <paramref name="doc"/> is below the value currently held by <c>End</c>,
 /// then sets the corresponding bit in <c>Hits</c>.
 /// </summary>
 public virtual void Collect(int doc)
 {
     bool belowEnd = doc < End.Get();
     Assert.IsTrue(belowEnd, "collected doc=" + doc + " beyond max=" + End);
     Hits.Set(doc);
 }
        /// <summary>
        /// Creates a sample of the given hits: the matching docs are walked in order and
        /// split into bins of <c>(int)(1.0 / samplingRate)</c> docs, and one randomly chosen
        /// doc per bin is kept. The unfinished bin at the end of the iterator is remembered in
        /// <c>leftoverBin</c> / <c>leftoverIndex</c> so the next call continues it.
        /// </summary>
        /// <param name="docs">The hits of one segment to sample from.</param>
        /// <returns>
        /// A new <see cref="MatchingDocs"/> for the same context holding only the sampled docs;
        /// <c>totalHits</c> is carried over unchanged from <paramref name="docs"/>.
        /// </returns>
        /// <exception cref="Exception">
        /// An <see cref="IOException"/> occurred while iterating; it is attached as the inner exception.
        /// </exception>
        private MatchingDocs CreateSample(MatchingDocs docs)
        {
            int maxdoc = docs.context.Reader.MaxDoc;

            // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
            FixedBitSet sampleDocs = new FixedBitSet(maxdoc);

            int binSize = (int)(1.0 / samplingRate);

            try
            {
                int counter = 0;
                int limit, randomIndex;
                if (leftoverBin != NOT_CALCULATED)
                {
                    // Resume the bin that the previous call left unfinished.
                    limit = leftoverBin;
                    // either NOT_CALCULATED, which means we already sampled from that bin,
                    // or the next document to sample
                    randomIndex = leftoverIndex;
                }
                else
                {
                    limit       = binSize;
                    randomIndex = random.NextInt(binSize);
                }
                DocIdSetIterator it = docs.bits.GetIterator();
                for (int doc = it.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.NextDoc())
                {
                    if (counter == randomIndex)
                    {
                        sampleDocs.Set(doc);
                    }
                    counter++;
                    if (counter >= limit)
                    {
                        // Bin finished: start a fresh full-size bin with a new pick.
                        counter     = 0;
                        limit       = binSize;
                        randomIndex = random.NextInt(binSize);
                    }
                }

                if (counter == 0)
                {
                    // we either exhausted the bin and the iterator at the same time, or
                    // this segment had no results. in the latter case we might want to
                    // carry leftover to the next segment as is, but that complicates the
                    // code and doesn't seem so important.
                    leftoverBin = leftoverIndex = NOT_CALCULATED;
                }
                else
                {
                    leftoverBin = limit - counter;
                    // NOTE(review): when randomIndex == counter neither branch below runs and
                    // leftoverIndex keeps its previous value - confirm this is intended.
                    if (randomIndex > counter)
                    {
                        // the document to sample is in the next bin
                        leftoverIndex = randomIndex - counter;
                    }
                    else if (randomIndex < counter)
                    {
                        // we sampled a document from the bin, so just skip over remaining
                        // documents in the bin in the next segment.
                        leftoverIndex = NOT_CALCULATED;
                    }
                }

                return(new MatchingDocs(docs.context, sampleDocs, docs.totalHits, null));
            }
            catch (IOException e)
            {
                // Same exception type as before, but keep the message and the original
                // exception so the root cause and its stack trace are not lost.
                throw new Exception(e.Message, e);
            }
        }
Ejemplo n.º 28
0
		/// <summary>
		/// Checks that two equally sized bit sets compare equal (in both directions) exactly
		/// when the same bits are set, and that a bit set never equals an unrelated object.
		/// </summary>
		public void testEquals()
		{
			// This test can't handle numBits==0:
			int numBits = rnd.Next(2000) + 1;
			FixedBitSet b1 = new FixedBitSet(numBits);
			FixedBitSet b2 = new FixedBitSet(numBits);
			Assert.IsTrue(b1.Equals(b2));
			Assert.IsTrue(b2.Equals(b1));

			// Draw the iteration count once. Evaluating rnd.Next(500) inside the loop
			// condition re-rolled the bound on every pass, making the number of
			// iterations erratic and burning random numbers on the loop test itself.
			int iterations = 10 * rnd.Next(500);
			for (int iter = 0; iter < iterations; iter++)
			{
				int idx = rnd.Next(numBits);
				if (!b1.Get(idx))
				{
					// Setting the bit on one side only must break equality...
					b1.Set(idx);
					Assert.IsFalse(b1.Equals(b2));
					Assert.IsFalse(b2.Equals(b1));
					// ...and mirroring it on the other side must restore it.
					b2.Set(idx);
					Assert.IsTrue(b1.Equals(b2));
					Assert.IsTrue(b2.Equals(b1));
				}
			}

			// try different type of object
			Assert.IsFalse(b1.Equals(new Object()));
		}
 /// <summary>
 /// Appends every doc/value entry of <paramref name="other"/> to this instance,
 /// growing the backing structures first and carrying over each entry's
 /// <c>DocsWithField</c> bit.
 /// </summary>
 /// <param name="other">The updates to append; must be a <see cref="NumericDocValuesFieldUpdates"/>.</param>
 /// <exception cref="InvalidOperationException">
 /// The combined number of entries would exceed <see cref="int.MaxValue"/>.
 /// </exception>
 public override void Merge(DocValuesFieldUpdates other)
 {
     Debug.Assert(other is NumericDocValuesFieldUpdates);
     NumericDocValuesFieldUpdates otherUpdates = (NumericDocValuesFieldUpdates)other;

     // Sum in 64-bit arithmetic: a 32-bit sum wraps around on overflow, so the
     // limit check below could never trigger.
     long combinedSize = (long)Size + otherUpdates.Size;
     if (combinedSize > int.MaxValue)
     {
         throw new InvalidOperationException("cannot support more than Integer.MAX_VALUE doc/value entries; size=" + Size + " other.size=" + otherUpdates.Size);
     }
     int newSize = (int)combinedSize; // safe: bounded by the check above

     Docs = Docs.Grow(newSize);
     Values = Values.Grow(newSize);
     DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, (int)Docs.Size());
     for (int i = 0; i < otherUpdates.Size; i++)
     {
         int doc = (int)otherUpdates.Docs.Get(i);
         if (otherUpdates.DocsWithField.Get(i))
         {
             DocsWithField.Set(Size);
         }
         // Size doubles as the write cursor for the appended entry.
         Docs.Set(Size, doc);
         Values.Set(Size, otherUpdates.Values.Get(i));
         ++Size;
     }
 }
Ejemplo n.º 30
0
		/// <summary>
		/// Exercises bit sets of 0 through 9 bits: fresh sets are equal, hash alike and are
		/// empty; setting the whole range yields a cardinality equal to the size.
		/// </summary>
		public void testSmallBitSets()
		{
			for (int size = 0; size < 10; size++)
			{
				FixedBitSet first = new FixedBitSet(size);
				FixedBitSet second = new FixedBitSet(size);

				Assert.IsTrue(first.Equals(second));
				Assert.AreEqual(first.GetHashCode(), second.GetHashCode());
				Assert.AreEqual(0, first.Cardinality());

				// A zero-length set has no range to fill.
				if (size <= 0)
				{
					continue;
				}

				first.Set(0, size);
				Assert.AreEqual(size, first.Cardinality());
				// The Flip(0, size) round-trip check remains disabled, as in the original test.
			}
		}
Ejemplo n.º 31
0
 /// <summary>Sets the bit for <paramref name="docId"/> in <c>bits</c>.</summary>
 public override void AddDoc(int docId) => bits.Set(docId);
        /// <summary>
        /// Builds the shared random postings fixture: resets the counters, creates 1-5 uniquely
        /// named fields each with a random set of terms and per-term seeds, derives
        /// <c>MaxDoc</c> by enumerating every term's seeded postings, fills
        /// <c>GlobalLiveDocs</c> at a random live ratio and collects all field/term pairs
        /// into <c>AllTerms</c>.
        /// </summary>
        public static void CreatePostings()
        {
            TotalPostings = 0;
            TotalPayloadBytes = 0;
            Fields = new SortedDictionary<string, SortedDictionary<BytesRef, long>>();

            int fieldCount = TestUtil.NextInt(Random(), 1, 5);
            if (VERBOSE)
            {
                Console.WriteLine("TEST: " + fieldCount + " fields");
            }
            MaxDoc = 0;

            FieldInfo[] infos = new FieldInfo[fieldCount];
            int fieldUpto = 0;
            while (fieldUpto < fieldCount)
            {
                // Retry until we draw a field name that has not been used yet.
                string fieldName = TestUtil.RandomSimpleString(Random());
                if (Fields.ContainsKey(fieldName))
                {
                    continue;
                }

                infos[fieldUpto] = new FieldInfo(fieldName, true, fieldUpto, false, false, true, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, null, DocValuesType.NUMERIC, null);
                fieldUpto++;

                SortedDictionary<BytesRef, long> termSeeds = new SortedDictionary<BytesRef, long>();
                Fields[fieldName] = termSeeds;
                HashSet<string> usedTerms = new HashSet<string>();

                // One time in ten use a large number of terms, otherwise a handful.
                int termCount = Random().Next(10) == 7
                    ? AtLeast(50)
                    : TestUtil.NextInt(Random(), 2, 20);

                for (int termUpto = 0; termUpto < termCount; termUpto++)
                {
                    string term = TestUtil.RandomSimpleString(Random());
                    // Add returns false for a duplicate, which is skipped (and not replaced).
                    if (!usedTerms.Add(term))
                    {
                        continue;
                    }

                    // fieldUpto was already incremented above, so 1 means the first field.
                    string prefix;
                    if (TEST_NIGHTLY && termUpto == 0 && fieldUpto == 1)
                    {
                        // Make 1 big term:
                        prefix = "big_";
                    }
                    else if (termUpto == 1 && fieldUpto == 1)
                    {
                        // Make 1 medium term:
                        prefix = "medium_";
                    }
                    else if (Random().NextBoolean())
                    {
                        // Low freq term:
                        prefix = "low_";
                    }
                    else
                    {
                        // Very low freq term (don't multiply by RANDOM_MULTIPLIER):
                        prefix = "verylow_";
                    }
                    term = prefix + term;

                    long termSeed = Random().NextLong();
                    termSeeds[new BytesRef(term)] = termSeed;

                    // NOTE: sort of silly: we enum all the docs just to
                    // get the maxDoc
                    DocsEnum docsEnum = GetSeedPostings(term, termSeed, false, FieldInfo.IndexOptions.DOCS_ONLY);
                    int lastDoc = 0;
                    for (int doc = docsEnum.NextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = docsEnum.NextDoc())
                    {
                        lastDoc = doc;
                    }
                    MaxDoc = Math.Max(lastDoc, MaxDoc);
                }
            }

            FieldInfos = new FieldInfos(infos);

            // It's the count, not the last docID:
            MaxDoc++;

            // Each doc is live with probability liveRatio.
            GlobalLiveDocs = new FixedBitSet(MaxDoc);
            double liveRatio = Random().NextDouble();
            for (int docID = 0; docID < MaxDoc; docID++)
            {
                if (Random().NextDouble() <= liveRatio)
                {
                    GlobalLiveDocs.Set(docID);
                }
            }

            AllTerms = new List<FieldAndTerm>();
            foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEntry in Fields)
            {
                string entryField = fieldEntry.Key;
                foreach (KeyValuePair<BytesRef, long> termEntry in fieldEntry.Value.EntrySet())
                {
                    AllTerms.Add(new FieldAndTerm(entryField, termEntry.Key));
                }
            }

            if (VERBOSE)
            {
                Console.WriteLine("TEST: done init postings; " + AllTerms.Count + " total terms, across " + FieldInfos.Size() + " fields");
            }
        }
Ejemplo n.º 33
0
        /// <summary>
        /// Used when drill downs are highly constraining vs
        /// baseQuery.
        /// <para>
        /// Processes the doc ID space in windows of <c>CHUNK</c> docs. For each window the first two
        /// dimension iterators fill per-slot state (<c>docID &amp; MASK</c> selects the slot), the base
        /// scorer is then advanced to each candidate, the remaining dimensions update the per-slot
        /// counts, and finally each surviving slot is reported as a hit or a near miss.
        /// </para>
        /// </summary>
        /// <param name="collector">Collector handed to <c>CollectHit</c> for full hits.</param>
        /// <param name="disis">Iterators indexed by dimension; a null entry is skipped.</param>
        /// <param name="sidewaysCollectors">
        /// Collectors handed to <c>CollectHit</c>; for a near miss the one at index
        /// <c>missingDims[slot]</c> is passed to <c>CollectNearMiss</c>.
        /// </param>
        private void DoDrillDownAdvanceScoring(Collector collector, DocIdSetIterator[] disis, Collector[] sidewaysCollectors)
        {
            int maxDoc  = context.Reader.MaxDoc;
            int numDims = dims.Length;

            //if (DEBUG) {
            //  System.out.println("  doDrillDownAdvanceScoring");
            //}

            // TODO: maybe a class like BS, instead of parallel arrays
            int[]   filledSlots = new int[CHUNK];
            int[]   docIDs      = new int[CHUNK];
            float[] scores      = new float[CHUNK];
            int[]   missingDims = new int[CHUNK];
            int[]   counts      = new int[CHUNK];

            // Slot 0 would otherwise look like it already holds docID 0.
            docIDs[0] = -1;
            int nextChunkStart = CHUNK;

            // Slots touched by dim 0 or dim 1 in the current window.
            FixedBitSet seen = new FixedBitSet(CHUNK);

            while (true)
            {
                //if (DEBUG) {
                //  System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
                //}

                // First dim:
                //if (DEBUG) {
                //  System.out.println("  dim0");
                //}
                DocIdSetIterator disi = disis[0];
                if (disi != null)
                {
                    int docID = disi.DocID();
                    while (docID < nextChunkStart)
                    {
                        int slot = docID & MASK;

                        if (docIDs[slot] != docID)
                        {
                            seen.Set(slot);
                            // Mark slot as valid:
                            //if (DEBUG) {
                            //  System.out.println("    set docID=" + docID + " id=" + context.reader().document(docID).get("id"));
                            //}
                            docIDs[slot]      = docID;
                            missingDims[slot] = 1;
                            counts[slot]      = 1;
                        }

                        docID = disi.NextDoc();
                    }
                }

                // Second dim:
                //if (DEBUG) {
                //  System.out.println("  dim1");
                //}
                disi = disis[1];
                if (disi != null)
                {
                    int docID = disi.DocID();
                    while (docID < nextChunkStart)
                    {
                        int slot = docID & MASK;

                        if (docIDs[slot] != docID)
                        {
                            // Mark slot as valid:
                            seen.Set(slot);
                            //if (DEBUG) {
                            //  System.out.println("    set docID=" + docID + " missingDim=0 id=" + context.reader().document(docID).get("id"));
                            //}
                            // Not seen by dim 0, so dim 0 is recorded as the missing one.
                            docIDs[slot]      = docID;
                            missingDims[slot] = 0;
                            counts[slot]      = 1;
                        }
                        else
                        {
                            // TODO: single-valued dims will always be true
                            // below; we could somehow specialize
                            if (missingDims[slot] >= 1)
                            {
                                missingDims[slot] = 2;
                                counts[slot]      = 2;
                                //if (DEBUG) {
                                //  System.out.println("    set docID=" + docID + " missingDim=2 id=" + context.reader().document(docID).get("id"));
                                //}
                            }
                            else
                            {
                                counts[slot] = 1;
                                //if (DEBUG) {
                                //  System.out.println("    set docID=" + docID + " missingDim=" + missingDims[slot] + " id=" + context.reader().document(docID).get("id"));
                                //}
                            }
                        }

                        docID = disi.NextDoc();
                    }
                }

                // After this we can "upgrade" to conjunction, because
                // any doc not seen by either dim 0 or dim 1 cannot be
                // a hit or a near miss:

                //if (DEBUG) {
                //  System.out.println("  baseScorer");
                //}

                // Fold in baseScorer, using advance:
                int filledCount = 0;
                int slot0       = 0;
                while (slot0 < CHUNK && (slot0 = seen.NextSetBit(slot0)) != -1)
                {
                    int ddDocID = docIDs[slot0];
                    Debug.Assert(ddDocID != -1);

                    int baseDocID = baseScorer.DocID();
                    if (baseDocID < ddDocID)
                    {
                        baseDocID = baseScorer.Advance(ddDocID);
                    }
                    if (baseDocID == ddDocID)
                    {
                        //if (DEBUG) {
                        //  System.out.println("    keep docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
                        //}
                        // The base query matches too: keep the slot and bump its count.
                        scores[slot0] = baseScorer.Score();
                        filledSlots[filledCount++] = slot0;
                        counts[slot0]++;
                    }
                    else
                    {
                        //if (DEBUG) {
                        //  System.out.println("    no docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
                        //}
                        // Invalidate the slot so later dims ignore this doc.
                        docIDs[slot0] = -1;

                        // TODO: we could jump slot0 forward to the
                        // baseDocID ... but we'd need to set docIDs for
                        // intervening slots to -1
                    }
                    slot0++;
                }
                seen.Clear(0, CHUNK);

                // No candidate survived the base query in this window: move on (or stop at the end).
                if (filledCount == 0)
                {
                    if (nextChunkStart >= maxDoc)
                    {
                        break;
                    }
                    nextChunkStart += CHUNK;
                    continue;
                }

                // TODO: factor this out & share w/ union scorer,
                // except we start from dim=2 instead:
                for (int dim = 2; dim < numDims; dim++)
                {
                    //if (DEBUG) {
                    //  System.out.println("  dim=" + dim + " [" + dims[dim].dim + "]");
                    //}
                    disi = disis[dim];
                    if (disi != null)
                    {
                        int docID = disi.DocID();
                        while (docID < nextChunkStart)
                        {
                            int slot = docID & MASK;
                            // Only slots still holding this doc with a count of at least dim are updated.
                            if (docIDs[slot] == docID && counts[slot] >= dim)
                            {
                                // TODO: single-valued dims will always be true
                                // below; we could somehow specialize
                                if (missingDims[slot] >= dim)
                                {
                                    //if (DEBUG) {
                                    //  System.out.println("    set docID=" + docID + " count=" + (dim+2));
                                    //}
                                    missingDims[slot] = dim + 1;
                                    counts[slot]      = dim + 2;
                                }
                                else
                                {
                                    //if (DEBUG) {
                                    //  System.out.println("    set docID=" + docID + " missing count=" + (dim+1));
                                    //}
                                    counts[slot] = dim + 1;
                                }
                            }

                            // TODO: sometimes use advance?
                            docID = disi.NextDoc();
                        }
                    }
                }

                // Collect:
                //if (DEBUG) {
                //  System.out.println("  now collect: " + filledCount + " hits");
                //}
                for (int i = 0; i < filledCount; i++)
                {
                    int slot = filledSlots[i];
                    collectDocID = docIDs[slot];
                    collectScore = scores[slot];
                    //if (DEBUG) {
                    //  System.out.println("    docID=" + docIDs[slot] + " count=" + counts[slot]);
                    //}
                    // A count of 1 + numDims is collected as a hit; a count of numDims as a near
                    // miss for the dim recorded in missingDims. Lower counts are dropped.
                    if (counts[slot] == 1 + numDims)
                    {
                        CollectHit(collector, sidewaysCollectors);
                    }
                    else if (counts[slot] == numDims)
                    {
                        CollectNearMiss(sidewaysCollectors[missingDims[slot]]);
                    }
                }

                if (nextChunkStart >= maxDoc)
                {
                    break;
                }

                nextChunkStart += CHUNK;
            }
        }
Ejemplo n.º 34
0
        /// <summary>
        /// Default merge implementation: drains <paramref name="postings"/> and replays every
        /// document into this consumer through StartDoc / AddPosition / FinishDoc, adding
        /// positions, payloads and offsets when <paramref name="indexOptions"/> asks for them.
        /// Every document seen is also flagged in <paramref name="visitedDocs"/>.
        /// (The original note says deletes are mapped around; presumably the supplied enum
        /// already takes care of that -- nothing in this method does.)
        /// </summary>
        /// <param name="mergeState">Not read by this default implementation.</param>
        /// <param name="indexOptions">Selects how much per-document detail is copied.</param>
        /// <param name="postings">Source postings; cast to <c>DocsAndPositionsEnum</c> when positions are copied.</param>
        /// <param name="visitedDocs">Bit set that receives every document id that was copied.</param>
        /// <returns>The number of documents written and the summed term frequency (-1 for DOCS_ONLY).</returns>
        public virtual TermStats Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, DocsEnum postings, FixedBitSet visitedDocs)
        {
            int docCount = 0;
            long termFreqSum = 0;
            int docID;

            if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
            {
                // Documents only: no frequency is available, so -1 is passed as the freq.
                while ((docID = postings.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docID);
                    this.StartDoc(docID, -1);
                    this.FinishDoc();
                    docCount++;
                }
                termFreqSum = -1;
            }
            else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS)
            {
                // Documents plus their term frequency.
                while ((docID = postings.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docID);
                    int termFreq = postings.Freq();
                    this.StartDoc(docID, termFreq);
                    this.FinishDoc();
                    docCount++;
                    termFreqSum += termFreq;
                }
            }
            else
            {
                // Positions are copied in both remaining modes; offsets only when the
                // options are anything other than "positions without offsets".
                bool copyOffsets = indexOptions != FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
                if (copyOffsets)
                {
                    Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                }

                var positionsEnum = (DocsAndPositionsEnum)postings;
                while ((docID = positionsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    visitedDocs.Set(docID);
                    int termFreq = positionsEnum.Freq();
                    this.StartDoc(docID, termFreq);
                    termFreqSum += termFreq;

                    for (int occurrence = 0; occurrence < termFreq; occurrence++)
                    {
                        int position = positionsEnum.NextPosition();
                        BytesRef payload = positionsEnum.Payload;
                        if (copyOffsets)
                        {
                            this.AddPosition(position, payload, positionsEnum.StartOffset(), positionsEnum.EndOffset());
                        }
                        else
                        {
                            this.AddPosition(position, payload, -1, -1);
                        }
                    }

                    this.FinishDoc();
                    docCount++;
                }
            }

            return new TermStats(docCount, indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : termFreqSum);
        }
Ejemplo n.º 35
0
 /// <summary>
 /// Asserts that <paramref name="doc"/> is below the <c>end</c> bound, then marks it in <c>hits</c>.
 /// </summary>
 /// <param name="doc">Document id being collected.</param>
 public virtual void Collect(int doc)
 {
     bool withinBound = doc < end;
     string failureMessage = "collected doc=" + doc + " beyond max=" + end;
     Assert.IsTrue(withinBound, failureMessage);

     hits.Set(doc);
 }
        /// <summary>
        /// Creates a sample of the given hits. The hits are walked in order and split into
        /// consecutive bins of <c>(int)(1.0 / samplingRate)</c> documents; one randomly chosen
        /// document per bin is copied into the returned set. A bin that is still open when the
        /// iterator runs out is carried over to the next call through <c>leftoverBin</c> and
        /// <c>leftoverIndex</c>.
        /// </summary>
        /// <param name="docs">The matching documents of one segment.</param>
        /// <returns>A new <c>MatchingDocs</c> with the same context and total hit count whose
        /// bits contain only the sampled documents.</returns>
        /// <exception cref="Exception">Wraps any <see cref="IOException"/> raised while
        /// iterating the hits; the original exception is kept as the inner exception.</exception>
        private MatchingDocs CreateSample(MatchingDocs docs)
        {
            int maxdoc = docs.context.Reader.MaxDoc;

            // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
            FixedBitSet sampleDocs = new FixedBitSet(maxdoc);

            int binSize = (int)(1.0 / samplingRate);

            try
            {
                int counter = 0;
                int limit, randomIndex;
                if (leftoverBin != NOT_CALCULATED)
                {
                    // Finish the bin that was left open by the previous call.
                    limit = leftoverBin;
                    // either NOT_CALCULATED, which means we already sampled from that bin,
                    // or the next document to sample
                    randomIndex = leftoverIndex;
                }
                else
                {
                    limit = binSize;
                    randomIndex = random.NextInt(binSize);
                }
                DocIdSetIterator it = docs.bits.GetIterator();
                for (int doc = it.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.NextDoc())
                {
                    if (counter == randomIndex)
                    {
                        sampleDocs.Set(doc);
                    }
                    counter++;
                    if (counter >= limit)
                    {
                        // Bin exhausted: start a fresh full-size bin with a new pick.
                        counter = 0;
                        limit = binSize;
                        randomIndex = random.NextInt(binSize);
                    }
                }

                if (counter == 0)
                {
                    // we either exhausted the bin and the iterator at the same time, or
                    // this segment had no results. in the latter case we might want to
                    // carry leftover to the next segment as is, but that complicates the
                    // code and doesn't seem so important.
                    leftoverBin = leftoverIndex = NOT_CALCULATED;
                }
                else
                {
                    leftoverBin = limit - counter;
                    if (randomIndex >= counter)
                    {
                        // The document to sample has not been reached yet. This includes
                        // randomIndex == counter, where it is the very first hit of the next
                        // segment (offset 0); previously that case left a stale
                        // leftoverIndex behind.
                        leftoverIndex = randomIndex - counter;
                    }
                    else
                    {
                        // we sampled a document from the bin (or there was nothing left to
                        // sample in it), so just skip over remaining documents in the bin
                        // in the next segment.
                        leftoverIndex = NOT_CALCULATED;
                    }
                }

                return new MatchingDocs(docs.context, sampleDocs, docs.totalHits, null);
            }
            catch (IOException e)
            {
                // Keep the cause: a bare Exception made I/O failures impossible to diagnose.
                throw new Exception(e.ToString(), e);
            }
        }
Ejemplo n.º 37
0
        /// <summary>
        /// Randomized check of <c>NextDoc()</c> / <c>Advance()</c> (and, when available,
        /// <c>Freq()</c>) on long postings lists indexed with the given options.
        /// Two random terms are generated, every document is assigned exactly one of them,
        /// and the postings of each term are then walked with a random mix of next/advance
        /// calls whose results are compared with a bit set recording the assignment.
        /// </summary>
        /// <param name="options">Index options applied to the single indexed field "field".
        /// With DOCS_ONLY no frequencies are requested or checked.</param>
        public virtual void DoTestLongPostingsNoPositions(FieldInfo.IndexOptions options)
        {
            // Don't use TestUtil.getTempDir so that we own the
            // randomness (ie same seed will point to same dir):
            Directory dir = NewFSDirectory(CreateTempDir("longpostings" + "." + Random().NextLong()));

            int NUM_DOCS = AtLeast(2000);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS);
            }

            // The two terms under test; GetRandomTerm(s1) presumably returns a term
            // different from s1 -- confirm against the helper.
            string s1 = GetRandomTerm(null);
            string s2 = GetRandomTerm(s1);

            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: s1=" + s1 + " s2=" + s2);

                /*
                 * for(int idx=0;idx<s1.Length();idx++) {
                 * System.out.println("  s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
                 * }
                 * for(int idx=0;idx<s2.Length();idx++) {
                 * System.out.println("  s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
                 * }
                 */
            }

            // Ground truth: bit idx is set when document idx is indexed with s1,
            // clear when it is indexed with s2.
            FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);

            for (int idx = 0; idx < NUM_DOCS; idx++)
            {
                if (Random().NextBoolean())
                {
                    isS1.Set(idx);
                }
            }

            IndexReader r;

            // The condition is a constant: the index is always rebuilt and the else branch
            // (re-opening an existing index) is never taken.
            if (true)
            {
                IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE).SetMergePolicy(NewLogMergePolicy());
                iwc.SetRAMBufferSizeMB(16.0 + 16.0 * Random().NextDouble());
                iwc.SetMaxBufferedDocs(-1);
                RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, iwc);

                FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
                ft.IndexOptions = options;
                for (int idx = 0; idx < NUM_DOCS; idx++)
                {
                    Document doc   = new Document();
                    string   s     = isS1.Get(idx) ? s1 : s2;
                    Field    f     = NewField("field", s, ft);
                    // The same field instance is added 'count' times; the freq assertions
                    // further down expect a value between 1 and 4.
                    int      count = TestUtil.NextInt(Random(), 1, 4);
                    for (int ct = 0; ct < count; ct++)
                    {
                        doc.Add(f);
                    }
                    riw.AddDocument(doc);
                }

                // Grab the reader before disposing the writer.
                r = riw.Reader;
                riw.Dispose();
            }
            else
            {
                r = DirectoryReader.Open(dir);
            }

            /*
             * if (VERBOSE) {
             * System.out.println("TEST: terms");
             * TermEnum termEnum = r.Terms();
             * while(termEnum.Next()) {
             *  System.out.println("  term=" + termEnum.Term() + " len=" + termEnum.Term().Text().Length());
             *  Assert.IsTrue(termEnum.DocFreq() > 0);
             *  System.out.println("    s1?=" + (termEnum.Term().Text().equals(s1)) + " s1len=" + s1.Length());
             *  System.out.println("    s2?=" + (termEnum.Term().Text().equals(s2)) + " s2len=" + s2.Length());
             *  final String s = termEnum.Term().Text();
             *  for(int idx=0;idx<s.Length();idx++) {
             *    System.out.println("      ch=0x" + Integer.toHexString(s.charAt(idx)));
             *  }
             * }
             * }
             */

            // Sanity checks: every document made it into the index and both terms occur.
            Assert.AreEqual(NUM_DOCS, r.NumDocs);
            Assert.IsTrue(r.DocFreq(new Term("field", s1)) > 0);
            Assert.IsTrue(r.DocFreq(new Term("field", s2)) > 0);

            int num = AtLeast(1000);

            // Each iteration picks one of the two terms and walks its postings once.
            for (int iter = 0; iter < num; iter++)
            {
                string term;
                bool   doS1;
                if (Random().NextBoolean())
                {
                    term = s1;
                    doS1 = true;
                }
                else
                {
                    term = s2;
                    doS1 = false;
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term);
                }

                // 'docs' is always used for iteration; 'postings' is the same enum when
                // frequencies were requested and null otherwise, so it doubles as the
                // "can Freq() be checked" flag.
                DocsEnum docs;
                DocsEnum postings;

                if (options == FieldInfo.IndexOptions.DOCS_ONLY)
                {
                    docs     = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_NONE);
                    postings = null;
                }
                else
                {
                    docs = postings = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_FREQS);
                    Debug.Assert(postings != null);
                }
                Debug.Assert(docs != null);

                int docID = -1;
                while (docID < DocIdSetIterator.NO_MORE_DOCS)
                {
                    // One value out of three selects NextDoc(), the other two Advance().
                    int what = Random().Next(3);
                    if (what == 0)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do next()");
                        }
                        // nextDoc
                        // Expected result: the first document after docID assigned to the
                        // chosen term, or int.MaxValue when none is left (this is compared
                        // with the enum's result below, so it presumably equals
                        // NO_MORE_DOCS).
                        int expected = docID + 1;
                        while (true)
                        {
                            if (expected == NUM_DOCS)
                            {
                                expected = int.MaxValue;
                                break;
                            }
                            else if (isS1.Get(expected) == doS1)
                            {
                                break;
                            }
                            else
                            {
                                expected++;
                            }
                        }
                        docID = docs.NextDoc();
                        if (VERBOSE)
                        {
                            Console.WriteLine("  got docID=" + docID);
                        }
                        Assert.AreEqual(expected, docID);
                        if (docID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }

                        // Occasionally (one draw in six) also verify the frequency.
                        if (Random().Next(6) == 3 && postings != null)
                        {
                            int freq = postings.Freq();
                            Assert.IsTrue(freq >= 1 && freq <= 4);
                        }
                    }
                    else
                    {
                        // advance
                        // Pick a target: anywhere in [0, NUM_DOCS] on the first call,
                        // otherwise a random step forward from the current document.
                        int targetDocID;
                        if (docID == -1)
                        {
                            targetDocID = Random().Next(NUM_DOCS + 1);
                        }
                        else
                        {
                            targetDocID = docID + TestUtil.NextInt(Random(), 1, NUM_DOCS - docID);
                        }
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
                        }
                        // Expected result: the first document at or after the target that
                        // is assigned to the chosen term, or int.MaxValue when exhausted.
                        int expected = targetDocID;
                        while (true)
                        {
                            if (expected == NUM_DOCS)
                            {
                                expected = int.MaxValue;
                                break;
                            }
                            else if (isS1.Get(expected) == doS1)
                            {
                                break;
                            }
                            else
                            {
                                expected++;
                            }
                        }

                        docID = docs.Advance(targetDocID);
                        if (VERBOSE)
                        {
                            Console.WriteLine("  got docID=" + docID);
                        }
                        Assert.AreEqual(expected, docID);
                        if (docID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }

                        // Occasionally (one draw in six) also verify the frequency.
                        if (Random().Next(6) == 3 && postings != null)
                        {
                            int freq = postings.Freq();
                            Assert.IsTrue(freq >= 1 && freq <= 4, "got invalid freq=" + freq);
                        }
                    }
                }
            }
            r.Dispose();
            dir.Dispose();
        }
        /// <summary>
        /// Verifies that the term vector <paramref name="terms"/> of a single document matches
        /// the token stream <paramref name="tk"/> it was built from: statistics, term order,
        /// per-term frequency and, when stored, positions, offsets and payloads.
        /// Previously returned enums are randomly passed back in as reuse candidates.
        /// </summary>
        /// <param name="tk">Token stream holding the expected terms, frequencies, positions,
        /// offsets and payloads.</param>
        /// <param name="ft">Field type whose StoreTermVector* flags say what must be present.</param>
        /// <param name="terms">The term vector under test.</param>
        protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
        {
            // Statistics: exactly one document, one entry per distinct term.
            Assert.AreEqual(1, terms.DocCount);
            int termCount = (new HashSet<string>(Arrays.AsList(tk.Terms))).Count;
            Assert.AreEqual(termCount, terms.Size());
            Assert.AreEqual(termCount, terms.SumDocFreq);
            Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
            Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
            Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());
            // Build the expected term sequence, sorted with the comparator of the vector.
            HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();
            foreach (string term in tk.Freqs.Keys)
            {
                uniqueTerms.Add(new BytesRef(term));
            }
            BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
            Array.Sort(sortedTerms, terms.Comparator);
            // Randomly reuse the enum stored by an earlier call, then store the new one.
            TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);
            this.termsEnum.Value = termsEnum;
            for (int i = 0; i < sortedTerms.Length; ++i)
            {
                BytesRef nextTerm = termsEnum.Next();
                Assert.AreEqual(sortedTerms[i], nextTerm);
                Assert.AreEqual(sortedTerms[i], termsEnum.Term());
                Assert.AreEqual(1, termsEnum.DocFreq());

                // One-bit set passed as the first argument of Docs(): with the bit clear
                // the enum is expected to return no document, with it set document 0.
                FixedBitSet bits = new FixedBitSet(1);
                DocsEnum docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                bits.Set(0);

                docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
                Assert.IsNotNull(docsEnum);
                Assert.AreEqual(0, docsEnum.NextDoc());
                Assert.AreEqual(0, docsEnum.DocID());
                Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
                this.docsEnum.Value = docsEnum;

                // Same two-step check for the positions enum, which must exist exactly
                // when positions or offsets are stored.
                bits.Clear(0);
                DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (docsAndPositionsEnum != null)
                {
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                bits.Set(0);

                docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
                Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
                if (terms.HasPositions() || terms.HasOffsets())
                {
                    Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
                    int freq = docsAndPositionsEnum.Freq();
                    Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
                    // NOTE(review): this null guard comes after the enum has already been
                    // dereferenced twice above, so it can never be false here.
                    if (docsAndPositionsEnum != null)
                    {
                        for (int k = 0; k < freq; ++k)
                        {
                            int position = docsAndPositionsEnum.NextPosition();
                            // Candidate token indexes for this occurrence, looked up by
                            // position when positions are stored, otherwise by start offset.
                            ISet<int?> indexes;
                            if (terms.HasPositions())
                            {
                                indexes = tk.PositionToTerms[position];
                                Assert.IsNotNull(indexes);
                            }
                            else
                            {
                                indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                                Assert.IsNotNull(indexes);
                            }
                            // At least one candidate must match the current term on each
                            // stored attribute (position, offsets, payload).
                            if (terms.HasPositions())
                            {
                                bool foundPosition = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                                    {
                                        foundPosition = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPosition);
                            }
                            if (terms.HasOffsets())
                            {
                                bool foundOffset = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                                    {
                                        foundOffset = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundOffset);
                            }
                            if (terms.HasPayloads())
                            {
                                bool foundPayload = false;
                                foreach (int index in indexes)
                                {
                                    if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                                    {
                                        foundPayload = true;
                                        break;
                                    }
                                }
                                Assert.IsTrue(foundPayload);
                            }
                        }
                        // Reading one position too many.
                        // NOTE(review): if Assert.Fail() signals failure by throwing, the
                        // catch-all below swallows it too, so this block may not be able
                        // to fail the test -- confirm whether that is intended.
                        try
                        {
                            docsAndPositionsEnum.NextPosition();
                            Assert.Fail();
                        }
                        catch (Exception e)
                        {
                            // ok
                        }
                    }
                    Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
                }
                this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
            }
            // The enum must be exhausted, yet still able to seek to any known term.
            Assert.IsNull(termsEnum.Next());
            for (int i = 0; i < 5; ++i)
            {
                if (Random().NextBoolean())
                {
                    Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
                else
                {
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
                }
            }
        }
Ejemplo n.º 39
0
            /// <summary>
            /// Reads this field's term lines starting at <c>termsStart</c> and builds the FST
            /// that maps each term to (file position after its TERM line, (docFreq, totalTermFreq)).
            /// While scanning it also accumulates <c>sumDocFreq</c>, <c>sumTotalTermFreq</c>,
            /// <c>termCount</c> and, from the set of distinct doc ids seen, <c>docCount</c>.
            /// Scanning stops at the END marker or at the next FIELD line.
            /// </summary>
            private void LoadTerms()
            {
                PositiveInt32Outputs positiveOutputs = PositiveInt32Outputs.Singleton;
                var statsOutputs = new PairOutputs<Int64, Int64>(positiveOutputs, positiveOutputs);
                var termOutputs = new PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>(positiveOutputs, statsOutputs);
                var builder = new Builder<PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, termOutputs);

                // Work on a private clone so the shared input's position is left alone.
                IndexInput termsInput = (IndexInput)outerInstance.input.Clone();
                termsInput.Seek(termsStart);

                BytesRef pendingTerm = new BytesRef(10);
                Int32sRef intsScratch = new Int32sRef();
                FixedBitSet seenDocs = new FixedBitSet(maxDoc);

                // State of the term currently being read; pendingDocsStart stays -1
                // until the first TERM line has been seen.
                long pendingDocsStart = -1;
                int pendingDocFreq = 0;
                long pendingTotalTermFreq = 0;

                for (;;)
                {
                    SimpleTextUtil.ReadLine(termsInput, scratch);

                    bool fieldFinished = scratch.Equals(SimpleTextFieldsWriter.END)
                        || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD);
                    if (fieldFinished)
                    {
                        // Flush the last term of the field, if there was one.
                        if (pendingDocsStart != -1)
                        {
                            var key = Util.ToInt32sRef(pendingTerm, intsScratch);
                            var value = termOutputs.NewPair(pendingDocsStart,
                                                            statsOutputs.NewPair((long)pendingDocFreq, pendingTotalTermFreq));
                            builder.Add(key, value);
                            sumTotalTermFreq += pendingTotalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
                    {
                        // DOC line: count it and remember the doc id that follows the prefix.
                        pendingDocFreq++;
                        sumDocFreq++;
                        int docPrefix = SimpleTextFieldsWriter.DOC.Length;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + docPrefix, scratch.Length - docPrefix, scratchUTF16);
                        int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        seenDocs.Set(docID);
                        continue;
                    }

                    if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        // FREQ line: add the parsed frequency to the current term's total.
                        int freqPrefix = SimpleTextFieldsWriter.FREQ.Length;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + freqPrefix, scratch.Length - freqPrefix, scratchUTF16);
                        pendingTotalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        continue;
                    }

                    if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
                    {
                        // A new term starts: emit the previous one first.
                        if (pendingDocsStart != -1)
                        {
                            var key = Util.ToInt32sRef(pendingTerm, intsScratch);
                            var value = termOutputs.NewPair(pendingDocsStart,
                                                            statsOutputs.NewPair((long)pendingDocFreq, pendingTotalTermFreq));
                            builder.Add(key, value);
                        }

                        pendingDocsStart = termsInput.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream

                        // Copy the term bytes that follow the TERM prefix.
                        int termLength = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (termLength > pendingTerm.Length)
                        {
                            pendingTerm.Grow(termLength);
                        }
                        System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, pendingTerm.Bytes, 0, termLength);
                        pendingTerm.Length = termLength;

                        // Reset the per-term counters, folding the finished total into the field sum.
                        pendingDocFreq = 0;
                        sumTotalTermFreq += pendingTotalTermFreq;
                        pendingTotalTermFreq = 0;
                        termCount++;
                    }

                    // Any other line is ignored.
                }

                docCount = seenDocs.Cardinality;
                fst = builder.Finish();
            }