public void InitReader(ByteSliceReader reader, RawPostingList p, int stream)
        {
            System.Diagnostics.Debug.Assert(stream < streamCount);
            int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
            int   upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;

            reader.Init(bytePool, p.byteStart + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
        }
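The shift/mask pair above is plain paged addressing: intStart >> INT_BLOCK_SHIFT picks a buffer in the int pool, intStart & INT_BLOCK_MASK is the offset inside it, and ints[upto + stream] holds the current end pointer of that stream's slice, which reader.Init takes as its read limit. A minimal sketch of the arithmetic, with hypothetical constants standing in for the real DocumentsWriter values:

static class IntPoolAddressing
{
    // Hypothetical block-size constants for illustration only; the real
    // values live on DocumentsWriter (Int32BlockPool in newer ports).
    private const int IntBlockShift = 13;                     // 8192 ints per buffer
    private const int IntBlockMask  = (1 << IntBlockShift) - 1;

    // Translate a flat pool index into (buffer, offset) and read the value.
    public static int Read(int[][] buffers, int globalIndex)
    {
        int[] buffer = buffers[globalIndex >> IntBlockShift]; // which buffer
        int   offset = globalIndex & IntBlockMask;            // position inside it
        return buffer[offset];
    }
}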
Example #2
        public void InitReader(ByteSliceReader reader, int termID, int stream)
        {
            Debug.Assert(stream < streamCount);
            int intStart = postingsArray.intStarts[termID];

            int[] ints = intPool.Buffers[intStart >> Int32BlockPool.INT32_BLOCK_SHIFT];
            int   upto = intStart & Int32BlockPool.INT32_BLOCK_MASK;

            reader.Init(bytePool, postingsArray.byteStarts[termID] + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
        }
Example #3
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */

        internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
        {
            if (!fieldInfo.Indexed)
            {
                return; // nothing to flush, don't bother the codec with the unindexed field
            }

            TermsConsumer       termsConsumer = consumer.AddField(fieldInfo);
            IComparer<BytesRef> termComp      = termsConsumer.Comparator;

            // CONFUSING: this.indexOptions holds the index options
            // that were current when we first saw this field.  But
            // it's possible this has changed, eg when other
            // documents are indexed that cause a "downgrade" of the
            // IndexOptions.  So we must decode the in-RAM buffer
            // according to this.indexOptions, but then write the
            // new segment to the directory according to
            // currentFieldIndexOptions:
            FieldInfo.IndexOptions? currentFieldIndexOptions = fieldInfo.FieldIndexOptions;
            Debug.Assert(currentFieldIndexOptions != null);

            bool writeTermFreq  = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
            bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            bool writeOffsets   = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

            bool readTermFreq  = this.HasFreq;
            bool readPositions = this.HasProx;
            bool readOffsets   = this.HasOffsets;

            //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

            // Make sure FieldInfo.update is working correctly!:
            Debug.Assert(!writeTermFreq || readTermFreq);
            Debug.Assert(!writePositions || readPositions);
            Debug.Assert(!writeOffsets || readOffsets);

            Debug.Assert(!writeOffsets || writePositions);

            IDictionary<Term, int?> segDeletes;

            if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0)
            {
                segDeletes = state.SegUpdates.Terms;
            }
            else
            {
                segDeletes = null;
            }

            int[]    termIDs  = TermsHashPerField.SortPostings(termComp);
            int      numTerms = TermsHashPerField.BytesHash.Size();
            BytesRef text     = new BytesRef();
            FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
            ByteSliceReader       freq     = new ByteSliceReader();
            ByteSliceReader       prox     = new ByteSliceReader();

            FixedBitSet visitedDocs      = new FixedBitSet(state.SegmentInfo.DocCount);
            long        sumTotalTermFreq = 0;
            long        sumDocFreq       = 0;

            Term protoTerm = new Term(fieldName);

            for (int i = 0; i < numTerms; i++)
            {
                int termID = termIDs[i];
                // Get BytesRef
                int textStart = postings.TextStarts[termID];
                TermsHashPerField.BytePool.SetBytesRef(text, textStart);

                TermsHashPerField.InitReader(freq, termID, 0);
                if (readPositions || readOffsets)
                {
                    TermsHashPerField.InitReader(prox, termID, 1);
                }

                // TODO: really TermsHashPerField should take over most
                // of this loop, including merge sort of terms from
                // multiple threads and interacting with the
                // TermsConsumer, only calling out to us (passing us the
                // DocsConsumer) to handle delivery of docs/positions

                PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

                int? delDocLimit;
                if (segDeletes != null)
                {
                    protoTerm.Bytes_Renamed = text;
                    int? docIDUpto;
                    segDeletes.TryGetValue(protoTerm, out docIDUpto);
                    if (docIDUpto != null)
                    {
                        delDocLimit = docIDUpto;
                    }
                    else
                    {
                        delDocLimit = 0;
                    }
                }
                else
                {
                    delDocLimit = 0;
                }

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                int  docFreq       = 0;
                long totalTermFreq = 0;
                int  docID         = 0;

                while (true)
                {
                    //System.out.println("  cycle");
                    int termFreq;
                    if (freq.Eof())
                    {
                        if (postings.LastDocCodes[termID] != -1)
                        {
                            // Return last doc
                            docID = postings.LastDocIDs[termID];
                            if (readTermFreq)
                            {
                                termFreq = postings.TermFreqs[termID];
                            }
                            else
                            {
                                termFreq = -1;
                            }
                            postings.LastDocCodes[termID] = -1;
                        }
                        else
                        {
                            // EOF
                            break;
                        }
                    }
                    else
                    {
                        int code = freq.ReadVInt();
                        if (!readTermFreq)
                        {
                            docID   += code;
                            termFreq = -1;
                        }
                        else
                        {
                            docID += (int)((uint)code >> 1);
                            if ((code & 1) != 0)
                            {
                                termFreq = 1;
                            }
                            else
                            {
                                termFreq = freq.ReadVInt();
                            }
                        }

                        Debug.Assert(docID != postings.LastDocIDs[termID]);
                    }

                    docFreq++;
                    Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

                    // NOTE: we could check here if the docID was
                    // deleted, and skip it.  However, this is somewhat
                    // dangerous because it can yield non-deterministic
                    // behavior since we may see the docID before we see
                    // the term that caused it to be deleted.  this
                    // would mean some (but not all) of its postings may
                    // make it into the index, which'd alter the docFreq
                    // for those terms.  We could fix this by doing two
                    // passes, ie first sweep marks all del docs, and
                    // 2nd sweep does the real flush, but I suspect
                    // that'd add too much time to flush.
                    visitedDocs.Set(docID);
                    postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
                    if (docID < delDocLimit)
                    {
                        // Mark it deleted.  TODO: we could also skip
                        // writing its postings; this would be
                        // deterministic (just for this Term's docs).

                        // TODO: can we do this reach-around in a cleaner way????
                        if (state.LiveDocs == null)
                        {
                            state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount);
                        }
                        if (state.LiveDocs.Get(docID))
                        {
                            state.DelCountOnFlush++;
                            state.LiveDocs.Clear(docID);
                        }
                    }

                    totalTermFreq += termFreq;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.

                    if (readPositions || readOffsets)
                    {
                        // we did record positions (& maybe payload) and/or offsets
                        int position = 0;
                        int offset   = 0;
                        for (int j = 0; j < termFreq; j++)
                        {
                            BytesRef thisPayload;

                            if (readPositions)
                            {
                                int code = prox.ReadVInt();
                                position += (int)((uint)code >> 1);

                                if ((code & 1) != 0)
                                {
                                    // this position has a payload
                                    int payloadLength = prox.ReadVInt();

                                    if (Payload == null)
                                    {
                                        Payload       = new BytesRef();
                                        Payload.Bytes = new sbyte[payloadLength];
                                    }
                                    else if (Payload.Bytes.Length < payloadLength)
                                    {
                                        Payload.Grow(payloadLength);
                                    }

                                    prox.ReadBytes(Payload.Bytes, 0, payloadLength);
                                    Payload.Length = payloadLength;
                                    thisPayload    = Payload;
                                }
                                else
                                {
                                    thisPayload = null;
                                }

                                if (readOffsets)
                                {
                                    int startOffset = offset + prox.ReadVInt();
                                    int endOffset   = startOffset + prox.ReadVInt();
                                    if (writePositions)
                                    {
                                        if (writeOffsets)
                                        {
                                            Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                            postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                        }
                                        else
                                        {
                                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                        }
                                    }
                                    offset = startOffset;
                                }
                                else if (writePositions)
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                        }
                    }
                    postingsConsumer.FinishDoc();
                }
                termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
                sumTotalTermFreq += totalTermFreq;
                sumDocFreq       += docFreq;
            }

            termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
        }
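In the flush loop above, the freq slice packs each document into a VInt code whose low bit flags the common termFreq == 1 case: the docID delta sits in the upper bits, and only when the bit is clear does a second VInt carry the frequency. A sketch of both directions of that convention (the write side is inferred from the read loop, not taken from this class):

// Encode: fold termFreq == 1 into the low bit, else emit the freq separately.
static void WriteDocAndFreq(ByteSliceWriter writer, int docDelta, int termFreq)
{
    if (termFreq == 1)
    {
        writer.WriteVInt(docDelta << 1 | 1);
    }
    else
    {
        writer.WriteVInt(docDelta << 1);
        writer.WriteVInt(termFreq);
    }
}

// Decode: mirrors the branch in the flush loop above.
static void ReadDocAndFreq(ByteSliceReader freq, out int docDelta, out int termFreq)
{
    int code = freq.ReadVInt();
    docDelta = (int)((uint)code >> 1);                // unsigned shift, as above
    termFreq = (code & 1) != 0 ? 1 : freq.ReadVInt();
}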
Example #4
        internal void FinishDocument()
        {
            Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.bytesHash.Count;

            BytesRef flushTerm = termsWriter.flushTerm;

            Debug.Assert(numPostings >= 0);

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            // this is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            Debug.Assert(termsWriter.VectorFieldsInOrder(fieldInfo));

            TermVectorsPostingsArray postings = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
            TermVectorsWriter        tv       = termsWriter.writer;

            int[] termIDs = termsHashPerField.SortPostings(tv.Comparer);

            tv.StartField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);

            ByteSliceReader posReader = doVectorPositions ? termsWriter.vectorSliceReaderPos : null;
            ByteSliceReader offReader = doVectorOffsets ? termsWriter.vectorSliceReaderOff : null;

            ByteBlockPool termBytePool = termsHashPerField.termBytePool;

            for (int j = 0; j < numPostings; j++)
            {
                int termID = termIDs[j];
                int freq   = postings.freqs[termID];

                // Get BytesRef
                termBytePool.SetBytesRef(flushTerm, postings.textStarts[termID]);
                tv.StartTerm(flushTerm, freq);

                if (doVectorPositions || doVectorOffsets)
                {
                    if (posReader != null)
                    {
                        termsHashPerField.InitReader(posReader, termID, 0);
                    }
                    if (offReader != null)
                    {
                        termsHashPerField.InitReader(offReader, termID, 1);
                    }
                    tv.AddProx(freq, posReader, offReader);
                }
                tv.FinishTerm();
            }
            tv.FinishField();

            termsHashPerField.Reset();

            fieldInfo.SetStoreTermVectors();
        }
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.tvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();
            perThread.termsHashPerThread.Reset(false);
        }
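The prefix/suffix bookkeeping above is front coding: each term is written as the count of leading bytes it shares with the previous term plus the remaining suffix, which is also why two UTF-8 encoders are swapped instead of copying the last term's bytes. The encoding step pulled out as a sketch (a hypothetical helper, not code from this class):

// Front-code one term against its predecessor: shared-prefix length,
// suffix length, then the suffix bytes (mirrors the tvf writes above).
static void WriteFrontCoded(IndexOutput output, byte[] prev, int prevLen,
                            byte[] curr, int currLen)
{
    int prefix = 0;
    while (prefix < prevLen && prefix < currLen && prev[prefix] == curr[prefix])
    {
        prefix++;                          // common prefix with the last term
    }
    output.WriteVInt(prefix);
    output.WriteVInt(currLen - prefix);    // suffix length
    output.WriteBytes(curr, prefix, currLen - prefix);
}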
Example #6
        public virtual void TestBasic()
        {
            ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random().Next(100)));

            int NUM_STREAM = AtLeast(100);

            ByteSliceWriter writer = new ByteSliceWriter(pool);

            int[] starts = new int[NUM_STREAM];
            int[] uptos = new int[NUM_STREAM];
            int[] counters = new int[NUM_STREAM];

            ByteSliceReader reader = new ByteSliceReader();

            for (int ti = 0; ti < 100; ti++)
            {
                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    starts[stream] = -1;
                    counters[stream] = 0;
                }

                int num = AtLeast(3000);
                for (int iter = 0; iter < num; iter++)
                {
                    int stream;
                    if (Random().NextBoolean())
                    {
                        stream = Random().Next(3);
                    }
                    else
                    {
                        stream = Random().Next(NUM_STREAM);
                    }

                    if (VERBOSE)
                    {
                        Console.WriteLine("write stream=" + stream);
                    }

                    if (starts[stream] == -1)
                    {
                        int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                        starts[stream] = uptos[stream] = spot + pool.ByteOffset;
                        if (VERBOSE)
                        {
                            Console.WriteLine("  init to " + starts[stream]);
                        }
                    }

                    writer.Init(uptos[stream]);
                    int numValue;
                    if (Random().Next(10) == 3)
                    {
                        numValue = Random().Next(100);
                    }
                    else if (Random().Next(5) == 3)
                    {
                        numValue = Random().Next(3);
                    }
                    else
                    {
                        numValue = Random().Next(20);
                    }

                    for (int j = 0; j < numValue; j++)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("    write " + (counters[stream] + j));
                        }
                        // write some large (incl. negative) ints:
                        writer.WriteVInt(Random().Next());
                        writer.WriteVInt(counters[stream] + j);
                    }
                    counters[stream] += numValue;
                    uptos[stream] = writer.Address;
                    if (VERBOSE)
                    {
                        Console.WriteLine("    addr now " + uptos[stream]);
                    }
                }

                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("  stream=" + stream + " count=" + counters[stream]);
                    }

                    if (starts[stream] != -1 && starts[stream] != uptos[stream])
                    {
                        reader.Init(pool, starts[stream], uptos[stream]);
                        for (int j = 0; j < counters[stream]; j++)
                        {
                            reader.ReadVInt();
                            Assert.AreEqual(j, reader.ReadVInt());
                        }
                    }
                }

                pool.Reset();
            }
        }
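Stripped of the randomization, the protocol this test exercises is small: allocate a first-level slice, point the writer at it, write, remember writer.Address as the end, then hand (start, end) to a reader. A minimal round trip under that reading, using the same APIs as the test above:

// Minimal write/read round trip (a sketch, not an additional test case).
var pool   = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, 10));
var writer = new ByteSliceWriter(pool);

int start = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE) + pool.ByteOffset;
writer.Init(start);
for (int i = 0; i < 5; i++)
{
    writer.WriteVInt(i);                   // the slice grows transparently as needed
}
int end = writer.Address;

var reader = new ByteSliceReader();
reader.Init(pool, start, end);
for (int i = 0; i < 5; i++)
{
    Assert.AreEqual(i, reader.ReadVInt());
}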
        public virtual void TestBasic()
        {
            ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator(), false);

            int NUM_STREAM = 25;

            ByteSliceWriter writer = new ByteSliceWriter(pool);

            int[] starts   = new int[NUM_STREAM];
            int[] uptos    = new int[NUM_STREAM];
            int[] counters = new int[NUM_STREAM];

            System.Random r = NewRandom();

            ByteSliceReader reader = new ByteSliceReader();

            for (int ti = 0; ti < 100; ti++)
            {
                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    starts[stream]   = -1;
                    counters[stream] = 0;
                }

                bool debug = false;

                for (int iter = 0; iter < 10000; iter++)
                {
                    int stream = r.Next(NUM_STREAM);
                    if (debug)
                    {
                        System.Console.Out.WriteLine("write stream=" + stream);
                    }

                    if (starts[stream] == -1)
                    {
                        int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE_ForNUnit);
                        starts[stream] = uptos[stream] = spot + pool.byteOffset;
                        if (debug)
                        {
                            System.Console.Out.WriteLine("  init to " + starts[stream]);
                        }
                    }

                    writer.Init(uptos[stream]);
                    int numValue = r.Next(20);
                    for (int j = 0; j < numValue; j++)
                    {
                        if (debug)
                        {
                            System.Console.Out.WriteLine("    write " + (counters[stream] + j));
                        }
                        writer.WriteVInt(counters[stream] + j);
                        //writer.writeVInt(ti);
                    }
                    counters[stream] += numValue;
                    uptos[stream]     = writer.GetAddress();
                    if (debug)
                    {
                        System.Console.Out.WriteLine("    addr now " + uptos[stream]);
                    }
                }

                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    if (debug)
                    {
                        System.Console.Out.WriteLine("  stream=" + stream + " count=" + counters[stream]);
                    }

                    if (starts[stream] != uptos[stream])
                    {
                        reader.Init(pool, starts[stream], uptos[stream]);
                        for (int j = 0; j < counters[stream]; j++)
                        {
                            Assert.AreEqual(j, reader.ReadVInt());
                        }
                        //Assert.AreEqual(ti, reader.readVInt());
                    }
                }

                pool.Reset();
            }
        }
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        internal void AppendPostings(FreqProxTermsWriterPerField[] fields, FormatPostingsFieldsConsumer consumer)
        {
            int numFields = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.NextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            FormatPostingsTermsConsumer termsConsumer = consumer.AddField(fields[0].fieldInfo);

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            bool currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;

            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text       = mergeStates[i].text;
                    int    textOffset = mergeStates[i].textOffset;
                    int    cmp        = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge    = 1;
                    }
                    else if (cmp == 0)
                    {
                        termStates[numToMerge++] = mergeStates[i];
                    }
                }

                FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(termStates[0].text, termStates[0].textOffset);

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                    {
                        if (termStates[i].docID < minState.docID)
                        {
                            minState = termStates[i];
                        }
                    }

                    int termDocFreq = minState.termFreq;

                    FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(minState.docID, termDocFreq);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTermFreqAndPositions)
                    {
                        // omitTermFreqAndPositions == false so we do write positions &
                        // payload
                        int position = 0;
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            position += (code >> 1);

                            int payloadLength;
                            if ((code & 1) != 0)
                            {
                                // This position has a payload
                                payloadLength = prox.ReadVInt();

                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
                                {
                                    payloadBuffer = new byte[payloadLength];
                                }

                                prox.ReadBytes(payloadBuffer, 0, payloadLength);
                            }
                            else
                            {
                                payloadLength = 0;
                            }

                            posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength);
                        } // End for

                        posConsumer.Finish();
                    }

                    if (!minState.NextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                        {
                            if (termStates[i] != minState)
                            {
                                termStates[upto++] = termStates[i];
                            }
                        }
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.NextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                            {
                                if (mergeStates[i] != minState)
                                {
                                    mergeStates[upto++] = mergeStates[i];
                                }
                            }
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                docConsumer.Finish();
            }

            termsConsumer.Finish();
        }
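Both removal branches above use the same compaction idiom: copy every surviving state forward over the exhausted one and shrink the logical length by one. Factored out as a sketch (a hypothetical helper, not part of the class):

// Remove one exhausted merge state in place, preserving order.
// Returns the new logical length (the callers assert it equals count - 1).
static int Compact<T>(T[] states, int count, T dead) where T : class
{
    int upto = 0;
    for (int i = 0; i < count; i++)
    {
        if (!ReferenceEquals(states[i], dead))
        {
            states[upto++] = states[i];
        }
    }
    return upto;
}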
Example #9
 public void InitReader(ByteSliceReader reader, int termID, int stream)
 {
     Debug.Assert(stream < StreamCount);
     int intStart = PostingsArray.IntStarts[termID];
     int[] ints = IntPool.Buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
     int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
     reader.Init(BytePool, PostingsArray.ByteStarts[termID] + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
 }
Example #10
        public virtual void TestBasic()
        {
            // LUCENENET specific: NUnit will crash with an OOM if we do the full test
            // with verbosity enabled. So, making this a manual setting that can be
            // turned on if, and only if, needed for debugging. If the setting is turned
            // on, we are decreasing the number of iterations by 1/3, which seems to
            // keep it from crashing.
            bool isVerbose = false;

            if (!isVerbose)
            {
                Console.WriteLine("Verbosity disabled to keep NUnit from running out of memory - enable manually");
            }

            ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random.Next(100)));

            int NUM_STREAM = AtLeast(100);

            ByteSliceWriter writer = new ByteSliceWriter(pool);

            int[] starts   = new int[NUM_STREAM];
            int[] uptos    = new int[NUM_STREAM];
            int[] counters = new int[NUM_STREAM];

            ByteSliceReader reader = new ByteSliceReader();

            for (int ti = 0; ti < 100; ti++)
            {
                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    starts[stream]   = -1;
                    counters[stream] = 0;
                }

                // LUCENENET NOTE: Since upgrading to NUnit 3, this test
                // will crash if VERBOSE is true because of an OutOfMemoryException.
                // This not only keeps this test from finishing, it crashes NUnit
                // and no other tests will run.
                // So, we need to allocate a smaller size to ensure this
                // doesn't happen with verbosity enabled.
                int num = isVerbose ? AtLeast(2000) : AtLeast(3000);
                for (int iter = 0; iter < num; iter++)
                {
                    int stream;
                    if (Random.NextBoolean())
                    {
                        stream = Random.Next(3);
                    }
                    else
                    {
                        stream = Random.Next(NUM_STREAM);
                    }

                    if (isVerbose)
                    {
                        Console.WriteLine("write stream=" + stream);
                    }

                    if (starts[stream] == -1)
                    {
                        int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                        starts[stream] = uptos[stream] = spot + pool.ByteOffset;
                        if (isVerbose)
                        {
                            Console.WriteLine("  init to " + starts[stream]);
                        }
                    }

                    writer.Init(uptos[stream]);
                    int numValue;
                    if (Random.Next(10) == 3)
                    {
                        numValue = Random.Next(100);
                    }
                    else if (Random.Next(5) == 3)
                    {
                        numValue = Random.Next(3);
                    }
                    else
                    {
                        numValue = Random.Next(20);
                    }

                    for (int j = 0; j < numValue; j++)
                    {
                        if (isVerbose)
                        {
                            Console.WriteLine("    write " + (counters[stream] + j));
                        }
                        // write some large (incl. negative) ints:
                        writer.WriteVInt32(Random.Next());
                        writer.WriteVInt32(counters[stream] + j);
                    }
                    counters[stream] += numValue;
                    uptos[stream]     = writer.Address;
                    if (isVerbose)
                    {
                        Console.WriteLine("    addr now " + uptos[stream]);
                    }
                }

                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    if (isVerbose)
                    {
                        Console.WriteLine("  stream=" + stream + " count=" + counters[stream]);
                    }

                    if (starts[stream] != -1 && starts[stream] != uptos[stream])
                    {
                        reader.Init(pool, starts[stream], uptos[stream]);
                        for (int j = 0; j < counters[stream]; j++)
                        {
                            reader.ReadVInt32();
                            Assert.AreEqual(j, reader.ReadVInt32());
                        }
                    }
                }

                pool.Reset();
            }
        }
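The VInts these tests round-trip are base-128 varints: seven payload bits per byte, high bit set while more bytes follow. That is why the test makes a point of writing "some large (incl. negative) ints": a negative value has its top bit set and always occupies the maximum five bytes. A self-contained sketch of that wire format (the standard Lucene-style VInt, not the classes under test):

using System.IO;

// Standard Lucene-style VInt: 7 data bits per byte, high bit means "more".
static void WriteVInt(Stream output, int value)
{
    uint v = (uint)value;                  // negative ints become large uints
    while (v >= 0x80)
    {
        output.WriteByte((byte)(v | 0x80));
        v >>= 7;
    }
    output.WriteByte((byte)v);
}

static int ReadVInt(Stream input)
{
    int b = input.ReadByte();
    int result = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7)
    {
        b = input.ReadByte();
        result |= (b & 0x7F) << shift;
    }
    return result;
}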
Example #11
		public virtual void TestBasic()
		{
			ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator(), false);
			
			int NUM_STREAM = 25;
			
			ByteSliceWriter writer = new ByteSliceWriter(pool);
			
			int[] starts = new int[NUM_STREAM];
			int[] uptos = new int[NUM_STREAM];
			int[] counters = new int[NUM_STREAM];
			
			System.Random r = NewRandom();
			
			ByteSliceReader reader = new ByteSliceReader();
			
			for (int ti = 0; ti < 100; ti++)
			{
				
				for (int stream = 0; stream < NUM_STREAM; stream++)
				{
					starts[stream] = -1;
					counters[stream] = 0;
				}
				
				bool debug = false;
				
				for (int iter = 0; iter < 10000; iter++)
				{
					int stream = r.Next(NUM_STREAM);
					if (debug)
						System.Console.Out.WriteLine("write stream=" + stream);
					
					if (starts[stream] == -1)
					{
						int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE_ForNUnit);
						starts[stream] = uptos[stream] = spot + pool.byteOffset;
						if (debug)
							System.Console.Out.WriteLine("  init to " + starts[stream]);
					}
					
					writer.Init(uptos[stream]);
					int numValue = r.Next(20);
					for (int j = 0; j < numValue; j++)
					{
						if (debug)
							System.Console.Out.WriteLine("    write " + (counters[stream] + j));
						writer.WriteVInt(counters[stream] + j);
						//writer.writeVInt(ti);
					}
					counters[stream] += numValue;
					uptos[stream] = writer.GetAddress();
					if (debug)
						System.Console.Out.WriteLine("    addr now " + uptos[stream]);
				}
				
				for (int stream = 0; stream < NUM_STREAM; stream++)
				{
					if (debug)
						System.Console.Out.WriteLine("  stream=" + stream + " count=" + counters[stream]);
					
					if (starts[stream] != uptos[stream])
					{
						reader.Init(pool, starts[stream], uptos[stream]);
						for (int j = 0; j < counters[stream]; j++)
							Assert.AreEqual(j, reader.ReadVInt());
						//Assert.AreEqual(ti, reader.readVInt());
					}
				}
				
				pool.Reset();
			}
		}
		public void InitReader(ByteSliceReader reader, RawPostingList p, int stream)
		{
			System.Diagnostics.Debug.Assert(stream < streamCount);
			int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
			int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
			reader.Init(bytePool, p.byteStart + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
		}
Example #13
        public virtual void TestBasic()
        {
            ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random().Next(100)));

            int NUM_STREAM = AtLeast(100);

            ByteSliceWriter writer = new ByteSliceWriter(pool);

            int[] starts   = new int[NUM_STREAM];
            int[] uptos    = new int[NUM_STREAM];
            int[] counters = new int[NUM_STREAM];

            ByteSliceReader reader = new ByteSliceReader();

            for (int ti = 0; ti < 100; ti++)
            {
                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    starts[stream]   = -1;
                    counters[stream] = 0;
                }

                int num = AtLeast(3000);
                for (int iter = 0; iter < num; iter++)
                {
                    int stream;
                    if (Random().NextBoolean())
                    {
                        stream = Random().Next(3);
                    }
                    else
                    {
                        stream = Random().Next(NUM_STREAM);
                    }

                    if (VERBOSE)
                    {
                        Console.WriteLine("write stream=" + stream);
                    }

                    if (starts[stream] == -1)
                    {
                        int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                        starts[stream] = uptos[stream] = spot + pool.ByteOffset;
                        if (VERBOSE)
                        {
                            Console.WriteLine("  init to " + starts[stream]);
                        }
                    }

                    writer.Init(uptos[stream]);
                    int numValue;
                    if (Random().Next(10) == 3)
                    {
                        numValue = Random().Next(100);
                    }
                    else if (Random().Next(5) == 3)
                    {
                        numValue = Random().Next(3);
                    }
                    else
                    {
                        numValue = Random().Next(20);
                    }

                    for (int j = 0; j < numValue; j++)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("    write " + (counters[stream] + j));
                        }
                        // write some large (incl. negative) ints:
                        writer.WriteVInt(Random().Next());
                        writer.WriteVInt(counters[stream] + j);
                    }
                    counters[stream] += numValue;
                    uptos[stream]     = writer.Address;
                    if (VERBOSE)
                    {
                        Console.WriteLine("    addr now " + uptos[stream]);
                    }
                }

                for (int stream = 0; stream < NUM_STREAM; stream++)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("  stream=" + stream + " count=" + counters[stream]);
                    }

                    if (starts[stream] != -1 && starts[stream] != uptos[stream])
                    {
                        reader.Init(pool, starts[stream], uptos[stream]);
                        for (int j = 0; j < counters[stream]; j++)
                        {
                            reader.ReadVInt();
                            Assert.AreEqual(j, reader.ReadVInt());
                        }
                    }
                }

                pool.Reset();
            }
        }
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields   = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int  skipInterval       = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text       = mergeStates[i].text;
                    int    textOffset = mergeStates[i].textOffset;
                    int    cmp        = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge    = 1;
                    }
                    else if (cmp == 0)
                    {
                        termStates[numToMerge++] = mergeStates[i];
                    }
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int    start        = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                {
                    proxPointer = proxOut.GetFilePointer();
                }
                else
                {
                    proxPointer = 0;
                }

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                    {
                        if (termStates[i].docID < minState.docID)
                        {
                            minState = termStates[i];
                        }
                    }

                    int doc         = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                {
                                    payloadLength = 0;
                                }
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                {
                                    proxOut.WriteVInt(code & (~1));
                                }
                                if (payloadLength > 0)
                                {
                                    copyBytes(prox, proxOut, payloadLength);
                                }
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } // End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                        {
                            if (termStates[i] != minState)
                            {
                                termStates[upto++] = termStates[i];
                            }
                        }
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                            {
                                if (mergeStates[i] != minState)
                                {
                                    mergeStates[upto++] = mergeStates[i];
                                }
                            }
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // while during the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
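When payloads are stored, the prox encoding above repurposes the low bit of each position delta to mean "payload length changed", so a run of equal-length payloads costs no extra header bytes. The convention pulled out as a sketch (a hypothetical helper; IndexOutput as used above):

// Write one position using the payload-length-delta convention from the loop
// above. The payload bytes themselves are copied separately, as in the loop.
static void WritePosition(IndexOutput proxOut, int positionDelta,
                          int payloadLength, ref int lastPayloadLength)
{
    int code = positionDelta << 1;         // low bit reserved for the flag
    if (payloadLength != lastPayloadLength)
    {
        proxOut.WriteVInt(code | 1);       // flag set: the new length follows
        proxOut.WriteVInt(payloadLength);
        lastPayloadLength = payloadLength;
    }
    else
    {
        proxOut.WriteVInt(code);           // flag clear: reuse the last length
    }
}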