Пример #1
0
 /// <summary>
 /// Appends one skip entry to <c>_skipOut</c>. The entry holds, as VInt64
 /// deltas against the values remembered from the previous call, the current
 /// file pointers of the stats, meta-longs and meta-bytes outputs, followed by
 /// the first <c>_longsSize</c> values of <c>_lastLongs</c>. The current values
 /// are then stored as the new baseline for the next entry.
 /// </summary>
 private void BufferSkip()
 {
     // Nothing below writes to these three outputs, so one read each suffices.
     var statsFp     = _statsOut.GetFilePointer();
     var metaLongsFp = _metaLongsOut.GetFilePointer();
     var metaBytesFp = _metaBytesOut.GetFilePointer();

     _skipOut.WriteVInt64(statsFp - _lastBlockStatsFp);
     _skipOut.WriteVInt64(metaLongsFp - _lastBlockMetaLongsFp);
     _skipOut.WriteVInt64(metaBytesFp - _lastBlockMetaBytesFp);

     for (var slot = 0; slot < _longsSize; slot++)
     {
         var longDelta = _lastLongs[slot] - _lastBlockLongs[slot];
         _skipOut.WriteVInt64(longDelta);
     }

     // Remember the current position as the baseline for the next skip entry.
     _lastBlockStatsFp     = statsFp;
     _lastBlockMetaLongsFp = metaLongsFp;
     _lastBlockMetaBytesFp = metaBytesFp;
     Array.Copy(_lastLongs, 0, _lastBlockLongs, 0, _longsSize);
 }
Пример #2
0
            /// <summary>
            /// Finishes the current term: asks the postings writer to encode the
            /// term's metadata and adds the term, with that metadata as output,
            /// to the FST builder.
            /// </summary>
            /// <param name="text">Bytes of the term being finished.</param>
            /// <param name="stats">Document frequency and total term frequency of the term.</param>
            public override void FinishTerm(BytesRef text, TermStats stats)
            {
                var postings  = _outerInstance._postingsWriter;
                var termState = postings.NewTermState();

                // The term state and the FST output carry the same statistics.
                termState.DocFreq       = stats.DocFreq;
                termState.TotalTermFreq = stats.TotalTermFreq;

                var termData = new FSTTermOutputs.TermData
                {
                    longs         = new long[_longsSize],
                    bytes         = null,
                    docFreq       = termState.DocFreq,
                    totalTermFreq = termState.TotalTermFreq
                };

                postings.FinishTerm(termState);
                postings.EncodeTerm(termData.longs, _metaWriter, _fieldInfo, termState, true);

                // Whatever EncodeTerm wrote into the scratch writer becomes the
                // byte part of the output; an empty write leaves bytes as null.
                var metaLength = (int)_metaWriter.GetFilePointer();
                if (metaLength > 0)
                {
                    var metaBytes = new byte[metaLength];
                    _metaWriter.WriteTo(metaBytes, 0);
                    _metaWriter.Reset();
                    termData.bytes = metaBytes;
                }

                var fstInput = Util.ToInt32sRef(text, _scratchTerm);
                _builder.Add(fstInput, termData);
                _numTerms++;
            }
Пример #3
0
            /// <summary>
            /// Finishes the current term: writes its statistics to the stats
            /// output, its metadata to the meta-longs / meta-bytes outputs, and
            /// adds the term to the FST builder with the term's index
            /// (<c>_numTerms</c> before increment) as output.
            /// </summary>
            /// <param name="text">Bytes of the term being finished.</param>
            /// <param name="stats">Document frequency and total term frequency of the term.</param>
            public override void FinishTerm(BytesRef text, TermStats stats)
            {
                // Emit a skip entry before every SKIP_INTERVAL-th term (never before the first).
                var atSkipBoundary = _numTerms > 0 && _numTerms % SKIP_INTERVAL == 0;
                if (atSkipBoundary)
                {
                    BufferSkip();
                }

                var termLongs = new long[_longsSize];
                long delta = stats.TotalTermFreq - stats.DocFreq;

                // Stats encoding:
                //   TotalTermFreq <= 0          -> DocFreq as-is
                //   TotalTermFreq == DocFreq    -> (DocFreq << 1) | 1
                //   otherwise                   -> (DocFreq << 1), then the difference as VInt64
                if (stats.TotalTermFreq <= 0)
                {
                    _statsOut.WriteVInt32(stats.DocFreq);
                }
                else if (delta == 0)
                {
                    _statsOut.WriteVInt32((stats.DocFreq << 1) | 1);
                }
                else
                {
                    _statsOut.WriteVInt32(stats.DocFreq << 1);
                    _statsOut.WriteVInt64(delta);
                }

                var postings  = _outerInstance.postingsWriter;
                var termState = postings.NewTermState();
                termState.DocFreq       = stats.DocFreq;
                termState.TotalTermFreq = stats.TotalTermFreq;
                postings.FinishTerm(termState);
                postings.EncodeTerm(termLongs, _metaBytesOut, _fieldInfo, termState, true);

                // Each long is stored as a delta against the previous term's value.
                for (var slot = 0; slot < _longsSize; slot++)
                {
                    var current = termLongs[slot];
                    _metaLongsOut.WriteVInt64(current - _lastLongs[slot]);
                    _lastLongs[slot] = current;
                }

                // Number of meta bytes written since the previous term.
                var metaBytesFp = _metaBytesOut.GetFilePointer();
                _metaLongsOut.WriteVInt64(metaBytesFp - _lastMetaBytesFp);

                _builder.Add(Util.ToInt32sRef(text, _scratchTerm), _numTerms);
                _numTerms++;

                _lastMetaBytesFp = metaBytesFp;
            }
Пример #4
0
 /// <summary>
 /// Resets the per-term counters of this writer so it can be used for the
 /// next term. The internal buffer is expected to be empty already (checked
 /// by a debug assertion).
 /// </summary>
 /// <returns>This instance, to allow call chaining.</returns>
 public virtual PostingsWriter Reset()
 {
     Debug.Assert(buffer.GetFilePointer() == 0);

     // -1 is the initial value for the offset length; the others start at zero.
     lastOffsetLength = -1;
     lastPayloadLen   = 0;
     docCount         = 0;
     lastDocID        = 0;

     return this;
 }
Пример #5
0
            /// <summary>
            /// Finishes the current term: builds a single byte blob consisting of
            /// a small stats header followed by the postings bytes buffered in
            /// <c>postingsWriter.buffer</c>, and adds a copy of it to the FST
            /// builder as the term's output.
            /// </summary>
            /// <param name="text">Bytes of the term being finished.</param>
            /// <param name="stats">Document frequency and total term frequency of the term.</param>
            public override void FinishTerm(BytesRef text, TermStats stats)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(postingsWriter.docCount == stats.DocFreq);
                    Debugging.Assert(buffer2.GetFilePointer() == 0);
                }

                // Header: DocFreq, plus (TotalTermFreq - DocFreq) unless the
                // field is indexed as DOCS_ONLY.
                buffer2.WriteVInt32(stats.DocFreq);
                bool writeTotalTermFreq = field.IndexOptions != IndexOptions.DOCS_ONLY;
                if (writeTotalTermFreq)
                {
                    buffer2.WriteVInt64(stats.TotalTermFreq - stats.DocFreq);
                }

                // The header is copied to the start of finalBuffer before any
                // resize below.
                int headerLength = (int)buffer2.GetFilePointer();
                buffer2.WriteTo(finalBuffer, 0);
                buffer2.Reset();

                var postingsBuffer = postingsWriter.buffer;
                int totalBytes = headerLength + (int)postingsBuffer.GetFilePointer();
                if (finalBuffer.Length < totalBytes)
                {
                    finalBuffer = ArrayUtil.Grow(finalBuffer, totalBytes);
                }

                // Postings bytes go right after the header.
                postingsBuffer.WriteTo(finalBuffer, headerLength);
                postingsBuffer.Reset();

                spare.Bytes  = finalBuffer;
                spare.Length = totalBytes;

                // finalBuffer is reused across terms, hence the deep copy.
                var fstInput = Util.ToInt32sRef(text, scratchIntsRef);
                builder.Add(fstInput, BytesRef.DeepCopyOf(spare));
                termCount++;
            }
Пример #6
0
 /// <summary>
 /// Consumes every document of <paramref name="in"/>, maps each doc ID through
 /// <paramref name="docMap"/>, buffers the per-document position data into an
 /// in-memory file, and sorts the (doc, offset) pairs so the postings can be
 /// read back from <c>postingInput</c>.
 /// </summary>
 /// <param name="maxDoc">Stored on the instance and used to size a new sorter.</param>
 /// <param name="reuse">Optional previous instance whose arrays, payload, file and
 /// (when its <c>maxDoc</c> matches) sorter are taken over; may be <c>null</c>.</param>
 /// <param name="in">The wrapped enum to drain.</param>
 /// <param name="docMap">Mapping from old to new document IDs.</param>
 /// <param name="storeOffsets">Stored on the instance.</param>
 internal SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, DocsAndPositionsEnum @in, Sorter.DocMap docMap, bool storeOffsets)
     : base(@in)
 {
     this.maxDoc       = maxDoc;
     this.storeOffsets = storeOffsets;

     if (reuse == null)
     {
         // Nothing to recycle: start with small buffers.
         docs    = new int[32];
         offsets = new long[32];
         payload = new BytesRef(32);
         file    = new RAMFile();
         sorter  = new DocOffsetSorter(maxDoc);
     }
     else
     {
         docs    = reuse.docs;
         offsets = reuse.offsets;
         payload = reuse.payload;
         file    = reuse.file;
         // The old sorter is only taken over when it was built for the same maxDoc.
         sorter  = reuse.maxDoc == maxDoc ? reuse.sorter : new DocOffsetSorter(maxDoc);
     }

     using (IndexOutput @out = new RAMOutputStream(file))
     {
         int count = 0;
         for (int doc = @in.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = @in.NextDoc())
         {
             if (count == docs.Length)
             {
                 // Grow both parallel arrays together.
                 int grownLength = ArrayUtil.Oversize(count + 1, 4);
                 docs    = Arrays.CopyOf(docs, grownLength);
                 offsets = Arrays.CopyOf(offsets, grownLength);
             }

             // Record where this document's positions start in the buffer.
             docs[count]    = docMap.OldToNew(doc);
             offsets[count] = @out.GetFilePointer();
             AddPositions(@in, @out);
             count++;
         }

         upto = count;
         sorter.Reset(docs, offsets);
         sorter.Sort(0, upto);
     }

     this.postingInput = new RAMInputStream("", file);
 }
Пример #7
0
        /// <summary>
        /// Called when we are done adding docs to this term.
        /// <para>
        /// When <c>_pendingCount</c> is -1 the term's statistics are copied to the
        /// wrapped state and the wrapped postings writer finishes the term; no
        /// inlined bytes are produced. Otherwise the entries buffered in
        /// <c>_pending</c> are serialized into <c>_buffer</c> (layout depends on
        /// <c>_indexOptions</c>) and stored in the state's <c>bytes</c>.
        /// In both cases <c>_pendingCount</c> is reset to 0 afterwards.
        /// </para>
        /// </summary>
        /// <param name="state">Term state; it is cast to <c>PulsingTermState</c>.</param>
        public override void FinishTerm(BlockTermState state)
        {
            var state2 = (PulsingTermState)state;

            Debug.Assert(_pendingCount > 0 || _pendingCount == -1);

            // NOTE(review): -1 presumably means the postings were already handed
            // to the wrapped writer by the code that buffers positions — confirm
            // against where _pendingCount is set.
            if (_pendingCount == -1)
            {
                state2.wrappedState.DocFreq       = state2.DocFreq;
                state2.wrappedState.TotalTermFreq = state2.TotalTermFreq;
                state2.bytes = null;
                _wrappedPostingsWriter.FinishTerm(state2.wrappedState);
            }
            else
            {
                // There were few enough total occurrences for this
                // term, so we fully inline our postings data into
                // terms dict, now:

                // TODO: it'd be better to share this encoding logic
                // in some inner codec that knows how to write a
                // single doc / single position, etc.  This way if a
                // given codec wants to store other interesting
                // stuff, it could use this pulsing codec to do so

                // Positions (and possibly offsets) are indexed: _pending holds
                // one entry per position, grouped by document.
                if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
                {
                    var lastDocID         = 0;
                    var pendingIDX        = 0;
                    // These two are declared outside the per-document loop, so the
                    // "length changed" comparison carries over from one document
                    // to the next.
                    var lastPayloadLength = -1;
                    var lastOffsetLength  = -1;
                    // pendingIDX is only advanced by the inner position loop.
                    while (pendingIDX < _pendingCount)
                    {
                        // First pending entry of the next document; it supplies
                        // docID and termFreq for the document header.
                        var doc = _pending[pendingIDX];

                        var delta = doc.docID - lastDocID;
                        lastDocID = doc.docID;

                        // if (DEBUG) System.out.println("  write doc=" + doc.docID + " freq=" + doc.termFreq);

                        // Doc header: doc ID delta shifted left by one; low bit set
                        // means termFreq == 1, otherwise termFreq follows.
                        if (doc.termFreq == 1)
                        {
                            _buffer.WriteVInt32((delta << 1) | 1);
                        }
                        else
                        {
                            _buffer.WriteVInt32(delta << 1);
                            _buffer.WriteVInt32(doc.termFreq);
                        }

                        // Position and start-offset deltas restart at 0 for each document.
                        var lastPos    = 0;
                        var lastOffset = 0;
                        for (var posIDX = 0; posIDX < doc.termFreq; posIDX++)
                        {
                            var pos = _pending[pendingIDX++];
                            Debug.Assert(pos.docID == doc.docID);
                            var posDelta = pos.pos - lastPos;
                            lastPos = pos.pos;

                            var payloadLength = pos.payload == null ? 0 : pos.payload.Length;
                            // With payloads enabled the position delta is shifted left by
                            // one; low bit set means the payload length changed and the
                            // new length follows. Without payloads the delta is written as-is.
                            if (_storePayloads)
                            {
                                if (payloadLength != lastPayloadLength)
                                {
                                    _buffer.WriteVInt32((posDelta << 1) | 1);
                                    _buffer.WriteVInt32(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                {
                                    _buffer.WriteVInt32(posDelta << 1);
                                }
                            }
                            else
                            {
                                _buffer.WriteVInt32(posDelta);
                            }

                            // Offsets use the same scheme: start-offset delta shifted left
                            // by one, low bit set when the offset length
                            // (endOffset - startOffset) changed and is written next.
                            if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
                            {
                                //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
                                var offsetDelta  = pos.startOffset - lastOffset;
                                var offsetLength = pos.endOffset - pos.startOffset;
                                if (offsetLength != lastOffsetLength)
                                {
                                    _buffer.WriteVInt32(offsetDelta << 1 | 1);
                                    _buffer.WriteVInt32(offsetLength);
                                }
                                else
                                {
                                    _buffer.WriteVInt32(offsetDelta << 1);
                                }
                                lastOffset       = pos.startOffset;
                                lastOffsetLength = offsetLength;
                            }

                            // Payload bytes, if any, are written last for this position.
                            if (payloadLength > 0)
                            {
                                Debug.Assert(_storePayloads);
                                _buffer.WriteBytes(pos.payload.Bytes, 0, pos.payload.Length);
                            }
                        }
                    }
                }
                else if (_indexOptions == IndexOptions.DOCS_AND_FREQS)
                {
                    // Here every pending entry is treated as one document; the doc
                    // header uses the same low-bit encoding as above.
                    int lastDocId = 0;
                    for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
                    {
                        Position doc   = _pending[posIdx];
                        int      delta = doc.docID - lastDocId;

                        Debug.Assert(doc.termFreq != 0);

                        if (doc.termFreq == 1)
                        {
                            _buffer.WriteVInt32((delta << 1) | 1);
                        }
                        else
                        {
                            _buffer.WriteVInt32(delta << 1);
                            _buffer.WriteVInt32(doc.termFreq);
                        }
                        lastDocId = doc.docID;
                    }
                }
                else if (_indexOptions == IndexOptions.DOCS_ONLY)
                {
                    // Only doc ID deltas are written, one per pending entry.
                    int lastDocId = 0;
                    for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
                    {
                        Position doc = _pending[posIdx];
                        _buffer.WriteVInt32(doc.docID - lastDocId);
                        lastDocId = doc.docID;
                    }
                }

                // Move everything written to _buffer into the term state and
                // clear the buffer for the next term.
                state2.bytes = new byte[(int)_buffer.GetFilePointer()];
                _buffer.WriteTo(state2.bytes, 0);
                _buffer.Reset();
            }
            _pendingCount = 0;
        }
Пример #8
0
            /// <summary>
            /// Writes the <c>pendingCount</c> buffered terms to the output as one
            /// block: the term count and common prefix length, followed by three
            /// length-prefixed blobs (term suffixes, term stats, term metadata).
            /// Afterwards the last term of the block is remembered in
            /// <c>lastPrevTerm</c> and the pending count is reset to 0.
            /// </summary>
            private void FlushBlock()
            {
                var output = outerInstance.m_output;

                // Pass 1: the prefix every term in the block shares with the
                // term that preceded the block.
                int commonPrefix = SharedPrefix(lastPrevTerm, pendingTerms[0].Term);
                for (int idx = 1; idx < pendingCount; idx++)
                {
                    int shared = SharedPrefix(lastPrevTerm, pendingTerms[idx].Term);
                    commonPrefix = Math.Min(commonPrefix, shared);
                }

                output.WriteVInt32(pendingCount);
                output.WriteVInt32(commonPrefix);

                // Pass 2: suffix length + suffix bytes for every term.
                // TODO: cutover to better intblock codec, instead
                // of interleaving here:
                for (int idx = 0; idx < pendingCount; idx++)
                {
                    var term = pendingTerms[idx].Term;
                    int suffixLength = term.Length - commonPrefix;
                    bytesWriter.WriteVInt32(suffixLength);
                    bytesWriter.WriteBytes(term.Bytes, commonPrefix, suffixLength);
                }
                output.WriteVInt32((int)bytesWriter.GetFilePointer());
                bytesWriter.WriteTo(output);
                bytesWriter.Reset();

                // Pass 3: DocFreq, plus (TotalTermFreq - DocFreq) unless the
                // field is DOCS_ONLY.
                // TODO: cutover to better intblock codec.  simple64?
                for (int idx = 0; idx < pendingCount; idx++)
                {
                    BlockTermState termState = pendingTerms[idx].State;
                    Debug.Assert(termState != null);
                    bytesWriter.WriteVInt32(termState.DocFreq);
                    if (fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY)
                    {
                        bytesWriter.WriteVInt64(termState.TotalTermFreq - termState.DocFreq);
                    }
                }
                output.WriteVInt32((int)bytesWriter.GetFilePointer());
                bytesWriter.WriteTo(output);
                bytesWriter.Reset();

                // Pass 4: per-term metadata. Only the first term of the block is
                // encoded with absolute == true.
                long[] metaLongs = new long[longsSize];
                for (int idx = 0; idx < pendingCount; idx++)
                {
                    BlockTermState termState = pendingTerms[idx].State;
                    bool isFirstInBlock = idx == 0;
                    postingsWriter.EncodeTerm(metaLongs, bufferWriter, fieldInfo, termState, isFirstInBlock);

                    foreach (long value in metaLongs)
                    {
                        bytesWriter.WriteVInt64(value);
                    }
                    bufferWriter.WriteTo(bytesWriter);
                    bufferWriter.Reset();
                }
                output.WriteVInt32((int)bytesWriter.GetFilePointer());
                bytesWriter.WriteTo(output);
                bytesWriter.Reset();

                // The last term of this block is the reference for the next block's prefix.
                lastPrevTerm.CopyBytes(pendingTerms[pendingCount - 1].Term);
                pendingCount = 0;
            }