Example #1
        /// <summary>Add a new position &amp; payload </summary>
        internal override void  AddPosition(int position, byte[] payload, int payloadOffset, int payloadLength)
        {
            System.Diagnostics.Debug.Assert(!omitTermFreqAndPositions, "omitTermFreqAndPositions is true");
            System.Diagnostics.Debug.Assert(out_Renamed != null);

            int delta = position - lastPosition;

            lastPosition = position;

            if (storePayloads)
            {
                if (payloadLength != lastPayloadLength)
                {
                    lastPayloadLength = payloadLength;
                    out_Renamed.WriteVInt((delta << 1) | 1);
                    out_Renamed.WriteVInt(payloadLength);
                }
                else
                {
                    out_Renamed.WriteVInt(delta << 1);
                }
                if (payloadLength > 0)
                {
                    out_Renamed.WriteBytes(payload, payloadOffset, payloadLength); // write starting at payloadOffset, not 0
                }
            }
            else
            {
                out_Renamed.WriteVInt(delta);
            }
        }
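The matching read side is not shown on this page; as a rough sketch for a payload-storing field (assuming an IndexInput whose ReadVInt/ReadBytes mirror the writer calls above, with illustrative variable names), decoding one position reverses the low-bit trick:

        int code = input.ReadVInt();
        position += code >> 1;                     // undo the delta << 1 shift
        if ((code & 1) != 0)
        {
            payloadLength = input.ReadVInt();      // low bit set: payload length changed
        }
        if (payloadLength > 0)
        {
            input.ReadBytes(payloadBuffer, 0, payloadLength);
        }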
Example #2
        /// <summary>Adds a new doc in this term.  If this returns null
        /// then we just skip consuming positions/payloads.
        /// </summary>
        internal override FormatPostingsPositionsConsumer AddDoc(int docID, int termDocFreq)
        {
            int delta = docID - lastDocID;

            if (docID < 0 || (df > 0 && delta <= 0))
            {
                throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
            }

            if ((++df % skipInterval) == 0)
            {
                // TODO: abstraction violation
                skipListWriter.SetSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            System.Diagnostics.Debug.Assert(docID < totalNumDocs, "docID=" + docID + " totalNumDocs=" + totalNumDocs);

            lastDocID = docID;
            if (omitTermFreqAndPositions)
            {
                out_Renamed.WriteVInt(delta);
            }
            else if (1 == termDocFreq)
            {
                out_Renamed.WriteVInt((delta << 1) | 1);
            }
            else
            {
                out_Renamed.WriteVInt(delta << 1);
                out_Renamed.WriteVInt(termDocFreq);
            }

            return posWriter;
        }
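For reference, a reader of this doc/freq stream would reverse the encoding; a minimal sketch (illustrative names, assuming ReadVInt mirrors WriteVInt):

        int code = freqIn.ReadVInt();
        if (omitTermFreqAndPositions)
        {
            docID += code;                         // plain doc delta
        }
        else
        {
            docID += code >> 1;                    // undo the delta << 1 shift
            termDocFreq = (code & 1) != 0 ? 1 : freqIn.ReadVInt(); // low bit flags freq == 1
        }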
Example #3
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into freqOutput and
        /// the proxOutput streams.
        ///
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df      = 0;        // number of docs w/ term

            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi      = smis[i];
                TermPositions    postings = smi.GetPositions();
                int   base_Renamed        = smi.base_Renamed;
                int[] docMap = smi.GetDocMap();
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                    {
                        doc = docMap[doc];                   // map around deletions
                    }
                    doc += base_Renamed;                     // convert to merged space

                    if (doc < lastDoc)
                    {
                        throw new System.SystemException("docs out of order (" + doc + " < " + lastDoc + " )");
                    }

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1;                     // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1);                         // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode);                      // write doc
                        freqOutput.WriteVInt(freq);                         // write frequency in doc
                    }

                    int lastPosition = 0;                     // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }
Example #4
 public void  Write(IndexOutput output)
 {
     output.WriteVInt(Size());
     for (int i = 0; i < Size(); i++)
     {
         FieldInfo fi   = FieldInfo(i);
         byte      bits = (byte)(0x0);
         if (fi.isIndexed)
         {
             bits |= IS_INDEXED;
         }
         if (fi.storeTermVector)
         {
             bits |= STORE_TERMVECTOR;
         }
         if (fi.storePositionWithTermVector)
         {
             bits |= STORE_POSITIONS_WITH_TERMVECTOR;
         }
         if (fi.storeOffsetWithTermVector)
         {
             bits |= STORE_OFFSET_WITH_TERMVECTOR;
         }
         if (fi.omitNorms)
         {
             bits |= OMIT_NORMS;
         }
         output.WriteString(fi.name);
         output.WriteByte(bits);
     }
 }
Example #5
        /// <summary>Fills in no-term-vectors for all docs we haven't seen
        /// since the last doc that had term vectors.
        /// </summary>
        internal void  Fill(int docID)
        {
            int docStoreOffset = docWriter.DocStoreOffset;
            int end            = docID + docStoreOffset;

            if (lastDocID < end)
            {
                long tvfPosition = tvf.FilePointer;
                while (lastDocID < end)
                {
                    tvx.WriteLong(tvd.FilePointer);
                    tvd.WriteVInt(0);
                    tvx.WriteLong(tvfPosition);
                    lastDocID++;
                }
            }
        }
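On the read side (a sketch with illustrative names, not shown on this page), each tvx entry written above is a pair of longs locating the document's data in tvd and tvf, so the filler docs decode as zero-field documents:

        long tvdPointer = tvxIn.ReadLong();
        long tvfPointer = tvxIn.ReadLong();
        tvdIn.Seek(tvdPointer);
        int numFields = tvdIn.ReadVInt();          // 0 for the filler docs written by Fill()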
Example #6
        /// <summary>Adds a new &lt;&lt;fieldNumber, termText&gt;, TermInfo&gt; pair to the set.
        /// Term must be lexicographically greater than all previous Terms added.
        /// TermInfo pointers must be positive and greater than all previous.
        /// </summary>
        internal void  Add(int fieldNumber, char[] termText, int termTextStart, int termTextLength, TermInfo ti)
        {
            System.Diagnostics.Debug.Assert(CompareToLastTerm(fieldNumber, termText, termTextStart, termTextLength) < 0 ||
                                            (isIndex && termTextLength == 0 && lastTermTextLength == 0),
                                            "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + "(number " + fieldNumber + ")" +
                                            " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
                                            " text=" + new String(termText, termTextStart, termTextLength) + " lastText=" + new String(lastTermText, 0, lastTermTextLength));

            System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer, "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
            System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer, "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

            if (!isIndex && size % indexInterval == 0)
            {
                other.Add(lastFieldNumber, lastTermText, 0, lastTermTextLength, lastTi); // add an index term
            }
            WriteTerm(fieldNumber, termText, termTextStart, termTextLength);             // write term

            output.WriteVInt(ti.docFreq);                                                // write doc freq
            output.WriteVLong(ti.freqPointer - lastTi.freqPointer);                      // write pointers
            output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

            if (ti.docFreq >= skipInterval)
            {
                output.WriteVInt(ti.skipOffset);
            }

            if (isIndex)
            {
                output.WriteVLong(other.output.GetFilePointer() - lastIndexPointer);
                lastIndexPointer = other.output.GetFilePointer();                 // write pointer
            }

            if (lastTermText.Length < termTextLength)
            {
                lastTermText = new char[(int)(termTextLength * 1.25)];
            }
            Array.Copy(termText, termTextStart, lastTermText, 0, termTextLength);
            lastTermTextLength = termTextLength;
            lastFieldNumber    = fieldNumber;

            lastTi.Set(ti);
            size++;
        }
Example #7
        public void  Write(IndexOutput output)
        {
            output.WriteVInt(CURRENT_FORMAT);
            output.WriteVInt(Size());
            for (int i = 0; i < Size(); i++)
            {
                FieldInfo fi   = FieldInfo(i);
                var       bits = (byte)(0x0);
                if (fi.isIndexed)
                {
                    bits |= IS_INDEXED;
                }
                if (fi.storeTermVector)
                {
                    bits |= STORE_TERMVECTOR;
                }
                if (fi.storePositionWithTermVector)
                {
                    bits |= STORE_POSITIONS_WITH_TERMVECTOR;
                }
                if (fi.storeOffsetWithTermVector)
                {
                    bits |= STORE_OFFSET_WITH_TERMVECTOR;
                }
                if (fi.omitNorms)
                {
                    bits |= OMIT_NORMS;
                }
                if (fi.storePayloads)
                {
                    bits |= STORE_PAYLOADS;
                }
                if (fi.omitTermFreqAndPositions)
                {
                    bits |= OMIT_TERM_FREQ_AND_POSITIONS;
                }

                output.WriteString(fi.name);
                output.WriteByte(bits);
            }
        }
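A reader of this format walks the same fields and tests each flag bit; a minimal sketch (assuming the same bit constants and an IndexInput mirroring the writer):

        int format = input.ReadVInt();             // CURRENT_FORMAT
        int size   = input.ReadVInt();
        for (int i = 0; i < size; i++)
        {
            string name = input.ReadString();
            byte   bits = input.ReadByte();
            bool isIndexed       = (bits & IS_INDEXED) != 0;
            bool storeTermVector = (bits & STORE_TERMVECTOR) != 0;
            bool storePayloads   = (bits & STORE_PAYLOADS) != 0;
            // ... the remaining flags decode the same way
        }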
Example #8
        protected internal override void WriteSkipData(int level, IndexOutput skipBuffer)
        {
            int delta = CurDoc - LastSkipDoc[level];

            // if (DEBUG) {
            //   System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
            // }
            skipBuffer.WriteVInt(delta);
            LastSkipDoc[level] = CurDoc;

            skipBuffer.WriteVInt((int)(CurDocPointer - LastSkipDocPointer[level]));
            LastSkipDocPointer[level] = CurDocPointer;

            if (FieldHasPositions)
            {
                // if (DEBUG) {
                //   System.out.println("  curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
                // }
                skipBuffer.WriteVInt((int)(CurPosPointer - LastSkipPosPointer[level]));
                LastSkipPosPointer[level] = CurPosPointer;
                skipBuffer.WriteVInt(CurPosBufferUpto);

                if (FieldHasPayloads)
                {
                    skipBuffer.WriteVInt(CurPayloadByteUpto);
                }

                if (FieldHasOffsets || FieldHasPayloads)
                {
                    skipBuffer.WriteVInt((int)(CurPayPointer - LastSkipPayPointer[level]));
                    LastSkipPayPointer[level] = CurPayPointer;
                }
            }
        }
Example #9
        /// <summary>Adds a new &lt;Term, TermInfo&gt; pair to the set.
        /// Term must be lexicographically greater than all previous Terms added.
        /// TermInfo pointers must be positive and greater than all previous.
        /// </summary>
        public /*internal*/ void  Add(Term term, TermInfo ti)
        {
            if (!isIndex && term.CompareTo(lastTerm) <= 0)
            {
                throw new System.IO.IOException("term out of order");
            }
            if (ti.freqPointer < lastTi.freqPointer)
            {
                throw new System.IO.IOException("freqPointer out of order");
            }
            if (ti.proxPointer < lastTi.proxPointer)
            {
                throw new System.IO.IOException("proxPointer out of order");
            }

            if (!isIndex && size % indexInterval == 0)
            {
                other.Add(lastTerm, lastTi);                        // add an index term
            }
            WriteTerm(term);                                        // write term
            output.WriteVInt(ti.docFreq);                           // write doc freq
            output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
            output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

            if (ti.docFreq >= skipInterval)
            {
                output.WriteVInt(ti.skipOffset);
            }

            if (isIndex)
            {
                output.WriteVLong(other.output.GetFilePointer() - lastIndexPointer);
                lastIndexPointer = other.output.GetFilePointer();                 // write pointer
            }

            lastTi.Set(ti);
            size++;
        }
Example #10
        /// <summary>Adds a new &lt;&lt;fieldNumber, termBytes&gt;, TermInfo&gt; pair to the set.
        /// Term must be lexicographically greater than all previous Terms added.
        /// TermInfo pointers must be positive and greater than all previous.
        /// </summary>
        internal void  Add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
        {
            System.Diagnostics.Debug.Assert(CompareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
                                            (isIndex && termBytesLength == 0 && lastTermBytesLength == 0),
                                            "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
                                            " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
                                            " text=" + System.Text.Encoding.UTF8.GetString(termBytes, 0, termBytesLength) + " lastText=" + System.Text.Encoding.UTF8.GetString(lastTermBytes, 0, lastTermBytesLength));

            System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer, "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
            System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer, "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

            if (!isIndex && size % indexInterval == 0)
            {
                other.Add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
            }
            WriteTerm(fieldNumber, termBytes, termBytesLength);                         // write term

            output.WriteVInt(ti.docFreq);                                               // write doc freq
            output.WriteVLong(ti.freqPointer - lastTi.freqPointer);                     // write pointers
            output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

            if (ti.docFreq >= skipInterval)
            {
                output.WriteVInt(ti.skipOffset);
            }

            if (isIndex)
            {
                output.WriteVLong(other.output.GetFilePointer() - lastIndexPointer);
                lastIndexPointer = other.output.GetFilePointer();                 // write pointer
            }

            lastFieldNumber = fieldNumber;
            lastTi.Set(ti);
            size++;
        }
Example #11
        protected internal override void  WriteSkipData(int level, IndexOutput skipBuffer)
        {
            // To efficiently store payloads in the posting lists we do not store the length of
            // every payload. Instead we omit the length for a payload if the previous payload had
            // the same length.
            // However, in order to support skipping, the payload length at every skip point must be known.
            // So we use the same length encoding that we use for the posting lists for the skip data as well:
            // Case 1: current field does not store payloads
            //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
            //           DocSkip,FreqSkip,ProxSkip --> VInt
            //           DocSkip records the document number before every SkipInterval-th document in TermFreqs.
            //           Document numbers are represented as differences from the previous value in the sequence.
            // Case 2: current field stores payloads
            //           SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
            //           DocSkip,FreqSkip,ProxSkip --> VInt
            //           PayloadLength             --> VInt
            //         In this case DocSkip/2 is the difference between
            //         the current and the previous value. If DocSkip
            //         is odd, then a PayloadLength encoded as VInt follows,
            //         if DocSkip is even, then it is assumed that the
            //         current payload length equals the length at the previous
            //         skip point
            if (curStorePayloads)
            {
                int delta = curDoc - lastSkipDoc[level];
                if (curPayloadLength == lastSkipPayloadLength[level])
                {
                    // the current payload length equals the length at the previous skip point,
                    // so we don't store the length again
                    skipBuffer.WriteVInt(delta * 2);
                }
                else
                {
                    // the payload length is different from the previous one. We shift the DocSkip,
                    // set the lowest bit and store the current payload length as VInt.
                    skipBuffer.WriteVInt(delta * 2 + 1);
                    skipBuffer.WriteVInt(curPayloadLength);
                    lastSkipPayloadLength[level] = curPayloadLength;
                }
            }
            else
            {
                // current field does not store payloads
                skipBuffer.WriteVInt(curDoc - lastSkipDoc[level]);
            }
            skipBuffer.WriteVInt((int)(curFreqPointer - lastSkipFreqPointer[level]));
            skipBuffer.WriteVInt((int)(curProxPointer - lastSkipProxPointer[level]));

            lastSkipDoc[level] = curDoc;
            //System.out.println("write doc at level " + level + ": " + curDoc);

            lastSkipFreqPointer[level] = curFreqPointer;
            lastSkipProxPointer[level] = curProxPointer;
        }
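To make the case-2 encoding concrete: a doc delta of 5 with an unchanged payload length is written as VInt(10); if the payload length changes to 3, the writer emits VInt(11) followed by VInt(3). A sketch of the matching skip-datum reader for a payload-storing field (illustrative names):

        int docSkip = skipIn.ReadVInt();
        lastSkipDoc += docSkip >> 1;               // DocSkip / 2 is the doc delta
        if ((docSkip & 1) != 0)
        {
            lastPayloadLength = skipIn.ReadVInt(); // odd DocSkip: a new payload length follows
        }
        lastFreqPointer += skipIn.ReadVInt();
        lastProxPointer += skipIn.ReadVInt();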
Example #12
        /// <summary>Write as a d-gaps list </summary>
        private void  WriteDgaps(IndexOutput output)
        {
            output.WriteInt(-1);              // mark using d-gaps
            output.WriteInt(Size());          // write size
            output.WriteInt(Count());         // write count
            int last = 0;
            int n    = Count();
            int m    = bits.Length;

            for (int i = 0; i < m && n > 0; i++)
            {
                if (bits[i] != 0)
                {
                    output.WriteVInt(i - last);
                    output.WriteByte(bits[i]);
                    last = i;
                    n   -= BYTE_COUNTS[bits[i] & 0xFF];
                }
            }
        }
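Reading the d-gaps list back reverses the loop above; a rough sketch (reader-side names are illustrative):

        int marker = input.ReadInt();              // -1 marks the d-gaps format
        int size   = input.ReadInt();
        int count  = input.ReadInt();
        int last = 0, remaining = count;
        while (remaining > 0)
        {
            last += input.ReadVInt();              // gap to the next non-zero byte
            bits[last] = input.ReadByte();
            remaining -= BYTE_COUNTS[bits[last] & 0xFF];
        }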
Example #13
        /// <summary>Merge files with the extensions added up to now.
        /// All files with these extensions are combined sequentially into the
        /// compound stream. After successful merge, the source files
        /// are deleted.
        /// </summary>
        /// <throws>  SystemException if Dispose() has already been called or
        ///   if no file has been added to this object
        /// </throws>
        public void Dispose()
        {
            // Extract into protected method if class ever becomes unsealed

            // TODO: Dispose shouldn't throw exceptions!
            if (merged)
                throw new SystemException("Merge already performed");

            if (entries.Count == 0)
                throw new SystemException("No entries to merge have been defined");

            merged = true;

            // open the compound stream
            IndexOutput os = null;
            try
            {
                os = directory.CreateOutput(fileName);

                // Write the number of entries
                os.WriteVInt(entries.Count);

                // Write the directory with all offsets at 0.
                // Remember the positions of directory entries so that we can
                // adjust the offsets later
                long totalSize = 0;
                foreach (FileEntry fe in entries)
                {
                    fe.directoryOffset = os.FilePointer;
                    os.WriteLong(0); // for now
                    os.WriteString(fe.file);
                    totalSize += directory.FileLength(fe.file);
                }

                // Pre-allocate size of file as optimization --
                // this can potentially help IO performance as
                // we write the file and also later during
                // searching.  It also uncovers a disk-full
                // situation earlier and hopefully without
                // actually filling disk to 100%:
                long finalLength = totalSize + os.FilePointer;
                os.SetLength(finalLength);

                // Open the files and copy their data into the stream.
                // Remember the locations of each file's data section.
                var buffer = new byte[16384];
                foreach (FileEntry fe in entries)
                {
                    fe.dataOffset = os.FilePointer;
                    CopyFile(fe, os, buffer);
                }

                // Write the data offsets into the directory of the compound stream
                foreach (FileEntry fe in entries)
                {
                    os.Seek(fe.directoryOffset);
                    os.WriteLong(fe.dataOffset);
                }

                System.Diagnostics.Debug.Assert(finalLength == os.Length);

                // Close the output stream. Set the os to null before trying to
                // close so that if an exception occurs during the close, the
                // finally clause below will not attempt to close the stream
                // the second time.
                IndexOutput tmp = os;
                os = null;
                tmp.Close();
            }
            finally
            {
                if (os != null)
                    try
                    {
                        os.Close();
                    }
                    catch (System.IO.IOException)
                    {
                    }
            }
        }
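The directory layout written by Dispose() can be read back with the mirror-image calls; a minimal sketch (assuming an IndexInput with ReadVInt/ReadLong/ReadString):

        int numEntries = input.ReadVInt();
        for (int i = 0; i < numEntries; i++)
        {
            long   dataOffset = input.ReadLong();  // patched in during the final Seek pass above
            string fileName   = input.ReadString();
            // the file's bytes start at dataOffset and run to the next entry's offset
        }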
Example #14
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void  Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.tvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();
            perThread.termsHashPerThread.Reset(false);
        }
Example #15
 /// <summary>Write as a d-gaps list </summary>
 private void WriteDgaps(IndexOutput output)
 {
     output.WriteInt(-1); // mark using d-gaps
     output.WriteInt(Size()); // write size
     output.WriteInt(Count()); // write count
     int last = 0;
     int n = Count();
     int m = bits.Length;
     for (int i = 0; i < m && n > 0; i++)
     {
         if (bits[i] != 0)
         {
             output.WriteVInt(i - last);
             output.WriteByte(bits[i]);
             last = i;
             n -= BYTE_COUNTS[bits[i] & 0xFF];
         }
     }
 }
Example #16
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            int storedCount = 0;

            foreach (Field field  in doc.Fields())
            {
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            foreach (Field field in doc.Fields())
            {
                if (field.IsStored())
                {
                    fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

                    byte bits = 0;
                    if (field.IsTokenized())
                    {
                        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
                    }
                    if (field.IsBinary())
                    {
                        bits |= FieldsWriter.FIELD_IS_BINARY;
                    }
                    if (field.IsCompressed())
                    {
                        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
                    }

                    fieldsStream.WriteByte(bits);

                    if (field.IsCompressed())
                    {
                        // compression is enabled for the current field
                        byte[] data = null;
                        // check if it is a binary field
                        if (field.IsBinary())
                        {
                            data = Compress(field.BinaryValue());
                        }
                        else
                        {
                            data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                        }
                        int len = data.Length;
                        fieldsStream.WriteVInt(len);
                        fieldsStream.WriteBytes(data, len);
                    }
                    else
                    {
                        // compression is disabled for the current field
                        if (field.IsBinary())
                        {
                            byte[] data = field.BinaryValue();
                            int    len  = data.Length;
                            fieldsStream.WriteVInt(len);
                            fieldsStream.WriteBytes(data, len);
                        }
                        else
                        {
                            fieldsStream.WriteString(field.StringValue());
                        }
                    }
                }
            }
        }
Example #17
        protected internal override void WriteSkipData(int level, IndexOutput skipBuffer)
        {
            int delta = CurDoc - LastSkipDoc[level];
            // if (DEBUG) {
            //   System.out.println("writeSkipData level=" + level + " lastDoc=" + curDoc + " delta=" + delta + " curDocPointer=" + curDocPointer);
            // }
            skipBuffer.WriteVInt(delta);
            LastSkipDoc[level] = CurDoc;

            skipBuffer.WriteVInt((int)(CurDocPointer - LastSkipDocPointer[level]));
            LastSkipDocPointer[level] = CurDocPointer;

            if (FieldHasPositions)
            {
                // if (DEBUG) {
                //   System.out.println("  curPosPointer=" + curPosPointer + " curPosBufferUpto=" + curPosBufferUpto);
                // }
                skipBuffer.WriteVInt((int)(CurPosPointer - LastSkipPosPointer[level]));
                LastSkipPosPointer[level] = CurPosPointer;
                skipBuffer.WriteVInt(CurPosBufferUpto);

                if (FieldHasPayloads)
                {
                    skipBuffer.WriteVInt(CurPayloadByteUpto);
                }

                if (FieldHasOffsets || FieldHasPayloads)
                {
                    skipBuffer.WriteVInt((int)(CurPayPointer - LastSkipPayPointer[level]));
                    LastSkipPayPointer[level] = CurPayPointer;
                }
            }
        }
Example #18
		public void  Write(IndexOutput output)
		{
			output.WriteVInt(CURRENT_FORMAT);
			output.WriteVInt(Size());
			for (int i = 0; i < Size(); i++)
			{
				FieldInfo fi = FieldInfo(i);
				byte bits = (byte) (0x0);
				if (fi.isIndexed)
					bits |= IS_INDEXED;
				if (fi.storeTermVector)
					bits |= STORE_TERMVECTOR;
				if (fi.storePositionWithTermVector)
					bits |= STORE_POSITIONS_WITH_TERMVECTOR;
				if (fi.storeOffsetWithTermVector)
					bits |= STORE_OFFSET_WITH_TERMVECTOR;
				if (fi.omitNorms)
					bits |= OMIT_NORMS;
				if (fi.storePayloads)
					bits |= STORE_PAYLOADS;
				if (fi.omitTermFreqAndPositions)
					bits |= OMIT_TERM_FREQ_AND_POSITIONS;
				
				output.WriteString(fi.name);
				output.WriteByte(bits);
			}
		}
Example #19
        protected override void WriteSkipData(int level, IndexOutput skipBuffer)
        {
            // To efficiently store payloads/offsets in the posting lists we do not store the length of
            // every payload/offset. Instead we omit the length if the previous lengths were the same
            //
            // However, in order to support skipping, the length at every skip point must be known.
            // So we use the same length encoding that we use for the posting lists for the skip data as well:
            // Case 1: current field does not store payloads/offsets
            //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
            //           DocSkip,FreqSkip,ProxSkip --> VInt
            //           DocSkip records the document number before every SkipInterval-th document in TermFreqs.
            //           Document numbers are represented as differences from the previous value in the sequence.
            // Case 2: current field stores payloads/offsets
            //           SkipDatum                 --> DocSkip, PayloadLength?,OffsetLength?,FreqSkip,ProxSkip
            //           DocSkip,FreqSkip,ProxSkip --> VInt
            //           PayloadLength,OffsetLength--> VInt
            //         In this case DocSkip/2 is the difference between
            //         the current and the previous value. If DocSkip
            //         is odd, then a PayloadLength encoded as VInt follows,
            //         if DocSkip is even, then it is assumed that the
            //         current payload/offset lengths equal the lengths at the previous
            //         skip point
            int delta = CurDoc - LastSkipDoc[level];

            if (CurStorePayloads || CurStoreOffsets)
            {
                Debug.Assert(CurStorePayloads || CurPayloadLength == LastSkipPayloadLength[level]);
                Debug.Assert(CurStoreOffsets || CurOffsetLength == LastSkipOffsetLength[level]);

                if (CurPayloadLength == LastSkipPayloadLength[level] && CurOffsetLength == LastSkipOffsetLength[level])
                {
                    // the current payload/offset lengths equal the lengths at the previous skip point,
                    // so we don't store the lengths again
                    skipBuffer.WriteVInt(delta << 1);
                }
                else
                {
                    // the payload and/or offset length is different from the previous one. We shift the DocSkip,
                    // set the lowest bit and store the current payload and/or offset lengths as VInts.
                    skipBuffer.WriteVInt(delta << 1 | 1);

                    if (CurStorePayloads)
                    {
                        skipBuffer.WriteVInt(CurPayloadLength);
                        LastSkipPayloadLength[level] = CurPayloadLength;
                    }
                    if (CurStoreOffsets)
                    {
                        skipBuffer.WriteVInt(CurOffsetLength);
                        LastSkipOffsetLength[level] = CurOffsetLength;
                    }
                }
            }
            else
            {
                // current field does not store payloads or offsets
                skipBuffer.WriteVInt(delta);
            }

            skipBuffer.WriteVInt((int)(CurFreqPointer - LastSkipFreqPointer[level]));
            skipBuffer.WriteVInt((int)(CurProxPointer - LastSkipProxPointer[level]));

            LastSkipDoc[level] = CurDoc;

            LastSkipFreqPointer[level] = CurFreqPointer;
            LastSkipProxPointer[level] = CurProxPointer;
        }
Example #20
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int skipInterval = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {

                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text = mergeStates[i].text;
                    int textOffset = mergeStates[i].textOffset;
                    int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge = 1;
                    }
                    else if (cmp == 0)
                        termStates[numToMerge++] = mergeStates[i];
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int start = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                    proxPointer = proxOut.GetFilePointer();
                else
                    proxPointer = 0;

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {

                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                        if (termStates[i].docID < minState.docID)
                            minState = termStates[i];

                    int doc = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                    payloadLength = 0;
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                    proxOut.WriteVInt(code & (~1));
                                if (payloadLength > 0)
                                    copyBytes(prox, proxOut, payloadLength);
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } //End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {

                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                            if (termStates[i] != minState)
                                termStates[upto++] = termStates[i];
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                                if (mergeStates[i] != minState)
                                    mergeStates[upto++] = mergeStates[i];
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // during the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
Example #21
        /// <summary> Add a complete document specified by all its term vectors. If the document has no
        /// term vectors, only placeholder entries are written to tvx and tvd.
        /// </summary>
        /// <param name="vectors">
        /// </param>
        /// <throws>  IOException </throws>
        public void  AddAllDocVectors(ITermFreqVector[] vectors)
        {
            tvx.WriteLong(tvd.FilePointer);
            tvx.WriteLong(tvf.FilePointer);

            if (vectors != null)
            {
                int numFields = vectors.Length;
                tvd.WriteVInt(numFields);

                var fieldPointers = new long[numFields];

                for (int i = 0; i < numFields; i++)
                {
                    fieldPointers[i] = tvf.FilePointer;

                    int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);

                    // 1st pass: write field numbers to tvd
                    tvd.WriteVInt(fieldNumber);

                    int numTerms = vectors[i].Size;
                    tvf.WriteVInt(numTerms);

                    TermPositionVector tpVector;

                    byte bits;
                    bool storePositions;
                    bool storeOffsets;

                    if (vectors[i] is TermPositionVector)
                    {
                        // May have positions & offsets
                        tpVector       = (TermPositionVector)vectors[i];
                        storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
                        storeOffsets   = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
                        bits           = (byte)((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte)0)
                                                + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte)0));
                    }
                    else
                    {
                        tpVector       = null;
                        bits           = 0;
                        storePositions = false;
                        storeOffsets   = false;
                    }

                    tvf.WriteVInt(bits);

                    System.String[] terms = vectors[i].GetTerms();
                    int[]           freqs = vectors[i].GetTermFrequencies();

                    int utf8Upto = 0;
                    utf8Results[1].length = 0;

                    for (int j = 0; j < numTerms; j++)
                    {
                        UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                        int start  = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                        int length = utf8Results[utf8Upto].length - start;
                        tvf.WriteVInt(start);                                        // write shared prefix length
                        tvf.WriteVInt(length);                                       // write delta length
                        tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                        utf8Upto = 1 - utf8Upto;

                        int termFreq = freqs[j];

                        tvf.WriteVInt(termFreq);

                        if (storePositions)
                        {
                            int[] positions = tpVector.GetTermPositions(j);
                            if (positions == null)
                            {
                                throw new System.SystemException("Trying to write positions that are null!");
                            }
                            System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                            // use delta encoding for positions
                            int lastPosition = 0;
                            foreach (int position in positions)
                            {
                                tvf.WriteVInt(position - lastPosition);
                                lastPosition = position;
                            }
                        }

                        if (storeOffsets)
                        {
                            TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                            if (offsets == null)
                            {
                                throw new System.SystemException("Trying to write offsets that are null!");
                            }
                            System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                            // use delta encoding for offsets
                            int lastEndOffset = 0;
                            foreach (TermVectorOffsetInfo t in offsets)
                            {
                                int startOffset = t.StartOffset;
                                int endOffset   = t.EndOffset;
                                tvf.WriteVInt(startOffset - lastEndOffset);
                                tvf.WriteVInt(endOffset - startOffset);
                                lastEndOffset = endOffset;
                            }
                        }
                    }
                }

                // 2nd pass: write field pointers to tvd
                if (numFields > 1)
                {
                    long lastFieldPointer = fieldPointers[0];
                    for (int i = 1; i < numFields; i++)
                    {
                        long fieldPointer = fieldPointers[i];
                        tvd.WriteVLong(fieldPointer - lastFieldPointer);
                        lastFieldPointer = fieldPointer;
                    }
                }
            }
            else
            {
                tvd.WriteVInt(0);
            }
        }
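Decoding the prefix-compressed terms written to tvf above is the inverse of the BytesDifference step; a sketch (illustrative names, with termBytes still holding the previous term's UTF-8 bytes):

        int prefix = tvfIn.ReadVInt();             // bytes shared with the previous term
        int suffix = tvfIn.ReadVInt();             // count of new bytes in this term
        tvfIn.ReadBytes(termBytes, prefix, suffix); // termBytes[0..prefix) carries over
        int termLength = prefix + suffix;
        int freq = tvfIn.ReadVInt();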
Example #22
        private void  WritePostings(Posting[] postings, System.String segment)
        {
            IndexOutput       freq = null, prox = null;
            TermInfosWriter   tis              = null;
            TermVectorsWriter termVectorWriter = null;

            try
            {
                //open files for inverse index storage
                freq = directory.CreateOutput(segment + ".frq");
                prox = directory.CreateOutput(segment + ".prx");
                tis  = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                TermInfo      ti           = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                    {
                        // optimize freq=1: set low bit of doc num
                        freq.WriteVInt(1);
                    }
                    else
                    {
                        freq.WriteVInt(0);                         // the document number
                        freq.WriteVInt(postingFreq);               // frequency in doc
                    }

                    int   lastPosition = 0;                   // write positions
                    int[] positions    = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                    // check to see if we switched to a new field
                    System.String termField = posting.term.Field();
                    if (currentField != termField)
                    {
                        // changing field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                    }
                }
                if (termVectorWriter != null)
                {
                    termVectorWriter.CloseDocument();
                }
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try
                    {
                        freq.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (prox != null)
                {
                    try
                    {
                        prox.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (tis != null)
                {
                    try
                    {
                        tis.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (termVectorWriter != null)
                {
                    try
                    {
                        termVectorWriter.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (keep != null)
                {
                    throw keep;
                }
            }
        }
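The finally block above shows a cleanup pattern worth naming: attempt to close every stream, remember only the first IOException, and re-throw it once all close attempts have run, so one failing Close() cannot mask another stream's cleanup. A minimal standalone sketch of the same pattern (the StreamCleanup class and CloseAll helper are hypothetical, not part of Lucene.Net):

        internal static class StreamCleanup
        {
            // Hypothetical helper distilling the finally block above: try to
            // close every stream, keep the first IOException, and re-throw it
            // only after every close has been attempted.
            internal static void CloseAll(params System.IO.Stream[] streams)
            {
                System.IO.IOException keep = null;
                foreach (System.IO.Stream s in streams)
                {
                    if (s == null)
                        continue;
                    try
                    {
                        s.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e; // remember only the first failure
                        }
                    }
                }
                if (keep != null)
                {
                    throw keep; // surface the first failure to the caller
                }
            }
        }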
		/* Walk through all unique text tokens (Posting
		* instances) found in this field and serialize them
		* into a single RAM segment. */
		internal void  AppendPostings(ThreadState.FieldData[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut)
		{
			
			int fieldNumber = fields[0].fieldInfo.number;
			int numFields = fields.Length;
			
			FieldMergeState[] mergeStates = new FieldMergeState[numFields];
			
			for (int i = 0; i < numFields; i++)
			{
				FieldMergeState fms = mergeStates[i] = new FieldMergeState();
				fms.field = fields[i];
				fms.postings = fms.field.SortPostings();
				
				System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);
				
				// Should always be true
				bool result = fms.NextTerm();
				System.Diagnostics.Debug.Assert(result);
			}
			
			int skipInterval = termsOut.skipInterval;
			currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;
			
			FieldMergeState[] termStates = new FieldMergeState[numFields];
			
			while (numFields > 0)
			{
				
				// Get the next term to merge
				termStates[0] = mergeStates[0];
				int numToMerge = 1;
				
				for (int i = 1; i < numFields; i++)
				{
					char[] text = mergeStates[i].text;
					int textOffset = mergeStates[i].textOffset;
					int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
					
					if (cmp < 0)
					{
						termStates[0] = mergeStates[i];
						numToMerge = 1;
					}
					else if (cmp == 0)
						termStates[numToMerge++] = mergeStates[i];
				}
				
				int df = 0;
				int lastPayloadLength = -1;
				
				int lastDoc = 0;
				
				char[] text2 = termStates[0].text;
				int start = termStates[0].textOffset;
				int pos = start;
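				// term text in the in-RAM postings is terminated by a 0xffff sentinel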
				while (text2[pos] != 0xffff)
					pos++;
				
				long freqPointer = freqOut.GetFilePointer();
				long proxPointer = proxOut.GetFilePointer();
				
				skipListWriter.ResetSkip();
				
				// Now termStates has numToMerge FieldMergeStates
				// which all share the same term.  Now we must
				// interleave the docID streams.
				while (numToMerge > 0)
				{
					
					if ((++df % skipInterval) == 0)
					{
						skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
						skipListWriter.BufferSkip(df);
					}
					
					FieldMergeState minState = termStates[0];
					for (int i = 1; i < numToMerge; i++)
						if (termStates[i].docID < minState.docID)
							minState = termStates[i];
					
					int doc = minState.docID;
					int termDocFreq = minState.termFreq;
					
					System.Diagnostics.Debug.Assert(doc < numDocsInRAM);
					System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);
					
					int newDocCode = (doc - lastDoc) << 1;
					lastDoc = doc;
					
					ByteSliceReader prox = minState.prox;
					
					// Carefully copy over the prox + payload info,
					// changing the format to match Lucene's segment
					// format.
					for (int j = 0; j < termDocFreq; j++)
					{
						int code = prox.ReadVInt();
						if (currentFieldStorePayloads)
						{
							int payloadLength;
							if ((code & 1) != 0)
							{
								// This position has a payload
								payloadLength = prox.ReadVInt();
							}
							else
								payloadLength = 0;
							if (payloadLength != lastPayloadLength)
							{
								proxOut.WriteVInt(code | 1);
								proxOut.WriteVInt(payloadLength);
								lastPayloadLength = payloadLength;
							}
							else
								proxOut.WriteVInt(code & ~1);
							if (payloadLength > 0)
								CopyBytes(prox, proxOut, payloadLength);
						}
						else
						{
							System.Diagnostics.Debug.Assert(0 == (code & 1));
							proxOut.WriteVInt(code >> 1);
						}
					}
					
					if (1 == termDocFreq)
					{
						freqOut.WriteVInt(newDocCode | 1);
					}
					else
					{
						freqOut.WriteVInt(newDocCode);
						freqOut.WriteVInt(termDocFreq);
					}
					
					if (!minState.NextDoc())
					{
						
						// Remove from termStates
						int upto = 0;
						for (int i = 0; i < numToMerge; i++)
							if (termStates[i] != minState)
								termStates[upto++] = termStates[i];
						numToMerge--;
						System.Diagnostics.Debug.Assert(upto == numToMerge);
						
						// Advance this state to the next term
						
						if (!minState.NextTerm())
						{
							// OK, no more terms, so remove from mergeStates
							// as well
							upto = 0;
							for (int i = 0; i < numFields; i++)
								if (mergeStates[i] != minState)
									mergeStates[upto++] = mergeStates[i];
							numFields--;
							System.Diagnostics.Debug.Assert(upto == numFields);
						}
					}
				}
				
				System.Diagnostics.Debug.Assert(df > 0);
				
				// Done merging this term
				
				long skipPointer = skipListWriter.WriteSkip(freqOut);
				
				// Write term
				termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
				termsOut.Add(fieldNumber, text2, start, pos - start, termInfo);
			}
		}
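For reference, the freq stream written above packs the doc delta into the upper bits of a VInt and uses the low bit to flag the common termDocFreq == 1 case, saving a second VInt per posting. A hypothetical reader-side sketch (DecodeFreqEntry is illustrative, not a Lucene.Net API; IndexInput.ReadVInt is the real counterpart of WriteVInt):

		// Hypothetical decoder for one freq-stream entry in the format above.
		// Returns the absolute docID; the frequency comes back via the out param.
		internal static int DecodeFreqEntry(IndexInput freqIn, int lastDoc, out int termDocFreq)
		{
			int code = freqIn.ReadVInt();
			int doc = lastDoc + (code >> 1);     // upper bits hold the doc delta
			if ((code & 1) != 0)
				termDocFreq = 1;                 // low bit set: freq was omitted
			else
				termDocFreq = freqIn.ReadVInt(); // otherwise freq follows explicitly
			return doc;
		}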
		protected internal override void  WriteSkipData(int level, IndexOutput skipBuffer)
		{
			// To efficiently store payloads in the posting lists we do not store the length of
			// every payload. Instead we omit the length for a payload if the previous payload had
			// the same length.
			// However, in order to support skipping, the payload length at every skip point must be known.
			// So we use the same length encoding that we use for the posting lists for the skip data as well:
			// Case 1: current field does not store payloads
			//           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
			//           DocSkip,FreqSkip,ProxSkip --> VInt
			//           DocSkip records the document number before every SkipInterval'th document in TermFreqs.
			//           Document numbers are represented as differences from the previous value in the sequence.
			// Case 2: current field stores payloads
			//           SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
			//           DocSkip,FreqSkip,ProxSkip --> VInt
			//           PayloadLength             --> VInt    
			//         In this case DocSkip/2 is the difference between
			//         the current and the previous value. If DocSkip
			//         is odd, then a PayloadLength encoded as VInt follows,
			//         if DocSkip is even, then it is assumed that the
			//         current payload length equals the length at the previous
			//         skip point
			if (curStorePayloads)
			{
				int delta = curDoc - lastSkipDoc[level];
				if (curPayloadLength == lastSkipPayloadLength[level])
				{
					// the current payload length equals the length at the previous skip point,
					// so we don't store the length again
					skipBuffer.WriteVInt(delta * 2);
				}
				else
				{
					// the payload length is different from the previous one. We shift the DocSkip, 
					// set the lowest bit and store the current payload length as VInt.
					skipBuffer.WriteVInt(delta * 2 + 1);
					skipBuffer.WriteVInt(curPayloadLength);
					lastSkipPayloadLength[level] = curPayloadLength;
				}
			}
			else
			{
				// current field does not store payloads
				skipBuffer.WriteVInt(curDoc - lastSkipDoc[level]);
			}
			skipBuffer.WriteVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
			skipBuffer.WriteVInt((int) (curProxPointer - lastSkipProxPointer[level]));
			
			lastSkipDoc[level] = curDoc;
			//System.out.println("write doc at level " + level + ": " + curDoc);
			
			lastSkipFreqPointer[level] = curFreqPointer;
			lastSkipProxPointer[level] = curProxPointer;
		}
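A hypothetical mirror image of WriteSkipData may make the encoding easier to follow: code >> 1 recovers the doc delta, and an odd code means a new payload length follows. The method below reuses the writer's field names but is a sketch, not part of the class:

		internal void  ReadSkipData(int level, IndexInput skipBuffer)
		{
			if (curStorePayloads)
			{
				int code = skipBuffer.ReadVInt();
				lastSkipDoc[level] += code >> 1; // DocSkip / 2 is the doc delta
				if ((code & 1) != 0)
				{
					// odd DocSkip: a new payload length was stored
					lastSkipPayloadLength[level] = skipBuffer.ReadVInt();
				}
				// even DocSkip: payload length unchanged since the previous skip point
			}
			else
			{
				lastSkipDoc[level] += skipBuffer.ReadVInt();
			}
			lastSkipFreqPointer[level] += skipBuffer.ReadVInt();
			lastSkipProxPointer[level] += skipBuffer.ReadVInt();
		}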
Exemple #25
0
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            int storedCount = 0;

            System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
                // and field.binaryValue() already returns the compressed value for a field
                // with isCompressed()==true, so we disable compression in that case
                bool disableCompression = (field is FieldsReader.FieldForMerge);
                if (field.IsStored())
                {
                    fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

                    byte bits = 0;
                    if (field.IsTokenized())
                    {
                        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
                    }
                    if (field.IsBinary())
                    {
                        bits |= FieldsWriter.FIELD_IS_BINARY;
                    }
                    if (field.IsCompressed())
                    {
                        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
                    }

                    fieldsStream.WriteByte(bits);

                    if (field.IsCompressed())
                    {
                        // compression is enabled for the current field
                        byte[] data = null;

                        if (disableCompression)
                        {
                            // optimized case for merging, the data
                            // is already compressed
                            data = field.BinaryValue();
                        }
                        else
                        {
                            // check if it is a binary field
                            if (field.IsBinary())
                            {
                                data = Compress(field.BinaryValue());
                            }
                            else
                            {
                                data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                            }
                        }
                        int len = data.Length;
                        fieldsStream.WriteVInt(len);
                        fieldsStream.WriteBytes(data, len);
                    }
                    else
                    {
                        // compression is disabled for the current field
                        if (field.IsBinary())
                        {
                            byte[] data = field.BinaryValue();
                            int    len  = data.Length;
                            fieldsStream.WriteVInt(len);
                            fieldsStream.WriteBytes(data, len);
                        }
                        else
                        {
                            fieldsStream.WriteString(field.StringValue());
                        }
                    }
                }
            }
        }
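On the reading side the bits byte written above is a simple mask test against the same FieldsWriter constants. A hedged fragment (fieldsIn is an assumed IndexInput positioned just after the field number):

            byte bits = fieldsIn.ReadByte();
            bool tokenized  = (bits & FieldsWriter.FIELD_IS_TOKENIZED)  != 0;
            bool binary     = (bits & FieldsWriter.FIELD_IS_BINARY)     != 0;
            bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;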
Exemple #26
0
        /// <summary>Merge files with the extensions added up to now.
        /// All files with these extensions are combined sequentially into the
        /// compound stream. After successful merge, the source files
        /// are deleted.
        /// </summary>
        /// <throws>  SystemException if Close() had been called before, or
        /// if no file has been added to this object </throws>
        public void  Close()
        {
            if (merged)
            {
                throw new System.SystemException("Merge already performed");
            }

            if (entries.Count == 0)
            {
                throw new System.SystemException("No entries to merge have been defined");
            }

            merged = true;

            // open the compound stream
            IndexOutput os = null;

            try
            {
                os = directory.CreateOutput(fileName);

                // Write the number of entries
                os.WriteVInt(entries.Count);

                // Write the directory with all offsets at 0.
                // Remember the positions of directory entries so that we can
                // adjust the offsets later
                System.Collections.IEnumerator it = entries.GetEnumerator();
                long totalSize = 0;
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    fe.directoryOffset = os.GetFilePointer();
                    os.WriteLong(0);                     // for now
                    os.WriteString(fe.file);
                    totalSize += directory.FileLength(fe.file);
                }

                // Pre-allocate size of file as optimization --
                // this can potentially help IO performance as
                // we write the file and also later during
                // searching.  It also uncovers a disk-full
                // situation earlier and hopefully without
                // actually filling disk to 100%:
                long finalLength = totalSize + os.GetFilePointer();
                os.SetLength(finalLength);

                // Open the files and copy their data into the stream.
                // Remember the locations of each file's data section.
                byte[] buffer = new byte[16384];
                it = entries.GetEnumerator();
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    fe.dataOffset = os.GetFilePointer();
                    CopyFile(fe, os, buffer);
                }

                // Write the data offsets into the directory of the compound stream
                it = entries.GetEnumerator();
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    os.Seek(fe.directoryOffset);
                    os.WriteLong(fe.dataOffset);
                }

                System.Diagnostics.Debug.Assert(finalLength == os.Length());

                // Close the output stream. Set the os to null before trying to
                // close so that if an exception occurs during the close, the
                // finally clause below will not attempt to close the stream
                // the second time.
                IndexOutput tmp = os;
                os = null;
                tmp.Close();
            }
            finally
            {
                if (os != null)
                {
                    try
                    {
                        os.Close();
                    }
                    catch (System.IO.IOException)
                    {
                    }
                }
            }
        }
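The directory handling in Close() uses a write-placeholder-then-patch pattern: each offset slot is first written as 0 and its position remembered, then Seek()ed back to once the real value is known. Reduced to one entry (variable names are illustrative; only IndexOutput's GetFilePointer/WriteLong/Seek are assumed):

            long slot = os.GetFilePointer();
            os.WriteLong(0);                  // reserve 8 bytes for the offset
            long dataOffset = os.GetFilePointer();
            // ... copy the entry's data into the stream here ...
            long end = os.GetFilePointer();
            os.Seek(slot);                    // go back to the reserved slot
            os.WriteLong(dataOffset);         // patch in the real data offset
            os.Seek(end);                     // resume writing at the end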
Exemple #27
0
 // Writes the contents of buffer into the fields stream
 // and adds a new entry for this document into the index
 // stream.  This assumes the buffer was already written
 // in the correct fields format.
 internal void  FlushDocument(int numStoredFields, RAMOutputStream buffer)
 {
     indexStream.WriteLong(fieldsStream.GetFilePointer());
     fieldsStream.WriteVInt(numStoredFields);
     buffer.WriteTo(fieldsStream);
 }
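A hedged usage sketch of FlushDocument: each document's stored fields are first serialized into a RAMOutputStream in the fields format, then spliced into the real fields stream with a single WriteTo call (fieldsWriter and the serialization step are illustrative):

     RAMOutputStream buffer = new RAMOutputStream();
     int numStoredFields = 0;
     // ... for each stored field: write its number, bits byte and value
     //     into buffer in the fields format, and increment numStoredFields ...
     fieldsWriter.FlushDocument(numStoredFields, buffer);
     buffer.Reset(); // the buffer can be reused for the next document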
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields   = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int  skipInterval       = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text       = mergeStates[i].text;
                    int    textOffset = mergeStates[i].textOffset;
                    int    cmp        = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge    = 1;
                    }
                    else if (cmp == 0)
                    {
                        termStates[numToMerge++] = mergeStates[i];
                    }
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int    start        = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                {
                    proxPointer = proxOut.GetFilePointer();
                }
                else
                {
                    proxPointer = 0;
                }

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                    {
                        if (termStates[i].docID < minState.docID)
                        {
                            minState = termStates[i];
                        }
                    }

                    int doc         = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                {
                                    payloadLength = 0;
                                }
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                {
                                    proxOut.WriteVInt(code & (~1));
                                }
                                if (payloadLength > 0)
                                {
                                    copyBytes(prox, proxOut, payloadLength);
                                }
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } //End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                        {
                            if (termStates[i] != minState)
                            {
                                termStates[upto++] = termStates[i];
                            }
                        }
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                            {
                                if (mergeStates[i] != minState)
                                {
                                    mergeStates[upto++] = mergeStates[i];
                                }
                            }
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // while doing the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
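The prox copy loop above preserves the on-disk position format: each entry is a position delta shifted left one bit, and the low bit flags that a new payload length precedes the payload bytes. A hypothetical reader-side fragment for the payload case (proxIn, payloadBuffer and the surrounding position/payloadLength state are assumptions):

            int code = proxIn.ReadVInt();
            if ((code & 1) != 0)
            {
                payloadLength = proxIn.ReadVInt();          // payload length changed
            }
            position += code >> 1;                          // delta sits in the upper bits
            if (payloadLength > 0)
            {
                proxIn.ReadBytes(payloadBuffer, 0, payloadLength);
            }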
 internal CompressingStoredFieldsIndexWriter(IndexOutput indexOutput)
 {
     this.FieldsIndexOut = indexOutput;
     Reset();
     TotalDocs = 0;
     DocBaseDeltas = new int[BLOCK_SIZE];
     StartPointerDeltas = new long[BLOCK_SIZE];
     FieldsIndexOut.WriteVInt(PackedInts.VERSION_CURRENT);
 }
        /// <summary>Merge files with the extensions added up to now.
        /// All files with these extensions are combined sequentially into the
        /// compound stream. After successful merge, the source files
        /// are deleted.
        /// </summary>
        /// <throws>  SystemException if Close() had been called before, or
        /// if no file has been added to this object </throws>
        public void  Close()
        {
            if (merged)
            {
                throw new System.SystemException("Merge already performed");
            }

            if (entries.Count == 0)
            {
                throw new System.SystemException("No entries to merge have been defined");
            }

            merged = true;

            // open the compound stream
            IndexOutput os = null;

            try
            {
                os = directory.CreateOutput(fileName);

                // Write the number of entries
                os.WriteVInt(entries.Count);

                // Write the directory with all offsets at 0.
                // Remember the positions of directory entries so that we can
                // adjust the offsets later
                System.Collections.IEnumerator it = entries.GetEnumerator();
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    fe.directoryOffset = os.GetFilePointer();
                    os.WriteLong(0);                     // for now
                    os.WriteString(fe.file);
                }

                // Open the files and copy their data into the stream.
                // Remember the locations of each file's data section.
                byte[] buffer = new byte[1024];
                it = entries.GetEnumerator();
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    fe.dataOffset = os.GetFilePointer();
                    CopyFile(fe, os, buffer);
                }

                // Write the data offsets into the directory of the compound stream
                it = entries.GetEnumerator();
                while (it.MoveNext())
                {
                    FileEntry fe = (FileEntry)it.Current;
                    os.Seek(fe.directoryOffset);
                    os.WriteLong(fe.dataOffset);
                }

                // Close the output stream. Set the os to null before trying to
                // close so that if an exception occurs during the close, the
                // finally clause below will not attempt to close the stream
                // the second time.
                IndexOutput tmp = os;
                os = null;
                tmp.Close();
            }
            finally
            {
                if (os != null)
                {
                    try
                    {
                        os.Close();
                    }
                    catch (System.IO.IOException)
                    {
                    }
                }
            }
        }
Exemple #31
0
 /// <summary>
 /// Write as a d-gaps list </summary>
 private void WriteClearedDgaps(IndexOutput output)
 {
     output.WriteInt(-1); // mark using d-gaps
     output.WriteInt(Size()); // write size
     output.WriteInt(Count()); // write count
     int last = 0;
     int numCleared = Size() - Count();
     for (int i = 0; i < Bits.Length && numCleared > 0; i++)
     {
         if (Bits[i] != unchecked((byte)0xff))
         {
             output.WriteVInt(i - last);
             output.WriteByte(Bits[i]);
             last = i;
             numCleared -= (8 - BitUtil.BitCount(Bits[i]));
             Debug.Assert(numCleared >= 0 || (i == (Bits.Length - 1) && numCleared == -(8 - (Size_Renamed & 7))));
         }
     }
 }
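A hypothetical counterpart that reads the d-gaps list back (assumed to run after the leading -1 marker has been consumed): only bytes that are not 0xff were stored, so the reader starts from an all-set array and patches the stored bytes in by gap.

 private void ReadClearedDgaps(IndexInput input)
 {
     int size = input.ReadInt();              // matches WriteInt(Size()) above
     int count = input.ReadInt();             // matches WriteInt(Count()) above
     byte[] bits = new byte[(size >> 3) + 1]; // sizing convention assumed
     for (int i = 0; i < bits.Length; i++)
     {
         bits[i] = 0xff;                      // all bits set by default
     }
     int last = 0;
     int numCleared = size - count;
     while (numCleared > 0)
     {
         last += input.ReadVInt();            // gap to the next patched byte
         bits[last] = input.ReadByte();
         numCleared -= 8 - BitUtil.BitCount(bits[last]);
     }
 }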