Example #1
        private void MergeNorms()
        {
            byte[]      normBuffer = null;
            IndexOutput output     = null;

            try
            {
                int numFieldInfos = fieldInfos.Size();
                for (int i = 0; i < numFieldInfos; i++)
                {
                    FieldInfo fi = fieldInfos.FieldInfo(i);
                    if (fi.isIndexed && !fi.omitNorms)
                    {
                        if (output == null)
                        {
                            output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
                            output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length);
                        }
                        foreach (IndexReader reader in readers)
                        {
                            int         maxDoc = reader.MaxDoc();
                            if (normBuffer == null || normBuffer.Length < maxDoc)
                            {
                                // the buffer is too small for the current segment
                                normBuffer = new byte[maxDoc];
                            }
                            reader.Norms(fi.name, normBuffer, 0);
                            if (!reader.HasDeletions())
                            {
                                // optimized case for segments without deleted docs
                                output.WriteBytes(normBuffer, maxDoc);
                            }
                            else
                            {
                                // this segment has deleted docs, so we have to
                                // check for every doc if it is deleted or not
                                for (int k = 0; k < maxDoc; k++)
                                {
                                    if (!reader.IsDeleted(k))
                                    {
                                        output.WriteByte(normBuffer[k]);
                                    }
                                }
                            }
                            checkAbort.Work(maxDoc);
                        }
                    }
                }
            }
            finally
            {
                if (output != null)
                {
                    output.Close();
                }
            }
        }
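
A quick sanity check on the merged output: the .nrm layout written above is just NORMS_HEADER followed by one norm byte per surviving document for every field that keeps norms. A hedged sketch of that size invariant (fieldsWithNorms and totalLiveDocs are hypothetical counters the caller would track; compare the similar assert in Example #6):

        // Sketch only: expected .nrm size given the layout written above.
        // fieldsWithNorms = indexed fields that do not omit norms;
        // totalLiveDocs   = total MaxDoc() across readers minus deleted docs.
        long expectedSize = NORMS_HEADER.Length + (long)fieldsWithNorms * totalLiveDocs;
        System.Diagnostics.Debug.Assert(output.GetFilePointer() == expectedSize);
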
Example #2
        /// <summary>Add a new position &amp; payload </summary>
        internal override void AddPosition(int position, byte[] payload, int payloadOffset, int payloadLength)
        {
            System.Diagnostics.Debug.Assert(!omitTermFreqAndPositions, "omitTermFreqAndPositions is true");
            System.Diagnostics.Debug.Assert(out_Renamed != null);

            int delta = position - lastPosition;

            lastPosition = position;

            if (storePayloads)
            {
                if (payloadLength != lastPayloadLength)
                {
                    lastPayloadLength = payloadLength;
                    out_Renamed.WriteVInt((delta << 1) | 1);
                    out_Renamed.WriteVInt(payloadLength);
                }
                else
                {
                    out_Renamed.WriteVInt(delta << 1);
                }
                if (payloadLength > 0)
                {
                    out_Renamed.WriteBytes(payload, payloadOffset, payloadLength); // write from the caller's offset
                }
            }
            else
            {
                out_Renamed.WriteVInt(delta);
            }
        }
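
The `(delta << 1) | 1` packing above stores the position delta in the high bits and uses the low bit to flag that a new payload length follows. A minimal decode-side sketch, assuming an IndexInput (`input`) positioned at one encoded position; it mirrors the writer rather than quoting the actual Lucene.Net reader:

            // Sketch: decode one position written by AddPosition when
            // storePayloads is true; lastPayloadLength persists across calls.
            int code = input.ReadVInt();
            position += code >> 1;                     // high bits: position delta
            if ((code & 1) != 0)
            {
                lastPayloadLength = input.ReadVInt();  // low bit set: new length
            }
            if (lastPayloadLength > 0)
            {
                input.ReadBytes(payloadBuffer, 0, lastPayloadLength);
            }
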
Example #3
        private void WriteTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
        {
            // TODO: UTF16toUTF8 could tell us this prefix
            // Compute prefix in common with last term:
            int start = 0;
            int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;

            while (start < limit)
            {
                if (termBytes[start] != lastTermBytes[start])
                {
                    break;
                }
                start++;
            }

            int length = termBytesLength - start;

            output.WriteVInt(start);                     // write shared prefix length
            output.WriteVInt(length);                    // write delta length
            output.WriteBytes(termBytes, start, length); // write delta bytes
            output.WriteVInt(fieldNumber);               // write field num
            if (lastTermBytes.Length < termBytesLength)
            {
                byte[] newArray = new byte[(int)(termBytesLength * 1.5)];
                Array.Copy(lastTermBytes, 0, newArray, 0, start);
                lastTermBytes = newArray;
            }
            Array.Copy(termBytes, start, lastTermBytes, start, length);
            lastTermBytesLength = termBytesLength;
        }
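
The read side rebuilds each term by keeping the previous term's bytes and splicing new bytes in after the shared prefix. A hedged sketch of that inverse (`input` and `termBuffer` are assumed locals; this is not the verbatim Lucene.Net reader):

            // Sketch: decode one prefix-coded entry written by WriteTerm above.
            int prefix = input.ReadVInt();                    // shared with previous term
            int suffix = input.ReadVInt();                    // unique to this term
            if (termBuffer.Length < prefix + suffix)
            {
                byte[] grown = new byte[(int)((prefix + suffix) * 1.5)];
                Array.Copy(termBuffer, 0, grown, 0, prefix);  // keep the shared prefix
                termBuffer = grown;
            }
            input.ReadBytes(termBuffer, prefix, suffix);      // splice in the suffix
            int fieldNumber    = input.ReadVInt();
            int termByteLength = prefix + suffix;
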
Example #4
        public long WriteTo(IndexOutput out_Renamed)
        {
            long size = 0;

            while (true)
            {
                if (limit + bufferOffset == endIndex)
                {
                    // last slice: write the tail of the buffer and stop
                    System.Diagnostics.Debug.Assert(endIndex - bufferOffset >= upto);
                    out_Renamed.WriteBytes(buffer, upto, limit - upto);
                    size += limit - upto;
                    break;
                }
                else
                {
                    // interior slice: flush it fully, then advance to the next slice
                    out_Renamed.WriteBytes(buffer, upto, limit - upto);
                    size += limit - upto;
                    NextSlice();
                }
            }

            return size;
        }
Example #5
        /// <summary>Copy the contents of the file with specified extension into the
        /// provided output stream. Use the provided buffer for moving data
        /// to reduce memory allocation.
        /// </summary>
        private void CopyFile(FileEntry source, IndexOutput os, byte[] buffer)
        {
            IndexInput is_Renamed = null;

            try
            {
                long startPtr = os.GetFilePointer();

                is_Renamed = directory.OpenInput(source.file);
                long length    = is_Renamed.Length();
                long remainder = length;
                int  chunk     = buffer.Length;

                while (remainder > 0)
                {
                    int len = (int)System.Math.Min(chunk, remainder);
                    is_Renamed.ReadBytes(buffer, 0, len, false);
                    os.WriteBytes(buffer, len);
                    remainder -= len;
                    if (checkAbort != null)
                    {
                        // Roughly every 2 MB we will check if
                        // it's time to abort
                        checkAbort.Work(80);
                    }
                }

                // Verify that remainder is 0
                if (remainder != 0)
                {
                    throw new System.IO.IOException("Non-zero remainder length after copying: " + remainder + " (id: " + source.file + ", length: " + length + ", buffer size: " + chunk + ")");
                }

                // Verify that the output length diff is equal to original file
                long endPtr = os.GetFilePointer();
                long diff   = endPtr - startPtr;
                if (diff != length)
                {
                    throw new System.IO.IOException("Difference in the output file offsets " + diff + " does not match the original file length " + length);
                }
            }
            finally
            {
                if (is_Renamed != null)
                {
                    is_Renamed.Close();
                }
            }
        }
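
The buffer parameter exists so a caller can allocate once and reuse the same array for every file in the compound archive. A hypothetical usage sketch (the entry list and buffer size are illustrative, not taken from the source):

            // Sketch: one shared buffer for all copies, as the summary suggests.
            byte[] buffer = new byte[16384];
            foreach (FileEntry entry in entries)
            {
                CopyFile(entry, os, buffer);
            }
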
Example #6
        /// <summary>Produce _X.nrm if any document had a field with norms
        /// not disabled
        /// </summary>
        public override void Flush(System.Collections.IDictionary threadsAndFields, SegmentWriteState state)
        {
            System.Collections.IDictionary byField = new System.Collections.Hashtable();

            // Typically, each thread will have encountered the same
            // field.  So first we collate by field, i.e., all
            // per-thread field instances that correspond to the
            // same FieldInfo
            foreach (System.Collections.DictionaryEntry entry in new System.Collections.Hashtable(threadsAndFields))
            {

                System.Collections.ICollection fields         = (System.Collections.ICollection)entry.Value;
                System.Collections.ArrayList   fieldsToRemove = new System.Collections.ArrayList();

                foreach (System.Collections.DictionaryEntry fieldEntry in fields)
                {
                    NormsWriterPerField perField = (NormsWriterPerField)fieldEntry.Key;

                    if (perField.upto > 0)
                    {
                        // It has some norms
                        System.Collections.IList l = (System.Collections.IList)byField[perField.fieldInfo];
                        if (l == null)
                        {
                            l = new System.Collections.ArrayList();
                            byField[perField.fieldInfo] = l;
                        }
                        l.Add(perField);
                    }
                    else
                    {
                        // Remove this field since we haven't seen it
                        // since the previous flush
                        fieldsToRemove.Add(perField);
                    }
                }

                System.Collections.Hashtable fieldsHT = (System.Collections.Hashtable)fields;
                for (int i = 0; i < fieldsToRemove.Count; i++)
                {
                    fieldsHT.Remove(fieldsToRemove[i]);
                }
            }

            System.String normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION;
            state.flushedFiles[normsFileName] = normsFileName;
            IndexOutput normsOut = state.directory.CreateOutput(normsFileName);

            try
            {
                normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length);

                int numField = fieldInfos.Size();

                int normCount = 0;

                for (int fieldNumber = 0; fieldNumber < numField; fieldNumber++)
                {
                    FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber);

                    System.Collections.IList toMerge = (System.Collections.IList)byField[fieldInfo];
                    int upto = 0;
                    if (toMerge != null)
                    {
                        int numFields = toMerge.Count;

                        normCount++;

                        NormsWriterPerField[] fields = new NormsWriterPerField[numFields];
                        int[] uptos = new int[numFields];

                        for (int j = 0; j < numFields; j++)
                        {
                            fields[j] = (NormsWriterPerField)toMerge[j];
                        }

                        int numLeft = numFields;

                        while (numLeft > 0)
                        {
                            System.Diagnostics.Debug.Assert(uptos[0] < fields[0].docIDs.Length, "uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.Length));

                            int minLoc   = 0;
                            int minDocID = fields[0].docIDs[uptos[0]];

                            for (int j = 1; j < numLeft; j++)
                            {
                                int docID = fields[j].docIDs[uptos[j]];
                                if (docID < minDocID)
                                {
                                    minDocID = docID;
                                    minLoc   = j;
                                }
                            }

                            System.Diagnostics.Debug.Assert(minDocID < state.numDocs);

                            // Fill hole
                            for (; upto < minDocID; upto++)
                            {
                                normsOut.WriteByte(defaultNorm);
                            }

                            normsOut.WriteByte(fields[minLoc].norms[uptos[minLoc]]);
                            uptos[minLoc]++;
                            upto++;

                            if (uptos[minLoc] == fields[minLoc].upto)
                            {
                                fields[minLoc].Reset();
                                if (minLoc != numLeft - 1)
                                {
                                    fields[minLoc] = fields[numLeft - 1];
                                    uptos[minLoc]  = uptos[numLeft - 1];
                                }
                                numLeft--;
                            }
                        }

                        // Fill final hole with defaultNorm
                        for (; upto < state.numDocs; upto++)
                        {
                            normsOut.WriteByte(defaultNorm);
                        }
                    }
                    else if (fieldInfo.isIndexed && !fieldInfo.omitNorms)
                    {
                        normCount++;
                        // Fill entire field with default norm:
                        for (; upto < state.numDocs; upto++)
                        {
                            normsOut.WriteByte(defaultNorm);
                        }
                    }

                    System.Diagnostics.Debug.Assert(4 + normCount * state.numDocs == normsOut.GetFilePointer(), ".nrm file size mismatch: expected=" + (4 + normCount * state.numDocs) + " actual=" + normsOut.GetFilePointer());
                }
            }
            finally
            {
                normsOut.Close();
            }
        }
Example #7
        internal void WriteField(FieldInfo fi, Fieldable field)
        {
            // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
            // and field.GetBinaryValue() already returns the compressed value for a field
            // with IsCompressed()==true, so we disable compression in that case
            bool disableCompression = (field is FieldsReader.FieldForMerge);

            fieldsStream.WriteVInt(fi.number);
            byte bits = 0;

            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }

            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data;
                int    len;
                int    offset;
                if (disableCompression)
                {
                    // optimized case for merging, the data
                    // is already compressed
                    data = field.GetBinaryValue();
                    System.Diagnostics.Debug.Assert(data != null);
                    len    = field.GetBinaryLength();
                    offset = field.GetBinaryOffset();
                }
                else
                {
                    // check if it is a binary field
                    if (field.IsBinary())
                    {
                        data = CompressionTools.Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
                    }
                    else
                    {
                        byte[] x = System.Text.Encoding.UTF8.GetBytes(field.StringValue());
                        data = CompressionTools.Compress(x, 0, x.Length);
                    }
                    len    = data.Length;
                    offset = 0;
                }

                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, offset, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    byte[] data;
                    int    len;
                    int    offset;
                    data   = field.GetBinaryValue();
                    len    = field.GetBinaryLength();
                    offset = field.GetBinaryOffset();

                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, offset, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
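
On the read side, the same flag byte determines how the stored value is interpreted. A minimal unpacking sketch (the constants mirror the writer's; this is not quoted from the actual FieldsReader):

            // Sketch: unpack the per-field flag byte written by WriteField above.
            byte bits = fieldsStream.ReadByte();
            bool tokenized  = (bits & FieldsWriter.FIELD_IS_TOKENIZED)  != 0;
            bool binary     = (bits & FieldsWriter.FIELD_IS_BINARY)     != 0;
            bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
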
Example #8
        /// <summary>Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.perDocTvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);
            byte bits = (byte)(0x0);

            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto        = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;

            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j];
                int freq = posting.freq;

                char[] text2  = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int    start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes     = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                encoderUpto        = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();

            // NOTE: we clear, per-field, at the thread level,
            // because term vectors fully write themselves on each
            // field; this saves RAM (e.g., if a large doc has two large
            // fields w/ term vectors on) because we recycle/reuse
            // all RAM after each field:
            perThread.termsHashPerThread.Reset(false);
        }
Example #9
        /// <summary>Write as a bit set </summary>
        private void WriteBits(IndexOutput output)
        {
            output.WriteInt(Size());   // write size
            output.WriteInt(Count());  // write count
            output.WriteBytes(bits, bits.Length);
        }
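
The matching read path recovers both counters and then the packed bytes; with one bit per document the array spans (size >> 3) + 1 bytes. A sketch that mirrors the writer (not the verbatim BitVector reader):

            // Sketch: read back what WriteBits wrote above.
            size  = input.ReadInt();                // number of bits
            count = input.ReadInt();                // number of set bits
            bits  = new byte[(size >> 3) + 1];      // one bit per document
            input.ReadBytes(bits, 0, bits.Length);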