Beispiel #1
0
        /// <summary>
        /// Sole constructor. Stores the configuration, allocates the in-memory buffers and creates
        /// the term vectors index and data outputs, writing a codec header to each of them.
        /// </summary>
        /// <remarks>
        /// If anything fails after the index output has been created, that output is disposed
        /// (unless it has already been handed over to the index writer) and <c>Abort()</c> is called.
        /// </remarks>
        public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context, string formatName, CompressionMode compressionMode, int chunkSize)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(directory != null);
            }

            // Configuration taken from the arguments.
            this.directory = directory;
            this.segment = si.Name;
            this.segmentSuffix = segmentSuffix;
            this.compressionMode = compressionMode;
            this.compressor = compressionMode.NewCompressor();
            this.chunkSize = chunkSize;

            // Initial in-memory state.
            numDocs = 0;
            pendingDocs = new LinkedList<DocData>();
            termSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
            payloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
            lastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

            bool initialized = false;
            string indexFileName = IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
            IndexOutput indexOut = directory.CreateOutput(indexFileName, context);

            try
            {
                string dataFileName = IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
                vectorsStream = directory.CreateOutput(dataFileName, context);

                // Each output starts with its own codec header.
                string idxCodecName = formatName + CODEC_SFX_IDX;
                string datCodecName = formatName + CODEC_SFX_DAT;
                CodecUtil.WriteHeader(indexOut, idxCodecName, VERSION_CURRENT);
                CodecUtil.WriteHeader(vectorsStream, datCodecName, VERSION_CURRENT);
                if (Debugging.AssertsEnabled)
                {
                    // LUCENENET specific: Position was renamed from getFilePointer() to match FileStream
                    Debugging.Assert(CodecUtil.HeaderLength(datCodecName) == vectorsStream.Position);
                    Debugging.Assert(CodecUtil.HeaderLength(idxCodecName) == indexOut.Position);
                }

                // The index writer takes over the index output; clearing the local keeps the
                // failure path below from disposing it a second time.
                indexWriter = new CompressingStoredFieldsIndexWriter(indexOut);
                indexOut = null;

                vectorsStream.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                vectorsStream.WriteVInt32(chunkSize);
                writer = new BlockPackedWriter(vectorsStream, BLOCK_SIZE);

                const int initialBufferLength = 1024;
                positionsBuf = new int[initialBufferLength];
                startOffsetsBuf = new int[initialBufferLength];
                lengthsBuf = new int[initialBufferLength];
                payloadLengthsBuf = new int[initialBufferLength];

                initialized = true;
            }
            finally
            {
                if (!initialized)
                {
                    IOUtils.DisposeWhileHandlingException(indexOut);
                    Abort();
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Sole constructor. Stores the configuration, allocates the in-memory buffers and creates
        /// the term vectors index and data outputs, writing a codec header to each of them.
        /// </summary>
        /// <remarks>
        /// If anything fails after the index output has been created, that output is closed
        /// (unless it has already been handed over to the index writer) and <c>Abort()</c> is called.
        /// </remarks>
        public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context, string formatName, CompressionMode compressionMode, int chunkSize)
        {
            Debug.Assert(directory != null);

            // Configuration taken from the arguments.
            this.Directory = directory;
            this.Segment = si.Name;
            this.SegmentSuffix = segmentSuffix;
            this.CompressionMode = compressionMode;
            this.Compressor = compressionMode.NewCompressor();
            this.ChunkSize = chunkSize;

            // Initial in-memory state.
            NumDocs = 0;
            PendingDocs = new LinkedList<DocData>();
            TermSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
            PayloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
            LastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

            bool initialized = false;
            string indexFileName = IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
            IndexOutput indexOut = directory.CreateOutput(indexFileName, context);

            try
            {
                string dataFileName = IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_EXTENSION);
                VectorsStream = directory.CreateOutput(dataFileName, context);

                // Each output starts with its own codec header.
                string idxCodecName = formatName + CODEC_SFX_IDX;
                string datCodecName = formatName + CODEC_SFX_DAT;
                CodecUtil.WriteHeader(indexOut, idxCodecName, VERSION_CURRENT);
                CodecUtil.WriteHeader(VectorsStream, datCodecName, VERSION_CURRENT);
                Debug.Assert(CodecUtil.HeaderLength(datCodecName) == VectorsStream.FilePointer);
                Debug.Assert(CodecUtil.HeaderLength(idxCodecName) == indexOut.FilePointer);

                // The index writer takes over the index output; clearing the local keeps the
                // failure path below from closing it a second time.
                IndexWriter = new CompressingStoredFieldsIndexWriter(indexOut);
                indexOut = null;

                VectorsStream.WriteVInt(PackedInts.VERSION_CURRENT);
                VectorsStream.WriteVInt(chunkSize);
                Writer = new BlockPackedWriter(VectorsStream, BLOCK_SIZE);

                const int initialBufferLength = 1024;
                PositionsBuf = new int[initialBufferLength];
                StartOffsetsBuf = new int[initialBufferLength];
                LengthsBuf = new int[initialBufferLength];
                PayloadLengthsBuf = new int[initialBufferLength];

                initialized = true;
            }
            finally
            {
                if (!initialized)
                {
                    IOUtils.CloseWhileHandlingException(indexOut);
                    Abort();
                }
            }
        }
        /// <summary>
        /// Writes a single numeric doc-values field: one metadata entry goes to <c>Meta</c> and the
        /// encoded values go to <c>Data</c>.
        /// </summary>
        /// <remarks>
        /// <paramref name="values"/> is enumerated more than once (statistics pass, optional
        /// <c>WriteMissingBitset</c> call, encoding pass), so it is expected to yield the same
        /// sequence on every enumeration.
        /// </remarks>
        /// <param name="field">Field whose <c>Number</c> is written to the metadata.</param>
        /// <param name="values">Values to write; a <c>null</c> entry is encoded as 0 and, on the
        /// optimizing path, triggers the <c>WriteMissingBitset</c> call.</param>
        /// <param name="optimizeStorage">When <c>true</c>, min/max, a common divisor and up to 256
        /// distinct values are gathered to choose between table, GCD and delta encoding; when
        /// <c>false</c>, the values are only counted and delta encoding is used.</param>
        internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
        {
            long total = 0;
            long lowest = long.MaxValue;
            long highest = long.MinValue;
            long divisor = 0;
            bool sawNull = false;
            // TODO: more efficient?
            HashSet<long> distinct = null;

            if (!optimizeStorage)
            {
                // Nothing to analyze on this path: only the number of entries is needed.
                foreach (long? ignored in values)
                {
                    total++;
                }
            }
            else
            {
                distinct = new HashSet<long>();

                foreach (long? entry in values)
                {
                    if (!entry.HasValue)
                    {
                        sawNull = true;
                    }
                    long v = entry.GetValueOrDefault(); // a null entry counts as 0

                    if (divisor != 1)
                    {
                        bool extreme = v < long.MinValue / 2 || v > long.MaxValue / 2;
                        if (extreme)
                        {
                            // v - lowest might overflow here and make the GCD computation return
                            // wrong results. Such extreme values are unlikely, so GCD compression
                            // is simply given up for them.
                            divisor = 1;
                        }
                        else if (total != 0)
                        {
                            // lowest is only meaningful once at least one value has been seen
                            divisor = MathUtil.Gcd(divisor, v - lowest);
                        }
                    }

                    if (v < lowest)
                    {
                        lowest = v;
                    }
                    if (v > highest)
                    {
                        highest = v;
                    }

                    // Stop tracking distinct values once there are more than 256 of them.
                    if (distinct != null && distinct.Add(v) && distinct.Count > 256)
                    {
                        distinct = null;
                    }

                    total++;
                }
            }

            long delta = highest - lowest;

            // Table encoding wins when the ordinals need fewer bits than the raw deltas
            // (delta < 0 means the subtraction above overflowed).
            bool useTable = distinct != null
                && (delta < 0L || PackedInts.BitsRequired(distinct.Count - 1) < PackedInts.BitsRequired(delta))
                && total <= int.MaxValue;
            bool useGcd = divisor != 0 && divisor != 1;
            int format = useTable ? TABLE_COMPRESSED : (useGcd ? GCD_COMPRESSED : DELTA_COMPRESSED);

            Meta.WriteVInt(field.Number);
            Meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
            Meta.WriteVInt(format);
            if (!sawNull)
            {
                Meta.WriteLong(-1L);
            }
            else
            {
                // Record where the missing bitset starts, then write it.
                Meta.WriteLong(Data.FilePointer);
                WriteMissingBitset(values);
            }
            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Meta.WriteLong(Data.FilePointer);
            Meta.WriteVLong(total);
            Meta.WriteVInt(BLOCK_SIZE);

            if (format == GCD_COMPRESSED)
            {
                Meta.WriteLong(lowest);
                Meta.WriteLong(divisor);
                BlockPackedWriter quotients = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    quotients.Add((entry.GetValueOrDefault() - lowest) / divisor);
                }
                quotients.Finish();
            }
            else if (format == DELTA_COMPRESSED)
            {
                BlockPackedWriter deltas = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    deltas.Add(entry.GetValueOrDefault());
                }
                deltas.Finish();
            }
            else if (format == TABLE_COMPRESSED)
            {
                // LUCENE TO-DO: had a parameter before
                long[] decode = distinct.ToArray();
                Dictionary<long, int> encode = new Dictionary<long, int>();
                Meta.WriteVInt(decode.Length);
                for (int ord = 0; ord < decode.Length; ord++)
                {
                    long tableValue = decode[ord];
                    Meta.WriteLong(tableValue);
                    encode[tableValue] = ord;
                }
                int bitsRequired = PackedInts.BitsRequired(distinct.Count - 1);
                PackedInts.Writer ords = PackedInts.GetWriterNoHeader(Data, PackedInts.Format.PACKED, (int)total, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE);
                foreach (long? entry in values)
                {
                    ords.Add(encode[entry.GetValueOrDefault()]);
                }
                ords.Finish();
            }
            else
            {
                throw new InvalidOperationException();
            }
        }
        /// <summary>
        /// Writes a single numeric doc-values field: one metadata entry goes to <c>Meta</c> and the
        /// encoded values go to <c>Data</c>.
        /// </summary>
        /// <remarks>
        /// <paramref name="values"/> is enumerated more than once (statistics pass, optional
        /// <c>WriteMissingBitset</c> call, encoding pass), so it is expected to yield the same
        /// sequence on every enumeration.
        /// </remarks>
        /// <param name="field">Field whose <c>Number</c> is written to the metadata.</param>
        /// <param name="values">Values to write; a <c>null</c> entry is encoded as 0 and, on the
        /// optimizing path, triggers the <c>WriteMissingBitset</c> call.</param>
        /// <param name="optimizeStorage">When <c>true</c>, min/max, a common divisor and up to 256
        /// distinct values are gathered to choose between table, GCD and delta encoding; when
        /// <c>false</c>, the values are only counted and delta encoding is used.</param>
        internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
        {
            long total = 0;
            long lowest = long.MaxValue;
            long highest = long.MinValue;
            long divisor = 0;
            bool sawNull = false;
            // TODO: more efficient?
            HashSet<long> distinct = null;

            if (!optimizeStorage)
            {
                // Nothing to analyze on this path: only the number of entries is needed.
                foreach (long? ignored in values)
                {
                    total++;
                }
            }
            else
            {
                distinct = new HashSet<long>();

                foreach (long? entry in values)
                {
                    if (!entry.HasValue)
                    {
                        sawNull = true;
                    }
                    long v = entry.GetValueOrDefault(); // a null entry counts as 0

                    if (divisor != 1)
                    {
                        bool extreme = v < long.MinValue / 2 || v > long.MaxValue / 2;
                        if (extreme)
                        {
                            // v - lowest might overflow here and make the GCD computation return
                            // wrong results. Such extreme values are unlikely, so GCD compression
                            // is simply given up for them.
                            divisor = 1;
                        }
                        else if (total != 0)
                        {
                            // lowest is only meaningful once at least one value has been seen
                            divisor = MathUtil.Gcd(divisor, v - lowest);
                        }
                    }

                    if (v < lowest)
                    {
                        lowest = v;
                    }
                    if (v > highest)
                    {
                        highest = v;
                    }

                    // Stop tracking distinct values once there are more than 256 of them.
                    if (distinct != null && distinct.Add(v) && distinct.Count > 256)
                    {
                        distinct = null;
                    }

                    total++;
                }
            }

            long delta = highest - lowest;

            // Table encoding wins when the ordinals need fewer bits than the raw deltas
            // (delta < 0 means the subtraction above overflowed).
            bool useTable = distinct != null
                && (delta < 0L || PackedInts.BitsRequired(distinct.Count - 1) < PackedInts.BitsRequired(delta))
                && total <= int.MaxValue;
            bool useGcd = divisor != 0 && divisor != 1;
            int format = useTable ? TABLE_COMPRESSED : (useGcd ? GCD_COMPRESSED : DELTA_COMPRESSED);

            Meta.WriteVInt(field.Number);
            Meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
            Meta.WriteVInt(format);
            if (!sawNull)
            {
                Meta.WriteLong(-1L);
            }
            else
            {
                // Record where the missing bitset starts, then write it.
                Meta.WriteLong(Data.FilePointer);
                WriteMissingBitset(values);
            }
            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Meta.WriteLong(Data.FilePointer);
            Meta.WriteVLong(total);
            Meta.WriteVInt(BLOCK_SIZE);

            if (format == GCD_COMPRESSED)
            {
                Meta.WriteLong(lowest);
                Meta.WriteLong(divisor);
                BlockPackedWriter quotients = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    quotients.Add((entry.GetValueOrDefault() - lowest) / divisor);
                }
                quotients.Finish();
            }
            else if (format == DELTA_COMPRESSED)
            {
                BlockPackedWriter deltas = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    deltas.Add(entry.GetValueOrDefault());
                }
                deltas.Finish();
            }
            else if (format == TABLE_COMPRESSED)
            {
                // LUCENE TO-DO: had a parameter before
                long[] decode = distinct.ToArray();
                Dictionary<long, int> encode = new Dictionary<long, int>();
                Meta.WriteVInt(decode.Length);
                for (int ord = 0; ord < decode.Length; ord++)
                {
                    long tableValue = decode[ord];
                    Meta.WriteLong(tableValue);
                    encode[tableValue] = ord;
                }
                int bitsRequired = PackedInts.BitsRequired(distinct.Count - 1);
                PackedInts.Writer ords = PackedInts.GetWriterNoHeader(Data, PackedInts.Format.PACKED, (int)total, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE);
                foreach (long? entry in values)
                {
                    ords.Add(encode[entry.GetValueOrDefault()]);
                }
                ords.Finish();
            }
            else
            {
                throw new InvalidOperationException();
            }
        }
        /// <summary>
        /// Writes a single numeric doc-values field: a metadata entry goes to <c>Meta</c> and the
        /// encoded values go to <c>Data</c>.
        /// </summary>
        /// <remarks>
        /// The encoding is chosen as follows:
        /// <list type="bullet">
        /// <item><description>at most 256 distinct values were seen: raw bytes when the fastest packed
        /// format uses 8 bits per value and every value fits in an <see cref="sbyte"/>, otherwise a
        /// lookup table plus packed ordinals;</description></item>
        /// <item><description>otherwise, when the common divisor is neither 0 nor 1: GCD
        /// compression;</description></item>
        /// <item><description>otherwise: delta compression.</description></item>
        /// </list>
        /// <paramref name="values"/> is enumerated more than once, so it is expected to yield the same
        /// sequence on every enumeration.
        /// </remarks>
        /// <param name="field">Field whose <c>Number</c> is written to the metadata.</param>
        /// <param name="values">Values to write. The element type is the non-nullable
        /// <see cref="long"/>, so there are no missing entries to handle.</param>
        /// <param name="optimizeStorage">When <c>true</c>, the values are analyzed (min/max, common
        /// divisor, distinct values) to pick an encoding; when <c>false</c>, no analysis is done and
        /// delta compression is used.</param>
        internal virtual void AddNumericField(FieldInfo field, IEnumerable<long> values, bool optimizeStorage)
        {
            Meta.WriteVInt(field.Number);
            Meta.WriteByte(Lucene42DocValuesProducer.NUMBER);
            Meta.WriteLong(Data.FilePointer);
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd = 0;
            // TODO: more efficient?
            HashSet<long> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new HashSet<long>();

                long count = 0;
                foreach (long v in values)
                {
                    // TODO: support this as MemoryDVFormat (and be smart about missing maybe)

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    // Stop tracking distinct values once there are more than 256 of them.
                    if (uniqueValues != null && uniqueValues.Add(v) && uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }

                    ++count;
                }
                Debug.Assert(count == MaxDoc);
            }

            if (uniqueValues != null)
            {
                // small number of unique values
                int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
                if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    Meta.WriteByte(Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed
                    foreach (long nv in values)
                    {
                        // every value was checked above to lie within the sbyte range
                        Data.WriteByte((byte)nv);
                    }
                }
                else
                {
                    Meta.WriteByte(Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed
                    long[] decode = uniqueValues.ToArray();
                    Dictionary<long, int> encode = new Dictionary<long, int>();
                    Data.WriteVInt(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        Data.WriteLong(decode[i]);
                        encode[decode[i]] = i;
                    }

                    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                    Data.WriteVInt(formatAndBits.format.id);
                    Data.WriteVInt(formatAndBits.bitsPerValue);

                    PackedInts.Writer writer = PackedInts.GetWriterNoHeader(Data, formatAndBits.format, MaxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
                    foreach (long nv in values)
                    {
                        writer.Add(encode[nv]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                Meta.WriteByte(Lucene42DocValuesProducer.GCD_COMPRESSED);
                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Data.WriteLong(minValue);
                Data.WriteLong(gcd);
                Data.WriteVInt(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long nv in values)
                {
                    writer.Add((nv - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                Meta.WriteByte(Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed

                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Data.WriteVInt(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long nv in values)
                {
                    writer.Add(nv);
                }
                writer.Finish();
            }
        }
        /// <summary>
        /// Writes a single numeric doc-values field: one metadata entry goes to <c>meta</c> and the
        /// encoded values go to <c>data</c>.
        /// </summary>
        /// <remarks>
        /// <paramref name="values"/> is enumerated more than once (statistics pass, optional
        /// <c>WriteMissingBitset</c> call, encoding pass), so it is expected to yield the same
        /// sequence on every enumeration.
        /// </remarks>
        /// <param name="field">Field whose <c>Number</c> is written to the metadata.</param>
        /// <param name="values">Values to write; a <c>null</c> entry is encoded as 0 and, on the
        /// optimizing path, triggers the <c>WriteMissingBitset</c> call.</param>
        /// <param name="optimizeStorage">When <c>true</c>, min/max, a common divisor and up to 256
        /// distinct values are gathered to choose between table, GCD and delta encoding; when
        /// <c>false</c>, the values are only counted and delta encoding is used.</param>
        internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
        {
            long total = 0;
            long lowest = long.MaxValue;
            long highest = long.MinValue;
            long divisor = 0;
            bool sawNull = false;

            // TODO: more efficient?
            JCG.HashSet<long> distinct = null;

            if (!optimizeStorage)
            {
                // Nothing to analyze on this path: only the number of entries is needed.
                foreach (long? ignored in values)
                {
                    total++;
                }
            }
            else
            {
                distinct = new JCG.HashSet<long>();

                foreach (long? entry in values)
                {
                    if (!entry.HasValue)
                    {
                        sawNull = true;
                    }
                    long v = entry.GetValueOrDefault(); // a null entry counts as 0

                    if (divisor != 1)
                    {
                        bool extreme = v < long.MinValue / 2 || v > long.MaxValue / 2;
                        if (extreme)
                        {
                            // v - lowest might overflow here and make the GCD computation return
                            // wrong results. Such extreme values are unlikely, so GCD compression
                            // is simply given up for them.
                            divisor = 1;
                        }
                        else if (total != 0)
                        {
                            // lowest is only meaningful once at least one value has been seen
                            divisor = MathUtil.Gcd(divisor, v - lowest);
                        }
                    }

                    if (v < lowest)
                    {
                        lowest = v;
                    }
                    if (v > highest)
                    {
                        highest = v;
                    }

                    // Stop tracking distinct values once there are more than 256 of them.
                    if (distinct != null && distinct.Add(v) && distinct.Count > 256)
                    {
                        distinct = null;
                    }

                    total++;
                }
            }

            long delta = highest - lowest;

            // Table encoding wins when the ordinals need fewer bits than the raw deltas
            // (delta < 0 means the subtraction above overflowed).
            bool useTable = distinct != null
                && (delta < 0L || PackedInt32s.BitsRequired(distinct.Count - 1) < PackedInt32s.BitsRequired(delta))
                && total <= int.MaxValue;
            bool useGcd = divisor != 0 && divisor != 1;
            int format = useTable ? TABLE_COMPRESSED : (useGcd ? GCD_COMPRESSED : DELTA_COMPRESSED);

            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
            meta.WriteVInt32(format);
            if (!sawNull)
            {
                meta.WriteInt64(-1L);
            }
            else
            {
                // Record where the missing bitset starts, then write it.
                // LUCENENET specific: Position was renamed from getFilePointer() to match FileStream
                meta.WriteInt64(data.Position);
                WriteMissingBitset(values);
            }
            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            meta.WriteVInt64(total);
            meta.WriteVInt32(BLOCK_SIZE);

            if (format == GCD_COMPRESSED)
            {
                meta.WriteInt64(lowest);
                meta.WriteInt64(divisor);
                BlockPackedWriter quotients = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    quotients.Add((entry.GetValueOrDefault() - lowest) / divisor);
                }
                quotients.Finish();
            }
            else if (format == DELTA_COMPRESSED)
            {
                BlockPackedWriter deltas = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long? entry in values)
                {
                    deltas.Add(entry.GetValueOrDefault());
                }
                deltas.Finish();
            }
            else if (format == TABLE_COMPRESSED)
            {
                // LUCENENET NOTE: dimensioning an array and then using .CopyTo() for better efficiency than LINQ .ToArray()
                long[] decode = new long[distinct.Count];
                distinct.CopyTo(decode, 0);
                Dictionary<long, int> encode = new Dictionary<long, int>();
                meta.WriteVInt32(decode.Length);
                for (int ord = 0; ord < decode.Length; ord++)
                {
                    long tableValue = decode[ord];
                    meta.WriteInt64(tableValue);
                    encode[tableValue] = ord;
                }
                int bitsRequired = PackedInt32s.BitsRequired(distinct.Count - 1);
                PackedInt32s.Writer ords = PackedInt32s.GetWriterNoHeader(data, PackedInt32s.Format.PACKED, (int)total, bitsRequired, PackedInt32s.DEFAULT_BUFFER_SIZE);
                foreach (long? entry in values)
                {
                    ords.Add(encode[entry.GetValueOrDefault()]);
                }
                ords.Finish();
            }
            else
            {
                throw AssertionError.Create();
            }
        }
Beispiel #7
0
        /// <summary>
        /// Writes a single numeric doc-values field: a metadata entry goes to <c>meta</c> and the
        /// encoded values go to <c>data</c>.
        /// </summary>
        /// <remarks>
        /// The encoding is chosen as follows:
        /// <list type="bullet">
        /// <item><description>at most 256 distinct values were seen: raw bytes when the fastest packed
        /// format uses 8 bits per value and every value fits in an <see cref="sbyte"/>, otherwise a
        /// lookup table plus packed ordinals;</description></item>
        /// <item><description>otherwise, when the common divisor is neither 0 nor 1: GCD
        /// compression;</description></item>
        /// <item><description>otherwise: delta compression.</description></item>
        /// </list>
        /// <paramref name="values"/> is enumerated more than once, so it is expected to yield the same
        /// sequence on every enumeration.
        /// </remarks>
        /// <param name="field">Field whose <c>Number</c> is written to the metadata.</param>
        /// <param name="values">Values to write; a <c>null</c> entry is encoded as 0. Null entries are
        /// only detected on the optimizing path, where they cause <c>WriteMissingBitset</c> to be
        /// called.</param>
        /// <param name="optimizeStorage">When <c>true</c>, the values are analyzed (min/max, common
        /// divisor, distinct values, nulls) to pick an encoding; when <c>false</c>, no analysis is
        /// done and delta compression is used.</param>
        internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte(MemoryDocValuesProducer.NUMBER);
            meta.WriteInt64(data.GetFilePointer());
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd = 0;
            bool missing = false;
            // TODO: more efficient?
            // Only non-null values are ever added (null is mapped to 0 first), so the set
            // and the encode table below use the plain long/int types.
            ISet<long> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new JCG.HashSet<long>();

                long count = 0;
                foreach (var nv in values)
                {
                    long v;
                    if (nv == null)
                    {
                        v = 0;
                        missing = true;
                    }
                    else
                    {
                        v = nv.Value;
                    }

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    // Stop tracking distinct values once there are more than 256 of them.
                    if (uniqueValues != null && uniqueValues.Add(v) && uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }

                    ++count;
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(count == maxDoc);
                }
            }

            if (missing)
            {
                // Write the missing bitset and record its start offset and length.
                long start = data.GetFilePointer();
                WriteMissingBitset(values);
                meta.WriteInt64(start);
                meta.WriteInt64(data.GetFilePointer() - start);
            }
            else
            {
                meta.WriteInt64(-1L);
            }

            if (uniqueValues != null)
            {
                // small number of unique values

                int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue,
                                                                                acceptableOverheadRatio);
                if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed
                    foreach (var nv in values)
                    {
                        data.WriteByte((byte)nv.GetValueOrDefault());
                    }
                }
                else
                {
                    meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed
                    long[] decode = new long[uniqueValues.Count];
                    uniqueValues.CopyTo(decode, 0);

                    var encode = new Dictionary<long, int>();
                    data.WriteVInt32(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        data.WriteInt64(decode[i]);
                        encode[decode[i]] = i;
                    }

                    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                    data.WriteVInt32(formatAndBits.Format.Id);
                    data.WriteVInt32(formatAndBits.BitsPerValue);

                    PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc,
                                                                                formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                    foreach (var nv in values)
                    {
                        // a null entry is looked up as 0, matching the statistics pass
                        writer.Add(encode[nv.GetValueOrDefault()]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED);
                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteInt64(minValue);
                data.WriteInt64(gcd);
                data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
                foreach (var nv in values)
                {
                    writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed

                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
                foreach (var nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
            }
        }
Beispiel #8
0
        /// <summary>
        /// Serializes one numeric field. A first pass over <paramref name="values"/> gathers the
        /// minimum, the maximum, a running GCD of the differences to the running minimum and up to
        /// 256 distinct values; those statistics select the encoding (uncompressed bytes,
        /// table-compressed, GCD-compressed or delta-compressed) used in a second pass.
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> is recorded in the metadata stream.</param>
        /// <param name="values">Per-document values; enumerated twice and asserted to be non-null.</param>
        public override void AddNumericField(FieldInfo field, IEnumerable <long?> values)
        {
            // Entry header: field number, entry type, start offset of this field's data.
            Meta.WriteVInt(field.Number);
            Meta.WriteByte((byte)NUMBER);
            Meta.WriteLong(Data.FilePointer);

            long lowest = long.MaxValue;
            long highest = long.MinValue;
            long divisor = 0;
            long seen = 0;
            // TODO: more efficient?
            var distinct = new HashSet<long>();

            foreach (var item in values)
            {
                Debug.Assert(item != null);
                long current = item.Value;

                if (divisor != 1)
                {
                    bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
                    if (extreme)
                    {
                        // current - lowest might overflow and corrupt the GCD; such extreme
                        // values are unlikely, so GCD computation is simply given up.
                        divisor = 1;
                    }
                    else if (seen != 0)
                    {
                        // Skipped for the very first value: the minimum has to be set first.
                        divisor = MathUtil.Gcd(divisor, current - lowest);
                    }
                }

                if (current < lowest)
                {
                    lowest = current;
                }
                if (current > highest)
                {
                    highest = current;
                }

                // Stop tracking distinct values once there are more than 256 of them.
                if (distinct != null && distinct.Add(current) && distinct.Count > 256)
                {
                    distinct = null;
                }

                ++seen;
            }
            Debug.Assert(seen == MaxDoc);

            if (distinct == null)
            {
                // Too many distinct values: block-packed output, optionally GCD-reduced.
                bool useGcd = divisor != 0 && divisor != 1;
                if (useGcd)
                {
                    Meta.WriteByte((byte)GCD_COMPRESSED);
                    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                    Data.WriteLong(lowest);
                    Data.WriteLong(divisor);
                    Data.WriteVInt(BLOCK_SIZE);
                }
                else
                {
                    Meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed
                    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                    Data.WriteVInt(BLOCK_SIZE);
                }

                var blockWriter = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (var item in values)
                {
                    long raw = item ?? 0L;
                    blockWriter.Add(useGcd ? (raw - lowest) / divisor : raw);
                }
                blockWriter.Finish();
                return;
            }

            // small number of unique values
            int bitsNeeded = PackedInts.BitsRequired(distinct.Count - 1);
            FormatAndBits chosen = PackedInts.FastestFormatAndBits(MaxDoc, bitsNeeded, AcceptableOverheadRatio);
            bool fitsInSignedByte = lowest >= sbyte.MinValue && highest <= sbyte.MaxValue;

            if (chosen.bitsPerValue == 8 && fitsInSignedByte)
            {
                Meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
                foreach (var item in values)
                {
                    Data.WriteByte(item.HasValue ? (byte)(sbyte)item.Value : (byte)0);
                }
                return;
            }

            Meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
            //LUCENE TO-DO, ToArray had a parameter to start
            long[] table = distinct.ToArray();
            var ordinalOf = new Dictionary<long, int>();
            Data.WriteVInt(table.Length);
            for (int ord = 0; ord < table.Length; ord++)
            {
                long entry = table[ord];
                Data.WriteLong(entry);
                ordinalOf[entry] = ord;
            }

            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Data.WriteVInt(chosen.format.id);
            Data.WriteVInt(chosen.bitsPerValue);

            PackedInts.Writer ordinalWriter = PackedInts.GetWriterNoHeader(Data, chosen.format, MaxDoc, chosen.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
            foreach (var item in values)
            {
                // Each value is stored as its position in the table written above.
                ordinalWriter.Add(ordinalOf[item ?? 0L]);
            }
            ordinalWriter.Finish();
        }
        /// <summary>
        /// Writes the values of one numeric field.
        /// <para>
        /// A first pass over <paramref name="values"/> tracks the minimum, the maximum, a running
        /// GCD of the differences to the running minimum, and the set of distinct values (dropped
        /// once it grows past 256 entries). A second pass then writes the values using one of:
        /// UNCOMPRESSED (one byte per value), TABLE_COMPRESSED (value table plus packed ordinals),
        /// GCD_COMPRESSED or DELTA_COMPRESSED (both through a <see cref="BlockPackedWriter"/>).
        /// </para>
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> is recorded in the metadata stream.</param>
        /// <param name="values">Values to write; enumerated twice. Their count is asserted to equal <c>MaxDoc</c>.</param>
        public override void AddNumericField(FieldInfo field, IEnumerable<long> values)
        {
            // Entry header: field number, entry type, start offset of this field's data.
            Meta.WriteVInt(field.Number);
            Meta.WriteByte(NUMBER);
            Meta.WriteLong(Data.FilePointer);
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd = 0;
            // TODO: more efficient?
            HashSet<long> uniqueValues = new HashSet<long>();

            long count = 0;
            // The elements are non-nullable longs, so no null checks are needed here.
            foreach (long v in values)
            {
                if (gcd != 1)
                {
                    if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                    {
                        // in that case v - minValue might overflow and make the GCD computation return
                        // wrong results. Since these extreme values are unlikely, we just discard
                        // GCD computation for them
                        gcd = 1;
                    } // minValue needs to be set first
                    else if (count != 0)
                    {
                        gcd = MathUtil.Gcd(gcd, v - minValue);
                    }
                }

                minValue = Math.Min(minValue, v);
                maxValue = Math.Max(maxValue, v);

                // Give up on the value table as soon as it would exceed 256 entries.
                if (uniqueValues != null && uniqueValues.Add(v) && uniqueValues.Count > 256)
                {
                    uniqueValues = null;
                }

                ++count;
            }
            Debug.Assert(count == MaxDoc);

            if (uniqueValues != null)
            {
                // small number of unique values
                int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
                if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    Meta.WriteByte(UNCOMPRESSED); // uncompressed
                    foreach (long v in values)
                    {
                        Data.WriteByte((sbyte)v);
                    }
                }
                else
                {
                    Meta.WriteByte(TABLE_COMPRESSED); // table-compressed
                    //LUCENE TO-DO, ToArray had a parameter to start
                    long[] decode = uniqueValues.ToArray();
                    Dictionary<long, int> encode = new Dictionary<long, int>();
                    Data.WriteVInt(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        Data.WriteLong(decode[i]);
                        encode[decode[i]] = i;
                    }

                    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                    Data.WriteVInt(formatAndBits.format.id);
                    Data.WriteVInt(formatAndBits.bitsPerValue);

                    PackedInts.Writer writer = PackedInts.GetWriterNoHeader(Data, formatAndBits.format, MaxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
                    foreach (long v in values)
                    {
                        // Each value is stored as its index in the table written above.
                        writer.Add(encode[v]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                Meta.WriteByte(GCD_COMPRESSED);
                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Data.WriteLong(minValue);
                Data.WriteLong(gcd);
                Data.WriteVInt(BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long v in values)
                {
                    // Store the quotient; minValue and gcd were written to the data stream above.
                    writer.Add((v - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                Meta.WriteByte(DELTA_COMPRESSED); // delta-compressed

                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Data.WriteVInt(BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, BLOCK_SIZE);
                foreach (long v in values)
                {
                    writer.Add(v);
                }
                writer.Finish();
            }
        }
Beispiel #10
0
        /// <summary>
        /// Writes one numeric field: collects min/max, a running GCD and up to 256 distinct values
        /// in a first pass over <paramref name="values"/>, then emits the values in a second pass
        /// as raw bytes, table ordinals, GCD-reduced blocks or plain block-packed values.
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> is recorded in the metadata stream.</param>
        /// <param name="values">Per-document values; enumerated twice.</param>
        public override void AddNumericField(FieldInfo field, IEnumerable <long?> values)
        {
            // Entry header: field number, entry type, start offset of this field's data.
            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)NUMBER);
            meta.WriteInt64(data.GetFilePointer());

            long lowest = long.MaxValue;
            long highest = long.MinValue;
            long divisor = 0;
            long seen = 0;

            // TODO: more efficient?
            JCG.HashSet<long> distinct = new JCG.HashSet<long>();

            foreach (var item in values)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(item != null);
                }
                long current = item.Value;

                if (divisor != 1)
                {
                    bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
                    if (extreme)
                    {
                        // current - lowest might overflow and make the GCD computation return
                        // wrong results; extreme values are unlikely, so GCD is abandoned.
                        divisor = 1;
                    }
                    else if (seen != 0)
                    {
                        // The minimum needs to be set by an earlier value first.
                        divisor = MathUtil.Gcd(divisor, current - lowest);
                    }
                }

                if (current < lowest)
                {
                    lowest = current;
                }
                if (current > highest)
                {
                    highest = current;
                }

                // More than 256 distinct values: stop tracking them.
                if (distinct != null && distinct.Add(current) && distinct.Count > 256)
                {
                    distinct = null;
                }

                ++seen;
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(seen == maxDoc);
            }

            if (distinct != null)
            {
                // small number of unique values
                int bitsNeeded = PackedInt32s.BitsRequired(distinct.Count - 1);
                FormatAndBits chosen = PackedInt32s.FastestFormatAndBits(maxDoc, bitsNeeded, acceptableOverheadRatio);
                bool fitsInSignedByte = lowest >= sbyte.MinValue && highest <= sbyte.MaxValue;

                if (chosen.BitsPerValue == 8 && fitsInSignedByte)
                {
                    meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
                    foreach (var item in values)
                    {
                        data.WriteByte((byte)(item ?? 0L));
                    }
                    return;
                }

                meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
                long[] table = new long[distinct.Count];
                distinct.CopyTo(table);
                Dictionary<long, int> ordinalOf = new Dictionary<long, int>();
                data.WriteVInt32(table.Length);
                for (int ord = 0; ord < table.Length; ord++)
                {
                    long entry = table[ord];
                    data.WriteInt64(entry);
                    ordinalOf[entry] = ord;
                }

                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteVInt32(chosen.Format.Id);
                data.WriteVInt32(chosen.BitsPerValue);

                PackedInt32s.Writer ordinalWriter = PackedInt32s.GetWriterNoHeader(data, chosen.Format, maxDoc, chosen.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                foreach (var item in values)
                {
                    // Each value is stored as its position in the table written above.
                    ordinalWriter.Add(ordinalOf[item ?? 0L]);
                }
                ordinalWriter.Finish();
                return;
            }

            if (divisor != 0 && divisor != 1)
            {
                meta.WriteByte((byte)GCD_COMPRESSED);
                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteInt64(lowest);
                data.WriteInt64(divisor);
                data.WriteVInt32(BLOCK_SIZE);

                BlockPackedWriter gcdWriter = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (var item in values)
                {
                    gcdWriter.Add(((item ?? 0L) - lowest) / divisor);
                }
                gcdWriter.Finish();
                return;
            }

            meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed

            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(BLOCK_SIZE);

            BlockPackedWriter deltaWriter = new BlockPackedWriter(data, BLOCK_SIZE);
            foreach (var item in values)
            {
                deltaWriter.Add(item ?? 0L);
            }
            deltaWriter.Finish();
        }
Beispiel #11
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: void addNumericField(index.FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws java.io.IOException
        /// <summary>
        /// Writes one numeric field to the <c>meta</c> and <c>data</c> outputs.
        /// <para>
        /// When <paramref name="optimizeStorage"/> is set, a first pass over <paramref name="values"/>
        /// collects min/max, a running GCD, up to 256 distinct values, and whether any entry was
        /// <c>null</c> (null entries count as 0). A missing-value bitset is written when a null was
        /// seen; otherwise <c>-1</c> is recorded in its place. The values are then written as raw
        /// bytes, table ordinals, GCD-reduced blocks or plain block-packed values.
        /// </para>
        /// <para>
        /// NOTE(review): this is unedited Java-to-C# converter output and still contains Java-only
        /// constructs (<c>Number</c>, <c>new HashSet &lt;&gt;()</c>, <c>toArray</c>, lower-case
        /// method names); it needs manual porting before it can compile as C#.
        /// </para>
        /// </summary>
        /// <param name="field">Field whose <c>number</c> is recorded in the metadata stream.</param>
        /// <param name="values">Per-document values, possibly containing nulls; enumerated more than once.</param>
        /// <param name="optimizeStorage">
        /// Enables the statistics pass. When false, <c>uniqueValues</c> stays null, <c>gcd</c> stays 0 and
        /// <c>missing</c> stays false, so the delta-compressed branch is always taken and no missing
        /// bitset is written.
        /// </param>
        internal virtual void addNumericField(FieldInfo field, IEnumerable <Number> values, bool optimizeStorage)
        {
            // Entry header: field number, entry type, start offset of this field's data.
            meta.writeVInt(field.number);
            meta.writeByte(NUMBER);
            meta.writeLong(data.FilePointer);
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            bool missing  = false;
            // TODO: more efficient?
            HashSet <long?> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new HashSet <>();

                long count = 0;
                foreach (Number nv in values)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final long v;
                    long v;
                    if (nv == null)
                    {
                        // A null entry is treated as 0 and remembered so a missing bitset gets written.
                        v       = 0;
                        missing = true;
                    }
                    else
                    {
                        v = (long)nv;
                    }

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        }   // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            // Stop tracking distinct values once there are more than 256 of them.
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
                Debug.Assert(count == maxDoc);
            }

            if (missing)
            {
                // Record where the missing-value bitset starts in the data output and how long it is.
                long start = data.FilePointer;
                writeMissingBitset(values);
                meta.writeLong(start);
                meta.writeLong(data.FilePointer - start);
            }
            else
            {
                // -1 is written instead of a bitset start offset when no null was seen.
                meta.writeLong(-1L);
            }

            if (uniqueValues != null)
            {
                // small number of unique values
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int bitsPerValue = util.packed.PackedInts.bitsRequired(uniqueValues.size()-1);
                int           bitsPerValue  = PackedInts.bitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
                if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    meta.writeByte(UNCOMPRESSED);     // uncompressed
                    foreach (Number nv in values)
                    {
                        data.writeByte(nv == null ? 0 : (long)(sbyte)nv);
                    }
                }
                else
                {
                    meta.writeByte(TABLE_COMPRESSED);     // table-compressed
                    long?[] decode = uniqueValues.toArray(new long?[uniqueValues.Count]);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final java.util.HashMap<Long,Integer> encode = new java.util.HashMap<>();
                    Dictionary <long?, int?> encode = new Dictionary <long?, int?>();
                    // Write the value table and remember each value's index in it.
                    data.writeVInt(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        data.writeLong(decode[i]);
                        encode[decode[i]] = i;
                    }

                    meta.writeVInt(PackedInts.VERSION_CURRENT);
                    data.writeVInt(formatAndBits.format.Id);
                    data.writeVInt(formatAndBits.bitsPerValue);

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final util.packed.PackedInts.Writer writer = util.packed.PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, util.packed.PackedInts.DEFAULT_BUFFER_SIZE);
                    PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
                    foreach (Number nv in values)
                    {
                        // Each value is stored as its index in the table; null entries look up 0.
                        writer.add(encode[nv == null ? 0 : (long)nv]);
                    }
                    writer.finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                meta.writeByte(GCD_COMPRESSED);
                meta.writeVInt(PackedInts.VERSION_CURRENT);
                data.writeLong(minValue);
                data.writeLong(gcd);
                data.writeVInt(BLOCK_SIZE);

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final util.packed.BlockPackedWriter writer = new util.packed.BlockPackedWriter(data, BLOCK_SIZE);
                BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (Number nv in values)
                {
                    long value = nv == null ? 0 : (long)nv;
                    // Store the quotient; minValue and gcd were written to the data output above.
                    writer.add((value - minValue) / gcd);
                }
                writer.finish();
            }
            else
            {
                meta.writeByte(DELTA_COMPRESSED);   // delta-compressed

                meta.writeVInt(PackedInts.VERSION_CURRENT);
                data.writeVInt(BLOCK_SIZE);

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final util.packed.BlockPackedWriter writer = new util.packed.BlockPackedWriter(data, BLOCK_SIZE);
                BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (Number nv in values)
                {
                    writer.add(nv == null ? 0 : (long)nv);
                }
                writer.finish();
            }
        }
        /// <summary>
        /// Sole constructor. Creates the vectors index output and the vectors data output in
        /// <paramref name="directory"/>, writes a codec header to each, and sets up the writers
        /// and scratch buffers. If any step fails, the index output is closed and
        /// <c>Abort()</c> is called before the exception propagates.
        /// </summary>
        /// <param name="directory">Directory in which both outputs are created; asserted non-null.</param>
        /// <param name="si">Segment info; only its <c>Name</c> is read here.</param>
        /// <param name="segmentSuffix">Suffix used when building both segment file names.</param>
        /// <param name="context">I/O context passed to both <c>CreateOutput</c> calls.</param>
        /// <param name="formatName">Prefix of the codec names written into the two headers.</param>
        /// <param name="compressionMode">Compression mode; also used to create the compressor.</param>
        /// <param name="chunkSize">Chunk size; stored, used to size the term-suffix buffer, and written to the vectors output.</param>
        public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context, string formatName, CompressionMode compressionMode, int chunkSize)
        {
            Debug.Assert(directory != null);
            this.Directory = directory;
            this.Segment = si.Name;
            this.SegmentSuffix = segmentSuffix;
            this.CompressionMode = compressionMode;
            this.Compressor = compressionMode.NewCompressor();
            this.ChunkSize = chunkSize;

            NumDocs = 0;
            PendingDocs = new LinkedList<DocData>();
            TermSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
            PayloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
            LastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

            // 'success' stays false until everything below has completed; the finally block
            // uses it to decide whether cleanup is required.
            bool success = false;
            IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_INDEX_EXTENSION), context);
            try
            {
                VectorsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_EXTENSION), context);

                string codecNameIdx = formatName + CODEC_SFX_IDX;
                string codecNameDat = formatName + CODEC_SFX_DAT;
                CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT);
                CodecUtil.WriteHeader(VectorsStream, codecNameDat, VERSION_CURRENT);
                // Both outputs must now be positioned exactly at the end of their header.
                Debug.Assert(CodecUtil.HeaderLength(codecNameDat) == VectorsStream.FilePointer);
                Debug.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.FilePointer);

                IndexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
                // The local is cleared once the stream has been passed to IndexWriter, so the
                // finally block receives null instead of the stream from here on
                // (presumably IndexWriter now owns it; confirm).
                indexStream = null;

                VectorsStream.WriteVInt(PackedInts.VERSION_CURRENT);
                VectorsStream.WriteVInt(chunkSize);
                Writer = new BlockPackedWriter(VectorsStream, BLOCK_SIZE);

                // Scratch buffers, each starting with 1024 entries.
                PositionsBuf = new int[1024];
                StartOffsetsBuf = new int[1024];
                LengthsBuf = new int[1024];
                PayloadLengthsBuf = new int[1024];

                success = true;
            }
            finally
            {
                if (!success)
                {
                    // Failure path: close the index output if it is still held locally, then
                    // call Abort() (presumably releases the remaining resources; confirm).
                    IOUtils.CloseWhileHandlingException(indexStream);
                    Abort();
                }
            }
        }