/// <summary>
/// Sole constructor.
/// </summary>
public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context,
    string formatName, CompressionMode compressionMode, int chunkSize)
{
    if (Debugging.AssertsEnabled) Debugging.Assert(directory != null);
    this.directory = directory;
    this.segment = si.Name;
    this.segmentSuffix = segmentSuffix;
    this.compressionMode = compressionMode;
    this.compressor = compressionMode.NewCompressor();
    this.chunkSize = chunkSize;

    numDocs = 0;
    pendingDocs = new LinkedList<DocData>();
    termSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
    payloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
    lastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

    bool success = false;
    IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION), context);
    try
    {
        vectorsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_EXTENSION), context);

        string codecNameIdx = formatName + CODEC_SFX_IDX;
        string codecNameDat = formatName + CODEC_SFX_DAT;
        CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT);
        CodecUtil.WriteHeader(vectorsStream, codecNameDat, VERSION_CURRENT);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(CodecUtil.HeaderLength(codecNameDat) == vectorsStream.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            Debugging.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
        }

        indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
        indexStream = null;

        vectorsStream.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        vectorsStream.WriteVInt32(chunkSize);
        writer = new BlockPackedWriter(vectorsStream, BLOCK_SIZE);

        positionsBuf = new int[1024];
        startOffsetsBuf = new int[1024];
        lengthsBuf = new int[1024];
        payloadLengthsBuf = new int[1024];

        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.DisposeWhileHandlingException(indexStream);
            Abort();
        }
    }
}
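// ---------------------------------------------------------------------------
// A hedged, standalone sketch of the cleanup-on-failure idiom the constructor
// above relies on: set a success flag as the very last step of the try block,
// and dispose any partially-created outputs in finally without masking the
// exception already in flight. Plain FileStream stands in for Lucene's
// IndexOutput; the paths and the OpenPair name are illustrative only, not
// Lucene.NET API.
using System;
using System.IO;

internal static class CleanupOnFailureSketch
{
    internal static (FileStream Index, FileStream Data) OpenPair(string idxPath, string datPath)
    {
        bool success = false;
        FileStream idx = new FileStream(idxPath, FileMode.CreateNew);
        FileStream dat = null;
        try
        {
            dat = new FileStream(datPath, FileMode.CreateNew);
            // ... write headers here; any exception leaves success == false ...
            success = true;
            return (idx, dat);
        }
        finally
        {
            if (!success)
            {
                // dispose in a way that cannot hide the original exception
                try { idx.Dispose(); } catch { /* swallow secondary failures */ }
                try { dat?.Dispose(); } catch { /* swallow secondary failures */ }
            }
        }
    }
}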
/// <summary>
/// Sole constructor.
/// </summary>
public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context,
    string formatName, CompressionMode compressionMode, int chunkSize)
{
    Debug.Assert(directory != null);
    this.Directory = directory;
    this.Segment = si.Name;
    this.SegmentSuffix = segmentSuffix;
    this.CompressionMode = compressionMode;
    this.Compressor = compressionMode.NewCompressor();
    this.ChunkSize = chunkSize;

    NumDocs = 0;
    PendingDocs = new LinkedList<DocData>();
    TermSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
    PayloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
    LastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

    bool success = false;
    IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_INDEX_EXTENSION), context);
    try
    {
        VectorsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_EXTENSION), context);

        string codecNameIdx = formatName + CODEC_SFX_IDX;
        string codecNameDat = formatName + CODEC_SFX_DAT;
        CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT);
        CodecUtil.WriteHeader(VectorsStream, codecNameDat, VERSION_CURRENT);
        Debug.Assert(CodecUtil.HeaderLength(codecNameDat) == VectorsStream.FilePointer);
        Debug.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.FilePointer);

        IndexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
        indexStream = null;

        VectorsStream.WriteVInt(PackedInts.VERSION_CURRENT);
        VectorsStream.WriteVInt(chunkSize);
        Writer = new BlockPackedWriter(VectorsStream, BLOCK_SIZE);

        PositionsBuf = new int[1024];
        StartOffsetsBuf = new int[1024];
        LengthsBuf = new int[1024];
        PayloadLengthsBuf = new int[1024];

        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(indexStream);
            Abort();
        }
    }
}
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    long count = 0;
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    HashSet<long> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new HashSet<long>();
        foreach (long? nv in values)
        {
            long v;
            if (nv == null)
            {
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
    }
    else
    {
        foreach (var nv in values)
        {
            ++count;
        }
    }

    long delta = maxValue - minValue;

    int format;
    if (uniqueValues != null && (delta < 0L || PackedInts.BitsRequired(uniqueValues.Count - 1) < PackedInts.BitsRequired(delta)) && count <= int.MaxValue)
    {
        format = TABLE_COMPRESSED;
    }
    else if (gcd != 0 && gcd != 1)
    {
        format = GCD_COMPRESSED;
    }
    else
    {
        format = DELTA_COMPRESSED;
    }
    Meta.WriteVInt(field.Number);
    Meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
    Meta.WriteVInt(format);
    if (missing)
    {
        Meta.WriteLong(Data.FilePointer);
        WriteMissingBitset(values);
    }
    else
    {
        Meta.WriteLong(-1L);
    }
    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
    Meta.WriteLong(Data.FilePointer);
    Meta.WriteVLong(count);
    Meta.WriteVInt(BLOCK_SIZE);

    switch (format)
    {
        case GCD_COMPRESSED:
            Meta.WriteLong(minValue);
            Meta.WriteLong(gcd);
            BlockPackedWriter quotientWriter = new BlockPackedWriter(Data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                long value = nv == null ? 0 : nv.Value;
                quotientWriter.Add((value - minValue) / gcd);
            }
            quotientWriter.Finish();
            break;

        case DELTA_COMPRESSED:
            BlockPackedWriter writer = new BlockPackedWriter(Data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(nv == null ? 0 : nv.Value);
            }
            writer.Finish();
            break;

        case TABLE_COMPRESSED:
            long[] decode = uniqueValues.ToArray(); // LUCENE TO-DO: ToArray had a parameter to start
            Dictionary<long, int> encode = new Dictionary<long, int>();
            Meta.WriteVInt(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                Meta.WriteLong(decode[i]);
                encode[decode[i]] = i;
            }
            int bitsRequired = PackedInts.BitsRequired(uniqueValues.Count - 1);
            PackedInts.Writer ordsWriter = PackedInts.GetWriterNoHeader(Data, PackedInts.Format.PACKED, (int)count, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                ordsWriter.Add(encode[nv == null ? 0 : nv.Value]);
            }
            ordsWriter.Finish();
            break;

        default:
            throw new InvalidOperationException();
    }
}
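// ---------------------------------------------------------------------------
// A minimal sketch of the format-selection heuristic used by AddNumericField
// above, with the Lucene types stripped out. The names here (ChooseFormat,
// Gcd, BitsRequired) are illustrative, not Lucene.NET API, and the input is
// assumed non-empty and non-nullable for brevity. The heuristic picks table
// compression when a small dictionary of ordinals is cheaper than storing
// deltas, GCD compression when all values share a common stride, and falls
// back to block-packed deltas otherwise.
using System;
using System.Collections.Generic;
using System.Linq;

internal static class NumericFormatSketch
{
    internal static string ChooseFormat(IReadOnlyList<long> values)
    {
        long min = values.Min(), max = values.Max();
        long gcd = 0;
        foreach (long v in values)
        {
            if (v < long.MinValue / 2 || v > long.MaxValue / 2)
            {
                gcd = 1; // v - min could overflow, so give up on GCD for extreme values
                break;
            }
            gcd = Gcd(gcd, v - min); // Gcd(g, 0) == g, so v == min is harmless
        }

        var unique = new HashSet<long>(values);
        long delta = max - min; // may overflow to a negative value, handled below
        if (unique.Count <= 256 && (delta < 0 || BitsRequired(unique.Count - 1) < BitsRequired(delta)))
            return "TABLE_COMPRESSED"; // dictionary of <= 256 values, store small ordinals
        if (gcd != 0 && gcd != 1)
            return "GCD_COMPRESSED";   // store (v - min) / gcd quotients
        return "DELTA_COMPRESSED";     // store block-packed deltas from min
    }

    private static long Gcd(long a, long b)
    {
        while (b != 0) { (a, b) = (b, a % b); }
        return Math.Abs(a);
    }

    private static int BitsRequired(long maxValue) =>
        maxValue <= 0 ? 1 : 64 - System.Numerics.BitOperations.LeadingZeroCount((ulong)maxValue);
}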
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    Meta.WriteVInt(field.Number);
    Meta.WriteByte(Lucene42DocValuesProducer.NUMBER);
    Meta.WriteLong(Data.FilePointer);
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    // TODO: more efficient?
    HashSet<long> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new HashSet<long>();
        long count = 0;
        foreach (long? nv in values)
        {
            // TODO: support this as MemoryDVFormat (and be smart about missing maybe)
            long v = nv ?? 0;

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        Debug.Assert(count == MaxDoc);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
        if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            Meta.WriteByte(Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed
            foreach (long? nv in values)
            {
                Data.WriteByte(nv == null ? (byte)0 : (byte)nv.Value);
            }
        }
        else
        {
            Meta.WriteByte(Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed
            long[] decode = uniqueValues.ToArray(/*new long?[uniqueValues.Count]*/);
            Dictionary<long, int> encode = new Dictionary<long, int>();
            Data.WriteVInt(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                Data.WriteLong(decode[i]);
                encode[decode[i]] = i;
            }
            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Data.WriteVInt(formatAndBits.format.id);
            Data.WriteVInt(formatAndBits.bitsPerValue);
            PackedInts.Writer writer = PackedInts.GetWriterNoHeader(Data, formatAndBits.format, MaxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(encode[nv ?? 0]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        Meta.WriteByte(Lucene42DocValuesProducer.GCD_COMPRESSED);
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteLong(minValue);
        Data.WriteLong(gcd);
        Data.WriteVInt(Lucene42DocValuesProducer.BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? nv in values)
        {
            long value = nv ?? 0;
            writer.Add((value - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        Meta.WriteByte(Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteVInt(Lucene42DocValuesProducer.BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add(nv ?? 0);
        }
        writer.Finish();
    }
}
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    long count = 0;
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    JCG.HashSet<long> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new JCG.HashSet<long>();
        foreach (long? nv in values)
        {
            long v;
            if (nv == null)
            {
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
    }
    else
    {
        foreach (var nv in values)
        {
            ++count;
        }
    }

    long delta = maxValue - minValue;

    int format;
    if (uniqueValues != null && (delta < 0L || PackedInt32s.BitsRequired(uniqueValues.Count - 1) < PackedInt32s.BitsRequired(delta)) && count <= int.MaxValue)
    {
        format = TABLE_COMPRESSED;
    }
    else if (gcd != 0 && gcd != 1)
    {
        format = GCD_COMPRESSED;
    }
    else
    {
        format = DELTA_COMPRESSED;
    }
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
    meta.WriteVInt32(format);
    if (missing)
    {
        meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
        WriteMissingBitset(values);
    }
    else
    {
        meta.WriteInt64(-1L);
    }
    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
    meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    meta.WriteVInt64(count);
    meta.WriteVInt32(BLOCK_SIZE);

    switch (format)
    {
        case GCD_COMPRESSED:
            meta.WriteInt64(minValue);
            meta.WriteInt64(gcd);
            BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                quotientWriter.Add((nv.GetValueOrDefault() - minValue) / gcd);
            }
            quotientWriter.Finish();
            break;

        case DELTA_COMPRESSED:
            BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(nv.GetValueOrDefault());
            }
            writer.Finish();
            break;

        case TABLE_COMPRESSED:
            // LUCENENET NOTE: dimensioning an array and then using .CopyTo() for better efficiency than LINQ .ToArray()
            long[] decode = new long[uniqueValues.Count];
            uniqueValues.CopyTo(decode, 0);
            Dictionary<long, int> encode = new Dictionary<long, int>();
            meta.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                meta.WriteInt64(decode[i]);
                encode[decode[i]] = i;
            }
            int bitsRequired = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
            PackedInt32s.Writer ordsWriter = PackedInt32s.GetWriterNoHeader(data, PackedInt32s.Format.PACKED, (int)count, bitsRequired, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                ordsWriter.Add(encode[nv.GetValueOrDefault()]);
            }
            ordsWriter.Finish();
            break;

        default:
            throw AssertionError.Create();
    }
}
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte(MemoryDocValuesProducer.NUMBER);
    meta.WriteInt64(data.GetFilePointer());
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    ISet<long?> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new JCG.HashSet<long?>();
        long count = 0;
        foreach (var nv in values)
        {
            long v;
            if (nv == null)
            {
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); }
    }

    if (missing)
    {
        long start = data.GetFilePointer();
        WriteMissingBitset(values);
        meta.WriteInt64(start);
        meta.WriteInt64(data.GetFilePointer() - start);
    }
    else
    {
        meta.WriteInt64(-1L);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
        if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed
            foreach (var nv in values)
            {
                data.WriteByte((byte)nv.GetValueOrDefault());
            }
        }
        else
        {
            meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed
            long?[] decode = new long?[uniqueValues.Count];
            uniqueValues.CopyTo(decode, 0);
            var encode = new Dictionary<long?, int?>();
            data.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                data.WriteInt64(decode[i].Value);
                encode[decode[i]] = i;
            }
            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(formatAndBits.Format.Id);
            data.WriteVInt32(formatAndBits.BitsPerValue);
            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (var nv in values)
            {
                var v = encode[nv.GetValueOrDefault()];
                writer.Add((long)v);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(minValue);
        data.WriteInt64(gcd);
        data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);
        var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
        foreach (var nv in values)
        {
            writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);
        var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
        foreach (var nv in values)
        {
            writer.Add(nv.GetValueOrDefault());
        }
        writer.Finish();
    }
}
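// ---------------------------------------------------------------------------
// The `missing` branch above brackets a bitset of which documents carry a
// value between its start file pointer and byte length. A generic sketch of
// packing one presence bit per document into 64-bit words; this is plain C#
// for illustration, not Lucene's WriteMissingBitset, and the type name is
// hypothetical.
using System.Collections.Generic;

internal static class MissingBitsetSketch
{
    internal static long[] Build(IReadOnlyList<long?> values)
    {
        long[] words = new long[(values.Count + 63) / 64];
        for (int doc = 0; doc < values.Count; doc++)
        {
            if (values[doc] != null)
            {
                words[doc >> 6] |= 1L << (doc & 63); // set bit `doc` in word `doc / 64`
            }
        }
        return words;
    }
}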
public override void AddNumericField(FieldInfo field, IEnumerable<long?> values)
{
    Meta.WriteVInt(field.Number);
    Meta.WriteByte((byte)NUMBER);
    Meta.WriteLong(Data.FilePointer);
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    // TODO: more efficient?
    HashSet<long> uniqueValues = null;
    if (true)
    {
        uniqueValues = new HashSet<long>();
        long count = 0;
        foreach (long? nv in values)
        {
            Debug.Assert(nv != null);
            long v = nv.Value;

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        Debug.Assert(count == MaxDoc);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
        if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            Meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
            foreach (long? nv in values)
            {
                Data.WriteByte(nv == null ? (byte)0 : (byte)(sbyte)nv.Value);
            }
        }
        else
        {
            Meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
            var decode = uniqueValues.ToArray(); // LUCENE TO-DO: ToArray had a parameter to start
            var encode = new Dictionary<long, int>();
            Data.WriteVInt(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                Data.WriteLong(decode[i]);
                encode[decode[i]] = i;
            }
            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Data.WriteVInt(formatAndBits.format.id);
            Data.WriteVInt(formatAndBits.bitsPerValue);
            PackedInts.Writer writer = PackedInts.GetWriterNoHeader(Data, formatAndBits.format, MaxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(encode[nv == null ? 0 : nv.Value]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        Meta.WriteByte((byte)GCD_COMPRESSED);
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteLong(minValue);
        Data.WriteLong(gcd);
        Data.WriteVInt(BLOCK_SIZE);
        var writer = new BlockPackedWriter(Data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            long value = nv == null ? 0 : nv.Value;
            writer.Add((value - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        Meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteVInt(BLOCK_SIZE);
        var writer = new BlockPackedWriter(Data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add(nv == null ? 0 : nv.Value);
        }
        writer.Finish();
    }
}
public override void AddNumericField(FieldInfo field, IEnumerable<long> values)
{
    Meta.WriteVInt(field.Number);
    Meta.WriteByte(NUMBER);
    Meta.WriteLong(Data.FilePointer);
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    // TODO: more efficient?
    HashSet<long> uniqueValues = null;
    if (true)
    {
        uniqueValues = new HashSet<long>();
        long count = 0;
        foreach (long v in values) // values is non-nullable here, so no null handling is needed
        {
            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        Debug.Assert(count == MaxDoc);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
        if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            Meta.WriteByte(UNCOMPRESSED); // uncompressed
            foreach (long nv in values)
            {
                Data.WriteByte((byte)(sbyte)nv); // values fit in a signed byte; reinterpret as byte for the stream
            }
        }
        else
        {
            Meta.WriteByte(TABLE_COMPRESSED); // table-compressed
            long[] decode = uniqueValues.ToArray(); // LUCENE TO-DO: ToArray had a parameter to start
            Dictionary<long, int> encode = new Dictionary<long, int>();
            Data.WriteVInt(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                Data.WriteLong(decode[i]);
                encode[decode[i]] = i;
            }
            Meta.WriteVInt(PackedInts.VERSION_CURRENT);
            Data.WriteVInt(formatAndBits.format.id);
            Data.WriteVInt(formatAndBits.bitsPerValue);
            PackedInts.Writer writer = PackedInts.GetWriterNoHeader(Data, formatAndBits.format, MaxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
            foreach (long nv in values)
            {
                writer.Add(encode[nv]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        Meta.WriteByte(GCD_COMPRESSED);
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteLong(minValue);
        Data.WriteLong(gcd);
        Data.WriteVInt(BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(Data, BLOCK_SIZE);
        foreach (long nv in values)
        {
            writer.Add((nv - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        Meta.WriteByte(DELTA_COMPRESSED); // delta-compressed
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Data.WriteVInt(BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(Data, BLOCK_SIZE);
        foreach (long nv in values)
        {
            writer.Add(nv);
        }
        writer.Finish();
    }
}
public override void AddNumericField(FieldInfo field, IEnumerable<long?> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)NUMBER);
    meta.WriteInt64(data.GetFilePointer());
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    // TODO: more efficient?
    JCG.HashSet<long> uniqueValues /* = null*/; // LUCENENET: IDE0059: Remove unnecessary value assignment
    if (true)
    {
        uniqueValues = new JCG.HashSet<long>();
        long count = 0;
        foreach (long? nv in values)
        {
            if (Debugging.AssertsEnabled) { Debugging.Assert(nv != null); }
            long v = nv.Value;

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); }
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
        if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
            foreach (long? nv in values)
            {
                data.WriteByte((byte)nv.GetValueOrDefault());
            }
        }
        else
        {
            meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
            var decode = new long[uniqueValues.Count];
            uniqueValues.CopyTo(decode);
            var encode = new Dictionary<long, int>();
            data.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                data.WriteInt64(decode[i]);
                encode[decode[i]] = i;
            }
            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(formatAndBits.Format.Id);
            data.WriteVInt32(formatAndBits.BitsPerValue);
            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(encode[nv.GetValueOrDefault()]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        meta.WriteByte((byte)GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(minValue);
        data.WriteInt64(gcd);
        data.WriteVInt32(BLOCK_SIZE);
        var writer = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(BLOCK_SIZE);
        var writer = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add(nv.GetValueOrDefault());
        }
        writer.Finish();
    }
}
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)NUMBER);
    meta.WriteInt64(data.GetFilePointer());
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    HashSet<long?> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new HashSet<long?>();
        long count = 0;
        foreach (long? nv in values)
        {
            long v;
            if (nv == null)
            {
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        Debug.Assert(count == maxDoc);
    }

    if (missing)
    {
        long start = data.GetFilePointer();
        WriteMissingBitset(values);
        meta.WriteInt64(start);
        meta.WriteInt64(data.GetFilePointer() - start);
    }
    else
    {
        meta.WriteInt64(-1L);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
        if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
            foreach (long? nv in values)
            {
                data.WriteByte(nv == null ? (byte)0 : (byte)(sbyte)nv.Value);
            }
        }
        else
        {
            meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
            long?[] decode = new long?[uniqueValues.Count];
            uniqueValues.CopyTo(decode, 0);
            Dictionary<long?, int?> encode = new Dictionary<long?, int?>();
            data.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                data.WriteInt64(decode[i].Value);
                encode[decode[i]] = i;
            }
            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(formatAndBits.Format.Id);
            data.WriteVInt32(formatAndBits.BitsPerValue);
            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                writer.Add((long)encode[nv.GetValueOrDefault()]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        meta.WriteByte((byte)GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(minValue);
        data.WriteInt64(gcd);
        data.WriteVInt32(BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(BLOCK_SIZE);
        BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? nv in values)
        {
            writer.Add(nv.GetValueOrDefault());
        }
        writer.Finish();
    }
}