/// <summary>
/// Writes the term count of every field of every pending document to
/// <c>vectorsStream</c>: first the packing bit width as a VInt, then the
/// counts themselves through a header-less packed writer.
/// </summary>
/// <param name="totalFields">Number of values the packed writer is sized for; the
/// assertion at the end expects exactly this many field instances in <c>pendingDocs</c>.</param>
private void FlushNumTerms(int totalFields)
{
    // OR-ing all counts yields a value whose highest set bit matches that of the
    // largest non-negative count, which is all the bit-width computation needs.
    int combinedCounts = 0;
    foreach (DocData doc in pendingDocs)
    {
        foreach (FieldData field in doc.fields)
        {
            combinedCounts |= field.numTerms;
        }
    }

    int bitWidth = PackedInt32s.BitsRequired(combinedCounts);
    vectorsStream.WriteVInt32(bitWidth);

    PackedInt32s.Writer countWriter = PackedInt32s.GetWriterNoHeader(
        vectorsStream, PackedInt32s.Format.PACKED, totalFields, bitWidth, 1);
    foreach (DocData doc in pendingDocs)
    {
        foreach (FieldData field in doc.fields)
        {
            countWriter.Add(field.numTerms);
        }
    }

    // Every slot of the writer must have been filled before finishing.
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(countWriter.Ord == totalFields - 1);
    }
    countWriter.Finish();
}
/// <summary>
/// Writes, for every field instance in <c>pendingDocs</c>, the index of its field
/// number inside <paramref name="fieldNums"/> to <c>vectorsStream</c> using a
/// header-less packed writer.
/// </summary>
/// <param name="totalFields">Number of values the packed writer is sized for.</param>
/// <param name="fieldNums">Sorted field numbers (binary search is used on it); every
/// pending field's number is expected to be present.</param>
private void FlushFields(int totalFields, int[] fieldNums)
{
    // Indices range over [0, fieldNums.Length - 1], so that upper bound fixes the width.
    int bitsPerIndex = PackedInt32s.BitsRequired(fieldNums.Length - 1);
    PackedInt32s.Writer indexWriter = PackedInt32s.GetWriterNoHeader(
        vectorsStream, PackedInt32s.Format.PACKED, totalFields, bitsPerIndex, 1);

    foreach (DocData doc in pendingDocs)
    {
        foreach (FieldData field in doc.fields)
        {
            int slot = Array.BinarySearch(fieldNums, field.fieldNum);
            Debug.Assert(slot >= 0);
            indexWriter.Add(slot);
        }
    }

    indexWriter.Finish();
}
/// <summary>
/// Writes the first <paramref name="length"/> entries of <paramref name="values"/> to
/// <paramref name="out"/> using one of three layouts:
/// a single VInt when there is one value; a VInt 0 followed by the shared value when
/// all values are equal; otherwise the bit width as a VInt followed by the values in a
/// header-less packed block.
/// <para/>
/// NOTE: This was saveInts() in Lucene.
/// </summary>
/// <param name="values">Source array.</param>
/// <param name="length">Number of leading entries to write; asserted to be positive.</param>
/// <param name="out">Destination output.</param>
private static void SaveInt32s(int[] values, int length, DataOutput @out)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(length > 0);
    }

    // Layout 1: a lone value is written directly.
    if (length == 1)
    {
        @out.WriteVInt32(values[0]);
        return;
    }

    // Stop scanning at the first entry that differs from values[0].
    bool uniform = true;
    for (int idx = 1; uniform && idx < length; ++idx)
    {
        uniform = values[idx] == values[0];
    }

    // Layout 2: marker 0, then the single repeated value.
    if (uniform)
    {
        @out.WriteVInt32(0);
        @out.WriteVInt32(values[0]);
        return;
    }

    // Layout 3: packed block. Each value is OR-ed in as an unsigned 32-bit quantity,
    // so a negative entry contributes its 32-bit pattern rather than a sign-extended long.
    long combined = 0;
    for (int idx = 0; idx < length; ++idx)
    {
        combined |= (uint)values[idx];
    }

    int bitWidth = PackedInt32s.BitsRequired(combined);
    @out.WriteVInt32(bitWidth);

    PackedInt32s.Writer packed = PackedInt32s.GetWriterNoHeader(
        @out, PackedInt32s.Format.PACKED, length, bitWidth, 1);
    for (int idx = 0; idx < length; ++idx)
    {
        packed.Add(values[idx]);
    }
    packed.Finish();
}
/// <summary>
/// Collects the distinct field numbers of all pending documents, writes them to
/// <c>vectorsStream</c> (a token byte, an optional VInt with the remaining count, then
/// the numbers in a header-less packed block) and returns them.
/// </summary>
/// <returns>A sorted array containing unique field numbers.</returns>
private int[] FlushFieldNums()
{
    JCG.SortedSet<int> distinct = new JCG.SortedSet<int>();
    foreach (DocData doc in pendingDocs)
    {
        foreach (FieldData field in doc.fields)
        {
            distinct.Add(field.fieldNum);
        }
    }

    int distinctCount = distinct.Count;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(distinctCount > 0);
    }

    int bitWidth = PackedInt32s.BitsRequired(distinct.Max);

    // Token byte: min(count - 1, 7) shifted left by 5, OR-ed with the bit width.
    // When count - 1 does not fit below 7, the excess follows as a VInt.
    int countMinusOne = distinctCount - 1;
    int token = (Math.Min(countMinusOne, 0x07) << 5) | bitWidth;
    vectorsStream.WriteByte((byte)token);
    if (countMinusOne >= 0x07)
    {
        vectorsStream.WriteVInt32(countMinusOne - 0x07);
    }

    PackedInt32s.Writer numWriter = PackedInt32s.GetWriterNoHeader(
        vectorsStream, PackedInt32s.Format.PACKED, distinctCount, bitWidth, 1);

    // One pass over the set both writes each number and fills the result array.
    int[] result = new int[distinctCount];
    int next = 0;
    foreach (int fieldNum in distinct)
    {
        numWriter.Add(fieldNum);
        result[next++] = fieldNum;
    }
    numWriter.Finish();

    return result;
}
/// <summary>
/// Writes the buffered block of chunk entries to <c>fieldsIndexOut</c>: the chunk count,
/// then the doc-base section (block doc base, average docs per chunk, bit width, packed
/// deviations) and the start-pointer section (first start pointer, average chunk size,
/// bit width, packed deviations).
/// </summary>
private void WriteBlock()
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(blockChunks > 0);
    }
    fieldsIndexOut.WriteVInt32(blockChunks);

    // The trick here is that we only store the difference from the average start
    // pointer or doc base, this helps save bits per value.
    // And in order to prevent a few chunks that would be far from the average to
    // raise the number of bits per value for all of them, we only encode blocks
    // of 1024 chunks at once
    // See LUCENE-4512

    // ---- doc bases ----
    // The average is taken over all chunks but the last one; a single chunk has no average.
    int avgChunkDocs = blockChunks == 1
        ? 0
        : (int)Math.Round((float)(blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1));
    fieldsIndexOut.WriteVInt32(totalDocs - blockDocs); // docBase
    fieldsIndexOut.WriteVInt32(avgChunkDocs);

    // First pass: OR together the encoded deviations to find the bit width.
    // NOTE(review): MoveSignToLowOrderBit is defined elsewhere; presumably it folds the
    // sign into the lowest bit so small negative deviations stay small - confirm.
    int runningDocBase = 0;
    long combinedDeltas = 0;
    for (int chunk = 0; chunk < blockChunks; ++chunk)
    {
        int deviation = runningDocBase - avgChunkDocs * chunk;
        combinedDeltas |= MoveSignToLowOrderBit(deviation);
        runningDocBase += docBaseDeltas[chunk];
    }

    int bitsPerDocBase = PackedInt32s.BitsRequired(combinedDeltas);
    fieldsIndexOut.WriteVInt32(bitsPerDocBase);
    PackedInt32s.Writer packed = PackedInt32s.GetWriterNoHeader(
        fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerDocBase, 1);

    // Second pass: recompute the same deviations and write them.
    runningDocBase = 0;
    for (int chunk = 0; chunk < blockChunks; ++chunk)
    {
        long deviation = runningDocBase - avgChunkDocs * chunk;
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(deviation)) <= packed.BitsPerValue);
        }
        packed.Add(MoveSignToLowOrderBit(deviation));
        runningDocBase += docBaseDeltas[chunk];
    }
    packed.Finish();

    // ---- start pointers ----
    fieldsIndexOut.WriteVInt64(firstStartPointer);
    long avgChunkSize = blockChunks == 1
        ? 0
        : (maxStartPointer - firstStartPointer) / (blockChunks - 1);
    fieldsIndexOut.WriteVInt64(avgChunkSize);

    // Unlike the doc-base loops, the running pointer is advanced BEFORE the deviation
    // is taken.
    long runningPointer = 0;
    combinedDeltas = 0;
    for (int chunk = 0; chunk < blockChunks; ++chunk)
    {
        runningPointer += startPointerDeltas[chunk];
        long deviation = runningPointer - avgChunkSize * chunk;
        combinedDeltas |= MoveSignToLowOrderBit(deviation);
    }

    int bitsPerStartPointer = PackedInt32s.BitsRequired(combinedDeltas);
    fieldsIndexOut.WriteVInt32(bitsPerStartPointer);
    packed = PackedInt32s.GetWriterNoHeader(
        fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerStartPointer, 1);

    runningPointer = 0;
    for (int chunk = 0; chunk < blockChunks; ++chunk)
    {
        runningPointer += startPointerDeltas[chunk];
        long deviation = runningPointer - avgChunkSize * chunk;
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(deviation)) <= packed.BitsPerValue);
        }
        packed.Add(MoveSignToLowOrderBit(deviation));
    }
    packed.Finish();
}
/// <summary>
/// Writes one numeric doc-values field. The values are scanned once to gather statistics
/// and pick a storage format; the field's metadata entry is then written to <c>meta</c>
/// and the encoded values to <c>data</c>.
/// </summary>
/// <param name="field">Field whose <c>Number</c> is recorded in the metadata entry.</param>
/// <param name="values">Per-document values. A <c>null</c> entry is stored as 0 and marks
/// the field as having missing values. The sequence is enumerated more than once.</param>
/// <param name="optimizeStorage">When <c>false</c> the values are only counted and the
/// DELTA_COMPRESSED format is used.</param>
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    long valueCount = 0;
    long lowest = long.MaxValue;
    long highest = long.MinValue;
    long commonDivisor = 0;
    bool sawNull = false;
    // TODO: more efficient?
    JCG.HashSet<long> distinct = null;

    if (!optimizeStorage)
    {
        // No statistics wanted: just count the entries.
        foreach (long? ignored in values)
        {
            ++valueCount;
        }
    }
    else
    {
        distinct = new JCG.HashSet<long>();
        foreach (long? item in values)
        {
            // A null entry is treated as 0 and remembered so the missing bitset gets written.
            long current = item.GetValueOrDefault();
            if (!item.HasValue)
            {
                sawNull = true;
            }

            if (commonDivisor != 1)
            {
                bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
                if (extreme)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    commonDivisor = 1;
                }
                else if (valueCount != 0)
                {
                    // minValue needs to be set first
                    commonDivisor = MathUtil.Gcd(commonDivisor, current - lowest);
                }
            }

            lowest = Math.Min(lowest, current);
            highest = Math.Max(highest, current);

            // Give up tracking distinct values once there are more than 256 of them.
            if (distinct != null && distinct.Add(current) && distinct.Count > 256)
            {
                distinct = null;
            }

            ++valueCount;
        }
    }

    long range = highest - lowest;

    // Table compression is chosen when ordinals need fewer bits than raw deltas
    // (or the range overflowed) and the count fits in an int.
    bool useTable = distinct != null
        && (range < 0L || PackedInt32s.BitsRequired(distinct.Count - 1) < PackedInt32s.BitsRequired(range))
        && valueCount <= int.MaxValue;
    int format = useTable
        ? TABLE_COMPRESSED
        : (commonDivisor != 0 && commonDivisor != 1) ? GCD_COMPRESSED : DELTA_COMPRESSED;

    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
    meta.WriteVInt32(format);
    if (sawNull)
    {
        meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
        WriteMissingBitset(values);
    }
    else
    {
        meta.WriteInt64(-1L);
    }
    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
    meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    meta.WriteVInt64(valueCount);
    meta.WriteVInt32(BLOCK_SIZE);

    if (format == GCD_COMPRESSED)
    {
        // Store min and gcd in the metadata, then the quotients (value - min) / gcd.
        meta.WriteInt64(lowest);
        meta.WriteInt64(commonDivisor);
        BlockPackedWriter quotients = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? item in values)
        {
            quotients.Add((item.GetValueOrDefault() - lowest) / commonDivisor);
        }
        quotients.Finish();
    }
    else if (format == DELTA_COMPRESSED)
    {
        BlockPackedWriter raw = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? item in values)
        {
            raw.Add(item.GetValueOrDefault());
        }
        raw.Finish();
    }
    else if (format == TABLE_COMPRESSED)
    {
        // LUCENENET NOTE: diming an array and then using .CopyTo() for better efficiency than LINQ .ToArray()
        long[] table = new long[distinct.Count];
        distinct.CopyTo(table, 0);

        // The lookup table goes to the metadata; each value is then written as its table ordinal.
        Dictionary<long, int> ordinalOf = new Dictionary<long, int>();
        meta.WriteVInt32(table.Length);
        for (int ord = 0; ord < table.Length; ord++)
        {
            meta.WriteInt64(table[ord]);
            ordinalOf[table[ord]] = ord;
        }

        int bitsPerOrd = PackedInt32s.BitsRequired(distinct.Count - 1);
        PackedInt32s.Writer ordinals = PackedInt32s.GetWriterNoHeader(
            data, PackedInt32s.Format.PACKED, (int)valueCount, bitsPerOrd, PackedInt32s.DEFAULT_BUFFER_SIZE);
        foreach (long? item in values)
        {
            ordinals.Add(ordinalOf[item.GetValueOrDefault()]);
        }
        ordinals.Finish();
    }
    else
    {
        throw AssertionError.Create();
    }
}
/// <summary>
/// Writes one numeric doc-values field. A metadata entry (field number, type byte and
/// data offset) is written first; when <paramref name="optimizeStorage"/> is set, the
/// values are scanned to gather min, max, GCD and the set of distinct values, and the
/// values are then written uncompressed, table-compressed, GCD-compressed or
/// delta-compressed depending on those statistics.
/// </summary>
/// <param name="field">Field whose <c>Number</c> is recorded in the metadata entry.</param>
/// <param name="values">Per-document values. A <c>null</c> entry is stored as 0 and causes
/// the missing bitset to be written. The sequence is enumerated more than once.</param>
/// <param name="optimizeStorage">When <c>false</c> no statistics are gathered and the
/// delta-compressed layout is used.</param>
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte(MemoryDocValuesProducer.NUMBER);
    meta.WriteInt64(data.GetFilePointer());
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    // Elements are plain longs: nulls are mapped to 0 before they reach the set, so
    // nullable element types would only add unwrapping and casts.
    JCG.HashSet<long> uniqueValues = null;
    if (optimizeStorage)
    {
        uniqueValues = new JCG.HashSet<long>();

        long count = 0;
        foreach (var nv in values)
        {
            long v;
            if (nv == null)
            {
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }

            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                }
                else if (count != 0) // minValue needs to be set first
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }

            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);

            // Stop tracking distinct values once there are more than 256 of them.
            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        uniqueValues = null;
                    }
                }
            }

            ++count;
        }
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(count == maxDoc);
        }
    }

    if (missing)
    {
        // Record where the missing bitset starts in the data file and how long it is.
        long start = data.GetFilePointer();
        WriteMissingBitset(values);
        meta.WriteInt64(start);
        meta.WriteInt64(data.GetFilePointer() - start);
    }
    else
    {
        meta.WriteInt64(-1L);
    }

    if (uniqueValues != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
        FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
        if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
        {
            meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed
            foreach (var nv in values)
            {
                data.WriteByte((byte)nv.GetValueOrDefault());
            }
        }
        else
        {
            meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed
            long[] decode = new long[uniqueValues.Count];
            uniqueValues.CopyTo(decode, 0);

            // The lookup table is written to the data file; each value is then written
            // as its ordinal in that table.
            var encode = new Dictionary<long, int>();
            data.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                data.WriteInt64(decode[i]);
                encode[decode[i]] = i;
            }

            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(formatAndBits.Format.Id);
            data.WriteVInt32(formatAndBits.BitsPerValue);

            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(
                data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (var nv in values)
            {
                writer.Add(encode[nv.GetValueOrDefault()]);
            }
            writer.Finish();
        }
    }
    else if (gcd != 0 && gcd != 1)
    {
        // Store min and gcd, then the quotients (value - min) / gcd.
        meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(minValue);
        data.WriteInt64(gcd);
        data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

        var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
        foreach (var nv in values)
        {
            writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
        }
        writer.Finish();
    }
    else
    {
        meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed

        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

        var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
        foreach (var nv in values)
        {
            writer.Add(nv.GetValueOrDefault());
        }
        writer.Finish();
    }
}
/// <summary>
/// Writes the flags of the pending fields to <c>vectorsStream</c>. If every occurrence of
/// a given field number carries the same flags, a VInt 0 is written followed by one
/// flags value per entry of <paramref name="fieldNums"/>; otherwise a VInt 1 is written
/// followed by one flags value per field instance.
/// </summary>
/// <param name="totalFields">Number of field instances across <c>pendingDocs</c>; sizes the
/// packed writer in the per-instance layout.</param>
/// <param name="fieldNums">Sorted field numbers (binary search is used on it); every
/// pending field's number is expected to be present.</param>
private void FlushFlags(int totalFields, int[] fieldNums)
{
    // check if fields always have the same flags
    bool nonChangingFlags = true;
    int[] fieldFlags = new int[fieldNums.Length];
    Arrays.Fill(fieldFlags, -1); // -1 marks "no flags seen yet for this field number"
    foreach (DocData dd in pendingDocs)
    {
        foreach (FieldData fd in dd.fields)
        {
            int fieldNumOff = Array.BinarySearch(fieldNums, fd.fieldNum);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fieldNumOff >= 0);
            }
            if (fieldFlags[fieldNumOff] == -1)
            {
                fieldFlags[fieldNumOff] = fd.flags;
            }
            else if (fieldFlags[fieldNumOff] != fd.flags)
            {
                // First mismatch settles the answer: leave the inner loop immediately
                // instead of scanning the rest of this document's fields.
                nonChangingFlags = false;
                break;
            }
        }
        if (!nonChangingFlags)
        {
            // ...and leave the outer loop as well.
            break;
        }
    }

    if (nonChangingFlags)
    {
        // write one flag per field num
        vectorsStream.WriteVInt32(0);
        PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(
            vectorsStream, PackedInt32s.Format.PACKED, fieldFlags.Length, FLAGS_BITS, 1);
        foreach (int flags in fieldFlags)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(flags >= 0);
            }
            writer.Add(flags);
        }
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(writer.Ord == fieldFlags.Length - 1);
        }
        writer.Finish();
    }
    else
    {
        // write one flag for every field instance
        vectorsStream.WriteVInt32(1);
        PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(
            vectorsStream, PackedInt32s.Format.PACKED, totalFields, FLAGS_BITS, 1);
        foreach (DocData dd in pendingDocs)
        {
            foreach (FieldData fd in dd.fields)
            {
                writer.Add(fd.flags);
            }
        }
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(writer.Ord == totalFields - 1);
        }
        writer.Finish();
    }
}
/// <summary>
/// Writes one numeric doc-values field. A metadata entry (field number, type byte and
/// data offset) is written first; the values are then scanned to gather min, max, GCD
/// and the set of distinct values, and written uncompressed, table-compressed,
/// GCD-compressed or delta-compressed depending on those statistics.
/// </summary>
/// <param name="field">Field whose <c>Number</c> is recorded in the metadata entry.</param>
/// <param name="values">Per-document values; the scanning pass asserts that none is
/// <c>null</c>. The sequence is enumerated twice.</param>
public override void AddNumericField(FieldInfo field, IEnumerable<long?> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)NUMBER);
    meta.WriteInt64(data.GetFilePointer());

    long lowest = long.MaxValue;
    long highest = long.MinValue;
    long commonDivisor = 0;
    // TODO: more efficient?
    HashSet<long> distinct = new HashSet<long>();

    long seen = 0;
    foreach (long? item in values)
    {
        Debug.Assert(item != null);
        long current = item.Value;

        if (commonDivisor != 1)
        {
            bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
            if (extreme)
            {
                // in that case v - minValue might overflow and make the GCD computation return
                // wrong results. Since these extreme values are unlikely, we just discard
                // GCD computation for them
                commonDivisor = 1;
            }
            else if (seen != 0)
            {
                // minValue needs to be set first
                commonDivisor = MathUtil.Gcd(commonDivisor, current - lowest);
            }
        }

        lowest = Math.Min(lowest, current);
        highest = Math.Max(highest, current);

        // Stop tracking distinct values once there are more than 256 of them.
        if (distinct != null && distinct.Add(current) && distinct.Count > 256)
        {
            distinct = null;
        }

        ++seen;
    }
    Debug.Assert(seen == maxDoc);

    if (distinct != null)
    {
        // small number of unique values
        int bitsPerOrd = PackedInt32s.BitsRequired(distinct.Count - 1);
        FormatAndBits chosen = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerOrd, acceptableOverheadRatio);
        bool fitsInSignedByte = lowest >= sbyte.MinValue && highest <= sbyte.MaxValue;

        if (chosen.BitsPerValue == 8 && fitsInSignedByte)
        {
            meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
            foreach (long? item in values)
            {
                data.WriteByte((byte)item.GetValueOrDefault());
            }
        }
        else
        {
            meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed

            // The lookup table is written to the data file; each value is then written
            // as its ordinal in that table.
            long[] table = distinct.ToArray();
            Dictionary<long, int> ordinalOf = new Dictionary<long, int>();
            data.WriteVInt32(table.Length);
            for (int ord = 0; ord < table.Length; ord++)
            {
                data.WriteInt64(table[ord]);
                ordinalOf[table[ord]] = ord;
            }

            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(chosen.Format.Id);
            data.WriteVInt32(chosen.BitsPerValue);

            PackedInt32s.Writer ordinals = PackedInt32s.GetWriterNoHeader(
                data, chosen.Format, maxDoc, chosen.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? item in values)
            {
                ordinals.Add(ordinalOf[item.GetValueOrDefault()]);
            }
            ordinals.Finish();
        }
    }
    else if (commonDivisor != 0 && commonDivisor != 1)
    {
        // Store min and gcd, then the quotients (value - min) / gcd.
        meta.WriteByte((byte)GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(lowest);
        data.WriteInt64(commonDivisor);
        data.WriteVInt32(BLOCK_SIZE);

        BlockPackedWriter quotients = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? item in values)
        {
            quotients.Add((item.GetValueOrDefault() - lowest) / commonDivisor);
        }
        quotients.Finish();
    }
    else
    {
        meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(BLOCK_SIZE);

        BlockPackedWriter raw = new BlockPackedWriter(data, BLOCK_SIZE);
        foreach (long? item in values)
        {
            raw.Add(item.GetValueOrDefault());
        }
        raw.Finish();
    }
}
/// <summary>
/// Writes one numeric doc-values field. A metadata entry (field number, type byte and
/// data offset) is written first; when <paramref name="optimizeStorage"/> is set, the
/// values are scanned to gather min, max, GCD and the set of distinct values, and the
/// values are then written uncompressed, table-compressed, GCD-compressed or
/// delta-compressed depending on those statistics.
/// </summary>
/// <param name="field">Field whose <c>Number</c> is recorded in the metadata entry.</param>
/// <param name="values">Per-document values; a <c>null</c> entry is written as 0.
/// The sequence is enumerated more than once when <paramref name="optimizeStorage"/> is set.</param>
/// <param name="optimizeStorage">When <c>false</c> no statistics are gathered and the
/// delta-compressed layout is used.</param>
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    Meta.WriteVInt32(field.Number);
    Meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER);
    Meta.WriteInt64(Data.GetFilePointer());

    long lowest = long.MaxValue;
    long highest = long.MinValue;
    long commonDivisor = 0;
    // TODO: more efficient?
    HashSet<long> distinct = null;

    if (optimizeStorage)
    {
        distinct = new HashSet<long>();
        long seen = 0;
        foreach (long? item in values)
        {
            // TODO: support this as MemoryDVFormat (and be smart about missing maybe)
            long current = item.GetValueOrDefault();

            if (commonDivisor != 1)
            {
                bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
                if (extreme)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    commonDivisor = 1;
                }
                else if (seen != 0)
                {
                    // minValue needs to be set first
                    commonDivisor = MathUtil.Gcd(commonDivisor, current - lowest);
                }
            }

            lowest = Math.Min(lowest, current);
            highest = Math.Max(highest, current);

            // Stop tracking distinct values once there are more than 256 of them.
            if (distinct != null && distinct.Add(current) && distinct.Count > 256)
            {
                distinct = null;
            }

            ++seen;
        }
        Debug.Assert(seen == MaxDoc);
    }

    if (distinct != null)
    {
        // small number of unique values
        int bitsPerOrd = PackedInt32s.BitsRequired(distinct.Count - 1);
        FormatAndBits chosen = PackedInt32s.FastestFormatAndBits(MaxDoc, bitsPerOrd, AcceptableOverheadRatio);
        bool fitsInSignedByte = lowest >= sbyte.MinValue && highest <= sbyte.MaxValue;

        if (chosen.BitsPerValue == 8 && fitsInSignedByte)
        {
            Meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed
            foreach (long? item in values)
            {
                Data.WriteByte((byte)item.GetValueOrDefault());
            }
        }
        else
        {
            Meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed

            // The lookup table is written to the data file; each value is then written
            // as its ordinal in that table.
            long[] table = distinct.ToArray();
            Dictionary<long, int> ordinalOf = new Dictionary<long, int>();
            Data.WriteVInt32(table.Length);
            for (int ord = 0; ord < table.Length; ord++)
            {
                Data.WriteInt64(table[ord]);
                ordinalOf[table[ord]] = ord;
            }

            Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            Data.WriteVInt32(chosen.Format.Id);
            Data.WriteVInt32(chosen.BitsPerValue);

            PackedInt32s.Writer ordinals = PackedInt32s.GetWriterNoHeader(
                Data, chosen.Format, MaxDoc, chosen.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? item in values)
            {
                ordinals.Add(ordinalOf[item.GetValueOrDefault()]);
            }
            ordinals.Finish();
        }
    }
    else if (commonDivisor != 0 && commonDivisor != 1)
    {
        // Store min and gcd, then the quotients (value - min) / gcd.
        Meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED);
        Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        Data.WriteInt64(lowest);
        Data.WriteInt64(commonDivisor);
        Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

        BlockPackedWriter quotients = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? item in values)
        {
            quotients.Add((item.GetValueOrDefault() - lowest) / commonDivisor);
        }
        quotients.Finish();
    }
    else
    {
        Meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed
        Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

        BlockPackedWriter raw = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? item in values)
        {
            raw.Add(item.GetValueOrDefault());
        }
        raw.Finish();
    }
}