public override void Fill(int fromIndex, int toIndex, long val) { if (Debugging.AssertsEnabled) { Debugging.Assert(fromIndex >= 0); Debugging.Assert(fromIndex <= toIndex); Debugging.Assert(PackedInt32s.BitsRequired(val) <= m_bitsPerValue); } int valuesPerBlock = 64 / m_bitsPerValue; if (toIndex - fromIndex <= valuesPerBlock << 1) { // there needs to be at least one full block to set for the block // approach to be worth trying base.Fill(fromIndex, toIndex, val); return; } // set values naively until the next block start int fromOffsetInBlock = fromIndex % valuesPerBlock; if (fromOffsetInBlock != 0) { for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i) { Set(fromIndex++, val); } if (Debugging.AssertsEnabled) { Debugging.Assert(fromIndex % valuesPerBlock == 0); } } // bulk set of the inner blocks int fromBlock = fromIndex / valuesPerBlock; int toBlock = toIndex / valuesPerBlock; if (Debugging.AssertsEnabled) { Debugging.Assert(fromBlock * valuesPerBlock == fromIndex); } long blockValue = 0L; for (int i = 0; i < valuesPerBlock; ++i) { blockValue |= (val << (i * m_bitsPerValue)); } Arrays.Fill(blocks, fromBlock, toBlock, blockValue); // fill the gap for (int i = valuesPerBlock * toBlock; i < toIndex; ++i) { Set(i, val); } }
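// A minimal usage sketch, not taken from the source above: Fill writes one value into a whole index
// range of a PackedInt32s.Mutable, and implementations like the one above switch to storing prebuilt
// 64-bit blocks once the range spans more than a couple of blocks. The sizes and values are illustrative.
PackedInt32s.Mutable packed = PackedInt32s.GetMutable(1000, 21, PackedInt32s.COMPACT);
packed.Fill(5, 900, 123456L);   // indices [5, 900) now hold 123456
long sample = packed.Get(42);   // 123456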
internal void Reset(int len) { int bitsPerOffset = PackedInt32s.BitsRequired(len - LAST_LITERALS); int bitsPerOffsetLog = 32 - (bitsPerOffset - 1).LeadingZeroCount(); hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog; if (hashTable is null || hashTable.Count < 1 << hashLog || hashTable.BitsPerValue < bitsPerOffset) { hashTable = PackedInt32s.GetMutable(1 << hashLog, bitsPerOffset, PackedInt32s.DEFAULT); } else { hashTable.Clear(); /* reuse the existing table; the clear-on-reuse branch follows the upstream Lucene implementation */ } }
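// Worked sizing example for the Reset above (the concrete MEMORY_USAGE value of 14 is an assumption
// based on the upstream Lucene constant): for len = 70000, bitsPerOffset = BitsRequired(len - LAST_LITERALS) = 17,
// bitsPerOffsetLog = 32 - LeadingZeroCount(16) = 5, so hashLog = 14 + 3 - 5 = 12 and the table holds
// 1 << 12 = 4096 offsets of 17 bits each.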
public BinaryDocValuesFieldUpdates(string field, int maxDoc) : base(field, DocValuesFieldUpdatesType.BINARY) { docsWithField = new FixedBitSet(64); docs = new PagedMutable(1, 1024, PackedInt32s.BitsRequired(maxDoc - 1), PackedInt32s.COMPACT); offsets = new PagedGrowableWriter(1, 1024, 1, PackedInt32s.FAST); lengths = new PagedGrowableWriter(1, 1024, 1, PackedInt32s.FAST); values = new BytesRef(16); // start small size = 0; }
/// <summary> /// Compress <c>bytes[off:off+len]</c> into <paramref name="out"/> using /// at most 16KB of memory. <paramref name="ht"/> shouldn't be shared across threads /// but can safely be reused. /// </summary> public static void Compress(byte[] bytes, int off, int len, DataOutput @out, HashTable ht) { int @base = off; int end = off + len; int anchor = off++; if (len > LAST_LITERALS + MIN_MATCH) { int limit = end - LAST_LITERALS; int matchLimit = limit - MIN_MATCH; ht.Reset(len); int hashLog = ht.hashLog; PackedInt32s.Mutable hashTable = ht.hashTable; while (off <= limit) { // find a match int @ref; while (true) { if (off >= matchLimit) { goto mainBreak; } int v = ReadInt32(bytes, off); int h = Hash(v, hashLog); @ref = @base + (int)hashTable.Get(h); Debug.Assert(PackedInt32s.BitsRequired(off - @base) <= hashTable.BitsPerValue); hashTable.Set(h, off - @base); if (off - @ref < MAX_DISTANCE && ReadInt32(bytes, @ref) == v) { break; } ++off; } // compute match length int matchLen = MIN_MATCH + CommonBytes(bytes, @ref + MIN_MATCH, off + MIN_MATCH, limit); EncodeSequence(bytes, anchor, @ref, off, matchLen, @out); off += matchLen; anchor = off; //mainContinue: ; // LUCENENET NOTE: Not Referenced } mainBreak :; } // last literals int literalLen = end - anchor; Debug.Assert(literalLen >= LAST_LITERALS || literalLen == len); EncodeLastLiterals(bytes, anchor, end - anchor, @out); }
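// Hypothetical caller sketch for the method above; the buffer helper, the DataOutput variable, and the
// HashTable constructor visibility are assumptions rather than facts from this source. The point is the
// contract in the doc comment: keep one HashTable per thread and reuse it across calls.
byte[] buffer = GetNextChunk();            // hypothetical helper producing the bytes to compress
HashTable ht = new HashTable();            // never shared between threads, but reused for every chunk
Compress(buffer, 0, buffer.Length, compressedOut, ht);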
/// <summary> /// Compute the number of bits required to serialize any of the values in /// <paramref name="data"/>. /// </summary> private static int BitsRequired(int[] data) { long or = 0; for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; ++i) { Debug.Assert(data[i] >= 0); or |= (uint)data[i]; } return PackedInt32s.BitsRequired(or); }
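// Hedged illustration of the OR trick above: OR-ing all the (non-negative) values yields a word whose
// highest set bit dominates every input, so a single BitsRequired call on that word gives the widest
// width any value needs. The sample values are made up.
long or = 0;
foreach (int v in new[] { 3, 17, 100 }) { or |= (uint)v; }
int bits = PackedInt32s.BitsRequired(or); // 7, because 100 needs 7 bits and nothing needs more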
private void AddVarSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrd) { field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_SORTED.ToString()); CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT); CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT); /* values */ long startPos = data.GetFilePointer(); int valueCount = 0; foreach (BytesRef v in values) { data.WriteBytes(v.Bytes, v.Offset, v.Length); valueCount++; } /* addresses */ long maxAddress = data.GetFilePointer() - startPos; index.WriteInt64(maxAddress); Debug.Assert(valueCount != int.MaxValue); // unsupported by the 4.0 impl PackedInt32s.Writer w = PackedInt32s.GetWriter(index, valueCount + 1, PackedInt32s.BitsRequired(maxAddress), PackedInt32s.DEFAULT); long currentPosition = 0; foreach (BytesRef v in values) { w.Add(currentPosition); currentPosition += v.Length; } // write sentinel Debug.Assert(currentPosition == maxAddress); w.Add(currentPosition); w.Finish(); /* ordinals */ int maxDoc = state.SegmentInfo.DocCount; Debug.Assert(valueCount > 0); PackedInt32s.Writer ords = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(valueCount - 1), PackedInt32s.DEFAULT); foreach (long n in docToOrd) { ords.Add((long)n); } ords.Finish(); }
public override void AddNumericField(FieldInfo field, IEnumerable <long?> values) { // examine the values to determine best type to use long minValue = long.MaxValue; long maxValue = long.MinValue; foreach (long?n in values) { long v = n.GetValueOrDefault(); minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); } string fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat"); IndexOutput data = dir.CreateOutput(fileName, state.Context); bool success = false; try { if (minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 4) { // fits in a byte[], would be more than 4bpv, just write byte[] AddBytesField(field, data, values); } else if (minValue >= short.MinValue && maxValue <= short.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 8) { // fits in a short[], would be more than 8bpv, just write short[] AddShortsField(field, data, values); } else if (minValue >= int.MinValue && maxValue <= int.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 16) { // fits in a int[], would be more than 16bpv, just write int[] AddIntsField(field, data, values); } else { AddVarIntsField(field, data, values, minValue, maxValue); } success = true; } finally { if (success) { IOUtils.Dispose(data); } else { IOUtils.DisposeWhileHandlingException(data); } } }
// NOTE: 4.0 file format docs are crazy/wrong here... private void AddVarStraightBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values) { field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_STRAIGHT.ToString()); CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT); CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT); /* values */ long startPos = data.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream foreach (BytesRef v in values) { if (v != null) { data.WriteBytes(v.Bytes, v.Offset, v.Length); } } /* addresses */ long maxAddress = data.Position - startPos; // LUCENENET specific: Renamed from getFilePointer() to match FileStream index.WriteVInt64(maxAddress); int maxDoc = state.SegmentInfo.DocCount; if (Debugging.AssertsEnabled) { Debugging.Assert(maxDoc != int.MaxValue); // unsupported by the 4.0 impl } PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc + 1, PackedInt32s.BitsRequired(maxAddress), PackedInt32s.DEFAULT); long currentPosition = 0; foreach (BytesRef v in values) { w.Add(currentPosition); if (v != null) { currentPosition += v.Length; } } // write sentinel if (Debugging.AssertsEnabled) { Debugging.Assert(currentPosition == maxAddress); } w.Add(currentPosition); w.Finish(); }
protected override void Flush() { if (Debugging.AssertsEnabled) { Debugging.Assert(m_off > 0); } long min = long.MaxValue, max = long.MinValue; for (int i = 0; i < m_off; ++i) { min = Math.Min(m_values[i], min); max = Math.Max(m_values[i], max); } long delta = max - min; int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInt32s.BitsRequired(delta); if (bitsRequired == 64) { // no need to delta-encode min = 0L; } else if (min > 0L) { // make min as small as possible so that writeVLong requires fewer bytes min = Math.Max(0L, max - PackedInt32s.MaxValue(bitsRequired)); } int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? MIN_VALUE_EQUALS_0 : 0); m_out.WriteByte((byte)(sbyte)token); if (min != 0) { WriteVInt64(m_out, ZigZagEncode(min) - 1); } if (bitsRequired > 0) { if (min != 0) { for (int i = 0; i < m_off; ++i) { m_values[i] -= min; } } WriteValues(bitsRequired); } m_off = 0; }
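// Worked header example for the Flush above (illustrative values): a block holding 1000..1127 has
// min = 1000, a maximum delta of 127 and bitsRequired = 7, so the token byte is (7 << BPV_SHIFT) with the
// MIN_VALUE_EQUALS_0 bit clear, followed by ZigZagEncode(1000) - 1 as a VLong and then the 7-bit packed
// deltas. A minimal writer-side sketch, assuming this Flush belongs to the BlockPackedWriter family used
// elsewhere in this file and that "output" is a DataOutput the caller already has open:
BlockPackedWriter blockWriter = new BlockPackedWriter(output, 128); // 128-value blocks
blockWriter.Add(1000L);
blockWriter.Add(1001L);
blockWriter.Finish(); // flushes the final, partially filled block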
private void AddFixedDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values, int length) { field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_DEREF.ToString()); CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT); CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT); // deduplicate JCG.SortedSet <BytesRef> dictionary = new JCG.SortedSet <BytesRef>(); foreach (BytesRef v in values) { dictionary.Add(v == null ? new BytesRef() : BytesRef.DeepCopyOf(v)); } /* values */ data.WriteInt32(length); foreach (BytesRef v in dictionary) { data.WriteBytes(v.Bytes, v.Offset, v.Length); } /* ordinals */ int valueCount = dictionary.Count; if (Debugging.AssertsEnabled) { Debugging.Assert(valueCount > 0); } index.WriteInt32(valueCount); int maxDoc = state.SegmentInfo.DocCount; PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(valueCount - 1), PackedInt32s.DEFAULT); BytesRef brefDummy; foreach (BytesRef v in values) { brefDummy = v; if (v == null) { brefDummy = new BytesRef(); } //int ord = dictionary.HeadSet(brefDummy).Size(); int ord = dictionary.Count(@ref => @ref.CompareTo(brefDummy) < 0); w.Add(ord); } w.Finish(); }
private void Rehash() { PagedGrowableWriter oldTable = table; table = new PagedGrowableWriter(2 * oldTable.Count, 1 << 30, PackedInt32s.BitsRequired(count), PackedInt32s.COMPACT); mask = table.Count - 1; for (long idx = 0; idx < oldTable.Count; idx++) { long address = oldTable.Get(idx); if (address != 0) { AddNew(address); } } }
private void EnsureCapacity(long value) { if ((value & currentMask) == value) { return; } int bitsRequired = value < 0 ? 64 : PackedInt32s.BitsRequired(value); Debug.Assert(bitsRequired > current.BitsPerValue); int valueCount = Count; PackedInt32s.Mutable next = PackedInt32s.GetMutable(valueCount, bitsRequired, acceptableOverheadRatio); PackedInt32s.Copy(current, 0, next, 0, valueCount, PackedInt32s.DEFAULT_BUFFER_SIZE); current = next; currentMask = Mask(current.BitsPerValue); }
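// Hedged illustration of the fast path above: a value fits in the current width exactly when masking it
// with the all-ones mask of that width leaves it unchanged; PackedInt32s.MaxValue(bits) is that mask, and
// the Mask helper used above is presumably equivalent. The numbers are made up.
long mask5 = PackedInt32s.MaxValue(5);  // 31
bool fits = (29L & mask5) == 29L;       // true  -> keep the current storage
bool mustGrow = (37L & mask5) == 37L;   // false -> reallocate with BitsRequired(37) = 6 bits per value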
/// <summary> /// NOTE: This was saveInts() in Lucene. /// </summary> private static void SaveInt32s(int[] values, int length, DataOutput @out) { if (Debugging.AssertsEnabled) { Debugging.Assert(length > 0); } if (length == 1) { @out.WriteVInt32(values[0]); } else { bool allEqual = true; for (int i = 1; i < length; ++i) { if (values[i] != values[0]) { allEqual = false; break; } } if (allEqual) { @out.WriteVInt32(0); @out.WriteVInt32(values[0]); } else { long max = 0; for (int i = 0; i < length; ++i) { max |= (uint)values[i]; } int bitsRequired = PackedInt32s.BitsRequired(max); @out.WriteVInt32(bitsRequired); PackedInt32s.Writer w = PackedInt32s.GetWriterNoHeader(@out, PackedInt32s.Format.PACKED, length, bitsRequired, 1); for (int i = 0; i < length; ++i) { w.Add(values[i]); } w.Finish(); } } }
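// Worked examples of the three branches above (values are illustrative): length == 1 writes a single VInt;
// values = { 7, 7, 7 } hits the all-equal branch and writes VInt(0) followed by VInt(7); values = { 3, 9, 4 }
// OR to 15, so the method writes VInt(4) and then a headerless PACKED stream of three 4-bit entries.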
private void AddVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values) { field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.ToString()); CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT); CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT); // deduplicate SortedSet <BytesRef> dictionary = new SortedSet <BytesRef>(); foreach (BytesRef v in values) { dictionary.Add(v == null ? new BytesRef() : BytesRef.DeepCopyOf(v)); } /* values */ long startPosition = data.GetFilePointer(); long currentAddress = 0; Dictionary <BytesRef, long> valueToAddress = new Dictionary <BytesRef, long>(); foreach (BytesRef v in dictionary) { currentAddress = data.GetFilePointer() - startPosition; valueToAddress[v] = currentAddress; WriteVShort(data, v.Length); data.WriteBytes(v.Bytes, v.Offset, v.Length); } /* ordinals */ long totalBytes = data.GetFilePointer() - startPosition; index.WriteInt64(totalBytes); int maxDoc = state.SegmentInfo.DocCount; PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(currentAddress), PackedInt32s.DEFAULT); foreach (BytesRef v in values) { w.Add(valueToAddress[v == null ? new BytesRef() : v]); } w.Finish(); }
internal override void PackPendingValues() { if (Debugging.AssertsEnabled) { Debugging.Assert(pendingOff > 0); } minValues[valuesOff] = pending[0]; averages[valuesOff] = pendingOff == 1 ? 0 : (float)(pending[pendingOff - 1] - pending[0]) / (pendingOff - 1); for (int i = 0; i < pendingOff; ++i) { // LUCENENET NOTE: IMPORTANT: The cast to float is critical here for it to work in x86 pending[i] = ZigZagEncode(pending[i] - minValues[valuesOff] - (long)(float)(averages[valuesOff] * (long)i)); } long maxDelta = 0; for (int i = 0; i < pendingOff; ++i) { if (pending[i] < 0) { maxDelta = -1; break; } else { maxDelta = Math.Max(maxDelta, pending[i]); } } if (maxDelta == 0) { values[valuesOff] = new PackedInt32s.NullReader(pendingOff); } else { int bitsRequired = maxDelta < 0 ? 64 : PackedInt32s.BitsRequired(maxDelta); PackedInt32s.Mutable mutable = PackedInt32s.GetMutable(pendingOff, bitsRequired, acceptableOverheadRatio); for (int i = 0; i < pendingOff;) { i += mutable.Set(i, pending, i, pendingOff - i); } values[valuesOff] = mutable; } }
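// Worked example for the monotonic packing above (illustrative values): pending = { 100, 110, 120, 130 }
// gives minValues = 100 and an average slope of (130 - 100) / 3 = 10; every prediction 100 + 10 * i is
// exact, so all zig-zag encoded residuals are 0, maxDelta stays 0, and the block is stored as a
// PackedInt32s.NullReader that costs no bits per value.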
/// <summary> /// Returns a sorted array containing unique field numbers. </summary> private int[] FlushFieldNums() { JCG.SortedSet <int> fieldNums = new JCG.SortedSet <int>(); foreach (DocData dd in pendingDocs) { foreach (FieldData fd in dd.fields) { fieldNums.Add(fd.fieldNum); } } int numDistinctFields = fieldNums.Count; if (Debugging.AssertsEnabled) { Debugging.Assert(numDistinctFields > 0); } int bitsRequired = PackedInt32s.BitsRequired(fieldNums.Max); int token = (Math.Min(numDistinctFields - 1, 0x07) << 5) | bitsRequired; vectorsStream.WriteByte((byte)token); if (numDistinctFields - 1 >= 0x07) { vectorsStream.WriteVInt32(numDistinctFields - 1 - 0x07); } PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, fieldNums.Count, bitsRequired, 1); foreach (int fieldNum in fieldNums) { writer.Add(fieldNum); } writer.Finish(); int[] fns = new int[fieldNums.Count]; int i = 0; foreach (int key in fieldNums) { fns[i++] = key; } return(fns); }
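// Worked token example for the header above (illustrative counts): with 10 distinct fields whose largest
// number is 300, bitsRequired = PackedInt32s.BitsRequired(300) = 9 and
// token = (Math.Min(10 - 1, 0x07) << 5) | 9 = (7 << 5) | 9 = 233; because 10 - 1 >= 0x07 the writer then
// emits VInt(10 - 1 - 0x07) = VInt(2) before the packed field numbers themselves.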
internal override void PackPendingValues() { // compute max delta long minValue = pending[0]; long maxValue = pending[0]; for (int i = 1; i < pendingOff; ++i) { minValue = Math.Min(minValue, pending[i]); maxValue = Math.Max(maxValue, pending[i]); } // build a new packed reader int bitsRequired = minValue < 0 ? 64 : PackedInt32s.BitsRequired(maxValue); PackedInt32s.Mutable mutable = PackedInt32s.GetMutable(pendingOff, bitsRequired, acceptableOverheadRatio); for (int i = 0; i < pendingOff;) { i += mutable.Set(i, pending, i, pendingOff - i); } values[valuesOff] = mutable; }
public override void Encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations) { int nextBlock = 0; int bitsLeft = 8; for (int i = 0; i < byteValueCount * iterations; ++i) { int v = values[valuesOffset++]; if (Debugging.AssertsEnabled) { Debugging.Assert(PackedInt32s.BitsRequired(v & 0xFFFFFFFFL) <= bitsPerValue); } if (bitsPerValue < bitsLeft) { // just buffer nextBlock |= v << (bitsLeft - bitsPerValue); bitsLeft -= bitsPerValue; } else { // flush as many blocks as possible int bits = bitsPerValue - bitsLeft; blocks[blocksOffset++] = (byte)(nextBlock | (v.TripleShift(bits))); while (bits >= 8) { bits -= 8; blocks[blocksOffset++] = (byte)(v.TripleShift(bits)); } // then buffer bitsLeft = 8 - bits; nextBlock = (v & ((1 << bits) - 1)) << bitsLeft; } } if (Debugging.AssertsEnabled) { Debugging.Assert(bitsLeft == 8); } }
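// Worked bit-packing example for the encoder above, with bitsPerValue = 3 and illustrative values 5, 6, 7:
// 5 (101) and 6 (110) are buffered into the pending byte as 10111000 with 2 bits left; 7 (111) no longer
// fits, so one byte 10111011 (0xBB) is flushed and the remaining low bit of 7 starts the next byte as
// 10000000 with 7 bits left, i.e. the sequence 101 110 111 laid out straight across the byte boundary.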
internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage) { Meta.WriteVInt32(field.Number); Meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER); Meta.WriteInt64(Data.GetFilePointer()); long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; // TODO: more efficient? HashSet <long> uniqueValues = null; if (optimizeStorage) { uniqueValues = new HashSet <long>(); long count = 0; foreach (long?nv in values) { // TODO: support this as MemoryDVFormat (and be smart about missing maybe) long v = nv.GetValueOrDefault(); if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } Debug.Assert(count == MaxDoc); } if (uniqueValues != null) { // small number of unique values int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio); if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) { Meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed foreach (long?nv in values) { Data.WriteByte((byte)nv.GetValueOrDefault()); } } else { Meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed long[] decode = uniqueValues.ToArray(/*new long?[uniqueValues.Count]*/); var encode = new Dictionary <long, int>(); Data.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { Data.WriteInt64(decode[i]); encode[decode[i]] = i; } Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); Data.WriteVInt32(formatAndBits.Format.Id); Data.WriteVInt32(formatAndBits.BitsPerValue); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(Data, formatAndBits.Format, MaxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (long?nv in values) { writer.Add(encode[nv.GetValueOrDefault()]); } writer.Finish(); } } else if (gcd != 0 && gcd != 1) { Meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED); Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); Data.WriteInt64(minValue); Data.WriteInt64(gcd); Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE); BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE); foreach (long?nv in values) { writer.Add((nv.GetValueOrDefault() - minValue) / gcd); } writer.Finish(); } else { Meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE); BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE); foreach (long?nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); } }
private void WriteBlock() { if (Debugging.AssertsEnabled) { Debugging.Assert(blockChunks > 0); } fieldsIndexOut.WriteVInt32(blockChunks); // The trick here is that we only store the difference from the average start // pointer or doc base, this helps save bits per value. // And in order to prevent a few chunks that would be far from the average to // raise the number of bits per value for all of them, we only encode blocks // of 1024 chunks at once // See LUCENE-4512 // doc bases int avgChunkDocs; if (blockChunks == 1) { avgChunkDocs = 0; } else { avgChunkDocs = (int)Math.Round((float)(blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1)); } fieldsIndexOut.WriteVInt32(totalDocs - blockDocs); // docBase fieldsIndexOut.WriteVInt32(avgChunkDocs); int docBase = 0; long maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { int delta = docBase - avgChunkDocs * i; maxDelta |= MoveSignToLowOrderBit(delta); docBase += docBaseDeltas[i]; } int bitsPerDocBase = PackedInt32s.BitsRequired(maxDelta); fieldsIndexOut.WriteVInt32(bitsPerDocBase); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerDocBase, 1); docBase = 0; for (int i = 0; i < blockChunks; ++i) { long delta = docBase - avgChunkDocs * i; if (Debugging.AssertsEnabled) { Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(delta)) <= writer.BitsPerValue); } writer.Add(MoveSignToLowOrderBit(delta)); docBase += docBaseDeltas[i]; } writer.Finish(); // start pointers fieldsIndexOut.WriteVInt64(firstStartPointer); long avgChunkSize; if (blockChunks == 1) { avgChunkSize = 0; } else { avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1); } fieldsIndexOut.WriteVInt64(avgChunkSize); long startPointer = 0; maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; long delta = startPointer - avgChunkSize * i; maxDelta |= MoveSignToLowOrderBit(delta); } int bitsPerStartPointer = PackedInt32s.BitsRequired(maxDelta); fieldsIndexOut.WriteVInt32(bitsPerStartPointer); writer = PackedInt32s.GetWriterNoHeader(fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerStartPointer, 1); startPointer = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; long delta = startPointer - avgChunkSize * i; if (Debugging.AssertsEnabled) { Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(delta)) <= writer.BitsPerValue); } writer.Add(MoveSignToLowOrderBit(delta)); } writer.Finish(); }
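// Worked example for the two-pass scheme above (illustrative numbers): with avgChunkDocs = 128 and each
// chunk's docBase staying within a few documents of 128 * i, the deltas docBase - avgChunkDocs * i remain
// tiny; MoveSignToLowOrderBit (presumably the usual zig-zag mapping) turns a delta of -3 into 5 and +3
// into 6, so bitsPerDocBase = PackedInt32s.BitsRequired(6) = 3 for the whole block instead of a VInt per chunk.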
[ExceptionToNetNumericConvention] // LUCENENET: Private API, keeping as-is private void AddVarIntsField(FieldInfo field, IndexOutput output, IEnumerable <long?> values, long minValue, long maxValue) { field.PutAttribute(legacyKey, LegacyDocValuesType.VAR_INTS.ToString()); CodecUtil.WriteHeader(output, Lucene40DocValuesFormat.VAR_INTS_CODEC_NAME, Lucene40DocValuesFormat.VAR_INTS_VERSION_CURRENT); long delta = maxValue - minValue; if (delta < 0) { // writes longs output.WriteByte((byte)Lucene40DocValuesFormat.VAR_INTS_FIXED_64); foreach (long?n in values) { output.WriteInt64(n.GetValueOrDefault()); } } else { // writes packed ints output.WriteByte((byte)Lucene40DocValuesFormat.VAR_INTS_PACKED); output.WriteInt64(minValue); output.WriteInt64(0 - minValue); // default value (representation of 0) PackedInt32s.Writer writer = PackedInt32s.GetWriter(output, state.SegmentInfo.DocCount, PackedInt32s.BitsRequired(delta), PackedInt32s.DEFAULT); foreach (long?n in values) { writer.Add(n.GetValueOrDefault() - minValue); } writer.Finish(); } }
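// The branch above picks between two layouts (worked example with illustrative bounds): when
// maxValue - minValue overflows into a negative delta the values are written verbatim as fixed 64-bit
// longs (VAR_INTS_FIXED_64); otherwise, with minValue = 1000 and maxValue = 1003, delta = 3 and every
// document's value is stored in PackedInt32s.BitsRequired(3) = 2 bits after subtracting minValue.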
public override Fields Get(int doc) { EnsureOpen(); // seek to the right place { long startPointer = indexReader.GetStartPointer(doc); vectorsStream.Seek(startPointer); } // decode // - docBase: first doc ID of the chunk // - chunkDocs: number of docs of the chunk int docBase = vectorsStream.ReadVInt32(); int chunkDocs = vectorsStream.ReadVInt32(); if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) { throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc + " (resource=" + vectorsStream + ")"); } int skip; // number of fields to skip int numFields; // number of fields of the document we're looking for int totalFields; // total number of fields of the chunk (sum for all docs) if (chunkDocs == 1) { skip = 0; numFields = totalFields = vectorsStream.ReadVInt32(); } else { reader.Reset(vectorsStream, chunkDocs); int sum = 0; for (int i = docBase; i < doc; ++i) { sum += (int)reader.Next(); } skip = sum; numFields = (int)reader.Next(); sum += numFields; for (int i = doc + 1; i < docBase + chunkDocs; ++i) { sum += (int)reader.Next(); } totalFields = sum; } if (numFields == 0) { // no vectors return(null); } // read field numbers that have term vectors int[] fieldNums; { int token = vectorsStream.ReadByte() & 0xFF; Debug.Assert(token != 0); // means no term vectors, cannot happen since we checked for numFields == 0 int bitsPerFieldNum = token & 0x1F; int totalDistinctFields = (int)((uint)token >> 5); if (totalDistinctFields == 0x07) { totalDistinctFields += vectorsStream.ReadVInt32(); } ++totalDistinctFields; PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1); fieldNums = new int[totalDistinctFields]; for (int i = 0; i < totalDistinctFields; ++i) { fieldNums[i] = (int)it.Next(); } } // read field numbers and flags int[] fieldNumOffs = new int[numFields]; PackedInt32s.Reader flags; { int bitsPerOff = PackedInt32s.BitsRequired(fieldNums.Length - 1); PackedInt32s.Reader allFieldNumOffs = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff); switch (vectorsStream.ReadVInt32()) { case 0: PackedInt32s.Reader fieldFlags = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, fieldNums.Length, CompressingTermVectorsWriter.FLAGS_BITS); PackedInt32s.Mutable f = PackedInt32s.GetMutable(totalFields, CompressingTermVectorsWriter.FLAGS_BITS, PackedInt32s.COMPACT); for (int i = 0; i < totalFields; ++i) { int fieldNumOff = (int)allFieldNumOffs.Get(i); Debug.Assert(fieldNumOff >= 0 && fieldNumOff < fieldNums.Length); int fgs = (int)fieldFlags.Get(fieldNumOff); f.Set(i, fgs); } flags = f; break; case 1: flags = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, CompressingTermVectorsWriter.FLAGS_BITS); break; default: throw new Exception(); } for (int i = 0; i < numFields; ++i) { fieldNumOffs[i] = (int)allFieldNumOffs.Get(skip + i); } } // number of terms per field for all fields PackedInt32s.Reader numTerms; int totalTerms; { int bitsRequired = vectorsStream.ReadVInt32(); numTerms = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, bitsRequired); int sum = 0; for (int i = 0; i < totalFields; ++i) { sum += (int)numTerms.Get(i); } totalTerms = sum; } // term lengths int docOff = 0, docLen = 0, totalLen; int[] 
fieldLengths = new int[numFields]; int[][] prefixLengths = new int[numFields][]; int[][] suffixLengths = new int[numFields][]; { reader.Reset(vectorsStream, totalTerms); // skip int toSkip = 0; for (int i = 0; i < skip; ++i) { toSkip += (int)numTerms.Get(i); } reader.Skip(toSkip); // read prefix lengths for (int i = 0; i < numFields; ++i) { int termCount = (int)numTerms.Get(skip + i); int[] fieldPrefixLengths = new int[termCount]; prefixLengths[i] = fieldPrefixLengths; for (int j = 0; j < termCount;) { Int64sRef next = reader.Next(termCount - j); for (int k = 0; k < next.Length; ++k) { fieldPrefixLengths[j++] = (int)next.Int64s[next.Offset + k]; } } } reader.Skip(totalTerms - reader.Ord); reader.Reset(vectorsStream, totalTerms); // skip toSkip = 0; for (int i = 0; i < skip; ++i) { for (int j = 0; j < numTerms.Get(i); ++j) { docOff += (int)reader.Next(); } } for (int i = 0; i < numFields; ++i) { int termCount = (int)numTerms.Get(skip + i); int[] fieldSuffixLengths = new int[termCount]; suffixLengths[i] = fieldSuffixLengths; for (int j = 0; j < termCount;) { Int64sRef next = reader.Next(termCount - j); for (int k = 0; k < next.Length; ++k) { fieldSuffixLengths[j++] = (int)next.Int64s[next.Offset + k]; } } fieldLengths[i] = Sum(suffixLengths[i]); docLen += fieldLengths[i]; } totalLen = docOff + docLen; for (int i = skip + numFields; i < totalFields; ++i) { for (int j = 0; j < numTerms.Get(i); ++j) { totalLen += (int)reader.Next(); } } } // term freqs int[] termFreqs = new int[totalTerms]; { reader.Reset(vectorsStream, totalTerms); for (int i = 0; i < totalTerms;) { Int64sRef next = reader.Next(totalTerms - i); for (int k = 0; k < next.Length; ++k) { termFreqs[i++] = 1 + (int)next.Int64s[next.Offset + k]; } } } // total number of positions, offsets and payloads int totalPositions = 0, totalOffsets = 0, totalPayloads = 0; for (int i = 0, termIndex = 0; i < totalFields; ++i) { int f = (int)flags.Get(i); int termCount = (int)numTerms.Get(i); for (int j = 0; j < termCount; ++j) { int freq = termFreqs[termIndex++]; if ((f & CompressingTermVectorsWriter.POSITIONS) != 0) { totalPositions += freq; } if ((f & CompressingTermVectorsWriter.OFFSETS) != 0) { totalOffsets += freq; } if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0) { totalPayloads += freq; } } Debug.Assert(i != totalFields - 1 || termIndex == totalTerms, termIndex + " " + totalTerms); } int[][] positionIndex = PositionIndex(skip, numFields, numTerms, termFreqs); int[][] positions, startOffsets, lengths; if (totalPositions > 0) { positions = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.POSITIONS, totalPositions, positionIndex); } else { positions = new int[numFields][]; } if (totalOffsets > 0) { // average number of chars per term float[] charsPerTerm = new float[fieldNums.Length]; for (int i = 0; i < charsPerTerm.Length; ++i) { charsPerTerm[i] = J2N.BitConversion.Int32BitsToSingle(vectorsStream.ReadInt32()); } startOffsets = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.OFFSETS, totalOffsets, positionIndex); lengths = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.OFFSETS, totalOffsets, positionIndex); for (int i = 0; i < numFields; ++i) { int[] fStartOffsets = startOffsets[i]; int[] fPositions = positions[i]; // patch offsets from positions if (fStartOffsets != null && fPositions != null) { float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]]; for (int j = 0; j < startOffsets[i].Length; ++j) { 
fStartOffsets[j] += (int)(fieldCharsPerTerm * fPositions[j]); } } if (fStartOffsets != null) { int[] fPrefixLengths = prefixLengths[i]; int[] fSuffixLengths = suffixLengths[i]; int[] fLengths = lengths[i]; for (int j = 0, end = (int)numTerms.Get(skip + i); j < end; ++j) { // delta-decode start offsets and patch lengths using term lengths int termLength = fPrefixLengths[j] + fSuffixLengths[j]; lengths[i][positionIndex[i][j]] += termLength; for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) { fStartOffsets[k] += fStartOffsets[k - 1]; fLengths[k] += termLength; } } } } } else { startOffsets = lengths = new int[numFields][]; } if (totalPositions > 0) { // delta-decode positions for (int i = 0; i < numFields; ++i) { int[] fPositions = positions[i]; int[] fpositionIndex = positionIndex[i]; if (fPositions != null) { for (int j = 0, end = (int)numTerms.Get(skip + i); j < end; ++j) { // delta-decode start offsets for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) { fPositions[k] += fPositions[k - 1]; } } } } } // payload lengths int[][] payloadIndex = new int[numFields][]; int totalPayloadLength = 0; int payloadOff = 0; int payloadLen = 0; if (totalPayloads > 0) { reader.Reset(vectorsStream, totalPayloads); // skip int termIndex = 0; for (int i = 0; i < skip; ++i) { int f = (int)flags.Get(i); int termCount = (int)numTerms.Get(i); if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0) { for (int j = 0; j < termCount; ++j) { int freq = termFreqs[termIndex + j]; for (int k = 0; k < freq; ++k) { int l = (int)reader.Next(); payloadOff += l; } } } termIndex += termCount; } totalPayloadLength = payloadOff; // read doc payload lengths for (int i = 0; i < numFields; ++i) { int f = (int)flags.Get(skip + i); int termCount = (int)numTerms.Get(skip + i); if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0) { int totalFreq = positionIndex[i][termCount]; payloadIndex[i] = new int[totalFreq + 1]; int posIdx = 0; payloadIndex[i][posIdx] = payloadLen; for (int j = 0; j < termCount; ++j) { int freq = termFreqs[termIndex + j]; for (int k = 0; k < freq; ++k) { int payloadLength = (int)reader.Next(); payloadLen += payloadLength; payloadIndex[i][posIdx + 1] = payloadLen; ++posIdx; } } Debug.Assert(posIdx == totalFreq); } termIndex += termCount; } totalPayloadLength += payloadLen; for (int i = skip + numFields; i < totalFields; ++i) { int f = (int)flags.Get(i); int termCount = (int)numTerms.Get(i); if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0) { for (int j = 0; j < termCount; ++j) { int freq = termFreqs[termIndex + j]; for (int k = 0; k < freq; ++k) { totalPayloadLength += (int)reader.Next(); } } } termIndex += termCount; } Debug.Assert(termIndex == totalTerms, termIndex + " " + totalTerms); } // decompress data BytesRef suffixBytes = new BytesRef(); decompressor.Decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes); suffixBytes.Length = docLen; BytesRef payloadBytes = new BytesRef(suffixBytes.Bytes, suffixBytes.Offset + docLen, payloadLen); int[] FieldFlags = new int[numFields]; for (int i = 0; i < numFields; ++i) { FieldFlags[i] = (int)flags.Get(skip + i); } int[] fieldNumTerms = new int[numFields]; for (int i = 0; i < numFields; ++i) { fieldNumTerms[i] = (int)numTerms.Get(skip + i); } int[][] fieldTermFreqs = new int[numFields][]; { int termIdx = 0; for (int i = 0; i < skip; ++i) { termIdx += (int)numTerms.Get(i); } for (int i = 0; i < numFields; ++i) { int termCount = (int)numTerms.Get(skip + i); fieldTermFreqs[i] = 
new int[termCount]; for (int j = 0; j < termCount; ++j) { fieldTermFreqs[i][j] = termFreqs[termIdx++]; } } } Debug.Assert(Sum(fieldLengths) == docLen, Sum(fieldLengths) + " != " + docLen); return(new TVFields(this, fieldNums, FieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths, prefixLengths, suffixLengths, fieldTermFreqs, positionIndex, positions, startOffsets, lengths, payloadBytes, payloadIndex, suffixBytes)); }
internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage) { long count = 0; long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; bool missing = false; // TODO: more efficient? JCG.HashSet <long> uniqueValues = null; if (optimizeStorage) { uniqueValues = new JCG.HashSet <long>(); foreach (long?nv in values) { long v; if (nv == null) { v = 0; missing = true; } else { v = nv.Value; } if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } } else { foreach (var nv in values) { ++count; } } long delta = maxValue - minValue; int format; if (uniqueValues != null && (delta < 0L || PackedInt32s.BitsRequired(uniqueValues.Count - 1) < PackedInt32s.BitsRequired(delta)) && count <= int.MaxValue) { format = TABLE_COMPRESSED; } else if (gcd != 0 && gcd != 1) { format = GCD_COMPRESSED; } else { format = DELTA_COMPRESSED; } meta.WriteVInt32(field.Number); meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC); meta.WriteVInt32(format); if (missing) { meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream WriteMissingBitset(values); } else { meta.WriteInt64(-1L); } meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream meta.WriteVInt64(count); meta.WriteVInt32(BLOCK_SIZE); switch (format) { case GCD_COMPRESSED: meta.WriteInt64(minValue); meta.WriteInt64(gcd); BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { quotientWriter.Add((nv.GetValueOrDefault() - minValue) / gcd); } quotientWriter.Finish(); break; case DELTA_COMPRESSED: BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); break; case TABLE_COMPRESSED: // LUCENENET NOTE: diming an array and then using .CopyTo() for better efficiency than LINQ .ToArray() long[] decode = new long[uniqueValues.Count]; uniqueValues.CopyTo(decode, 0); Dictionary <long, int> encode = new Dictionary <long, int>(); meta.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { meta.WriteInt64(decode[i]); encode[decode[i]] = i; } int bitsRequired = PackedInt32s.BitsRequired(uniqueValues.Count - 1); PackedInt32s.Writer ordsWriter = PackedInt32s.GetWriterNoHeader(data, PackedInt32s.Format.PACKED, (int)count, bitsRequired, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (long?nv in values) { ordsWriter.Add(encode[nv.GetValueOrDefault()]); } ordsWriter.Finish(); break; default: throw AssertionError.Create(); } }
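// Worked format selection for the writer above (illustrative stats over 1,000,000 documents): 3 unique
// values spanning a delta of 1,000,000 choose TABLE_COMPRESSED because BitsRequired(3 - 1) = 2 is far
// below BitsRequired(1000000) = 20; values that all sit on a 1000-spaced grid but are too varied for the
// table keep a gcd that is a multiple of 1000 and choose GCD_COMPRESSED; everything else falls back to
// DELTA_COMPRESSED block packing.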
public override void Finish(long termsFilePointer) { // write primary terms dict offsets packedIndexStart = outerInstance.m_output.GetFilePointer(); PackedInt32s.Writer w = PackedInt32s.GetWriter(outerInstance.m_output, numIndexTerms, PackedInt32s.BitsRequired(termsFilePointer), PackedInt32s.DEFAULT); // relative to our indexStart long upto = 0; for (int i = 0; i < numIndexTerms; i++) { upto += termsPointerDeltas[i]; w.Add(upto); } w.Finish(); packedOffsetsStart = outerInstance.m_output.GetFilePointer(); // write offsets into the byte[] terms w = PackedInt32s.GetWriter(outerInstance.m_output, 1 + numIndexTerms, PackedInt32s.BitsRequired(totTermLength), PackedInt32s.DEFAULT); upto = 0; for (int i = 0; i < numIndexTerms; i++) { w.Add(upto); upto += termLengths[i]; } w.Add(upto); w.Finish(); // our referrer holds onto us, while other fields are // being written, so don't tie up this RAM: termLengths = null; termsPointerDeltas = null; }
internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage) { meta.WriteVInt32(field.Number); meta.WriteByte(MemoryDocValuesProducer.NUMBER); meta.WriteInt64(data.GetFilePointer()); long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; bool missing = false; // TODO: more efficient? ISet <long?> uniqueValues = null; if (optimizeStorage) { uniqueValues = new JCG.HashSet <long?>(); long count = 0; foreach (var nv in values) { long v; if (nv == null) { v = 0; missing = true; } else { v = nv.Value; } if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); } } if (missing) { long start = data.GetFilePointer(); WriteMissingBitset(values); meta.WriteInt64(start); meta.WriteInt64(data.GetFilePointer() - start); } else { meta.WriteInt64(-1L); } if (uniqueValues != null) { // small number of unique values int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) { meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed foreach (var nv in values) { data.WriteByte((byte)nv.GetValueOrDefault()); } } else { meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed long?[] decode = new long?[uniqueValues.Count]; uniqueValues.CopyTo(decode, 0); var encode = new Dictionary <long?, int?>(); data.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { data.WriteInt64(decode[i].Value); encode[decode[i]] = i; } meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(formatAndBits.Format.Id); data.WriteVInt32(formatAndBits.BitsPerValue); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (var nv in values) { var v = encode[nv.GetValueOrDefault()]; writer.Add((long)v); } writer.Finish(); } } else if (gcd != 0 && gcd != 1) { meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED); meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteInt64(minValue); data.WriteInt64(gcd); data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE); var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE); foreach (var nv in values) { writer.Add((nv.GetValueOrDefault() - minValue) / gcd); } writer.Finish(); } else { meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE); var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE); foreach (var nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); } }
public override void AddNumericField(FieldInfo field, IEnumerable <long?> values) { meta.WriteVInt32(field.Number); meta.WriteByte((byte)NUMBER); meta.WriteInt64(data.GetFilePointer()); long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; // TODO: more efficient? HashSet <long> uniqueValues = null; if (true) { uniqueValues = new HashSet <long>(); long count = 0; foreach (long?nv in values) { Debug.Assert(nv != null); long v = nv.Value; if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } Debug.Assert(count == maxDoc); } if (uniqueValues != null) { // small number of unique values int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) { meta.WriteByte((byte)UNCOMPRESSED); // uncompressed foreach (long?nv in values) { data.WriteByte((byte)nv.GetValueOrDefault()); } } else { meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed var decode = uniqueValues.ToArray(); var encode = new Dictionary <long, int>(); data.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { data.WriteInt64(decode[i]); encode[decode[i]] = i; } meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(formatAndBits.Format.Id); data.WriteVInt32(formatAndBits.BitsPerValue); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (long?nv in values) { writer.Add(encode[nv.GetValueOrDefault()]); } writer.Finish(); } } else if (gcd != 0 && gcd != 1) { meta.WriteByte((byte)GCD_COMPRESSED); meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteInt64(minValue); data.WriteInt64(gcd); data.WriteVInt32(BLOCK_SIZE); var writer = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { writer.Add((nv.GetValueOrDefault() - minValue) / gcd); } writer.Finish(); } else { meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(BLOCK_SIZE); var writer = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); } }
private void FlushFields(int totalFields, int[] fieldNums) { PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, totalFields, PackedInt32s.BitsRequired(fieldNums.Length - 1), 1); foreach (DocData dd in pendingDocs) { foreach (FieldData fd in dd.fields) { int fieldNumIndex = Array.BinarySearch(fieldNums, fd.fieldNum); if (Debugging.AssertsEnabled) { Debugging.Assert(fieldNumIndex >= 0); } writer.Add(fieldNumIndex); } } writer.Finish(); }
public override void Fill(int fromIndex, int toIndex, long val) { if (Debugging.AssertsEnabled) { Debugging.Assert(PackedInt32s.BitsRequired(val) <= BitsPerValue); Debugging.Assert(fromIndex <= toIndex); } // minimum number of values that use an exact number of full blocks int nAlignedValues = 64 / Gcd(64, m_bitsPerValue); int span = toIndex - fromIndex; if (span <= 3 * nAlignedValues) { // there needs be at least 2 * nAlignedValues aligned values for the // block approach to be worth trying base.Fill(fromIndex, toIndex, val); return; } // fill the first values naively until the next block start int fromIndexModNAlignedValues = fromIndex % nAlignedValues; if (fromIndexModNAlignedValues != 0) { for (int i = fromIndexModNAlignedValues; i < nAlignedValues; ++i) { Set(fromIndex++, val); } } if (Debugging.AssertsEnabled) { Debugging.Assert(fromIndex % nAlignedValues == 0); } // compute the long[] blocks for nAlignedValues consecutive values and // use them to set as many values as possible without applying any mask // or shift int nAlignedBlocks = (nAlignedValues * m_bitsPerValue) >> 6; long[] nAlignedValuesBlocks; { Packed64 values = new Packed64(nAlignedValues, m_bitsPerValue); for (int i = 0; i < nAlignedValues; ++i) { values.Set(i, val); } nAlignedValuesBlocks = values.blocks; if (Debugging.AssertsEnabled) { Debugging.Assert(nAlignedBlocks <= nAlignedValuesBlocks.Length); } } int startBlock = (int)(((long)fromIndex * m_bitsPerValue).TripleShift(6)); int endBlock = (int)(((long)toIndex * m_bitsPerValue).TripleShift(6)); for (int block = startBlock; block < endBlock; ++block) { long blockValue = nAlignedValuesBlocks[block % nAlignedBlocks]; blocks[block] = blockValue; } // fill the gap for (int i = (int)(((long)endBlock << 6) / m_bitsPerValue); i < toIndex; ++i) { Set(i, val); } }
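// Worked alignment example for the Fill above (illustrative widths): with bitsPerValue = 4, Gcd(64, 4) = 4
// so nAlignedValues = 16 and sixteen 4-bit values tile exactly one 64-bit block; with bitsPerValue = 5,
// Gcd(64, 5) = 1 so nAlignedValues = 64 and the pattern only realigns every five blocks (320 bits), which
// is why the bulk path precomputes nAlignedBlocks block images and copies them verbatim into blocks[].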