public virtual void TestDataInputOutput() { Random random = Random; for (int iter = 0; iter < 5 * RandomMultiplier; iter++) { BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("testOverflow")); if (dir is MockDirectoryWrapper) { ((MockDirectoryWrapper)dir).Throttling = Throttling.NEVER; } int blockBits = TestUtil.NextInt32(random, 1, 20); int blockSize = 1 << blockBits; PagedBytes p = new PagedBytes(blockBits); IndexOutput @out = dir.CreateOutput("foo", IOContext.DEFAULT); int numBytes = TestUtil.NextInt32(LuceneTestCase.Random, 2, 10000000); byte[] answer = new byte[numBytes]; LuceneTestCase.Random.NextBytes(answer); int written = 0; while (written < numBytes) { if (LuceneTestCase.Random.Next(10) == 7) { @out.WriteByte(answer[written++]); } else { int chunk = Math.Min(LuceneTestCase.Random.Next(1000), numBytes - written); @out.WriteBytes(answer, written, chunk); written += chunk; } } @out.Dispose(); IndexInput input = dir.OpenInput("foo", IOContext.DEFAULT); DataInput @in = (DataInput)input.Clone(); p.Copy(input, input.Length); PagedBytes.Reader reader = p.Freeze(random.NextBoolean()); byte[] verify = new byte[numBytes]; int read = 0; while (read < numBytes) { if (LuceneTestCase.Random.Next(10) == 7) { verify[read++] = @in.ReadByte(); } else { int chunk = Math.Min(LuceneTestCase.Random.Next(1000), numBytes - read); @in.ReadBytes(verify, read, chunk); read += chunk; } } Assert.IsTrue(Arrays.Equals(answer, verify)); BytesRef slice = new BytesRef(); for (int iter2 = 0; iter2 < 100; iter2++) { int pos = random.Next(numBytes - 1); int len = random.Next(Math.Min(blockSize + 1, numBytes - pos)); reader.FillSlice(slice, pos, len); for (int byteUpto = 0; byteUpto < len; byteUpto++) { Assert.AreEqual(answer[pos + byteUpto], (byte)slice.Bytes[slice.Offset + byteUpto]); } } input.Dispose(); dir.Dispose(); } }
public override void AddNumericField(FieldInfo field, IEnumerable <long?> values) { meta.WriteVInt32(field.Number); meta.WriteByte((byte)NUMBER); meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream long minValue = long.MaxValue; long maxValue = long.MinValue; long gcd = 0; // TODO: more efficient? JCG.HashSet <long> uniqueValues /* = null*/; // LUCENENET: IDE0059: Remove unnecessary value assignment if (true) { uniqueValues = new JCG.HashSet <long>(); long count = 0; foreach (long?nv in values) { if (Debugging.AssertsEnabled) { Debugging.Assert(nv != null); } long v = nv.Value; if (gcd != 1) { if (v < long.MinValue / 2 || v > long.MaxValue / 2) { // in that case v - minValue might overflow and make the GCD computation return // wrong results. Since these extreme values are unlikely, we just discard // GCD computation for them gcd = 1; } // minValue needs to be set first else if (count != 0) { gcd = MathUtil.Gcd(gcd, v - minValue); } } minValue = Math.Min(minValue, v); maxValue = Math.Max(maxValue, v); if (uniqueValues != null) { if (uniqueValues.Add(v)) { if (uniqueValues.Count > 256) { uniqueValues = null; } } } ++count; } if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); } } if (uniqueValues != null) { // small number of unique values int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) { meta.WriteByte((byte)UNCOMPRESSED); // uncompressed foreach (long?nv in values) { data.WriteByte((byte)nv.GetValueOrDefault()); } } else { meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed var decode = new long[uniqueValues.Count]; uniqueValues.CopyTo(decode); var encode = new Dictionary <long, int>(); data.WriteVInt32(decode.Length); for (int i = 0; i < decode.Length; i++) { data.WriteInt64(decode[i]); encode[decode[i]] = i; } meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(formatAndBits.Format.Id); data.WriteVInt32(formatAndBits.BitsPerValue); PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); foreach (long?nv in values) { writer.Add(encode[nv.GetValueOrDefault()]); } writer.Finish(); } } else if (gcd != 0 && gcd != 1) { meta.WriteByte((byte)GCD_COMPRESSED); meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteInt64(minValue); data.WriteInt64(gcd); data.WriteVInt32(BLOCK_SIZE); var writer = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { writer.Add((nv.GetValueOrDefault() - minValue) / gcd); } writer.Finish(); } else { meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); data.WriteVInt32(BLOCK_SIZE); var writer = new BlockPackedWriter(data, BLOCK_SIZE); foreach (long?nv in values) { writer.Add(nv.GetValueOrDefault()); } writer.Finish(); } }
public override void WriteByte(byte b) { _indexOutput.WriteByte(b); }
public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context) { string fileName = IndexFileNames.SegmentFileName(segmentName, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION); IndexOutput output = directory.CreateOutput(fileName, context); bool success = false; try { CodecUtil.WriteHeader(output, Lucene46FieldInfosFormat.CODEC_NAME, Lucene46FieldInfosFormat.FORMAT_CURRENT); output.WriteVInt32(infos.Count); foreach (FieldInfo fi in infos) { IndexOptions indexOptions = fi.IndexOptions; sbyte bits = 0x0; if (fi.HasVectors) { bits |= Lucene46FieldInfosFormat.STORE_TERMVECTOR; } if (fi.OmitsNorms) { bits |= Lucene46FieldInfosFormat.OMIT_NORMS; } if (fi.HasPayloads) { bits |= Lucene46FieldInfosFormat.STORE_PAYLOADS; } if (fi.IsIndexed) { bits |= Lucene46FieldInfosFormat.IS_INDEXED; Debug.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.HasPayloads); if (indexOptions == IndexOptions.DOCS_ONLY) { bits |= Lucene46FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS; } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { bits |= Lucene46FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS; } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { bits |= Lucene46FieldInfosFormat.OMIT_POSITIONS; } } output.WriteString(fi.Name); output.WriteVInt32(fi.Number); output.WriteByte((byte)bits); // pack the DV types in one byte var dv = DocValuesByte(fi.DocValuesType); var nrm = DocValuesByte(fi.NormType); Debug.Assert((dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0); var val = (byte)(0xff & ((nrm << 4) | (byte)dv)); output.WriteByte(val); output.WriteInt64(fi.DocValuesGen); output.WriteStringStringMap(fi.Attributes); } CodecUtil.WriteFooter(output); success = true; } finally { if (success) { output.Dispose(); } else { IOUtils.CloseWhileHandlingException(output); } } }
public override void WriteField(FieldInfo info, IIndexableField field) { fieldsStream.WriteVInt32(info.Number); int bits = 0; BytesRef bytes; string @string; // TODO: maybe a field should serialize itself? // this way we don't bake into indexer all these // specific encodings for different fields? and apps // can customize... // LUCENENET specific - To avoid boxing/unboxing, we don't // call GetNumericValue(). Instead, we check the field.NumericType and then // call the appropriate conversion method. if (field.NumericType != NumericFieldType.NONE) { switch (field.NumericType) { case NumericFieldType.BYTE: case NumericFieldType.INT16: case NumericFieldType.INT32: bits |= FIELD_IS_NUMERIC_INT; break; case NumericFieldType.INT64: bits |= FIELD_IS_NUMERIC_LONG; break; case NumericFieldType.SINGLE: bits |= FIELD_IS_NUMERIC_FLOAT; break; case NumericFieldType.DOUBLE: bits |= FIELD_IS_NUMERIC_DOUBLE; break; default: throw new ArgumentException("cannot store numeric type " + field.NumericType); } @string = null; bytes = null; } else { bytes = field.GetBinaryValue(); if (bytes != null) { bits |= FIELD_IS_BINARY; @string = null; } else { @string = field.GetStringValue(); if (@string is null) { throw new ArgumentException("field " + field.Name + " is stored but does not have binaryValue, stringValue nor numericValue"); } } } fieldsStream.WriteByte((byte)bits); if (bytes != null) { fieldsStream.WriteVInt32(bytes.Length); fieldsStream.WriteBytes(bytes.Bytes, bytes.Offset, bytes.Length); } else if (@string != null) { fieldsStream.WriteString(field.GetStringValue()); } else { switch (field.NumericType) { case NumericFieldType.BYTE: case NumericFieldType.INT16: case NumericFieldType.INT32: fieldsStream.WriteInt32(field.GetInt32Value().Value); break; case NumericFieldType.INT64: fieldsStream.WriteInt64(field.GetInt64Value().Value); break; case NumericFieldType.SINGLE: fieldsStream.WriteInt32(BitConversion.SingleToInt32Bits(field.GetSingleValue().Value)); break; case NumericFieldType.DOUBLE: fieldsStream.WriteInt64(BitConversion.DoubleToInt64Bits(field.GetDoubleValue().Value)); break; default: throw AssertionError.Create("Cannot get here"); } } }
public override void WriteByte(byte b) { _cacheDirIndexOutput.WriteByte(b); }
public override void AddBinaryField(FieldInfo field, IEnumerable <BytesRef> values) { if (Debugging.AssertsEnabled) { Debugging.Assert(FieldSeen(field.Name)); Debugging.Assert(field.DocValuesType == DocValuesType.BINARY); } var maxLength = 0; foreach (var value in values) { var length = value == null ? 0 : value.Length; maxLength = Math.Max(maxLength, length); } WriteFieldEntry(field, DocValuesType.BINARY); // write maxLength SimpleTextUtil.Write(data, MAXLENGTH); SimpleTextUtil.Write(data, maxLength.ToString(CultureInfo.InvariantCulture), scratch); SimpleTextUtil.WriteNewline(data); var maxBytesLength = maxLength.ToString(CultureInfo.InvariantCulture).Length; var sb = new StringBuilder(); for (var i = 0; i < maxBytesLength; i++) { sb.Append('0'); } // write our pattern for encoding lengths var patternString = sb.ToString(); SimpleTextUtil.Write(data, PATTERN); SimpleTextUtil.Write(data, patternString, scratch); SimpleTextUtil.WriteNewline(data); int numDocsWritten = 0; foreach (BytesRef value in values) { int length = value == null ? 0 : value.Length; SimpleTextUtil.Write(data, LENGTH); SimpleTextUtil.Write(data, length.ToString(patternString, CultureInfo.InvariantCulture), scratch); SimpleTextUtil.WriteNewline(data); // write bytes -- don't use SimpleText.Write // because it escapes: if (value != null) { data.WriteBytes(value.Bytes, value.Offset, value.Length); } // pad to fit for (int i = length; i < maxLength; i++) { data.WriteByte((byte)(sbyte)' '); } SimpleTextUtil.WriteNewline(data); SimpleTextUtil.Write(data, value == null ? "F" : "T", scratch); SimpleTextUtil.WriteNewline(data); numDocsWritten++; } if (Debugging.AssertsEnabled) { Debugging.Assert(numDocs == numDocsWritten); } }
public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context) { string fileName = IndexFileNames.SegmentFileName(segmentName, "", FIELD_INFOS_EXTENSION); IndexOutput output = directory.CreateOutput(fileName, context); bool success = false; try { output.WriteVInt32(FORMAT_PREFLEX_RW); output.WriteVInt32(infos.Count); foreach (FieldInfo fi in infos) { sbyte bits = 0x0; if (fi.HasVectors) { bits |= STORE_TERMVECTOR; } if (fi.OmitsNorms) { bits |= OMIT_NORMS; } if (fi.HasPayloads) { bits |= STORE_PAYLOADS; } if (fi.IsIndexed) { bits |= IS_INDEXED; if (Debugging.AssertsEnabled) { Debugging.Assert(fi.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.HasPayloads); } if (fi.IndexOptions == IndexOptions.DOCS_ONLY) { bits |= OMIT_TERM_FREQ_AND_POSITIONS; } else if (fi.IndexOptions == IndexOptions.DOCS_AND_FREQS) { bits |= OMIT_POSITIONS; } } output.WriteString(fi.Name); /* * we need to write the field number since IW tries * to stabelize the field numbers across segments so the * FI ordinal is not necessarily equivalent to the field number */ output.WriteInt32(fi.Number); output.WriteByte((byte)bits); if (fi.IsIndexed && !fi.OmitsNorms) { // to allow null norm types we need to indicate if norms are written // only in RW case output.WriteByte((byte)(sbyte)(fi.NormType == Index.DocValuesType.NONE ? 0 : 1)); } if (Debugging.AssertsEnabled) { Debugging.Assert(fi.Attributes == null); // not used or supported } } success = true; } finally { if (success) { output.Dispose(); } else { IOUtils.DisposeWhileHandlingException(output); } } }
public override void WriteField(FieldInfo info, IIndexableField field) { fieldsStream.WriteVInt32(info.Number); int bits = 0; BytesRef bytes; string @string; // TODO: maybe a field should serialize itself? // this way we don't bake into indexer all these // specific encodings for different fields? and apps // can customize... object number = (object)field.GetNumericValue(); if (number != null) { if (number is sbyte || number is short || number is int) { bits |= FIELD_IS_NUMERIC_INT; } else if (number is long) { bits |= FIELD_IS_NUMERIC_LONG; } else if (number is float) { bits |= FIELD_IS_NUMERIC_FLOAT; } else if (number is double) { bits |= FIELD_IS_NUMERIC_DOUBLE; } else { throw new System.ArgumentException("cannot store numeric type " + number.GetType()); } @string = null; bytes = null; } else { bytes = field.GetBinaryValue(); if (bytes != null) { bits |= FIELD_IS_BINARY; @string = null; } else { @string = field.GetStringValue(); if (@string == null) { throw new System.ArgumentException("field " + field.Name + " is stored but does not have binaryValue, stringValue nor numericValue"); } } } fieldsStream.WriteByte((byte)(sbyte)bits); if (bytes != null) { fieldsStream.WriteVInt32(bytes.Length); fieldsStream.WriteBytes(bytes.Bytes, bytes.Offset, bytes.Length); } else if (@string != null) { fieldsStream.WriteString(field.GetStringValue()); } else { if (number is sbyte || number is short || number is int) { fieldsStream.WriteInt32((int)number); } else if (number is long) { fieldsStream.WriteInt64((long)number); } else if (number is float) { fieldsStream.WriteInt32(Number.SingleToInt32Bits((float)number)); } else if (number is double) { fieldsStream.WriteInt64(BitConverter.DoubleToInt64Bits((double)number)); } else { throw new InvalidOperationException("Cannot get here"); } } }
/// <summary>Called once per field per document if term vectors /// are enabled, to write the vectors to /// RAMOutputStream, which is then quickly flushed to /// the real term vectors files in the Directory. /// </summary> internal override void Finish() { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start")); int numPostings = termsHashPerField.numPostings; System.Diagnostics.Debug.Assert(numPostings >= 0); if (!doVectors || numPostings == 0) { return; } if (numPostings > maxNumPostings) { maxNumPostings = numPostings; } IndexOutput tvf = perThread.doc.perDocTvf; // This is called once, after inverting all occurences // of a given field in the doc. At this point we flush // our hash into the DocWriter. System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector); System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo)); perThread.doc.AddField(termsHashPerField.fieldInfo.number); RawPostingList[] postings = termsHashPerField.SortPostings(); tvf.WriteVInt(numPostings); byte bits = (byte)(0x0); if (doVectorPositions) { bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; } if (doVectorOffsets) { bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; } tvf.WriteByte(bits); int encoderUpto = 0; int lastTermBytesCount = 0; ByteSliceReader reader = perThread.vectorSliceReader; char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; for (int j = 0; j < numPostings; j++) { TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j]; int freq = posting.freq; char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; // We swap between two encoders to save copying // last Term's byte array UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); int termBytesCount = utf8Result.length; // TODO: UTF16toUTF8 could tell us this prefix // Compute common prefix between last term and // this term int prefix = 0; if (j > 0) { byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result; byte[] termBytes = perThread.utf8Results[encoderUpto].result; while (prefix < lastTermBytesCount && prefix < termBytesCount) { if (lastTermBytes[prefix] != termBytes[prefix]) { break; } prefix++; } } encoderUpto = 1 - encoderUpto; lastTermBytesCount = termBytesCount; int suffix = termBytesCount - prefix; tvf.WriteVInt(prefix); tvf.WriteVInt(suffix); tvf.WriteBytes(utf8Result.result, prefix, suffix); tvf.WriteVInt(freq); if (doVectorPositions) { termsHashPerField.InitReader(reader, posting, 0); reader.WriteTo(tvf); } if (doVectorOffsets) { termsHashPerField.InitReader(reader, posting, 1); reader.WriteTo(tvf); } } termsHashPerField.Reset(); // NOTE: we clear, per-field, at the thread level, // because term vectors fully write themselves on each // field; this saves RAM (eg if large doc has two large // fields w/ term vectors on) because we recycle/reuse // all RAM after each field: perThread.termsHashPerThread.Reset(false); }
public override void WriteByte(byte b) { cacheOutput.WriteByte(b); isWritten = true; }
public override void WriteByte(byte b) { _digest.Update(b); _indexOutput.WriteByte(b); }