// Writes 20000 single-string rows whose values come from rand.Next(10000), reads them back in
// order, and asserts that every column encoding in every stripe footer is DICTIONARY_V2.
public void testHalfDistinctCheckDisabled()
{
    const int rowCount = 20000;
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));
    int[] input = new int[rowCount];

    // NOTE(review): the call that would set HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK to false is
    // still commented out, so this test runs with whatever value conf already carries.
    // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);

    // File.Create truncates an existing file; File.OpenWrite would keep stale trailing bytes if a
    // longer file were already present at TestFilePath.
    using (Stream file = File.Create(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        // Fixed seed keeps the generated values reproducible between runs.
        Random rand = new Random(123);
        for (int i = 0; i < rowCount; i++)
        {
            input[i] = rand.Next(10000);
            writer.addRow(input[i].ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx].ToString(), row);
            idx++;
        }
        // Without this check the loop above would pass vacuously if rows were missing.
        Assert.Equal(rowCount, idx);

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky but does the job: the cast works as long as rows is a RecordReaderImpl
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
            }
        }
    }
}
// Writes 20000 all-distinct string rows with the V_0_11 file version, reads them back in order,
// and asserts that every column encoding in every stripe footer is DICTIONARY.
public void testTooManyDistinctV11AlwaysDictionary()
{
    const int rowCount = 20000;
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

    // File.Create truncates an existing file; File.OpenWrite would keep stale trailing bytes if a
    // longer file were already present at TestFilePath.
    using (Stream file = File.Create(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .version(OrcFile.Version.V_0_11)
        .bufferSize(10000)))
    {
        for (int i = 0; i < rowCount; i++)
        {
            writer.addRow(i.ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(idx.ToString(), row);
            idx++;
        }
        // Without this check the loop above would pass vacuously if rows were missing.
        Assert.Equal(rowCount, idx);

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky but does the job: the cast works as long as rows is a RecordReaderImpl
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
            }
        }
    }
}
// Dumps the metadata of one or more ORC files to standard output as JSON.
//
// files         - paths of the ORC files to describe; more than one file produces a JSON array
//                 of objects, otherwise a single JSON object is written.
// conf          - configuration handed to OrcFile.readerOptions.
// rowIndexCols  - column ids whose row-group and bloom-filter indexes are dumped per stripe;
//                 null or empty skips the "indexes" section.
// prettyPrint   - currently has no effect: no JSON pretty-printer is wired into this port, so
//                 the compact form is printed either way.
// printTimeZone - when true, each stripe also reports "writerTimezone".
public static void printJsonMetaData(List<string> files, Configuration conf,
    List<int> rowIndexCols, bool prettyPrint, bool printTimeZone)
{
    JsonWriter writer = new JsonWriter();
    bool multiFile = files.Count > 1;
    if (multiFile)
    {
        writer.array();
    }
    else
    {
        writer.newObject();
    }

    foreach (string filename in files)
    {
        if (multiFile)
        {
            writer.newObject();
        }
        writer.key("fileName").value(Path.GetFileName(filename));
        Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
        writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
        writer.key("writerVersion").value(reader.getWriterVersion().ToString());

        // The using statement disposes the record reader on every exit path, so no explicit
        // close() call is needed at the end of the block.
        using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
        {
            writer.key("numberOfRows").value(reader.getNumberOfRows());
            writer.key("compression").value(reader.getCompression().ToString());
            if (reader.getCompression() != CompressionKind.NONE)
            {
                writer.key("compressionBufferSize").value(reader.getCompressionSize());
            }
            writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
            writer.key("schema").array();
            writeSchema(writer, reader.getTypes());
            writer.endArray();

            // Per-stripe column statistics.
            writer.key("stripeStatistics").array();
            List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
            for (int n = 0; n < stripeStatistics.Count; n++)
            {
                writer.newObject();
                writer.key("stripeNumber").value(n + 1);
                StripeStatistics ss = stripeStatistics[n];
                writer.key("columnStatistics").array();
                for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                {
                    writer.newObject();
                    writer.key("columnId").value(i);
                    writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                    writer.endObject();
                }
                writer.endArray();
                writer.endObject();
            }
            writer.endArray();

            // File-level column statistics.
            ColumnStatistics[] stats = reader.getStatistics();
            int colCount = stats.Length;
            writer.key("fileStatistics").array();
            for (int i = 0; i < stats.Length; ++i)
            {
                writer.newObject();
                writer.key("columnId").value(i);
                writeColumnStatistics(writer, stats[i]);
                writer.endObject();
            }
            writer.endArray();

            // Stripe layout: streams, encodings and (optionally) indexes.
            writer.key("stripes").array();
            int stripeIx = 0;
            foreach (StripeInformation stripe in reader.getStripes())
            {
                long stripeStart = stripe.getOffset();
                OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                writer.newObject(); // start of stripe information
                writer.key("stripeNumber").value(stripeIx + 1);
                writer.key("stripeInformation");
                writeStripeInformation(writer, stripe);
                if (printTimeZone)
                {
                    writer.key("writerTimezone").value(
                        footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                }

                // Streams are reported back to back, starting at the stripe offset.
                long sectionStart = stripeStart;
                writer.key("streams").array();
                foreach (OrcProto.Stream section in footer.StreamsList)
                {
                    writer.newObject();
                    string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                    writer.key("columnId").value(section.Column);
                    writer.key("section").value(kind);
                    writer.key("startOffset").value(sectionStart);
                    writer.key("length").value(section.Length);
                    sectionStart += (long)section.Length;
                    writer.endObject();
                }
                writer.endArray();

                writer.key("encodings").array();
                for (int i = 0; i < footer.ColumnsCount; ++i)
                {
                    writer.newObject();
                    OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                    writer.key("columnId").value(i);
                    writer.key("kind").value(encoding.Kind.ToString());
                    if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                        encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                    {
                        writer.key("dictionarySize").value(encoding.DictionarySize);
                    }
                    writer.endObject();
                }
                writer.endArray();

                if (rowIndexCols != null && rowIndexCols.Count != 0)
                {
                    // include the columns that are specified, only if the columns are included,
                    // bloom filter will be read
                    bool[] sargColumns = new bool[colCount];
                    foreach (int colIdx in rowIndexCols)
                    {
                        sargColumns[colIdx] = true;
                    }
                    RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                    writer.key("indexes").array();
                    foreach (int col in rowIndexCols)
                    {
                        writer.newObject();
                        writer.key("columnId").value(col);
                        writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                        writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                        writer.endObject();
                    }
                    writer.endArray();
                }
                writer.endObject(); // end of stripe information
                stripeIx++;
            }
            writer.endArray();

            long fileLen = new FileInfo(filename).Length;
            long paddedBytes = FileDump.getTotalPaddingSize(reader);
            // empty ORC file is ~45 bytes. Assumption here is file length always >0
            double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
            writer.key("fileLength").value(fileLen);
            writer.key("paddingLength").value(paddedBytes);
            writer.key("paddingRatio").value(percentPadding);
        }
        writer.endObject();
    }

    if (multiFile)
    {
        writer.endArray();
    }
    else if (files.Count == 0)
    {
        // No file closed the object opened before the loop; close it here so the output is
        // balanced JSON instead of a dangling opener.
        writer.endObject();
    }

    // TODO: honour prettyPrint once a JSON pretty-printer is available; until then both modes
    // emit the compact form.
    System.Console.WriteLine(writer.ToString());
}
// Reads the row indexes (and, for sarg columns, the bloom filter indexes) of one stripe.
//
// stripe             - stripe whose index streams are read; its offset is the start of the
//                      first stream.
// footer             - stripe footer listing the streams; read via readStripeFooter when null.
// included           - per-column flags selecting which row indexes to read; null reads all.
// indexes            - output array of row indexes; allocated when null. Slots that are already
//                      non-null are left untouched.
// sargColumns        - per-column flags; for a flagged column a BLOOM_FILTER stream that directly
//                      follows the ROW_INDEX stream is read in the same I/O. May be null.
// bloomFilterIndices - output array of bloom filter indexes; allocated when null.
//
// Returns a RecordReaderImpl.Index wrapping the two (possibly caller-supplied) arrays.
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe, OrcProto.StripeFooter footer,
    bool[] included, OrcProto.RowIndex[] indexes, bool[] sargColumns,
    OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }

    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        // null when the current stream is the last one in the footer
        OrcProto.Stream nextStream = (i < streams.Count - 1) ? streams[i + 1] : null;
        int col = (int)stream.Column;
        int len = (int)stream.Length;

        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
        // filter and combine the io to read row index and bloom filters for that column together
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            // Guard against a ROW_INDEX stream that is the last stream of the stripe: there is
            // no following stream to inspect, so there is no bloom filter to combine.
            if (sargColumns != null && sargColumns[col] &&
                nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
            {
                len += (int)nextStream.Length;
                i += 1; // the bloom filter stream is consumed together with this one
                readBloomFilter = true;
            }

            if ((included == null || included[col]) && indexes[col] == null)
            {
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // the bloom filter bytes start right after the row index bytes
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        // len covers both streams when the bloom filter was combined above
        offset += len;
    }

    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}
// Writes 20000 MyStruct rows where only the first and the last row carry nulls in columns a and
// b, then checks the file statistics, the reader's schema, which stripe footers mention a
// PRESENT stream, and the contents of the first, second-to-last and last rows.
public void testMultiStripeWithNull()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

    // File.Create truncates an existing file; File.OpenWrite would keep stale trailing bytes if a
    // longer file were already present at TestFilePath.
    using (Stream file = File.Create(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        Random rand = new Random(100);
        writer.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));
        // 19998 non-null rows between the two null rows
        for (int i = 2; i < 20000; i++)
        {
            writer.addRow(new MyStruct(rand.Next(1), "a", true,
                new List<InnerStruct> { new InnerStruct(100) }));
        }
        writer.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    IntegerColumnStatistics intStats = (IntegerColumnStatistics)stats[1];
    StringColumnStatistics stringStats = (StringColumnStatistics)stats[2];
    Assert.Equal(20000, reader.getNumberOfRows());
    Assert.Equal(20000, stats[0].getNumberOfValues());

    Assert.Equal(0, intStats.getMaximum());
    Assert.Equal(0, intStats.getMinimum());
    Assert.Equal(true, intStats.isSumDefined());
    Assert.Equal(0, intStats.getSum());
    Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0", stats[1].ToString());

    Assert.Equal("a", stringStats.getMaximum());
    Assert.Equal("a", stringStats.getMinimum());
    Assert.Equal(19998, stats[2].getNumberOfValues());
    Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998", stats[2].ToString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
        readerInspector.getTypeName());

    using (RecordReader rows = reader.rows())
    {
        // check if each stripe footer contains a PRESENT stream
        string presentKind = OrcProto.Stream.Types.Kind.PRESENT.ToString();
        List<bool> expected = new List<bool>();
        List<bool> got = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            expected.Add(false);
            OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
            got.Add(sf.ToString().Contains(presentKind));
        }
        // only the first and last stripe will have PRESENT stream
        expected[0] = true;
        expected[expected.Count - 1] = true;
        Assert.Equal(expected, got);

        // row 1
        OrcStruct row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        rows.seekToRow(19998);

        // last-1 row
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.NotNull(row.getFieldValue(1));
        Assert.Equal(0, row.getFieldValue(0));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // last row
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));
    }
}
// Writes eight MyStruct rows (one null in column a, one null in column b), then checks the file
// statistics, the reader's schema, which stripe footers mention a PRESENT stream, and the
// contents of the first three rows. No compress(...) option is set on the writer options here,
// so whatever compression the writer options default to is what gets exercised.
public void testColumnsWithNullAndCompression()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

    // File.Create truncates an existing file; File.OpenWrite would keep stale trailing bytes if a
    // longer file were already present at TestFilePath.
    using (Stream file = File.Create(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    {
        writer.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    IntegerColumnStatistics intStats = (IntegerColumnStatistics)stats[1];
    StringColumnStatistics stringStats = (StringColumnStatistics)stats[2];
    Assert.Equal(8, reader.getNumberOfRows());
    Assert.Equal(8, stats[0].getNumberOfValues());

    Assert.Equal(3, intStats.getMaximum());
    Assert.Equal(2, intStats.getMinimum());
    Assert.Equal(true, intStats.isSumDefined());
    Assert.Equal(17, intStats.getSum());
    Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17", stats[1].ToString());

    Assert.Equal("h", stringStats.getMaximum());
    Assert.Equal("a", stringStats.getMinimum());
    Assert.Equal(7, stats[2].getNumberOfValues());
    Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7", stats[2].ToString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
        readerInspector.getTypeName());

    using (RecordReader rows = reader.rows())
    {
        // check if each stripe footer contains a PRESENT stream
        string presentKind = OrcProto.Stream.Types.Kind.PRESENT.ToString();
        List<bool> expected = new List<bool>();
        List<bool> got = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            expected.Add(false);
            OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
            got.Add(sf.ToString().Contains(presentKind));
        }
        // only the last stripe will have PRESENT stream
        expected[expected.Count - 1] = true;
        Assert.Equal(expected, got);

        // row 1
        OrcStruct row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Equal(3, row.getFieldValue(0));
        Assert.Equal("a", row.getFieldValue(1).ToString());
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // row 2
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Equal("b", row.getFieldValue(1).ToString());
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // row 3
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(3, row.getFieldValue(0));
        Assert.Equal(false, row.getFieldValue(2));
        Assert.Equal(100,
            ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));
    }
}