public void testDeltaUnknownSign()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .compress(CompressionKind.NONE)
        .inspector(inspector)
        .rowIndexStride(0)
        .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
        .version(OrcFile.Version.V_0_12)))
    {
        w.addRow(0);
        for (int i = 0; i < 511; ++i)
        {
            w.addRow(i);
        }
    }

    using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
    {
        FileDump.Main(TestFilePath);

        // Monotonicity is undetermined for the sequence 0,0,1,2,3,...,510, so DIRECT encoding
        // is used: 2 bytes of header plus 640 bytes of data (512 values bit-packed at a fixed
        // width of 10 bits each, 5120/8 = 640), 642 bytes in total. The sketch below
        // reproduces the arithmetic.
        Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 642"));
    }
}
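// A minimal sketch (not part of the test suite; helper name hypothetical) of the arithmetic
// behind the expected DIRECT stream length asserted above. The layout follows the ORC RLEv2
// spec: a DIRECT run is a 2-byte header followed by the values bit-packed back to back.
static int DirectRunLength(int valueCount, int bitWidth)
{
    const int headerBytes = 2;                        // encoding tag, bit width, run length
    int dataBytes = (valueCount * bitWidth + 7) / 8;  // 512 * 10 / 8 = 640 for this test
    return headerBytes + dataBytes;                   // 2 + 640 = 642, matching the assertion
}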
public void testShortRepeat()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .compress(CompressionKind.NONE)
        .inspector(inspector)
        .rowIndexStride(0)
        .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
        .version(OrcFile.Version.V_0_12)))
    {
        for (int i = 0; i < 5; ++i)
        {
            w.addRow(10);
        }
    }

    using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
    {
        FileDump.Main(TestFilePath);

        // 1 byte header + 1 byte value (see the sketch below)
        Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 2"));
    }
}
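// A rough sketch of why five copies of the value 10 cost exactly two bytes, assuming the
// ORC RLEv2 SHORT_REPEAT layout (helper name hypothetical): a 1-byte header encodes the
// value width and repeat count, followed by the value in its minimum byte width.
static byte ShortRepeatHeader(int valueWidthBytes, int repeatCount)
{
    // bits 7..6: encoding (00 = SHORT_REPEAT); bits 5..3: width - 1; bits 2..0: count - 3
    return (byte)(((valueWidthBytes - 1) << 3) | (repeatCount - 3));
}
// ShortRepeatHeader(1, 5) == 0x02; one header byte plus one value byte (10) == 2 bytes.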
public void testFixedDeltaOneDescending()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .compress(CompressionKind.NONE)
        .inspector(inspector)
        .rowIndexStride(0)
        .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
        .version(OrcFile.Version.V_0_12)))
    {
        for (int i = 0; i < 5120; ++i)
        {
            w.addRow(512 - (i % 512));
        }
    }

    using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
    {
        FileDump.Main(TestFilePath);

        // 10 runs of 512 elements. Each run has a 2-byte header, a 2-byte base
        // (base = 512, zigzag + varint) and a 1-byte delta (delta = 1): 5 bytes per run
        // (see the sketch below).
        Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 50"));
    }
}
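// A worked sketch of the 5-bytes-per-run figure above, assuming the ORC RLEv2 DELTA layout
// (helper name hypothetical): a 2-byte header, the base value as a zigzag varint, then the
// fixed delta as a signed varint. Base 512 zigzags to 1024, needing 2 varint bytes; delta -1
// zigzags to 1, needing 1 byte.
static int FixedDeltaRunLength(int baseVarintBytes, int deltaVarintBytes)
{
    const int headerBytes = 2;  // encoding tag, delta bit width, run length
    return headerBytes + baseVarintBytes + deltaVarintBytes;  // 2 + 2 + 1 = 5 per run
}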
public void testDump()
{
    // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
    using (Stream file = File.OpenWrite(TestFilePath))
    {
        OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
        options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
        options.stripeSize(100000);
        options.compress(CompressionKind.ZLIB);
        options.bufferSize(10000);
        options.rowIndexStride(1000);
        using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
        {
            Random r1 = new Random(1);
            for (int i = 0; i < 21000; ++i)
            {
                writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                    TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
            }
        }
    }

    string outputFilename = "orc-file-dump.out";
    using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
    {
        FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
    }

    TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
}
public void testPatchedBase()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .compress(CompressionKind.NONE)
        .inspector(inspector)
        .rowIndexStride(0)
        .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
        .version(OrcFile.Version.V_0_12)))
    {
        Random rand = new Random(123);
        w.addRow(10000000);
        for (int i = 0; i < 511; ++i)
        {
            w.addRow(rand.Next(i + 1));
        }
    }

    using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
    {
        FileDump.Main(TestFilePath);

        // uses PATCHED_BASE encoding (see the note below)
        Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 583"));
    }
}
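// Why PATCHED_BASE is chosen here: the single large outlier (10000000) among 511 small random
// values would force a wide bit width on every value under DIRECT encoding. PATCHED_BASE
// instead packs the bulk of the values at a width taken from a percentile of the data and
// stores the outliers as separate patches. For reference, a sketch of the zigzag transform
// RLEv2 applies to signed values in DIRECT and DELTA runs (helper name hypothetical):
static ulong ZigZag(long value)
{
    return (ulong)((value << 1) ^ (value >> 63));  // maps -1, 0, 1, -2, 2 to 1, 0, 2, 3, 4
}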
public void testOrcSerDeStatsMap()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MapStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        for (int row = 0; row < 1000; row++)
        {
            Dictionary<string, double> test = new Dictionary<string, double>();
            for (int i = 0; i < 10; i++)
            {
                test.Add("hi" + i, 2.0);
            }
            writer.addRow(new MapStruct(test));
        }
        writer.close();

        // stats from writer
        Assert.Equal(1000, writer.getNumberOfRows());
        Assert.Equal(950000, writer.getRawDataSize());
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // stats from reader
    Assert.Equal(1000, reader.getNumberOfRows());
    Assert.Equal(950000, reader.getRawDataSize());
    Assert.Equal(950000, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
}
public void testHalfDistinctCheckDisabled()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));
    int[] input = new int[20000];
    // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        Random rand = new Random(123);
        for (int i = 0; i < 20000; i++)
        {
            input[i] = rand.Next(10000);
        }
        for (int i = 0; i < 20000; i++)
        {
            writer.addRow(input[i].ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++].ToString(), row);
        }

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky, but it does the job: this cast works as long as this test resides
            // in the same package as the ORC reader
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
            }
        }
    }
}
public void testTimestampWriter(string writerTimeZone, string readerTimeZone)
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(Timestamp));
    List<string> ts = new List<string>();
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    using (TestHelpers.SetTimeZoneInfo(writerTimeZone))
    {
        ts.Add("2003-01-01 01:00:00.000000222");
        ts.Add("1996-08-02 09:00:00.723100809");
        ts.Add("1999-01-01 02:00:00.999999999");
        ts.Add("1995-01-02 03:00:00.688888888");
        ts.Add("2002-01-01 04:00:00.1");
        ts.Add("2010-03-02 05:00:00.000009001");
        ts.Add("2005-01-01 06:00:00.000002229");
        ts.Add("2006-01-01 07:00:00.900203003");
        ts.Add("2003-01-01 08:00:00.800000007");
        ts.Add("1998-11-02 10:00:00.857340643");
        ts.Add("2008-10-02 11:00:00.0");
        ts.Add("2037-01-01 00:00:00.000999");
        ts.Add("2014-03-28 00:00:00.0");
        foreach (string t in ts)
        {
            writer.addRow(Timestamp.Parse(t));
        }
    }

    using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
    {
        Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
        using (RecordReader rows = reader.rows(null))
        {
            int idx = 0;
            while (rows.hasNext())
            {
                object row = rows.next();
                Timestamp got = ((Timestamp)row);
                Assert.Equal(ts[idx++], got.ToString());
            }
        }
    }
}
public void SimpleTest()
{
    OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), new Configuration());
    options.inspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    using (Stream file = File.Create(filename))
    using (Writer writer = OrcFile.createWriter(filename, file, options))
    {
        writer.addRow("hello");
    }

    Reader reader = OrcFile.createReader(() => File.OpenRead(filename), filename);
    using (RecordReader recordReader = reader.rows())
    {
        object value = recordReader.next();
        Assert.True(value is string);
        Assert.Equal("hello", value);
    }
}
public void testTooManyDistinctV11AlwaysDictionary()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .version(OrcFile.Version.V_0_11)
        .bufferSize(10000)))
    {
        for (int i = 0; i < 20000; i++)
        {
            writer.addRow(i.ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal((idx++).ToString(), row);
        }

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky, but it does the job: this cast works as long as this test resides
            // in the same package as the ORC reader
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
            }
        }
    }
}
public void testDictionaryThreshold()
{
    // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
    // conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);
    using (Stream file = File.OpenWrite(TestFilePath))
    {
        OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
        options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
        options.stripeSize(100000);
        options.compress(CompressionKind.ZLIB);
        options.bufferSize(10000);
        options.rowIndexStride(1000);
        using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
        {
            Random r1 = new Random(1);
            int nextInt = 0;
            for (int i = 0; i < 21000; ++i)
            {
                // Write out each string twice; this guarantees the fraction of rows with
                // distinct strings is 0.5 (see the threshold sketch after this method).
                if (i % 2 == 0)
                {
                    nextInt = r1.Next(TestHelpers.words.Length);
                    // Append the value of i to the word; this guarantees that when an index
                    // or word is repeated, the actual string is still unique.
                    TestHelpers.words[nextInt] += "-" + i;
                }
                writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), TestHelpers.words[nextInt]));
            }
        }
    }

    string outputFilename = "orc-file-dump-dictionary-threshold.out";
    using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
    {
        FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
    }

    TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
}
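// A simplified sketch of the check this test exercises (assumed writer behavior; helper name
// hypothetical): the string writer compares the number of distinct dictionary keys against
// the number of non-null rows and falls back to direct encoding once the ratio exceeds the
// configured threshold. With every string written exactly twice, the ratio is 0.5, just
// above the 0.49 threshold set above.
static bool UseDictionaryEncoding(int distinctKeys, int nonNullRows, double threshold)
{
    return nonNullRows == 0 || (double)distinctKeys / nonNullRows <= threshold;
}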
public void testJsonDump()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));
    // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
    OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .compress(CompressionKind.ZLIB)
        .bufferSize(10000)
        .rowIndexStride(1000)
        .bloomFilterColumns("s");
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
    {
        Random r1 = new Random(1);
        for (int i = 0; i < 21000; ++i)
        {
            if (i % 100 == 0)
            {
                writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), null));
            }
            else
            {
                writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                    TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
            }
        }
    }

    const string outputFilename = "orc-file-dump.json";
    using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
    {
        FileDump.Main(new string[] { TestFilePath.ToString(), "-j", "-p", "--rowindex=3" });
    }

    TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
}
public void testOrcSerDeStatsSimpleWithNulls()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        for (int row = 0; row < 1000; row++)
        {
            if (row % 2 == 0)
            {
                writer.addRow(new SimpleStruct(new byte[] { 1, 2, 3 }, "hi"));
            }
            else
            {
                writer.addRow(null);
            }
        }
        writer.close();

        // stats from writer
        Assert.Equal(1000, writer.getNumberOfRows());
        Assert.Equal(44500, writer.getRawDataSize());
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // stats from reader
    Assert.Equal(1000, reader.getNumberOfRows());
    Assert.Equal(44500, reader.getRawDataSize());
    Assert.Equal(1500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(43000, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(44500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
}
public void testBitPack64Large()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));
    const int size = 1080832;
    long[] inp = new long[size];
    Random rand = new Random(1234);
    for (int i = 0; i < size; i++)
    {
        inp[i] = rand.NextLong();
    }
    List<long> input = inp.ToList();

    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.ZLIB)))
    {
        foreach (long l in input)
        {
            writer.addRow(l);
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++], ((long)row));
        }
    }
}
public void testBitPacking(long val)
{
    long[] input = new long[]
    {
        val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
        val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
        0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val,
        0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0,
        val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
        val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0,
        0, val, val
    };
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        foreach (long l in input)
        {
            writer.addRow(l);
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++], ((long)row));
        }
    }
}
public void createFile()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .stripeSize(100000)
        .compress(CompressionKind.ZLIB)
        .inspector(inspector)
        .bufferSize(10000)
        .rowIndexStride(10000)))
    {
        Random r1 = new Random(1);
        string[] words = TestHelpers.words;
        string[] dates = new string[] { "1991-02-28", "1970-01-31", "1950-04-23" };
        string[] decimalStrings = new string[]
        {
            "234.443", "10001000", "0.3333367", "67788798.0",
            "-234.443", "-10001000", "-0.3333367", "-67788798.0", "0"
        };
        for (int i = 0; i < 21000; ++i)
        {
            if ((i % 7) != 0)
            {
                writer.addRow(new MyRecord(((i % 3) == 0), (sbyte)(i % 5), i, (long)200,
                    (short)(300 + i), (double)(400 + i), words[r1.Next(words.Length)],
                    new Timestamp(DateTime.Now), Date.Parse(dates[i % 3]),
                    HiveDecimal.Parse(decimalStrings[i % decimalStrings.Length])));
            }
            else
            {
                writer.addRow(new MyRecord(null, null, i, (long)200, null, null, null, null, null, null));
            }
        }
    }

    checkVectorizedReader();
}
public void testColumnsWithNullAndCompression()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    {
        writer.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
        writer.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(8, reader.getNumberOfRows());
    Assert.Equal(8, stats[0].getNumberOfValues());

    Assert.Equal(3, ((IntegerColumnStatistics)stats[1]).getMaximum());
    Assert.Equal(2, ((IntegerColumnStatistics)stats[1]).getMinimum());
    Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
    Assert.Equal(17, ((IntegerColumnStatistics)stats[1]).getSum());
    Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17", stats[1].ToString());

    Assert.Equal("h", ((StringColumnStatistics)stats[2]).getMaximum());
    Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
    Assert.Equal(7, stats[2].getNumberOfValues());
    Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7", stats[2].ToString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
        readerInspector.getTypeName());

    using (RecordReader rows = reader.rows())
    {
        // only the last stripe will have a PRESENT stream
        List<bool> expected = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            expected.Add(false);
        }
        expected[expected.Count - 1] = true;

        // check whether each stripe footer contains a PRESENT stream
        List<bool> got = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
            got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
        }
        Assert.Equal(expected, got);

        // row 1
        OrcStruct row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Equal(3, row.getFieldValue(0));
        Assert.Equal("a", row.getFieldValue(1).ToString());
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // row 2
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Equal("b", row.getFieldValue(1).ToString());
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // row 3
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(3, row.getFieldValue(0));
        Assert.Equal(false, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));
    }
}
public void testSplitEliminationComplexExpr()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000))
    {
        writeData(writer);
    }

    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "150000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    // predicate expression: userid <= 100 and subtype <= 1000.0
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr1 = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(typeof(double), "subtype", "T", false);
    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
    childExpr1.Add(col1);
    childExpr1.Add(con1);
    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

    GenericUDF udf2 = new GenericUDFOPAnd();
    List<ExprNodeDesc> childExpr2 = new List<ExprNodeDesc>();
    childExpr2.Add(en);
    childExpr2.Add(en1);
    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

    string sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(0.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // no stripe will satisfy the predicate
    Assert.Equal(0, splits.Length);

    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(1.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only the first stripe will satisfy the predicate, hence a single split
    Assert.Equal(1, splits.Length);

    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // the first two stripes will satisfy the predicate, hence two splits
    Assert.Equal(2, splits.Length);

    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    udf1 = new GenericUDFOPEqual();
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only the second stripe will satisfy the predicate, hence a single split
    Assert.Equal(1, splits.Length);
}
public void testSplitEliminationSmallMaxSplit()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000))
    {
        writeData(writer);
    }

    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "5000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    string sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);

    con = new ExprNodeConstantDesc(1);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(0, splits.Length);

    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(1, splits.Length);

    con = new ExprNodeConstantDesc(5);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(3, splits.Length);

    con = new ExprNodeConstantDesc(29);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(4, splits.Length);

    con = new ExprNodeConstantDesc(70);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);
}
public void testHasNull()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .rowIndexStride(1000)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        // STRIPE 1
        // RG1
        for (int i = 0; i < 1000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), "RG1"));
        }
        // RG2
        for (int i = 0; i < 1000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // RG3
        for (int i = 0; i < 1000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), "RG3"));
        }
        // RG4
        for (int i = 0; i < 1000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // RG5
        for (int i = 0; i < 1000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // STRIPE 2
        for (int i = 0; i < 5000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // STRIPE 3
        for (int i = 0; i < 5000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), "STRIPE-3"));
        }
        // STRIPE 4
        for (int i = 0; i < 5000; i++)
        {
            writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the file level stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(20000, stats[0].getNumberOfValues());
    Assert.Equal(20000, stats[1].getNumberOfValues());
    Assert.Equal(7000, stats[2].getNumberOfValues());
    Assert.Equal(false, stats[0].hasNull());
    Assert.Equal(false, stats[1].hasNull());
    Assert.Equal(true, stats[2].hasNull());

    // check the stripe level stats
    List<StripeStatistics> stripeStats = reader.getStripeStatistics();

    // stripe 1 stats
    StripeStatistics ss1 = stripeStats[0];
    ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
    ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
    ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];
    Assert.Equal(false, ss1_cs1.hasNull());
    Assert.Equal(false, ss1_cs2.hasNull());
    Assert.Equal(true, ss1_cs3.hasNull());

    // stripe 2 stats
    StripeStatistics ss2 = stripeStats[1];
    ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
    ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
    ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];
    Assert.Equal(false, ss2_cs1.hasNull());
    Assert.Equal(false, ss2_cs2.hasNull());
    Assert.Equal(true, ss2_cs3.hasNull());

    // stripe 3 stats
    StripeStatistics ss3 = stripeStats[2];
    ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
    ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
    ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];
    Assert.Equal(false, ss3_cs1.hasNull());
    Assert.Equal(false, ss3_cs2.hasNull());
    Assert.Equal(false, ss3_cs3.hasNull());

    // stripe 4 stats
    StripeStatistics ss4 = stripeStats[3];
    ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
    ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
    ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];
    Assert.Equal(false, ss4_cs1.hasNull());
    Assert.Equal(false, ss4_cs2.hasNull());
    Assert.Equal(true, ss4_cs3.hasNull());

#if false
    // Test file dump
    TextWriter origOut = System.Console.Out;
    string outputFilename = "orc-file-has-null.out";
    FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);

    // replace stdout and run command
    System.Console.SetOut(new StreamWriter(myOut));
    FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
    System.Console.Out.Flush();
    System.SetOut(origOut);
    TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
}
public void testDataDump()
{
    using (Stream file = File.OpenWrite(TestFilePath))
    {
        OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
        options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRecord)));
        options.stripeSize(100000);
        options.compress(CompressionKind.NONE);
        options.bufferSize(10000);
        options.rowIndexStride(1000);
        using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
        {
            Dictionary<string, string> m = new Dictionary<string, string>(2);
            m.Add("k1", "v1");
            writer.addRow(new AllTypesRecord(
                true, (sbyte)10, (short)100, 1000, 10000L, 4.0f, 20.0,
                HiveDecimal.Parse("4.2222"), new Timestamp(1416967764000L),
                new Date(1416967764000L), "string", m,
                new List<int> { 100, 200 },
                new AllTypesRecord.Struct(10, "foo")));
            m.Clear();
            m.Add("k3", "v3");
            writer.addRow(new AllTypesRecord(
                false, (sbyte)20, (short)200, 2000, 20000L, 8.0f, 40.0,
                HiveDecimal.Parse("2.2222"), new Timestamp(1416967364000L),
                new Date(1411967764000L), "abcd", m,
                new List<int> { 200, 300 },
                new AllTypesRecord.Struct(20, "bar")));
        }
    }

    string[] lines;
    using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
    {
        FileDump.Main(TestFilePath, "-d");
        lines = capture.Text.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
    }
    Assert.Equal(2, lines.Length);

    // Don't be fooled by the big space in the middle: each of these lines is quite long.
    Assert.Equal("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
    Assert.Equal("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
}
public void testStringAndBinaryStatistics()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    {
        writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
        writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
        writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
        writer.addRow(new SimpleStruct(null, "hi"));
        writer.close();

        Assert.Equal(4, writer.getNumberOfRows());
        Assert.Equal(273, writer.getRawDataSize());
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(4, reader.getNumberOfRows());
    Assert.Equal(273, reader.getRawDataSize());
    Assert.Equal(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(4, stats[0].getNumberOfValues());
    Assert.Equal("count: 4 hasNull: False", stats[0].ToString());

    Assert.Equal(3, stats[1].getNumberOfValues());
    Assert.Equal(15, ((BinaryColumnStatistics)stats[1]).getSum());
    Assert.Equal("count: 3 hasNull: True sum: 15", stats[1].ToString());

    Assert.Equal(3, stats[2].getNumberOfValues());
    Assert.Equal("bar", ((StringColumnStatistics)stats[2]).getMinimum());
    Assert.Equal("hi", ((StringColumnStatistics)stats[2]).getMaximum());
    Assert.Equal(8, ((StringColumnStatistics)stats[2]).getSum());
    Assert.Equal("count: 3 hasNull: True min: bar max: hi sum: 8", stats[2].ToString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
    Assert.Equal("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
    IList<StructField> fields = readerInspector.getAllStructFieldRefs();
    BinaryObjectInspector bi = (BinaryObjectInspector)readerInspector
        .getStructFieldRef("bytes1").getFieldObjectInspector();
    StringObjectInspector st = (StringObjectInspector)readerInspector
        .getStructFieldRef("string1").getFieldObjectInspector();

    using (RecordReader rows = reader.rows())
    {
        object row = rows.next();
        Assert.NotNull(row);

        // check the contents of the first row
        Assert.Equal(bytes(0, 1, 2, 3, 4),
            bi.get(readerInspector.getStructFieldData(row, fields[0])));
        Assert.Equal("foo",
            st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields[1])));

        // check the contents of the second row
        Assert.Equal(true, rows.hasNext());
        row = rows.next();
        Assert.Equal(bytes(0, 1, 2, 3),
            bi.get(readerInspector.getStructFieldData(row, fields[0])));
        Assert.Equal("bar",
            st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields[1])));

        // check the contents of the third row
        Assert.Equal(true, rows.hasNext());
        row = rows.next();
        Assert.Equal(bytes(0, 1, 2, 3, 4, 5),
            bi.get(readerInspector.getStructFieldData(row, fields[0])));
        Assert.Null(st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields[1])));

        // check the contents of the fourth row
        Assert.Equal(true, rows.hasNext());
        row = rows.next();
        Assert.Null(bi.get(readerInspector.getStructFieldData(row, fields[0])));
        Assert.Equal("hi",
            st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields[1])));

        Assert.Equal(false, rows.hasNext());
    }
}
public void testMultiStripeWithNull()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        Random rand = new Random(100);
        writer.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));
        for (int i = 2; i < 20000; i++)
        {
            writer.addRow(new MyStruct(rand.Next(1), "a", true, new List<InnerStruct> { new InnerStruct(100) }));
        }
        writer.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(20000, reader.getNumberOfRows());
    Assert.Equal(20000, stats[0].getNumberOfValues());

    Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMaximum());
    Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMinimum());
    Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
    Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getSum());
    Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0", stats[1].ToString());

    Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMaximum());
    Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
    Assert.Equal(19998, stats[2].getNumberOfValues());
    Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998", stats[2].ToString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
        readerInspector.getTypeName());

    using (RecordReader rows = reader.rows())
    {
        // only the first and last stripes will have a PRESENT stream
        List<bool> expected = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            expected.Add(false);
        }
        expected[0] = true;
        expected[expected.Count - 1] = true;

        // check whether each stripe footer contains a PRESENT stream
        List<bool> got = new List<bool>();
        foreach (StripeInformation sinfo in reader.getStripes())
        {
            OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
            got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
        }
        Assert.Equal(expected, got);

        // first row
        OrcStruct row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // second-to-last row
        rows.seekToRow(19998);
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.NotNull(row.getFieldValue(1));
        Assert.Equal(0, row.getFieldValue(0));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));

        // last row
        row = (OrcStruct)rows.next();
        Assert.NotNull(row);
        Assert.Null(row.getFieldValue(0));
        Assert.Null(row.getFieldValue(1));
        Assert.Equal(true, row.getFieldValue(2));
        Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).getFieldValue(0));
    }
}
public void testOrcSerDeStatsComplexOldFormat()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(BigRow));
    long rawDataSize;
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .version(OrcFile.Version.V_0_11)
        .bufferSize(10000)))
    {
        // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
        writer.addRow(new BigRow(false, (sbyte)1, (short)1024, 65536,
            Int64.MaxValue, (float)1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
            new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
            list(inner(3, "good"), inner(4, "bad")),
            map(), Timestamp.Parse("2000-03-12 15:00:00"),
            HiveDecimal.Parse("12345678.6547456")));
        // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 = 97
        writer.addRow(new BigRow(true, (sbyte)100, (short)2048, 65536,
            Int64.MaxValue, (float)2.0, -5.0, bytes(), "bye",
            new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
            list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
            map(inner(5, "chani"), inner(1, "mauddib")),
            Timestamp.Parse("2000-03-11 15:00:00"),
            HiveDecimal.Parse("12345678.6547452")));
        writer.close();

        long rowCount = writer.getNumberOfRows();
        rawDataSize = writer.getRawDataSize();
        Assert.Equal(2, rowCount);
        Assert.Equal(1740, rawDataSize);
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(2, reader.getNumberOfRows());
    Assert.Equal(1740, reader.getRawDataSize());
    Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
    Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
    Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
    Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
    Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
    Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
    Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
    Assert.Equal(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
    Assert.Equal(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
    Assert.Equal(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
    Assert.Equal(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
    Assert.Equal(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));
    Assert.Equal(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
    Assert.Equal(1195,
        reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
    Assert.Equal(185,
        reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
    Assert.Equal(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList(
        "boolean1", "byte1", "short1", "int1", "long1", "float1", "double1",
        "bytes1", "string1", "list", "map", "middle", "ts", "decimal1")));

    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(2, stats[1].getNumberOfValues());
    Assert.Equal(1, ((BooleanColumnStatistics)stats[1]).getFalseCount());
    Assert.Equal(1, ((BooleanColumnStatistics)stats[1]).getTrueCount());
    Assert.Equal("count: 2 hasNull: False true: 1", stats[1].ToString());

    Assert.Equal(2048, ((IntegerColumnStatistics)stats[3]).getMaximum());
    Assert.Equal(1024, ((IntegerColumnStatistics)stats[3]).getMinimum());
    Assert.Equal(true, ((IntegerColumnStatistics)stats[3]).isSumDefined());
    Assert.Equal(3072, ((IntegerColumnStatistics)stats[3]).getSum());
    Assert.Equal("count: 2 hasNull: False min: 1024 max: 2048 sum: 3072", stats[3].ToString());

    Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMaximum());
    Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMinimum());
    Assert.Equal(false, ((IntegerColumnStatistics)stats[5]).isSumDefined());
    Assert.Equal("count: 2 hasNull: False min: 9223372036854775807 max: 9223372036854775807",
        stats[5].ToString());

    Assert.Equal(-15.0, ((DoubleColumnStatistics)stats[7]).getMinimum());
    Assert.Equal(-5.0, ((DoubleColumnStatistics)stats[7]).getMaximum());
    Assert.Equal(-20.0, ((DoubleColumnStatistics)stats[7]).getSum(), 5);
    Assert.Equal("count: 2 hasNull: False min: -15 max: -5 sum: -20", stats[7].ToString());

    Assert.Equal(5, ((BinaryColumnStatistics)stats[8]).getSum());
    Assert.Equal("count: 2 hasNull: False sum: 5", stats[8].ToString());

    Assert.Equal("bye", ((StringColumnStatistics)stats[9]).getMinimum());
    Assert.Equal("hi", ((StringColumnStatistics)stats[9]).getMaximum());
    Assert.Equal(5, ((StringColumnStatistics)stats[9]).getSum());
    Assert.Equal("count: 2 hasNull: False min: bye max: hi sum: 5", stats[9].ToString());
}