// Opens the bundled ORC 0.11 sample file while the given reader time zone is active
// and checks the "ts" column of the first row and of row 7499, after which the
// reader must report that no rows remain.
public void testReadTimestampFormat_0_11(string readerTimeZone)
{
    string legacyFile = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");

    using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
    {
        Reader orcReader = OrcFile.createReader(legacyFile, OrcFile.readerOptions(conf));
        StructObjectInspector structInspector = (StructObjectInspector)orcReader.getObjectInspector();
        IList<StructField> fieldRefs = structInspector.getAllStructFieldRefs();
        TimestampObjectInspector tsInspector =
            (TimestampObjectInspector)structInspector.getStructFieldRef("ts").getFieldObjectInspector();

        using (RecordReader recordReader = orcReader.rows())
        {
            // First row of the file.
            object current = recordReader.next();
            Assert.NotNull(current);
            Assert.Equal(
                Timestamp.Parse("2000-03-12 15:00:00"),
                tsInspector.getPrimitiveJavaObject(structInspector.getStructFieldData(current, fieldRefs[12])));

            // Jump to row 7499 and check its timestamp.
            Assert.Equal(true, recordReader.hasNext());
            recordReader.seekToRow(7499);
            current = recordReader.next();
            Assert.Equal(
                Timestamp.Parse("2000-03-12 15:00:01"),
                tsInspector.getPrimitiveJavaObject(structInspector.getStructFieldData(current, fieldRefs[12])));

            Assert.Equal(false, recordReader.hasNext());
        }
    }
}
// Writes 1000 rows that each wrap a ten-entry string-to-double map and checks the
// row count and raw data size reported first by the writer and then by a reader.
public void testOrcSerDeStatsMap()
{
    ObjectInspector mapInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MapStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(mapInspector)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        for (int rowIndex = 0; rowIndex < 1000; rowIndex++)
        {
            Dictionary<string, double> entries = new Dictionary<string, double>();
            for (int keyIndex = 0; keyIndex < 10; keyIndex++)
            {
                entries.Add("hi" + keyIndex, 2.0);
            }
            orcWriter.addRow(new MapStruct(entries));
        }

        // The writer is closed explicitly before its statistics are queried.
        orcWriter.close();

        // Statistics as seen by the writer.
        Assert.Equal(1000, orcWriter.getNumberOfRows());
        Assert.Equal(950000, orcWriter.getRawDataSize());
    }

    // Statistics as seen by a reader of the same file.
    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(1000, orcReader.getNumberOfRows());
    Assert.Equal(950000, orcReader.getRawDataSize());
    Assert.Equal(950000, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
}
// Builds a record reader over the stripes of the ORC file referenced by the split.
//   conf  - configuration used to resolve the file system and the reader options
//   split - file split supplying the path, start offset and length
public OrcFileStripeMergeRecordReader(Configuration conf, FileSplit split)
{
    // Byte range covered by this split.
    path = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    this.stripeIdx = 0;

    // Open the file through the file system that owns the path.
    var fileSystem = path.getFileSystem(conf);
    var options = OrcFile.readerOptions(conf).filesystem(fileSystem);
    this.reader = OrcFile.createReader(path, options);

    this.iter = reader.getStripes().GetEnumerator();
    this.stripeStatistics = ((ReaderImpl)reader).getOrcProtoStripeStatistics();
}
// Writes 20000 seeded pseudo-random integers (each below 10000) as strings, reads
// them back in order, and checks that every column of every stripe footer reports
// DICTIONARY_V2 encoding.
public void testHalfDistinctCheckDisabled()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));
    int[] input = new int[20000];

    // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        Random rand = new Random(123);
        for (int i = 0; i < input.Length; i++)
        {
            input[i] = rand.Next(10000);
        }

        foreach (int value in input)
        {
            writer.addRow(value.ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++].ToString(), row);
        }

        // Guard against a vacuous pass: the loop above succeeds even when the
        // reader yields fewer rows than were written.
        Assert.Equal(input.Length, idx);

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky but does the job: the cast gives access to readStripeFooter
            // on the concrete reader implementation
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
            }
        }
    }
}
// Writes a fixed list of timestamps while the writer time zone is active, then
// reads them back while the reader time zone is active and checks that every
// value round-trips to the same string.
//   writerTimeZone - time zone id applied while rows are added
//   readerTimeZone - time zone id applied while rows are read
public void testTimestampWriter(string writerTimeZone, string readerTimeZone)
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(Timestamp));
    List<string> ts = new List<string>
    {
        "2003-01-01 01:00:00.000000222",
        "1996-08-02 09:00:00.723100809",
        "1999-01-01 02:00:00.999999999",
        "1995-01-02 03:00:00.688888888",
        "2002-01-01 04:00:00.1",
        "2010-03-02 05:00:00.000009001",
        "2005-01-01 06:00:00.000002229",
        "2006-01-01 07:00:00.900203003",
        "2003-01-01 08:00:00.800000007",
        "1998-11-02 10:00:00.857340643",
        "2008-10-02 11:00:00.0",
        "2037-01-01 00:00:00.000999",
        "2014-03-28 00:00:00.0",
    };

    // NOTE(review): the writer is created before the writer time zone is applied and,
    // because stacked usings dispose in reverse order, it is disposed after that time
    // zone has been restored. Confirm this ordering is intended.
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    using (TestHelpers.SetTimeZoneInfo(writerTimeZone))
    {
        foreach (string t in ts)
        {
            writer.addRow(Timestamp.Parse(t));
        }
    }

    using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
    {
        Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
        using (RecordReader rows = reader.rows(null))
        {
            int idx = 0;
            while (rows.hasNext())
            {
                object row = rows.next();
                Timestamp got = ((Timestamp)row);
                Assert.Equal(ts[idx++], got.ToString());
            }

            // Guard against a vacuous pass when fewer rows come back than were written.
            Assert.Equal(ts.Count, idx);
        }
    }
}
// Writes the numbers 0..19999 as strings using the 0.11 file version, reads them
// back in order, and checks that every column of every stripe footer reports
// DICTIONARY encoding.
public void testTooManyDistinctV11AlwaysDictionary()
{
    const int rowCount = 20000;
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.NONE)
        .version(OrcFile.Version.V_0_11)
        .bufferSize(10000)))
    {
        for (int i = 0; i < rowCount; i++)
        {
            writer.addRow(i.ToString());
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal((idx++).ToString(), row);
        }

        // Guard against a vacuous pass when fewer rows come back than were written.
        Assert.Equal(rowCount, idx);

        // make sure the encoding type is correct
        foreach (StripeInformation stripe in reader.getStripes())
        {
            // hacky but does the job: the cast gives access to readStripeFooter
            // on the concrete reader implementation
            OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
            for (int i = 0; i < footer.ColumnsCount; ++i)
            {
                OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
            }
        }
    }
}
// Checks whether every file in the list can be opened with the ORC reader.
//   fs    - file system passed to the reader options
//   conf  - configuration used to build the reader options
//   files - candidate files
// Returns false when the list is empty or when opening any file raises an
// IOException; returns true otherwise.
public bool validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files)
{
    if (files.Count <= 0)
    {
        return false;
    }

    foreach (FileStatus file in files)
    {
        try
        {
            // Only the ability to create a reader matters; the reader itself is discarded.
            OrcFile.createReader(file.getPath(), OrcFile.readerOptions(conf).filesystem(fs));
        }
        catch (System.IO.IOException)
        {
            return false;
        }
    }

    return true;
}
getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) {
    // Purpose: build a VectorizedOrcRecordReader for the file referenced by the split.
    // The split is cast to FileSplit and its string form is reported as the status.
    FileSplit fSplit = (FileSplit)inputSplit;
    reporter.setStatus(fSplit.ToString());

    Path path = fSplit.getPath();

    OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
    // When the split is an OrcSplit that reports having a footer, its file meta
    // info is handed to the reader options before the reader is created.
    if (fSplit is OrcSplit) {
        OrcSplit orcSplit = (OrcSplit)fSplit;
        if (orcSplit.hasFooter()) {
            opts.fileMetaInfo(orcSplit.getFileMetaInfo());
        }
    }
    Reader reader = OrcFile.createReader(path, opts);
    return(new VectorizedOrcRecordReader(reader, conf, fSplit));
}
// Writes 1000 rows where every even row is a SimpleStruct and every odd row is
// null, then checks the row count and raw data sizes reported by the writer and
// by a reader (overall and per column).
public void testOrcSerDeStatsSimpleWithNulls()
{
    ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(structInspector)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        for (int rowIndex = 0; rowIndex < 1000; rowIndex++)
        {
            if (rowIndex % 2 != 0)
            {
                // Odd rows are written as null rows.
                orcWriter.addRow(null);
            }
            else
            {
                orcWriter.addRow(new SimpleStruct(new byte[] { 1, 2, 3 }, "hi"));
            }
        }

        // The writer is closed explicitly before its statistics are queried.
        orcWriter.close();

        // Statistics as seen by the writer.
        Assert.Equal(1000, orcWriter.getNumberOfRows());
        Assert.Equal(44500, orcWriter.getRawDataSize());
    }

    // Statistics as seen by a reader of the same file.
    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(1000, orcReader.getNumberOfRows());
    Assert.Equal(44500, orcReader.getRawDataSize());
    Assert.Equal(1500, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(43000, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(44500, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
}
// Writes 1080832 seeded pseudo-random 64-bit values with ZLIB compression and
// checks that they are read back unchanged and in order.
public void testBitPack64Large()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

    const int size = 1080832;
    long[] input = new long[size];
    Random rand = new Random(1234);
    for (int i = 0; i < size; i++)
    {
        input[i] = rand.NextLong();
    }

    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .compress(CompressionKind.ZLIB)))
    {
        // The array is used directly; a second list copy of the values is not needed.
        foreach (long l in input)
        {
            writer.addRow(l);
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++], ((long)row));
        }

        // Guard against a vacuous pass when fewer rows come back than were written.
        Assert.Equal(size, idx);
    }
}
// Writes a fixed pattern that mixes the given value with zeros, uncompressed, and
// checks that the pattern is read back unchanged and in order.
//   val - the non-zero value interleaved with zeros in the pattern
public void testBitPacking(long val)
{
    long[] input = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0, val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };

    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));
    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        foreach (long l in input)
        {
            writer.addRow(l);
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        int idx = 0;
        while (rows.hasNext())
        {
            object row = rows.next();
            Assert.Equal(input[idx++], ((long)row));
        }

        // Guard against a vacuous pass when fewer rows come back than were written.
        Assert.Equal(input.Length, idx);
    }
}
// Writes 20000 rows whose first and last rows carry nulls in columns a and b, then
// checks the file statistics, the reader's type description, which stripe footers
// mention a PRESENT stream, and the contents of the first, next-to-last and last rows.
public void testMultiStripeWithNull()
{
    ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(structInspector)
        .stripeSize(100000)
        .compress(CompressionKind.NONE)
        .bufferSize(10000)))
    {
        Random generator = new Random(100);

        // First row: nulls in the first two columns.
        orcWriter.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));

        // 19998 rows without nulls.
        for (int written = 0; written < 19998; written++)
        {
            orcWriter.addRow(new MyStruct(generator.Next(1), "a", true, new List<InnerStruct> { new InnerStruct(100) }));
        }

        // Last row: nulls in the first two columns again.
        orcWriter.addRow(new MyStruct(null, null, true, new List<InnerStruct> { new InnerStruct(100) }));
    }

    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // File-level statistics.
    ColumnStatistics[] columnStats = orcReader.getStatistics();
    Assert.Equal(20000, orcReader.getNumberOfRows());
    Assert.Equal(20000, columnStats[0].getNumberOfValues());

    IntegerColumnStatistics intStats = (IntegerColumnStatistics)columnStats[1];
    Assert.Equal(0, intStats.getMaximum());
    Assert.Equal(0, intStats.getMinimum());
    Assert.Equal(true, intStats.isSumDefined());
    Assert.Equal(0, intStats.getSum());
    Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0", columnStats[1].ToString());

    StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[2];
    Assert.Equal("a", stringStats.getMaximum());
    Assert.Equal("a", stringStats.getMinimum());
    Assert.Equal(19998, columnStats[2].getNumberOfValues());
    Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998", columnStats[2].ToString());

    // Type information exposed by the reader.
    StructObjectInspector rowInspector = (StructObjectInspector)orcReader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, rowInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>", rowInspector.getTypeName());

    using (RecordReader recordReader = orcReader.rows())
    {
        // Only the first and the last stripe are expected to have a PRESENT stream.
        List<bool> expectedPresent = new List<bool>();
        foreach (StripeInformation ignored in orcReader.getStripes())
        {
            expectedPresent.Add(false);
        }
        expectedPresent[0] = true;
        expectedPresent[expectedPresent.Count - 1] = true;

        // Look for the PRESENT stream kind in the text form of each stripe footer.
        List<bool> actualPresent = new List<bool>();
        foreach (StripeInformation stripeInfo in orcReader.getStripes())
        {
            OrcProto.StripeFooter stripeFooter = ((RecordReaderImpl)recordReader).readStripeFooter(stripeInfo);
            string footerText = stripeFooter.ToString();
            actualPresent.Add(footerText.IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
        }
        Assert.Equal(expectedPresent, actualPresent);

        // First row.
        OrcStruct current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.Null(current.getFieldValue(0));
        Assert.Null(current.getFieldValue(1));
        Assert.Equal(true, current.getFieldValue(2));
        IList<object> innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));

        // Next-to-last row.
        recordReader.seekToRow(19998);
        current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.NotNull(current.getFieldValue(1));
        Assert.Equal(0, current.getFieldValue(0));
        Assert.Equal(true, current.getFieldValue(2));
        innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));

        // Last row.
        current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.Null(current.getFieldValue(0));
        Assert.Null(current.getFieldValue(1));
        Assert.Equal(true, current.getFieldValue(2));
        innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));
    }
}
// Reads the bundled ORC 0.11 sample file and checks that its stripes are laid out
// back to back, that the stripe row counts add up to the file row count, and that
// the file-level column statistics have the expected values.
public void testSerdeStatsOldFormat()
{
    string legacyFile = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");
    Reader orcReader = OrcFile.createReader(legacyFile, OrcFile.readerOptions(conf));

    int stripesSeen = 0;
    int rowsSeen = 0;
    long expectedOffset = -1;
    foreach (StripeInformation stripeInfo in orcReader.getStripes())
    {
        stripesSeen += 1;
        rowsSeen += (int)stripeInfo.getNumberOfRows();

        if (expectedOffset >= 0)
        {
            // Every later stripe must start where the previous one ended.
            Assert.Equal(expectedOffset, stripeInfo.getOffset());
            expectedOffset += stripeInfo.getIndexLength() + stripeInfo.getDataLength() + stripeInfo.getFooterLength();
        }
        else
        {
            // First stripe: remember where it ends.
            expectedOffset = stripeInfo.getOffset() + stripeInfo.getIndexLength()
                + stripeInfo.getDataLength() + stripeInfo.getFooterLength();
        }
    }
    Assert.Equal(orcReader.getNumberOfRows(), rowsSeen);
#if JAVA_SIZE
    Assert.Equal(6300000, orcReader.getRawDataSize());
#endif
    Assert.Equal(2, stripesSeen);

    // File-level column statistics.
    ColumnStatistics[] columnStats = orcReader.getStatistics();

    Assert.Equal(7500, columnStats[1].getNumberOfValues());
    BooleanColumnStatistics boolStats = (BooleanColumnStatistics)columnStats[1];
    Assert.Equal(3750, boolStats.getFalseCount());
    Assert.Equal(3750, boolStats.getTrueCount());
    Assert.Equal("count: 7500 hasNull: True true: 3750", columnStats[1].ToString());

    IntegerColumnStatistics shortStats = (IntegerColumnStatistics)columnStats[3];
    Assert.Equal(2048, shortStats.getMaximum());
    Assert.Equal(1024, shortStats.getMinimum());
    Assert.Equal(true, shortStats.isSumDefined());
    Assert.Equal(11520000, shortStats.getSum());
    Assert.Equal("count: 7500 hasNull: True min: 1024 max: 2048 sum: 11520000", columnStats[3].ToString());

    IntegerColumnStatistics longStats = (IntegerColumnStatistics)columnStats[5];
    Assert.Equal(Int64.MaxValue, longStats.getMaximum());
    Assert.Equal(Int64.MaxValue, longStats.getMinimum());
    Assert.Equal(false, longStats.isSumDefined());
    Assert.Equal(
        "count: 7500 hasNull: True min: 9223372036854775807 max: 9223372036854775807",
        columnStats[5].ToString());

    DoubleColumnStatistics doubleStats = (DoubleColumnStatistics)columnStats[7];
    Assert.Equal(-15.0, doubleStats.getMinimum());
    Assert.Equal(-5.0, doubleStats.getMaximum());
    Assert.Equal(-75000.0, doubleStats.getSum(), 5);
    Assert.Equal("count: 7500 hasNull: True min: -15 max: -5 sum: -75000", columnStats[7].ToString());

    StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[9];
    Assert.Equal("bye", stringStats.getMinimum());
    Assert.Equal("hi", stringStats.getMaximum());
    Assert.Equal(0, stringStats.getSum());
    Assert.Equal("count: 7500 hasNull: True min: bye max: hi sum: 0", columnStats[9].ToString());

    // The old ORC format carries no binary statistics, so ToString() shows only
    // the general column statistics.
    Assert.Equal("count: 7500 hasNull: True", columnStats[8].ToString());

    // For the same reason, casting to the binary statistics type is expected to
    // throw InvalidCastException.
    Assert.Throws<InvalidCastException>(() => ((BinaryColumnStatistics)columnStats[8]).getSum());
}
// Writes two BigRow records using the 0.11 file version and checks the row count,
// the raw data sizes (overall, per column and per column group) and the file-level
// column statistics.
public void testOrcSerDeStatsComplexOldFormat()
{
    ObjectInspector rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(BigRow));
    long rawDataSize;

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(rowInspector)
        .stripeSize(100000)
        .version(OrcFile.Version.V_0_11)
        .bufferSize(10000)))
    {
        // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
        orcWriter.addRow(new BigRow(false, (sbyte)1, (short)1024, 65536,
            Int64.MaxValue, (float)1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
            new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
            list(inner(3, "good"), inner(4, "bad")),
            map(), Timestamp.Parse("2000-03-12 15:00:00"), HiveDecimal.Parse(
                "12345678.6547456")));

        // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
        // 97
        orcWriter.addRow(new BigRow(true, (sbyte)100, (short)2048, 65536,
            Int64.MaxValue, (float)2.0, -5.0, bytes(), "bye",
            new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
            list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
            map(inner(5, "chani"), inner(1, "mauddib")),
            Timestamp.Parse("2000-03-11 15:00:00"),
            HiveDecimal.Parse("12345678.6547452")));

        // The writer is closed explicitly before its statistics are queried.
        orcWriter.close();

        long writtenRows = orcWriter.getNumberOfRows();
        rawDataSize = orcWriter.getRawDataSize();
        Assert.Equal(2, writtenRows);
        Assert.Equal(1740, rawDataSize);
    }

    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    Assert.Equal(2, orcReader.getNumberOfRows());
    Assert.Equal(1740, orcReader.getRawDataSize());

    // Raw data size of individual columns.
    Assert.Equal(8, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
    Assert.Equal(8, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
    Assert.Equal(8, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
    Assert.Equal(8, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
    Assert.Equal(16, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
    Assert.Equal(8, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
    Assert.Equal(16, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
    Assert.Equal(5, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(172, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(455, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
    Assert.Equal(368, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
    Assert.Equal(364, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
    Assert.Equal(80, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
    Assert.Equal(224, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));

    // Raw data size of column groups.
    Assert.Equal(88, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
    Assert.Equal(1195,
        orcReader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
    Assert.Equal(185,
        orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
    Assert.Equal(rawDataSize, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1",
        "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list",
        "map", "middle", "ts", "decimal1")));

    // File-level column statistics.
    ColumnStatistics[] columnStats = orcReader.getStatistics();

    Assert.Equal(2, columnStats[1].getNumberOfValues());
    BooleanColumnStatistics boolStats = (BooleanColumnStatistics)columnStats[1];
    Assert.Equal(1, boolStats.getFalseCount());
    Assert.Equal(1, boolStats.getTrueCount());
    Assert.Equal("count: 2 hasNull: False true: 1", columnStats[1].ToString());

    IntegerColumnStatistics shortStats = (IntegerColumnStatistics)columnStats[3];
    Assert.Equal(2048, shortStats.getMaximum());
    Assert.Equal(1024, shortStats.getMinimum());
    Assert.Equal(true, shortStats.isSumDefined());
    Assert.Equal(3072, shortStats.getSum());
    Assert.Equal("count: 2 hasNull: False min: 1024 max: 2048 sum: 3072", columnStats[3].ToString());

    IntegerColumnStatistics longStats = (IntegerColumnStatistics)columnStats[5];
    Assert.Equal(Int64.MaxValue, longStats.getMaximum());
    Assert.Equal(Int64.MaxValue, longStats.getMinimum());
    Assert.Equal(false, longStats.isSumDefined());
    Assert.Equal("count: 2 hasNull: False min: 9223372036854775807 max: 9223372036854775807",
        columnStats[5].ToString());

    DoubleColumnStatistics doubleStats = (DoubleColumnStatistics)columnStats[7];
    Assert.Equal(-15.0, doubleStats.getMinimum());
    Assert.Equal(-5.0, doubleStats.getMaximum());
    Assert.Equal(-20.0, doubleStats.getSum(), 5);
    Assert.Equal("count: 2 hasNull: False min: -15 max: -5 sum: -20", columnStats[7].ToString());

    BinaryColumnStatistics binaryStats = (BinaryColumnStatistics)columnStats[8];
    Assert.Equal(5, binaryStats.getSum());
    Assert.Equal("count: 2 hasNull: False sum: 5", columnStats[8].ToString());

    StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[9];
    Assert.Equal("bye", stringStats.getMinimum());
    Assert.Equal("hi", stringStats.getMaximum());
    Assert.Equal(5, stringStats.getSum());
    Assert.Equal("count: 2 hasNull: False min: bye max: hi sum: 5", columnStats[9].ToString());
}
// Writes four SimpleStruct rows (one with a null string, one with null bytes) and
// checks the raw data sizes, the binary and string column statistics, the reader's
// type description, and the contents of each row.
public void testStringAndBinaryStatistics()
{
    ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(structInspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    {
        orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
        orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
        orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
        orcWriter.addRow(new SimpleStruct(null, "hi"));

        // The writer is closed explicitly before its statistics are queried.
        orcWriter.close();
        Assert.Equal(4, orcWriter.getNumberOfRows());
        Assert.Equal(273, orcWriter.getRawDataSize());
    }

    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(4, orcReader.getNumberOfRows());
    Assert.Equal(273, orcReader.getRawDataSize());
    Assert.Equal(15, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    Assert.Equal(258, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    Assert.Equal(273, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

    // File-level column statistics.
    ColumnStatistics[] columnStats = orcReader.getStatistics();
    Assert.Equal(4, columnStats[0].getNumberOfValues());
    Assert.Equal("count: 4 hasNull: False", columnStats[0].ToString());

    Assert.Equal(3, columnStats[1].getNumberOfValues());
    Assert.Equal(15, ((BinaryColumnStatistics)columnStats[1]).getSum());
    Assert.Equal("count: 3 hasNull: True sum: 15", columnStats[1].ToString());

    Assert.Equal(3, columnStats[2].getNumberOfValues());
    StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[2];
    Assert.Equal("bar", stringStats.getMinimum());
    Assert.Equal("hi", stringStats.getMaximum());
    Assert.Equal(8, stringStats.getSum());
    Assert.Equal("count: 3 hasNull: True min: bar max: hi sum: 8", columnStats[2].ToString());

    // Type information exposed by the reader.
    StructObjectInspector rowInspector = (StructObjectInspector)orcReader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, rowInspector.getCategory());
    Assert.Equal("struct<bytes1:binary,string1:string>", rowInspector.getTypeName());

    IList<StructField> fieldRefs = rowInspector.getAllStructFieldRefs();
    BinaryObjectInspector bytesInspector =
        (BinaryObjectInspector)rowInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
    StringObjectInspector textInspector =
        (StringObjectInspector)rowInspector.getStructFieldRef("string1").getFieldObjectInspector();

    using (RecordReader recordReader = orcReader.rows())
    {
        // First row.
        object current = recordReader.next();
        Assert.NotNull(current);
        Assert.Equal(bytes(0, 1, 2, 3, 4),
            bytesInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0])));
        Assert.Equal("foo",
            textInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1])));

        // Second row.
        Assert.Equal(true, recordReader.hasNext());
        current = recordReader.next();
        Assert.Equal(bytes(0, 1, 2, 3),
            bytesInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0])));
        Assert.Equal("bar",
            textInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1])));

        // Third row: the string column is null.
        Assert.Equal(true, recordReader.hasNext());
        current = recordReader.next();
        Assert.Equal(bytes(0, 1, 2, 3, 4, 5),
            bytesInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0])));
        Assert.Null(
            textInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1])));

        // Fourth row: the binary column is null.
        Assert.Equal(true, recordReader.hasNext());
        current = recordReader.next();
        Assert.Null(
            bytesInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0])));
        Assert.Equal("hi",
            textInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1])));

        Assert.Equal(false, recordReader.hasNext());
    }
}
// Writes 20000 SimpleStruct rows in runs whose string column is either always set
// or always null, then checks the hasNull flag and value counts in the file-level
// statistics and the hasNull flag in the statistics of each of the four stripes.
public void testHasNull()
{
    ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(structInspector)
        .rowIndexStride(1000)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        // STRIPE 1
        // RG1
        for (int n = 0; n < 1000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), "RG1"));
        }
        // RG2
        for (int n = 0; n < 1000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // RG3
        for (int n = 0; n < 1000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), "RG3"));
        }
        // RG4
        for (int n = 0; n < 1000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // RG5
        for (int n = 0; n < 1000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // STRIPE 2
        for (int n = 0; n < 5000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
        // STRIPE 3
        for (int n = 0; n < 5000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), "STRIPE-3"));
        }
        // STRIPE 4
        for (int n = 0; n < 5000; n++)
        {
            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), null));
        }
    }

    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // File-level statistics.
    ColumnStatistics[] fileStats = orcReader.getStatistics();
    Assert.Equal(20000, fileStats[0].getNumberOfValues());
    Assert.Equal(20000, fileStats[1].getNumberOfValues());
    Assert.Equal(7000, fileStats[2].getNumberOfValues());
    Assert.Equal(false, fileStats[0].hasNull());
    Assert.Equal(false, fileStats[1].hasNull());
    Assert.Equal(true, fileStats[2].hasNull());

    // Stripe-level statistics.
    List<StripeStatistics> perStripe = orcReader.getStripeStatistics();

    // Stripe 1: the string column contains nulls.
    StripeStatistics firstStripe = perStripe[0];
    ColumnStatistics firstRoot = firstStripe.getColumnStatistics()[0];
    ColumnStatistics firstBytes = firstStripe.getColumnStatistics()[1];
    ColumnStatistics firstString = firstStripe.getColumnStatistics()[2];
    Assert.Equal(false, firstRoot.hasNull());
    Assert.Equal(false, firstBytes.hasNull());
    Assert.Equal(true, firstString.hasNull());

    // Stripe 2: the string column contains nulls.
    StripeStatistics secondStripe = perStripe[1];
    ColumnStatistics secondRoot = secondStripe.getColumnStatistics()[0];
    ColumnStatistics secondBytes = secondStripe.getColumnStatistics()[1];
    ColumnStatistics secondString = secondStripe.getColumnStatistics()[2];
    Assert.Equal(false, secondRoot.hasNull());
    Assert.Equal(false, secondBytes.hasNull());
    Assert.Equal(true, secondString.hasNull());

    // Stripe 3: no column contains nulls.
    StripeStatistics thirdStripe = perStripe[2];
    ColumnStatistics thirdRoot = thirdStripe.getColumnStatistics()[0];
    ColumnStatistics thirdBytes = thirdStripe.getColumnStatistics()[1];
    ColumnStatistics thirdString = thirdStripe.getColumnStatistics()[2];
    Assert.Equal(false, thirdRoot.hasNull());
    Assert.Equal(false, thirdBytes.hasNull());
    Assert.Equal(false, thirdString.hasNull());

    // Stripe 4: the string column contains nulls.
    StripeStatistics fourthStripe = perStripe[3];
    ColumnStatistics fourthRoot = fourthStripe.getColumnStatistics()[0];
    ColumnStatistics fourthBytes = fourthStripe.getColumnStatistics()[1];
    ColumnStatistics fourthString = fourthStripe.getColumnStatistics()[2];
    Assert.Equal(false, fourthRoot.hasNull());
    Assert.Equal(false, fourthBytes.hasNull());
    Assert.Equal(true, fourthString.hasNull());

#if false
    // Test file dump
    TextWriter origOut = System.Console.Out;
    string outputFilename = "orc-file-has-null.out";
    FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);

    // replace stdout and run command
    System.Console.SetOut(new StreamWriter(myOut));
    FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
    System.Console.Out.Flush();
    System.SetOut(origOut);

    TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
}
// Reads the test file through two readers, one consumed batch by batch and one row
// by row, and checks that each value in every batch matches the corresponding row
// value. After each batch it also checks the expected isRepeating and noNulls
// flags of the first ten columns.
private void checkVectorizedReader()
{
    Reader vreader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReaderImpl vrr = (RecordReaderImpl)vreader.rows())
    using (RecordReaderImpl rr = (RecordReaderImpl)reader.rows())
    {
        VectorizedRowBatch batch = null;

        // Check Vectorized ORC reader against ORC row reader
        while (vrr.hasNext())
        {
            batch = vrr.nextBatch(batch);
            for (int i = 0; i < batch.size; i++)
            {
                OrcStruct row = (OrcStruct)rr.next();
                for (int j = 0; j < batch.cols.Length; j++)
                {
                    object a = (row.getFieldValue(j));
                    ColumnVector cv = batch.cols[j];

                    // if the value is repeating, use row 0
                    int rowId = cv.isRepeating ? 0 : i;

                    // make sure the null flag agrees
                    if (a == null)
                    {
                        Assert.True(!cv.noNulls && cv.isNull[rowId]);
                    }
                    else if (a is bool)
                    {
                        // bool values are stored as 1's and 0's, so convert and compare
                        long temp = (bool)a ? 1 : 0;
                        long b = ((LongColumnVector)cv).vector[rowId];
                        Assert.Equal(temp.ToString(), b.ToString());
                    }
                    else if (a is Timestamp)
                    {
                        // Timestamps are stored as long, so convert and compare
                        Timestamp t = (Timestamp)a;
                        // Timestamp.getTime() is overridden and is
                        // long time = super.getTime();
                        // return (time + (nanos / 1000000));
                        long timeInNanoSec = (t.Milliseconds * 1000000) + (t.getNanos() % 1000000);
                        long b = ((LongColumnVector)cv).vector[rowId];
                        Assert.Equal(timeInNanoSec.ToString(), b.ToString());
                    }
                    else if (a is Date)
                    {
                        // Dates are stored as long, so convert and compare
                        Date adt = (Date)a;
                        long b = ((LongColumnVector)cv).vector[rowId];
                        // Assert.Equal(adt, Date.daysToMillis((int)b));
                        Assert.Equal(adt.Days, (int)b);
                    }
                    else if (a is HiveDecimal)
                    {
                        // Decimals are compared directly. The index is rowId, not i, so
                        // a repeating column is read from row 0 like every other branch.
                        HiveDecimal dec = (HiveDecimal)a;
                        HiveDecimal b = ((DecimalColumnVector)cv).vector[rowId];
                        Assert.Equal(dec, b);
                    }
                    else if (a is double)
                    {
                        double b = ((DoubleColumnVector)cv).vector[rowId];
                        Assert.Equal(a.ToString(), b.ToString());
                    }
                    else if (a is string)
                    {
                        // Strings are stored as a byte slice (buffer, start, length).
                        BytesColumnVector bcv = (BytesColumnVector)cv;
                        string b = Encoding.UTF8.GetString(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]);
                        Assert.Equal((string)a, b);
                    }
                    else if (a is int || a is long || a is sbyte || a is short)
                    {
                        Assert.Equal(a.ToString(), ((LongColumnVector)cv).vector[rowId].ToString());
                    }
                    else
                    {
                        // Any other value type is unexpected in this test.
                        Assert.True(false);
                    }
                }
            }

            // Check repeating
            Assert.Equal(false, batch.cols[0].isRepeating);
            Assert.Equal(false, batch.cols[1].isRepeating);
            Assert.Equal(false, batch.cols[2].isRepeating);
            Assert.Equal(true, batch.cols[3].isRepeating);
            Assert.Equal(false, batch.cols[4].isRepeating);
            Assert.Equal(false, batch.cols[5].isRepeating);
            Assert.Equal(false, batch.cols[6].isRepeating);
            Assert.Equal(false, batch.cols[7].isRepeating);
            Assert.Equal(false, batch.cols[8].isRepeating);
            Assert.Equal(false, batch.cols[9].isRepeating);

            // Check non null
            Assert.Equal(false, batch.cols[0].noNulls);
            Assert.Equal(false, batch.cols[1].noNulls);
            Assert.Equal(true, batch.cols[2].noNulls);
            Assert.Equal(true, batch.cols[3].noNulls);
            Assert.Equal(false, batch.cols[4].noNulls);
            Assert.Equal(false, batch.cols[5].noNulls);
            Assert.Equal(false, batch.cols[6].noNulls);
            Assert.Equal(false, batch.cols[7].noNulls);
            Assert.Equal(false, batch.cols[8].noNulls);
            Assert.Equal(false, batch.cols[9].noNulls);
        }

        // Both readers must be exhausted at the same time.
        Assert.Equal(false, rr.hasNext());
    }
}
// Writes eight MyStruct rows, one with a null int and one with a null string, then
// checks the file statistics, the reader's type description, which stripe footers
// mention a PRESENT stream, and the contents of the first three rows.
public void testColumnsWithNullAndCompression()
{
    ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

    using (Stream output = File.OpenWrite(TestFilePath))
    using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
        .inspector(structInspector)
        .stripeSize(100000)
        .bufferSize(10000)))
    {
        orcWriter.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
        orcWriter.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
    }

    Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // File-level statistics.
    ColumnStatistics[] columnStats = orcReader.getStatistics();
    Assert.Equal(8, orcReader.getNumberOfRows());
    Assert.Equal(8, columnStats[0].getNumberOfValues());

    IntegerColumnStatistics intStats = (IntegerColumnStatistics)columnStats[1];
    Assert.Equal(3, intStats.getMaximum());
    Assert.Equal(2, intStats.getMinimum());
    Assert.Equal(true, intStats.isSumDefined());
    Assert.Equal(17, intStats.getSum());
    Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17", columnStats[1].ToString());

    StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[2];
    Assert.Equal("h", stringStats.getMaximum());
    Assert.Equal("a", stringStats.getMinimum());
    Assert.Equal(7, columnStats[2].getNumberOfValues());
    Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7", columnStats[2].ToString());

    // Type information exposed by the reader.
    StructObjectInspector rowInspector = (StructObjectInspector)orcReader.getObjectInspector();
    Assert.Equal(ObjectInspectorCategory.STRUCT, rowInspector.getCategory());
    Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>", rowInspector.getTypeName());

    using (RecordReader recordReader = orcReader.rows())
    {
        // Only the last stripe is expected to have a PRESENT stream.
        List<bool> expectedPresent = new List<bool>();
        foreach (StripeInformation ignored in orcReader.getStripes())
        {
            expectedPresent.Add(false);
        }
        expectedPresent[expectedPresent.Count - 1] = true;

        // Look for the PRESENT stream kind in the text form of each stripe footer.
        List<bool> actualPresent = new List<bool>();
        foreach (StripeInformation stripeInfo in orcReader.getStripes())
        {
            OrcProto.StripeFooter stripeFooter = ((RecordReaderImpl)recordReader).readStripeFooter(stripeInfo);
            string footerText = stripeFooter.ToString();
            actualPresent.Add(footerText.IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
        }
        Assert.Equal(expectedPresent, actualPresent);

        // Row 1: all columns set.
        OrcStruct current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.Equal(3, current.getFieldValue(0));
        Assert.Equal("a", current.getFieldValue(1).ToString());
        Assert.Equal(true, current.getFieldValue(2));
        IList<object> innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));

        // Row 2: the int column is null.
        current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.Null(current.getFieldValue(0));
        Assert.Equal("b", current.getFieldValue(1).ToString());
        Assert.Equal(true, current.getFieldValue(2));
        innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));

        // Row 3: the string column is null.
        current = (OrcStruct)recordReader.next();
        Assert.NotNull(current);
        Assert.Null(current.getFieldValue(1));
        Assert.Equal(3, current.getFieldValue(0));
        Assert.Equal(false, current.getFieldValue(2));
        innerList = (IList<object>)current.getFieldValue(3);
        Assert.Equal(100, ((OrcStruct)innerList[0]).getFieldValue(0));
    }
}