Example #1
0
        /// <summary>
        /// Writes 0 followed by 0..510 into a single uncompressed int column and checks that the
        /// <c>FileDump</c> output reports a 642-byte DATA stream for column 0.
        /// </summary>
        public void testDeltaUnknownSign()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                   .compress(CompressionKind.NONE)
                                                   .inspector(inspector)
                                                   .rowIndexStride(0)
                                                   .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                   .version(OrcFile.Version.V_0_12)))
            {
                w.addRow(0);
                for (int i = 0; i < 511; ++i)
                {
                    w.addRow(i);
                }
            }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // monotonicity will be undetermined for this sequence 0,0,1,2,3,...510. Hence DIRECT encoding
                // will be used. 2 bytes for header and 640 bytes for data (512 values with fixed bit of 10 bits
                // each, 5120/8 = 640). Total bytes 642
                // Assert.Contains prints the captured text on failure, unlike Assert.True(x.Contains(..)).
                Assert.Contains("Stream: column 0 section DATA start: 3 length 642", capture.Text,
                                System.StringComparison.Ordinal);
            }
        }
Example #2
0
        /// <summary>
        /// Writes the value 10 five times into a single uncompressed int column and checks that the
        /// <c>FileDump</c> output reports a 2-byte DATA stream for column 0.
        /// </summary>
        public void testShortRepeat()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                   .compress(CompressionKind.NONE)
                                                   .inspector(inspector)
                                                   .rowIndexStride(0)
                                                   .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                   .version(OrcFile.Version.V_0_12)))
            {
                for (int i = 0; i < 5; ++i)
                {
                    w.addRow(10);
                }
            }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // 1 byte header + 1 byte value
                // Assert.Contains prints the captured text on failure, unlike Assert.True(x.Contains(..)).
                Assert.Contains("Stream: column 0 section DATA start: 3 length 2", capture.Text,
                                System.StringComparison.Ordinal);
            }
        }
Example #3
0
        /// <summary>
        /// Writes ten descending runs 512, 511, ..., 1 into a single uncompressed int column and
        /// checks that the <c>FileDump</c> output reports a 50-byte DATA stream for column 0.
        /// </summary>
        public void testFixedDeltaOneDescending()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                   .compress(CompressionKind.NONE)
                                                   .inspector(inspector)
                                                   .rowIndexStride(0)
                                                   .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                   .version(OrcFile.Version.V_0_12)))
            {
                for (int i = 0; i < 5120; ++i)
                {
                    w.addRow(512 - (i % 512));
                }
            }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
                // and 1 byte delta (delta = 1). In total, 5 bytes per run.
                // Assert.Contains prints the captured text on failure, unlike Assert.True(x.Contains(..)).
                Assert.Contains("Stream: column 0 section DATA start: 3 length 50", capture.Text,
                                System.StringComparison.Ordinal);
            }
        }
Example #4
0
        /// <summary>
        /// Writes 21000 pseudo-random <c>MyRecord</c> rows (seed 1) with ZLIB compression, captures the
        /// <c>FileDump</c> output for row indexes 1,2,3 into a file under <c>workDir</c>, and compares
        /// it line by line with "orc-file-dump.out".
        /// </summary>
        public void testDump()
        {
            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.ZLIB);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Random r1 = new Random(1);
                    for (int i = 0; i < 21000; ++i)
                    {
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                                                   TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
                    }
                }
            }

            string outputFilename = "orc-file-dump.out";
            string capturedPath   = Path.Combine(workDir, outputFilename);

            // Everything FileDump prints while the capture is alive goes to capturedPath.
            using (CaptureStdout capture = new CaptureStdout(capturedPath))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
            }

            // outputFilename is presumably resolved to the expected baseline by the helper.
            TestHelpers.CompareFilesByLine(outputFilename, capturedPath);
        }
Example #5
0
        /// <summary>
        /// Writes one large value (10000000) followed by 511 small pseudo-random values (seed 123)
        /// into a single uncompressed int column and checks that the <c>FileDump</c> output reports a
        /// 583-byte DATA stream for column 0.
        /// </summary>
        public void testPatchedBase()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                   .compress(CompressionKind.NONE)
                                                   .inspector(inspector)
                                                   .rowIndexStride(0)
                                                   .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                   .version(OrcFile.Version.V_0_12)))
            {
                Random rand = new Random(123);
                w.addRow(10000000);
                for (int i = 0; i < 511; ++i)
                {
                    w.addRow(rand.Next(i + 1));
                }
            }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // use PATCHED_BASE encoding
                // Assert.Contains prints the captured text on failure, unlike Assert.True(x.Contains(..)).
                Assert.Contains("Stream: column 0 section DATA start: 3 length 583", capture.Text,
                                System.StringComparison.Ordinal);
            }
        }
Example #6
0
        /// <summary>
        /// Writes 1000 <c>MapStruct</c> rows, each holding a ten-entry string-to-double map, and checks
        /// the row count and raw data size reported by both the writer and the reader.
        /// </summary>
        public void testOrcSerDeStatsMap()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MapStruct));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(10000)
                                                        .bufferSize(10000)))
            {
                for (int row = 0; row < 1000; row++)
                {
                    Dictionary<string, double> test = new Dictionary<string, double>();
                    for (int i = 0; i < 10; i++)
                    {
                        test.Add("hi" + i, 2.0);
                    }
                    writer.addRow(new MapStruct(test));
                }

                // The writer is closed explicitly before its statistics are read; the enclosing
                // using statement disposes it again afterwards.
                writer.close();

                // stats from writer
                Assert.Equal(1000, writer.getNumberOfRows());
                Assert.Equal(950000, writer.getRawDataSize());
            }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // stats from reader
            Assert.Equal(1000, reader.getNumberOfRows());
            Assert.Equal(950000, reader.getRawDataSize());
            Assert.Equal(950000, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
        }
        /// <summary>
        /// Writes 20000 strings drawn from 10000 possible values (seed 123), reads them back in order,
        /// and checks that every column in every stripe footer is DICTIONARY_V2 encoded.
        /// </summary>
        public void testHalfDistinctCheckDisabled()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            int[]  input = new int[20000];
            Random rand  = new Random(123);
            for (int i = 0; i < input.Length; i++)
            {
                input[i] = rand.Next(10000);
            }

            // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.NONE)
                                                        .bufferSize(10000)))
            {
                foreach (int value in input)
                {
                    writer.addRow(value.ToString());
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++].ToString(), row);
                }

                // Without this check the loop above would pass even if rows were missing.
                Assert.Equal(input.Length, idx);

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
                    }
                }
            }
        }
        /// <summary>
        /// Writes a fixed list of timestamps while <paramref name="writerTimeZone"/> is in effect, reads
        /// them back while <paramref name="readerTimeZone"/> is in effect, and checks that each value
        /// prints as the string it was parsed from.
        /// </summary>
        /// <param name="writerTimeZone">Time zone passed to <c>TestHelpers.SetTimeZoneInfo</c> for the write phase.</param>
        /// <param name="readerTimeZone">Time zone passed to <c>TestHelpers.SetTimeZoneInfo</c> for the read phase.</param>
        public void testTimestampWriter(string writerTimeZone, string readerTimeZone)
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(Timestamp));
            List<string> ts = new List<string>
            {
                "2003-01-01 01:00:00.000000222",
                "1996-08-02 09:00:00.723100809",
                "1999-01-01 02:00:00.999999999",
                "1995-01-02 03:00:00.688888888",
                "2002-01-01 04:00:00.1",
                "2010-03-02 05:00:00.000009001",
                "2005-01-01 06:00:00.000002229",
                "2006-01-01 07:00:00.900203003",
                "2003-01-01 08:00:00.800000007",
                "1998-11-02 10:00:00.857340643",
                "2008-10-02 11:00:00.0",
                "2037-01-01 00:00:00.000999",
                "2014-03-28 00:00:00.0",
            };

            // The time-zone scope is the outermost using so that it is entered before the writer is
            // created and left only after the writer has been disposed. Nested usings are disposed
            // innermost first, so an inner time-zone scope would end before the writer is closed.
            // File.Create truncates a file left over from an earlier run; File.OpenWrite would not.
            using (TestHelpers.SetTimeZoneInfo(writerTimeZone))
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .bufferSize(10000)))
            {
                foreach (string t in ts)
                {
                    writer.addRow(Timestamp.Parse(t));
                }
            }

            using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
            {
                Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
                using (RecordReader rows = reader.rows(null))
                {
                    int idx = 0;
                    while (rows.hasNext())
                    {
                        object    row = rows.next();
                        Timestamp got = ((Timestamp)row);
                        Assert.Equal(ts[idx++], got.ToString());
                    }

                    // Without this check the loop above would pass even if rows were missing.
                    Assert.Equal(ts.Count, idx);
                }
            }
        }
        /// <summary>
        /// Round-trips a single string row: writes "hello" to <c>filename</c>, reads the first row
        /// back, and checks that it is the same string.
        /// </summary>
        public void SimpleTest()
        {
            OrcFile.WriterOptions writerOptions = new OrcFile.WriterOptions(new Properties(), new Configuration());
            writerOptions.inspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);

            using (Stream output = File.Create(filename))
            {
                using (Writer orcWriter = OrcFile.createWriter(filename, output, writerOptions))
                {
                    orcWriter.addRow("hello");
                }
            }

            // The reader is handed a factory that opens the file on demand.
            Reader orcReader = OrcFile.createReader(() => File.OpenRead(filename), filename);

            using (RecordReader rowReader = orcReader.rows())
            {
                object firstRow = rowReader.next();
                Assert.True(firstRow is string);
                Assert.Equal("hello", firstRow);
            }
        }
        /// <summary>
        /// Writes 20000 distinct strings ("0".."19999") with file version V_0_11, reads them back in
        /// order, and checks that every column in every stripe footer is DICTIONARY encoded.
        /// </summary>
        public void testTooManyDistinctV11AlwaysDictionary()
        {
            const int rowCount = 20000;

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.NONE)
                                                        .version(OrcFile.Version.V_0_11)
                                                        .bufferSize(10000)))
            {
                for (int i = 0; i < rowCount; i++)
                {
                    writer.addRow(i.ToString());
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal((idx++).ToString(), row);
                }

                // Without this check the loop above would pass even if rows were missing.
                Assert.Equal(rowCount, idx);

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
                    }
                }
            }
        }
Example #11
0
        /// <summary>
        /// Writes 21000 <c>MyRecord</c> rows in which every string appears exactly twice in a row,
        /// captures the <c>FileDump</c> output for row indexes 1,2,3 into a file under <c>workDir</c>,
        /// and compares it line by line with "orc-file-dump-dictionary-threshold.out".
        /// </summary>
        public void testDictionaryThreshold()
        {
            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
            // conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);

            // Work on a private copy: the loop below rewrites entries, and changing the shared
            // TestHelpers.words array in place would leak into every other test that reads it.
            string[] words = (string[])TestHelpers.words.Clone();

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.ZLIB);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Random r1      = new Random(1);
                    int    nextInt = 0;
                    for (int i = 0; i < 21000; ++i)
                    {
                        // Write out the same string twice, this guarantees the fraction of rows with
                        // distinct strings is 0.5
                        if (i % 2 == 0)
                        {
                            nextInt = r1.Next(words.Length);
                            // Append the value of i to the word, this guarantees when an index or word is repeated
                            // the actual string is unique.
                            words[nextInt] += "-" + i;
                        }
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), words[nextInt]));
                    }
                }
            }

            string outputFilename = "orc-file-dump-dictionary-threshold.out";
            string capturedPath   = Path.Combine(workDir, outputFilename);

            // Everything FileDump prints while the capture is alive goes to capturedPath.
            using (CaptureStdout capture = new CaptureStdout(capturedPath))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
            }

            // outputFilename is presumably resolved to the expected baseline by the helper.
            TestHelpers.CompareFilesByLine(outputFilename, capturedPath);
        }
Example #12
0
        /// <summary>
        /// Writes 21000 <c>MyRecord</c> rows (every 100th with a null string) with a bloom filter on
        /// column "s", captures the <c>FileDump</c> output produced with "-j -p --rowindex=3" into a
        /// file under <c>workDir</c>, and compares it line by line with "orc-file-dump.json".
        /// </summary>
        public void testJsonDump()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));

            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
            OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
                                            .inspector(inspector)
                                            .stripeSize(100000)
                                            .compress(CompressionKind.ZLIB)
                                            .bufferSize(10000)
                                            .rowIndexStride(1000)
                                            .bloomFilterColumns("s");

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
            {
                Random r1 = new Random(1);
                for (int i = 0; i < 21000; ++i)
                {
                    if (i % 100 == 0)
                    {
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), null));
                    }
                    else
                    {
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                                                   TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
                    }
                }
            }

            const string outputFilename = "orc-file-dump.json";
            string capturedPath = Path.Combine(workDir, outputFilename);

            // Everything FileDump prints while the capture is alive goes to capturedPath.
            using (CaptureStdout capture = new CaptureStdout(capturedPath))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "-j", "-p", "--rowindex=3" });
            }

            // outputFilename is presumably resolved to the expected baseline by the helper.
            TestHelpers.CompareFilesByLine(outputFilename, capturedPath);
        }
Example #13
0
        /// <summary>
        /// Writes 1000 rows that alternate between a <c>SimpleStruct</c> and null, then checks the row
        /// count and raw data sizes (total and per column) reported by the writer and the reader.
        /// </summary>
        public void testOrcSerDeStatsSimpleWithNulls()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(10000)
                                                        .bufferSize(10000)))
            {
                for (int row = 0; row < 1000; row++)
                {
                    if (row % 2 == 0)
                    {
                        writer.addRow(new SimpleStruct(new byte[] { 1, 2, 3 }, "hi"));
                    }
                    else
                    {
                        writer.addRow(null);
                    }
                }

                // The writer is closed explicitly before its statistics are read; the enclosing
                // using statement disposes it again afterwards.
                writer.close();

                // stats from writer
                Assert.Equal(1000, writer.getNumberOfRows());
                Assert.Equal(44500, writer.getRawDataSize());
            }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // stats from reader
            Assert.Equal(1000, reader.getNumberOfRows());
            Assert.Equal(44500, reader.getRawDataSize());
            Assert.Equal(1500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(43000, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(44500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
        }
Example #14
0
        /// <summary>
        /// Round-trips 1080832 pseudo-random longs (seed 1234) through a ZLIB-compressed file and
        /// checks that they are read back unchanged and in order.
        /// </summary>
        public void testBitPack64Large()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            const int size = 1080832;

            // The array is used directly for writing and verification; no second copy is made.
            long[] input = new long[size];
            Random rand  = new Random(1234);

            for (int i = 0; i < size; i++)
            {
                input[i] = rand.NextLong();
            }

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.ZLIB)))
            {
                foreach (long l in input)
                {
                    writer.addRow(l);
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }

                // Without this check the loop above would pass even if rows were missing.
                Assert.Equal(size, idx);
            }
        }
        /// <summary>
        /// Round-trips a fixed pattern of <paramref name="val"/> and 0 through an uncompressed file and
        /// checks that the values are read back unchanged and in order.
        /// </summary>
        /// <param name="val">The non-zero value interleaved with zeros in the input pattern.</param>
        public void testBitPacking(long val)
        {
            long[] input = new long[]
            {
                val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
                val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
                0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
                0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
                val, 0, val, 0, 0, val, 0, val, 0, 0, val, val
            };

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .compress(CompressionKind.NONE)
                                                        .bufferSize(10000)))
            {
                foreach (long l in input)
                {
                    writer.addRow(l);
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }

                // Without this check the loop above would pass even if rows were missing.
                Assert.Equal(input.Length, idx);
            }
        }
        /// <summary>
        /// Writes 21000 <c>MyRecord</c> rows to the test file with ZLIB compression, where every
        /// seventh row carries nulls in most fields, and then calls <c>checkVectorizedReader</c>.
        /// </summary>
        public void createFile()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .stripeSize(100000)
                                                        .compress(CompressionKind.ZLIB)
                                                        .inspector(inspector)
                                                        .bufferSize(10000)
                                                        .rowIndexStride(10000)))
            {
                Random   r1             = new Random(1);
                string[] words          = TestHelpers.words;
                string[] dates          = new string[] { "1991-02-28", "1970-01-31", "1950-04-23" };
                string[] decimalStrings = new string[]
                {
                    "234.443", "10001000", "0.3333367", "67788798.0", "-234.443",
                    "-10001000", "-0.3333367", "-67788798.0", "0"
                };
                for (int i = 0; i < 21000; ++i)
                {
                    if ((i % 7) != 0)
                    {
                        // Fully populated row; dates and decimals cycle through the fixed arrays above.
                        writer.addRow(new MyRecord(((i % 3) == 0), (sbyte)(i % 5), i, (long)200, (short)(300 + i), (double)(400 + i),
                                                   words[r1.Next(words.Length)], new Timestamp(DateTime.Now),
                                                   Date.Parse(dates[i % 3]), HiveDecimal.Parse(decimalStrings[i % decimalStrings.Length])));
                    }
                    else
                    {
                        // Every seventh row keeps only the int and long fields; the rest are null.
                        writer.addRow(new MyRecord(null, null, i, (long)200, null, null, null, null, null, null));
                    }
                }
            }

            // Runs only after the writer and the stream have been disposed.
            checkVectorizedReader();
        }
        /// <summary>
        /// Writes eight <c>MyStruct</c> rows, one with a null int and one with a null string, then
        /// checks the column statistics, the reader's object inspector, which stripe footers contain a
        /// PRESENT stream, and the field values of the first three rows.
        /// </summary>
        public void testColumnsWithNullAndCompression()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            // File.Create truncates a file left over from an earlier run; File.OpenWrite would keep
            // any stale bytes that extend past the newly written data.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .bufferSize(10000)))
            {
                writer.addRow(new MyStruct(3, "a", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(null, "b", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(3, null, false,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(3, "d", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(2, "e", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(2, "f", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(2, "g", true,
                                           Lists.newArrayList(new InnerStruct(100))));
                writer.addRow(new MyStruct(2, "h", true,
                                           Lists.newArrayList(new InnerStruct(100))));
            }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(8, reader.getNumberOfRows());
            Assert.Equal(8, stats[0].getNumberOfValues());

            Assert.Equal(3, ((IntegerColumnStatistics)stats[1]).getMaximum());
            Assert.Equal(2, ((IntegerColumnStatistics)stats[1]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
            Assert.Equal(17, ((IntegerColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17",
                         stats[1].ToString());

            Assert.Equal("h", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal(7, stats[2].getNumberOfValues());
            Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT,
                         readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                // only the last stripe will have PRESENT stream
                List<bool> expected = new List<bool>();
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    expected.Add(false);
                }
                expected[expected.Count - 1] = true;

                List<bool> got = new List<bool>();
                // check if the stripe footer contains PRESENT stream
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
                    got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
                }
                Assert.Equal(expected, got);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal("a", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 2: the int field was written as null
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Equal("b", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 3: the string field was written as null
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal(false, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList<object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));
            }
        }
Example #18
0
        // Writes the test data to an ORC file, then repeatedly stores a serialized
        // two-clause AND predicate (one clause on "userid", one on "subtype") under
        // "hive.io.filter.expr.serialized" and checks how many splits
        // OrcInputFormat.getSplits returns for each predicate.
        public void testSplitEliminationComplexExpr()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));

            using (Stream file = File.OpenWrite(testFilePath))
                using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
                                                            100000, CompressionKind.NONE, 10000, 10000))
                {
                    writeData(writer);
                }

            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "150000");
            InputFormat @in = new OrcInputFormat();

            FileInputFormat.setInputPaths(conf, testFilePath.ToString());

            // predicate expression: userid <= 100 and subtype <= 1000.0
            // first clause: userid <= 100
            GenericUDF           udf       = new GenericUDFOPEqualOrLessThan();
            List <ExprNodeDesc>  childExpr = new List <ExprNodeDesc>();
            ExprNodeColumnDesc   col       = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
            ExprNodeConstantDesc con       = new ExprNodeConstantDesc(100);

            childExpr.Add(col);
            childExpr.Add(con);
            ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            // second clause: subtype <= 1000.0
            GenericUDF           udf1       = new GenericUDFOPEqualOrLessThan();
            List <ExprNodeDesc>  childExpr1 = new List <ExprNodeDesc>();
            ExprNodeColumnDesc   col1       = new ExprNodeColumnDesc(typeof(double), "subtype", "T", false);
            ExprNodeConstantDesc con1       = new ExprNodeConstantDesc(1000.0);

            childExpr1.Add(col1);
            childExpr1.Add(con1);
            ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            // AND node combining the two clauses
            GenericUDF          udf2       = new GenericUDFOPAnd();
            List <ExprNodeDesc> childExpr2 = new List <ExprNodeDesc>();

            childExpr2.Add(en);
            childExpr2.Add(en1);
            ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            string sargStr = Utilities.serializeExpression(en2);

            conf.set("hive.io.filter.expr.serialized", sargStr);
            InputSplit[] splits = @in.getSplits(conf, 1);
            Assert.Equal(2, splits.Length);

            // predicate expression: userid <= 2 and subtype <= 0.0
            // (only the constants are replaced; the column operands at index 0 are reused)
            con          = new ExprNodeConstantDesc(2);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(0.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // no stripe will satisfy the predicate
            Assert.Equal(0, splits.Length);

            // predicate expression: userid <= 2 and subtype <= 1.0
            con          = new ExprNodeConstantDesc(2);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(1.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // only first stripe will satisfy condition and hence single split
            Assert.Equal(1, splits.Length);

            // first clause switches to GenericUDFOPEqual with constant 13; the second
            // clause keeps udf1 (still GenericUDFOPEqualOrLessThan) with constant 80.0
            udf          = new GenericUDFOPEqual();
            con          = new ExprNodeConstantDesc(13);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(80.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // first two stripes will satisfy condition and hence two splits
            Assert.Equal(2, splits.Length);

            // both clauses now use GenericUDFOPEqual: constants 13 (userid) and 80.0 (subtype)
            udf          = new GenericUDFOPEqual();
            con          = new ExprNodeConstantDesc(13);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            udf1          = new GenericUDFOPEqual();
            con1          = new ExprNodeConstantDesc(80.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // only the second stripe will satisfy condition and hence single split
            Assert.Equal(1, splits.Length);
        }
Example #19
0
        // Writes the test data to an ORC file, sets the MAPREDMAXSPLITSIZE setting to
        // "5000", and then checks the number of splits OrcInputFormat.getSplits returns
        // for a series of "userid <= N" predicates stored under
        // "hive.io.filter.expr.serialized".
        public void testSplitEliminationSmallMaxSplit()
        {
            ObjectInspector rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));

            using (Stream file = File.OpenWrite(testFilePath))
            {
                using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, rowInspector,
                                                            100000, CompressionKind.NONE, 10000, 10000))
                {
                    writeData(writer);
                }
            }

            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "5000");
            InputFormat inputFormat = new OrcInputFormat();

            FileInputFormat.setInputPaths(conf, testFilePath.ToString());

            // Initial predicate: userid <= 100. The operand list is built once and only
            // its constant (index 1) is swapped for the later predicates.
            GenericUDF lessOrEqual = new GenericUDFOPEqualOrLessThan();
            List<ExprNodeDesc> operands = new List<ExprNodeDesc>();
            ExprNodeColumnDesc userIdColumn = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
            ExprNodeConstantDesc bound = new ExprNodeConstantDesc(100);

            operands.Add(userIdColumn);
            operands.Add(bound);
            ExprNodeGenericFuncDesc predicate = new ExprNodeGenericFuncDesc(rowInspector, lessOrEqual, operands);
            string serialized = Utilities.serializeExpression(predicate);

            conf.set("hive.io.filter.expr.serialized", serialized);
            InputSplit[] splits = inputFormat.getSplits(conf, 1);
            Assert.Equal(5, splits.Length);

            // Remaining predicates, in order: the upper bound for userid and the number
            // of splits expected for it.
            int[] upperBounds    = { 1, 2, 5, 13, 29, 70 };
            int[] expectedSplits = { 0, 1, 2, 3, 4, 5 };

            for (int step = 0; step < upperBounds.Length; step++)
            {
                bound       = new ExprNodeConstantDesc(upperBounds[step]);
                operands[1] = bound;
                predicate   = new ExprNodeGenericFuncDesc(rowInspector, lessOrEqual, operands);
                serialized  = Utilities.serializeExpression(predicate);
                conf.set("hive.io.filter.expr.serialized", serialized);
                splits = inputFormat.getSplits(conf, 1);
                Assert.Equal(expectedSplits[step], splits.Length);
            }
        }
        // Writes 20000 SimpleStruct rows whose string field is null for whole runs of
        // rows, then checks the hasNull flag (and value counts) reported by the
        // file-level and per-stripe column statistics.
        public void testHasNull()
        {
            ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
            {
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(structInspector)
                                                            .rowIndexStride(1000)
                                                            .stripeSize(10000)
                                                            .bufferSize(10000)))
                {
                    // STRIPE 1: row groups RG1..RG5, 1000 rows each; the entry is the
                    // string value written for every row of that group.
                    string[] rowGroupValues = { "RG1", null, "RG3", null, null };
                    foreach (string groupValue in rowGroupValues)
                    {
                        for (int n = 0; n < 1000; n++)
                        {
                            writer.addRow(new SimpleStruct(bytes(1, 2, 3), groupValue));
                        }
                    }

                    // STRIPE 2, STRIPE 3, STRIPE 4: 5000 rows each.
                    string[] stripeValues = { null, "STRIPE-3", null };
                    foreach (string stripeValue in stripeValues)
                    {
                        for (int n = 0; n < 5000; n++)
                        {
                            writer.addRow(new SimpleStruct(bytes(1, 2, 3), stripeValue));
                        }
                    }
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // check the file level stats
            ColumnStatistics[] fileStats = reader.getStatistics();
            long[] expectedValueCounts = { 20000, 20000, 7000 };
            bool[] expectedFileHasNull = { false, false, true };
            for (int c = 0; c < expectedValueCounts.Length; c++)
            {
                Assert.Equal(expectedValueCounts[c], fileStats[c].getNumberOfValues());
            }
            for (int c = 0; c < expectedFileHasNull.Length; c++)
            {
                Assert.Equal(expectedFileHasNull[c], fileStats[c].hasNull());
            }

            // check the stripe level stats: columns 0 and 1 never report nulls, while
            // column 2 reports nulls in every stripe except the third.
            List<StripeStatistics> perStripe = reader.getStripeStatistics();
            bool[] expectedColumn2HasNull = { true, true, false, true };
            for (int s = 0; s < expectedColumn2HasNull.Length; s++)
            {
                StripeStatistics stripe  = perStripe[s];
                ColumnStatistics column0 = stripe.getColumnStatistics()[0];
                ColumnStatistics column1 = stripe.getColumnStatistics()[1];
                ColumnStatistics column2 = stripe.getColumnStatistics()[2];

                Assert.Equal(false, column0.hasNull());
                Assert.Equal(false, column1.hasNull());
                Assert.Equal(expectedColumn2HasNull[s], column2.hasNull());
            }

#if false
            // Test file dump
            TextWriter       origOut        = System.Console.Out;
            string           outputFilename = "orc-file-has-null.out";
            FileOutputStream myOut          = new FileOutputStream(workDir + File.separator + outputFilename);

            // replace stdout and run command
            System.Console.SetOut(new StreamWriter(myOut));
            FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
            System.Console.Out.Flush();
            System.SetOut(origOut);

            TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
        }
Example #21
0
        // Writes two AllTypesRecord rows to an ORC file, runs FileDump with the "-d"
        // argument while capturing stdout, and checks that exactly two non-empty
        // lines are printed, each matching the expected JSON text for its row.
        public void testDataDump()
        {
            using (Stream file = File.OpenWrite(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.NONE);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    // NOTE(review): the same dictionary instance is cleared and reused for the
                    // second row, yet the expected output below still shows k1 for row 1 --
                    // presumably addRow consumes the map contents before returning; confirm.
                    Dictionary <string, string> m = new Dictionary <string, string>(2);
                    m.Add("k1", "v1");
                    writer.addRow(new AllTypesRecord(
                                      true,
                                      (sbyte)10,
                                      (short)100,
                                      1000,
                                      10000L,
                                      4.0f,
                                      20.0,
                                      HiveDecimal.Parse("4.2222"),
                                      new Timestamp(1416967764000L),
                                      new Date(1416967764000L),
                                      "string",
                                      m,
                                      new List <int> {
                        100, 200
                    },
                                      new AllTypesRecord.Struct(10, "foo")));
                    m.Clear();
                    m.Add("k3", "v3");
                    writer.addRow(new AllTypesRecord(
                                      false,
                                      (sbyte)20,
                                      (short)200,
                                      2000,
                                      20000L,
                                      8.0f,
                                      40.0,
                                      HiveDecimal.Parse("2.2222"),
                                      new Timestamp(1416967364000L),
                                      new Date(1411967764000L),
                                      "abcd",
                                      m,
                                      new List <int> {
                        200, 300
                    },
                                      new AllTypesRecord.Struct(20, "bar")));
                }
            }

            string[] lines;
            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath, "-d");

                // one entry per non-empty output line
                lines = capture.Text.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
            }
            Assert.Equal(2, lines.Length);

            // Don't be fooled by the big space in the middle, this line is quite long
            // NOTE(review): the "c" and "vc" values are not passed to the AllTypesRecord
            // constructor above; presumably the record type supplies them -- confirm.
            Assert.Equal("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello                                                                                                                                                                                                                                                          \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
            Assert.Equal("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world                                                                                                                                                                                                                                                          \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
        }
Example #22
0
        // Writes four SimpleStruct rows (one with a null string, one with null bytes),
        // then checks the row count and raw data sizes reported by writer and reader,
        // the binary and string column statistics, the reader's type name, and the
        // contents of every row read back.
        public void testStringAndBinaryStatistics()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .bufferSize(10000)))
                {
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
                    writer.addRow(new SimpleStruct(null, "hi"));
                    // closed explicitly so the totals below are queried on a closed writer;
                    // the enclosing using block disposes the writer again afterwards
                    writer.close();

                    Assert.Equal(4, writer.getNumberOfRows());
                    Assert.Equal(273, writer.getRawDataSize());
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            Assert.Equal(4, reader.getNumberOfRows());
            Assert.Equal(273, reader.getRawDataSize());
            // the per-column raw sizes (15 + 258) add up to the file total of 273
            Assert.Equal(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(4, stats[0].getNumberOfValues());
            Assert.Equal("count: 4 hasNull: False", stats[0].ToString());

            // binary column: 3 non-null values; sum 15 = 5 + 4 + 6 bytes written above
            Assert.Equal(3, stats[1].getNumberOfValues());
            Assert.Equal(15, ((BinaryColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 3 hasNull: True sum: 15", stats[1].ToString());

            // string column: 3 non-null values; sum 8 = lengths of "foo", "bar" and "hi"
            Assert.Equal(3, stats[2].getNumberOfValues());
            Assert.Equal("bar", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal("hi", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal(8, ((StringColumnStatistics)stats[2]).getSum());
            Assert.Equal("count: 3 hasNull: True min: bar max: hi sum: 8",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector =
                (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
            Assert.Equal("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
            // field inspectors used below to extract the binary and string values of each row
            IList <StructField>   fields = readerInspector.getAllStructFieldRefs();
            BinaryObjectInspector bi     = (BinaryObjectInspector)readerInspector.
                                           getStructFieldRef("bytes1").getFieldObjectInspector();
            StringObjectInspector st = (StringObjectInspector)readerInspector.
                                       getStructFieldRef("string1").getFieldObjectInspector();

            using (RecordReader rows = reader.rows())
            {
                object row = rows.next();
                Assert.NotNull(row);
                // check the contents of the first row
                Assert.Equal(bytes(0, 1, 2, 3, 4), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("foo", st.getPrimitiveJavaObject(readerInspector.
                                                              getStructFieldData(row, fields[1])));

                // check the contents of second row
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Equal(bytes(0, 1, 2, 3), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("bar", st.getPrimitiveJavaObject(readerInspector.
                                                              getStructFieldData(row, fields[1])));

                // check the contents of third row (null string)
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Equal(bytes(0, 1, 2, 3, 4, 5), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Null(st.getPrimitiveJavaObject(readerInspector.
                                                      getStructFieldData(row, fields[1])));

                // check the contents of fourth row (null bytes)
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Null(bi.get(
                                readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("hi", st.getPrimitiveJavaObject(readerInspector.
                                                             getStructFieldData(row, fields[1])));

                // no rows remain after the fourth
                Assert.Equal(false, rows.hasNext());
            }
        }
        // Writes 20000 MyStruct rows where only the first and last rows carry null
        // "a"/"b" fields, then checks the column statistics, the reader's type name,
        // which stripe footers mention a PRESENT stream, and the contents of the
        // first, second-to-last and last rows.
        public void testMultiStripeWithNull()
        {
            ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
            {
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(structInspector)
                                                            .stripeSize(100000)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rng = new Random(100);

                    // first row: null int and string fields
                    writer.addRow(new MyStruct(null, null, true,
                                               new List<InnerStruct> { new InnerStruct(100) }));

                    // 19998 fully populated rows; rng.Next(1) can only yield 0
                    for (int remaining = 19998; remaining > 0; remaining--)
                    {
                        writer.addRow(new MyStruct(rng.Next(1), "a", true,
                                                   new List<InnerStruct> { new InnerStruct(100) }));
                    }

                    // last row: null int and string fields again
                    writer.addRow(new MyStruct(null, null, true,
                                               new List<InnerStruct> { new InnerStruct(100) }));
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] columnStats = reader.getStatistics();
            Assert.Equal(20000, reader.getNumberOfRows());
            Assert.Equal(20000, columnStats[0].getNumberOfValues());

            IntegerColumnStatistics intStats = (IntegerColumnStatistics)columnStats[1];
            Assert.Equal(0, intStats.getMaximum());
            Assert.Equal(0, intStats.getMinimum());
            Assert.Equal(true, intStats.isSumDefined());
            Assert.Equal(0, intStats.getSum());
            Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0",
                         columnStats[1].ToString());

            StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[2];
            Assert.Equal("a", stringStats.getMaximum());
            Assert.Equal("a", stringStats.getMinimum());
            Assert.Equal(19998, columnStats[2].getNumberOfValues());
            Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998",
                         columnStats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                // one flag per stripe; only the first and last stripe will have PRESENT stream
                List<bool> expectedPresent = new List<bool>();
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    expectedPresent.Add(false);
                }
                expectedPresent[0] = true;
                expectedPresent[expectedPresent.Count - 1] = true;

                // check whether each stripe footer's text mentions the PRESENT stream kind
                List<bool> actualPresent = new List<bool>();
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    string footerText  = footer.ToString();
                    string presentName = OrcProto.Stream.Types.Kind.PRESENT.ToString();
                    actualPresent.Add(footerText.IndexOf(presentName) != -1);
                }
                Assert.Equal(expectedPresent, actualPresent);

                IList<object> innerList;
                OrcStruct     innerStruct;

                // row 1
                OrcStruct current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.Null(current.getFieldValue(0));
                Assert.Null(current.getFieldValue(1));
                Assert.Equal(true, current.getFieldValue(2));
                innerList   = (IList<object>)current.getFieldValue(3);
                innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));

                // last-1 row
                rows.seekToRow(19998);
                current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.NotNull(current.getFieldValue(1));
                Assert.Equal(0, current.getFieldValue(0));
                Assert.Equal(true, current.getFieldValue(2));
                innerList   = (IList<object>)current.getFieldValue(3);
                innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));

                // last row
                current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.Null(current.getFieldValue(0));
                Assert.Null(current.getFieldValue(1));
                Assert.Equal(true, current.getFieldValue(2));
                innerList   = (IList<object>)current.getFieldValue(3);
                innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));
            }
        }
Example #24
0
        // Writes two BigRow records using the old V_0_11 file version, then checks:
        //   * the row count and raw data size reported by the writer after close(),
        //   * the same totals plus per-column raw data sizes reported by a reader
        //     opened on the written file,
        //   * a selection of the column statistics exposed by that reader.
        public void testOrcSerDeStatsComplexOldFormat()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(BigRow));

            long rawDataSize;

            // File.Create truncates any file already present at TestFilePath. File.OpenWrite
            // would keep the old bytes past the newly written length, so a longer leftover
            // file from an earlier run could leave stale data in the file the reader opens.
            using (Stream file = File.Create(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .version(OrcFile.Version.V_0_11)
                                                        .bufferSize(10000)))
            {
                // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
                writer.addRow(new BigRow(false, (sbyte)1, (short)1024, 65536,
                                         Int64.MaxValue, (float)1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
                                         new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                         list(inner(3, "good"), inner(4, "bad")),
                                         map(), Timestamp.Parse("2000-03-12 15:00:00"), HiveDecimal.Parse(
                                             "12345678.6547456")));
                // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
                // 97
                writer.addRow(new BigRow(true, (sbyte)100, (short)2048, 65536,
                                         Int64.MaxValue, (float)2.0, -5.0, bytes(), "bye",
                                         new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                         list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
                                         map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.Parse("2000-03-11 15:00:00"),
                                         HiveDecimal.Parse("12345678.6547452")));

                // The writer is closed explicitly before its totals are queried; the
                // enclosing using statement disposes it again afterwards.
                writer.close();

                long rowCount = writer.getNumberOfRows();
                rawDataSize = writer.getRawDataSize();
                Assert.Equal(2, rowCount);
                Assert.Equal(1740, rawDataSize);
            }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // Totals seen by the reader must match what the writer reported.
            Assert.Equal(2, reader.getNumberOfRows());
            Assert.Equal(1740, reader.getRawDataSize());

            // Raw data size of each individual column.
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
            Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
            Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
            Assert.Equal(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
            Assert.Equal(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
            Assert.Equal(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
            Assert.Equal(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
            Assert.Equal(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));

            // Raw data size of column combinations.
            Assert.Equal(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
            Assert.Equal(1195,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
            Assert.Equal(185,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));

            // Asking for every column must add up to the writer's overall raw data size.
            Assert.Equal(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1",
                                                                                        "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list",
                                                                                        "map", "middle", "ts", "decimal1")));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();

            // stats[1]: boolean statistics, one false and one true value were written.
            Assert.Equal(2, stats[1].getNumberOfValues());
            BooleanColumnStatistics booleanStats = (BooleanColumnStatistics)stats[1];
            Assert.Equal(1, booleanStats.getFalseCount());
            Assert.Equal(1, booleanStats.getTrueCount());
            Assert.Equal("count: 2 hasNull: False true: 1", stats[1].ToString());

            // stats[3]: integer statistics matching the short values 1024 and 2048 written above.
            IntegerColumnStatistics shortStats = (IntegerColumnStatistics)stats[3];
            Assert.Equal(2048, shortStats.getMaximum());
            Assert.Equal(1024, shortStats.getMinimum());
            Assert.True(shortStats.isSumDefined());
            Assert.Equal(3072, shortStats.getSum());
            Assert.Equal("count: 2 hasNull: False min: 1024 max: 2048 sum: 3072",
                         stats[3].ToString());

            // stats[5]: both rows hold Int64.MaxValue, and the sum is expected to be undefined.
            IntegerColumnStatistics longStats = (IntegerColumnStatistics)stats[5];
            Assert.Equal(Int64.MaxValue, longStats.getMaximum());
            Assert.Equal(Int64.MaxValue, longStats.getMinimum());
            Assert.False(longStats.isSumDefined());
            Assert.Equal("count: 2 hasNull: False min: 9223372036854775807 max: 9223372036854775807",
                         stats[5].ToString());

            // stats[7]: double statistics matching the values -15.0 and -5.0 written above.
            DoubleColumnStatistics doubleStats = (DoubleColumnStatistics)stats[7];
            Assert.Equal(-15.0, doubleStats.getMinimum());
            Assert.Equal(-5.0, doubleStats.getMaximum());
            Assert.Equal(-20.0, doubleStats.getSum(), 5);
            Assert.Equal("count: 2 hasNull: False min: -15 max: -5 sum: -20",
                         stats[7].ToString());

            // stats[8]: binary statistics; the two rows carry 5 and 0 bytes respectively.
            Assert.Equal(5, ((BinaryColumnStatistics)stats[8]).getSum());
            Assert.Equal("count: 2 hasNull: False sum: 5", stats[8].ToString());

            // stats[9]: string statistics for the values "hi" and "bye".
            StringColumnStatistics stringStats = (StringColumnStatistics)stats[9];
            Assert.Equal("bye", stringStats.getMinimum());
            Assert.Equal("hi", stringStats.getMaximum());
            Assert.Equal(5, stringStats.getSum());
            Assert.Equal("count: 2 hasNull: False min: bye max: hi sum: 5", stats[9].ToString());
        }