// Reads the bundled "orc-file-11-format.orc" resource while readerTimeZone is in effect and
// checks that the "ts" column of the first row and of row 7499 yields the same expected
// timestamps for whatever time zone the test is run with.
public void testReadTimestampFormat_0_11(string readerTimeZone)
        {
            string oldFilePath = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");

            // NOTE(review): SetTimeZoneInfo presumably overrides the local time zone until the
            // returned object is disposed - confirm against TestHelpers.
            using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
            {
                Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf));

                StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
                IList<StructField> fields = readerInspector.getAllStructFieldRefs();
                TimestampObjectInspector tso = (TimestampObjectInspector)readerInspector
                    .getStructFieldRef("ts").getFieldObjectInspector();

                using (RecordReader rows = reader.rows())
                {
                    // check the contents of the first row
                    object row = rows.next();
                    Assert.NotNull(row);
                    Assert.Equal(Timestamp.Parse("2000-03-12 15:00:00"),
                                 tso.getPrimitiveJavaObject(
                                     readerInspector.getStructFieldData(row, fields[12])));

                    // jump to row 7499 and check its contents
                    Assert.True(rows.hasNext());
                    rows.seekToRow(7499);
                    row = rows.next();
                    Assert.Equal(Timestamp.Parse("2000-03-12 15:00:01"),
                                 tso.getPrimitiveJavaObject(
                                     readerInspector.getStructFieldData(row, fields[12])));

                    // row 7499 is expected to be the final row of the file
                    Assert.False(rows.hasNext());
                }
            }
        }
Esempio n. 2
0
        // Writes 1000 MapStruct rows, each holding a ten-entry string->double map, and checks
        // that writer and reader report the same row count and raw data size.
        public void testOrcSerDeStatsMap()
        {
            const int expectedRows = 1000;
            const int expectedRawSize = 950000;

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MapStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(10000)
                                                        .bufferSize(10000)))
            {
                for (int rowIndex = 0; rowIndex < expectedRows; rowIndex++)
                {
                    // a fresh map per row: keys "hi0" .. "hi9", every value 2.0
                    Dictionary<string, double> map = new Dictionary<string, double>();
                    for (int k = 0; k < 10; k++)
                    {
                        map["hi" + k] = 2.0;
                    }
                    writer.addRow(new MapStruct(map));
                }
                writer.close();

                // stats reported by the writer after close()
                Assert.Equal(expectedRows, writer.getNumberOfRows());
                Assert.Equal(expectedRawSize, writer.getRawDataSize());
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // stats reported by the reader must match the writer's
            Assert.Equal(expectedRows, reader.getNumberOfRows());
            Assert.Equal(expectedRawSize, reader.getRawDataSize());
            Assert.Equal(expectedRawSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
        }
        // Sets up the record reader for one file split: remembers the split's path and its
        // [start, end) byte range, opens an ORC reader on the path's file system, and prepares
        // the stripe enumerator together with the per-stripe statistics.
        public OrcFileStripeMergeRecordReader(Configuration conf, FileSplit split)
        {
            path  = split.getPath();
            start = split.getStart();
            end   = start + split.getLength();

            FileSystem fileSystem = path.getFileSystem(conf);
            reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fileSystem));

            iter      = reader.getStripes().GetEnumerator();
            stripeIdx = 0;

            // the protobuf stripe statistics are only reachable through the concrete ReaderImpl
            stripeStatistics = ((ReaderImpl)reader).getOrcProtoStripeStatistics();
        }
        // Writes 20000 string rows drawn from 10000 possible values, reads them back, and
        // checks that every column of every stripe is DICTIONARY_V2 encoded.
        public void testHalfDistinctCheckDisabled()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            int[] input = new int[20000];

            // NOTE(review): the switch below is commented out although the test name says the
            // check is disabled - confirm which configuration this test is meant to cover.
            // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.NONE)
                                                        .bufferSize(10000)))
            {
                // fixed seed keeps the generated data reproducible
                Random rand = new Random(123);
                for (int i = 0; i < input.Length; i++)
                {
                    input[i] = rand.Next(10000);
                }

                for (int i = 0; i < input.Length; i++)
                {
                    writer.addRow(input[i].ToString());
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++].ToString(), row);
                }

                // without this the loop above would pass even if rows were missing
                Assert.Equal(input.Length, idx);

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // readStripeFooter is only available on the concrete RecordReaderImpl,
                    // hence the cast
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
                    }
                }
            }
        }
        // Writes a fixed list of timestamps while writerTimeZone is in effect, reads the file
        // back while readerTimeZone is in effect, and checks that each value prints exactly as
        // the string it was parsed from.
        public void testTimestampWriter(string writerTimeZone, string readerTimeZone)
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(Timestamp));
            List<string> ts = new List<string>
            {
                "2003-01-01 01:00:00.000000222",
                "1996-08-02 09:00:00.723100809",
                "1999-01-01 02:00:00.999999999",
                "1995-01-02 03:00:00.688888888",
                "2002-01-01 04:00:00.1",
                "2010-03-02 05:00:00.000009001",
                "2005-01-01 06:00:00.000002229",
                "2006-01-01 07:00:00.900203003",
                "2003-01-01 08:00:00.800000007",
                "1998-11-02 10:00:00.857340643",
                "2008-10-02 11:00:00.0",
                "2037-01-01 00:00:00.000999",
                "2014-03-28 00:00:00.0"
            };

            // NOTE(review): the writer is created before the writerTimeZone scope is entered
            // and disposed after it has been left - confirm this ordering is intended.
            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .bufferSize(10000)))
            using (TestHelpers.SetTimeZoneInfo(writerTimeZone))
            {
                foreach (string t in ts)
                {
                    writer.addRow(Timestamp.Parse(t));
                }
            }

            using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
            {
                Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
                using (RecordReader rows = reader.rows(null))
                {
                    int idx = 0;
                    while (rows.hasNext())
                    {
                        object    row = rows.next();
                        Timestamp got = (Timestamp)row;
                        Assert.Equal(ts[idx++], got.ToString());
                    }

                    // without this the loop above would pass even if rows were missing
                    Assert.Equal(ts.Count, idx);
                }
            }
        }
        // Writes 20000 distinct string rows with the V_0_11 file version, reads them back, and
        // checks that every column of every stripe is still DICTIONARY encoded.
        public void testTooManyDistinctV11AlwaysDictionary()
        {
            const int rowCount = 20000;

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.NONE)
                                                        .version(OrcFile.Version.V_0_11)
                                                        .bufferSize(10000)))
            {
                for (int i = 0; i < rowCount; i++)
                {
                    writer.addRow(i.ToString());
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal((idx++).ToString(), row);
                }

                // without this the loop above would pass even if rows were missing
                Assert.Equal(rowCount, idx);

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // readStripeFooter is only available on the concrete RecordReaderImpl,
                    // hence the cast
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
                    }
                }
            }
        }
Esempio n. 7
0
 // Returns true when the list is non-empty and an ORC reader can be created for every file
 // in it; returns false for an empty list or as soon as creating a reader throws an
 // IOException. Any other exception type propagates to the caller.
 public bool validateInput(FileSystem fs, HiveConf conf, List <FileStatus> files)
 {
     if (files.Count <= 0)
     {
         return false;
     }
     foreach (FileStatus file in files)
     {
         try
         {
             // only the side effect matters: creating the reader throws on unreadable input
             OrcFile.createReader(file.getPath(),
                                  OrcFile.readerOptions(conf).filesystem(fs));
         }
         catch (System.IO.IOException)
         {
             // deliberate: a file that cannot be opened as ORC makes the whole input invalid
             return false;
         }
     }
     return true;
 }
Esempio n. 8
0
        getRecordReader(InputSplit inputSplit, JobConf conf,
                        Reporter reporter)
        {
            // Creates a VectorizedOrcRecordReader for the file referenced by the given split.
            // The split is cast to FileSplit unconditionally, so any other InputSplit
            // implementation fails here with an InvalidCastException.
            FileSplit fSplit = (FileSplit)inputSplit;

            // publish the split description as the current status
            reporter.setStatus(fSplit.ToString());

            Path path = fSplit.getPath();

            OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
            if (fSplit is OrcSplit)
            {
                OrcSplit orcSplit = (OrcSplit)fSplit;
                // when the split already carries the file's meta info, hand it to the reader
                // options (presumably so the reader need not re-read the file tail - confirm)
                if (orcSplit.hasFooter())
                {
                    opts.fileMetaInfo(orcSplit.getFileMetaInfo());
                }
            }
            Reader reader = OrcFile.createReader(path, opts);

            return(new VectorizedOrcRecordReader(reader, conf, fSplit));
        }
Esempio n. 9
0
        // Writes 1000 rows that alternate between a SimpleStruct and a null row, then checks
        // the row count and the raw data sizes reported by the writer and by the reader.
        public void testOrcSerDeStatsSimpleWithNulls()
        {
            const int expectedRows = 1000;
            const int expectedRawSize = 44500;

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(10000)
                                                        .bufferSize(10000)))
            {
                for (int rowIndex = 0; rowIndex < expectedRows; rowIndex++)
                {
                    // odd rows are written as null, even rows carry data
                    if (rowIndex % 2 != 0)
                    {
                        writer.addRow(null);
                    }
                    else
                    {
                        writer.addRow(new SimpleStruct(new byte[] { 1, 2, 3 }, "hi"));
                    }
                }
                writer.close();

                // stats reported by the writer after close()
                Assert.Equal(expectedRows, writer.getNumberOfRows());
                Assert.Equal(expectedRawSize, writer.getRawDataSize());
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // stats reported by the reader
            Assert.Equal(expectedRows, reader.getNumberOfRows());
            Assert.Equal(expectedRawSize, reader.getRawDataSize());
            Assert.Equal(1500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(43000, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(expectedRawSize,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
        }
Esempio n. 10
0
        // Round-trips 1080832 pseudo-random 64-bit values through a ZLIB-compressed file and
        // checks that they are read back unchanged and in order.
        public void testBitPack64Large()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            const int size = 1080832;

            // fixed seed keeps the generated data reproducible
            long[] input = new long[size];
            Random rand  = new Random(1234);

            for (int i = 0; i < size; i++)
            {
                input[i] = rand.NextLong();
            }

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .compress(CompressionKind.ZLIB)))
            {
                foreach (long l in input)
                {
                    writer.addRow(l);
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }

                // without this the loop above would pass even if rows were missing
                Assert.Equal(size, idx);
            }
        }
        // Round-trips a fixed pattern of val and 0 through an uncompressed file and checks
        // that the values are read back unchanged and in order.
        public void testBitPacking(long val)
        {
            long[] input = new long[]
            {
                val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
                val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
                0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
                0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
                val, 0, val, 0, 0, val, 0, val, 0, 0, val, val
            };

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .compress(CompressionKind.NONE)
                                                        .bufferSize(10000)))
            {
                foreach (long l in input)
                {
                    writer.addRow(l);
                }
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }

                // without this the loop above would pass even if rows were missing
                Assert.Equal(input.Length, idx);
            }
        }
        // Writes 20000 MyStruct rows whose first and last rows have null in their first two
        // fields, then checks the file statistics, the reader schema, which stripes contain a
        // PRESENT stream, and the contents of the first, second-to-last and last rows.
        public void testMultiStripeWithNull()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .compress(CompressionKind.NONE)
                                                        .bufferSize(10000)))
            {
                Random rand = new Random(100);

                // leading row with nulls
                writer.addRow(new MyStruct(null, null, true,
                                           new List<InnerStruct> { new InnerStruct(100) }));

                // 19998 fully populated rows in between
                for (int n = 0; n < 19998; n++)
                {
                    writer.addRow(new MyStruct(rand.Next(1), "a", true,
                                               new List<InnerStruct> { new InnerStruct(100) }));
                }

                // trailing row with nulls
                writer.addRow(new MyStruct(null, null, true,
                                           new List<InnerStruct> { new InnerStruct(100) }));
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // file-level statistics
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(20000, reader.getNumberOfRows());
            Assert.Equal(20000, stats[0].getNumberOfValues());

            IntegerColumnStatistics intStats = (IntegerColumnStatistics)stats[1];
            Assert.Equal(0, intStats.getMaximum());
            Assert.Equal(0, intStats.getMinimum());
            Assert.Equal(true, intStats.isSumDefined());
            Assert.Equal(0, intStats.getSum());
            Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0",
                         stats[1].ToString());

            StringColumnStatistics stringStats = (StringColumnStatistics)stats[2];
            Assert.Equal("a", stringStats.getMaximum());
            Assert.Equal("a", stringStats.getMinimum());
            Assert.Equal(19998, stats[2].getNumberOfValues());
            Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998",
                         stats[2].ToString());

            // schema as seen through the reader's object inspector
            StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();
            Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                // one flag per stripe; only the first and last stripe will have PRESENT stream
                List<bool> expectedPresent = new List<bool>();
                foreach (StripeInformation unused in reader.getStripes())
                {
                    expectedPresent.Add(false);
                }
                expectedPresent[0] = true;
                expectedPresent[expectedPresent.Count - 1] = true;

                // check if the stripe footer text mentions a PRESENT stream
                List<bool> actualPresent = new List<bool>();
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    string footerText  = footer.ToString();
                    string presentName = OrcProto.Stream.Types.Kind.PRESENT.ToString();
                    actualPresent.Add(footerText.IndexOf(presentName) >= 0);
                }
                Assert.Equal(expectedPresent, actualPresent);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                IList<object> items = (IList<object>)row.getFieldValue(3);
                Assert.Equal(100, ((OrcStruct)items[0]).getFieldValue(0));

                // last-1 row
                rows.seekToRow(19998);
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.NotNull(row.getFieldValue(1));
                Assert.Equal(0, row.getFieldValue(0));
                Assert.Equal(true, row.getFieldValue(2));
                items = (IList<object>)row.getFieldValue(3);
                Assert.Equal(100, ((OrcStruct)items[0]).getFieldValue(0));

                // last row
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                items = (IList<object>)row.getFieldValue(3);
                Assert.Equal(100, ((OrcStruct)items[0]).getFieldValue(0));
            }
        }
Esempio n. 13
0
        // Opens the bundled "orc-file-11-format.orc" resource and checks that its stripes are
        // laid out back to back, that the stripe row counts add up to the file's row count, and
        // that the column statistics have the expected values.
        public void testSerdeStatsOldFormat()
        {
            string testFile = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");
            Reader reader   = OrcFile.createReader(testFile, OrcFile.readerOptions(conf));

            int  stripeCount   = 0;
            int  rowCount      = 0;
            long currentOffset = -1;   // negative until the first stripe has been seen

            foreach (StripeInformation stripe in reader.getStripes())
            {
                stripeCount++;
                rowCount += (int)stripe.getNumberOfRows();

                if (currentOffset >= 0)
                {
                    // every later stripe must start where the previous one ended
                    Assert.Equal(currentOffset, stripe.getOffset());
                    currentOffset += stripe.getIndexLength() + stripe.getDataLength()
                                     + stripe.getFooterLength();
                }
                else
                {
                    // first stripe: remember where it ends
                    currentOffset = stripe.getOffset() + stripe.getIndexLength()
                                    + stripe.getDataLength() + stripe.getFooterLength();
                }
            }
            Assert.Equal(reader.getNumberOfRows(), rowCount);
#if JAVA_SIZE
            Assert.Equal(6300000, reader.getRawDataSize());
#endif
            Assert.Equal(2, stripeCount);

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();

            Assert.Equal(7500, stats[1].getNumberOfValues());
            BooleanColumnStatistics boolStats = (BooleanColumnStatistics)stats[1];
            Assert.Equal(3750, boolStats.getFalseCount());
            Assert.Equal(3750, boolStats.getTrueCount());
            Assert.Equal("count: 7500 hasNull: True true: 3750", stats[1].ToString());

            IntegerColumnStatistics shortStats = (IntegerColumnStatistics)stats[3];
            Assert.Equal(2048, shortStats.getMaximum());
            Assert.Equal(1024, shortStats.getMinimum());
            Assert.Equal(true, shortStats.isSumDefined());
            Assert.Equal(11520000, shortStats.getSum());
            Assert.Equal("count: 7500 hasNull: True min: 1024 max: 2048 sum: 11520000",
                         stats[3].ToString());

            IntegerColumnStatistics longStats = (IntegerColumnStatistics)stats[5];
            Assert.Equal(Int64.MaxValue, longStats.getMaximum());
            Assert.Equal(Int64.MaxValue, longStats.getMinimum());
            Assert.Equal(false, longStats.isSumDefined());
            Assert.Equal(
                "count: 7500 hasNull: True min: 9223372036854775807 max: 9223372036854775807",
                stats[5].ToString());

            DoubleColumnStatistics doubleStats = (DoubleColumnStatistics)stats[7];
            Assert.Equal(-15.0, doubleStats.getMinimum());
            Assert.Equal(-5.0, doubleStats.getMaximum());
            Assert.Equal(-75000.0, doubleStats.getSum(), 5);
            Assert.Equal("count: 7500 hasNull: True min: -15 max: -5 sum: -75000",
                         stats[7].ToString());

            StringColumnStatistics stringStats = (StringColumnStatistics)stats[9];
            Assert.Equal("bye", stringStats.getMinimum());
            Assert.Equal("hi", stringStats.getMaximum());
            Assert.Equal(0, stringStats.getSum());
            Assert.Equal("count: 7500 hasNull: True min: bye max: hi sum: 0", stats[9].ToString());

            // old orc format will not have binary statistics. ToString() will show only
            // the general column statistics
            Assert.Equal("count: 7500 hasNull: True", stats[8].ToString());

            // since old orc format doesn't support binary statistics,
            // the cast itself is expected to fail with InvalidCastException
            Assert.Throws<InvalidCastException>(() => ((BinaryColumnStatistics)stats[8]).getSum());
        }
Esempio n. 14
0
        // Writes two BigRow rows with the V_0_11 file version and checks the row count, the
        // raw data size (overall, per column and for column groups) and the column statistics
        // reported after reading the file back.
        public void testOrcSerDeStatsComplexOldFormat()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(BigRow));

            long rawDataSize;

            using (Stream file = File.OpenWrite(TestFilePath))
            using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                        .inspector(inspector)
                                                        .stripeSize(100000)
                                                        .version(OrcFile.Version.V_0_11)
                                                        .bufferSize(10000)))
            {
                // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
                writer.addRow(new BigRow(false, (sbyte)1, (short)1024, 65536,
                                         Int64.MaxValue, (float)1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
                                         new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                         list(inner(3, "good"), inner(4, "bad")),
                                         map(), Timestamp.Parse("2000-03-12 15:00:00"),
                                         HiveDecimal.Parse("12345678.6547456")));
                // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
                // 97
                writer.addRow(new BigRow(true, (sbyte)100, (short)2048, 65536,
                                         Int64.MaxValue, (float)2.0, -5.0, bytes(), "bye",
                                         new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                         list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
                                         map(inner(5, "chani"), inner(1, "mauddib")),
                                         Timestamp.Parse("2000-03-11 15:00:00"),
                                         HiveDecimal.Parse("12345678.6547452")));
                writer.close();

                // stats reported by the writer after close()
                long writtenRows = writer.getNumberOfRows();
                rawDataSize = writer.getRawDataSize();
                Assert.Equal(2, writtenRows);
                Assert.Equal(1740, rawDataSize);
            }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            Assert.Equal(2, reader.getNumberOfRows());
            Assert.Equal(1740, reader.getRawDataSize());

            // raw size of each column on its own; the two arrays are index-aligned
            string[] columns =
            {
                "boolean1", "byte1", "short1", "int1", "long1", "float1", "double1",
                "bytes1", "string1", "list", "map", "middle", "ts", "decimal1"
            };
            int[] expectedSizes = { 8, 8, 8, 8, 16, 8, 16, 5, 172, 455, 368, 364, 80, 224 };
            for (int c = 0; c < columns.Length; c++)
            {
                Assert.Equal(expectedSizes[c],
                             reader.getRawDataSizeOfColumns(Lists.newArrayList(columns[c])));
            }

            // raw size of column groups
            Assert.Equal(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
            Assert.Equal(1195,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
            Assert.Equal(185,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
            Assert.Equal(rawDataSize,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList(
                             "boolean1", "byte1", "short1", "int1", "long1", "float1", "double1",
                             "bytes1", "string1", "list", "map", "middle", "ts", "decimal1")));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();

            Assert.Equal(2, stats[1].getNumberOfValues());
            BooleanColumnStatistics boolStats = (BooleanColumnStatistics)stats[1];
            Assert.Equal(1, boolStats.getFalseCount());
            Assert.Equal(1, boolStats.getTrueCount());
            Assert.Equal("count: 2 hasNull: False true: 1", stats[1].ToString());

            IntegerColumnStatistics shortStats = (IntegerColumnStatistics)stats[3];
            Assert.Equal(2048, shortStats.getMaximum());
            Assert.Equal(1024, shortStats.getMinimum());
            Assert.Equal(true, shortStats.isSumDefined());
            Assert.Equal(3072, shortStats.getSum());
            Assert.Equal("count: 2 hasNull: False min: 1024 max: 2048 sum: 3072",
                         stats[3].ToString());

            IntegerColumnStatistics longStats = (IntegerColumnStatistics)stats[5];
            Assert.Equal(Int64.MaxValue, longStats.getMaximum());
            Assert.Equal(Int64.MaxValue, longStats.getMinimum());
            Assert.Equal(false, longStats.isSumDefined());
            Assert.Equal("count: 2 hasNull: False min: 9223372036854775807 max: 9223372036854775807",
                         stats[5].ToString());

            DoubleColumnStatistics doubleStats = (DoubleColumnStatistics)stats[7];
            Assert.Equal(-15.0, doubleStats.getMinimum());
            Assert.Equal(-5.0, doubleStats.getMaximum());
            Assert.Equal(-20.0, doubleStats.getSum(), 5);
            Assert.Equal("count: 2 hasNull: False min: -15 max: -5 sum: -20",
                         stats[7].ToString());

            Assert.Equal(5, ((BinaryColumnStatistics)stats[8]).getSum());
            Assert.Equal("count: 2 hasNull: False sum: 5", stats[8].ToString());

            StringColumnStatistics stringStats = (StringColumnStatistics)stats[9];
            Assert.Equal("bye", stringStats.getMinimum());
            Assert.Equal("hi", stringStats.getMaximum());
            Assert.Equal(5, stringStats.getSum());
            Assert.Equal("count: 2 hasNull: False min: bye max: hi sum: 5", stats[9].ToString());
        }
Esempio n. 15
0
        /// <summary>
        /// Writes four <c>SimpleStruct</c> rows (one with a null string, one with null bytes)
        /// to the test file, then reads the file back and verifies the row count, raw data
        /// sizes, per-column statistics, the reader's type name and the contents of every row.
        /// </summary>
        public void testStringAndBinaryStatistics()
        {
            ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream output = File.OpenWrite(TestFilePath))
            {
                using (Writer orcWriter = OrcFile.createWriter(
                           TestFilePath,
                           output,
                           OrcFile.writerOptions(conf)
                           .inspector(structInspector)
                           .stripeSize(100000)
                           .bufferSize(10000)))
                {
                    orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
                    orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
                    orcWriter.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
                    orcWriter.addRow(new SimpleStruct(null, "hi"));
                    orcWriter.close();

                    // The totals are queried on the writer after close() has been called.
                    Assert.Equal(4, orcWriter.getNumberOfRows());
                    Assert.Equal(273, orcWriter.getRawDataSize());
                }
            }

            Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // The reader must report the same totals the writer did, and the
            // per-column raw sizes (15 + 258) must add up to the file total (273).
            Assert.Equal(4, orcReader.getNumberOfRows());
            Assert.Equal(273, orcReader.getRawDataSize());
            Assert.Equal(15, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(258, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(273, orcReader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

            // Column statistics: index 0 is the enclosing struct, 1 is bytes1, 2 is string1.
            ColumnStatistics[] columnStats = orcReader.getStatistics();

            ColumnStatistics structStats = columnStats[0];
            Assert.Equal(4, structStats.getNumberOfValues());
            Assert.Equal("count: 4 hasNull: False", structStats.ToString());

            ColumnStatistics binaryStats = columnStats[1];
            Assert.Equal(3, binaryStats.getNumberOfValues());
            Assert.Equal(15, ((BinaryColumnStatistics)binaryStats).getSum());
            Assert.Equal("count: 3 hasNull: True sum: 15", binaryStats.ToString());

            ColumnStatistics stringStats = columnStats[2];
            Assert.Equal(3, stringStats.getNumberOfValues());
            StringColumnStatistics typedStringStats = (StringColumnStatistics)stringStats;
            Assert.Equal("bar", typedStringStats.getMinimum());
            Assert.Equal("hi", typedStringStats.getMaximum());
            Assert.Equal(8, typedStringStats.getSum());
            Assert.Equal("count: 3 hasNull: True min: bar max: hi sum: 8", stringStats.ToString());

            // Inspectors exposed by the reader.
            StructObjectInspector rowInspector = (StructObjectInspector)orcReader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT, rowInspector.getCategory());
            Assert.Equal("struct<bytes1:binary,string1:string>", rowInspector.getTypeName());

            IList<StructField> fieldRefs = rowInspector.getAllStructFieldRefs();
            BinaryObjectInspector binaryInspector =
                (BinaryObjectInspector)rowInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
            StringObjectInspector stringInspector =
                (StringObjectInspector)rowInspector.getStructFieldRef("string1").getFieldObjectInspector();

            using (RecordReader rows = orcReader.rows())
            {
                // Row 1: both fields populated.
                object current = rows.next();
                Assert.NotNull(current);
                var bytesValue = binaryInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0]));
                Assert.Equal(bytes(0, 1, 2, 3, 4), bytesValue);
                var stringValue = stringInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1]));
                Assert.Equal("foo", stringValue);

                // Row 2: both fields populated.
                Assert.Equal(true, rows.hasNext());
                current = rows.next();
                bytesValue = binaryInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0]));
                Assert.Equal(bytes(0, 1, 2, 3), bytesValue);
                stringValue = stringInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1]));
                Assert.Equal("bar", stringValue);

                // Row 3: the string field was written as null.
                Assert.Equal(true, rows.hasNext());
                current = rows.next();
                bytesValue = binaryInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0]));
                Assert.Equal(bytes(0, 1, 2, 3, 4, 5), bytesValue);
                stringValue = stringInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1]));
                Assert.Null(stringValue);

                // Row 4: the binary field was written as null.
                Assert.Equal(true, rows.hasNext());
                current = rows.next();
                bytesValue = binaryInspector.get(rowInspector.getStructFieldData(current, fieldRefs[0]));
                Assert.Null(bytesValue);
                stringValue = stringInspector.getPrimitiveJavaObject(rowInspector.getStructFieldData(current, fieldRefs[1]));
                Assert.Equal("hi", stringValue);

                // No rows beyond the four that were written.
                Assert.Equal(false, rows.hasNext());
            }
        }
        /// <summary>
        /// Writes 20000 <c>SimpleStruct</c> rows in runs that alternate between a non-null and a
        /// null string value, then verifies the <c>hasNull</c> flag reported by the file-level
        /// statistics and by the statistics of each of the first four stripes.
        /// </summary>
        public void testHasNull()
        {
            ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            // One entry per run of identical rows: the row count and the string value
            // (null means the string field is missing). The labels are the intended layout.
            int[] runLengths =
            {
                1000, // STRIPE 1, RG1
                1000, // STRIPE 1, RG2
                1000, // STRIPE 1, RG3
                1000, // STRIPE 1, RG4
                1000, // STRIPE 1, RG5
                5000, // STRIPE 2
                5000, // STRIPE 3
                5000  // STRIPE 4
            };
            string[] runValues = { "RG1", null, "RG3", null, null, null, "STRIPE-3", null };

            using (Stream output = File.OpenWrite(TestFilePath))
            {
                using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
                                                               .inspector(structInspector)
                                                               .rowIndexStride(1000)
                                                               .stripeSize(10000)
                                                               .bufferSize(10000)))
                {
                    for (int run = 0; run < runLengths.Length; run++)
                    {
                        for (int written = 0; written < runLengths[run]; written++)
                        {
                            orcWriter.addRow(new SimpleStruct(bytes(1, 2, 3), runValues[run]));
                        }
                    }
                }
            }

            Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // File-level statistics: only the string column (index 2) contains nulls;
            // 7000 of its 20000 values are non-null.
            ColumnStatistics[] fileStats = orcReader.getStatistics();
            Assert.Equal(20000, fileStats[0].getNumberOfValues());
            Assert.Equal(20000, fileStats[1].getNumberOfValues());
            Assert.Equal(7000, fileStats[2].getNumberOfValues());
            Assert.Equal(false, fileStats[0].hasNull());
            Assert.Equal(false, fileStats[1].hasNull());
            Assert.Equal(true, fileStats[2].hasNull());

            // Stripe-level statistics: columns 0 and 1 never contain nulls; the string
            // column contains nulls in every checked stripe except the third.
            List<StripeStatistics> perStripe = orcReader.getStripeStatistics();
            bool[] stringColumnHasNull = { true, true, false, true };

            for (int stripe = 0; stripe < stringColumnHasNull.Length; stripe++)
            {
                StripeStatistics stripeStats = perStripe[stripe];
                ColumnStatistics firstColumn = stripeStats.getColumnStatistics()[0];
                ColumnStatistics secondColumn = stripeStats.getColumnStatistics()[1];
                ColumnStatistics thirdColumn = stripeStats.getColumnStatistics()[2];

                Assert.Equal(false, firstColumn.hasNull());
                Assert.Equal(false, secondColumn.hasNull());
                Assert.Equal(stringColumnHasNull[stripe], thirdColumn.hasNull());
            }

#if false
            // Test file dump
            TextWriter       origOut        = System.Console.Out;
            string           outputFilename = "orc-file-has-null.out";
            FileOutputStream myOut          = new FileOutputStream(workDir + File.separator + outputFilename);

            // replace stdout and run command
            System.Console.SetOut(new StreamWriter(myOut));
            FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
            System.Console.Out.Flush();
            System.SetOut(origOut);

            TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
        }
        /// <summary>
        /// Cross-checks the vectorized read path against the row-by-row read path for the
        /// file at <c>TestFilePath</c>. Two independent readers are opened over the same file:
        /// one is consumed through <c>nextBatch</c>, the other through <c>next</c>, and every
        /// value of every batch column is compared with the matching field of the matching row.
        /// After each batch the expected <c>isRepeating</c> and <c>noNulls</c> flags of columns
        /// 0-9 are asserted, and at the end the row reader must be exhausted as well.
        /// </summary>
        private void checkVectorizedReader()
        {
            Reader vreader = OrcFile.createReader(TestFilePath,
                                                  OrcFile.readerOptions(conf));
            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            using (RecordReaderImpl vrr = (RecordReaderImpl)vreader.rows())
            using (RecordReaderImpl rr = (RecordReaderImpl)reader.rows())
            {
                VectorizedRowBatch batch = null;

                // Check Vectorized ORC reader against ORC row reader
                while (vrr.hasNext())
                {
                    batch = vrr.nextBatch(batch);
                    for (int i = 0; i < batch.size; i++)
                    {
                        OrcStruct row = (OrcStruct)rr.next();
                        for (int j = 0; j < batch.cols.Length; j++)
                        {
                            object       a  = row.getFieldValue(j);
                            ColumnVector cv = batch.cols[j];
                            // if the value is repeating, only entry 0 is meaningful, so use it
                            int rowId = cv.isRepeating ? 0 : i;

                            // make sure the null flag agrees
                            if (a == null)
                            {
                                Assert.True(!cv.noNulls && cv.isNull[rowId]);
                            }
                            else if (a is bool)
                            {
                                // bool values are stored as 1's and 0's, so convert and compare
                                long temp = (bool)a ? 1 : 0;
                                long b    = ((LongColumnVector)cv).vector[rowId];
                                Assert.Equal(temp.ToString(), b.ToString());
                            }
                            else if (a is Timestamp)
                            {
                                // Timestamps are stored as a long: the millisecond value scaled
                                // to nanoseconds plus the sub-millisecond part of the nanos field.
                                Timestamp t = (Timestamp)a;
                                long timeInNanoSec = (t.Milliseconds * 1000000)
                                                     + (t.getNanos() % 1000000);
                                long b = ((LongColumnVector)cv).vector[rowId];
                                Assert.Equal(timeInNanoSec.ToString(), b.ToString());
                            }
                            else if (a is Date)
                            {
                                // Dates are stored as long; compare against the Date's day count
                                Date adt = (Date)a;
                                long b   = ((LongColumnVector)cv).vector[rowId];
                                Assert.Equal(adt.Days, (int)b);
                            }
                            else if (a is HiveDecimal)
                            {
                                // Index with rowId (not i) like every other branch, so that a
                                // repeating decimal column is read from entry 0.
                                HiveDecimal dec = (HiveDecimal)a;
                                HiveDecimal b   = ((DecimalColumnVector)cv).vector[rowId];
                                Assert.Equal(dec, b);
                            }
                            else if (a is double)
                            {
                                double b = ((DoubleColumnVector)cv).vector[rowId];
                                Assert.Equal(a.ToString(), b.ToString());
                            }
                            else if (a is string)
                            {
                                // Strings are stored as a UTF-8 slice (buffer, start, length)
                                BytesColumnVector bcv = (BytesColumnVector)cv;
                                string            b   = Encoding.UTF8.GetString(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]);
                                Assert.Equal((string)a, b);
                            }
                            else if (a is int || a is long || a is sbyte || a is short)
                            {
                                // All integral types share the long vector; compare textually
                                Assert.Equal(a.ToString(),
                                             ((LongColumnVector)cv).vector[rowId].ToString());
                            }
                            else
                            {
                                // A type this check does not know how to compare: fail loudly
                                // and say which type it was.
                                Assert.True(false, "Unexpected value type in column " + j + ": " + a.GetType());
                            }
                        }
                    }

                    // Check repeating
                    Assert.Equal(false, batch.cols[0].isRepeating);
                    Assert.Equal(false, batch.cols[1].isRepeating);
                    Assert.Equal(false, batch.cols[2].isRepeating);
                    Assert.Equal(true, batch.cols[3].isRepeating);
                    Assert.Equal(false, batch.cols[4].isRepeating);
                    Assert.Equal(false, batch.cols[5].isRepeating);
                    Assert.Equal(false, batch.cols[6].isRepeating);
                    Assert.Equal(false, batch.cols[7].isRepeating);
                    Assert.Equal(false, batch.cols[8].isRepeating);
                    Assert.Equal(false, batch.cols[9].isRepeating);

                    // Check non null
                    Assert.Equal(false, batch.cols[0].noNulls);
                    Assert.Equal(false, batch.cols[1].noNulls);
                    Assert.Equal(true, batch.cols[2].noNulls);
                    Assert.Equal(true, batch.cols[3].noNulls);
                    Assert.Equal(false, batch.cols[4].noNulls);
                    Assert.Equal(false, batch.cols[5].noNulls);
                    Assert.Equal(false, batch.cols[6].noNulls);
                    Assert.Equal(false, batch.cols[7].noNulls);
                    Assert.Equal(false, batch.cols[8].noNulls);
                    Assert.Equal(false, batch.cols[9].noNulls);
                }

                // The row reader must have been consumed in lock-step with the batches.
                Assert.Equal(false, rr.hasNext());
            }
        }
        /// <summary>
        /// Writes eight <c>MyStruct</c> rows, one with a null integer and one with a null
        /// string, then reads the file back and verifies the integer and string column
        /// statistics, the reader's type name, which stripe footers mention a PRESENT stream,
        /// and the contents of the first three rows.
        /// </summary>
        public void testColumnsWithNullAndCompression()
        {
            ObjectInspector structInspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream output = File.OpenWrite(TestFilePath))
            {
                using (Writer orcWriter = OrcFile.createWriter(TestFilePath, output, OrcFile.writerOptions(conf)
                                                               .inspector(structInspector)
                                                               .stripeSize(100000)
                                                               .bufferSize(10000)))
                {
                    orcWriter.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
                    orcWriter.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
                }
            }

            Reader orcReader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // Statistics: eight rows overall; the integer and the string column each
            // hold seven non-null values.
            ColumnStatistics[] columnStats = orcReader.getStatistics();
            Assert.Equal(8, orcReader.getNumberOfRows());
            Assert.Equal(8, columnStats[0].getNumberOfValues());

            IntegerColumnStatistics intStats = (IntegerColumnStatistics)columnStats[1];
            Assert.Equal(3, intStats.getMaximum());
            Assert.Equal(2, intStats.getMinimum());
            Assert.Equal(true, intStats.isSumDefined());
            Assert.Equal(17, intStats.getSum());
            Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17", columnStats[1].ToString());

            StringColumnStatistics stringStats = (StringColumnStatistics)columnStats[2];
            Assert.Equal("h", stringStats.getMaximum());
            Assert.Equal("a", stringStats.getMinimum());
            Assert.Equal(7, columnStats[2].getNumberOfValues());
            Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7", columnStats[2].ToString());

            // Inspector exposed by the reader.
            StructObjectInspector rowInspector = (StructObjectInspector)orcReader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT, rowInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         rowInspector.getTypeName());

            using (RecordReader rows = orcReader.rows())
            {
                // Expectation: only the last stripe's footer mentions a PRESENT stream.
                List<bool> expectedPresent = new List<bool>();
                foreach (StripeInformation stripe in orcReader.getStripes())
                {
                    expectedPresent.Add(false);
                }
                expectedPresent[expectedPresent.Count - 1] = true;

                // Actual: look for the PRESENT kind name in each stripe footer's text form.
                List<bool> actualPresent = new List<bool>();
                string presentKindName = OrcProto.Stream.Types.Kind.PRESENT.ToString();
                foreach (StripeInformation stripe in orcReader.getStripes())
                {
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    actualPresent.Add(footer.ToString().IndexOf(presentKindName) >= 0);
                }
                Assert.Equal(expectedPresent, actualPresent);

                // Row 1: every field populated.
                OrcStruct current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.Equal(3, current.getFieldValue(0));
                Assert.Equal("a", current.getFieldValue(1).ToString());
                Assert.Equal(true, current.getFieldValue(2));
                IList<object> innerList = (IList<object>)current.getFieldValue(3);
                OrcStruct innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));

                // Row 2: the integer field is null.
                current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.Null(current.getFieldValue(0));
                Assert.Equal("b", current.getFieldValue(1).ToString());
                Assert.Equal(true, current.getFieldValue(2));
                innerList = (IList<object>)current.getFieldValue(3);
                innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));

                // Row 3: the string field is null.
                current = (OrcStruct)rows.next();
                Assert.NotNull(current);
                Assert.Null(current.getFieldValue(1));
                Assert.Equal(3, current.getFieldValue(0));
                Assert.Equal(false, current.getFieldValue(2));
                innerList = (IList<object>)current.getFieldValue(3);
                innerStruct = (OrcStruct)innerList[0];
                Assert.Equal(100, innerStruct.getFieldValue(0));
            }
        }