Esempio n. 1
0
        public void testDeltaUnknownSign()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                       .compress(CompressionKind.NONE)
                                                       .inspector(inspector)
                                                       .rowIndexStride(0)
                                                       .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                       .version(OrcFile.Version.V_0_12)))
                {
                    w.addRow(0);
                    for (int i = 0; i < 511; ++i)
                    {
                        w.addRow(i);
                    }
                }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // monotonicity will be undetermined for this sequence 0,0,1,2,3,...510. Hence DIRECT encoding
                // will be used. 2 bytes for header and 640 bytes for data (512 values with fixed bit of 10 bits
                // each, 5120/8 = 640). Total bytes 642
                Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 642"));
            }
        }
Esempio n. 2
0
        public void testOrcSerDeStatsMap()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MapStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(10000)
                                                            .bufferSize(10000)))
                {
                    for (int row = 0; row < 1000; row++)
                    {
                        Dictionary <string, double> test = new Dictionary <string, double>();
                        for (int i = 0; i < 10; i++)
                        {
                            test.Add("hi" + i, 2.0);
                        }
                        writer.addRow(new MapStruct(test));
                    }
                    writer.close();

                    // stats from writer
                    Assert.Equal(1000, writer.getNumberOfRows());
                    Assert.Equal(950000, writer.getRawDataSize());
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // stats from reader
            Assert.Equal(1000, reader.getNumberOfRows());
            Assert.Equal(950000, reader.getRawDataSize());
            Assert.Equal(950000, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
        }
Esempio n. 3
0
        public void testDump()
        {
            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
            using (Stream file = File.OpenWrite(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.ZLIB);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Random r1 = new Random(1);
                    for (int i = 0; i < 21000; ++i)
                    {
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                                                   TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
                    }
                }
            }

            string outputFilename = "orc-file-dump.out";

            using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
            }

            TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
        }
Esempio n. 4
0
        public void testShortRepeat()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                       .compress(CompressionKind.NONE)
                                                       .inspector(inspector)
                                                       .rowIndexStride(0)
                                                       .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                       .version(OrcFile.Version.V_0_12)))
                {
                    for (int i = 0; i < 5; ++i)
                    {
                        w.addRow(10);
                    }
                }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // 1 byte header + 1 byte value
                Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 2"));
            }
        }
Esempio n. 5
0
        public void testFixedDeltaOneDescending()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                       .compress(CompressionKind.NONE)
                                                       .inspector(inspector)
                                                       .rowIndexStride(0)
                                                       .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                       .version(OrcFile.Version.V_0_12)))
                {
                    for (int i = 0; i < 5120; ++i)
                    {
                        w.addRow(512 - (i % 512));
                    }
                }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
                // and 1 byte delta (delta = 1). In total, 5 bytes per run.
                Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 50"));
            }
        }
Esempio n. 6
0
        public void testPatchedBase()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(int));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer w = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                       .compress(CompressionKind.NONE)
                                                       .inspector(inspector)
                                                       .rowIndexStride(0)
                                                       .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
                                                       .version(OrcFile.Version.V_0_12)))
                {
                    Random rand = new Random(123);
                    w.addRow(10000000);
                    for (int i = 0; i < 511; ++i)
                    {
                        w.addRow(rand.Next(i + 1));
                    }
                }

            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath);

                // use PATCHED_BASE encoding
                Assert.True(capture.Text.Contains("Stream: column 0 section DATA start: 3 length 583"));
            }
        }
        public void testReadTimestampFormat_0_11(string readerTimeZone)
        {
            string oldFilePath = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");

            using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
            {
                Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf));

                StructObjectInspector    readerInspector = (StructObjectInspector)reader.getObjectInspector();
                IList <StructField>      fields          = readerInspector.getAllStructFieldRefs();
                TimestampObjectInspector tso             = (TimestampObjectInspector)readerInspector
                                                           .getStructFieldRef("ts").getFieldObjectInspector();

                using (RecordReader rows = reader.rows())
                {
                    object row = rows.next();
                    Assert.NotNull(row);
                    Assert.Equal(Timestamp.Parse("2000-03-12 15:00:00"),
                                 tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row,
                                                                                               fields[12])));

                    // check the contents of second row
                    Assert.Equal(true, rows.hasNext());
                    rows.seekToRow(7499);
                    row = rows.next();
                    Assert.Equal(Timestamp.Parse("2000-03-12 15:00:01"),
                                 tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row,
                                                                                               fields[12])));

                    Assert.Equal(false, rows.hasNext());
                }
            }
        }
        public OrcFileStripeMergeRecordReader(Configuration conf, FileSplit split)
        {
            path  = split.getPath();
            start = split.getStart();
            end   = start + split.getLength();
            FileSystem fs = path.getFileSystem(conf);

            this.reader           = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fs));
            this.iter             = reader.getStripes().GetEnumerator();
            this.stripeIdx        = 0;
            this.stripeStatistics = ((ReaderImpl)reader).getOrcProtoStripeStatistics();
        }
        public void testHalfDistinctCheckDisabled()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            int[] input = new int[20000];

            // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rand = new Random(123);
                    for (int i = 0; i < 20000; i++)
                    {
                        input[i] = rand.Next(10000);
                    }

                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(input[i].ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++].ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
                    }
                }
            }
        }
        public void testTimestampWriter(string writerTimeZone, string readerTimeZone)
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(Timestamp));
            List <string>   ts        = new List <string>();

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .bufferSize(10000)))
                    using (TestHelpers.SetTimeZoneInfo(writerTimeZone))
                    {
                        ts.Add("2003-01-01 01:00:00.000000222");
                        ts.Add("1996-08-02 09:00:00.723100809");
                        ts.Add("1999-01-01 02:00:00.999999999");
                        ts.Add("1995-01-02 03:00:00.688888888");
                        ts.Add("2002-01-01 04:00:00.1");
                        ts.Add("2010-03-02 05:00:00.000009001");
                        ts.Add("2005-01-01 06:00:00.000002229");
                        ts.Add("2006-01-01 07:00:00.900203003");
                        ts.Add("2003-01-01 08:00:00.800000007");
                        ts.Add("1998-11-02 10:00:00.857340643");
                        ts.Add("2008-10-02 11:00:00.0");
                        ts.Add("2037-01-01 00:00:00.000999");
                        ts.Add("2014-03-28 00:00:00.0");
                        foreach (string t in ts)
                        {
                            writer.addRow(Timestamp.Parse(t));
                        }
                    }

            using (TestHelpers.SetTimeZoneInfo(readerTimeZone))
            {
                Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
                using (RecordReader rows = reader.rows(null))
                {
                    int idx = 0;
                    while (rows.hasNext())
                    {
                        object    row = rows.next();
                        Timestamp got = ((Timestamp)row);
                        Assert.Equal(ts[idx++], got.ToString());
                    }
                }
            }
        }
Esempio n. 11
0
        public void SimpleTest()
        {
            OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), new Configuration());
            options.inspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
            using (Stream file = File.Create(filename))
                using (Writer writer = OrcFile.createWriter(filename, file, options))
                {
                    writer.addRow("hello");
                }

            Reader reader = OrcFile.createReader(() => File.OpenRead(filename), filename);

            using (RecordReader recordReader = reader.rows())
            {
                object value = recordReader.next();
                Assert.True(value is string);
                Assert.Equal("hello", value);
            }
        }
        public void testTooManyDistinctV11AlwaysDictionary()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .version(OrcFile.Version.V_0_11)
                                                            .bufferSize(10000)))
                {
                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(i.ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal((idx++).ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
                    }
                }
            }
        }
Esempio n. 13
0
 public bool validateInput(FileSystem fs, HiveConf conf, List <FileStatus> files)
 {
     if (files.Count <= 0)
     {
         return(false);
     }
     foreach (FileStatus file in files)
     {
         try
         {
             OrcFile.createReader(file.getPath(),
                                  OrcFile.readerOptions(conf).filesystem(fs));
         }
         catch (System.IO.IOException e)
         {
             return(false);
         }
     }
     return(true);
 }
Esempio n. 14
0
        public void testDictionaryThreshold()
        {
            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
            // conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);
            using (Stream file = File.OpenWrite(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.ZLIB);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Random r1      = new Random(1);
                    int    nextInt = 0;
                    for (int i = 0; i < 21000; ++i)
                    {
                        // Write out the same string twice, this guarantees the fraction of rows with
                        // distinct strings is 0.5
                        if (i % 2 == 0)
                        {
                            nextInt = r1.Next(TestHelpers.words.Length);
                            // Append the value of i to the word, this guarantees when an index or word is repeated
                            // the actual string is unique.
                            TestHelpers.words[nextInt] += "-" + i;
                        }
                        writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), TestHelpers.words[nextInt]));
                    }
                }
            }

            string outputFilename = "orc-file-dump-dictionary-threshold.out";

            using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "--rowindex=1,2,3" });
            }

            TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
        }
Esempio n. 15
0
        public void testJsonDump()
        {
            ObjectInspector inspector;

            inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));
            // conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
            OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
                                            .inspector(inspector)
                                            .stripeSize(100000)
                                            .compress(CompressionKind.ZLIB)
                                            .bufferSize(10000)
                                            .rowIndexStride(1000)
                                            .bloomFilterColumns("s");
            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Random r1 = new Random(1);
                    for (int i = 0; i < 21000; ++i)
                    {
                        if (i % 100 == 0)
                        {
                            writer.addRow(new MyRecord(r1.Next(), r1.NextLong(), null));
                        }
                        else
                        {
                            writer.addRow(new MyRecord(r1.Next(), r1.NextLong(),
                                                       TestHelpers.words[r1.Next(TestHelpers.words.Length)]));
                        }
                    }
                }

            const string outputFilename = "orc-file-dump.json";

            using (CaptureStdout capture = new CaptureStdout(Path.Combine(workDir, outputFilename)))
            {
                FileDump.Main(new string[] { TestFilePath.ToString(), "-j", "-p", "--rowindex=3" });
            }

            TestHelpers.CompareFilesByLine(outputFilename, Path.Combine(workDir, outputFilename));
        }
Esempio n. 16
0
        getRecordReader(InputSplit inputSplit, JobConf conf,
                        Reporter reporter)
        {
            FileSplit fSplit = (FileSplit)inputSplit;

            reporter.setStatus(fSplit.ToString());

            Path path = fSplit.getPath();

            OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
            if (fSplit is OrcSplit)
            {
                OrcSplit orcSplit = (OrcSplit)fSplit;
                if (orcSplit.hasFooter())
                {
                    opts.fileMetaInfo(orcSplit.getFileMetaInfo());
                }
            }
            Reader reader = OrcFile.createReader(path, opts);

            return(new VectorizedOrcRecordReader(reader, conf, fSplit));
        }
Esempio n. 17
0
        public void testBitPack64Large()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            const int size = 1080832;

            long[] inp  = new long[size];
            Random rand = new Random(1234);

            for (int i = 0; i < size; i++)
            {
                inp[i] = rand.NextLong();
            }
            List <long> input = inp.ToList();

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.ZLIB)))
                {
                    foreach (long l in input)
                    {
                        writer.addRow(l);
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }
            }
        }
Esempio n. 18
0
        public void testOrcSerDeStatsSimpleWithNulls()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(10000)
                                                            .bufferSize(10000)))
                {
                    for (int row = 0; row < 1000; row++)
                    {
                        if (row % 2 == 0)
                        {
                            writer.addRow(new SimpleStruct(new byte[] { 1, 2, 3 }, "hi"));
                        }
                        else
                        {
                            writer.addRow(null);
                        }
                    }
                    writer.close();

                    // stats from writer
                    Assert.Equal(1000, writer.getNumberOfRows());
                    Assert.Equal(44500, writer.getRawDataSize());
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // stats from reader
            Assert.Equal(1000, reader.getNumberOfRows());
            Assert.Equal(44500, reader.getRawDataSize());
            Assert.Equal(1500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(43000, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(44500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
        }
        public void testBitPacking(long val)
        {
            long[] input = new long[]
            {
                val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
                val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
                0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
                0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
                val, 0, val, 0, 0, val, 0, val, 0, 0, val, val
            };

            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(long));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    foreach (long l in input)
                    {
                        writer.addRow(l);
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++], ((long)row));
                }
            }
        }
        public void createFile()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyRecord));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .stripeSize(100000)
                                                            .compress(CompressionKind.ZLIB)
                                                            .inspector(inspector)
                                                            .bufferSize(10000)
                                                            .rowIndexStride(10000)))
                {
                    Random   r1             = new Random(1);
                    string[] words          = TestHelpers.words;
                    string[] dates          = new string[] { "1991-02-28", "1970-01-31", "1950-04-23" };
                    string[] decimalStrings = new string[]
                    {
                        "234.443", "10001000", "0.3333367", "67788798.0", "-234.443",
                        "-10001000", "-0.3333367", "-67788798.0", "0"
                    };
                    for (int i = 0; i < 21000; ++i)
                    {
                        if ((i % 7) != 0)
                        {
                            writer.addRow(new MyRecord(((i % 3) == 0), (sbyte)(i % 5), i, (long)200, (short)(300 + i), (double)(400 + i),
                                                       words[r1.Next(words.Length)], new Timestamp(DateTime.Now),
                                                       Date.Parse(dates[i % 3]), HiveDecimal.Parse(decimalStrings[i % decimalStrings.Length])));
                        }
                        else
                        {
                            writer.addRow(new MyRecord(null, null, i, (long)200, null, null, null, null, null, null));
                        }
                    }
                }
            checkVectorizedReader();
        }
Esempio n. 21
0
        public void testOrcSerDeStatsComplexOldFormat()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(BigRow));

            long rawDataSize;

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .version(OrcFile.Version.V_0_11)
                                                            .bufferSize(10000)))
                {
                    // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
                    writer.addRow(new BigRow(false, (sbyte)1, (short)1024, 65536,
                                             Int64.MaxValue, (float)1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
                                             new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                             list(inner(3, "good"), inner(4, "bad")),
                                             map(), Timestamp.Parse("2000-03-12 15:00:00"), HiveDecimal.Parse(
                                                 "12345678.6547456")));
                    // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
                    // 97
                    writer.addRow(new BigRow(true, (sbyte)100, (short)2048, 65536,
                                             Int64.MaxValue, (float)2.0, -5.0, bytes(), "bye",
                                             new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
                                             list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
                                             map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.Parse("2000-03-11 15:00:00"),
                                             HiveDecimal.Parse("12345678.6547452")));
                    writer.close();

                    long rowCount = writer.getNumberOfRows();
                    rawDataSize = writer.getRawDataSize();
                    Assert.Equal(2, rowCount);
                    Assert.Equal(1740, rawDataSize);
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            Assert.Equal(2, reader.getNumberOfRows());
            Assert.Equal(1740, reader.getRawDataSize());
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
            Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
            Assert.Equal(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
            Assert.Equal(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
            Assert.Equal(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
            Assert.Equal(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
            Assert.Equal(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
            Assert.Equal(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
            Assert.Equal(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));
            Assert.Equal(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
            Assert.Equal(1195,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
            Assert.Equal(185,
                         reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
            Assert.Equal(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1",
                                                                                        "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list",
                                                                                        "map", "middle", "ts", "decimal1")));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(2, stats[1].getNumberOfValues());
            Assert.Equal(1, ((BooleanColumnStatistics)stats[1]).getFalseCount());
            Assert.Equal(1, ((BooleanColumnStatistics)stats[1]).getTrueCount());
            Assert.Equal("count: 2 hasNull: False true: 1", stats[1].ToString());

            Assert.Equal(2048, ((IntegerColumnStatistics)stats[3]).getMaximum());
            Assert.Equal(1024, ((IntegerColumnStatistics)stats[3]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[3]).isSumDefined());
            Assert.Equal(3072, ((IntegerColumnStatistics)stats[3]).getSum());
            Assert.Equal("count: 2 hasNull: False min: 1024 max: 2048 sum: 3072",
                         stats[3].ToString());

            Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMaximum());
            Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMinimum());
            Assert.Equal(false, ((IntegerColumnStatistics)stats[5]).isSumDefined());
            Assert.Equal("count: 2 hasNull: False min: 9223372036854775807 max: 9223372036854775807",
                         stats[5].ToString());

            Assert.Equal(-15.0, ((DoubleColumnStatistics)stats[7]).getMinimum());
            Assert.Equal(-5.0, ((DoubleColumnStatistics)stats[7]).getMaximum());
            Assert.Equal(-20.0, ((DoubleColumnStatistics)stats[7]).getSum(), 5);
            Assert.Equal("count: 2 hasNull: False min: -15 max: -5 sum: -20",
                         stats[7].ToString());

            Assert.Equal(5, ((BinaryColumnStatistics)stats[8]).getSum());
            Assert.Equal("count: 2 hasNull: False sum: 5", stats[8].ToString());

            Assert.Equal("bye", ((StringColumnStatistics)stats[9]).getMinimum());
            Assert.Equal("hi", ((StringColumnStatistics)stats[9]).getMaximum());
            Assert.Equal(5, ((StringColumnStatistics)stats[9]).getSum());
            Assert.Equal("count: 2 hasNull: False min: bye max: hi sum: 5", stats[9].ToString());
        }
Esempio n. 22
0
        public void testSplitEliminationComplexExpr()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));

            using (Stream file = File.OpenWrite(testFilePath))
                using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
                                                            100000, CompressionKind.NONE, 10000, 10000))
                {
                    writeData(writer);
                }

            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "150000");
            InputFormat @in = new OrcInputFormat();

            FileInputFormat.setInputPaths(conf, testFilePath.ToString());

            // predicate expression: userid <= 100 and subtype <= 1000.0
            GenericUDF           udf       = new GenericUDFOPEqualOrLessThan();
            List <ExprNodeDesc>  childExpr = new List <ExprNodeDesc>();
            ExprNodeColumnDesc   col       = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
            ExprNodeConstantDesc con       = new ExprNodeConstantDesc(100);

            childExpr.Add(col);
            childExpr.Add(con);
            ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            GenericUDF           udf1       = new GenericUDFOPEqualOrLessThan();
            List <ExprNodeDesc>  childExpr1 = new List <ExprNodeDesc>();
            ExprNodeColumnDesc   col1       = new ExprNodeColumnDesc(typeof(double), "subtype", "T", false);
            ExprNodeConstantDesc con1       = new ExprNodeConstantDesc(1000.0);

            childExpr1.Add(col1);
            childExpr1.Add(con1);
            ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            GenericUDF          udf2       = new GenericUDFOPAnd();
            List <ExprNodeDesc> childExpr2 = new List <ExprNodeDesc>();

            childExpr2.Add(en);
            childExpr2.Add(en1);
            ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            string sargStr = Utilities.serializeExpression(en2);

            conf.set("hive.io.filter.expr.serialized", sargStr);
            InputSplit[] splits = @in.getSplits(conf, 1);
            Assert.Equal(2, splits.Length);

            con          = new ExprNodeConstantDesc(2);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(0.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // no stripe will satisfy the predicate
            Assert.Equal(0, splits.Length);

            con          = new ExprNodeConstantDesc(2);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(1.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // only first stripe will satisfy condition and hence single split
            Assert.Equal(1, splits.Length);

            udf          = new GenericUDFOPEqual();
            con          = new ExprNodeConstantDesc(13);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            con1          = new ExprNodeConstantDesc(80.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // first two stripes will satisfy condition and hence single split
            Assert.Equal(2, splits.Length);

            udf          = new GenericUDFOPEqual();
            con          = new ExprNodeConstantDesc(13);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

            udf1          = new GenericUDFOPEqual();
            con1          = new ExprNodeConstantDesc(80.0);
            childExpr1[1] = con1;
            en1           = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

            childExpr2[0] = en;
            childExpr2[1] = en1;
            en2           = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);

            sargStr = Utilities.serializeExpression(en2);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            // only second stripes will satisfy condition and hence single split
            Assert.Equal(1, splits.Length);
        }
Esempio n. 23
0
        public void testSplitEliminationSmallMaxSplit()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));

            using (Stream file = File.OpenWrite(testFilePath))
                using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
                                                            100000, CompressionKind.NONE, 10000, 10000))
                {
                    writeData(writer);
                }
            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
            conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "5000");
            InputFormat @in = new OrcInputFormat();

            FileInputFormat.setInputPaths(conf, testFilePath.ToString());

            GenericUDF           udf       = new GenericUDFOPEqualOrLessThan();
            List <ExprNodeDesc>  childExpr = new List <ExprNodeDesc>();
            ExprNodeColumnDesc   col       = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
            ExprNodeConstantDesc con       = new ExprNodeConstantDesc(100);

            childExpr.Add(col);
            childExpr.Add(con);
            ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            string sargStr             = Utilities.serializeExpression(en);

            conf.set("hive.io.filter.expr.serialized", sargStr);
            InputSplit[] splits = @in.getSplits(conf, 1);
            Assert.Equal(5, splits.Length);

            con          = new ExprNodeConstantDesc(1);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(0, splits.Length);

            con          = new ExprNodeConstantDesc(2);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(1, splits.Length);

            con          = new ExprNodeConstantDesc(5);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(2, splits.Length);

            con          = new ExprNodeConstantDesc(13);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(3, splits.Length);

            con          = new ExprNodeConstantDesc(29);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(4, splits.Length);

            con          = new ExprNodeConstantDesc(70);
            childExpr[1] = con;
            en           = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
            sargStr      = Utilities.serializeExpression(en);
            conf.set("hive.io.filter.expr.serialized", sargStr);
            splits = @in.getSplits(conf, 1);
            Assert.Equal(5, splits.Length);
        }
        public void testHasNull()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .rowIndexStride(1000)
                                                            .stripeSize(10000)
                                                            .bufferSize(10000)))
                {
                    // STRIPE 1
                    // RG1
                    for (int i = 0; i < 1000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), "RG1"));
                    }
                    // RG2
                    for (int i = 0; i < 1000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
                    }
                    // RG3
                    for (int i = 0; i < 1000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), "RG3"));
                    }
                    // RG4
                    for (int i = 0; i < 1000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
                    }
                    // RG5
                    for (int i = 0; i < 1000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
                    }
                    // STRIPE 2
                    for (int i = 0; i < 5000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
                    }
                    // STRIPE 3
                    for (int i = 0; i < 5000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), "STRIPE-3"));
                    }
                    // STRIPE 4
                    for (int i = 0; i < 5000; i++)
                    {
                        writer.addRow(new SimpleStruct(bytes(1, 2, 3), null));
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            // check the file level stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(20000, stats[0].getNumberOfValues());
            Assert.Equal(20000, stats[1].getNumberOfValues());
            Assert.Equal(7000, stats[2].getNumberOfValues());
            Assert.Equal(false, stats[0].hasNull());
            Assert.Equal(false, stats[1].hasNull());
            Assert.Equal(true, stats[2].hasNull());

            // check the stripe level stats
            List <StripeStatistics> stripeStats = reader.getStripeStatistics();
            // stripe 1 stats
            StripeStatistics ss1     = stripeStats[0];
            ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
            ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
            ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];

            Assert.Equal(false, ss1_cs1.hasNull());
            Assert.Equal(false, ss1_cs2.hasNull());
            Assert.Equal(true, ss1_cs3.hasNull());

            // stripe 2 stats
            StripeStatistics ss2     = stripeStats[1];
            ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
            ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
            ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];

            Assert.Equal(false, ss2_cs1.hasNull());
            Assert.Equal(false, ss2_cs2.hasNull());
            Assert.Equal(true, ss2_cs3.hasNull());

            // stripe 3 stats
            StripeStatistics ss3     = stripeStats[2];
            ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
            ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
            ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];

            Assert.Equal(false, ss3_cs1.hasNull());
            Assert.Equal(false, ss3_cs2.hasNull());
            Assert.Equal(false, ss3_cs3.hasNull());

            // stripe 4 stats
            StripeStatistics ss4     = stripeStats[3];
            ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
            ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
            ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];

            Assert.Equal(false, ss4_cs1.hasNull());
            Assert.Equal(false, ss4_cs2.hasNull());
            Assert.Equal(true, ss4_cs3.hasNull());

#if false
            // Test file dump
            TextWriter       origOut        = System.Console.Out;
            string           outputFilename = "orc-file-has-null.out";
            FileOutputStream myOut          = new FileOutputStream(workDir + File.separator + outputFilename);

            // replace stdout and run command
            System.Console.SetOut(new StreamWriter(myOut));
            FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
            System.Console.Out.Flush();
            System.SetOut(origOut);

            TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
        }
Esempio n. 25
0
        public void testDataDump()
        {
            using (Stream file = File.OpenWrite(TestFilePath))
            {
                OrcFile.WriterOptions options = new OrcFile.WriterOptions(new Properties(), conf);
                options.inspector(ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRecord)));
                options.stripeSize(100000);
                options.compress(CompressionKind.NONE);
                options.bufferSize(10000);
                options.rowIndexStride(1000);
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, options))
                {
                    Dictionary <string, string> m = new Dictionary <string, string>(2);
                    m.Add("k1", "v1");
                    writer.addRow(new AllTypesRecord(
                                      true,
                                      (sbyte)10,
                                      (short)100,
                                      1000,
                                      10000L,
                                      4.0f,
                                      20.0,
                                      HiveDecimal.Parse("4.2222"),
                                      new Timestamp(1416967764000L),
                                      new Date(1416967764000L),
                                      "string",
                                      m,
                                      new List <int> {
                        100, 200
                    },
                                      new AllTypesRecord.Struct(10, "foo")));
                    m.Clear();
                    m.Add("k3", "v3");
                    writer.addRow(new AllTypesRecord(
                                      false,
                                      (sbyte)20,
                                      (short)200,
                                      2000,
                                      20000L,
                                      8.0f,
                                      40.0,
                                      HiveDecimal.Parse("2.2222"),
                                      new Timestamp(1416967364000L),
                                      new Date(1411967764000L),
                                      "abcd",
                                      m,
                                      new List <int> {
                        200, 300
                    },
                                      new AllTypesRecord.Struct(20, "bar")));
                }
            }

            string[] lines;
            using (CaptureStdoutToMemory capture = new CaptureStdoutToMemory())
            {
                FileDump.Main(TestFilePath, "-d");

                lines = capture.Text.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);
            }
            Assert.Equal(2, lines.Length);

            // Don't be fooled by the big space in the middle, this line is quite long
            Assert.Equal("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello                                                                                                                                                                                                                                                          \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
            Assert.Equal("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world                                                                                                                                                                                                                                                          \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
        }
Esempio n. 26
0
 public static Reader createReader(Path path, OrcFile.ReaderOptions options)
 {
     return new ReaderImpl(path, options);
 }
Esempio n. 27
0
        public void testStringAndBinaryStatistics()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .bufferSize(10000)))
                {
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
                    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
                    writer.addRow(new SimpleStruct(null, "hi"));
                    writer.close();

                    Assert.Equal(4, writer.getNumberOfRows());
                    Assert.Equal(273, writer.getRawDataSize());
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            Assert.Equal(4, reader.getNumberOfRows());
            Assert.Equal(273, reader.getRawDataSize());
            Assert.Equal(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
            Assert.Equal(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
            Assert.Equal(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(4, stats[0].getNumberOfValues());
            Assert.Equal("count: 4 hasNull: False", stats[0].ToString());

            Assert.Equal(3, stats[1].getNumberOfValues());
            Assert.Equal(15, ((BinaryColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 3 hasNull: True sum: 15", stats[1].ToString());

            Assert.Equal(3, stats[2].getNumberOfValues());
            Assert.Equal("bar", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal("hi", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal(8, ((StringColumnStatistics)stats[2]).getSum());
            Assert.Equal("count: 3 hasNull: True min: bar max: hi sum: 8",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector =
                (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT, readerInspector.getCategory());
            Assert.Equal("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
            IList <StructField>   fields = readerInspector.getAllStructFieldRefs();
            BinaryObjectInspector bi     = (BinaryObjectInspector)readerInspector.
                                           getStructFieldRef("bytes1").getFieldObjectInspector();
            StringObjectInspector st = (StringObjectInspector)readerInspector.
                                       getStructFieldRef("string1").getFieldObjectInspector();

            using (RecordReader rows = reader.rows())
            {
                object row = rows.next();
                Assert.NotNull(row);
                // check the contents of the first row
                Assert.Equal(bytes(0, 1, 2, 3, 4), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("foo", st.getPrimitiveJavaObject(readerInspector.
                                                              getStructFieldData(row, fields[1])));

                // check the contents of second row
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Equal(bytes(0, 1, 2, 3), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("bar", st.getPrimitiveJavaObject(readerInspector.
                                                              getStructFieldData(row, fields[1])));

                // check the contents of second row
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Equal(bytes(0, 1, 2, 3, 4, 5), bi.get(
                                 readerInspector.getStructFieldData(row, fields[0])));
                Assert.Null(st.getPrimitiveJavaObject(readerInspector.
                                                      getStructFieldData(row, fields[1])));

                // check the contents of second row
                Assert.Equal(true, rows.hasNext());
                row = rows.next();
                Assert.Null(bi.get(
                                readerInspector.getStructFieldData(row, fields[0])));
                Assert.Equal("hi", st.getPrimitiveJavaObject(readerInspector.
                                                             getStructFieldData(row, fields[1])));

                Assert.Equal(false, rows.hasNext());
            }
        }
        public void testMultiStripeWithNull()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rand = new Random(100);
                    writer.addRow(new MyStruct(null, null, true, new List <InnerStruct> {
                        new InnerStruct(100)
                    }));
                    for (int i = 2; i < 20000; i++)
                    {
                        writer.addRow(new MyStruct(rand.Next(1), "a", true, new List <InnerStruct> {
                            new InnerStruct(100)
                        }));
                    }
                    writer.addRow(new MyStruct(null, null, true, new List <InnerStruct> {
                        new InnerStruct(100)
                    }));
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(20000, reader.getNumberOfRows());
            Assert.Equal(20000, stats[0].getNumberOfValues());

            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMaximum());
            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0",
                         stats[1].ToString());

            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal(19998, stats[2].getNumberOfValues());
            Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector =
                (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT,
                         readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                List <bool> expected = new List <bool>();
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    expected.Add(false);
                }
                // only the first and last stripe will have PRESENT stream
                expected[0] = true;
                expected[expected.Count - 1] = true;

                List <bool> got = new List <bool>();
                // check if the strip footer contains PRESENT stream
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    OrcProto.StripeFooter sf =
                        ((RecordReaderImpl)rows).readStripeFooter(sinfo);
                    got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
                }
                Assert.Equal(expected, got);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                rows.seekToRow(19998);
                // last-1 row
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.NotNull(row.getFieldValue(1));
                Assert.Equal(0, row.getFieldValue(0));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // last row
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));
            }
        }
        private void checkVectorizedReader()
        {
            Reader vreader = OrcFile.createReader(TestFilePath,
                                                  OrcFile.readerOptions(conf));
            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            using (RecordReaderImpl vrr = (RecordReaderImpl)vreader.rows())
                using (RecordReaderImpl rr = (RecordReaderImpl)reader.rows())
                {
                    VectorizedRowBatch batch = null;

                    // Check Vectorized ORC reader against ORC row reader
                    while (vrr.hasNext())
                    {
                        batch = vrr.nextBatch(batch);
                        for (int i = 0; i < batch.size; i++)
                        {
                            OrcStruct row = (OrcStruct)rr.next();
                            for (int j = 0; j < batch.cols.Length; j++)
                            {
                                object       a  = (row.getFieldValue(j));
                                ColumnVector cv = batch.cols[j];
                                // if the value is repeating, use row 0
                                int rowId = cv.isRepeating ? 0 : i;

                                // make sure the null flag agrees
                                if (a == null)
                                {
                                    Assert.True(!cv.noNulls && cv.isNull[rowId]);
                                }
                                else if (a is bool)
                                {
                                    // bool values are stores a 1's and 0's, so convert and compare
                                    long temp = (bool)a ? 1 : 0;
                                    long b    = ((LongColumnVector)cv).vector[rowId];
                                    Assert.Equal(temp.ToString(), b.ToString());
                                }
                                else if (a is Timestamp)
                                {
                                    // Timestamps are stored as long, so convert and compare
                                    Timestamp t = (Timestamp)a;
                                    // Timestamp.getTime() is overriden and is
                                    // long time = super.getTime();
                                    // return (time + (nanos / 1000000));
                                    long timeInNanoSec = (t.Milliseconds * 1000000)
                                                         + (t.getNanos() % 1000000);
                                    long b = ((LongColumnVector)cv).vector[rowId];
                                    Assert.Equal(timeInNanoSec.ToString(), b.ToString());
                                }
                                else if (a is Date)
                                {
                                    // Dates are stored as long, so convert and compare

                                    Date adt = (Date)a;
                                    long b   = ((LongColumnVector)cv).vector[rowId];
                                    // Assert.Equal(adt, Date.daysToMillis((int)b));
                                    Assert.Equal(adt.Days, (int)b);
                                }
                                else if (a is HiveDecimal)
                                {
                                    // Decimals are stored as BigInteger, so convert and compare
                                    HiveDecimal dec = (HiveDecimal)a;
                                    HiveDecimal b   = ((DecimalColumnVector)cv).vector[i];
                                    Assert.Equal(dec, b);
                                }
                                else if (a is double)
                                {
                                    double b = ((DoubleColumnVector)cv).vector[rowId];
                                    Assert.Equal(a.ToString(), b.ToString());
                                }
                                else if (a is string)
                                {
                                    BytesColumnVector bcv = (BytesColumnVector)cv;
                                    string            b   = Encoding.UTF8.GetString(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]);
                                    Assert.Equal((string)a, b);
                                }
                                else if (a is int || a is long || a is sbyte || a is short)
                                {
                                    Assert.Equal(a.ToString(),
                                                 ((LongColumnVector)cv).vector[rowId].ToString());
                                }
                                else
                                {
                                    Assert.True(false);
                                }
                            }
                        }

                        // Check repeating
                        Assert.Equal(false, batch.cols[0].isRepeating);
                        Assert.Equal(false, batch.cols[1].isRepeating);
                        Assert.Equal(false, batch.cols[2].isRepeating);
                        Assert.Equal(true, batch.cols[3].isRepeating);
                        Assert.Equal(false, batch.cols[4].isRepeating);
                        Assert.Equal(false, batch.cols[5].isRepeating);
                        Assert.Equal(false, batch.cols[6].isRepeating);
                        Assert.Equal(false, batch.cols[7].isRepeating);
                        Assert.Equal(false, batch.cols[8].isRepeating);
                        Assert.Equal(false, batch.cols[9].isRepeating);

                        // Check non null
                        Assert.Equal(false, batch.cols[0].noNulls);
                        Assert.Equal(false, batch.cols[1].noNulls);
                        Assert.Equal(true, batch.cols[2].noNulls);
                        Assert.Equal(true, batch.cols[3].noNulls);
                        Assert.Equal(false, batch.cols[4].noNulls);
                        Assert.Equal(false, batch.cols[5].noNulls);
                        Assert.Equal(false, batch.cols[6].noNulls);
                        Assert.Equal(false, batch.cols[7].noNulls);
                        Assert.Equal(false, batch.cols[8].noNulls);
                        Assert.Equal(false, batch.cols[9].noNulls);
                    }
                    Assert.Equal(false, rr.hasNext());
                }
        }
Esempio n. 30
0
        public void testSerdeStatsOldFormat()
        {
            string testFile = Path.Combine(TestHelpers.ResourcesDirectory, "orc-file-11-format.orc");
            Reader reader   = OrcFile.createReader(testFile, OrcFile.readerOptions(conf));

            int  stripeCount   = 0;
            int  rowCount      = 0;
            long currentOffset = -1;

            foreach (StripeInformation stripe in reader.getStripes())
            {
                stripeCount += 1;
                rowCount    += (int)stripe.getNumberOfRows();
                if (currentOffset < 0)
                {
                    currentOffset = stripe.getOffset() + stripe.getIndexLength()
                                    + stripe.getDataLength() + stripe.getFooterLength();
                }
                else
                {
                    Assert.Equal(currentOffset, stripe.getOffset());
                    currentOffset += stripe.getIndexLength() + stripe.getDataLength()
                                     + stripe.getFooterLength();
                }
            }
            Assert.Equal(reader.getNumberOfRows(), rowCount);
#if JAVA_SIZE
            Assert.Equal(6300000, reader.getRawDataSize());
#endif
            Assert.Equal(2, stripeCount);

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(7500, stats[1].getNumberOfValues());
            Assert.Equal(3750, ((BooleanColumnStatistics)stats[1]).getFalseCount());
            Assert.Equal(3750, ((BooleanColumnStatistics)stats[1]).getTrueCount());
            Assert.Equal("count: 7500 hasNull: True true: 3750", stats[1].ToString());

            Assert.Equal(2048, ((IntegerColumnStatistics)stats[3]).getMaximum());
            Assert.Equal(1024, ((IntegerColumnStatistics)stats[3]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[3]).isSumDefined());
            Assert.Equal(11520000, ((IntegerColumnStatistics)stats[3]).getSum());
            Assert.Equal("count: 7500 hasNull: True min: 1024 max: 2048 sum: 11520000",
                         stats[3].ToString());

            Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMaximum());
            Assert.Equal(Int64.MaxValue, ((IntegerColumnStatistics)stats[5]).getMinimum());
            Assert.Equal(false, ((IntegerColumnStatistics)stats[5]).isSumDefined());
            Assert.Equal(
                "count: 7500 hasNull: True min: 9223372036854775807 max: 9223372036854775807",
                stats[5].ToString());

            Assert.Equal(-15.0, ((DoubleColumnStatistics)stats[7]).getMinimum());
            Assert.Equal(-5.0, ((DoubleColumnStatistics)stats[7]).getMaximum());
            Assert.Equal(-75000.0, ((DoubleColumnStatistics)stats[7]).getSum(), 5);
            Assert.Equal("count: 7500 hasNull: True min: -15 max: -5 sum: -75000",
                         stats[7].ToString());

            Assert.Equal("bye", ((StringColumnStatistics)stats[9]).getMinimum());
            Assert.Equal("hi", ((StringColumnStatistics)stats[9]).getMaximum());
            Assert.Equal(0, ((StringColumnStatistics)stats[9]).getSum());
            Assert.Equal("count: 7500 hasNull: True min: bye max: hi sum: 0", stats[9].ToString());

            // old orc format will not have binary statistics. ToString() will show only
            // the general column statistics
            Assert.Equal("count: 7500 hasNull: True", stats[8].ToString());

            // since old orc format doesn't support binary statistics,
            // this should throw ClassCastException
            Assert.Throws <InvalidCastException>(() => ((BinaryColumnStatistics)stats[8]).getSum());
        }
        public void testColumnsWithNullAndCompression()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .bufferSize(10000)))
                {
                    writer.addRow(new MyStruct(3, "a", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(null, "b", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(3, null, false,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(3, "d", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "e", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "f", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "g", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "h", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(8, reader.getNumberOfRows());
            Assert.Equal(8, stats[0].getNumberOfValues());

            Assert.Equal(3, ((IntegerColumnStatistics)stats[1]).getMaximum());
            Assert.Equal(2, ((IntegerColumnStatistics)stats[1]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
            Assert.Equal(17, ((IntegerColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17",
                         stats[1].ToString());

            Assert.Equal("h", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal(7, stats[2].getNumberOfValues());
            Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT,
                         readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                // only the last strip will have PRESENT stream
                List <bool> expected = new List <bool>();
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    expected.Add(false);
                }
                expected[expected.Count - 1] = true;

                List <bool> got = new List <bool>();
                // check if the strip footer contains PRESENT stream
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
                    got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
                }
                Assert.Equal(expected, got);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal("a", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 2
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Equal("b", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 3
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal(false, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));
            }
        }