コード例 #1
0
        public void testHalfDistinctCheckDisabled()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            int[] input = new int[20000];

            // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rand = new Random(123);
                    for (int i = 0; i < 20000; i++)
                    {
                        input[i] = rand.Next(10000);
                    }

                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(input[i].ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++].ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
                    }
                }
            }
        }
コード例 #2
0
        public void testTooManyDistinctV11AlwaysDictionary()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .version(OrcFile.Version.V_0_11)
                                                            .bufferSize(10000)))
                {
                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(i.ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal((idx++).ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
                    }
                }
            }
        }
コード例 #3
0
        public static void printJsonMetaData(List <string> files, Configuration conf,
                                             List <int> rowIndexCols, bool prettyPrint, bool printTimeZone)
        {
            JsonWriter writer    = new JsonWriter();
            bool       multiFile = files.Count > 1;

            if (multiFile)
            {
                writer.array();
            }
            else
            {
                writer.newObject();
            }
            foreach (string filename in files)
            {
                if (multiFile)
                {
                    writer.newObject();
                }
                writer.key("fileName").value(Path.GetFileName(filename));
                Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
                writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
                writer.key("writerVersion").value(reader.getWriterVersion().ToString());
                using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
                {
                    writer.key("numberOfRows").value(reader.getNumberOfRows());
                    writer.key("compression").value(reader.getCompression().ToString());
                    if (reader.getCompression() != CompressionKind.NONE)
                    {
                        writer.key("compressionBufferSize").value(reader.getCompressionSize());
                    }
                    writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
                    writer.key("schema").array();
                    writeSchema(writer, reader.getTypes());
                    writer.endArray();

                    writer.key("stripeStatistics").array();
                    List <StripeStatistics> stripeStatistics = reader.getStripeStatistics();
                    for (int n = 0; n < stripeStatistics.Count; n++)
                    {
                        writer.newObject();
                        writer.key("stripeNumber").value(n + 1);
                        StripeStatistics ss = stripeStatistics[n];
                        writer.key("columnStatistics").array();
                        for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                        {
                            writer.newObject();
                            writer.key("columnId").value(i);
                            writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                            writer.endObject();
                        }
                        writer.endArray();
                        writer.endObject();
                    }
                    writer.endArray();

                    ColumnStatistics[] stats = reader.getStatistics();
                    int colCount             = stats.Length;
                    writer.key("fileStatistics").array();
                    for (int i = 0; i < stats.Length; ++i)
                    {
                        writer.newObject();
                        writer.key("columnId").value(i);
                        writeColumnStatistics(writer, stats[i]);
                        writer.endObject();
                    }
                    writer.endArray();

                    writer.key("stripes").array();
                    int stripeIx = -1;
                    foreach (StripeInformation stripe in reader.getStripes())
                    {
                        ++stripeIx;
                        long stripeStart             = stripe.getOffset();
                        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                        writer.newObject(); // start of stripe information
                        writer.key("stripeNumber").value(stripeIx + 1);
                        writer.key("stripeInformation");
                        writeStripeInformation(writer, stripe);
                        if (printTimeZone)
                        {
                            writer.key("writerTimezone").value(
                                footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                        }
                        long sectionStart = stripeStart;

                        writer.key("streams").array();
                        foreach (OrcProto.Stream section in footer.StreamsList)
                        {
                            writer.newObject();
                            string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                            writer.key("columnId").value(section.Column);
                            writer.key("section").value(kind);
                            writer.key("startOffset").value(sectionStart);
                            writer.key("length").value(section.Length);
                            sectionStart += (long)section.Length;
                            writer.endObject();
                        }
                        writer.endArray();

                        writer.key("encodings").array();
                        for (int i = 0; i < footer.ColumnsCount; ++i)
                        {
                            writer.newObject();
                            OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                            writer.key("columnId").value(i);
                            writer.key("kind").value(encoding.Kind.ToString());
                            if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                                encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                            {
                                writer.key("dictionarySize").value(encoding.DictionarySize);
                            }
                            writer.endObject();
                        }
                        writer.endArray();

                        if (rowIndexCols != null && rowIndexCols.Count != 0)
                        {
                            // include the columns that are specified, only if the columns are included, bloom filter
                            // will be read
                            bool[] sargColumns = new bool[colCount];
                            foreach (int colIdx in rowIndexCols)
                            {
                                sargColumns[colIdx] = true;
                            }
                            RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                            writer.key("indexes").array();
                            foreach (int col in rowIndexCols)
                            {
                                writer.newObject();
                                writer.key("columnId").value(col);
                                writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                                writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                                writer.endObject();
                            }
                            writer.endArray();
                        }
                        writer.endObject(); // end of stripe information
                    }
                    writer.endArray();

                    long fileLen     = new FileInfo(filename).Length;
                    long paddedBytes = FileDump.getTotalPaddingSize(reader);
                    // empty ORC file is ~45 bytes. Assumption here is file length always >0
                    double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
                    writer.key("fileLength").value(fileLen);
                    writer.key("paddingLength").value(paddedBytes);
                    writer.key("paddingRatio").value(percentPadding);
                    rows.close();
                }

                writer.endObject();
            }
            if (multiFile)
            {
                writer.endArray();
            }

            if (prettyPrint)
            {
#if false
                string prettyJson;
                if (multiFile)
                {
                    JSONArray jsonArray = new JSONArray(writer.toString());
                    prettyJson = jsonArray.toString(2);
                }
                else
                {
                    JSONObject jsonObject = new JSONObject(writer.toString());
                    prettyJson = jsonObject.toString(2);
                }
#else
                string prettyJson = writer.ToString();
#endif
                System.Console.WriteLine(prettyJson);
            }
            else
            {
                System.Console.WriteLine(writer.ToString());
            }
        }
コード例 #4
0
        public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
                                                   OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
                                                   bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
        {
            if (footer == null)
            {
                footer = readStripeFooter(stripe);
            }
            if (indexes == null)
            {
                indexes = new OrcProto.RowIndex[typeCount];
            }
            if (bloomFilterIndices == null)
            {
                bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
            }
            long offset = stripe.getOffset();
            IList <OrcProto.Stream> streams = footer.StreamsList;

            for (int i = 0; i < streams.Count; i++)
            {
                OrcProto.Stream stream     = streams[i];
                OrcProto.Stream nextStream = null;
                if (i < streams.Count - 1)
                {
                    nextStream = streams[i + 1];
                }
                int col = (int)stream.Column;
                int len = (int)stream.Length;
                // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
                // filter and combine the io to read row index and bloom filters for that column together
                if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
                {
                    bool readBloomFilter = false;
                    if (sargColumns != null && sargColumns[col] &&
                        nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
                    {
                        len            += (int)nextStream.Length;
                        i              += 1;
                        readBloomFilter = true;
                    }
                    if ((included == null || included[col]) && indexes[col] == null)
                    {
                        byte[] buffer = new byte[len];
                        file.readFully(offset, buffer, 0, buffer.Length);
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                                                                                   new List <DiskRange> {
                            new RecordReaderImpl.BufferChunk(bb, 0)
                        },
                                                                                   (long)stream.Length, codec, bufferSize));
                        if (readBloomFilter)
                        {
                            bb.position((int)stream.Length);
                            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                                                                                              null, "bloom_filter", new List <DiskRange> {
                                new RecordReaderImpl.BufferChunk(bb, 0)
                            },
                                                                                              (long)nextStream.Length, codec, bufferSize));
                        }
                    }
                }
                offset += len;
            }

            RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
            return(index);
        }
コード例 #5
0
        public void testMultiStripeWithNull()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rand = new Random(100);
                    writer.addRow(new MyStruct(null, null, true, new List <InnerStruct> {
                        new InnerStruct(100)
                    }));
                    for (int i = 2; i < 20000; i++)
                    {
                        writer.addRow(new MyStruct(rand.Next(1), "a", true, new List <InnerStruct> {
                            new InnerStruct(100)
                        }));
                    }
                    writer.addRow(new MyStruct(null, null, true, new List <InnerStruct> {
                        new InnerStruct(100)
                    }));
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(20000, reader.getNumberOfRows());
            Assert.Equal(20000, stats[0].getNumberOfValues());

            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMaximum());
            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
            Assert.Equal(0, ((IntegerColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 19998 hasNull: True min: 0 max: 0 sum: 0",
                         stats[1].ToString());

            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal(19998, stats[2].getNumberOfValues());
            Assert.Equal("count: 19998 hasNull: True min: a max: a sum: 19998",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector =
                (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT,
                         readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                List <bool> expected = new List <bool>();
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    expected.Add(false);
                }
                // only the first and last stripe will have PRESENT stream
                expected[0] = true;
                expected[expected.Count - 1] = true;

                List <bool> got = new List <bool>();
                // check if the strip footer contains PRESENT stream
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    OrcProto.StripeFooter sf =
                        ((RecordReaderImpl)rows).readStripeFooter(sinfo);
                    got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
                }
                Assert.Equal(expected, got);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                rows.seekToRow(19998);
                // last-1 row
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.NotNull(row.getFieldValue(1));
                Assert.Equal(0, row.getFieldValue(0));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // last row
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));
            }
        }
コード例 #6
0
        public void testColumnsWithNullAndCompression()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(MyStruct));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .stripeSize(100000)
                                                            .bufferSize(10000)))
                {
                    writer.addRow(new MyStruct(3, "a", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(null, "b", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(3, null, false,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(3, "d", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "e", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "f", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "g", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                    writer.addRow(new MyStruct(2, "h", true,
                                               Lists.newArrayList(new InnerStruct(100))));
                }

            Reader reader = OrcFile.createReader(TestFilePath,
                                                 OrcFile.readerOptions(conf));

            // check the stats
            ColumnStatistics[] stats = reader.getStatistics();
            Assert.Equal(8, reader.getNumberOfRows());
            Assert.Equal(8, stats[0].getNumberOfValues());

            Assert.Equal(3, ((IntegerColumnStatistics)stats[1]).getMaximum());
            Assert.Equal(2, ((IntegerColumnStatistics)stats[1]).getMinimum());
            Assert.Equal(true, ((IntegerColumnStatistics)stats[1]).isSumDefined());
            Assert.Equal(17, ((IntegerColumnStatistics)stats[1]).getSum());
            Assert.Equal("count: 7 hasNull: True min: 2 max: 3 sum: 17",
                         stats[1].ToString());

            Assert.Equal("h", ((StringColumnStatistics)stats[2]).getMaximum());
            Assert.Equal("a", ((StringColumnStatistics)stats[2]).getMinimum());
            Assert.Equal(7, stats[2].getNumberOfValues());
            Assert.Equal("count: 7 hasNull: True min: a max: h sum: 7",
                         stats[2].ToString());

            // check the inspectors
            StructObjectInspector readerInspector = (StructObjectInspector)reader.getObjectInspector();

            Assert.Equal(ObjectInspectorCategory.STRUCT,
                         readerInspector.getCategory());
            Assert.Equal("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
                         readerInspector.getTypeName());

            using (RecordReader rows = reader.rows())
            {
                // only the last strip will have PRESENT stream
                List <bool> expected = new List <bool>();
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    expected.Add(false);
                }
                expected[expected.Count - 1] = true;

                List <bool> got = new List <bool>();
                // check if the strip footer contains PRESENT stream
                foreach (StripeInformation sinfo in reader.getStripes())
                {
                    OrcProto.StripeFooter sf = ((RecordReaderImpl)rows).readStripeFooter(sinfo);
                    got.Add(sf.ToString().IndexOf(OrcProto.Stream.Types.Kind.PRESENT.ToString()) != -1);
                }
                Assert.Equal(expected, got);

                // row 1
                OrcStruct row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal("a", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 2
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(0));
                Assert.Equal("b", row.getFieldValue(1).ToString());
                Assert.Equal(true, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));

                // row 3
                row = (OrcStruct)rows.next();
                Assert.NotNull(row);
                Assert.Null(row.getFieldValue(1));
                Assert.Equal(3, row.getFieldValue(0));
                Assert.Equal(false, row.getFieldValue(2));
                Assert.Equal(100, ((OrcStruct)((IList <object>)row.getFieldValue(3))[0]).
                             getFieldValue(0));
            }
        }