Ejemplo n.º 1
0
 /**
  * Is this stream part of a dictionary?
  * @return is this part of a dictionary?
  */
 public static bool isDictionary(OrcProto.Stream.Types.Kind kind,
                                 OrcProto.ColumnEncoding encoding)
 {
     Debug.Assert(kind != OrcProto.Stream.Types.Kind.DICTIONARY_COUNT);
     OrcProto.ColumnEncoding.Types.Kind encodingKind = encoding.Kind;
     return(kind == OrcProto.Stream.Types.Kind.DICTIONARY_DATA ||
            (kind == OrcProto.Stream.Types.Kind.LENGTH &&
             (encodingKind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
              encodingKind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)));
 }
        public void testHalfDistinctCheckDisabled()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            int[] input = new int[20000];

            // conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .bufferSize(10000)))
                {
                    Random rand = new Random(123);
                    for (int i = 0; i < 20000; i++)
                    {
                        input[i] = rand.Next(10000);
                    }

                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(input[i].ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal(input[idx++].ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, encoding.Kind);
                    }
                }
            }
        }
        public void testTooManyDistinctV11AlwaysDictionary()
        {
            ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(string));

            using (Stream file = File.OpenWrite(TestFilePath))
                using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
                                                            .inspector(inspector)
                                                            .compress(CompressionKind.NONE)
                                                            .version(OrcFile.Version.V_0_11)
                                                            .bufferSize(10000)))
                {
                    for (int i = 0; i < 20000; i++)
                    {
                        writer.addRow(i.ToString());
                    }
                }

            Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

            using (RecordReader rows = reader.rows())
            {
                int idx = 0;
                while (rows.hasNext())
                {
                    object row = rows.next();
                    Assert.Equal((idx++).ToString(), row);
                }

                // make sure the encoding type is correct
                foreach (StripeInformation stripe in reader.getStripes())
                {
                    // hacky but does the job, this casting will work as long this test resides
                    // within the same package as ORC reader
                    OrcProto.StripeFooter footer = ((RecordReaderImpl)rows).readStripeFooter(stripe);
                    for (int i = 0; i < footer.ColumnsCount; ++i)
                    {
                        OrcProto.ColumnEncoding encoding = footer.GetColumns(i);
                        Assert.Equal(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY, encoding.Kind);
                    }
                }
            }
        }
Ejemplo n.º 4
0
        public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
                                                       bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
                                                       OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, bool hasNull,
                                                       long offset, long length, DiskRangeList.CreateHelper list, bool doMergeBuffers)
        {
            for (int group = 0; group < includedRowGroups.Length; ++group)
            {
                if (!includedRowGroups[group])
                {
                    continue;
                }
                int posn = getIndexPosition(
                    encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
                long start = (long)index.EntryList[group].PositionsList[posn];
                long nextGroupOffset;
                bool isLast = group == (includedRowGroups.Length - 1);
                nextGroupOffset = isLast ? length : (int)index.EntryList[group + 1].PositionsList[posn];

                start += offset;
                long end = offset + estimateRgEndOffset(
                    isCompressed, isLast, nextGroupOffset, length, compressionSize);
                list.addOrMerge(start, end, doMergeBuffers, true);
            }
        }
Ejemplo n.º 5
0
        public static void printJsonMetaData(List <string> files, Configuration conf,
                                             List <int> rowIndexCols, bool prettyPrint, bool printTimeZone)
        {
            JsonWriter writer    = new JsonWriter();
            bool       multiFile = files.Count > 1;

            if (multiFile)
            {
                writer.array();
            }
            else
            {
                writer.newObject();
            }
            foreach (string filename in files)
            {
                if (multiFile)
                {
                    writer.newObject();
                }
                writer.key("fileName").value(Path.GetFileName(filename));
                Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
                writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
                writer.key("writerVersion").value(reader.getWriterVersion().ToString());
                using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
                {
                    writer.key("numberOfRows").value(reader.getNumberOfRows());
                    writer.key("compression").value(reader.getCompression().ToString());
                    if (reader.getCompression() != CompressionKind.NONE)
                    {
                        writer.key("compressionBufferSize").value(reader.getCompressionSize());
                    }
                    writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
                    writer.key("schema").array();
                    writeSchema(writer, reader.getTypes());
                    writer.endArray();

                    writer.key("stripeStatistics").array();
                    List <StripeStatistics> stripeStatistics = reader.getStripeStatistics();
                    for (int n = 0; n < stripeStatistics.Count; n++)
                    {
                        writer.newObject();
                        writer.key("stripeNumber").value(n + 1);
                        StripeStatistics ss = stripeStatistics[n];
                        writer.key("columnStatistics").array();
                        for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                        {
                            writer.newObject();
                            writer.key("columnId").value(i);
                            writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                            writer.endObject();
                        }
                        writer.endArray();
                        writer.endObject();
                    }
                    writer.endArray();

                    ColumnStatistics[] stats = reader.getStatistics();
                    int colCount             = stats.Length;
                    writer.key("fileStatistics").array();
                    for (int i = 0; i < stats.Length; ++i)
                    {
                        writer.newObject();
                        writer.key("columnId").value(i);
                        writeColumnStatistics(writer, stats[i]);
                        writer.endObject();
                    }
                    writer.endArray();

                    writer.key("stripes").array();
                    int stripeIx = -1;
                    foreach (StripeInformation stripe in reader.getStripes())
                    {
                        ++stripeIx;
                        long stripeStart             = stripe.getOffset();
                        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                        writer.newObject(); // start of stripe information
                        writer.key("stripeNumber").value(stripeIx + 1);
                        writer.key("stripeInformation");
                        writeStripeInformation(writer, stripe);
                        if (printTimeZone)
                        {
                            writer.key("writerTimezone").value(
                                footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                        }
                        long sectionStart = stripeStart;

                        writer.key("streams").array();
                        foreach (OrcProto.Stream section in footer.StreamsList)
                        {
                            writer.newObject();
                            string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                            writer.key("columnId").value(section.Column);
                            writer.key("section").value(kind);
                            writer.key("startOffset").value(sectionStart);
                            writer.key("length").value(section.Length);
                            sectionStart += (long)section.Length;
                            writer.endObject();
                        }
                        writer.endArray();

                        writer.key("encodings").array();
                        for (int i = 0; i < footer.ColumnsCount; ++i)
                        {
                            writer.newObject();
                            OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                            writer.key("columnId").value(i);
                            writer.key("kind").value(encoding.Kind.ToString());
                            if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                                encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                            {
                                writer.key("dictionarySize").value(encoding.DictionarySize);
                            }
                            writer.endObject();
                        }
                        writer.endArray();

                        if (rowIndexCols != null && rowIndexCols.Count != 0)
                        {
                            // include the columns that are specified, only if the columns are included, bloom filter
                            // will be read
                            bool[] sargColumns = new bool[colCount];
                            foreach (int colIdx in rowIndexCols)
                            {
                                sargColumns[colIdx] = true;
                            }
                            RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                            writer.key("indexes").array();
                            foreach (int col in rowIndexCols)
                            {
                                writer.newObject();
                                writer.key("columnId").value(col);
                                writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                                writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                                writer.endObject();
                            }
                            writer.endArray();
                        }
                        writer.endObject(); // end of stripe information
                    }
                    writer.endArray();

                    long fileLen     = new FileInfo(filename).Length;
                    long paddedBytes = FileDump.getTotalPaddingSize(reader);
                    // empty ORC file is ~45 bytes. Assumption here is file length always >0
                    double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
                    writer.key("fileLength").value(fileLen);
                    writer.key("paddingLength").value(paddedBytes);
                    writer.key("paddingRatio").value(percentPadding);
                    rows.close();
                }

                writer.endObject();
            }
            if (multiFile)
            {
                writer.endArray();
            }

            if (prettyPrint)
            {
#if false
                string prettyJson;
                if (multiFile)
                {
                    JSONArray jsonArray = new JSONArray(writer.toString());
                    prettyJson = jsonArray.toString(2);
                }
                else
                {
                    JSONObject jsonObject = new JSONObject(writer.toString());
                    prettyJson = jsonObject.toString(2);
                }
#else
                string prettyJson = writer.ToString();
#endif
                System.Console.WriteLine(prettyJson);
            }
            else
            {
                System.Console.WriteLine(writer.ToString());
            }
        }
 public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding)
 {
     this.columnEncoding = encoding;
     return this;
 }