/// <summary>
/// Reads the test file twice -- once through the vectorized reader and once
/// through the row-by-row reader -- and asserts both produce identical values
/// for every row and column, then checks the expected isRepeating/noNulls
/// flags for each of the 10 columns and that the row reader is exhausted.
/// </summary>
private void checkVectorizedReader()
{
    Reader vreader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    using (RecordReaderImpl vrr = (RecordReaderImpl)vreader.rows())
    using (RecordReaderImpl rr = (RecordReaderImpl)reader.rows())
    {
        VectorizedRowBatch batch = null;

        // Check Vectorized ORC reader against ORC row reader
        while (vrr.hasNext())
        {
            batch = vrr.nextBatch(batch);
            for (int i = 0; i < batch.size; i++)
            {
                OrcStruct row = (OrcStruct)rr.next();
                for (int j = 0; j < batch.cols.Length; j++)
                {
                    object a = (row.getFieldValue(j));
                    ColumnVector cv = batch.cols[j];
                    // if the value is repeating, use row 0
                    int rowId = cv.isRepeating ? 0 : i;

                    // make sure the null flag agrees
                    if (a == null)
                    {
                        Assert.True(!cv.noNulls && cv.isNull[rowId]);
                    }
                    else if (a is bool)
                    {
                        // bool values are stores a 1's and 0's, so convert and compare
                        long temp = (bool)a ? 1 : 0;
                        long b = ((LongColumnVector)cv).vector[rowId];
                        Assert.Equal(temp.ToString(), b.ToString());
                    }
                    else if (a is Timestamp)
                    {
                        // Timestamps are stored as long, so convert and compare
                        Timestamp t = (Timestamp)a;
                        // Timestamp.getTime() is overriden and is
                        //   long time = super.getTime();
                        //   return (time + (nanos / 1000000));
                        long timeInNanoSec = (t.Milliseconds * 1000000) + (t.getNanos() % 1000000);
                        long b = ((LongColumnVector)cv).vector[rowId];
                        Assert.Equal(timeInNanoSec.ToString(), b.ToString());
                    }
                    else if (a is Date)
                    {
                        // Dates are stored as long, so convert and compare
                        Date adt = (Date)a;
                        long b = ((LongColumnVector)cv).vector[rowId];
                        // Assert.Equal(adt, Date.daysToMillis((int)b));
                        Assert.Equal(adt.Days, (int)b);
                    }
                    else if (a is HiveDecimal)
                    {
                        // Decimals are stored as BigInteger, so convert and compare.
                        // FIX: index with rowId (not i) so a repeating decimal column
                        // is read from entry 0, consistent with every other branch.
                        HiveDecimal dec = (HiveDecimal)a;
                        HiveDecimal b = ((DecimalColumnVector)cv).vector[rowId];
                        Assert.Equal(dec, b);
                    }
                    else if (a is double)
                    {
                        double b = ((DoubleColumnVector)cv).vector[rowId];
                        Assert.Equal(a.ToString(), b.ToString());
                    }
                    else if (a is string)
                    {
                        BytesColumnVector bcv = (BytesColumnVector)cv;
                        string b = Encoding.UTF8.GetString(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]);
                        Assert.Equal((string)a, b);
                    }
                    else if (a is int || a is long || a is sbyte || a is short)
                    {
                        // All integer-family values land in a LongColumnVector.
                        Assert.Equal(a.ToString(), ((LongColumnVector)cv).vector[rowId].ToString());
                    }
                    else
                    {
                        // Unexpected field type in the test schema.
                        Assert.True(false);
                    }
                }
            }

            // Check repeating
            Assert.False(batch.cols[0].isRepeating);
            Assert.False(batch.cols[1].isRepeating);
            Assert.False(batch.cols[2].isRepeating);
            Assert.True(batch.cols[3].isRepeating);
            Assert.False(batch.cols[4].isRepeating);
            Assert.False(batch.cols[5].isRepeating);
            Assert.False(batch.cols[6].isRepeating);
            Assert.False(batch.cols[7].isRepeating);
            Assert.False(batch.cols[8].isRepeating);
            Assert.False(batch.cols[9].isRepeating);

            // Check non null
            Assert.False(batch.cols[0].noNulls);
            Assert.False(batch.cols[1].noNulls);
            Assert.True(batch.cols[2].noNulls);
            Assert.True(batch.cols[3].noNulls);
            Assert.False(batch.cols[4].noNulls);
            Assert.False(batch.cols[5].noNulls);
            Assert.False(batch.cols[6].noNulls);
            Assert.False(batch.cols[7].noNulls);
            Assert.False(batch.cols[8].noNulls);
            Assert.False(batch.cols[9].noNulls);
        }

        // The row reader must be exhausted at exactly the same point.
        Assert.False(rr.hasNext());
    }
}
/// <summary>
/// Dumps ORC file metadata as JSON: file/writer version, schema, stripe and
/// file statistics, per-stripe stream/encoding details and (optionally) row
/// group indexes and bloom filters, plus padding information.
/// </summary>
/// <param name="files">ORC file paths to dump; more than one produces a JSON array.</param>
/// <param name="conf">Reader configuration.</param>
/// <param name="rowIndexCols">Column ids whose row-group indexes/bloom filters
/// should be included; null or empty skips index output.</param>
/// <param name="prettyPrint">Pretty-print the JSON (currently falls back to
/// the writer's plain output -- see the #if false block).</param>
/// <param name="printTimeZone">Include each stripe's writer timezone.</param>
public static void printJsonMetaData(List<string> files, Configuration conf,
    List<int> rowIndexCols, bool prettyPrint, bool printTimeZone)
{
    JsonWriter writer = new JsonWriter();
    bool multiFile = files.Count > 1;
    if (multiFile)
    {
        writer.array();
    }
    else
    {
        writer.newObject();
    }
    foreach (string filename in files)
    {
        if (multiFile)
        {
            writer.newObject();
        }
        writer.key("fileName").value(Path.GetFileName(filename));
        Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
        writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
        writer.key("writerVersion").value(reader.getWriterVersion().ToString());
        using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
        {
            writer.key("numberOfRows").value(reader.getNumberOfRows());
            writer.key("compression").value(reader.getCompression().ToString());
            if (reader.getCompression() != CompressionKind.NONE)
            {
                writer.key("compressionBufferSize").value(reader.getCompressionSize());
            }
            writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
            writer.key("schema").array();
            writeSchema(writer, reader.getTypes());
            writer.endArray();

            // Per-stripe column statistics.
            writer.key("stripeStatistics").array();
            List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
            for (int n = 0; n < stripeStatistics.Count; n++)
            {
                writer.newObject();
                writer.key("stripeNumber").value(n + 1);
                StripeStatistics ss = stripeStatistics[n];
                writer.key("columnStatistics").array();
                for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                {
                    writer.newObject();
                    writer.key("columnId").value(i);
                    writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                    writer.endObject();
                }
                writer.endArray();
                writer.endObject();
            }
            writer.endArray();

            // Whole-file column statistics.
            ColumnStatistics[] stats = reader.getStatistics();
            int colCount = stats.Length;
            writer.key("fileStatistics").array();
            for (int i = 0; i < stats.Length; ++i)
            {
                writer.newObject();
                writer.key("columnId").value(i);
                writeColumnStatistics(writer, stats[i]);
                writer.endObject();
            }
            writer.endArray();

            // Per-stripe structural information.
            writer.key("stripes").array();
            int stripeIx = -1;
            foreach (StripeInformation stripe in reader.getStripes())
            {
                ++stripeIx;
                long stripeStart = stripe.getOffset();
                OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                writer.newObject(); // start of stripe information
                writer.key("stripeNumber").value(stripeIx + 1);
                writer.key("stripeInformation");
                writeStripeInformation(writer, stripe);
                if (printTimeZone)
                {
                    writer.key("writerTimezone").value(
                        footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                }
                // Stream offsets are cumulative from the stripe start.
                long sectionStart = stripeStart;
                writer.key("streams").array();
                foreach (OrcProto.Stream section in footer.StreamsList)
                {
                    writer.newObject();
                    string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                    writer.key("columnId").value(section.Column);
                    writer.key("section").value(kind);
                    writer.key("startOffset").value(sectionStart);
                    writer.key("length").value(section.Length);
                    sectionStart += (long)section.Length;
                    writer.endObject();
                }
                writer.endArray();

                writer.key("encodings").array();
                for (int i = 0; i < footer.ColumnsCount; ++i)
                {
                    writer.newObject();
                    OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                    writer.key("columnId").value(i);
                    writer.key("kind").value(encoding.Kind.ToString());
                    if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                        encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                    {
                        writer.key("dictionarySize").value(encoding.DictionarySize);
                    }
                    writer.endObject();
                }
                writer.endArray();

                if (rowIndexCols != null && rowIndexCols.Count != 0)
                {
                    // include the columns that are specified, only if the columns are included, bloom filter
                    // will be read
                    bool[] sargColumns = new bool[colCount];
                    foreach (int colIdx in rowIndexCols)
                    {
                        sargColumns[colIdx] = true;
                    }
                    RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                    writer.key("indexes").array();
                    foreach (int col in rowIndexCols)
                    {
                        writer.newObject();
                        writer.key("columnId").value(col);
                        writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                        writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                        writer.endObject();
                    }
                    writer.endArray();
                }
                writer.endObject(); // end of stripe information
            }
            writer.endArray();

            long fileLen = new FileInfo(filename).Length;
            long paddedBytes = FileDump.getTotalPaddingSize(reader);
            // empty ORC file is ~45 bytes. Assumption here is file length always >0
            double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
            writer.key("fileLength").value(fileLen);
            writer.key("paddingLength").value(paddedBytes);
            writer.key("paddingRatio").value(percentPadding);
            // FIX: removed the explicit rows.close() here -- the enclosing
            // using block already disposes the reader; closing twice was redundant.
        }
        writer.endObject();
    }
    if (multiFile)
    {
        writer.endArray();
    }

    if (prettyPrint)
    {
#if false
        string prettyJson;
        if (multiFile)
        {
            JSONArray jsonArray = new JSONArray(writer.toString());
            prettyJson = jsonArray.toString(2);
        }
        else
        {
            JSONObject jsonObject = new JSONObject(writer.toString());
            prettyJson = jsonObject.toString(2);
        }
#else
        // Pretty-printing via JSONObject/JSONArray is disabled in this port;
        // fall back to the writer's raw output.
        string prettyJson = writer.ToString();
#endif
        System.Console.WriteLine(prettyJson);
    }
    else
    {
        System.Console.WriteLine(writer.ToString());
    }
}