/**
  * To handle multiple INSERT... statements in a single transaction, we need to make
  * sure that a unique {@code rowId} is generated for every inserted row of the
  * transaction.
  * @return the largest rowId created by previous statements of this transaction (may be 0)
  * @throws IOException if a previous delta file cannot be read
  */
 private long findRowIdOffsetForInsert()
 {
      /*
       * 1. need to know the bucket we are writing to
       * 2. need to know which delta dir it's in
       * Then,
       * 1. find the same bucket file in the previous delta dir for this txn
       * 2. read its footer and get AcidStats, which has the insert count
       * 2.1 if AcidStats.inserts > 0, we're done;
       *     else go to the previous delta file
       * For example, consider the insert/update/insert case...
       */
     if (options.getStatementId() <= 0)
     {
          return 0; // there is only 1 statement in this transaction (so far)
     }
     for (int pastStmt = options.getStatementId() - 1; pastStmt >= 0; pastStmt--)
     {
         Path matchingBucket = AcidUtils.createFilename(options.getFinalDestination(), options.clone().statementId(pastStmt));
         if (!fs.exists(matchingBucket))
         {
             continue;
         }
         Reader reader = OrcFile.createReader(matchingBucket, OrcFile.readerOptions(options.getConfiguration()));
         //no close() on Reader?!
         AcidStats acidStats = parseAcidStats(reader);
         if (acidStats.inserts > 0)
         {
              return acidStats.inserts;
         }
     }
     //if we got here, we looked at all delta files in this txn, prior to current statement and didn't
     //find any inserts...
      return 0;
 }
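 /*
  * Illustrative only (not in the original source): shows how the offset above keeps
  * rowIds unique across the statements of one transaction. "insertedSoFar" is a
  * hypothetical per-statement counter.
  */
 private long nextRowIdSketch(long rowIdOffset, long insertedSoFar)
 {
     // statement N starts numbering its rows where statements 0..N-1 stopped
     return rowIdOffset + insertedSoFar;
 }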
 /** Ctor used when creating file info during init and when getting a new one. */
 public FileMetaInfo(CompressionKind compressionKind, int bufferSize, int metadataSize,
     ByteBuffer footerBuffer, IList<uint> versionList, OrcFile.WriterVersion writerVersion,
     ByteBuffer fullFooterBuffer)
 {
     this.compressionKind = compressionKind;
     this.bufferSize = bufferSize;
     this.metadataSize = metadataSize;
     this.footerBuffer = footerBuffer;
     this.versionList = versionList;
     this.writerVersion = writerVersion;
     this.footerMetaAndPsBuffer = fullFooterBuffer;
 }
        OrcRecordUpdater(Path path,
                         AcidOutputFormat.Options options)
        {
            this.options = options;
            this.bucket.set(options.getBucket());
            this.path = AcidUtils.createFilename(path, options);
            FileSystem fs = options.getFilesystem();

            if (fs == null)
            {
                fs = path.getFileSystem(options.getConfiguration());
            }
            this.fs = fs;
            try
            {
                FSDataOutputStream strm = fs.create(new Path(path, ACID_FORMAT), false);
                strm.writeInt(ORC_ACID_VERSION);
                strm.close();
            }
            catch (IOException ioe)
            {
                if (LOG.isDebugEnabled())
                {
                    LOG.debug("Failed to create " + path + "/" + ACID_FORMAT + " with " +
                              ioe);
                }
            }
            if (options.getMinimumTransactionId() != options.getMaximumTransactionId() &&
                !options.isWritingBase())
            {
                flushLengths = fs.create(getSideFile(this.path), true, 8,
                                         options.getReporter());
            }
            else
            {
                flushLengths = null;
            }
            OrcFile.WriterOptions writerOptions = null;
            if (options is OrcOptions)
            {
                writerOptions = ((OrcOptions)options).getOrcOptions();
            }
            if (writerOptions == null)
            {
                writerOptions = OrcFile.writerOptions( /* options.getTableProperties(), */
                    options.getConfiguration());
            }
            writerOptions.fileSystem(fs).callback(indexBuilder);
            if (!options.isWritingBase())
            {
                writerOptions.blockPadding(false);
                writerOptions.bufferSize(DELTA_BUFFER_SIZE);
                writerOptions.stripeSize(DELTA_STRIPE_SIZE);
            }
            rowInspector = (StructObjectInspector)options.getInspector();
            writerOptions.inspector(createEventSchema(findRecId(options.getInspector(),
                                                                options.getRecordIdColumn())));
            this.writer = OrcFile.createWriter(this.path, writerOptions);
            item        = new OrcStruct(FIELDS);
            item.setFieldValue(OPERATION, operation);
            item.setFieldValue(CURRENT_TRANSACTION, currentTransaction);
            item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction);
            item.setFieldValue(BUCKET, bucket);
            item.setFieldValue(ROW_ID, rowId);
        }
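        /*
         * Minimal usage sketch. Assumption: the AcidOutputFormat.Options builder in this
         * port mirrors Hive's Java API (the setter names below are not confirmed by this
         * file), so the snippet is left commented out.
         */
        // AcidOutputFormat.Options opts = new AcidOutputFormat.Options(conf)
        //     .inspector(rowInspector)        // row schema, must contain the record-id column
        //     .bucket(0)                      // bucket this writer owns
        //     .minimumTransactionId(txnId)
        //     .maximumTransactionId(txnId)    // equal min/max => no flush-length side file
        //     .finalDestination(partitionDir);
        // OrcRecordUpdater updater = new OrcRecordUpdater(partitionDir, opts);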
 /** Ctor used when reading splits - no version list or full footer buffer. */
 public FileMetaInfo(CompressionKind compressionKind, int bufferSize, int metadataSize,
     ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion)
     : this(compressionKind, bufferSize, metadataSize, footerBuffer, null,
             writerVersion, null)
 {
 }
        /**
         * Create a reader that merge sorts the ACID events together.
         * @param conf the configuration
         * @param collapseEvents should the events on the same row be collapsed
         * @param reader the reader for the base file (may be null)
         * @param isOriginal is the base file a pre-acid file
         * @param bucket the bucket we are reading
         * @param validTxnList the list of transactions that are valid to read
         * @param options the options to read with
         * @param deltaDirectory the list of delta directories to include
         * @throws IOException if the row schema cannot be determined
         */
        OrcRawRecordMerger(Configuration conf,
                           bool collapseEvents,
                           Reader reader,
                           bool isOriginal,
                           int bucket,
                           ValidTxnList validTxnList,
                           Reader.Options options,
                           Path[] deltaDirectory)
        {
            this.conf         = conf;
            this.collapse     = collapseEvents;
            this.offset       = options.getOffset();
            this.length       = options.getLength();
            this.validTxnList = validTxnList;
            TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);

            if (typeDescr == null)
            {
                throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
            }

            objectInspector = OrcRecordUpdater.createEventSchema(
                OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

            // modify the options to reflect the event instead of the base row
            Reader.Options eventOptions = createEventOptions(options);
            if (reader == null)
            {
                baseReader = null;
            }
            else
            {
                // find the min/max based on the offset and length
                if (isOriginal)
                {
                    discoverOriginalKeyBounds(reader, bucket, options);
                }
                else
                {
                    discoverKeyBounds(reader, options);
                }
                LOG.info("min key = " + minKey + ", max key = " + maxKey);
                // use the min/max instead of the byte range
                ReaderPair pair;
                ReaderKey  key = new ReaderKey();
                if (isOriginal)
                {
                    options = options.clone();
                    options.range(options.getOffset(), Long.MAX_VALUE);
                    pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
                                                  options);
                }
                else
                {
                    pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
                                          eventOptions, 0);
                }

                // if there is at least one record, put it in the map
                if (pair.nextRecord != null)
                {
                    readers.put(key, pair);
                }
                baseReader = pair.recordReader;
            }

            // we always want to read all of the deltas
            eventOptions.range(0, Long.MAX_VALUE);
            if (deltaDirectory != null)
            {
                foreach (Path delta in deltaDirectory)
                {
                    ReaderKey             key       = new ReaderKey();
                    Path                  deltaFile = AcidUtils.createBucketFile(delta, bucket);
                    AcidUtils.ParsedDelta deltaDir  = AcidUtils.parsedDelta(delta);
                    FileSystem            fs        = deltaFile.getFileSystem(conf);
                    long                  length    = getLastFlushLength(fs, deltaFile);
                    if (length != -1 && fs.exists(deltaFile))
                    {
                        Reader deltaReader = OrcFile.createReader(deltaFile,
                                                                  OrcFile.readerOptions(conf).maxLength(length));
                        Reader.Options deltaEventOptions = null;
                        if (eventOptions.getSearchArgument() != null)
                        {
                            // Turn off the sarg before pushing it down to the delta. We never want to
                            // push a sarg to a delta because it can produce wrong results (if the latest
                            // valid version of a record is filtered out by the sarg) or
                            // ArrayIndexOutOfBounds errors (when the sarg is applied to a delete record),
                            // unless the delta contains only insert events.
                            OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
                            if (acidStats.deletes > 0 || acidStats.updates > 0)
                            {
                                deltaEventOptions = eventOptions.clone().searchArgument(null, null);
                            }
                        }
                        ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket,
                            minKey, maxKey,
                            deltaEventOptions != null ? deltaEventOptions : eventOptions,
                            deltaDir.getStatementId());
                        if (deltaPair.nextRecord != null)
                        {
                            readers.put(key, deltaPair);
                        }
                    }
                }
            }

            // get the first record
            Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
            if (entry == null)
            {
                columns = 0;
                primary = null;
            }
            else
            {
                primary = entry.getValue();
                if (readers.isEmpty())
                {
                    secondaryKey = null;
                }
                else
                {
                    secondaryKey = readers.firstKey();
                }
                // get the number of columns in the user's rows
                columns = primary.getColumns();
            }
        }
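        /*
         * Ordering sketch (informational, not in the original source). The readers map is
         * keyed by ReaderKey, so events pop out merge-sorted. Assuming this port keeps
         * Hive's ordering (the field names below are hypothetical), keys compare roughly as:
         */
        // int CompareKeys(ReaderKey a, ReaderKey b)
        // {
        //     int c = a.originalTransaction.CompareTo(b.originalTransaction);
        //     if (c == 0) c = a.bucket.CompareTo(b.bucket);
        //     if (c == 0) c = a.rowId.CompareTo(b.rowId);
        //     if (c == 0) c = b.currentTransaction.CompareTo(a.currentTransaction); // newest first
        //     return c;
        // }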
        public WriterImpl(
            Stream stream,
            string path,
            OrcFile.WriterOptions options,
            ObjectInspector inspector,
            TypeDescription schema,
            long stripeSize,
            CompressionKind compress,
            int bufferSize,
            int rowIndexStride,
            MemoryManager memoryManager,
            bool addBlockPadding,
            OrcFile.Version version,
            OrcFile.WriterCallback callback,
            OrcFile.EncodingStrategy encodingStrategy,
            OrcFile.CompressionStrategy compressionStrategy,
            double paddingTolerance,
            long blockSizeValue,
            string bloomFilterColumnNames,
            double bloomFilterFpp)
        {
            this.baseStream = stream;
            this.streamFactory = new StreamFactory(this);
            this.path = path;
            this.options = options;
            this.callback = callback;
            this.schema = schema;
            this.adjustedStripeSize = stripeSize;
            this.defaultStripeSize = stripeSize;
            this.version = version;
            this.encodingStrategy = encodingStrategy;
            this.compressionStrategy = compressionStrategy;
            this.addBlockPadding = addBlockPadding;
            this.blockSize = blockSizeValue;
            this.paddingTolerance = paddingTolerance;
            this.compress = compress;
            this.rowIndexStride = rowIndexStride;
            this.memoryManager = memoryManager;
            buildIndex = rowIndexStride > 0;
            codec = createCodec(compress);
            int numColumns = schema.getMaximumId() + 1;
            this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, bufferSize);
            if (version == OrcFile.Version.V_0_11)
            {
                /* do not write bloom filters for ORC v11 */
                this.bloomFilterColumns = new bool[schema.getMaximumId() + 1];
            }
            else
            {
                this.bloomFilterColumns =
                    OrcUtils.includeColumns(bloomFilterColumnNames, schema);
            }
            this.bloomFilterFpp = bloomFilterFpp;
            treeWriter = createTreeWriter(inspector, schema, streamFactory, false);
            if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE)
            {
                throw new ArgumentException("Row stride must be at least " +
                    MIN_ROW_INDEX_STRIDE);
            }

            // ensure that we are able to handle callbacks before we register ourselves
            memoryManager.addWriter(path, stripeSize, this);
        }
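        /*
         * Callers normally reach this constructor through the OrcFile factory rather than
         * directly. A minimal sketch; only option setters already used elsewhere in this
         * file are shown, and the file name is illustrative.
         */
        // Writer writer = OrcFile.createWriter(new Path("example.orc"),
        //     OrcFile.writerOptions(conf)
        //         .inspector(inspector)
        //         .stripeSize(64L * 1024 * 1024)
        //         .bufferSize(256 * 1024)
        //         .blockPadding(true));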
        public static void printJsonMetaData(List<string> files, Configuration conf,
                                             List<int> rowIndexCols, bool prettyPrint, bool printTimeZone)
        {
            JsonWriter writer    = new JsonWriter();
            bool       multiFile = files.Count > 1;

            if (multiFile)
            {
                writer.array();
            }
            else
            {
                writer.newObject();
            }
            foreach (string filename in files)
            {
                if (multiFile)
                {
                    writer.newObject();
                }
                writer.key("fileName").value(Path.GetFileName(filename));
                Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
                writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
                writer.key("writerVersion").value(reader.getWriterVersion().ToString());
                using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
                {
                    writer.key("numberOfRows").value(reader.getNumberOfRows());
                    writer.key("compression").value(reader.getCompression().ToString());
                    if (reader.getCompression() != CompressionKind.NONE)
                    {
                        writer.key("compressionBufferSize").value(reader.getCompressionSize());
                    }
                    writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
                    writer.key("schema").array();
                    writeSchema(writer, reader.getTypes());
                    writer.endArray();

                    writer.key("stripeStatistics").array();
                    List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
                    for (int n = 0; n < stripeStatistics.Count; n++)
                    {
                        writer.newObject();
                        writer.key("stripeNumber").value(n + 1);
                        StripeStatistics ss = stripeStatistics[n];
                        writer.key("columnStatistics").array();
                        for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                        {
                            writer.newObject();
                            writer.key("columnId").value(i);
                            writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                            writer.endObject();
                        }
                        writer.endArray();
                        writer.endObject();
                    }
                    writer.endArray();

                    ColumnStatistics[] stats = reader.getStatistics();
                    int colCount             = stats.Length;
                    writer.key("fileStatistics").array();
                    for (int i = 0; i < stats.Length; ++i)
                    {
                        writer.newObject();
                        writer.key("columnId").value(i);
                        writeColumnStatistics(writer, stats[i]);
                        writer.endObject();
                    }
                    writer.endArray();

                    writer.key("stripes").array();
                    int stripeIx = -1;
                    foreach (StripeInformation stripe in reader.getStripes())
                    {
                        ++stripeIx;
                        long stripeStart             = stripe.getOffset();
                        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                        writer.newObject(); // start of stripe information
                        writer.key("stripeNumber").value(stripeIx + 1);
                        writer.key("stripeInformation");
                        writeStripeInformation(writer, stripe);
                        if (printTimeZone)
                        {
                            writer.key("writerTimezone").value(
                                footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                        }
                        long sectionStart = stripeStart;

                        writer.key("streams").array();
                        foreach (OrcProto.Stream section in footer.StreamsList)
                        {
                            writer.newObject();
                            string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                            writer.key("columnId").value(section.Column);
                            writer.key("section").value(kind);
                            writer.key("startOffset").value(sectionStart);
                            writer.key("length").value(section.Length);
                            sectionStart += (long)section.Length;
                            writer.endObject();
                        }
                        writer.endArray();

                        writer.key("encodings").array();
                        for (int i = 0; i < footer.ColumnsCount; ++i)
                        {
                            writer.newObject();
                            OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                            writer.key("columnId").value(i);
                            writer.key("kind").value(encoding.Kind.ToString());
                            if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                                encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                            {
                                writer.key("dictionarySize").value(encoding.DictionarySize);
                            }
                            writer.endObject();
                        }
                        writer.endArray();

                        if (rowIndexCols != null && rowIndexCols.Count != 0)
                        {
                            // mark the columns that were requested; bloom filter indexes are
                            // only read for the columns included here
                            bool[] sargColumns = new bool[colCount];
                            foreach (int colIdx in rowIndexCols)
                            {
                                sargColumns[colIdx] = true;
                            }
                            RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                            writer.key("indexes").array();
                            foreach (int col in rowIndexCols)
                            {
                                writer.newObject();
                                writer.key("columnId").value(col);
                                writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                                writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                                writer.endObject();
                            }
                            writer.endArray();
                        }
                        writer.endObject(); // end of stripe information
                    }
                    writer.endArray();

                    long fileLen     = new FileInfo(filename).Length;
                    long paddedBytes = FileDump.getTotalPaddingSize(reader);
                    // an empty ORC file is ~45 bytes, so fileLen is assumed to always be > 0
                    double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
                    writer.key("fileLength").value(fileLen);
                    writer.key("paddingLength").value(paddedBytes);
                    writer.key("paddingRatio").value(percentPadding);
                }

                writer.endObject();
            }
            if (multiFile)
            {
                writer.endArray();
            }

            if (prettyPrint)
            {
#if false
                string prettyJson;
                if (multiFile)
                {
                    JSONArray jsonArray = new JSONArray(writer.toString());
                    prettyJson = jsonArray.toString(2);
                }
                else
                {
                    JSONObject jsonObject = new JSONObject(writer.toString());
                    prettyJson = jsonObject.toString(2);
                }
#else
                string prettyJson = writer.ToString();
#endif
                System.Console.WriteLine(prettyJson);
            }
            else
            {
                System.Console.WriteLine(writer.ToString());
            }
        }
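        /*
         * Minimal usage sketch (the file name is illustrative): dump metadata for a single
         * file, pretty-printed, reading row indexes and bloom filters for column 0 only.
         */
        // printJsonMetaData(new List<string> { "example.orc" }, conf,
        //                   new List<int> { 0 },    // rowIndexCols
        //                   prettyPrint: true, printTimeZone: false);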
 public OrcOptions orcOptions(OrcFile.WriterOptions opts)
 {
     this.orcOptions = opts;
     return this;
 }
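 /*
  * Usage sketch (illustrative; the OrcOptions constructor shown here is assumed, not
  * confirmed by this file): attach ORC-specific writer options to the ACID options so
  * OrcRecordUpdater uses them instead of building defaults from the Configuration.
  */
 // OrcOptions acidOptions = new OrcOptions(conf)
 //     .orcOptions(OrcFile.writerOptions(conf).bufferSize(256 * 1024));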