Code example #1
 /**
  * To handle multiple INSERT... statements in a single transaction, we want to make sure
  * to generate unique {@code rowId} for all inserted rows of the transaction.
  * @return the largest rowId created by previous statements (may be 0)
  * @throws IOException
  */
 private long findRowIdOffsetForInsert()
 {
     /*
      * 1. need to know bucket we are writing to
      * 2. need to know which delta dir it's in
      * Then,
      * 1. find the same bucket file in previous delta dir for this txn
      * 2. read the footer and get AcidStats which has insert count
      * 2.1 if AcidStats.inserts>0 done
      *  else go to previous delta file
      *  For example, consider insert/update/insert case...*/
     if (options.getStatementId() <= 0)
     {
         return 0;  // there is only one statement in this transaction (so far)
     }
     for (int pastStmt = options.getStatementId() - 1; pastStmt >= 0; pastStmt--)
     {
         Path matchingBucket = AcidUtils.createFilename(options.getFinalDestination(),
                                                        options.clone().statementId(pastStmt));
         if (!fs.exists(matchingBucket))
         {
             continue;
         }
         Reader reader = OrcFile.createReader(matchingBucket, OrcFile.readerOptions(options.getConfiguration()));
         // no close() on Reader?!
         AcidStats acidStats = parseAcidStats(reader);
         if (acidStats.inserts > 0)
         {
             return acidStats.inserts;
         }
     }
     // if we got here, we looked at all delta files in this txn prior to the current
     // statement and didn't find any inserts...
     return 0;
 }
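
The offset computed above is only useful to a caller that hands out row ids. A minimal sketch of that pattern, in the spirit of the code above (the RowIdAssigner class and its members are illustrative assumptions, not part of the actual writer API):

 // Illustrative sketch: combine the offset contributed by earlier statements
 // of the transaction with a per-statement counter, so that row ids stay
 // unique across all INSERT statements of one transaction.
 using System;

 class RowIdAssigner
 {
     private long rowIdOffset = -1;   // -1 = earlier deltas not scanned yet
     private long insertedRows = 0;   // rows written by the current statement
     private readonly Func<long> findOffset;

     public RowIdAssigner(Func<long> findRowIdOffsetForInsert)
     {
         findOffset = findRowIdOffsetForInsert;
     }

     public long NextRowId()
     {
         if (rowIdOffset < 0)
         {
             rowIdOffset = findOffset();  // scan earlier deltas exactly once
         }
         return rowIdOffset + insertedRows++;
     }
 }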
Code example #2
        /**
         * Create a reader that merge sorts the ACID events together.
         * @param conf the configuration
         * @param collapseEvents should the events on the same row be collapsed
         * @param isOriginal is the base file a pre-acid file
         * @param bucket the bucket we are reading
         * @param options the options to read with
         * @param deltaDirectory the list of delta directories to include
         * @throws IOException
         */
        OrcRawRecordMerger(Configuration conf,
                           bool collapseEvents,
                           Reader reader,
                           bool isOriginal,
                           int bucket,
                           ValidTxnList validTxnList,
                           Reader.Options options,
                           Path[] deltaDirectory)
        {
            this.conf         = conf;
            this.collapse     = collapseEvents;
            this.offset       = options.getOffset();
            this.length       = options.getLength();
            this.validTxnList = validTxnList;
            TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);

            if (typeDescr == null)
            {
                throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
            }

            objectInspector = OrcRecordUpdater.createEventSchema
                                  (OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

            // modify the options to reflect the event instead of the base row
            Reader.Options eventOptions = createEventOptions(options);
            if (reader == null)
            {
                baseReader = null;
            }
            else
            {
                // find the min/max based on the offset and length
                if (isOriginal)
                {
                    discoverOriginalKeyBounds(reader, bucket, options);
                }
                else
                {
                    discoverKeyBounds(reader, options);
                }
                LOG.info("min key = " + minKey + ", max key = " + maxKey);
                // use the min/max instead of the byte range
                ReaderPair pair;
                ReaderKey  key = new ReaderKey();
                if (isOriginal)
                {
                    options = options.clone();
                    options.range(options.getOffset(), Long.MAX_VALUE);
                    pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
                                                  options);
                }
                else
                {
                    pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
                                          eventOptions, 0);
                }

                // if there is at least one record, put it in the map
                if (pair.nextRecord != null)
                {
                    readers.put(key, pair);
                }
                baseReader = pair.recordReader;
            }

            // we always want to read all of the deltas
            eventOptions.range(0, Long.MAX_VALUE);
            if (deltaDirectory != null)
            {
                foreach (Path delta in deltaDirectory)
                {
                    ReaderKey             key       = new ReaderKey();
                    Path                  deltaFile = AcidUtils.createBucketFile(delta, bucket);
                    AcidUtils.ParsedDelta deltaDir  = AcidUtils.parsedDelta(delta);
                    FileSystem            fs        = deltaFile.getFileSystem(conf);
                    long                  length    = getLastFlushLength(fs, deltaFile);
                    if (length != -1 && fs.exists(deltaFile))
                    {
                        Reader deltaReader = OrcFile.createReader(deltaFile,
                                                                  OrcFile.readerOptions(conf).maxLength(length));
                        Reader.Options deltaEventOptions = null;
                        if (eventOptions.getSearchArgument() != null)
                        {
                            // Turn off the sarg before pushing it to delta.  We never want to push a sarg to a delta as
                            // it can produce wrong results (if the latest valid version of the record is filtered out by
                            // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
                            // unless the delta only has insert events
                            OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
                            if (acidStats.deletes > 0 || acidStats.updates > 0)
                            {
                                deltaEventOptions = eventOptions.clone().searchArgument(null, null);
                            }
                        }
                        ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey,
                                                              deltaEventOptions != null ? deltaEventOptions : eventOptions,
                                                              deltaDir.getStatementId());
                        if (deltaPair.nextRecord != null)
                        {
                            readers.put(key, deltaPair);
                        }
                    }
                }
            }

            // get the first record
            Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
            if (entry == null)
            {
                columns = 0;
                primary = null;
            }
            else
            {
                primary = entry.getValue();
                if (readers.isEmpty())
                {
                    secondaryKey = null;
                }
                else
                {
                    secondaryKey = readers.firstKey();
                }
                // get the number of columns in the user's rows
                columns = primary.getColumns();
            }
        }
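
Example #2 calls getLastFlushLength(fs, deltaFile) before opening each delta, and skips the delta when it returns -1. In the Java original this helper reads the delta's side file, a sequence of longs that OrcRecordUpdater appends on every flush, and keeps only the last value. A hedged sketch of that contract (OrcRecordUpdater.getSideFile and the stream calls are assumptions carried over from the Java code, not verified against this port):

        // Sketch of getLastFlushLength: the side file holds one long per flush;
        // the last value is the number of valid bytes in the still-open delta.
        // No side file at all means the delta is complete, so read everything.
        static long getLastFlushLength(FileSystem fs, Path deltaFile)
        {
            Path lengths = OrcRecordUpdater.getSideFile(deltaFile);  // assumed helper
            long result = long.MaxValue;  // no side file: read the whole delta
            try
            {
                var stream = fs.open(lengths);
                result = -1;  // side file exists but may be empty: skip the delta
                while (stream.available() > 0)
                {
                    result = stream.readLong();  // keep only the latest flush length
                }
                stream.close();
            }
            catch (IOException)
            {
                // fall through with whatever result currently holds
            }
            return result;
        }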
Code example #3
        public static void printJsonMetaData(List<string> files, Configuration conf,
                                             List<int> rowIndexCols, bool prettyPrint, bool printTimeZone)
        {
            JsonWriter writer    = new JsonWriter();
            bool       multiFile = files.Count > 1;

            if (multiFile)
            {
                writer.array();
            }
            else
            {
                writer.newObject();
            }
            foreach (string filename in files)
            {
                if (multiFile)
                {
                    writer.newObject();
                }
                writer.key("fileName").value(Path.GetFileName(filename));
                Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf));
                writer.key("fileVersion").value(OrcFile.VersionHelper.getName(reader.getFileVersion()));
                writer.key("writerVersion").value(reader.getWriterVersion().ToString());
                using (RecordReaderImpl rows = (RecordReaderImpl)reader.rows())
                {
                    writer.key("numberOfRows").value(reader.getNumberOfRows());
                    writer.key("compression").value(reader.getCompression().ToString());
                    if (reader.getCompression() != CompressionKind.NONE)
                    {
                        writer.key("compressionBufferSize").value(reader.getCompressionSize());
                    }
                    writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
                    writer.key("schema").array();
                    writeSchema(writer, reader.getTypes());
                    writer.endArray();

                    writer.key("stripeStatistics").array();
                    List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
                    for (int n = 0; n < stripeStatistics.Count; n++)
                    {
                        writer.newObject();
                        writer.key("stripeNumber").value(n + 1);
                        StripeStatistics ss = stripeStatistics[n];
                        writer.key("columnStatistics").array();
                        for (int i = 0; i < ss.getColumnStatistics().Length; i++)
                        {
                            writer.newObject();
                            writer.key("columnId").value(i);
                            writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
                            writer.endObject();
                        }
                        writer.endArray();
                        writer.endObject();
                    }
                    writer.endArray();

                    ColumnStatistics[] stats = reader.getStatistics();
                    int colCount             = stats.Length;
                    writer.key("fileStatistics").array();
                    for (int i = 0; i < stats.Length; ++i)
                    {
                        writer.newObject();
                        writer.key("columnId").value(i);
                        writeColumnStatistics(writer, stats[i]);
                        writer.endObject();
                    }
                    writer.endArray();

                    writer.key("stripes").array();
                    int stripeIx = -1;
                    foreach (StripeInformation stripe in reader.getStripes())
                    {
                        ++stripeIx;
                        long stripeStart             = stripe.getOffset();
                        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                        writer.newObject(); // start of stripe information
                        writer.key("stripeNumber").value(stripeIx + 1);
                        writer.key("stripeInformation");
                        writeStripeInformation(writer, stripe);
                        if (printTimeZone)
                        {
                            writer.key("writerTimezone").value(
                                footer.HasWriterTimezone ? footer.WriterTimezone : FileDump.UNKNOWN);
                        }
                        long sectionStart = stripeStart;

                        writer.key("streams").array();
                        foreach (OrcProto.Stream section in footer.StreamsList)
                        {
                            writer.newObject();
                            string kind = section.HasKind ? section.Kind.ToString() : FileDump.UNKNOWN;
                            writer.key("columnId").value(section.Column);
                            writer.key("section").value(kind);
                            writer.key("startOffset").value(sectionStart);
                            writer.key("length").value(section.Length);
                            sectionStart += (long)section.Length;
                            writer.endObject();
                        }
                        writer.endArray();

                        writer.key("encodings").array();
                        for (int i = 0; i < footer.ColumnsCount; ++i)
                        {
                            writer.newObject();
                            OrcProto.ColumnEncoding encoding = footer.ColumnsList[i];
                            writer.key("columnId").value(i);
                            writer.key("kind").value(encoding.Kind.ToString());
                            if (encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                                encoding.Kind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
                            {
                                writer.key("dictionarySize").value(encoding.DictionarySize);
                            }
                            writer.endObject();
                        }
                        writer.endArray();

                        if (rowIndexCols != null && rowIndexCols.Count != 0)
                        {
                            // restrict the row index read to the specified columns; bloom
                            // filters are only read for the columns flagged in sargColumns
                            bool[] sargColumns = new bool[colCount];
                            foreach (int colIdx in rowIndexCols)
                            {
                                sargColumns[colIdx] = true;
                            }
                            RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                            writer.key("indexes").array();
                            foreach (int col in rowIndexCols)
                            {
                                writer.newObject();
                                writer.key("columnId").value(col);
                                writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
                                writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
                                writer.endObject();
                            }
                            writer.endArray();
                        }
                        writer.endObject(); // end of stripe information
                    }
                    writer.endArray();

                    long fileLen     = new FileInfo(filename).Length;
                    long paddedBytes = FileDump.getTotalPaddingSize(reader);
                    // an empty ORC file is ~45 bytes, so we assume fileLen is always > 0
                    double percentPadding = ((double)paddedBytes / (double)fileLen) * 100;
                    writer.key("fileLength").value(fileLen);
                    writer.key("paddingLength").value(paddedBytes);
                    writer.key("paddingRatio").value(percentPadding);
                    rows.close();
                }

                writer.endObject();
            }
            if (multiFile)
            {
                writer.endArray();
            }

            if (prettyPrint)
            {
#if false
                string prettyJson;
                if (multiFile)
                {
                    JSONArray jsonArray = new JSONArray(writer.toString());
                    prettyJson = jsonArray.toString(2);
                }
                else
                {
                    JSONObject jsonObject = new JSONObject(writer.toString());
                    prettyJson = jsonObject.toString(2);
                }
#else
                string prettyJson = writer.ToString();
#endif
                System.Console.WriteLine(prettyJson);
            }
            else
            {
                System.Console.WriteLine(writer.ToString());
            }
        }
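
A hedged usage sketch for example #3, assuming the method lives on a JsonFileDump-style helper class as in the Java original (the class name and file path are assumptions):

        // Illustrative driver: dump the metadata of one ORC file as JSON.
        var conf = new Configuration();
        var files = new List<string> { "/tmp/example.orc" };  // illustrative path

        // Passing null for rowIndexCols skips the per-column row group and
        // bloom filter index sections of the output.
        JsonFileDump.printJsonMetaData(files, conf, null,
                                       prettyPrint: true, printTimeZone: false);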