Beispiel #1
0
 /// <summary>
 /// Creates a Parquet loader from a multi-stream source. The source is passed through
 /// <c>OpenStream</c> and the result is handed, together with <paramref name="env"/> and
 /// <paramref name="args"/>, to another constructor overload that does the actual work.
 /// </summary>
 /// <param name="env">The host environment, forwarded unchanged.</param>
 /// <param name="args">The loader arguments, forwarded unchanged.</param>
 /// <param name="files">The data source that <c>OpenStream</c> is called on.</param>
 public ParquetLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files)
     : this(env, args, OpenStream(files))
 {
 }
Beispiel #2
0
        /// <summary>
        /// Deserializing constructor: restores the loader settings (and, for newer models, the saved
        /// schema) from <paramref name="ctx"/>, then re-reads the schema from the Parquet file when
        /// one is supplied.
        /// </summary>
        /// <param name="host">Host used for assertions and for creating exceptions.</param>
        /// <param name="ctx">Model load context positioned at this loader's binary payload.</param>
        /// <param name="files">
        /// Data source. May be empty only when the model carries a saved schema; otherwise an
        /// exception is thrown.
        /// </param>
        /// <exception cref="InvalidDataException">The Parquet file could not be read.</exception>
        private ParquetLoader(IHost host, ModelLoadContext ctx, IMultiStreamSource files)
        {
            Contracts.AssertValue(host);
            _host = host;
            _host.AssertValue(ctx);
            _host.AssertValue(files);

            // *** Binary format ***
            // int: cached chunk size
            // bool: TreatBigIntegersAsDates flag
            // Schema of the loader (0x00010002)

            _columnChunkReadSize = ctx.Reader.ReadInt32();
            bool treatBigIntegersAsDates = ctx.Reader.ReadBoolean();

            if (ctx.Header.ModelVerWritten >= 0x00010002)
            {
                // Models written at this version or later embed the schema as a binary stream.
                byte[] buffer = null;
                if (!ctx.TryLoadBinaryStream(SchemaCtxName, r => buffer = r.ReadByteArray()))
                {
                    throw _host.ExceptDecode();
                }
                var strm   = new MemoryStream(buffer, writable: false);
                var loader = new BinaryLoader(_host, new BinaryLoader.Arguments(), strm);
                Schema = loader.Schema;
            }

            // Only load Parquet related data if a file is present. Otherwise, just the Schema is valid.
            if (files.Count > 0)
            {
                _parquetOptions = new ParquetOptions()
                {
                    TreatByteArrayAsString  = true,
                    TreatBigIntegersAsDates = treatBigIntegersAsDates
                };

                _parquetStream = OpenStream(files);
                DataSet schemaDataSet;

                try
                {
                    // We only care about the schema so ignore the rows.
                    ReaderOptions readerOptions = new ReaderOptions()
                    {
                        Count  = 0,
                        Offset = 0
                    };
                    schemaDataSet = ParquetReader.Read(_parquetStream, _parquetOptions, readerOptions);
                    _rowCount     = schemaDataSet.TotalRowCount;
                }
                catch (Exception ex)
                {
                    // The constructor is about to fail, so no caller will ever receive an instance
                    // through which the stream opened above could be disposed. Release it here.
                    _parquetStream?.Dispose();
                    throw new InvalidDataException("Cannot read Parquet file", ex);
                }

                // The schema read from the actual file replaces any schema restored from the model.
                _columnsLoaded = InitColumns(schemaDataSet);
                Schema         = CreateSchema(_host, _columnsLoaded);
            }
            else if (Schema == null)
            {
                // No file and no saved schema: there is nothing to expose.
                throw _host.Except("Parquet loader must be created with one file");
            }
        }
 /// <summary>
 /// Opens <paramref name="modelStream"/> as a model repository and loads the data pipeline
 /// (loader and transforms) stored in it.
 /// </summary>
 /// <param name="env">The host environment to use.</param>
 /// <param name="modelStream">The stream containing the serialized model.</param>
 /// <param name="files">The data source to initialize the loader with.</param>
 /// <param name="extractInnerPipe">Whether to extract the transforms and loader from the wrapped CompositeDataLoader.</param>
 /// <returns>The created data view.</returns>
 public static IDataView LoadPipeline(IHostEnvironment env, Stream modelStream, IMultiStreamSource files, bool extractInnerPipe = false)
 {
     // REVIEW: Should not duplicate loading loader/transforms code. This method should call LoadLoader.
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(modelStream, nameof(modelStream));
     env.CheckValue(files, nameof(files));

     // The repository reader only needs to live for the duration of the load.
     using (var repository = RepositoryReader.Open(modelStream, env))
     {
         IDataView pipeline = LoadPipeline(env, repository, files, extractInnerPipe);
         return pipeline;
     }
 }
        /// <summary>
        /// Loads the data pipeline (loader and transforms) from the data-loader entry of an
        /// already opened model repository.
        /// </summary>
        /// <param name="env">The host environment to use.</param>
        /// <param name="rep">The repository reader.</param>
        /// <param name="files">The data source to initialize the loader with.</param>
        /// <param name="extractInnerPipe">Whether to extract the transforms and loader from the wrapped CompositeDataLoader.</param>
        /// <returns>The created data view.</returns>
        public static IDataView LoadPipeline(IHostEnvironment env, RepositoryReader rep, IMultiStreamSource files, bool extractInnerPipe = false)
        {
            // REVIEW: Should not duplicate loading loader/transforms code. This method should call LoadLoader.
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(rep, nameof(rep));
            env.CheckValue(files, nameof(files));

            using (var entry = rep.OpenEntry(DirDataLoaderModel, ModelLoadContext.ModelStreamName))
            {
                ILegacyDataLoader loader;
                env.Assert(entry.Stream.Position == 0);
                ModelLoadContext.LoadModel <ILegacyDataLoader, SignatureLoadDataLoader>(env, out loader, rep, entry, DirDataLoaderModel, files);

                // Unwrap the composite loader only when asked to and only when there is one.
                if (extractInnerPipe && loader is LegacyCompositeDataLoader composite)
                {
                    return composite.View;
                }

                return loader;
            }
        }
Beispiel #5
0
        /// <summary>
        /// Model-loading factory for <see cref="TransposeLoader"/>. With at least one input file the
        /// loader is built over <paramref name="files"/>; with none, it is built over the schema
        /// stored in the model under "Schema.idv" when that stream exists.
        /// </summary>
        /// <param name="env">The host environment to register with.</param>
        /// <param name="ctx">The model load context; its version is checked against this loader.</param>
        /// <param name="files">The data source to initialize the loader with; may be empty.</param>
        /// <returns>The loaded <see cref="TransposeLoader"/>.</returns>
        public static TransposeLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
        {
            Contracts.CheckValue(env, nameof(env));
            IHost h = env.Register(LoadName);

            h.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            h.CheckValue(files, nameof(files));

            return h.Apply("Loading Model", ch =>
            {
                if (files.Count != 0)
                {
                    return new TransposeLoader(h, ctx, files);
                }

                // In the case where we have no input streams, but we have an input schema from
                // the model repository, we still want to surface ourselves as being a binary loader
                // with the existing schema. The loader "owns" this stream.
                BinaryLoader savedSchema = null;
                bool hasSavedSchema = ctx.TryLoadBinaryStream(
                    "Schema.idv",
                    r => savedSchema = new BinaryLoader(h, new BinaryLoader.Arguments(),
                                                        HybridMemoryStream.CreateCache(r.BaseStream), leaveOpen: false));

                if (!hasSavedSchema)
                {
                    h.Assert(savedSchema == null);
                    // No saved schema either: allow the failure to be on OpenStream.
                    return new TransposeLoader(h, ctx, files);
                }

                h.AssertValue(savedSchema);
                h.CheckDecode(savedSchema.GetRowCount() == 0);
                // REVIEW: Do we want to be a bit more restrictive around uninterpretable columns?
                return new TransposeLoader(h, ctx, savedSchema);
            });
        }
        /// <summary>
        /// Loads a data loader from <paramref name="rep"/>. When <paramref name="loadTransforms"/> is true the
        /// whole data view (loader and transforms) is loaded; otherwise only the inner loader is loaded,
        /// falling back to the top-level entry when no separate "Loader" entry exists.
        /// </summary>
        /// <param name="env">The host environment to use.</param>
        /// <param name="rep">The repository reader.</param>
        /// <param name="files">The data source to initialize the loader with.</param>
        /// <param name="loadTransforms">Whether the transforms should be loaded together with the loader.</param>
        /// <returns>The loaded data loader.</returns>
        public static ILegacyDataLoader LoadLoader(IHostEnvironment env, RepositoryReader rep, IMultiStreamSource files, bool loadTransforms)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(rep, nameof(rep));
            env.CheckValue(files, nameof(files));

            string           modelDir = "";
            Repository.Entry entry    = null;

            // If loadTransforms is false, try the loader-only entry first.
            if (!loadTransforms)
            {
                modelDir = Path.Combine(DirDataLoaderModel, "Loader");
                entry    = rep.OpenEntryOrNull(modelDir, ModelLoadContext.ModelStreamName);
            }

            // Either loadTransforms is true, or it's not a composite loader.
            if (entry == null)
            {
                modelDir = DirDataLoaderModel;
                entry    = rep.OpenEntry(modelDir, ModelLoadContext.ModelStreamName);
            }

            env.CheckDecode(entry != null, "Loader is not found.");
            env.AssertNonEmpty(modelDir);

            ILegacyDataLoader loader;
            using (entry)
            {
                env.Assert(entry.Stream.Position == 0);
                ModelLoadContext.LoadModel <ILegacyDataLoader, SignatureLoadDataLoader>(env, out loader, rep, entry, modelDir, files);
            }

            return loader;
        }
        /// <summary>
        /// Infers column types, column names and header presence for a text file.
        /// The file is read as <c>args.ColumnCount</c> text columns (limited through
        /// <c>Take(args.MaxRowsToRead)</c>), every column is handed to the inference experts, and the
        /// per-column header votes are aggregated into a single header decision.
        /// </summary>
        /// <param name="env">The host environment used to read the file.</param>
        /// <param name="fileSource">The file(s) to sample.</param>
        /// <param name="args">Inference settings: column count, separator, sparse/quote flags, row limit.</param>
        /// <param name="ch">Channel that receives error and info messages.</param>
        /// <returns>
        /// A failed result when the column count is zero, the column count reaches <c>SmartColumnsLim</c>,
        /// or fewer than two rows were read; otherwise a successful result carrying the suggested
        /// columns, the header decision and the raw column data.
        /// </returns>
        private static InferenceResult InferTextFileColumnTypesCore(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args, IChannel ch)
        {
            Contracts.AssertValue(ch);
            ch.AssertValue(env);
            ch.AssertValue(fileSource);
            ch.AssertValue(args);

            if (args.ColumnCount == 0)
            {
                ch.Error("Too many empty columns for automatic inference.");
                return(InferenceResult.Fail());
            }

            if (args.ColumnCount >= SmartColumnsLim)
            {
                ch.Error("Too many columns for automatic inference.");
                return(InferenceResult.Fail());
            }

            // Read the file as the specified number of text columns.
            // All columns are loaded into a single text column named "C" covering indices 0..ColumnCount-1.
            var textLoaderArgs = new TextLoader.Arguments
            {
                Column       = new[] { TextLoader.Column.Parse(string.Format("C:TX:0-{0}", args.ColumnCount - 1)) },
                Separator    = args.Separator,
                AllowSparse  = args.AllowSparse,
                AllowQuoting = args.AllowQuote,
            };
            var idv = TextLoader.ReadFile(env, textLoaderArgs, fileSource);

            idv = idv.Take(args.MaxRowsToRead);

            // Read all the data into memory.
            // List items are rows of the dataset.
            var data = new List <ReadOnlyMemory <char>[]>();

            using (var cursor = idv.GetRowCursor(col => true))
            {
                int  columnIndex;
                bool found = cursor.Schema.TryGetColumnIndex("C", out columnIndex);
                Contracts.Assert(found);
                var colType = cursor.Schema.GetColumnType(columnIndex);
                Contracts.Assert(colType.ItemType.IsText);
                // Exactly one of the two getters is used, depending on whether "C" is a vector column.
                ValueGetter <VBuffer <ReadOnlyMemory <char> > > vecGetter = null;
                ValueGetter <ReadOnlyMemory <char> >            oneGetter = null;
                bool isVector = colType.IsVector;
                if (isVector)
                {
                    vecGetter = cursor.GetGetter <VBuffer <ReadOnlyMemory <char> > >(columnIndex);
                }
                else
                {
                    // A non-vector "C" is only expected when a single column was requested.
                    Contracts.Assert(args.ColumnCount == 1);
                    oneGetter = cursor.GetGetter <ReadOnlyMemory <char> >(columnIndex);
                }

                VBuffer <ReadOnlyMemory <char> > line    = default;
                ReadOnlyMemory <char>            tsValue = default;
                while (cursor.MoveNext())
                {
                    if (isVector)
                    {
                        vecGetter(ref line);
                        Contracts.Assert(line.Length == args.ColumnCount);
                        // Copy into a fresh array per row, since the 'line' buffer is passed by ref on every iteration.
                        var values = new ReadOnlyMemory <char> [args.ColumnCount];
                        line.CopyTo(values);
                        data.Add(values);
                    }
                    else
                    {
                        oneGetter(ref tsValue);
                        var values = new[] { tsValue };
                        data.Add(values);
                    }
                }
            }

            if (data.Count < 2)
            {
                ch.Error("Too few rows ({0}) for automatic inference.", data.Count);
                return(InferenceResult.Fail());
            }

            // Pivot the row-major data into one IntermediateColumn per column index.
            var cols = new IntermediateColumn[args.ColumnCount];

            for (int i = 0; i < args.ColumnCount; i++)
            {
                cols[i] = new IntermediateColumn(data.Select(x => x[i]).ToArray(), i);
            }

            // Let every expert inspect the columns in turn.
            foreach (var expert in GetExperts())
            {
                expert.Apply(cols);
            }

            Contracts.Check(cols.All(x => x.SuggestedType != null), "Column type inference must be conclusive");

            // Aggregating header signals.
            // 'suspect' is a running vote: positive means the first row looks like a header.
            int suspect   = 0;
            var usedNames = new HashSet <string>();

            for (int i = 0; i < args.ColumnCount; i++)
            {
                if (cols[i].HasHeader == true)
                {
                    if (usedNames.Add(cols[i].RawData[0].ToString()))
                    {
                        suspect++;
                    }
                    else
                    {
                        // A duplicate value among the first-row entries is a strong signal that this row
                        // is not a header, so it outweighs every possible positive vote.
                        suspect -= args.ColumnCount;
                    }
                }
                else if (cols[i].HasHeader == false)
                {
                    suspect--;
                }
                // Any other HasHeader value casts no vote.
            }

            // REVIEW: Why not use this for column names as well?
            TextLoader.Arguments fileArgs;
            bool hasHeader;

            // A valid schema embedded in the file takes precedence over the vote.
            if (TextLoader.FileContainsValidSchema(env, fileSource, out fileArgs))
            {
                hasHeader = fileArgs.HasHeader;
            }
            else
            {
                hasHeader = suspect > 0;
            }

            // suggest names
            var names = new List <string>();

            usedNames.Clear();
            foreach (var col in cols)
            {
                string name0;
                string name;
                name0 = name = SuggestName(col, hasHeader);
                int i = 0;
                // Make the name unique by appending a two-digit counter suffix (_00, _01, ...).
                while (!usedNames.Add(name))
                {
                    name = string.Format("{0}_{1:00}", name0, i++);
                }
                names.Add(name);
            }
            var outCols =
                cols.Select((x, i) => new Column(x.ColumnId, names[i], x.SuggestedType)).ToArray();

            var numerics = outCols.Count(x => x.ItemType.IsNumber);

            ch.Info("Detected {0} numeric and {1} text columns.", numerics, outCols.Length - numerics);
            if (hasHeader)
            {
                ch.Info("Generated column names from the file header.");
            }

            return(InferenceResult.Success(outCols, hasHeader, cols.Select(col => col.RawData).ToArray()));
        }
Beispiel #8
0
 /// <summary>
 /// Creates a <see cref="TextLoader"/> text reader from an arguments object.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperations"/> catalog.</param>
 /// <param name="args">Defines the settings of the load operation.</param>
 /// <param name="dataSample">Optional data sample that is passed on to the <see cref="TextLoader"/> constructor.</param>
 /// <returns>The newly created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextReader(this DataOperations catalog,
                                           TextLoader.Arguments args,
                                           IMultiStreamSource dataSample = null)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);
     return new TextLoader(environment, args, dataSample);
 }
        /// <summary>
        /// Tries to parse a sample of <paramref name="source"/> with the given loader arguments and reports
        /// whether the split into columns is acceptable.
        /// </summary>
        /// <param name="ch">Channel that receives the trace message and, on success, the messages captured from the temporary environment.</param>
        /// <param name="args">Candidate text loader arguments (separator, quoting, sparse).</param>
        /// <param name="source">The file(s) to sample.</param>
        /// <param name="skipStrictValidation">When true, the uniformity and "more than one column" checks are skipped.</param>
        /// <param name="result">The discovered split on success; the default value otherwise.</param>
        /// <returns>True when the candidate arguments produced an acceptable split.</returns>
        private static bool TryParseFile(IChannel ch, TextLoader.Arguments args, IMultiStreamSource source, bool skipStrictValidation, out ColumnSplitResult result)
        {
            result = default(ColumnSplitResult);
            try
            {
                // No need to provide information from unsuccessful loader, so we create temporary environment and get information from it in case of success
                using (var scratchEnv = new TlcEnvironment(0, true))
                {
                    var captured = new ConcurrentBag <ChannelMessage>();
                    scratchEnv.AddListener <ChannelMessage>((src, msg) => captured.Add(msg));

                    var  sample = new TextLoader(scratchEnv, args, source).Take(1000);
                    var  widths = new List <int>();
                    int  textColumn;
                    bool hasColumn = sample.Schema.TryGetColumnIndex("C", out textColumn);
                    ch.Assert(hasColumn);

                    // Record how many fields each sampled line was split into.
                    using (var cursor = sample.GetRowCursor(x => x == textColumn))
                    {
                        var getter = cursor.GetGetter <VBuffer <DvText> >(textColumn);
                        var row    = default(VBuffer <DvText>);
                        while (cursor.MoveNext())
                        {
                            getter(ref row);
                            widths.Add(row.Length);
                        }
                    }

                    Contracts.Check(widths.Count > 0);
                    var dominant = widths.GroupBy(x => x).OrderByDescending(g => g.Count()).First();

                    if (!skipStrictValidation)
                    {
                        // The most frequent column count must cover enough of the sampled lines.
                        if (dominant.Count() < UniformColumnCountThreshold * widths.Count)
                        {
                            return false;
                        }

                        // If user explicitly specified separator we're allowing "single" column case;
                        // Otherwise user will see message informing that we were not able to detect any columns.
                        if (dominant.Key <= 1)
                        {
                            return false;
                        }
                    }

                    result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, dominant.Key);
                    ch.Trace("Discovered {0} columns using separator '{1}'", dominant.Key, args.Separator);

                    // Only a successful attempt gets its messages forwarded to the caller's channel.
                    foreach (var message in captured)
                    {
                        ch.Send(message);
                    }

                    return true;
                }
            }
            catch (Exception ex)
            {
                // For known exceptions, we just continue to the next separator candidate.
                if (ex.IsMarked())
                {
                    return false;
                }

                throw;
            }
        }
        /// <summary>
        /// Attempt to detect text loader arguments.
        /// The algorithm selects the first 'acceptable' set: the one that recognizes the same number of columns in at
        /// least <see cref="UniformColumnCountThreshold"/> of the sample's lines,
        /// and this number of columns is more than 1.
        /// We sweep on separator, allow sparse and allow quote parameter.
        /// </summary>
        /// <param name="env">The host environment to use.</param>
        /// <param name="source">The file(s) to sample.</param>
        /// <param name="separatorCandidates">Separators to try, in order; must be non-empty.</param>
        /// <param name="allowSparse">Pins the allow-sparse setting; when null both values are tried, true first.</param>
        /// <param name="allowQuote">Pins the allow-quote setting; when null both values are tried, true first.</param>
        /// <param name="skipStrictValidation">Passed to the per-candidate check to skip its strict validation.</param>
        /// <returns>
        /// The first acceptable split, or an unsuccessful <see cref="ColumnSplitResult"/> when no
        /// candidate combination was acceptable.
        /// </returns>
        public static ColumnSplitResult TrySplitColumns(IHostEnvironment env, IMultiStreamSource source,
                                                        string[] separatorCandidates, bool? allowSparse = null, bool? allowQuote = null, bool skipStrictValidation = false)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("CandidateLoader");

            h.CheckValue(source, nameof(source));
            h.CheckNonEmpty(separatorCandidates, nameof(separatorCandidates));

            // Default value for sparse and quote is true: when the caller did not pin a value,
            // both are swept with true tried first.
            bool[] sparse = allowSparse.HasValue ? new[] { allowSparse.Value } : new[] { true, false };
            bool[] quote  = allowQuote.HasValue ? new[] { allowQuote.Value } : new[] { true, false };

            bool foundAny = false;
            var  result   = default(ColumnSplitResult);

            using (var ch = env.Register("SplitColumns").Start("SplitColumns"))
            {
                // Sweep order: allow-sparse is the outermost loop, the separator the innermost.
                foreach (var perm in (from _allowSparse in sparse
                                      from _allowQuote in quote
                                      from _sep in separatorCandidates
                                      select new { _allowSparse, _allowQuote, _sep }))
                {
                    var args = new TextLoader.Arguments
                    {
                        Column       = new[] { TextLoader.Column.Parse("C:TX:0-**") },
                        Separator    = perm._sep,
                        AllowQuoting = perm._allowQuote,
                        AllowSparse  = perm._allowSparse
                    };

                    // Stop at the first acceptable combination.
                    if (TryParseFile(ch, args, source, skipStrictValidation, out result))
                    {
                        foundAny = true;
                        break;
                    }
                }

                if (foundAny)
                {
                    ch.Info("Discovered {0} columns using separator '{1}'.", result.ColumnCount, result.Separator);
                }
                else
                {
                    // REVIEW: May need separate messages for GUI-specific and non-specific. This component can be used
                    // by itself outside the GUI.
                    ch.Info("Couldn't determine columns in the file using separators {0}. Does the input file consist of only a single column? "
                            + "If so, in TLC GUI, please close the import wizard, and then, in the loader settings to the right, manually add a column, "
                            + "choose a name, and set source index to 0.",
                            string.Join(",", separatorCandidates.Select(c => string.Format("'{0}'", GetSeparatorString(c)))));
                }
                ch.Done();
            }

            return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
        }
Beispiel #11
0
        /// <summary>
        /// Model-loading factory: reads the loader arguments from <paramref name="ctx"/> and builds a
        /// <see cref="ParquetLoader"/> over the stream opened from <paramref name="files"/>.
        /// </summary>
        /// <param name="env">The host environment to register with.</param>
        /// <param name="ctx">The model load context; its version is checked against this loader.</param>
        /// <param name="files">The data source to open.</param>
        /// <returns>The loaded <see cref="ParquetLoader"/>.</returns>
        public static ParquetLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
        {
            Contracts.CheckValue(env, nameof(env));
            IHost host = env.Register(LoaderName);

            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            env.CheckValue(files, nameof(files));

            // *** Binary format ***
            // int: cached chunk size
            // bool: TreatBigIntegersAsDates flag

            // The two reads must stay in this order to match the binary format above.
            var args = new Arguments();
            args.ColumnChunkReadSize     = ctx.Reader.ReadInt32();
            args.TreatBigIntegersAsDates = ctx.Reader.ReadBoolean();

            return host.Apply("Loading Model", ch =>
            {
                return new ParquetLoader(args, host, OpenStream(files));
            });
        }
Beispiel #12
0
        /// <summary>
        /// Creates a data loader from a component string of the form 'LoadName{settings}'.
        /// </summary>
        /// <param name="env">The host environment to use.</param>
        /// <param name="settings">The 'LoadName{settings}' string describing the loader.</param>
        /// <param name="files">The data source to initialize the loader with.</param>
        /// <returns>The created data loader.</returns>
        public static IDataLoader CreateLoader(this IHostEnvironment env, string settings, IMultiStreamSource files)
        {
            Contracts.CheckValue(env, nameof(env));
            Contracts.CheckValue(files, nameof(files));

            return CreateCore <IDataLoader>(
                env,
                typeof(IComponentFactory <IMultiStreamSource, IDataLoader>),
                typeof(SignatureDataLoader),
                settings,
                files);
        }
Beispiel #13
0
        // REVIEW: Add one more overload that works off SubComponents.

        /// <summary>
        /// Creates a data loader from a strongly typed arguments object.
        /// </summary>
        /// <typeparam name="TArgs">The type of the loader's arguments object.</typeparam>
        /// <param name="env">The host environment to use.</param>
        /// <param name="arguments">The arguments object describing the loader.</param>
        /// <param name="files">The data source to initialize the loader with.</param>
        /// <returns>The created data loader.</returns>
        public static IDataLoader CreateLoader <TArgs>(this IHostEnvironment env, TArgs arguments, IMultiStreamSource files)
            where TArgs : class, new()
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(files, nameof(files));

            IDataLoader created = CreateCore <IDataLoader, TArgs, SignatureDataLoader>(env, arguments, files);
            return created;
        }
Beispiel #14
0
        /// <summary>
        /// Model-loading factory: validates the inputs and model version, then builds a
        /// <see cref="ParquetLoader"/> from <paramref name="ctx"/> and <paramref name="files"/>.
        /// </summary>
        /// <param name="env">The host environment to register with.</param>
        /// <param name="ctx">The model load context; its version is checked against this loader.</param>
        /// <param name="files">The data source to initialize the loader with.</param>
        /// <returns>The loaded <see cref="ParquetLoader"/>.</returns>
        public static ParquetLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
        {
            Contracts.CheckValue(env, nameof(env));
            IHost host = env.Register(LoaderName);

            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            env.CheckValue(files, nameof(files));

            return host.Apply("Loading Model", ch =>
            {
                return new ParquetLoader(host, ctx, files);
            });
        }
Beispiel #15
0
 /// <summary>
 /// Create a text loader <see cref="TextLoader"/>.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="options">Defines the settings of the load operation.</param>
 /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer slot name annotations if present, and also the number
 /// of slots in <see cref="TextLoader.Options.Columns"/> defined with <see cref="TextLoader.Range"/> with <see langword="null"/> maximum index.
 /// If the sample has been saved with ML.NET's <see cref="SaveAsText(DataOperationsCatalog, IDataView, Stream, char, bool, bool, bool, bool)"/>,
 /// it will also contain the schema information in the header that the loader can read even if <see cref="TextLoader.Options.Columns"/> are not specified.
 /// In order to use the schema defined in the file, all other <see cref="TextLoader.Options"/> should be left with their default values.</param>
 /// <returns>The newly created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
                                           TextLoader.Options options,
                                           IMultiStreamSource dataSample = null)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);
     return new TextLoader(environment, options, dataSample);
 }
Beispiel #16
0
 /// <summary>
 /// Opens the only stream of <paramref name="files"/>.
 /// </summary>
 /// <param name="files">The data source; must be non-null and contain exactly one file.</param>
 /// <returns>The stream obtained by opening the file at index 0.</returns>
 private static Stream OpenStream(IMultiStreamSource files)
 {
     Contracts.CheckValue(files, nameof(files));

     bool hasSingleFile = files.Count == 1;
     Contracts.CheckParam(hasSingleFile, nameof(files), "Parquet loader must be created with one file");

     Stream opened = files.Open(0);
     return opened;
 }
Beispiel #17
0
 /// <summary>
 /// Creates a <see cref="TextLoader"/> text reader from an explicit column list.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperations"/> catalog.</param>
 /// <param name="columns">The columns of the schema.</param>
 /// <param name="hasHeader">Whether the file has a header.</param>
 /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
 /// <param name="dataSample">The optional location of a data sample.</param>
 /// <returns>The newly created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextReader(this DataOperations catalog,
                                           TextLoader.Column[] columns,
                                           bool hasHeader                = TextLoader.DefaultArguments.HasHeader,
                                           char separatorChar            = TextLoader.DefaultArguments.Separator,
                                           IMultiStreamSource dataSample = null)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);
     return new TextLoader(environment, columns, hasHeader, separatorChar, dataSample);
 }