/// <summary>
/// Creates a Parquet loader from a file source by opening it with <c>OpenStream</c> and
/// delegating to the stream-based constructor overload.
/// </summary>
/// <param name="env">The host environment, forwarded unchanged.</param>
/// <param name="args">The loader arguments, forwarded unchanged.</param>
/// <param name="files">The file source that is opened before delegation.</param>
public ParquetLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files)
    : this(env, args, OpenStream(files))
{
    // Nothing to do here: all initialization happens in the constructor this one chains to.
}
/// <summary>
/// Deserialization constructor. Restores the loader settings (and, for model versions
/// 0x00010002 and later, the saved schema) from <paramref name="ctx"/>, then reads the
/// Parquet schema from <paramref name="files"/> when at least one file is supplied.
/// </summary>
/// <param name="host">The host used for assertions and for creating exceptions.</param>
/// <param name="ctx">The model load context to read the serialized settings from.</param>
/// <param name="files">The file source. May be empty only if a schema was restored from the model.</param>
/// <exception cref="InvalidDataException">The Parquet file could not be read.</exception>
private ParquetLoader(IHost host, ModelLoadContext ctx, IMultiStreamSource files)
{
    Contracts.AssertValue(host);
    _host = host;
    _host.AssertValue(ctx);
    _host.AssertValue(files);

    // *** Binary format ***
    // int: cached chunk size
    // bool: TreatBigIntegersAsDates flag
    // Schema of the loader (0x00010002)

    // The reads below must stay in this order: it mirrors the binary layout above.
    _columnChunkReadSize = ctx.Reader.ReadInt32();
    bool treatBigIntegersAsDates = ctx.Reader.ReadBoolean();

    if (ctx.Header.ModelVerWritten >= 0x00010002)
    {
        // Load the schema
        byte[] buffer = null;
        if (!ctx.TryLoadBinaryStream(SchemaCtxName, r => buffer = r.ReadByteArray()))
        {
            throw _host.ExceptDecode();
        }
        var strm = new MemoryStream(buffer, writable: false);
        var loader = new BinaryLoader(_host, new BinaryLoader.Arguments(), strm);
        Schema = loader.Schema;
    }

    // Only load Parquet related data if a file is present. Otherwise, just the Schema is valid.
    if (files.Count > 0)
    {
        _parquetOptions = new ParquetOptions()
        {
            TreatByteArrayAsString = true,
            TreatBigIntegersAsDates = treatBigIntegersAsDates
        };

        _parquetStream = OpenStream(files);
        try
        {
            DataSet schemaDataSet;
            try
            {
                // We only care about the schema so ignore the rows.
                ReaderOptions readerOptions = new ReaderOptions()
                {
                    Count = 0,
                    Offset = 0
                };
                schemaDataSet = ParquetReader.Read(_parquetStream, _parquetOptions, readerOptions);
                _rowCount = schemaDataSet.TotalRowCount;
            }
            catch (Exception ex)
            {
                throw new InvalidDataException("Cannot read Parquet file", ex);
            }

            _columnsLoaded = InitColumns(schemaDataSet);
            Schema = CreateSchema(_host, _columnsLoaded);
        }
        catch
        {
            // The constructor is about to fail, so no caller will ever get an instance to
            // dispose. Release the stream here instead of leaking the open file handle.
            if (_parquetStream != null)
            {
                _parquetStream.Dispose();
            }
            throw;
        }
    }
    else if (Schema == null)
    {
        // No file and no schema saved in the model: there is nothing this loader could expose.
        throw _host.Except("Parquet loader must be created with one file");
    }
}
/// <summary>
/// Opens the given model stream as a repository and loads the loader and transforms stored in it.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="modelStream">The model stream.</param>
/// <param name="files">The data source to initialize the loader with.</param>
/// <param name="extractInnerPipe">Whether to extract the transforms and loader from the wrapped CompositeDataLoader.</param>
/// <returns>The created data view.</returns>
public static IDataView LoadPipeline(IHostEnvironment env, Stream modelStream, IMultiStreamSource files, bool extractInnerPipe = false)
{
    // REVIEW: Should not duplicate loading loader/transforms code. This method should call LoadLoader.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(modelStream, nameof(modelStream));
    env.CheckValue(files, nameof(files));

    // The repository only lives for the duration of the load; the repository-based overload does the work.
    using (var repository = RepositoryReader.Open(modelStream, env))
    {
        IDataView pipeline = LoadPipeline(env, repository, files, extractInnerPipe);
        return pipeline;
    }
}
/// <summary>
/// Loads the loader and transforms stored in the data loader model entry of the given repository reader.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="rep">The repository reader.</param>
/// <param name="files">The data source to initialize the loader with.</param>
/// <param name="extractInnerPipe">Whether to extract the transforms and loader from the wrapped CompositeDataLoader.</param>
/// <returns>
/// The loaded loader, or its inner view when <paramref name="extractInnerPipe"/> is set and the
/// loader is a <see cref="LegacyCompositeDataLoader"/>.
/// </returns>
public static IDataView LoadPipeline(IHostEnvironment env, RepositoryReader rep, IMultiStreamSource files, bool extractInnerPipe = false)
{
    // REVIEW: Should not duplicate loading loader/transforms code. This method should call LoadLoader.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(rep, nameof(rep));
    env.CheckValue(files, nameof(files));

    using (var entry = rep.OpenEntry(DirDataLoaderModel, ModelLoadContext.ModelStreamName))
    {
        env.Assert(entry.Stream.Position == 0);

        ILegacyDataLoader loader;
        ModelLoadContext.LoadModel<ILegacyDataLoader, SignatureLoadDataLoader>(env, out loader, rep, entry, DirDataLoaderModel, files);

        if (!extractInnerPipe)
        {
            return loader;
        }

        // Unwrap only when the loader really is a composite; otherwise hand back the loader itself.
        var composite = loader as LegacyCompositeDataLoader;
        if (composite == null)
        {
            return loader;
        }

        return composite.View;
    }
}
/// <summary>
/// Creates a <see cref="TransposeLoader"/> from a saved model. When no input files are given but the
/// model carries a "Schema.idv" stream, the loader is built over that saved schema instead.
/// </summary>
/// <param name="env">The host environment to register the loader with.</param>
/// <param name="ctx">The model load context; it is checked against this loader's version info.</param>
/// <param name="files">The input files. May be empty, in which case the saved schema is tried first.</param>
/// <returns>The created loader.</returns>
public static TransposeLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
{
    Contracts.CheckValue(env, nameof(env));
    IHost host = env.Register(LoadName);
    host.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel(GetVersionInfo());
    host.CheckValue(files, nameof(files));

    return host.Apply("Loading Model", ch =>
    {
        if (files.Count != 0)
        {
            return new TransposeLoader(host, ctx, files);
        }

        // In the case where we have no input streams, but we have an input schema from
        // the model repository, we still want to surface ourselves as a loader with the
        // existing schema. The binary loader created here "owns" the cached stream.
        BinaryLoader schemaView = null;
        bool schemaFound = ctx.TryLoadBinaryStream("Schema.idv",
            r => schemaView = new BinaryLoader(host, new BinaryLoader.Arguments(),
                HybridMemoryStream.CreateCache(r.BaseStream), leaveOpen: false));

        if (schemaFound)
        {
            host.AssertValue(schemaView);
            host.CheckDecode(schemaView.GetRowCount() == 0);
            // REVIEW: Do we want to be a bit more restrictive around uninterpretable columns?
            return new TransposeLoader(host, ctx, schemaView);
        }

        host.Assert(schemaView == null);
        // No saved schema either: build from the (empty) file source and allow the failure to be on OpenStream.
        return new TransposeLoader(host, ctx, files);
    });
}
/// <summary>
/// Loads a data loader from <paramref name="rep"/>. When <paramref name="loadTransforms"/> is true the
/// whole data view (loader and transforms) is loaded; otherwise only the inner loader is loaded, if the
/// repository has a separate "Loader" entry for it.
/// </summary>
/// <param name="env">The host environment to use.</param>
/// <param name="rep">The repository reader to load from.</param>
/// <param name="files">The data source to initialize the loader with.</param>
/// <param name="loadTransforms">Whether the transforms should be loaded along with the loader.</param>
/// <returns>The loaded data loader.</returns>
public static ILegacyDataLoader LoadLoader(IHostEnvironment env, RepositoryReader rep, IMultiStreamSource files, bool loadTransforms)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(rep, nameof(rep));
    env.CheckValue(files, nameof(files));

    string modelDir = "";
    Repository.Entry entry = null;

    // If loadTransforms is false, load the loader only, not the transforms.
    if (!loadTransforms)
    {
        modelDir = Path.Combine(DirDataLoaderModel, "Loader");
        entry = rep.OpenEntryOrNull(modelDir, ModelLoadContext.ModelStreamName);
    }

    // Either loadTransforms is true, or it's not a composite loader: use the top-level entry.
    if (entry == null)
    {
        modelDir = DirDataLoaderModel;
        entry = rep.OpenEntry(modelDir, ModelLoadContext.ModelStreamName);
    }

    env.CheckDecode(entry != null, "Loader is not found.");
    env.AssertNonEmpty(modelDir);

    ILegacyDataLoader loader;
    using (entry)
    {
        env.Assert(entry.Stream.Position == 0);
        ModelLoadContext.LoadModel<ILegacyDataLoader, SignatureLoadDataLoader>(env, out loader, rep, entry, modelDir, files);
    }

    return loader;
}
/// <summary>
/// Infers column types, column names and header presence for a text file by reading up to
/// <c>args.MaxRowsToRead</c> rows as raw text and running the column "experts" over them.
/// </summary>
/// <param name="env">The host environment used to create the text loader.</param>
/// <param name="fileSource">The file(s) to sample.</param>
/// <param name="args">Inference settings: column count, separator, sparse/quote flags and row limit.</param>
/// <param name="ch">The channel that receives error and info messages.</param>
/// <returns>
/// A failed result when the column count is zero, is at or above <c>SmartColumnsLim</c>, or fewer than
/// two rows were read; otherwise a successful result carrying the inferred columns, the header flag and
/// the raw per-column data.
/// </returns>
private static InferenceResult InferTextFileColumnTypesCore(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args, IChannel ch)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(env);
    ch.AssertValue(fileSource);
    ch.AssertValue(args);
    // NOTE(review): the message below is emitted for a zero column count; its wording looks odd for that case - confirm.
    if (args.ColumnCount == 0)
    {
        ch.Error("Too many empty columns for automatic inference.");
        return(InferenceResult.Fail());
    }
    if (args.ColumnCount >= SmartColumnsLim)
    {
        ch.Error("Too many columns for automatic inference.");
        return(InferenceResult.Fail());
    }
    // Read the file as the specified number of text columns.
    // All of them are mapped to one text column named "C" spanning source indices 0..ColumnCount-1.
    var textLoaderArgs = new TextLoader.Arguments
    {
        Column = new[] { TextLoader.Column.Parse(string.Format("C:TX:0-{0}", args.ColumnCount - 1)) },
        Separator = args.Separator,
        AllowSparse = args.AllowSparse,
        AllowQuoting = args.AllowQuote,
    };
    var idv = TextLoader.ReadFile(env, textLoaderArgs, fileSource);
    idv = idv.Take(args.MaxRowsToRead);
    // Read all the data into memory.
    // List items are rows of the dataset.
    var data = new List <ReadOnlyMemory <char>[]>();
    using (var cursor = idv.GetRowCursor(col => true))
    {
        int columnIndex;
        bool found = cursor.Schema.TryGetColumnIndex("C", out columnIndex);
        Contracts.Assert(found);
        var colType = cursor.Schema.GetColumnType(columnIndex);
        Contracts.Assert(colType.ItemType.IsText);
        // Exactly one of the two getters is created, depending on whether "C" is a vector or a scalar column.
        ValueGetter <VBuffer <ReadOnlyMemory <char> > > vecGetter = null;
        ValueGetter <ReadOnlyMemory <char> > oneGetter = null;
        bool isVector = colType.IsVector;
        if (isVector)
        {
            vecGetter = cursor.GetGetter <VBuffer <ReadOnlyMemory <char> > >(columnIndex);
        }
        else
        {
            // A scalar "C" column is only expected when a single column was requested.
            Contracts.Assert(args.ColumnCount == 1);
            oneGetter = cursor.GetGetter <ReadOnlyMemory <char> >(columnIndex);
        }
        VBuffer <ReadOnlyMemory <char> > line = default;
        ReadOnlyMemory <char> tsValue = default;
        while (cursor.MoveNext())
        {
            if (isVector)
            {
                vecGetter(ref line);
                Contracts.Assert(line.Length == args.ColumnCount);
                // Copy into a fresh array per row, since the 'line' buffer is reused across iterations.
                var values = new ReadOnlyMemory <char> [args.ColumnCount];
                line.CopyTo(values);
                data.Add(values);
            }
            else
            {
                oneGetter(ref tsValue);
                var values = new[] { tsValue };
                data.Add(values);
            }
        }
    }
    if (data.Count < 2)
    {
        ch.Error("Too few rows ({0}) for automatic inference.", data.Count);
        return(InferenceResult.Fail());
    }
    // Pivot the row-major sample into one IntermediateColumn per column index.
    var cols = new IntermediateColumn[args.ColumnCount];
    for (int i = 0; i < args.ColumnCount; i++)
    {
        cols[i] = new IntermediateColumn(data.Select(x => x[i]).ToArray(), i);
    }
    // Every expert is applied to the full column set; afterwards each column must have a suggested type.
    foreach (var expert in GetExperts())
    {
        expert.Apply(cols);
    }
    Contracts.Check(cols.All(x => x.SuggestedType != null), "Column type inference must be conclusive");
    // Aggregating header signals.
    // 'suspect' is a vote counter: a positive total means "the first row looks like a header".
    int suspect = 0;
    var usedNames = new HashSet <string>();
    for (int i = 0; i < args.ColumnCount; i++)
    {
        if (cols[i].HasHeader == true)
        {
            if (usedNames.Add(cols[i].RawData[0].ToString()))
            {
                suspect++;
            }
            else
            {
                // duplicate value in the first row is a strong signal that this is not a header
                suspect -= args.ColumnCount;
            }
        }
        else if (cols[i].HasHeader == false)
        {
            suspect--;
        }
        // Columns whose HasHeader is neither true nor false (presumably undecided/null) do not vote.
    }
    // REVIEW: Why not use this for column names as well?
    // A schema embedded in the file takes precedence over the vote when deciding on the header.
    TextLoader.Arguments fileArgs;
    bool hasHeader;
    if (TextLoader.FileContainsValidSchema(env, fileSource, out fileArgs))
    {
        hasHeader = fileArgs.HasHeader;
    }
    else
    {
        hasHeader = suspect > 0;
    }
    // suggest names
    // Names are made unique by appending a two-digit counter suffix ("_00", "_01", ...) to duplicates.
    var names = new List <string>();
    usedNames.Clear();
    foreach (var col in cols)
    {
        string name0;
        string name;
        name0 = name = SuggestName(col, hasHeader);
        int i = 0;
        while (!usedNames.Add(name))
        {
            name = string.Format("{0}_{1:00}", name0, i++);
        }
        names.Add(name);
    }
    var outCols = cols.Select((x, i) => new Column(x.ColumnId, names[i], x.SuggestedType)).ToArray();
    var numerics = outCols.Count(x => x.ItemType.IsNumber);
    ch.Info("Detected {0} numeric and {1} text columns.", numerics, outCols.Length - numerics);
    if (hasHeader)
    {
        ch.Info("Generated column names from the file header.");
    }
    return(InferenceResult.Success(outCols, hasHeader, cols.Select(col => col.RawData).ToArray()));
}
/// <summary>
/// Create a text reader <see cref="TextLoader"/> from an arguments object.
/// </summary>
/// <param name="catalog">The <see cref="DataOperations"/> catalog whose environment the reader is created in.</param>
/// <param name="args">Defines the settings of the load operation.</param>
/// <param name="dataSample">Optional data sample handed to the <see cref="TextLoader"/> constructor; may be <see langword="null"/>.</param>
/// <returns>The newly constructed <see cref="TextLoader"/>.</returns>
public static TextLoader CreateTextReader(this DataOperations catalog, TextLoader.Arguments args, IMultiStreamSource dataSample = null)
{
    var env = CatalogUtils.GetEnvironment(catalog);
    return new TextLoader(env, args, dataSample);
}
/// <summary>
/// Loads up to 1000 rows of <paramref name="source"/> with the given loader settings and decides
/// whether those settings split the lines into a consistent number of columns.
/// </summary>
/// <param name="ch">The channel that receives the trace message and any replayed loader messages on success.</param>
/// <param name="args">The candidate text loader settings to try.</param>
/// <param name="source">The file(s) to sample.</param>
/// <param name="skipStrictValidation">When true, the uniformity and "more than one column" checks are skipped.</param>
/// <param name="result">On success, the detected split; otherwise the default value.</param>
/// <returns>True if the candidate settings were accepted; false otherwise, including on a marked exception.</returns>
private static bool TryParseFile(IChannel ch, TextLoader.Arguments args, IMultiStreamSource source, bool skipStrictValidation, out ColumnSplitResult result)
{
    result = default(ColumnSplitResult);
    try
    {
        // No need to provide information from unsuccessful loader, so we create temporary environment
        // and get information from it in case of success.
        using (var loaderEnv = new TlcEnvironment(0, true))
        {
            var captured = new ConcurrentBag<ChannelMessage>();
            loaderEnv.AddListener<ChannelMessage>((src, message) => captured.Add(message));

            var sample = new TextLoader(loaderEnv, args, source).Take(1000);

            int columnIndex;
            bool found = sample.Schema.TryGetColumnIndex("C", out columnIndex);
            ch.Assert(found);

            // Record how many columns each sampled line was split into.
            var widths = new List<int>();
            using (var cursor = sample.GetRowCursor(x => x == columnIndex))
            {
                var getter = cursor.GetGetter<VBuffer<DvText>>(columnIndex);
                var line = default(VBuffer<DvText>);
                while (cursor.MoveNext())
                {
                    getter(ref line);
                    widths.Add(line.Length);
                }
            }

            Contracts.Check(widths.Count > 0);

            // The most frequent column count among the sampled lines.
            var dominant = widths.GroupBy(x => x).OrderByDescending(x => x.Count()).First();

            if (!skipStrictValidation)
            {
                if (dominant.Count() < UniformColumnCountThreshold * widths.Count)
                {
                    return false;
                }

                // If user explicitly specified separator we're allowing "single" column case;
                // Otherwise user will see message informing that we were not able to detect any columns.
                if (dominant.Key <= 1)
                {
                    return false;
                }
            }

            result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, dominant.Key);
            ch.Trace("Discovered {0} columns using separator '{1}'", dominant.Key, args.Separator);

            // Replay what the temporary environment reported, now that this candidate was accepted.
            foreach (var replayed in captured)
            {
                ch.Send(replayed);
            }

            return true;
        }
    }
    catch (Exception ex)
    {
        if (!ex.IsMarked())
        {
            throw;
        }
        // For known exceptions, we just continue to the next separator candidate.
    }

    return false;
}
/// <summary>
/// Attempt to detect text loader arguments.
/// The algorithm selects the first 'acceptable' set: the one that recognizes the same number of columns in at
/// least <see cref="UniformColumnCountThreshold"/> of the sample's lines,
/// and this number of columns is more than 1.
/// We sweep on separator, allow sparse and allow quote parameter.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="source">The file(s) to sample.</param>
/// <param name="separatorCandidates">The separators to try, in order. Must be non-empty.</param>
/// <param name="allowSparse">If set, only this value of the allow-sparse setting is tried; otherwise true, then false.</param>
/// <param name="allowQuote">If set, only this value of the allow-quoting setting is tried; otherwise true, then false.</param>
/// <param name="skipStrictValidation">Passed through to the per-candidate check to relax its acceptance criteria.</param>
/// <returns>The first accepted split, or an unsuccessful <see cref="ColumnSplitResult"/> if no candidate was accepted.</returns>
public static ColumnSplitResult TrySplitColumns(IHostEnvironment env, IMultiStreamSource source, string[] separatorCandidates, bool? allowSparse = null, bool? allowQuote = null, bool skipStrictValidation = false)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register("CandidateLoader");
    h.CheckValue(source, nameof(source));
    h.CheckNonEmpty(separatorCandidates, nameof(separatorCandidates));

    // Default value for sparse and quote is true.
    bool[] sparse = new[] { true, false };
    bool[] quote = new[] { true, false };
    // An explicit caller choice narrows the sweep to that single value.
    if (allowSparse.HasValue)
    {
        sparse = new[] { allowSparse.Value };
    }
    if (allowQuote.HasValue)
    {
        quote = new[] { allowQuote.Value };
    }

    bool foundAny = false;
    var result = default(ColumnSplitResult);
    using (var ch = env.Register("SplitColumns").Start("SplitColumns"))
    {
        // Sweep order: sparse setting (outermost), then quote setting, then separator (innermost).
        foreach (var perm in (from _allowSparse in sparse
                              from _allowQuote in quote
                              from _sep in separatorCandidates
                              select new { _allowSparse, _allowQuote, _sep }))
        {
            var args = new TextLoader.Arguments
            {
                Column = new[] { TextLoader.Column.Parse("C:TX:0-**") },
                Separator = perm._sep,
                AllowQuoting = perm._allowQuote,
                AllowSparse = perm._allowSparse
            };

            if (TryParseFile(ch, args, source, skipStrictValidation, out result))
            {
                foundAny = true;
                break;
            }
        }

        if (foundAny)
        {
            ch.Info("Discovered {0} columns using separator '{1}'.", result.ColumnCount, result.Separator);
        }
        else
        {
            // REVIEW: May need separate messages for GUI-specific and non-specific. This component can be used
            // by itself outside the GUI.
            ch.Info("Couldn't determine columns in the file using separators {0}. Does the input file consist of only a single column? "
                + "If so, in TLC GUI, please close the import wizard, and then, in the loader settings to the right, manually add a column, "
                + "choose a name, and set source index to 0.",
                string.Join(",", separatorCandidates.Select(c => string.Format("'{0}'", GetSeparatorString(c)))));
        }
        ch.Done();
    }

    // 'result' may hold a leftover from a rejected candidate, so build an explicit failure value.
    return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
}
/// <summary>
/// Creates a <see cref="ParquetLoader"/> from a saved model: reads the serialized settings from
/// <paramref name="ctx"/> and opens the file in <paramref name="files"/>.
/// </summary>
/// <param name="env">The host environment to register the loader with.</param>
/// <param name="ctx">The model load context; it is checked against this loader's version info.</param>
/// <param name="files">The file source passed to <c>OpenStream</c>.</param>
/// <returns>The created loader.</returns>
public static ParquetLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
{
    Contracts.CheckValue(env, nameof(env));
    IHost host = env.Register(LoaderName);
    env.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel(GetVersionInfo());
    env.CheckValue(files, nameof(files));

    // *** Binary format ***
    // int: cached chunk size
    // bool: TreatBigIntegersAsDates flag

    // The two reads must happen in exactly this order to match the layout above.
    Arguments args = new Arguments();
    args.ColumnChunkReadSize = ctx.Reader.ReadInt32();
    args.TreatBigIntegersAsDates = ctx.Reader.ReadBoolean();

    return host.Apply("Loading Model", ch =>
    {
        return new ParquetLoader(args, host, OpenStream(files));
    });
}
/// <summary>
/// Creates a data loader from the 'LoadName{settings}' string.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="settings">The loader name and settings, in 'LoadName{settings}' form.</param>
/// <param name="files">The files the loader is created over.</param>
/// <returns>The created data loader.</returns>
public static IDataLoader CreateLoader(this IHostEnvironment env, string settings, IMultiStreamSource files)
{
    Contracts.CheckValue(env, nameof(env));
    Contracts.CheckValue(files, nameof(files));

    // Loaders are created through a component factory that takes the file source as its input.
    Type loaderFactoryType = typeof(IComponentFactory<IMultiStreamSource, IDataLoader>);
    Type signatureType = typeof(SignatureDataLoader);
    return CreateCore<IDataLoader>(env, loaderFactoryType, signatureType, settings, files);
}
// REVIEW: Add one more overload that works off SubComponents.
/// <summary>
/// Creates a data loader from the arguments object.
/// </summary>
/// <typeparam name="TArgs">The type of the loader's arguments object.</typeparam>
/// <param name="env">The host environment.</param>
/// <param name="arguments">The arguments object that selects and configures the loader.</param>
/// <param name="files">The files the loader is created over.</param>
/// <returns>The created data loader.</returns>
public static IDataLoader CreateLoader<TArgs>(this IHostEnvironment env, TArgs arguments, IMultiStreamSource files)
    where TArgs : class, new()
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(files, nameof(files));

    IDataLoader created = CreateCore<IDataLoader, TArgs, SignatureDataLoader>(env, arguments, files);
    return created;
}
/// <summary>
/// Creates a <see cref="ParquetLoader"/> from a saved model by delegating to the
/// deserialization constructor.
/// </summary>
/// <param name="env">The host environment to register the loader with.</param>
/// <param name="ctx">The model load context; it is checked against this loader's version info.</param>
/// <param name="files">The file source handed to the constructor.</param>
/// <returns>The created loader.</returns>
public static ParquetLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files)
{
    Contracts.CheckValue(env, nameof(env));
    IHost host = env.Register(LoaderName);
    env.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel(GetVersionInfo());
    env.CheckValue(files, nameof(files));

    // Reading the serialized state is left entirely to the constructor.
    return host.Apply("Loading Model", ch =>
    {
        return new ParquetLoader(host, ctx, files);
    });
}
/// <summary>
/// Create a text loader <see cref="TextLoader"/>.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="options">Defines the settings of the load operation.</param>
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer slot name annotations if present, and also the number
/// of slots in <see cref="TextLoader.Options.Columns"/> defined with <see cref="TextLoader.Range"/> with <see langword="null"/> maximum index.
/// If the sample has been saved with ML.NET's <see cref="SaveAsText(DataOperationsCatalog, IDataView, Stream, char, bool, bool, bool, bool)"/>,
/// it will also contain the schema information in the header that the loader can read even if <see cref="TextLoader.Options.Columns"/> are not specified.
/// In order to use the schema defined in the file, all other <see cref="TextLoader.Options"/> should be left with their default values.</param>
/// <returns>The newly constructed <see cref="TextLoader"/>.</returns>
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, TextLoader.Options options, IMultiStreamSource dataSample = null)
{
    var env = CatalogUtils.GetEnvironment(catalog);
    return new TextLoader(env, options, dataSample);
}
/// <summary>
/// Opens the only stream of <paramref name="files"/>. The source must be non-null and contain
/// exactly one file; otherwise the contract checks fail.
/// </summary>
/// <param name="files">The file source to open.</param>
/// <returns>The stream returned by opening the file at index 0.</returns>
private static Stream OpenStream(IMultiStreamSource files)
{
    Contracts.CheckValue(files, nameof(files));

    bool hasExactlyOneFile = files.Count == 1;
    Contracts.CheckParam(hasExactlyOneFile, nameof(files), "Parquet loader must be created with one file");

    Stream opened = files.Open(0);
    return opened;
}
/// <summary>
/// Create a text reader <see cref="TextLoader"/> from an explicit column list.
/// </summary>
/// <param name="catalog">The <see cref="DataOperations"/> catalog.</param>
/// <param name="columns">The columns of the schema.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="dataSample">The optional location of a data sample.</param>
/// <returns>The newly constructed <see cref="TextLoader"/>.</returns>
public static TextLoader CreateTextReader(this DataOperations catalog,
    TextLoader.Column[] columns,
    bool hasHeader = TextLoader.DefaultArguments.HasHeader,
    char separatorChar = TextLoader.DefaultArguments.Separator,
    IMultiStreamSource dataSample = null)
{
    var env = CatalogUtils.GetEnvironment(catalog);
    return new TextLoader(env, columns, hasHeader, separatorChar, dataSample);
}