Пример #1
0
        /// <summary>
        /// Builds a <see cref="TextLoader"/> from an explicit column schema and uses it to load an
        /// <see cref="IDataView"/> from a text file. Because <see cref="IDataView"/>s are lazy, no data
        /// is actually read here; only the schema is validated.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file(s).</param>
        /// <param name="columns">The columns of the schema.</param>
        /// <param name="separatorChar">The character separating data points within a row. The tab character is used by default.</param>
        /// <param name="hasHeader">Whether the file has a header. When <see langword="true"/>, the loader skips the first line when
        /// <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.</param>
        /// <param name="allowQuoting">Whether the input may include double-quoted values. This setting distinguishes separator characters
        /// inside an input value from actual separators. When <see langword="true"/>, separators within double quotes are treated as part of
        /// the input value. When <see langword="false"/>, all separators, even those within quotes, are treated as delimiting a new column.
        /// It also distinguishes empty values from missing values. When <see langword="true"/>, missing values are denoted by consecutive
        /// separators and empty values by \"\". When <see langword="false"/>, empty values are denoted by consecutive separators and missing
        /// values by the default missing value for each type documented in <see cref="DataKind"/>.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
        /// "5 2:6 4:3" means that there are 5 columns, and the only non-zero ones are columns 2 and 4, which have values 6 and 3,
        /// respectively. Column indices are zero-based, so columns 2 and 4 represent the 3rd and 5th columns.
        /// A column may also have dense values followed by sparse values represented in this fashion. For example,
        /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented
        /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
                                                 string path,
                                                 TextLoader.Column[] columns,
                                                 char separatorChar = TextLoader.Defaults.Separator,
                                                 bool hasHeader = TextLoader.Defaults.HasHeader,
                                                 bool allowQuoting = TextLoader.Defaults.AllowQuoting,
                                                 bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                 bool allowSparse = TextLoader.Defaults.AllowSparse)
        {
            // Path validation happens before any loader state is built.
            CheckValidPathContents(path);

            // Translate the flat argument list into the loader's options object.
            var settings = new TextLoader.Options
            {
                Columns = columns,
                Separators = new char[] { separatorChar },
                HasHeader = hasHeader,
                AllowQuoting = allowQuoting,
                TrimWhitespace = trimWhitespace,
                AllowSparse = allowSparse
            };

            var env = CatalogUtils.GetEnvironment(catalog);
            var textLoader = new TextLoader(env, options: settings);
            var files = new MultiFileSource(path);

            return textLoader.Load(files);
        }
 /// <summary>
 /// Creates a <see cref="TextLoader"/> whose dataset schema is inferred from the data model type
 /// <typeparamref name="TInput"/>.
 /// </summary>
 /// <typeparam name="TInput">The data model type from which the dataset schema is inferred.</typeparam>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="hasHeader">Whether the file contains a header.</param>
 /// <param name="separatorChar">Column separator character. Default is '\t'.</param>
 /// <param name="allowQuotedStrings">Whether the input may include quoted values,
 /// which can contain separator characters and colons,
 /// and which distinguish empty values from missing values. When true, consecutive separators
 /// denote a missing value and an empty value is denoted by \"\".
 /// When false, consecutive separators denote an empty value.</param>
 /// <param name="supportSparse">Whether the input may include sparse representations. For example,
 /// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero
 /// except for the 3rd and 5th columns, which have values 6 and 3.</param>
 /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
 /// <returns>The created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
                                                   bool hasHeader = TextLoader.DefaultArguments.HasHeader,
                                                   char separatorChar = TextLoader.DefaultArguments.Separator,
                                                   bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting,
                                                   bool supportSparse = TextLoader.DefaultArguments.AllowSparse,
                                                   bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     // The arguments are forwarded in the order the factory takes them, not in this method's declaration order.
     return TextLoader.CreateTextReader<TInput>(env, hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
 }
Пример #3
0
        /// <summary>
        /// Trains one model per cross-validation fold and scores each fold's test set with the model
        /// trained on that fold's training set.
        /// </summary>
        /// <param name="data">The data to split into folds. Must not be null.</param>
        /// <param name="estimator">The estimator fitted on each fold's training set. Must not be null.</param>
        /// <param name="numFolds">The number of folds. Must be greater than 1.</param>
        /// <param name="samplingKeyColumn">Name of the sampling key column; may be null.</param>
        /// <param name="seed">Optional seed forwarded to <c>EnsureGroupPreservationColumn</c>.</param>
        /// <returns>An array of <paramref name="numFolds"/> results, indexed by fold.</returns>
        private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEstimator<ITransformer> estimator,
                                                                     int numFolds, string samplingKeyColumn, int? seed = null)
        {
            Environment.CheckValue(data, nameof(data));
            Environment.CheckValue(estimator, nameof(estimator));
            Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
            Environment.CheckValueOrNull(samplingKeyColumn);

            // Both arguments are passed by reference, so this call may replace the data view and the column name.
            DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);

            var foldResults = new CrossValidationResult[numFolds];
            var splits = DataOperationsCatalog.CrossValidationSplit(Environment, data, numFolds, samplingKeyColumn);

            // Sequential per-fold training.
            // REVIEW: we could have a parallel implementation here. We would need to
            // spawn off a separate host per fold in that case.
            int foldIndex = 0;
            foreach (var split in splits)
            {
                var trainedModel = estimator.Fit(split.TrainSet);
                var scoredTestSet = trainedModel.Transform(split.TestSet);

                foldResults[foldIndex] = new CrossValidationResult(trainedModel, scoredTestSet, foldIndex);
                foldIndex += 1;
            }

            return foldResults;
        }
Пример #4
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a single text file through a <see cref="TextLoader"/> configured
        /// with an explicit column schema. Because <see cref="IDataView"/>s are lazy, no data is actually read here;
        /// only the schema is validated.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file. Must be non-empty and refer to an existing file.</param>
        /// <param name="columns">The columns of the schema.</param>
        /// <param name="separatorChar">The character separating data points within a row. The tab character is used by default.</param>
        /// <param name="hasHeader">Whether the file has a header with feature names. This overload supplies no data sample to the
        /// loader, so when this is <see langword="true"/> the vector columns produced by the loader will not contain slot name
        /// annotations (slots being the elements of a vector column): the output schema is built when the loader is created,
        /// not when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called. In that situation, <see langword="true"/>
        /// tells the loader to skip the first line of the file when it loads.</param>
        /// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
                                                 string path,
                                                 TextLoader.Column[] columns,
                                                 char separatorChar = TextLoader.Defaults.Separator,
                                                 bool hasHeader = TextLoader.Defaults.HasHeader,
                                                 bool allowQuoting = TextLoader.Defaults.AllowQuoting,
                                                 bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                 bool allowSparse = TextLoader.Defaults.AllowSparse)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // Fail early with a parameter exception instead of deferring the error to load time.
            bool fileFound = File.Exists(path);
            if (!fileFound)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var loaderOptions = new TextLoader.Options
            {
                Columns = columns,
                Separators = new char[] { separatorChar },
                HasHeader = hasHeader,
                AllowQuoting = allowQuoting,
                TrimWhitespace = trimWhitespace,
                AllowSparse = allowSparse
            };

            var env = CatalogUtils.GetEnvironment(catalog);
            var textLoader = new TextLoader(env, options: loaderOptions);

            return textLoader.Load(new MultiFileSource(path));
        }
Пример #5
0
 /// <summary>
 /// Creates a <see cref="TextLoader"/> from an explicit column schema.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="columns">Array of <see cref="TextLoader.Column"/> entries defining the schema.</param>
 /// <param name="separatorChar">The character separating data points within a row. The tab character is used by default.</param>
 /// <param name="hasHeader">Whether the file has a header.</param>
 /// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
 /// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
 /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and the number of slots in each column.</param>
 /// <returns>The created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
                                           TextLoader.Column[] columns,
                                           char separatorChar = TextLoader.Defaults.Separator,
                                           bool hasHeader = TextLoader.Defaults.HasHeader,
                                           bool allowSparse = TextLoader.Defaults.AllowSparse,
                                           bool allowQuoting = TextLoader.Defaults.AllowQuoting,
                                           IMultiStreamSource dataSample = null)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     return new TextLoader(env, columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);
 }
Пример #6
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a text file with a <see cref="TextLoader"/> whose columns are
        /// inferred from <typeparamref name="TInput"/>. Because <see cref="IDataView"/>s are lazy, no data is
        /// actually read here; only the schema is validated.
        /// </summary>
        /// <typeparam name="TInput">The data model type used to infer the columns.</typeparam>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file or path of files from which to load.</param>
        /// <param name="options">Defines the settings of the load operation. There is no need to specify a Columns field,
        /// as columns will be inferred by this method.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog catalog, string path,
                                                         TextLoader.Options options)
        {
            CheckValidPathContents(path);

            var env = CatalogUtils.GetEnvironment(catalog);
            var typedLoader = TextLoader.CreateTextLoader<TInput>(env, options);

            return typedLoader.Load(new MultiFileSource(path));
        }
        /// <summary>
        /// Reads a data view from a text file using a <see cref="TextLoader"/>. The file is handed to the
        /// loader both as its data sample and as the source to read.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to read. Must be non-empty.</param>
        /// <param name="args">Defines the settings of the load operation. Optional; defaults to <see langword="null"/>.</param>
        /// <returns>The data view.</returns>
        public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Arguments args = null)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            var host = catalog.GetEnvironment();
            var files = new MultiFileSource(path);
            var textLoader = new TextLoader(host, args, files);

            return textLoader.Read(files);
        }
Пример #8
0
 /// <summary>
 /// Creates a <see cref="TextLoader"/> whose dataset schema is inferred from the data model type
 /// <typeparamref name="TInput"/>.
 /// </summary>
 /// <typeparam name="TInput">Defines the schema of the data to be loaded. Use public fields or properties
 /// decorated with <see cref="LoadColumnAttribute"/> (and possibly other attributes) to specify the column
 /// names and their data types in the schema of the loaded data.</typeparam>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="separatorChar">Column separator character. Default is '\t'.</param>
 /// <param name="hasHeader">Whether the file has a header with feature names. When a <paramref name="dataSample"/> is provided,
 /// <see langword="true"/> indicates that the first line in the <paramref name="dataSample"/> will be used for feature names, and that
 /// the first line will be skipped when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called. When no
 /// <paramref name="dataSample"/> is provided, <see langword="true"/> only indicates that the loader should skip the first line when
 /// <see cref="TextLoader.Load(IMultiStreamSource)"/> is called; columns will not have slot name annotations. This is
 /// because the output schema is built when the loader is created, not when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.</param>
 /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer slot name annotations if present.</param>
 /// <param name="allowQuoting">Whether the input may include double-quoted values. This setting distinguishes separator characters
 /// inside an input value from actual separators. When <see langword="true"/>, separators within double quotes are treated as part of
 /// the input value. When <see langword="false"/>, all separators, even those within quotes, are treated as delimiting a new column.</param>
 /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
 /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
 /// "5 2:6 4:3" means that there are 5 columns, and the only non-zero ones are columns 2 and 4, which have values 6 and 3,
 /// respectively. Column indices are zero-based, so columns 2 and 4 represent the 3rd and 5th columns.
 /// A column may also have dense values followed by sparse values represented in this fashion. For example,
 /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented
 /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.</param>
 /// <returns>The created <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
                                                   char separatorChar = TextLoader.Defaults.Separator,
                                                   bool hasHeader = TextLoader.Defaults.HasHeader,
                                                   IMultiStreamSource dataSample = null,
                                                   bool allowQuoting = TextLoader.Defaults.AllowQuoting,
                                                   bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                   bool allowSparse = TextLoader.Defaults.AllowSparse)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     // The factory takes its arguments in a different order than this method declares them.
     return TextLoader.CreateTextLoader<TInput>(
         env,
         hasHeader,
         separatorChar,
         allowQuoting,
         allowSparse,
         trimWhitespace,
         dataSample: dataSample);
 }
Пример #9
0
        /// <summary>
        /// Reads a data view from an <see cref="IMultiStreamSource"/> over a binary file by constructing a
        /// <see cref="BinaryLoader"/> with default arguments.
        /// </summary>
        /// <param name="catalog">The catalog.</param>
        /// <param name="fileSource">The file source to read from. This can be a <see cref="MultiFileSource"/>, for example. Must not be null.</param>
        /// <returns>The <see cref="BinaryLoader"/>, returned as an <see cref="IDataView"/>.</returns>
        public static IDataView ReadFromBinary(this DataOperationsCatalog catalog, IMultiStreamSource fileSource)
        {
            Contracts.CheckValue(fileSource, nameof(fileSource));

            // The loader itself is the data view handed back to the caller.
            return new BinaryLoader(catalog.GetEnvironment(), new BinaryLoader.Arguments(), fileSource);
        }
        /// <summary>
        /// Creates a <see cref="DatabaseLoader"/> from an explicit column schema.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="columns">Array of <see cref="DatabaseLoader.Column"/> entries defining the schema.</param>
        /// <returns>The created <see cref="DatabaseLoader"/>.</returns>
        public static DatabaseLoader CreateDatabaseLoader(this DataOperationsCatalog catalog,
                                                          params DatabaseLoader.Column[] columns)
        {
            // Only the column schema is set; every other option keeps its default.
            var loaderOptions = new DatabaseLoader.Options { Columns = columns };
            var env = CatalogUtils.GetEnvironment(catalog);

            return new DatabaseLoader(env, loaderOptions);
        }
Пример #11
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a text file using a <see cref="TextLoader"/>. The file source is
        /// passed to the loader both as its data sample and as the source to load.
        /// Because <see cref="IDataView"/>s are lazy, no data is actually read here; only the schema is validated.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file or path of files from which to load.</param>
        /// <param name="options">Defines the settings of the load operation. Optional; defaults to <see langword="null"/>.</param>
        /// <returns>The data view.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
                                                 TextLoader.Options options = null)
        {
            CheckValidPathContents(path);

            var host = catalog.GetEnvironment();
            var files = new MultiFileSource(path);
            var textLoader = new TextLoader(host, options, dataSample: files);

            return textLoader.Load(files);
        }
Пример #12
0
        /// <summary>
        /// Creates a <see cref="DatabaseLoader"/> from an explicit column schema by wrapping the columns in a
        /// <see cref="DatabaseLoader.Options"/> and delegating to the options-based overload.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="columns">Array of <see cref="DatabaseLoader.Column"/> entries defining the schema.</param>
        /// <returns>The created <see cref="DatabaseLoader"/>.</returns>
        public static DatabaseLoader CreateDatabaseLoader(this DataOperationsCatalog catalog,
                                                          params DatabaseLoader.Column[] columns)
        {
            var loaderOptions = new DatabaseLoader.Options();
            loaderOptions.Columns = columns;

            return catalog.CreateDatabaseLoader(loaderOptions);
        }
Пример #13
0
        /// <summary>
        /// Reads a data view from a binary file by constructing a <see cref="BinaryLoader"/> with default arguments.
        /// </summary>
        /// <param name="catalog">The catalog.</param>
        /// <param name="path">The path to the file to read from. Must be non-empty.</param>
        /// <returns>The <see cref="BinaryLoader"/>, returned as an <see cref="IDataView"/>.</returns>
        public static IDataView ReadFromBinary(this DataOperationsCatalog catalog, string path)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // The loader itself is the data view handed back to the caller.
            return new BinaryLoader(catalog.GetEnvironment(), new BinaryLoader.Arguments(), path);
        }
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a single text file with a <see cref="TextLoader"/> whose columns are
        /// inferred from <typeparamref name="TInput"/>. Because <see cref="IDataView"/>s are lazy, no data is
        /// actually read here; only the schema is validated.
        /// </summary>
        /// <typeparam name="TInput">The data model type used to infer the columns.</typeparam>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to load. Must be non-empty and refer to an existing file.</param>
        /// <param name="options">Defines the settings of the load operation. There is no need to specify a Columns field,
        /// as columns will be inferred by this method.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog catalog, string path,
                                                         TextLoader.Options options)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // Fail early with a parameter exception instead of deferring the error to load time.
            bool fileFound = File.Exists(path);
            if (!fileFound)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var env = CatalogUtils.GetEnvironment(catalog);
            var typedLoader = TextLoader.CreateTextLoader<TInput>(env, options);

            return typedLoader.Load(new MultiFileSource(path));
        }
Пример #15
0
        /// <summary>
        /// Writes a data view to a stream in binary form using a <see cref="BinarySaver"/> with default arguments.
        /// </summary>
        /// <param name="catalog">The catalog. Must not be null.</param>
        /// <param name="data">The data view to save. Must not be null.</param>
        /// <param name="stream">The stream to write to. Must not be null.</param>
        /// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
        public static void SaveAsBinary(this DataOperationsCatalog catalog, IDataView data, Stream stream,
                                        bool keepHidden = false)
        {
            Contracts.CheckValue(catalog, nameof(catalog));
            Contracts.CheckValue(data, nameof(data));
            Contracts.CheckValue(stream, nameof(stream));

            var host = catalog.GetEnvironment();
            var binarySaver = new BinarySaver(host, new BinarySaver.Arguments());

            // The channel is disposed as soon as the save completes or throws.
            using (var channel = host.Start("Saving data"))
            {
                DataSaverUtils.SaveDataView(channel, binarySaver, data, stream, keepHidden);
            }
        }
Пример #16
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a single text file using a <see cref="TextLoader"/>. The file source
        /// is passed to the loader both as its data sample and as the source to load.
        /// Because <see cref="IDataView"/>s are lazy, no data is actually read here; only the schema is validated.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to load. Must be non-empty and refer to an existing file.</param>
        /// <param name="options">Defines the settings of the load operation. Optional; defaults to <see langword="null"/>.</param>
        /// <returns>The data view.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
                                                 TextLoader.Options options = null)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // Fail early with a parameter exception instead of deferring the error to load time.
            bool fileFound = File.Exists(path);
            if (!fileFound)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var host = catalog.GetEnvironment();
            var files = new MultiFileSource(path);
            var textLoader = new TextLoader(host, options, dataSample: files);

            return textLoader.Load(files);
        }
Пример #17
0
        /// <summary>
        /// Create the ML context: builds the local environment, registers the message listener,
        /// and constructs every task catalog on top of that single environment.
        /// </summary>
        /// <param name="seed">Random seed. Set to <c>null</c> for a non-deterministic environment.</param>
        /// <param name="conc">Concurrency level. Set to 1 to run single-threaded. Set to 0 to pick automatically.</param>
        public MLContext(int?seed = null, int conc = 0)
        {
            // The environment is created first because every catalog below is constructed from it.
            _env = new LocalEnvironment(seed, conc, MakeCompositionContainer);
            // Register ProcessMessage as a listener on the environment.
            _env.AddListener(ProcessMessage);

            // All catalogs share the same environment instance.
            BinaryClassification     = new BinaryClassificationCatalog(_env);
            MulticlassClassification = new MulticlassClassificationCatalog(_env);
            Regression = new RegressionCatalog(_env);
            Clustering = new ClusteringCatalog(_env);
            Ranking    = new RankingCatalog(_env);
            Transforms = new TransformsCatalog(_env);
            Model      = new ModelOperationsCatalog(_env);
            Data       = new DataOperationsCatalog(_env);
        }
Пример #18
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a binary file by constructing a <see cref="BinaryLoader"/> with
        /// default arguments. Because <see cref="IDataView"/>s are lazy, no data is actually read here; only the
        /// schema is validated.
        /// </summary>
        /// <param name="catalog">The catalog.</param>
        /// <param name="path">The path to the file to load from. Must be non-empty and refer to an existing file.</param>
        /// <returns>The <see cref="BinaryLoader"/>, returned as an <see cref="IDataView"/>.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[LoadFromBinary](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, string path)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // Fail early with a parameter exception instead of deferring the error to load time.
            bool fileFound = File.Exists(path);
            if (!fileFound)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            // The loader itself is the data view handed back to the caller.
            return new BinaryLoader(catalog.GetEnvironment(), new BinaryLoader.Arguments(), path);
        }
Пример #19
0
        /// <summary>
        /// Loads an <see cref="IDataView"/> from a text file with a <see cref="TextLoader"/> whose schema is
        /// inferred from <typeparamref name="TInput"/>. Because <see cref="IDataView"/>s are lazy, no data is
        /// actually read here; only the schema is validated.
        /// </summary>
        /// <typeparam name="TInput">The data model type used to infer the schema.</typeparam>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file(s).</param>
        /// <param name="separatorChar">Column separator character. Default is '\t'.</param>
        /// <param name="hasHeader">Whether the file has a header. When <see langword="true"/>, the loader skips the first line when
        /// <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.</param>
        /// <param name="allowQuoting">Whether the input may include double-quoted values. This setting distinguishes separator characters
        /// inside an input value from actual separators. When <see langword="true"/>, separators within double quotes are treated as part of
        /// the input value. When <see langword="false"/>, all separators, even those within quotes, are treated as delimiting a new column.
        /// It also distinguishes empty values from missing values. When <see langword="true"/>, missing values are denoted by consecutive
        /// separators and empty values by \"\". When <see langword="false"/>, empty values are denoted by consecutive separators and missing
        /// values by the default missing value for each type documented in <see cref="DataKind"/>.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
        /// "5 2:6 4:3" means that there are 5 columns, and the only non-zero ones are columns 2 and 4, which have values 6 and 3,
        /// respectively. Column indices are zero-based, so columns 2 and 4 represent the 3rd and 5th columns.
        /// A column may also have dense values followed by sparse values represented in this fashion. For example,
        /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented
        /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog catalog,
                                                         string path,
                                                         char separatorChar = TextLoader.Defaults.Separator,
                                                         bool hasHeader = TextLoader.Defaults.HasHeader,
                                                         bool allowQuoting = TextLoader.Defaults.AllowQuoting,
                                                         bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                         bool allowSparse = TextLoader.Defaults.AllowSparse)
        {
            CheckValidPathContents(path);

            // REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
            // Therefore, we are going to disallow data sample.
            var env = CatalogUtils.GetEnvironment(catalog);
            var typedLoader = TextLoader.CreateTextLoader<TInput>(
                env,
                hasHeader,
                separatorChar,
                allowQuoting,
                allowSparse,
                trimWhitespace);

            return typedLoader.Load(new MultiFileSource(path));
        }
Пример #20
0
        /// <summary>
        /// Create the ML context: builds the local environment, registers the message listener,
        /// and constructs every task catalog on top of that single environment.
        /// </summary>
        /// <param name="seed">Random seed. Set to <c>null</c> for a non-deterministic environment.</param>
        public MLContext(int?seed = null)
        {
            // The environment is created first because every catalog below is constructed from it.
            _env = new LocalEnvironment(seed);
            // Register ProcessMessage as a listener on the environment.
            _env.AddListener(ProcessMessage);

            // All catalogs share the same environment instance.
            BinaryClassification     = new BinaryClassificationCatalog(_env);
            MulticlassClassification = new MulticlassClassificationCatalog(_env);
            Regression       = new RegressionCatalog(_env);
            Clustering       = new ClusteringCatalog(_env);
            Ranking          = new RankingCatalog(_env);
            AnomalyDetection = new AnomalyDetectionCatalog(_env);
            Transforms       = new TransformsCatalog(_env);
            Model            = new ModelOperationsCatalog(_env);
            Data             = new DataOperationsCatalog(_env);
        }
        /// <summary>
        /// Read a data view from a text file using <see cref="TextLoader"/>.
        /// The loader is created without a data sample.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file. Must be non-empty.</param>
        /// <param name="columns">The columns of the schema.</param>
        /// <param name="hasHeader">Whether the file has a header.</param>
        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
        /// <returns>The data view.</returns>
        public static IDataView ReadFromTextFile(
            this DataOperationsCatalog catalog,
            string path,
            TextLoader.Column[] columns,
            bool hasHeader = TextLoader.DefaultArguments.HasHeader,
            char separatorChar = TextLoader.DefaultArguments.Separator)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // REVIEW: a 'trainable' text loader is almost always a mistake here,
            // so the data sample is explicitly null.
            var loader = new TextLoader(catalog.GetEnvironment(), columns, hasHeader, separatorChar, dataSample: null);
            var source = new MultiFileSource(path);

            return loader.Read(source);
        }
Пример #22
0
        /// <summary>
        /// Fits <paramref name="estimator"/> once per fold and scores the held-out rows of that fold.
        /// For fold <c>i</c>, rows are split with two <see cref="RangeFilter"/> instances over
        /// <paramref name="samplingKeyColumn"/> using the range <c>[i / numFolds, (i + 1) / numFolds]</c>:
        /// the complement of the range is used for training and the range itself for testing.
        /// </summary>
        /// <param name="data">The data to split into folds.</param>
        /// <param name="estimator">The estimator to fit on each training split.</param>
        /// <param name="numFolds">Number of folds; must be more than 1.</param>
        /// <param name="samplingKeyColumn">Name of the column used to assign rows to folds; may be <c>null</c>.
        /// It is passed by reference to <c>EnsureGroupPreservationColumn</c>, which may replace it.</param>
        /// <param name="seed">Optional seed forwarded to <c>EnsureGroupPreservationColumn</c>.</param>
        /// <returns>One <see cref="CrossValidationResult"/> per fold, indexed by fold number.</returns>
        private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEstimator <ITransformer> estimator,
                                                                     int numFolds, string samplingKeyColumn, int? seed = null)
        {
            Environment.CheckValue(data, nameof(data));
            Environment.CheckValue(estimator, nameof(estimator));
            Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
            Environment.CheckValueOrNull(samplingKeyColumn);

            // Both 'data' and 'samplingKeyColumn' are handed over by ref and may come back changed.
            DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);

            // Sequential per-fold training.
            // REVIEW: this could be parallelized, but each fold would then need its own host.
            var foldResults = new CrossValidationResult[numFolds];

            for (int foldIndex = 0; foldIndex < numFolds; foldIndex++)
            {
                // Fractional bounds of the slice that is held out for this fold.
                double lowerBound = (double)foldIndex / numFolds;
                double upperBound = (double)(foldIndex + 1) / numFolds;

                // Training rows: everything outside the held-out slice.
                var trainingOptions = new RangeFilter.Options
                {
                    Column = samplingKeyColumn,
                    Min = lowerBound,
                    Max = upperBound,
                    Complement = true
                };
                var trainingView = new RangeFilter(Environment, trainingOptions, data);

                // Test rows: the held-out slice itself.
                var heldOutOptions = new RangeFilter.Options
                {
                    Column = samplingKeyColumn,
                    Min = lowerBound,
                    Max = upperBound,
                    Complement = false
                };
                var heldOutView = new RangeFilter(Environment, heldOutOptions, data);

                var fittedModel = estimator.Fit(trainingView);
                var scoredHeldOut = fittedModel.Transform(heldOutView);

                foldResults[foldIndex] = new CrossValidationResult(fittedModel, scoredHeldOut, foldIndex);
            }

            return foldResults;
        }
Пример #23
0
        /// <summary>
        /// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
        /// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file. Must be non-empty and must refer to an existing file.</param>
        /// <param name="separatorChar">Column separator character. Default is '\t'.</param>
        /// <param name="hasHeader">Whether the file has a header with feature names. Note: if a TextLoader is created with
        /// hasHeader = true but without a dataSample, vector columns made by TextLoader will not contain slot name annotations
        /// (slots being the elements of the given vector column), because the output schema is made when the TextLoader is made,
        /// and not when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called. In addition, dataSample = null together with
        /// hasHeader = true tells the loader to skip the first line of the file it is given when Load() is called.</param>
        /// <param name="allowQuoting">Whether the input may include quoted values, which can contain separator characters and
        /// colons, and which distinguish empty values from missing values. When true, consecutive separators denote a missing
        /// value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
        /// "5 2:6 4:3" means there are 5 columns, all zero except for the 3rd and 5th columns, which have values 6 and 3.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile <TInput>(
            this DataOperationsCatalog catalog,
            string path,
            char separatorChar = TextLoader.Defaults.Separator,
            bool hasHeader = TextLoader.Defaults.HasHeader,
            bool allowQuoting = TextLoader.Defaults.AllowQuoting,
            bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
            bool allowSparse = TextLoader.Defaults.AllowSparse)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            // Fail early when nothing exists at the given location.
            bool fileFound = File.Exists(path);
            if (!fileFound)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            // REVIEW: a 'trainable' text loader is almost always a mistake here,
            // so no data sample is handed to the loader.
            var env = CatalogUtils.GetEnvironment(catalog);
            var loader = TextLoader.CreateTextLoader <TInput>(env, hasHeader, separatorChar,
                                                              allowQuoting, allowSparse, trimWhitespace);
            var source = new MultiFileSource(path);

            return loader.Load(source);
        }
        /// <summary>
        /// Save the data view as text by writing it to <paramref name="stream"/> with a <see cref="TextSaver"/>.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="data">The data view to save.</param>
        /// <param name="stream">The stream to write to.</param>
        /// <param name="separatorChar">The column separator.</param>
        /// <param name="headerRow">Whether to write the header row.</param>
        /// <param name="schema">Whether to write the header comment with the schema.</param>
        /// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
        public static void SaveAsText(
            this DataOperationsCatalog catalog,
            IDataView data,
            Stream stream,
            char separatorChar = TextLoader.DefaultArguments.Separator,
            bool headerRow = TextLoader.DefaultArguments.HasHeader,
            bool schema = true,
            bool keepHidden = false)
        {
            Contracts.CheckValue(catalog, nameof(catalog));
            Contracts.CheckValue(data, nameof(data));
            Contracts.CheckValue(stream, nameof(stream));

            var env = catalog.GetEnvironment();

            // The saver takes its separator as a string, hence the conversion from char.
            var saverArgs = new TextSaver.Arguments
            {
                Separator = separatorChar.ToString(),
                OutputHeader = headerRow,
                OutputSchema = schema
            };
            var saver = new TextSaver(env, saverArgs);

            using (var channel = env.Start("Saving data"))
            {
                DataSaverUtils.SaveDataView(channel, saver, data, stream, keepHidden);
            }
        }
Пример #25
0
        /// <summary>
        /// Create a text loader <see cref="TextLoader"/>.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
        /// <param name="hasHeader">Whether the file has a header with feature names. When a <paramref name="dataSample"/> is provided, <see langword="true"/>
        /// indicates that the first line in the <paramref name="dataSample"/> will be used for feature names, and that when <see cref="TextLoader.Load(IMultiStreamSource)"/>
        /// is called, the first line will be skipped. When there is no <paramref name="dataSample"/> provided, <see langword="true"/> just indicates that the loader should
        /// skip the first line when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called, but columns will not have slot names annotations. This is
        /// because the output schema is made when the loader is created, and not when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.</param>
        /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer slot name annotations if present, and also the number
        /// of slots in a column defined with <see cref="TextLoader.Range"/> with <see langword="null"/> maximum index.
        /// If the sample has been saved with ML.NET's <see cref="SaveAsText(DataOperationsCatalog, IDataView, Stream, char, bool, bool, bool, bool)"/>,
        /// it will also contain the schema information in the header that the loader can read even if <paramref name="columns"/> is <see langword="null"/>.
        /// In order to use the schema defined in the file, all other arguments should be left with their default values.</param>
        /// <param name="allowQuoting">Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
        /// in an input value from actual separators. When <see langword="true"/>, separators within double quotes are treated as part of the
        /// input value. When <see langword="false"/>, all separators, even those within quotes, are treated as delimiting a new column.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
        /// "5 2:6 4:3" means that there are 5 columns, and the only non-zero are columns 2 and 4, which have values 6 and 3,
        /// respectively. Column indices are zero-based, so columns 2 and 4 represent the 3rd and 5th columns.
        /// A column may also have dense values followed by sparse values represented in this fashion. For example,
        /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented
        /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.</param>
        /// <returns>A <see cref="TextLoader"/> configured from the given arguments.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[CreateTextLoader](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingText.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static TextLoader CreateTextLoader(
            this DataOperationsCatalog catalog,
            TextLoader.Column[] columns,
            char separatorChar = TextLoader.Defaults.Separator,
            bool hasHeader = TextLoader.Defaults.HasHeader,
            IMultiStreamSource dataSample = null,
            bool allowQuoting = TextLoader.Defaults.AllowQuoting,
            bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
            bool allowSparse = TextLoader.Defaults.AllowSparse)
        {
            // Translate the flat argument list into the options object the loader consumes.
            var loaderOptions = new TextLoader.Options();
            loaderOptions.Columns = columns;
            loaderOptions.Separators = new[] { separatorChar }; // single separator wrapped in an array
            loaderOptions.HasHeader = hasHeader;
            loaderOptions.AllowQuoting = allowQuoting;
            loaderOptions.TrimWhitespace = trimWhitespace;
            loaderOptions.AllowSparse = allowSparse;

            var env = CatalogUtils.GetEnvironment(catalog);

            return new TextLoader(env, options: loaderOptions, dataSample: dataSample);
        }
Пример #26
0
 /// <summary>
 /// Create a text loader <see cref="TextLoader"/> from a pre-built <see cref="TextLoader.Options"/> instance.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="options">Defines the settings of the load operation.</param>
 /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer slot name annotations if present, and also the number
 /// of slots in <see cref="TextLoader.Options.Columns"/> defined with <see cref="TextLoader.Range"/> with <see langword="null"/> maximum index.
 /// If the sample has been saved with ML.NET's <see cref="SaveAsText(DataOperationsCatalog, IDataView, Stream, char, bool, bool, bool, bool)"/>,
 /// it will also contain the schema information in the header that the loader can read even if <see cref="TextLoader.Options.Columns"/> are not specified.
 /// In order to use the schema defined in the file, all other <see cref="TextLoader.Options"/> should be left with their default values.</param>
 /// <returns>The newly constructed <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader(
     this DataOperationsCatalog catalog,
     TextLoader.Options options,
     IMultiStreamSource dataSample = null)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     return new TextLoader(env, options, dataSample);
 }
Пример #27
0
 /// <summary>
 /// Create a database loader <see cref="DatabaseLoader"/>.
 /// </summary>
 /// <typeparam name="TInput">Defines the schema of the data to be loaded. Use public fields or properties
 /// decorated with <see cref="LoadColumnAttribute"/> (and possibly other attributes) to specify the column
 /// names and their data types in the schema of the loaded data.</typeparam>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <returns>The <see cref="DatabaseLoader"/> produced by <c>DatabaseLoader.CreateDatabaseLoader</c>.</returns>
 public static DatabaseLoader CreateDatabaseLoader <TInput>(this DataOperationsCatalog catalog)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     return DatabaseLoader.CreateDatabaseLoader <TInput>(env);
 }
 /// <summary>
 /// Drop rows where a specified predicate returns true.
 /// </summary>
 /// <typeparam name="TSrc">The class defining which columns to take from the incoming data.</typeparam>
 /// <param name="catalog">The data operations catalog.</param>
 /// <param name="input">The input data.</param>
 /// <param name="filterPredicate">A predicate that takes an input of type <typeparamref name="TSrc"/> and returns
 /// true if the row should be filtered (dropped) and false otherwise.</param>
 /// <returns>The input wrapped in a <c>CustomMappingFilter</c>.</returns>
 public static IDataView FilterByCustomPredicate <TSrc>(
     this DataOperationsCatalog catalog,
     IDataView input,
     Func <TSrc, bool> filterPredicate)
     where TSrc : class, new()
 {
     var env = catalog.GetEnvironment();

     return new CustomMappingFilter <TSrc>(env, input, filterPredicate);
 }
 /// <summary>
 /// Create a text loader <see cref="TextLoader"/>.
 /// </summary>
 /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
 /// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
 /// <param name="hasHeader">Whether the file has a header.</param>
 /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
 /// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
 /// <returns>The newly constructed <see cref="TextLoader"/>.</returns>
 public static TextLoader CreateTextLoader(
     this DataOperationsCatalog catalog,
     TextLoader.Column[] columns,
     bool hasHeader = TextLoader.DefaultArguments.HasHeader,
     char separatorChar = TextLoader.DefaultArguments.Separator,
     IMultiStreamSource dataSample = null)
 {
     var env = CatalogUtils.GetEnvironment(catalog);

     return new TextLoader(env, columns, hasHeader, separatorChar, dataSample);
 }
 /// <summary>
 /// Drop rows where a specified predicate returns true. This filter allows to maintain a per-cursor state.
 /// </summary>
 /// <typeparam name="TSrc">The class defining which columns to take from the incoming data.</typeparam>
 /// <typeparam name="TState">The type that describes per-cursor state.</typeparam>
 /// <param name="catalog">The data operations catalog.</param>
 /// <param name="input">The input data.</param>
 /// <param name="filterPredicate">A predicate that takes an input of type <typeparamref name="TSrc"/> and a state object of type
 /// <typeparamref name="TState"/>, and returns true if the row should be filtered (dropped) and false otherwise.</param>
 /// <param name="stateInitAction">The action to initialize the state object, that is called once before the cursor is initialized.</param>
 /// <returns>The input wrapped in a <c>StatefulCustomMappingFilter</c>.</returns>
 public static IDataView FilterByStatefulCustomPredicate <TSrc, TState>(
     this DataOperationsCatalog catalog,
     IDataView input,
     Func <TSrc, TState, bool> filterPredicate,
     Action <TState> stateInitAction)
     where TSrc : class, new()
     where TState : class, new()
 {
     var env = catalog.GetEnvironment();

     return new StatefulCustomMappingFilter <TSrc, TState>(env, input, filterPredicate, stateInitAction);
 }