/// <summary>
        /// Tries to detect the text loader arguments for <paramref name="source"/>.
        /// Candidate settings are tried in order and the first 'acceptable' combination wins: one that yields the
        /// same number of columns in at least <see cref="UniformColumnCountThreshold"/> of the sample's lines,
        /// with that number of columns being more than 1.
        /// The sweep covers the allow-sparse flag (outermost), the allow-quote flag, and the separator (innermost).
        /// </summary>
        /// <param name="context">The context handed to each parsing attempt.</param>
        /// <param name="source">The data source to probe.</param>
        /// <param name="separatorCandidates">Separator characters to try, in order.</param>
        /// <returns>The result of the first successful parsing attempt; otherwise a fallback
        /// <see cref="ColumnSplitResult"/> constructed with <c>(false, null, true, true, 0)</c>.</returns>
        public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamSource source, char[] separatorCandidates)
        {
            var sparseChoices = new[] { false, true };
            var quoteChoices  = new[] { true, false };

            foreach (var allowSparse in sparseChoices)
            {
                foreach (var allowQuote in quoteChoices)
                {
                    foreach (var separator in separatorCandidates)
                    {
                        // A single string column spanning every field (index 0 to the end of the line).
                        var allFields = new TextLoader.Column()
                        {
                            Name     = "C",
                            DataKind = DataKind.String,
                            Source   = new[] { new TextLoader.Range(0, null) }
                        };

                        var candidate = new TextLoader.Options
                        {
                            Columns      = new[] { allFields },
                            Separators   = new[] { separator },
                            AllowQuoting = allowQuote,
                            AllowSparse  = allowSparse
                        };

                        // Stop at the first combination that parses acceptably.
                        if (TryParseFile(context, candidate, source, out var split))
                        {
                            return split;
                        }
                    }
                }
            }

            // No combination was accepted.
            return new ColumnSplitResult(false, null, true, true, 0);
        }
예제 #2
0
        /// <summary>
        /// Loads the comma-separated text file at <paramref name="path"/> (with a header row) as an
        /// <see cref="IDataView"/> exposing twelve <see cref="DataKind.Single"/> columns.
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        /// <returns>The data view produced by the text loader.</returns>
        public IDataView Build(string path)
        {
            // Every column is a Single read from one source index.
            TextLoader.Column Numeric(string name, int index) => new TextLoader.Column(name, DataKind.Single, index);

            var readerOptions = new TextLoader.Options()
            {
                Separators = new[] { ',' },
                HasHeader  = true,
                Columns    = new[]
                {
                    Numeric("Season", 2),
                    Numeric("Year", 3),
                    Numeric("Month", 4),
                    Numeric("Hour", 5),
                    Numeric("Holiday", 6),
                    Numeric("Weekday", 7),
                    Numeric("Weather", 8),
                    Numeric("Temperature", 9),
                    Numeric("NormalizedTemperature", 10),
                    Numeric("Humidity", 11),
                    Numeric("Windspeed", 12),
                    // Source indices 13 and 14 are not loaded.
                    Numeric("Count", 15),
                }
            };

            return mLContext.Data.LoadFromTextFile(path, readerOptions);
        }
예제 #3
0
        /// <summary>
        /// Prepares the sentiment example input and trains a text-featurizing SDCA (non-calibrated) binary
        /// classifier, storing the resulting prediction engine in <c>_sentimentModel</c>.
        /// </summary>
        public void SetupSentimentPipeline()
        {
            _sentimentExample = new SentimentData() { SentimentText = "Not a big fan of this." };

            string trainingFile = GetBenchmarkDataPath("wikipedia-detox-250-line-data.tsv");

            var context = new MLContext(seed: 1);

            // Text loader: boolean label in field 0, raw text in field 1, header row present.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("SentimentText", DataKind.String, 1)
                },
                HasHeader = true,
            };
            var textLoader = new TextLoader(context, options: loaderOptions);

            IDataView trainingData = textLoader.Load(trainingFile);

            // Components are created in the same order as a single chained expression would create them.
            var featurizer     = context.Transforms.Text.FeaturizeText("Features", "SentimentText");
            var trainerOptions = new SdcaNonCalibratedBinaryTrainer.Options
            {
                NumberOfThreads      = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var trainer  = context.BinaryClassification.Trainers.SdcaNonCalibrated(trainerOptions);
            var pipeline = featurizer.Append(trainer);

            var trainedModel = pipeline.Fit(trainingData);

            _sentimentModel = context.Model.CreatePredictionEngine<SentimentData, SentimentPrediction>(trainedModel);
        }
        /// <summary>
        /// Builds a one-versus-all pipeline around an SDCA binary trainer whose feature column is named
        /// "Vars" rather than the default, fits it on the iris training data, and runs the estimator checks.
        /// </summary>
        public void MetacomponentsFeaturesRenamed()
        {
            // Text loader over the comma-separated iris training file.
            var loaderOptions = new TextLoader.Options()
            {
                Columns    = TestDatasets.irisData.GetLoaderColumns(),
                Separators = new[] { ',' },
            };
            var textLoader = new TextLoader(Env, options: loaderOptions);

            var trainingData = textLoader.Load(GetDataPath(TestDatasets.irisData.trainFilename));

            // Binary trainer reading its features from the renamed "Vars" column.
            var binaryOptions = new SdcaNonCalibratedBinaryClassificationTrainer.Options
            {
                LabelColumnName           = "Label",
                FeatureColumnName         = "Vars",
                MaximumNumberOfIterations = 100,
                Shuffle                   = true,
                NumberOfThreads           = 1,
            };
            var sdcaTrainer = ML.BinaryClassification.Trainers.SdcaNonCalibrated(binaryOptions);

            // Concatenate features -> key-map label -> one-versus-all -> map predicted key back to value.
            var concatenated = new ColumnConcatenatingEstimator(Env, "Vars", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
            var keyed        = concatenated.Append(new ValueToKeyMappingEstimator(Env, "Label"), TransformerScope.TrainTest);
            var classified   = keyed.Append(ML.MulticlassClassification.Trainers.OneVersusAll(sdcaTrainer));
            var pipeline     = classified.Append(new KeyToValueMappingEstimator(Env, "PredictedLabel"));

            // Fit once directly; the fitted model itself is not used further in this method.
            var fitted = pipeline.Fit(trainingData);

            TestEstimatorCore(pipeline, trainingData);
            Done();
        }
예제 #5
0
        /// <summary>
        /// Loads the file at <paramref name="dataPath"/> (header row, label plus four Single measurements)
        /// and fits a concatenate / key-map / SDCA maximum entropy pipeline on it.
        /// </summary>
        /// <param name="dataPath">Path of the training data file.</param>
        /// <returns>The fitted transformer chain.</returns>
        private TransformerChain<MulticlassPredictionTransformer<MaximumEntropyModelParameters>> Train(string dataPath)
        {
            // Text loader: label in field 0, measurements in fields 1-4.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Single, 0),
                    new TextLoader.Column("SepalLength", DataKind.Single, 1),
                    new TextLoader.Column("SepalWidth", DataKind.Single, 2),
                    new TextLoader.Column("PetalLength", DataKind.Single, 3),
                    new TextLoader.Column("PetalWidth", DataKind.Single, 4),
                },
                HasHeader = true,
            };
            var textLoader = new TextLoader(_mlContext, options: loaderOptions);

            IDataView trainingData = textLoader.Load(dataPath);

            var featureSources = new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };
            var concatenated   = new ColumnConcatenatingEstimator(_mlContext, "Features", featureSources);
            var keyed          = concatenated.Append(_mlContext.Transforms.Conversion.MapValueToKey("Label"));
            var pipeline       = keyed.Append(_mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy());

            return pipeline.Fit(trainingData);
        }
예제 #6
0
        /// <summary>
        /// Configures a statically-typed loader for text files.
        /// </summary>
        /// <typeparam name="TShape">The type shape parameter, which must be a valid-schema shape. In practice
        /// it is rarely written out explicitly by the user; it is inferred from the return type of
        /// <paramref name="func"/>, which takes an input <see cref="Context"/> and uses it to compose a
        /// shape-type instance describing what the columns are and how to load them from the file.</typeparam>
        /// <param name="env">The environment.</param>
        /// <param name="func">The delegate that describes which fields to read from the text file and their
        /// input types. The delegate is fed a <see cref="Context"/>, and the user composes a shape type with
        /// <see cref="PipelineColumn"/> instances out of that <see cref="Context"/>. The resulting data has
        /// columns named after the corresponding members of the shape type.</param>
        /// <param name="files">Input files. May be <c>null</c>, in which case no files are read; note that this
        /// affects options or configurations that require input data for initialization (for example,
        /// <paramref name="hasHeader"/>, or <see cref="Context.LoadFloat(int, int?)"/> with a <c>null</c>
        /// second argument).</param>
        /// <param name="separator">Text field separator.</param>
        /// <param name="hasHeader">Data file has header with feature names.</param>
        /// <param name="allowQuoting">Whether the input may include quoted values, which can contain separator
        /// characters, colons, and distinguish empty values from missing values. When true, consecutive separators
        /// denote a missing value and an empty value is denoted by <c>""</c>. When false, consecutive separators
        /// denote an empty value.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations.</param>
        /// <param name="trimWhitspace">Remove trailing whitespace from lines.</param>
        /// <returns>A configured statically-typed loader for text files.</returns>
        public static DataLoader<IMultiStreamSource, TShape> CreateLoader<[IsShape] TShape>(
            IHostEnvironment env,
            Func<Context, TShape> func,
            IMultiStreamSource files = null,
            char separator = '\t',
            bool hasHeader = false,
            bool allowQuoting = true,
            bool allowSparse = true,
            bool trimWhitspace = false)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(func, nameof(func));
            env.CheckValueOrNull(files);

            // Everything except the columns is known up front; the columns come from the shape delegate.
            var loaderOptions = new TextLoader.Options
            {
                AllowQuoting   = allowQuoting,
                AllowSparse    = allowSparse,
                HasHeader      = hasHeader,
                Separators     = new[] { separator },
                TrimWhitespace = trimWhitspace
            };

            var reconciler = new TextReconciler(loaderOptions, files);
            var context    = new Context(reconciler);

            using (var channel = env.Start("Initializing " + nameof(TextLoader)))
            {
                var estimator = StaticPipeUtils.LoaderEstimatorAnalyzerHelper(env, channel, context, reconciler, func);
                Contracts.AssertValue(estimator);
                return estimator.Fit(files);
            }
        }
예제 #7
0
        /// <summary>
        /// Prepares the breast-cancer example input and trains an SDCA (non-calibrated) binary classifier,
        /// storing the resulting prediction engine in <c>_breastCancerModel</c>.
        /// </summary>
        public void SetupBreastCancerPipeline()
        {
            _breastCancerExample = new BreastCancerData()
            {
                Features = new[] { 5f, 1f, 1f, 1f, 2f, 1f, 3f, 1f, 1f }
            };

            string trainingFile = GetBenchmarkDataPath("breast-cancer.txt");

            var context = new MLContext(seed: 1);

            // Text loader: boolean label in field 0, features taken from the field range 1..9, no header row.
            var featureRange  = new[] { new TextLoader.Range(1, 9) };
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("Features", DataKind.Single, featureRange)
                },
                HasHeader = false,
            };
            var textLoader = new TextLoader(context, options: loaderOptions);

            IDataView trainingData = textLoader.Load(trainingFile);

            var trainerOptions = new SdcaNonCalibratedBinaryTrainer.Options
            {
                NumberOfThreads      = 1,
                ConvergenceTolerance = 1e-2f,
            };
            var trainer = context.BinaryClassification.Trainers.SdcaNonCalibrated(trainerOptions);

            var trainedModel = trainer.Fit(trainingData);

            _breastCancerModel = context.Model.CreatePredictionEngine<BreastCancerData, BreastCancerPrediction>(trainedModel);
        }
예제 #8
0
        /// <summary>
        /// Loads six numeric columns from the packaged <c>titanic.csv</c> file and returns each column of the
        /// resulting data view as a list of doubles, in schema order.
        /// </summary>
        /// <returns>A task producing one list of values per column of the loaded data view.</returns>
        public Task<List<List<double>>> LoadCorrelationData()
        {
            return Task.Run(async () =>
            {
                var csvPath = await MlDotNet.FilePath(@"ms-appx:///Data/titanic.csv");

                var loaderOptions = new TextLoader.Options()
                {
                    Separators = new[] { ',' },
                    HasHeader = true,
                    AllowQuoting = true,
                    Columns = new[]
                    {
                        new TextLoader.Column("Survived", DataKind.Single, 1),
                        new TextLoader.Column("PClass", DataKind.Single, 2),
                        new TextLoader.Column("Age", DataKind.Single, 5),
                        new TextLoader.Column("SibSp", DataKind.Single, 6),
                        new TextLoader.Column("Parch", DataKind.Single, 7),
                        new TextLoader.Column("Fare", DataKind.Single, 9)
                    }
                };

                var view = _mlContext.Data.LoadFromTextFile(csvPath, loaderOptions);

                // Materialize every schema column as doubles, widening from the loaded float values.
                var columnsAsDoubles = new List<List<double>>();
                foreach (var schemaColumn in view.Schema)
                {
                    var values = view.GetColumn<float>(schemaColumn).Select(f => (double)f).ToList();
                    columnsAsDoubles.Add(values);
                }

                return columnsAsDoubles;
            });
        }
예제 #9
0
        /// <summary>
        /// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
        /// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file. It must be non-empty and refer to an existing file.</param>
        /// <param name="columns">The columns of the schema.</param>
        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
        /// <param name="hasHeader">Whether the file has a header with feature names. Note: If a TextLoader is created with hasHeader = true but without a
        /// dataSample, then vector columns made by TextLoader will not contain slot name annotations (slots being the elements of the given vector column),
        /// because the output schema is made when the TextLoader is made, and not when <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.
        /// In addition, the case where dataSample = null and hasHeader = true indicates to the loader that when it is given a file when Load()
        /// is called, it needs to skip the first line.</param>
        /// <param name="allowQuoting">Whether the file can contain columns defined by a quoted string.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
                                                 string path,
                                                 TextLoader.Column[] columns,
                                                 char separatorChar  = TextLoader.Defaults.Separator,
                                                 bool hasHeader      = TextLoader.Defaults.HasHeader,
                                                 bool allowQuoting   = TextLoader.Defaults.AllowQuoting,
                                                 bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                 bool allowSparse    = TextLoader.Defaults.AllowSparse)
        {
            // Validate the path before any loader is built.
            Contracts.CheckNonEmpty(path, nameof(path));
            bool fileMissing = !File.Exists(path);
            if (fileMissing)
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var loaderOptions = new TextLoader.Options
            {
                Columns        = columns,
                Separators     = new[] { separatorChar },
                HasHeader      = hasHeader,
                AllowQuoting   = allowQuoting,
                TrimWhitespace = trimWhitespace,
                AllowSparse    = allowSparse
            };

            var environment = CatalogUtils.GetEnvironment(catalog);
            var textLoader  = new TextLoader(environment, options: loaderOptions);
            var fileSource  = new MultiFileSource(path);

            return textLoader.Load(fileSource);
        }
예제 #10
0
        /// <summary>
        /// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
        /// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">The path to the file(s).</param>
        /// <param name="columns">The columns of the schema.</param>
        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
        /// <param name="hasHeader">Whether the file has a header. When <see langword="true"/>, the loader will skip the first line when
        /// <see cref="TextLoader.Load(IMultiStreamSource)"/> is called.</param>
        /// <param name="allowQuoting">Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
        /// in an input value from actual separators. When <see langword="true"/>, separators within double quotes are treated as part of the
        /// input value. When <see langword="false"/>, all separators, even those within quotes, are treated as delimiting a new column.
        /// It is also used to distinguish empty values from missing values. When <see langword="true"/>, missing values are denoted by consecutive
        /// separators and empty values by \"\". When <see langword="false"/>, empty values are denoted by consecutive separators and missing
        /// values by the default missing value for each type documented in <see cref="DataKind"/>.</param>
        /// <param name="trimWhitespace">Remove trailing whitespace from lines.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations. For example, a row containing
        /// "5 2:6 4:3" means that there are 5 columns, and the only non-zero are columns 2 and 4, which have values 6 and 3,
        /// respectively. Column indices are zero-based, so columns 2 and 4 represent the 3rd and 5th columns.
        /// A column may also have dense values followed by sparse values represented in this fashion. For example,
        /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented
        /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
                                                 string path,
                                                 TextLoader.Column[] columns,
                                                 char separatorChar  = TextLoader.Defaults.Separator,
                                                 bool hasHeader      = TextLoader.Defaults.HasHeader,
                                                 bool allowQuoting   = TextLoader.Defaults.AllowQuoting,
                                                 bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
                                                 bool allowSparse    = TextLoader.Defaults.AllowSparse)
        {
            // Path validation happens before any loader is built.
            CheckValidPathContents(path);

            var loaderOptions = new TextLoader.Options
            {
                Columns        = columns,
                Separators     = new[] { separatorChar },
                HasHeader      = hasHeader,
                AllowQuoting   = allowQuoting,
                TrimWhitespace = trimWhitespace,
                AllowSparse    = allowSparse
            };

            var environment = CatalogUtils.GetEnvironment(catalog);
            var textLoader  = new TextLoader(environment, options: loaderOptions);
            var fileSource  = new MultiFileSource(path);

            return textLoader.Load(fileSource);
        }
예제 #11
0
        /// <summary>
        /// Combines previously inferred split and type information into final column inference results:
        /// loads the data with the typed columns, validates the user-specified columns, infers column purposes,
        /// optionally groups columns, and packages the resulting loader options and column information.
        /// </summary>
        /// <param name="context">The ML context used to create the loader and run inference.</param>
        /// <param name="path">Path of the data to load.</param>
        /// <param name="columnInfo">Column information supplied by the caller; its columns must exist in the loaded data.</param>
        /// <param name="hasHeader">Whether the data has a header row.</param>
        /// <param name="splitInference">Previously inferred separator / quoting / sparsity / multiline settings.</param>
        /// <param name="typeInference">Previously inferred column types.</param>
        /// <param name="trimWhitespace">Value forwarded to the loader's <c>TrimWhitespace</c> option.</param>
        /// <param name="groupColumns">When true, columns are grouped and named by grouping inference; otherwise the
        /// typed loader columns are returned as they are.</param>
        /// <returns>The inferred text loader options together with the inferred column information.</returns>
        public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, bool hasHeader,
                                                          TextFileContents.ColumnSplitResult splitInference, ColumnTypeInference.InferenceResult typeInference,
                                                          bool trimWhitespace, bool groupColumns)
        {
            // Both the probing loader and the final result share every option except the column set.
            TextLoader.Options CreateLoaderOptions(TextLoader.Column[] columns)
            {
                return new TextLoader.Options
                {
                    Columns        = columns,
                    Separators     = new[] { splitInference.Separator.Value },
                    AllowSparse    = splitInference.AllowSparse,
                    AllowQuoting   = splitInference.AllowQuote,
                    ReadMultilines = splitInference.ReadMultilines,
                    HasHeader      = hasHeader,
                    TrimWhitespace = trimWhitespace
                };
            }

            var typedColumns = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns);
            var typedLoader  = context.Data.CreateTextLoader(CreateLoaderOptions(typedColumns));
            var typedView    = typedLoader.Load(path);

            // Validate all columns specified in column info exist in inferred data view.
            ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, typedView);

            var inferredPurposes = PurposeInference.InferPurposes(context, typedView, columnInfo);

            IEnumerable<TextLoader.Column> finalColumns;
            IEnumerable<(string, ColumnPurpose)> finalPurposes;

            if (!groupColumns)
            {
                // No grouping: keep the typed columns and name each purpose after its schema column.
                finalColumns  = typedColumns;
                finalPurposes = inferredPurposes.Select(p => (typedView.Schema[p.ColumnIndex].Name, p.Purpose));
            }
            else
            {
                // Infer column grouping and generate column names.
                var grouped = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
                                                                            typeInference.Columns, inferredPurposes);

                finalColumns  = grouped.Select(c => c.GenerateTextLoaderColumn());
                finalPurposes = grouped.Select(c => (c.SuggestedName, c.Purpose));
            }

            var finalOptions = CreateLoaderOptions(finalColumns.ToArray());

            return new ColumnInferenceResults()
            {
                TextLoaderOptions = finalOptions,
                ColumnInformation = ColumnInformationUtil.BuildColumnInfo(finalPurposes)
            };
        }
예제 #12
0
        /// <summary>
        /// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
        /// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
        /// </summary>
        /// <typeparam name="TInput">The type passed to <c>TextLoader.CreateTextLoader</c> when building the loader.</typeparam>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file or path of files from which to load.</param>
        /// <param name="options">Defines the settings of the load operation. No need to specify a Columns field,
        /// as columns will be inferred by this method.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog catalog, string path,
                                                         TextLoader.Options options)
        {
            CheckValidPathContents(path);

            var environment = CatalogUtils.GetEnvironment(catalog);
            var textLoader  = TextLoader.CreateTextLoader<TInput>(environment, options);
            var fileSource  = new MultiFileSource(path);

            return textLoader.Load(fileSource);
        }
예제 #13
0
            /// <summary>
            /// Creates a reconciler that keeps the given loader options and the (optional) input files.
            /// </summary>
            /// <param name="options">The text loader options; asserted to be non-null.</param>
            /// <param name="files">The input files; may be <c>null</c>.</param>
            public TextReconciler(TextLoader.Options options, IMultiStreamSource files)
            {
                // Options are required, whereas the file source is allowed to be absent.
                Contracts.AssertValue(options);
                Contracts.AssertValueOrNull(files);

                _files = files;
                _args  = options;
            }
예제 #14
0
        /// <summary>
        /// Trains an SDCA logistic regression model on featurized text from the
        /// <c>wikipedia-detox-250-line-data.tsv</c> test file and returns a scorer for it.
        /// </summary>
        /// <returns>A scorer transform built from the trained model and the featurized (uncached) data.</returns>
        private static IDataScorerTransform _TrainSentiment()
        {
            bool normalize = true;

            // Tab-separated input with a header: boolean label in field 0, text in field 1.
            var loaderOptions = new TextLoader.Options()
            {
                Separators = new[] { '\t' },
                HasHeader  = true,
                Columns    = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("SentimentText", DataKind.String, 1)
                }
            };

            // Featurization: lower-case, drop diacritics and punctuation, char 3-grams plus word n-grams up to 2.
            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                KeepDiacritics         = false,
                KeepPunctuations       = false,
                CaseMode               = TextNormalizingEstimator.CaseMode.Lower,
                OutputTokensColumnName = "tokens",
                Norm                   = normalize ? TextFeaturizingEstimator.NormFunction.L2 : TextFeaturizingEstimator.NormFunction.None,
                CharFeatureExtractor   = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
                WordFeatureExtractor   = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
            };

            var trainingFile = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");

            // NOTE(review): the environment is not wrapped in a using/dispose here - confirm that is intended.
            var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1);

            // Pipeline: load, then featurize.
            var rawData    = new TextLoader(env, loaderOptions).Load(new MultiFileSource(trainingFile));
            var featurized = TextFeaturizingEstimator.Create(env, featurizerOptions, rawData);

            // Train on a cached view of the featurized data.
            var trainerOptions = new SdcaLogisticRegressionBinaryTrainer.Options
            {
                LabelColumnName   = "Label",
                FeatureColumnName = "Features"
            };
            var trainer = new SdcaLogisticRegressionBinaryTrainer(env, trainerOptions);

            var cachedView = new Microsoft.ML.Data.CacheDataView(env, featurized, prefetch: null);
            var predictor  = trainer.Fit(cachedView);

            // The training schema comes from the cached view; scoring runs over the featurized data.
            var trainingRoles = new RoleMappedData(cachedView, label: "Label", feature: "Features");
            var scoringRoles  = new RoleMappedData(featurized, label: "Label", feature: "Features");

            return ScoreUtils.GetScorer(predictor.Model, scoringRoles, env, trainingRoles.Schema);
        }
        /// <summary>
        /// Builds a mocked image pipeline (load, resize, extract pixels, normalize, ONNX model, label mapping)
        /// together with matching mocked column inference results.
        /// </summary>
        /// <returns>The mocked pipeline and its column inference results.</returns>
        private (Pipeline, ColumnInferenceResults) GetMockedAzureImagePipelineAndInference()
        {
            // Construct pipeline.
            var onnxProperties = new Dictionary<string, object>()
            {
                { "outputColumnNames", "output1" },
                { "inputColumnNames", "input1" },
            };
            var onnxNode = new PipelineNode(
                nameof(SpecialTransformer.ApplyOnnxModel),
                PipelineNodeType.Transform,
                new[] { "input.1" },
                new[] { "output.1" },
                onnxProperties);

            var loadNode = new PipelineNode(EstimatorName.ImageLoading.ToString(), PipelineNodeType.Transform, "ImageSource", "ImageSource_featurized");

            var resizeProperties = new Dictionary<string, object>()
            {
                { "imageWidth", 224 },
                { "imageHeight", 224 },
            };
            var resizeNode = new PipelineNode(
                nameof(SpecialTransformer.ResizeImage),
                PipelineNodeType.Transform,
                "ImageSource_featurized",
                "ImageSource_featurized",
                resizeProperties);

            var pixelsNode    = new PipelineNode(nameof(SpecialTransformer.ExtractPixel), PipelineNodeType.Transform, "ImageSource_featurized", "ImageSource_featurized");
            var normalizeNode = new PipelineNode(nameof(SpecialTransformer.NormalizeMapping), PipelineNodeType.Transform, string.Empty, string.Empty);
            var labelMapNode  = new PipelineNode(nameof(SpecialTransformer.LabelMapping), PipelineNodeType.Transform, string.Empty, string.Empty);

            // Node order defines the pipeline order.
            var nodes = new PipelineNode[]
            {
                loadNode,
                resizeNode,
                pixelsNode,
                normalizeNode,
                onnxNode,
                labelMapNode,
            };
            var mockedPipeline = new Pipeline(nodes);

            // Construct column inference.
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("Label", DataKind.String, 0),
                    // NOTE(review): the original author left a "0?" marker on this source index - confirm 1 is intended.
                    new TextLoader.Column("ImageSource", DataKind.String, 1),
                },
                AllowQuoting = true,
                AllowSparse  = true,
                HasHeader    = true,
                Separators   = new[] { '\t' }
            };

            var mockedInference = new ColumnInferenceResults()
            {
                TextLoaderOptions = loaderOptions,
                ColumnInformation = new ColumnInformation() { LabelColumnName = "Label" }
            };

            return (mockedPipeline, mockedInference);
        }
예제 #16
0
        /// <summary>
        /// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
        /// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file or path of files from which to load.</param>
        /// <param name="options">Defines the settings of the load operation. May be omitted, in which case
        /// <see langword="null"/> is passed to the <see cref="TextLoader"/>.</param>
        /// <returns>The data view.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
                                                 TextLoader.Options options = null)
        {
            CheckValidPathContents(path);

            var environment = catalog.GetEnvironment();
            var fileSource  = new MultiFileSource(path);

            // The same source is used both as the loader's data sample and as the data to load.
            var textLoader = new TextLoader(environment, options, dataSample: fileSource);
            return textLoader.Load(fileSource);
        }
        /// <summary>
        /// Returns the mocked matrix-factorization recommendation pipeline and its column inference results.
        /// Both are built on the first call (while <c>_mockedPipeline</c> is still <see langword="null"/>) and
        /// the cached instances are returned on every later call.
        /// </summary>
        /// <returns>The cached <see cref="Pipeline"/> paired with the cached <see cref="ColumnInferenceResults"/>.</returns>
        private (Pipeline, ColumnInferenceResults) GetMockedRecommendationPipelineAndInference()
        {
            // Already initialized: hand back the cached pair.
            if (_mockedPipeline != null)
            {
                return (_mockedPipeline, _columnInference);
            }

            // Hyperparameters handed to the matrix factorization trainer node.
            var hyperParam = new Dictionary<string, object>()
            {
                { "MatrixColumnIndexColumnName", "userId" },
                { "MatrixRowIndexColumnName", "movieId" },
                { "LabelColumnName", "Label" },
                { nameof(MatrixFactorizationTrainer.Options.NumberOfIterations), 10 },
                { nameof(MatrixFactorizationTrainer.Options.LearningRate), 0.01f },
                { nameof(MatrixFactorizationTrainer.Options.ApproximationRank), 8 },
                { nameof(MatrixFactorizationTrainer.Options.Lambda), 0.01f },
                { nameof(MatrixFactorizationTrainer.Options.LossFunction), MatrixFactorizationTrainer.LossFunctionType.SquareLossRegression },
                { nameof(MatrixFactorizationTrainer.Options.Alpha), 1f },
                { nameof(MatrixFactorizationTrainer.Options.C), 0.00001f },
            };

            // Two value-to-key transforms (user and item ids) followed by the trainer.
            var valueToKeyPipelineNode1 = new PipelineNode(nameof(EstimatorName.ValueToKeyMapping), PipelineNodeType.Transform, "userId", "userId");
            var valueToKeyPipelineNode2 = new PipelineNode(nameof(EstimatorName.ValueToKeyMapping), PipelineNodeType.Transform, "movieId", "movieId");
            var matrixPipelineNode      = new PipelineNode(nameof(TrainerName.MatrixFactorization), PipelineNodeType.Trainer, "Features", "Score", hyperParam);

            _mockedPipeline = new Pipeline(new PipelineNode[]
            {
                valueToKeyPipelineNode1,
                valueToKeyPipelineNode2,
                matrixPipelineNode
            });

            var textLoaderArgs = new TextLoader.Options()
            {
                Columns = new[] {
                    new TextLoader.Column("Label", DataKind.String, 0),
                    new TextLoader.Column("userId", DataKind.String, 1),
                    new TextLoader.Column("movieId", DataKind.String, 2),
                },
                AllowQuoting = true,
                AllowSparse  = true,
                HasHeader    = true,
                Separators   = new[] { ',' }
            };

            _columnInference = new ColumnInferenceResults()
            {
                TextLoaderOptions = textLoaderArgs,
                ColumnInformation = new ColumnInformation()
                {
                    LabelColumnName  = "Label",
                    UserIdColumnName = "userId",
                    ItemIdColumnName = "movieId"
                }
            };

            return (_mockedPipeline, _columnInference);
        }
예제 #18
0
        /// <summary>
        /// Reads the sentiment data file, featurizes the text, extracts word embeddings from the
        /// featurizer's token output, fits the multiclass SDCA trainer on the result and passes the
        /// fitted model to <c>_consumer</c>.
        /// </summary>
        public void TrainSentiment()
        {
            // Loader schema: source column 0 is the label, source column 1 is the raw text.
            var labelColumn = new TextLoader.Column()
            {
                Name   = "Label",
                Source = new[] { new TextLoader.Range() { Min = 0, Max = 0 } },
                Type   = DataKind.Num
            };
            var sentimentColumn = new TextLoader.Column()
            {
                Name   = "SentimentText",
                Source = new[] { new TextLoader.Range() { Min = 1, Max = 1 } },
                Type   = DataKind.Text
            };
            var loaderOptions = new TextLoader.Options()
            {
                Columns      = new TextLoader.Column[] { labelColumn, sentimentColumn },
                HasHeader    = true,
                AllowQuoting = false,
                AllowSparse  = false
            };

            var data = mlContext.Data.ReadFromTextFile(_sentimentDataPath, loaderOptions);

            // Featurize the text; the word and char extractors are switched off and tokens are output.
            var inputColumns      = new List<string> { "SentimentText" };
            var featurizerOptions = new TextFeaturizingEstimator.Options
            {
                OutputTokens     = true,
                KeepPunctuations = false,
                UseStopRemover   = true,
                VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None,
                UseCharExtractor = false,
                UseWordExtractor = false,
            };
            var featurizer = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", inputColumns, featurizerOptions);
            var featurized = featurizer.Fit(data).Transform(data);

            // Map the featurizer's token column to pretrained SSWE word embeddings.
            var embeddingEstimator = mlContext.Transforms.Text.ExtractWordEmbeddings(
                "Features",
                "WordEmbeddings_TransformedText",
                WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe);
            var embedded = embeddingEstimator.Fit(featurized).Transform(featurized);

            // Train
            var sdcaTrainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent();
            var fittedModel = sdcaTrainer.Fit(embedded);

            _consumer.Consume(fittedModel);
        }
예제 #19
0
        /// <summary>
        /// Trains a binary sentiment model, checks its prediction for a sample sentence, then rebuilds the
        /// transformer chain with the last transformer's threshold changed to 0.7 and checks that the
        /// predicted label for the same sentence flips.
        /// </summary>
        public void ReconfigurablePrediction()
        {
            const string sampleText = "Good Bad job";

            var mlContext = new MLContext(seed: 1);

            var loaderOptions = new TextLoader.Options
            {
                HasHeader  = TestDatasets.Sentiment.fileHasHeader,
                Separators = new[] { TestDatasets.Sentiment.fileSeparator }
            };
            var trainPath = TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename);
            var data      = mlContext.Data.LoadFromTextFile<TweetSentiment>(trainPath, loaderOptions);

            // Create a training pipeline: featurize, cache, then single-threaded L-BFGS logistic regression.
            var featurization = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
                                .AppendCacheCheckpoint(mlContext);
            var lbfgsTrainer = mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(
                new LbfgsLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 });
            var pipeline = featurization.Append(lbfgsTrainer);

            // Train the model.
            var model  = pipeline.Fit(data);
            var engine = mlContext.Model.CreatePredictionEngine<TweetSentiment, Prediction>(model);
            var pr     = engine.Predict(new TweetSentiment() { SentimentText = sampleText });

            // Score is 0.64 so predicted label is true.
            Assert.True(pr.PredictedLabel);
            Assert.True(pr.Score > 0);

            // Copy every transformer except the last one, then append a re-thresholded last transformer.
            var transformers = new List<ITransformer>();
            foreach (var candidate in model)
            {
                if (candidate == model.LastTransformer)
                {
                    continue;
                }

                transformers.Add(candidate);
            }

            var rethresholded = mlContext.BinaryClassification.ChangeModelThreshold(model.LastTransformer, 0.7f);
            transformers.Add(rethresholded);

            var newModel  = new TransformerChain<BinaryPredictionTransformer<CalibratedModelParametersBase<LinearBinaryModelParameters, PlattCalibrator>>>(transformers.ToArray());
            var newEngine = mlContext.Model.CreatePredictionEngine<TweetSentiment, Prediction>(newModel);

            pr = newEngine.Predict(new TweetSentiment() { SentimentText = sampleText });

            // Score is still 0.64 but since threshold is no longer 0 but 0.7 predicted label now is false.
            Assert.False(pr.PredictedLabel);
            Assert.False(pr.Score > 0.7);
        }
        /// <summary>
        /// Builds an <see cref="IDataView"/> over a text file by way of a <see cref="TextLoader"/> created for
        /// <typeparamref name="TInput"/>.
        /// Because <see cref="IDataView"/>'s are lazy, nothing is actually loaded here; only the schema is validated.
        /// </summary>
        /// <typeparam name="TInput">The type handed to <c>TextLoader.CreateTextLoader</c>.</typeparam>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to load. Must be non-empty and point at an existing file.</param>
        /// <param name="options">Defines the settings of the load operation. No need to specify a Columns field,
        /// as columns will be inferred by this method.</param>
        /// <returns>The data view.</returns>
        public static IDataView LoadFromTextFile <TInput>(this DataOperationsCatalog catalog, string path,
                                                          TextLoader.Options options)
        {
            // Argument checks: the path must be non-empty and the file must exist.
            Contracts.CheckNonEmpty(path, nameof(path));
            if (!File.Exists(path))
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var environment = CatalogUtils.GetEnvironment(catalog);
            var typedLoader = TextLoader.CreateTextLoader<TInput>(environment, options);
            var fileSource  = new MultiFileSource(path);

            return typedLoader.Load(fileSource);
        }
예제 #21
0
        /// <summary>
        /// Builds an <see cref="IDataView"/> over a single text file by way of a <see cref="TextLoader"/>.
        /// Because <see cref="IDataView"/>'s are lazy, nothing is actually loaded here; only the schema is validated.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to load. Must be non-empty and point at an existing file.</param>
        /// <param name="options">Settings for the load operation; may be <see langword="null"/>.</param>
        /// <returns>The <see cref="IDataView"/> produced by the loader for <paramref name="path"/>.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
                                                 TextLoader.Options options = null)
        {
            // Argument checks: the path must be non-empty and the file must exist.
            Contracts.CheckNonEmpty(path, nameof(path));
            if (!File.Exists(path))
            {
                throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
            }

            var environment = catalog.GetEnvironment();
            var fileSource  = new MultiFileSource(path);

            // The same source serves both as the loader's data sample and as the data to load.
            var loader = new TextLoader(environment, options, dataSample: fileSource);
            return loader.Load(fileSource);
        }
        /// <summary>
        /// Writes a CSV whose third line opens a quoted field that is never closed, then verifies that
        /// loading and previewing it with quoting and multiline reading enabled throws either an
        /// <see cref="EndOfStreamException"/> or a <see cref="FormatException"/>.
        /// </summary>
        public void TestInvalidMultilineCSVQuote()
        {
            var mlContext = new MLContext(seed: 1);

            // Row "10" starts a quoted field that has no closing quote anywhere in the file.
            string malformedCsv =
                "id,description,animal\n" +
                "9,\"this is a quoted field correctly formatted\",cat\n" +
                "10,\"this is a quoted field\nwithout closing quote,cat\n" +
                "11,this field isn't quoted,dog\n" +
                "12,this will reach the end of the file without finding a closing quote so it will throw,frog\n";

            var csvPath = GetOutputPath("multiline-invalid.csv");
            File.WriteAllText(csvPath, malformedCsv);

            bool loadFailed = false;

            try
            {
                var columns = new[]
                {
                    new TextLoader.Column("id", DataKind.Int32, 0),
                    new TextLoader.Column("description", DataKind.String, 1),
                    new TextLoader.Column("animal", DataKind.String, 2),
                };
                var loaderOptions = new TextLoader.Options()
                {
                    HasHeader      = true,
                    Separator      = ",",
                    AllowQuoting   = true,
                    ReadMultilines = true,
                    Columns        = columns,
                };

                var view = mlContext.Data.LoadFromTextFile(csvPath, loaderOptions);

                // Previewing the data is what is expected to surface the parsing failure.
                view.Preview();
            }
            catch (Exception ex) when (ex is EndOfStreamException || ex is FormatException)
            {
                // Either exception type counts as the expected failure; anything else propagates.
                loadFailed = true;
            }

            Assert.True(loadFailed, "Invalid file should have thrown an exception");
        }
예제 #23
0
        /// <summary>
        /// Loads machine learning data from a <see cref="DataFrancis.IDataView"/>.
        /// </summary>
        /// <param name="datas">The data to load.</param>
        /// <returns>The data view obtained by loading a temporary tab-separated text file through the ML context.</returns>
        public static IDataViewML ToIDataView(this IDataViewF datas)
        {
            var directView   = datas.ToArray().ToIDirectView();
            var tempFilePath = ToolPerfo.CreateTemporaryFile(@"txt").Path;

            // Presumably writes the rows to the temporary file as tab-separated text, using "null" as the
            // placeholder for missing values. TODO: confirm against CreateDataObj.PipeFromFile.
            var pipe = CreateDataObj.PipeFromFile(tempFilePath, "\t", "null");
            pipe.DataAdd(directView, default);

            // Read the file back with a header row, tab separators and columns derived from the view.
            var loaderOptions = new TextLoader.Options()
            {
                HasHeader  = true,
                Separators = new[] { '\t' },
                Columns    = GetColumn(directView).ToArray()
            };

            return ToolML.Context.Data.LoadFromTextFile(tempFilePath, loaderOptions);
        }
        /// <summary>
        /// Returns the mocked one-versus-all pipeline and its column inference results.
        /// Both are built on the first call (while <c>_mockedOvaPipeline</c> is still <see langword="null"/>)
        /// and the cached instances are returned on every later call.
        /// </summary>
        /// <returns>The cached <see cref="Pipeline"/> paired with the cached <see cref="ColumnInferenceResults"/>.</returns>
        private (Pipeline, ColumnInferenceResults) GetMockedOvaPipelineAndInference()
        {
            // Already initialized: hand back the cached pair.
            if (_mockedOvaPipeline != null)
            {
                return (_mockedOvaPipeline, _columnInference);
            }

            var mlContext = new MLContext();

            // A FastForest OVA trainer with a single hyperparameter (NumLeaves = 2).
            var parameterValues = new List<Microsoft.ML.AutoML.IParameterValue>()
            {
                new LongParameterValue("NumLeaves", 2)
            };
            var hyperparameters = new Microsoft.ML.AutoML.ParameterSet(parameterValues);
            var ovaTrainer      = new SuggestedTrainer(mlContext, new FastForestOvaExtension(), new ColumnInformation(), hyperparameters);

            // A single column-concatenation transform ("In" -> "Out") in front of the trainer.
            var concatTransform = ColumnConcatenatingExtension.CreateSuggestedTransform(mlContext, new[] { "In" }, "Out");
            var transforms      = new List<SuggestedTransform>() { concatTransform };
            var suggested       = new SuggestedPipeline(transforms, new List<SuggestedTransform>(), ovaTrainer, mlContext, true);

            _mockedOvaPipeline = suggested.ToPipeline();

            // NOTE(review): col2..col5 all read source index 0 - confirm this is intended for the mock.
            var loaderColumns = new[] {
                new TextLoader.Column("Label", DataKind.Boolean, 0),
                new TextLoader.Column("col1", DataKind.Single, 1),
                new TextLoader.Column("col2", DataKind.Single, 0),
                new TextLoader.Column("col3", DataKind.String, 0),
                new TextLoader.Column("col4", DataKind.Int32, 0),
                new TextLoader.Column("col5", DataKind.UInt32, 0),
            };
            var loaderOptions = new TextLoader.Options()
            {
                Columns      = loaderColumns,
                AllowQuoting = true,
                AllowSparse  = true,
                HasHeader    = true,
                Separators   = new[] { ',' }
            };

            _columnInference = new ColumnInferenceResults()
            {
                TextLoaderOptions = loaderOptions,
                ColumnInformation = new ColumnInformation()
                {
                    LabelColumnName = "Label"
                }
            };

            return (_mockedOvaPipeline, _columnInference);
        }
        /// <summary>
        /// Returns the mocked LightGBM ranking pipeline and its column inference results.
        /// Both are built on the first call (while <c>_mockedPipeline</c> is still <see langword="null"/>) and
        /// the cached instances are returned on every later call.
        /// </summary>
        /// <returns>The cached <see cref="Pipeline"/> paired with the cached <see cref="ColumnInferenceResults"/>.</returns>
        private (Pipeline, ColumnInferenceResults) GetMockedRankingPipelineAndInference()
        {
            // Already initialized: hand back the cached pair.
            if (_mockedPipeline != null)
            {
                return (_mockedPipeline, _columnInference);
            }

            // Hyperparameters handed to the ranking trainer node.
            var hyperParam = new Dictionary<string, object>()
            {
                { "rowGroupColumnName", "GroupId" },
                { "LabelColumnName", "Label" },
            };

            // Hash the group id in place, then train the LightGBM ranker.
            var hashPipelineNode     = new PipelineNode(nameof(EstimatorName.Hashing), PipelineNodeType.Transform, "GroupId", "GroupId");
            var lightGbmPipelineNode = new PipelineNode(nameof(TrainerName.LightGbmRanking), PipelineNodeType.Trainer, "Features", "Score", hyperParam);

            _mockedPipeline = new Pipeline(new PipelineNode[]
            {
                hashPipelineNode,
                lightGbmPipelineNode
            });

            // NOTE(review): col1..col4 all read source index 0 - confirm this is intended for the mock.
            var textLoaderArgs = new TextLoader.Options()
            {
                Columns = new[] {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("GroupId", DataKind.Single, 1),
                    new TextLoader.Column("col1", DataKind.Single, 0),
                    new TextLoader.Column("col2", DataKind.String, 0),
                    new TextLoader.Column("col3", DataKind.Int32, 0),
                    new TextLoader.Column("col4", DataKind.UInt32, 0),
                },
                AllowQuoting = true,
                AllowSparse  = true,
                HasHeader    = true,
                Separators   = new[] { ',' }
            };

            _columnInference = new ColumnInferenceResults()
            {
                TextLoaderOptions = textLoaderArgs,
                ColumnInformation = new ColumnInformation()
                {
                    LabelColumnName = "Label", GroupIdColumnName = "GroupId"
                }
            };

            return (_mockedPipeline, _columnInference);
        }
        /// <summary>
        /// Returns the mocked matrix-factorization recommendation pipeline and its column inference results.
        /// Both are built on the first call (while <c>mockedPipeline</c> is still <see langword="null"/>) and
        /// the cached instances are returned on every later call.
        /// </summary>
        /// <returns>The cached <see cref="Pipeline"/> paired with the cached <see cref="ColumnInferenceResults"/>.</returns>
        private (Pipeline, ColumnInferenceResults) GetMockedRecommendationPipelineAndInference()
        {
            // Already initialized: hand back the cached pair.
            if (mockedPipeline != null)
            {
                return (mockedPipeline, columnInference);
            }

            var mlContext = new MLContext();

            // Matrix factorization trainer wired to the label / user / item columns, no hyperparameters.
            var trainerColumns = new ColumnInformation()
            {
                LabelColumnName  = "Label",
                UserIdColumnName = "userId",
                ItemIdColumnName = "movieId",
            };
            var recommendationTrainer = new SuggestedTrainer(mlContext, new MatrixFactorizationExtension(), trainerColumns, hyperParamSet: null);

            // A single column-concatenation transform ("In" -> "Out") in front of the trainer.
            var concatTransform = ColumnConcatenatingExtension.CreateSuggestedTransform(mlContext, new[] { "In" }, "Out");
            var transforms      = new List<SuggestedTransform>() { concatTransform };
            var suggested       = new SuggestedPipeline(transforms, new List<SuggestedTransform>(), recommendationTrainer, mlContext, false);

            mockedPipeline = suggested.ToPipeline();

            var loaderColumns = new[] {
                new TextLoader.Column("Label", DataKind.String, 0),
                new TextLoader.Column("userId", DataKind.String, 1),
                new TextLoader.Column("movieId", DataKind.String, 2),
            };
            var loaderOptions = new TextLoader.Options()
            {
                Columns      = loaderColumns,
                AllowQuoting = true,
                AllowSparse  = true,
                HasHeader    = true,
                Separators   = new[] { ',' }
            };

            columnInference = new ColumnInferenceResults()
            {
                TextLoaderOptions = loaderOptions,
                ColumnInformation = new ColumnInformation()
                {
                    LabelColumnName  = "Label",
                    UserIdColumnName = "userId",
                    ItemIdColumnName = "movieId"
                }
            };

            return (mockedPipeline, columnInference);
        }
예제 #27
0
        /// <summary>
        /// Tries to load up to 1000 rows of <paramref name="source"/> with the given loader options and decides
        /// whether those options split the lines into a consistent, multi-column layout.
        /// </summary>
        /// <param name="context">The ML context used to create the loader and take the row sample.</param>
        /// <param name="options">The candidate loader options; must define a column named "C".</param>
        /// <param name="source">The data to sample.</param>
        /// <param name="result">
        /// On success, the split description built from <paramref name="options"/> and the most common column
        /// count; otherwise <see langword="null"/>.
        /// </param>
        /// <returns>
        /// <see langword="true"/> when the most common column count covers at least
        /// <c>UniformColumnCountThreshold</c> of the sampled rows and is greater than 1;
        /// <see langword="false"/> otherwise, including when the sample is empty or loading throws.
        /// </returns>
        private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
                                         out ColumnSplitResult result)
        {
            result = null;
            // try to instantiate data view with swept arguments
            try
            {
                var textLoader   = context.Data.CreateTextLoader(options, source);
                var idv          = context.Data.TakeRows(textLoader.Load(source), 1000);
                var columnCounts = new List<int>();
                var column       = idv.Schema["C"];

                // Record, for every sampled row, how many values the "C" vector column holds.
                using (var cursor = idv.GetRowCursor(new[] { column }))
                {
                    var getter = cursor.GetGetter<VBuffer<ReadOnlyMemory<char>>>(column);

                    VBuffer<ReadOnlyMemory<char>> line = default;
                    while (cursor.MoveNext())
                    {
                        getter(ref line);
                        columnCounts.Add(line.Length);
                    }
                }

                // An empty sample has no dominant column count. Without this guard First() would throw
                // and the case would only be handled by the blanket catch below.
                if (columnCounts.Count == 0)
                {
                    return false;
                }

                var mostCommon            = columnCounts.GroupBy(x => x).OrderByDescending(x => x.Count()).First();
                var mostCommonOccurrences = mostCommon.Count();

                // reject splits where too few rows agree on the column count
                if (mostCommonOccurrences < UniformColumnCountThreshold * columnCounts.Count)
                {
                    return false;
                }

                // disallow single-column case
                if (mostCommon.Key <= 1)
                {
                    return false;
                }

                result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
                return true;
            }
            // fail gracefully if unable to instantiate data view with swept arguments
            catch (Exception)
            {
                return false;
            }
        }
예제 #28
0
        /// <summary>
        /// Loads the customer data set from a comma-separated text file that has a header row.
        /// </summary>
        /// <param name="trainingDataPath">Path of the text file to load.</param>
        /// <returns>The data view returned by <c>LoadFromTextFile</c> for the five declared columns.</returns>
        public IDataView Load(string trainingDataPath)
        {
            // Column layout of the file, in source order.
            var columns = new[]
            {
                new TextLoader.Column("CustomerId", DataKind.Int32, 0),
                new TextLoader.Column("Gender", DataKind.String, 1),
                new TextLoader.Column("Age", DataKind.Int32, 2),
                new TextLoader.Column("AnnualIncome", DataKind.Single, 3),
                new TextLoader.Column("SpendingScore", DataKind.Single, 4),
            };

            var loaderOptions = new TextLoader.Options()
            {
                Separators = new[] { ',' },
                HasHeader  = true,
                Columns    = columns
            };

            return _mlContext.Data.LoadFromTextFile(trainingDataPath, loaderOptions);
        }
예제 #29
0
        /// <summary>
        /// Reads every CSV row of <paramref name="stream"/> into a <typeparamref name="T"/> record by going
        /// through a local file and the ML.NET text loader.
        /// </summary>
        /// <typeparam name="T">The record type; each instance fills itself through <c>Read</c>.</typeparam>
        /// <param name="stream">
        /// The CSV data. It is copied to a local file named after its length the first time that length is
        /// seen; the stream is left open, disposal stays with the caller.
        /// </param>
        /// <returns>One record per row produced by the loader, in cursor order.</returns>
        public List <T> GetRecords <T>(MemoryStream stream) where T : ICsvReadable, new()
        {
            // this library only allows loading from a file.
            // so write to a local file, use the length of the memory stream
            // to write to a different file based on the input data
            // this will be executed during the first "warmup" run
            var file = "data" + stream.Length + ".csv";

            if (!File.Exists(file))
            {
                using var data = File.Create(file);
                stream.CopyTo(data);
            }

            var activate   = ActivatorFactory.Create<T>(_activationMethod);
            var allRecords = new List<T>();
            var mlc        = new MLContext();

            // Every one of the 25 columns is read as text; the column name is its index.
            var schema = new TextLoader.Column[25];
            for (int i = 0; i < schema.Length; i++)
            {
                schema[i] = new TextLoader.Column("" + i, DataKind.String, i);
            }

            var opts = new TextLoader.Options()
            {
                HasHeader = false, Separators = new[] { ',' }, Columns = schema
            };
            var l = mlc.Data.LoadFromTextFile(file, opts);

            // The cursor is disposable; release it once all rows have been read.
            using var rc = l.GetRowCursor(l.Schema);
            var cols     = l.Schema.ToArray();
            var getters  = cols.Select(c => rc.GetGetter<ReadOnlyMemory<char>>(c)).ToArray();
            while (rc.MoveNext())
            {
                var record = activate();
                record.Read(i => { ReadOnlyMemory<char> s = default; getters[i](ref s); return s.ToString(); });
                allRecords.Add(record);
            }

            return allRecords;
        }
예제 #30
0
        /// <summary>
        /// Loads the sentiment data file, featurizes the text, applies sentiment-specific word embeddings to
        /// the featurizer's token output, maps the label to a key, fits the SDCA maximum entropy trainer on
        /// the result and passes the fitted model to <c>_consumer</c>.
        /// </summary>
        public void TrainSentiment()
        {
            // Loader schema: source column 0 is the label, source column 1 is the raw text.
            var labelRange = new TextLoader.Range() { Min = 0, Max = 0 };
            var textRange  = new TextLoader.Range() { Min = 1, Max = 1 };
            var loaderOptions = new TextLoader.Options()
            {
                Columns = new TextLoader.Column[]
                {
                    new TextLoader.Column("Label", DataKind.Single, new[] { labelRange }),
                    new TextLoader.Column("SentimentText", DataKind.String, new[] { textRange })
                },
                HasHeader    = true,
                AllowQuoting = false,
                AllowSparse  = false
            };

            var data = _mlContext.Data.LoadFromTextFile(_sentimentDataPath, loaderOptions);

            // Featurize the text; the char and word feature extractors are switched off and the tokens
            // are written to a separate output column.
            var featurizerOptions = new TextFeaturizingEstimator.Options
            {
                OutputTokensColumnName  = "WordEmbeddings_TransformedText",
                KeepPunctuations        = false,
                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                Norm                    = TextFeaturizingEstimator.NormFunction.None,
                CharFeatureExtractor    = null,
                WordFeatureExtractor    = null,
            };
            var featurizer = _mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", featurizerOptions, "SentimentText");
            var featurized = featurizer.Fit(data).Transform(data);

            // Map the token column to pretrained embeddings and convert the label to a key.
            var embeddingEstimator = _mlContext.Transforms.Text.ApplyWordEmbedding(
                "Features",
                "WordEmbeddings_TransformedText",
                WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding);
            var preparation = embeddingEstimator.Append(_mlContext.Transforms.Conversion.MapValueToKey("Label"));
            var prepared    = preparation.Fit(featurized).Transform(featurized);

            // Train
            var sdcaTrainer = _mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy();
            var fittedModel = sdcaTrainer.Fit(prepared);

            _consumer.Consume(fittedModel);
        }