/// <summary>
        /// Configures a reader for text files.
        /// </summary>
        /// <typeparam name="TShape">The type shape parameter, which must be a valid-schema shape. As a practical
        /// matter this is generally not explicitly defined by the user, but is instead inferred from the return
        /// type of the <paramref name="func"/>, which takes an input <see cref="Context"/> and uses it to compose
        /// a shape-type instance describing what the columns are and how to load them from the file.</typeparam>
        /// <param name="env">The environment.</param>
        /// <param name="func">The delegate that describes what fields to read from the text file, as well as
        /// describing their input type. The delegate is fed a <see cref="Context"/>, and the user composes a
        /// shape type with <see cref="PipelineColumn"/> instances out of that <see cref="Context"/>. The resulting
        /// data will have columns with the names corresponding to their names in the shape type.</param>
        /// <param name="files">Input files. If <c>null</c> then no files are read, in which case options or
        /// configurations that require input data for initialization (for example, <paramref name="hasHeader"/> or
        /// <see cref="Context.LoadFloat(int, int?)"/> with a <c>null</c> second argument) cannot be used.</param>
        /// <param name="hasHeader">Data file has header with feature names.</param>
        /// <param name="separator">Text field separator.</param>
        /// <param name="allowQuoting">Whether the input may include quoted values, which can contain separator
        /// characters, colons, and distinguish empty values from missing values. When true, consecutive separators
        /// denote a missing value and an empty value is denoted by <c>""</c>. When false, consecutive separators
        /// denote an empty value.</param>
        /// <param name="allowSparse">Whether the input may include sparse representations.</param>
        /// <param name="trimWhitspace">Remove trailing whitespace from lines. (Parameter name keeps its historical
        /// misspelling for backward compatibility with named-argument callers.)</param>
        /// <returns>A configured statically-typed reader for text files.</returns>
        public static DataReader <IMultiStreamSource, TShape> CreateReader <[IsShape] TShape>(
            IHostEnvironment env, Func <Context, TShape> func, IMultiStreamSource files = null,
            bool hasHeader     = false, char separator = '\t', bool allowQuoting = true, bool allowSparse = true,
            bool trimWhitspace = false)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(func, nameof(func));
            env.CheckValueOrNull(files);

            // Populate all arguments except the columns, which are derived from the shape.
            var loaderArgs = new TextLoader.Arguments
            {
                AllowQuoting   = allowQuoting,
                AllowSparse    = allowSparse,
                HasHeader      = hasHeader,
                Separators     = new[] { separator },
                TrimWhitespace = trimWhitspace
            };

            var reconciler = new TextReconciler(loaderArgs, files);
            var context    = new Context(reconciler);

            using (var ch = env.Start("Initializing " + nameof(TextLoader)))
            {
                var readerEstimator = StaticPipeUtils.ReaderEstimatorAnalyzerHelper(env, ch, context, reconciler, func);
                Contracts.AssertValue(readerEstimator);
                return readerEstimator.Fit(files);
            }
        }
Example #2
0
        /// <summary>
        /// Attempt to detect text loader arguments.
        /// The algorithm selects the first 'acceptable' set: the one that recognizes the same number of columns in at
        /// least <see cref="UniformColumnCountThreshold"/> of the sample's lines,
        /// and this number of columns is more than 1.
        /// We sweep on separator, allow sparse and allow quote parameter.
        /// </summary>
        public static ColumnSplitResult TrySplitColumns(IMultiStreamSource source, string[] separatorCandidates)
        {
            // Sweep the candidates in the same priority order as before:
            // sparse=true first, then quoting=true, then each separator in turn.
            foreach (var allowSparse in new[] { true, false })
            {
                foreach (var allowQuote in new[] { true, false })
                {
                    foreach (var separator in separatorCandidates)
                    {
                        var args = new TextLoader.Arguments
                        {
                            // Read everything as one wide text vector so we can count fields per line.
                            Column       = new[] { TextLoader.Column.Parse("C:TX:0-**") },
                            Separator    = separator,
                            AllowQuoting = allowQuote,
                            AllowSparse  = allowSparse
                        };

                        // First acceptable parse wins.
                        if (TryParseFile(args, source, out var result))
                            return result;
                    }
                }
            }

            // No combination produced an acceptable split.
            return new ColumnSplitResult(false, null, true, true, 0);
        }
        /// <summary>
        /// Infers the columns of a text file: first how lines split into columns, then the
        /// type of each column, and finally the purpose (label/feature/...) of each column.
        /// </summary>
        private static ColumnInferenceResult InferColumns(MLContext context,
                                                          TextFileSample sample, Func <TextLoader, IDataView> createDataView, string label,
                                                          bool hasHeader, string separator, bool?isQuoted, bool?isSparse)
        {
            // Stage 1: how does the file split into columns, and what type is each one?
            var splitInference = InferSplit(sample, separator, isQuoted, isSparse);
            var typeInference  = InferColumnTypes(context, sample, splitInference);

            // Build a loader typed according to the inference results.
            var loaderArgs = new TextLoader.Arguments
            {
                Column       = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns),
                Separator    = splitInference.Separator,
                AllowSparse  = splitInference.AllowSparse,
                AllowQuoting = splitInference.AllowQuote,
                HasHeader    = hasHeader
            };
            var reader   = context.Data.CreateTextReader(loaderArgs);
            var dataView = createDataView(reader);

            // Stage 2: assign a purpose to every column.
            var purposes = PurposeInference.InferPurposes(context, dataView, label);

            // Stage 3: infer column grouping and generate column names.
            var grouping = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
                                                                         typeInference.Columns, purposes);

            // Build the result object and return.
            var inferredColumns = grouping.Select(c => (c.GenerateTextLoaderColumn(), c.Purpose)).ToArray();
            return new ColumnInferenceResult(inferredColumns, splitInference.AllowQuote, splitInference.AllowSparse,
                                             splitInference.Separator, hasHeader);
        }
        /// <summary>
        /// Trains a binary sentiment classifier over the wikipedia-detox sample file and
        /// returns a scorer transform for it.
        /// </summary>
        private IDataScorerTransform _TrainSentiment()
        {
            bool normalize = true;

            // Loader schema: boolean label in column 0, raw text in column 1, tab-separated with header.
            var loaderArgs = new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            };

            // Featurization: lower-case, drop diacritics/punctuation, predefined stop-word removal,
            // character trigrams plus word bigrams, L2-normalized when requested.
            var featurizerArgs = new TextFeaturizingEstimator.Arguments()
            {
                Column = new TextFeaturizingEstimator.Column
                {
                    Name   = "Features",
                    Source = new[] { "SentimentText" }
                },
                KeepDiacritics               = false,
                KeepPunctuations             = false,
                TextCase                     = TextNormalizingEstimator.CaseNormalizationMode.Lower,
                OutputTokens                 = true,
                UsePredefinedStopWordRemover = true,
                VectorNormalizer             = normalize ? TextFeaturizingEstimator.TextNormKind.L2 : TextFeaturizingEstimator.TextNormKind.None,
                CharFeatureExtractor         = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false },
                WordFeatureExtractor         = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 2, AllLengths = true },
            };

            var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");

            using (var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1))
            {
                // Pipeline: load the file, then featurize the raw text.
                var data       = new TextLoader(env, loaderArgs).Read(new MultiFileSource(trainFilename));
                var featurized = TextFeaturizingEstimator.Create(env, featurizerArgs, data);

                // Train a single-threaded SDCA binary classifier over the cached featurized data.
                var trainer   = new SdcaBinaryTrainer(env, new SdcaBinaryTrainer.Arguments { NumThreads = 1 });
                var cached    = new CacheDataView(env, featurized, prefetch: null);
                var predictor = trainer.Fit(cached);

                // Score against the (uncached) featurized data using the training schema roles.
                var scoreRoles = new RoleMappedData(featurized, label: "Label", feature: "Features");
                var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features");
                return ScoreUtils.GetScorer(predictor.Model, scoreRoles, env, trainRoles.Schema);
            }
        }
            /// <summary>
            /// Constructs the reconciler over the loader arguments and the optional input files.
            /// </summary>
            /// <param name="args">The loader arguments; must not be null. Columns are expected to be
            /// filled in later, as the shape is analyzed (the caller passes args without columns).</param>
            /// <param name="files">The input files; may be null when no data is supplied up front.</param>
            public TextReconciler(TextLoader.Arguments args, IMultiStreamSource files)
            {
                Contracts.AssertValue(args);
                Contracts.AssertValueOrNull(files);

                _args  = args;
                _files = files;
            }
        /// <summary>
        /// End-to-end sentiment-analysis sample: load data, train, evaluate, save,
        /// and score a single example.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("SentimentAnalysis Start!");

            // 1. Create the ML.NET context/environment.
            var mlContext = new MLContext();

            // 2. Describe the file schema: boolean label in column 0, text in column 1.
            string baseDataPath = @"Data/base.tsv";
            var readerArgs = new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column    = new TextLoader.Column[]
                {
                    new TextLoader.Column("Label", DataKind.Bool, 0),
                    new TextLoader.Column("Text", DataKind.Text, 1)
                }
            };

            // Load the training data.
            IDataView trainingDataView = mlContext.Data.TextReader(readerArgs).Read(new MultiFileSource(baseDataPath));

            // 3. Create a flexible pipeline (a chain of estimators) for creating/training the model.
            var pipeline = mlContext.Transforms.Text.FeaturizeText("Text", "Features")
                           .Append(mlContext.BinaryClassification.Trainers.FastTree(numLeaves: 50, numTrees: 50, minDatapointsInLeafs: 20));

            // Train the model.
            var model = pipeline.Fit(trainingDataView);

            // Evaluate the model on the held-out test file.
            var       testDataPath = @"Data/test.tsv";
            IDataView testDataView = mlContext.Data.TextReader(readerArgs).Read(new MultiFileSource(testDataPath));
            var       predictions  = model.Transform(testDataView);
            var       metrics      = mlContext.BinaryClassification.Evaluate(predictions, "Label");

            Console.WriteLine();
            Console.WriteLine("Model quality metrics evaluation");
            Console.WriteLine("--------------------------------");
            Console.WriteLine($"Accuracy: {metrics.Accuracy:P2}");
            Console.WriteLine($"Auc: {metrics.Auc:P2}");
            Console.WriteLine($"F1Score: {metrics.F1Score:P2}");
            Console.WriteLine("=============== End of model evaluation ===============");
            Console.ReadLine();

            // Save the trained model to disk.
            using (var stream = new FileStream(@"Data/model.zip", FileMode.Create, FileAccess.Write, FileShare.Write))
            {
                mlContext.Model.Save(model, stream);
            }

            // Consume the model: score a single hand-written example.
            var predictionFunct = model.MakePredictionFunction <SentimentIssue, SentimentPrediction>(mlContext);
            var sampleStatement = new SentimentIssue
            {
                Text = "This is a very rude movie"
            };
            var resultprediction = predictionFunct.Predict(sampleStatement);

            Console.WriteLine($"Text: {sampleStatement.Text} | Prediction: {(resultprediction.Prediction ? "Negative" : "Positive")} sentiment");
            Console.ReadLine();
        }
        /// <summary>
        /// Trains a sentiment model: tokenizes the text, maps tokens to pre-trained SSWE word
        /// embeddings, trains an SDCA multiclass classifier, and hands the result to the consumer.
        /// </summary>
        public void TrainSentiment()
        {
            // Loader schema: numeric label in column 0, raw sentiment text in column 1.
            var loaderArgs = new TextLoader.Arguments()
            {
                Columns = new TextLoader.Column[]
                {
                    new TextLoader.Column()
                    {
                        Name   = "Label",
                        Source = new[] { new TextLoader.Range() { Min = 0, Max = 0 } },
                        Type   = DataKind.Num
                    },
                    new TextLoader.Column()
                    {
                        Name   = "SentimentText",
                        Source = new[] { new TextLoader.Range() { Min = 1, Max = 1 } },
                        Type   = DataKind.Text
                    }
                },
                HasHeader    = true,
                AllowQuoting = false,
                AllowSparse  = false
            };

            var data = mlContext.Data.ReadFromTextFile(_sentimentDataPath, loaderArgs);

            // Tokenize only (no char/word n-gram features, no normalization):
            // the embedding transform below consumes the output tokens.
            var tokenized = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings",
                new List <string> { "SentimentText" },
                new TextFeaturizingEstimator.Options
                {
                    OutputTokens     = true,
                    KeepPunctuations = false,
                    UseStopRemover   = true,
                    VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None,
                    UseCharExtractor = false,
                    UseWordExtractor = false,
                }).Fit(data).Transform(data);

            // Map the tokens to pre-trained sentiment-specific word embeddings (SSWE).
            var embedded = mlContext.Transforms.Text.ExtractWordEmbeddings("Features", "WordEmbeddings_TransformedText",
                WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe).Fit(tokenized).Transform(tokenized);

            // Train and hand off the fitted model.
            var trainer   = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent();
            var predicted = trainer.Fit(embedded);

            _consumer.Consume(predicted);
        }
Example #8
0
        /// <summary>
        /// Builds a data view over the King County house sales file: every column is numeric (R4)
        /// except Id and Date (text), comma-separated with a header row; column 2 is the label.
        /// </summary>
        public static IDataView GetKcHouseDataView(string dataPath)
        {
            // Command-line style schema string, parsed into TextLoader arguments below.
            const string dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,";

            var  loaderArgs = new TextLoader.Arguments();
            bool parsed     = CmdParser.ParseArguments(s_environment, dataSchema, loaderArgs);
            s_environment.Assert(parsed);

            return new TextLoader(s_environment, loaderArgs, new MultiFileSource(dataPath));
        }
        /// <summary>
        /// Trains a sentiment model over <c>_sentimentDataPath</c>: tokenizes the text, maps tokens
        /// to pre-trained SSWE word embeddings, trains an SDCA multiclass classifier (20 iterations),
        /// and hands the result to the consumer.
        /// </summary>
        public void TrainSentiment()
        {
            // Loader schema: numeric label in column 0, raw sentiment text in column 1.
            var loaderArgs = new TextLoader.Arguments()
            {
                Column = new TextLoader.Column[]
                {
                    new TextLoader.Column()
                    {
                        Name   = "Label",
                        Source = new[] { new TextLoader.Range() { Min = 0, Max = 0 } },
                        Type   = DataKind.Num
                    },
                    new TextLoader.Column()
                    {
                        Name   = "SentimentText",
                        Source = new[] { new TextLoader.Range() { Min = 1, Max = 1 } },
                        Type   = DataKind.Text
                    }
                },
                HasHeader    = true,
                AllowQuoting = false,
                AllowSparse  = false
            };

            var data = _env.Data.ReadFromTextFile(_sentimentDataPath, loaderArgs);

            // Tokenize only (no char/word n-gram features, no normalization):
            // the embedding transform below consumes the output tokens.
            var tokenized = new TextFeaturizingEstimator(_env, "SentimentText", "WordEmbeddings", settings =>
            {
                settings.OutputTokens     = true;
                settings.KeepPunctuations = false;
                settings.UseStopRemover   = true;
                settings.VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None;
                settings.UseCharExtractor = false;
                settings.UseWordExtractor = false;
            }).Fit(data).Transform(data);

            // Map the tokens to pre-trained sentiment-specific word embeddings (SSWE).
            var embedded = new WordEmbeddingsExtractingEstimator(_env, "WordEmbeddings_TransformedText", "Features",
                WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe).Fit(tokenized).Transform(tokenized);

            // Train and hand off the fitted model.
            var trainer   = new SdcaMultiClassTrainer(_env, "Label", "Features", maxIterations: 20);
            var predicted = trainer.Fit(embedded);

            _consumer.Consume(predicted);
        }
Example #10
0
        /// <summary>
        /// Loads training data from <paramref name="dataPath"/>, using the mapper that matches the
        /// runtime type of <paramref name="inputData"/> to build the loader's column schema.
        /// </summary>
        public static IDataView LoadTrainingData <TSrc>(this MLContext mlContext, string dataPath, TSrc inputData)
        {
            // If working in Visual Studio, make sure the 'Copy to Output Directory'
            // property of iris-data.txt is set to 'Copy always'.
            // Pick the mapper for the input type; FlowerTypeMapper is the default.
            IMapper mapper = inputData is AdequacyLevelData
                ? (IMapper) new AdequacyLevelMapper()
                : new FlowerTypeMapper();

            TextLoader.Arguments args = mapper.Map(inputData);

            TextLoader textLoader = mlContext.Data.TextReader(args);
            return textLoader.Read(new MultiFileSource(dataPath));
        }
        /// <summary>
        /// Builds TextLoader arguments for the iris data layout: four numeric measurements
        /// followed by a text label, comma-separated, with a header row.
        /// </summary>
        public TextLoader.Arguments Map <TSrc>(TSrc inputData)
        {
            // The cast exists only so the nameof(...) expressions below track the
            // FlowerTypeData property names; the value itself is never read.
            FlowerTypeData data = inputData as FlowerTypeData;

            return new TextLoader.Arguments()
            {
                Separator = ",",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column(nameof(data.SepalLength), DataKind.R4, 0),
                    new TextLoader.Column(nameof(data.SepalWidth), DataKind.R4, 1),
                    new TextLoader.Column(nameof(data.PetalLength), DataKind.R4, 2),
                    new TextLoader.Column(nameof(data.PetalWidth), DataKind.R4, 3),
                    new TextLoader.Column(nameof(data.Label), DataKind.Text, 4)
                }
            };
        }
Example #12
0
        /// <summary>
        /// Trains a multiclass logistic regression model on the iris dataset (<c>_dataset</c>),
        /// stores a prediction function in <c>_fct</c>, and saves the model to <paramref name="dest"/>.
        /// </summary>
        /// <param name="dest">Path of the file the trained model is written to.</param>
        public void Train(string dest)
        {
            using (var env = new ConsoleEnvironment(verbose: false))
            {
                // Iris schema: numeric label in column 0, the four measurements after it.
                var args = new TextLoader.Arguments()
                {
                    Separator = ",",
                    HasHeader = true,
                    Column    = new TextLoader.Column[] {
                        new TextLoader.Column("Label", DataKind.R4, 0),
                        new TextLoader.Column("Sepal_length", DataKind.R4, 1),
                        new TextLoader.Column("Sepal_width", DataKind.R4, 2),
                        new TextLoader.Column("Petal_length", DataKind.R4, 3),
                        new TextLoader.Column("Petal_width", DataKind.R4, 4),
                    }
                };

                // Pipeline: concatenate the four measurements into "Features", then train.
                var reader = new TextLoader(env, args);
                var concat = new ColumnConcatenatingEstimator(env,
                                                              "Features", "Sepal_length",
                                                              "Sepal_width", "Petal_length", "Petal_width");
                var km       = new MulticlassLogisticRegression(env, "Label", "Features");
                var pipeline = concat.Append(km);

                IDataView trainingDataView = reader.Read(new MultiFileSource(_dataset));
                var       model            = pipeline.Fit(trainingDataView);

                // Note: an unused IrisObservation local was removed here; it was initialized
                // but never read and had no side effects.

                _fct = model.MakePredictionFunction <IrisObservation, IrisPrediction>(env);
                using (var stdest = File.OpenWrite(dest))
                    model.SaveTo(env, stdest);
            }
        }
        /// <summary>
        /// Infers, in order: the type of each column, the purpose of each column, and finally
        /// the column grouping/names. Returns null (after logging an error) when type inference fails.
        /// </summary>
        public static ColumnGroupingInference.GroupingColumn[] InferColumnPurposes(IChannel ch, IHost env, TextFileSample sample, TextFileContents.ColumnSplitResult splitResult, out bool hasHeader)
        {
            // Stage 1: column types, driven by the split result.
            ch.Info("Detecting column types");
            var typeInferenceResult = ColumnTypeInference.InferTextFileColumnTypes(env, sample,
                new ColumnTypeInference.Arguments
                {
                    ColumnCount = splitResult.ColumnCount,
                    Separator   = splitResult.Separator,
                    AllowSparse = splitResult.AllowSparse,
                    AllowQuote  = splitResult.AllowQuote,
                });

            hasHeader = typeInferenceResult.HasHeader;
            if (!typeInferenceResult.IsSuccess)
            {
                ch.Error("Couldn't detect column types.");
                return null;
            }

            // Stage 2: column purposes, using a loader typed per the inferred columns.
            ch.Info("Detecting column purposes");
            var typedLoaderArgs = new TextLoader.Arguments
            {
                Column       = ColumnTypeInference.GenerateLoaderColumns(typeInferenceResult.Columns),
                Separator    = splitResult.Separator,
                AllowSparse  = splitResult.AllowSparse,
                AllowQuoting = splitResult.AllowQuote,
                HasHeader    = typeInferenceResult.HasHeader
            };
            var typedLoader = new TextLoader(env, typedLoaderArgs, sample);
            var purposes    = PurposeInference.InferPurposes(env, typedLoader,
                Utils.GetIdentityPermutation(typedLoaderArgs.Column.Length), new PurposeInference.Arguments());

            // Stage 3: grouping and names.
            ch.Info("Detecting column grouping and generating column names");
            return ColumnGroupingInference.InferGroupingAndNames(env, typeInferenceResult.HasHeader,
                typeInferenceResult.Columns, purposes.Columns).Columns;
        }
 /// <summary>
 /// Builds TextLoader arguments for the adequacy-level data layout: six numeric columns
 /// followed by a text label, comma-separated, with a header row.
 /// </summary>
 public TextLoader.Arguments Map <TSrc>(TSrc inputData)
 {
     // The cast exists only so the nameof(...) expressions below track the
     // AdequacyLevelData property names; the value itself is never read.
     // (A redundant nested block scope around the initializer was removed.)
     AdequacyLevelData data = inputData as AdequacyLevelData;

     var args = new TextLoader.Arguments()
     {
         Separator = ",",
         HasHeader = true,
         Column    = new[]
         {
             new TextLoader.Column(nameof(data.YearsInAgile), DataKind.R4, 0),
             new TextLoader.Column(nameof(data.YearsInNET), DataKind.R4, 1),
             new TextLoader.Column(nameof(data.YearsInSQL), DataKind.R4, 2),
             new TextLoader.Column(nameof(data.AdequacyLevelInAgile), DataKind.R4, 3),
             new TextLoader.Column(nameof(data.AdequacyLevelInNET), DataKind.R4, 4),
             new TextLoader.Column(nameof(data.AdequacyLevelInSQL), DataKind.R4, 5),
             new TextLoader.Column(nameof(data.Label), DataKind.Text, 6)
         }
     };
     return args;
 }
        /// <summary>
        /// Trains a binary sentiment classifier over the wikipedia-detox sample file
        /// (text featurization + single-threaded SDCA) and returns the fitted transformer.
        /// </summary>
        private ITransformer _TrainSentiment2Transformer()
        {
            // Loader schema: boolean label in column 0, raw text in column 1, tab-separated with header.
            var loaderArgs = new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            };

            var ml            = new MLContext(seed: 1, conc: 1);
            var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");
            var data          = ml.Data.ReadFromTextFile(trainFilename, loaderArgs);

            var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features")
                           .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: s => s.NumThreads = 1));

            return pipeline.Fit(data);
        }
Example #16
0
        /// <summary>
        /// Tries to parse the source with the candidate loader arguments. Succeeds when the most
        /// common per-line column count covers at least <see cref="UniformColumnCountThreshold"/>
        /// of the sampled lines and is more than one column.
        /// </summary>
        private static bool TryParseFile(TextLoader.Arguments args, IMultiStreamSource source, out ColumnSplitResult result)
        {
            result = null;

            // Parse up to the first 1000 rows as a single wide text vector named "C",
            // recording how many fields each line splits into.
            var textLoader  = new TextLoader(new MLContext(), args);
            var idv         = textLoader.Read(source).Take(1000);
            var columnIndex = idv.Schema["C"].Index;

            var columnCounts = new List <int>();
            using (var cursor = idv.GetRowCursor(x => x == columnIndex))
            {
                var getter = cursor.GetGetter <VBuffer <ReadOnlyMemory <char> > >(columnIndex);
                VBuffer <ReadOnlyMemory <char> > line = default;
                while (cursor.MoveNext())
                {
                    getter(ref line);
                    columnCounts.Add(line.Length);
                }
            }

            var mostCommon = columnCounts.GroupBy(x => x).OrderByDescending(x => x.Count()).First();

            // Reject when the dominant count is not uniform enough, or when the file
            // splits into a single column (nothing was actually separated).
            if (mostCommon.Count() < UniformColumnCountThreshold * columnCounts.Count || mostCommon.Key <= 1)
                return false;

            result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, mostCommon.Key);
            return true;
        }
Example #17
0
        /// <summary>
        /// Reads text files as an IDataView.
        /// Follows pandas API.
        /// </summary>
        /// <param name="filenames">file names; the first file is used to guess the column types</param>
        /// <param name="sep">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="names">column names (can be empty)</param>
        /// <param name="dtypes">column types (can be empty)</param>
        /// <param name="nrows">number of rows to read</param>
        /// <param name="guess_rows">number of rows used to guess types</param>
        /// <param name="encoding">text encoding</param>
        /// <param name="useThreads">specific to TextLoader</param>
        /// <param name="host">host (a default console host is created when null)</param>
        /// <param name="index">add a column to hold the index</param>
        /// <returns>TextLoader</returns>
        public static IDataView ReadCsvToTextLoader(string[] filenames,
                                                    char sep          = ',', bool header = true,
                                                    string[] names    = null, ColumnType[] dtypes = null,
                                                    int nrows         = -1, int guess_rows        = 10,
                                                    Encoding encoding = null, bool useThreads     = true,
                                                    bool index        = false, IHost host         = null)
        {
            // Guess the schema by parsing a small sample of the first file.
            var sample = ReadCsv(filenames[0], sep: sep, header: header, names: names, dtypes: dtypes,
                                 nrows: guess_rows, guess_rows: guess_rows, encoding: encoding, index: index);
            var sampleSchema = sample.Schema;

            // Turn every inferred name:type pair into a TextLoader column.
            var cols = new TextLoader.Column[sampleSchema.ColumnCount];
            for (int i = 0; i < cols.Length; ++i)
                cols[i] = TextLoader.Column.Parse(sample.NameType(i));

            var args = new TextLoader.Arguments()
            {
                AllowQuoting   = false,
                Separator      = string.Format("{0}", sep),
                Column         = cols,
                TrimWhitespace = true,
                UseThreads     = useThreads,
                HasHeader      = header,
                MaxRows        = nrows > 0 ? (int?)nrows : null
            };

            // Fall back to a fresh console host when none was supplied.
            host = host ?? new ConsoleEnvironment().Register("TextLoader");

            var multiSource = new MultiFileSource(filenames);
            return new TextLoader(host, args, multiSource).Read(multiSource);
        }
Example #18
0
        private static InferenceResult InferTextFileColumnTypesCore(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args, IChannel ch)
        {
            Contracts.AssertValue(ch);
            ch.AssertValue(env);
            ch.AssertValue(fileSource);
            ch.AssertValue(args);

            if (args.ColumnCount == 0)
            {
                ch.Error("Too many empty columns for automatic inference.");
                return(InferenceResult.Fail());
            }

            if (args.ColumnCount >= SmartColumnsLim)
            {
                ch.Error("Too many columns for automatic inference.");
                return(InferenceResult.Fail());
            }

            // Read the file as the specified number of text columns.
            var textLoaderArgs = new TextLoader.Arguments
            {
                Column       = new[] { TextLoader.Column.Parse(string.Format("C:TX:0-{0}", args.ColumnCount - 1)) },
                Separator    = args.Separator,
                AllowSparse  = args.AllowSparse,
                AllowQuoting = args.AllowQuote,
            };
            var idv = TextLoader.ReadFile(env, textLoaderArgs, fileSource);

            idv = idv.Take(args.MaxRowsToRead);

            // Read all the data into memory.
            // List items are rows of the dataset.
            var data = new List <ReadOnlyMemory <char>[]>();

            using (var cursor = idv.GetRowCursor(col => true))
            {
                int  columnIndex;
                bool found = cursor.Schema.TryGetColumnIndex("C", out columnIndex);
                Contracts.Assert(found);
                var colType = cursor.Schema.GetColumnType(columnIndex);
                Contracts.Assert(colType.ItemType.IsText);
                ValueGetter <VBuffer <ReadOnlyMemory <char> > > vecGetter = null;
                ValueGetter <ReadOnlyMemory <char> >            oneGetter = null;
                bool isVector = colType.IsVector;
                if (isVector)
                {
                    vecGetter = cursor.GetGetter <VBuffer <ReadOnlyMemory <char> > >(columnIndex);
                }
                else
                {
                    Contracts.Assert(args.ColumnCount == 1);
                    oneGetter = cursor.GetGetter <ReadOnlyMemory <char> >(columnIndex);
                }

                VBuffer <ReadOnlyMemory <char> > line    = default;
                ReadOnlyMemory <char>            tsValue = default;
                while (cursor.MoveNext())
                {
                    if (isVector)
                    {
                        vecGetter(ref line);
                        Contracts.Assert(line.Length == args.ColumnCount);
                        var values = new ReadOnlyMemory <char> [args.ColumnCount];
                        line.CopyTo(values);
                        data.Add(values);
                    }
                    else
                    {
                        oneGetter(ref tsValue);
                        var values = new[] { tsValue };
                        data.Add(values);
                    }
                }
            }

            if (data.Count < 2)
            {
                ch.Error("Too few rows ({0}) for automatic inference.", data.Count);
                return(InferenceResult.Fail());
            }

            var cols = new IntermediateColumn[args.ColumnCount];

            for (int i = 0; i < args.ColumnCount; i++)
            {
                cols[i] = new IntermediateColumn(data.Select(x => x[i]).ToArray(), i);
            }

            foreach (var expert in GetExperts())
            {
                expert.Apply(cols);
            }

            Contracts.Check(cols.All(x => x.SuggestedType != null), "Column type inference must be conclusive");

            // Aggregating header signals.
            int suspect   = 0;
            var usedNames = new HashSet <string>();

            for (int i = 0; i < args.ColumnCount; i++)
            {
                if (cols[i].HasHeader == true)
                {
                    if (usedNames.Add(cols[i].RawData[0].ToString()))
                    {
                        suspect++;
                    }
                    else
                    {
                        // duplicate value in the first column is a strong signal that this is not a header
                        suspect -= args.ColumnCount;
                    }
                }
                else if (cols[i].HasHeader == false)
                {
                    suspect--;
                }
            }

            // REVIEW: Why not use this for column names as well?
            TextLoader.Arguments fileArgs;
            bool hasHeader;

            if (TextLoader.FileContainsValidSchema(env, fileSource, out fileArgs))
            {
                hasHeader = fileArgs.HasHeader;
            }
            else
            {
                hasHeader = suspect > 0;
            }

            // suggest names
            var names = new List <string>();

            usedNames.Clear();
            foreach (var col in cols)
            {
                string name0;
                string name;
                name0 = name = SuggestName(col, hasHeader);
                int i = 0;
                while (!usedNames.Add(name))
                {
                    name = string.Format("{0}_{1:00}", name0, i++);
                }
                names.Add(name);
            }
            var outCols =
                cols.Select((x, i) => new Column(x.ColumnId, names[i], x.SuggestedType)).ToArray();

            var numerics = outCols.Count(x => x.ItemType.IsNumber);

            ch.Info("Detected {0} numeric and {1} text columns.", numerics, outCols.Length - numerics);
            if (hasHeader)
            {
                ch.Info("Generated column names from the file header.");
            }

            return(InferenceResult.Success(outCols, hasHeader, cols.Select(col => col.RawData).ToArray()));
        }
 /// <summary>
 /// Create a text reader over data described by <paramref name="args"/>.
 /// </summary>
 /// <param name="catalog">The catalog.</param>
 /// <param name="args">The arguments to text reader, describing the data schema.</param>
 /// <param name="dataSample">The optional location of a data sample.</param>
 public static TextLoader TextReader(this DataOperations catalog,
                                     TextLoader.Arguments args, IMultiStreamSource dataSample = null)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);
     return new TextLoader(environment, args, dataSample);
 }
// Example #20
// 0
        /// <summary>
        /// Prepares the credit card fraud data. On first run, reads the raw dataset, splits it
        /// 80:20 into train/test sets and saves both splits as CSV under <c>_outputPath</c>;
        /// on later runs, reloads the saved splits (which have "Label" moved to column 0 and an
        /// extra "StratificationColumn" appended by the split — see the else-branch below).
        /// </summary>
        /// <param name="mlContext">The ML.NET context used for reading and saving data.</param>
        /// <returns>The binary classification context, the reader and the train/test splits.</returns>
        private (BinaryClassificationCatalog classification, TextLoader reader, IDataView trainData, IDataView testData)
        PrepareData(MLContext mlContext)
        {
            IDataView data      = null;
            IDataView trainData = null;
            IDataView testData  = null;

            // Raw file layout: features V1..V28 + Amount in columns 1-29, boolean label last (column 30).
            TextLoader.Column[] columns = MakeCreditCardColumns(labelIndex: 30, withStratificationColumn: false);

            TextLoader.Arguments txtLoaderArgs = new TextLoader.Arguments
            {
                Column     = columns,
                HasHeader  = true,
                Separators = new char[] { ',' }
            };

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            var reader = new TextLoader(mlContext, txtLoaderArgs);

            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationCatalog(mlContext);

            if (!File.Exists(Path.Combine(_outputPath, "testData.idv")) &&
                !File.Exists(Path.Combine(_outputPath, "trainData.idv")))
            {
                // Split the data 80:20 into train and test sets, train and evaluate.
                data = reader.Read(new MultiFileSource(_dataSetFile));
                ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (source)");
                ConsoleHelpers.InspectData(mlContext, data, 4);

                // Can't do stratification when column type is a boolean, is this an issue?
                //(trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2, stratificationColumn: "Label");
                (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

                // save test split
                using (var fileStream = File.Create(Path.Combine(_outputPath, "testData.csv")))
                {
                    mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true);
                }

                // save train split
                using (var fileStream = File.Create(Path.Combine(_outputPath, "trainData.csv")))
                {
                    mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true);
                }
            }
            else
            {
                // The saved splits have "Label" moved to column 0 and the "StratificationColumn"
                // that was added by classification.TrainTestSplit() appended as column 30.
                TextLoader.Column[] columnsPlus = MakeCreditCardColumns(labelIndex: 0, withStratificationColumn: true);

                // Load the previously persisted splits.
                trainData = mlContext.Data.ReadFromTextFile(Path.Combine(_outputPath, "trainData.csv"),
                                                            columnsPlus,
                                                            hasHeader: txtLoaderArgs.HasHeader,
                                                            separatorChar: txtLoaderArgs.Separators[0]);

                testData = mlContext.Data.ReadFromTextFile(Path.Combine(_outputPath, "testData.csv"),
                                                           columnsPlus,
                                                           hasHeader: txtLoaderArgs.HasHeader,
                                                           separatorChar: txtLoaderArgs.Separators[0]);
            }

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (traindata)");
            ConsoleHelpers.InspectData(mlContext, trainData, 4);

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (testData)");
            ConsoleHelpers.InspectData(mlContext, testData, 4);

            return(classification, reader, trainData, testData);
        }

        // Builds the credit card schema: the boolean "Label" at labelIndex, the 29 numeric
        // features V1..V28 + Amount (columns 1-29) and, when requested, the "StratificationColumn"
        // (column 30) appended to the saved splits by TrainTestSplit. Replaces two hand-written
        // arrays that duplicated the 29 feature columns.
        private static TextLoader.Column[] MakeCreditCardColumns(int labelIndex, bool withStratificationColumn)
        {
            var columns = new List <TextLoader.Column>
            {
                // A boolean column depicting the 'label'.
                new TextLoader.Column("Label", DataKind.BL, labelIndex)
            };
            // 29 Features V1..V28 + Amount.
            for (int i = 1; i <= 28; i++)
            {
                columns.Add(new TextLoader.Column("V" + i, DataKind.R4, i));
            }
            columns.Add(new TextLoader.Column("Amount", DataKind.R4, 29));
            if (withStratificationColumn)
            {
                columns.Add(new TextLoader.Column("StratificationColumn", DataKind.R4, 30));
            }
            return columns.ToArray();
        }
        /// <summary>
        /// Measures prediction latency over the wikipedia detox test file for either the
        /// "mlnet" or the "scikit" prediction engine. Each of the <paramref name="ncall"/>
        /// measurements times <paramref name="N"/> full passes over the test examples and is
        /// recorded as a (N, elapsed, call) tuple.
        /// </summary>
        private static List <Tuple <int, TimeSpan, int> > _MeasureTime(int conc,
                                                                       string engine, IDataScorerTransform scorer, ITransformer transformer,
                                                                       int N, int ncall, bool cacheScikit)
        {
            // Schema of the test file: boolean label in column 0, raw text in column 1.
            var loaderArgs = new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            };

            var dataFile  = FileHelper.GetTestFile("wikipedia-detox-250-line-test.tsv");
            var measures  = new List <Tuple <int, TimeSpan, int> >();

            using (var env = EnvHelper.NewTestEnvironment(seed: 1, conc: conc))
            {
                // Take a couple examples out of the test data and run predictions on top.
                var loaded = TextLoader.ReadFile(env, loaderArgs, new MultiFileSource(dataFile));
                IDataView cached = cacheScikit
                    ? (IDataView) new ExtendedCacheTransform(env, new ExtendedCacheTransform.Arguments(), loaded)
                    : new CacheDataView(env, loaded, new[] { 0, 1 });
                var examples = cached.AsEnumerable <SentimentData>(env, false);

                // Shared timing loop: ncall measurements, each timing N passes over all examples.
                void Run(Action <SentimentData> predict)
                {
                    var watch = new Stopwatch();
                    for (int call = 1; call <= ncall; ++call)
                    {
                        watch.Restart();
                        for (int pass = 0; pass < N; ++pass)
                        {
                            foreach (var example in examples)
                            {
                                predict(example);
                            }
                        }
                        watch.Stop();
                        measures.Add(new Tuple <int, TimeSpan, int>(N, watch.Elapsed, call));
                    }
                }

                if (engine == "mlnet")
                {
                    Console.WriteLine("engine={0} N={1} ncall={2} cacheScikit={3}", engine, N, ncall, cacheScikit);
                    var fct = transformer.MakePredictionFunction <SentimentData, SentimentPrediction>(env);
                    Run(input => fct.Predict(input));
                }
                else if (engine == "scikit")
                {
                    Console.WriteLine("engine={0} N={1} ncall={2} cacheScikit={3}", engine, N, ncall, cacheScikit);
                    var model  = new ValueMapperPredictionEngine <SentimentData>(env, scorer, conc: conc);
                    var output = new ValueMapperPredictionEngine <SentimentData> .PredictionTypeForBinaryClassification();
                    Run(input => model.Predict(input, ref output));
                }
                else
                {
                    throw new NotImplementedException($"Unknown engine '{engine}'.");
                }
            }
            return(measures);
        }
        /// <summary>
        /// Read a data view from a text file using <see cref="TextLoader"/>.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
        /// <param name="path">Specifies a file from which to read.</param>
        /// <param name="args">Defines the settings of the load operation.</param>
        public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Arguments args = null)
        {
            Contracts.CheckNonEmpty(path, nameof(path));

            var fileSource = new MultiFileSource(path);
            var loader     = new TextLoader(catalog.GetEnvironment(), args, fileSource);
            return loader.Read(fileSource);
        }
// Example #23
// 0
        /// <summary>
        /// Trains a sentiment model: loads the labeled text file, tokenizes the text,
        /// maps the tokens to pretrained SSWE word embeddings and fits an SDCA
        /// multi-class trainer on the resulting features.
        /// </summary>
        public void TrainSentiment()
        {
            var context = new MLContext(seed: 1);

            // Schema of the input file: numeric label in column 0, raw text in column 1.
            var labelColumn = new TextLoader.Column()
            {
                Name   = "Label",
                Source = new[] { new TextLoader.Range()
                                 {
                                     Min = 0, Max = 0
                                 } },
                Type = DataKind.Num
            };
            var textColumn = new TextLoader.Column()
            {
                Name   = "SentimentText",
                Source = new[] { new TextLoader.Range()
                                 {
                                     Min = 1, Max = 1
                                 } },
                Type = DataKind.Text
            };
            var loaderArguments = new TextLoader.Arguments()
            {
                Column       = new[] { labelColumn, textColumn },
                HasHeader    = true,
                AllowQuoting = false,
                AllowSparse  = false
            };
            var data = context.Data.ReadFromTextFile(_sentimentDataPath, loaderArguments);

            // Tokenize the text (no n-gram features; only the output tokens are used downstream).
            var tokenized = TextFeaturizingEstimator.Create(context,
                                                            new TextFeaturizingEstimator.Arguments()
            {
                Column = new TextFeaturizingEstimator.Column
                {
                    Name   = "WordEmbeddings",
                    Source = new[] { "SentimentText" }
                },
                OutputTokens                 = true,
                KeepPunctuations             = false,
                UsePredefinedStopWordRemover = true,
                VectorNormalizer             = TextFeaturizingEstimator.TextNormKind.None,
                CharFeatureExtractor         = null,
                WordFeatureExtractor         = null,
            }, data);

            // Map the tokens to pretrained sentiment-specific word embeddings.
            var embedded = WordEmbeddingsExtractingTransformer.Create(context,
                                                                      new WordEmbeddingsExtractingTransformer.Arguments()
            {
                Column = new WordEmbeddingsExtractingTransformer.Column[1]
                {
                    new WordEmbeddingsExtractingTransformer.Column
                    {
                        Name   = "Features",
                        Source = "WordEmbeddings_TransformedText"
                    }
                },
                ModelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe,
            }, tokenized);

            // Train.
            var trainer = new SdcaMultiClassTrainer(context, "Label", "Features", maxIterations: 20);
            var model   = trainer.Fit(embedded);

            _consumer.Consume(model);
        }
        // This method is called if only a datafile is specified, without a loader/term and value columns.
        // It determines the type of the Value column and returns the appropriate TextLoader component factory.
        // When keyValues is true, the file is scanned once to find the range [min, max] of the key values,
        // so that the Value column can be declared as a key column of the narrowest sufficient type;
        // otherwise the Value column is simply numeric.
        private static IComponentFactory <IMultiStreamSource, IDataLoader> GetLoaderFactory(string filename, bool keyValues, IHost host)
        {
            Contracts.AssertValue(host);

            // If the user specified non-key values, we define the value column to be numeric.
            if (!keyValues)
            {
                return(ComponentFactoryUtils.CreateFromFunction <IMultiStreamSource, IDataLoader>(
                           (env, files) => TextLoader.Create(
                               env,
                               new TextLoader.Arguments()
                {
                    Column = new[]
                    {
                        new TextLoader.Column("Term", DataKind.TX, 0),
                        new TextLoader.Column("Value", DataKind.Num, 1)
                    }
                },
                               files)));
            }

            // If the user specified key values, we scan the values to determine the range of the key type.
            ulong min = ulong.MaxValue;
            ulong max = ulong.MinValue;

            try
            {
                // Read both columns as plain text; values are parsed as keys in the loop below.
                var  txtArgs = new TextLoader.Arguments();
                bool parsed  = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs);
                host.Assert(parsed);
                var data = TextLoader.ReadFile(host, txtArgs, new MultiFileSource(filename));
                using (var cursor = data.GetRowCursor(c => true))
                {
                    var    getTerm = cursor.GetGetter <DvText>(0);
                    var    getVal  = cursor.GetGetter <DvText>(1);
                    DvText txt     = default(DvText);

                    using (var ch = host.Start("Creating Text Lookup Loader"))
                    {
                        // Number of values that could not be parsed as keys; reported as warnings.
                        long countNonKeys = 0;
                        while (cursor.MoveNext())
                        {
                            getVal(ref txt);
                            ulong res;
                            // Try to parse the text as a key value between 1 and ulong.MaxValue. If this succeeds and res>0,
                            // we update max and min accordingly. If res==0 it means the value is missing, in which case we ignore it for
                            // computing max and min.
                            if (Conversions.Instance.TryParseKey(ref txt, 1, ulong.MaxValue, out res))
                            {
                                if (res < min && res != 0)
                                {
                                    min = res;
                                }
                                if (res > max)
                                {
                                    max = res;
                                }
                            }
                            // If parsing as key did not succeed, the value can still be 0, so we try parsing it as a ulong. If it succeeds,
                            // then the value is 0, and we update min accordingly.
                            else if (Conversions.Instance.TryParse(ref txt, out res))
                            {
                                ch.Assert(res == 0);
                                min = 0;
                            }
                            // If parsing as a ulong fails, we increment the counter for the non-key values.
                            else
                            {
                                var term = default(DvText);
                                getTerm(ref term);
                                // Only report the first few offending terms individually to avoid flooding the log.
                                if (countNonKeys < 5)
                                {
                                    ch.Warning("Term '{0}' in mapping file is mapped to non key value '{1}'", term, txt);
                                }
                                countNonKeys++;
                            }
                        }
                        if (countNonKeys > 0)
                        {
                            ch.Warning("Found {0} non key values in the file '{1}'", countNonKeys, filename);
                        }
                        // min > max means no valid key value was seen; fall back to the default U4 range.
                        if (min > max)
                        {
                            min = 0;
                            max = uint.MaxValue - 1;
                            ch.Warning("did not find any valid key values in the file '{0}'", filename);
                        }
                        else
                        {
                            ch.Info("Found key values in the range {0} to {1} in the file '{2}'", min, max, filename);
                        }
                        ch.Done();
                    }
                }
            }
            catch (Exception e)
            {
                throw host.Except(e, "Failed to parse the lookup file '{0}' in TermLookupTransform", filename);
            }

            // Choose the narrowest key representation that covers the observed range.
            TextLoader.Column valueColumn = new TextLoader.Column("Value", DataKind.U4, 1);
            if (max - min < (ulong)int.MaxValue)
            {
                valueColumn.KeyRange = new KeyRange(min, max);
            }
            else if (max - min < (ulong)uint.MaxValue)
            {
                valueColumn.KeyRange = new KeyRange(min);
            }
            else
            {
                // Range too wide for a U4 key; switch the value column to U8.
                valueColumn.Type     = DataKind.U8;
                valueColumn.KeyRange = new KeyRange(min);
            }

            return(ComponentFactoryUtils.CreateFromFunction <IMultiStreamSource, IDataLoader>(
                       (env, files) => TextLoader.Create(
                           env,
                           new TextLoader.Arguments()
            {
                Column = new[]
                {
                    new TextLoader.Column("Term", DataKind.TX, 0),
                    valueColumn
                }
            },
                           files)));
        }
        /// <summary>
        /// Attempt to detect text loader arguments.
        /// The algorithm selects the first 'acceptable' set: the one that recognizes the same number of columns in at
        /// least <see cref="UniformColumnCountThreshold"/> of the sample's lines,
        /// and this number of columns is more than 1.
        /// We sweep on separator, allow sparse and allow quote parameter.
        /// </summary>
        public static ColumnSplitResult TrySplitColumns(IHostEnvironment env, IMultiStreamSource source,
                                                        string[] separatorCandidates, bool?allowSparse = null, bool?allowQuote = null, bool skipStrictValidation = false)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("CandidateLoader");

            h.CheckValue(source, nameof(source));
            h.CheckNonEmpty(separatorCandidates, nameof(separatorCandidates));

            // By default both true and false are swept for sparse and quote; an explicit
            // argument pins the candidate set to that single value.
            // (Fix: the previous code had the terminating semicolons misplaced outside the
            // if-blocks, producing statements that do not compile.)
            bool[] sparse = new[] { true, false };
            bool[] quote  = new[] { true, false };
            if (allowSparse.HasValue)
            {
                sparse = new[] { allowSparse.Value };
            }
            if (allowQuote.HasValue)
            {
                quote = new[] { allowQuote.Value };
            }

            bool foundAny = false;
            var  result   = default(ColumnSplitResult);

            using (var ch = env.Register("SplitColumns").Start("SplitColumns"))
            {
                // Try every (sparse, quote, separator) combination until one parses acceptably.
                foreach (var perm in (from _allowSparse in sparse
                                      from _allowQuote in quote
                                      from _sep in separatorCandidates
                                      select new { _allowSparse, _allowQuote, _sep }))
                {
                    var args = new TextLoader.Arguments
                    {
                        // "C:TX:0-**" reads every field of each line into one variable-length text vector.
                        Column       = new[] { TextLoader.Column.Parse("C:TX:0-**") },
                        Separator    = perm._sep,
                        AllowQuoting = perm._allowQuote,
                        AllowSparse  = perm._allowSparse
                    };

                    if (TryParseFile(ch, args, source, skipStrictValidation, out result))
                    {
                        foundAny = true;
                        break;
                    }
                }

                if (foundAny)
                {
                    ch.Info("Discovered {0} columns using separator '{1}'.", result.ColumnCount, result.Separator);
                }
                else
                {
                    // REVIEW: May need separate messages for GUI-specific and non-specific. This component can be used
                    // by itself outside the GUI.
                    ch.Info("Couldn't determine columns in the file using separators {0}. Does the input file consist of only a single column? "
                            + "If so, in TLC GUI, please close the import wizard, and then, in the loader settings to the right, manually add a column, "
                            + "choose a name, and set source index to 0.",
                            string.Join(",", separatorCandidates.Select(c => string.Format("'{0}'", GetSeparatorString(c)))));
                }
            }
            return(foundAny ? result : new ColumnSplitResult(false, null, true, true, 0));
        }
        // Attempts to parse the source with the given loader arguments by reading every line into a single
        // text-vector column and checking that the detected column count is consistent across sampled lines.
        // Returns true (and fills 'result') when a dominant column count exists; the consistency and
        // more-than-one-column checks are skipped when skipStrictValidation is set.
        private static bool TryParseFile(IChannel ch, TextLoader.Arguments args, IMultiStreamSource source, bool skipStrictValidation, out ColumnSplitResult result)
        {
            result = default(ColumnSplitResult);
            try
            {
                // No need to provide information from unsuccessful loader, so we create temporary environment and get information from it in case of success
                using (var loaderEnv = new ConsoleEnvironment(0, true))
                {
                    // Buffer the loader's channel messages; they are forwarded to 'ch' only on success.
                    var messages = new ConcurrentBag <ChannelMessage>();
                    loaderEnv.AddListener <ChannelMessage>(
                        (src, msg) =>
                    {
                        messages.Add(msg);
                    });
                    // Sample at most the first 1000 rows for the column-count statistics.
                    var  idv          = TextLoader.ReadFile(loaderEnv, args, source).Take(1000);
                    var  columnCounts = new List <int>();
                    int  columnIndex;
                    bool found = idv.Schema.TryGetColumnIndex("C", out columnIndex);
                    ch.Assert(found);

                    using (var cursor = idv.GetRowCursor(x => x == columnIndex))
                    {
                        var getter = cursor.GetGetter <VBuffer <ReadOnlyMemory <char> > >(columnIndex);

                        VBuffer <ReadOnlyMemory <char> > line = default;
                        while (cursor.MoveNext())
                        {
                            getter(ref line);
                            // The vector length is the number of fields this line was split into.
                            columnCounts.Add(line.Length);
                        }
                    }

                    Contracts.Check(columnCounts.Count > 0);
                    // Find the most frequent column count among the sampled lines.
                    var mostCommon = columnCounts.GroupBy(x => x).OrderByDescending(x => x.Count()).First();
                    if (!skipStrictValidation && mostCommon.Count() < UniformColumnCountThreshold * columnCounts.Count)
                    {
                        return(false);
                    }

                    // If user explicitly specified separator we're allowing "single" column case;
                    // Otherwise user will see message informing that we were not able to detect any columns.
                    if (!skipStrictValidation && mostCommon.Key <= 1)
                    {
                        return(false);
                    }

                    result = new ColumnSplitResult(true, args.Separator, args.AllowQuoting, args.AllowSparse, mostCommon.Key);
                    ch.Trace("Discovered {0} columns using separator '{1}'", mostCommon.Key, args.Separator);
                    // Replay the buffered loader messages on the caller's channel.
                    foreach (var msg in messages)
                    {
                        ch.Send(msg);
                    }
                    return(true);
                }
            }
            catch (Exception ex)
            {
                if (!ex.IsMarked())
                {
                    throw;
                }
                // For known exceptions, we just continue to the next separator candidate.
            }
            return(false);
        }
Exemple #27
0
 /// <summary>
 /// Initializes the loader wrapper by storing the host environment and the
 /// text loader arguments for later use.
 /// </summary>
 /// <param name="env">The host environment.</param>
 /// <param name="args">The text loader arguments (columns, separator, header, ...).</param>
 public MyTextLoader(IHostEnvironment env, TextLoader.Arguments args)
 {
     _args = args;
     _env  = env;
 }
        /// <summary>
        /// Measures prediction latency over the sentiment test file for a given engine.
        /// </summary>
        /// <param name="conc">Concurrency setting passed to the test environment.</param>
        /// <param name="strategy">Iteration strategy: containing "extcache" selects
        /// <see cref="ExtendedCacheTransform"/> as the cache; containing "array" iterates a
        /// materialized array instead of re-enumerating the lazy enumerable.</param>
        /// <param name="engine">Either "mlnet" or "scikit"; any other value throws
        /// <see cref="NotImplementedException"/>.</param>
        /// <param name="scorer">Scorer used by the "scikit" engine path.</param>
        /// <param name="trscorer">Transformer used by the "mlnet" engine path.</param>
        /// <param name="ncall">Number of timed calls to perform.</param>
        /// <returns>One tuple per call: (N, elapsed time, call index, predicted scores).</returns>
        private List <Tuple <int, TimeSpan, int, float[]> > _MeasureTime(int conc,
                                                                         string strategy, string engine, IDataScorerTransform scorer, ITransformer trscorer, int ncall)
        {
            var args = new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            };

            var testFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-test.tsv");
            var times        = new List <Tuple <int, TimeSpan, int, float[]> >();

            using (var env = EnvHelper.NewTestEnvironment(seed: 1, conc: conc))
            {
                // Take a couple examples out of the test data and run predictions on top.
                var       testLoader = TextLoader.ReadFile(env, args, new MultiFileSource(testFilename));
                IDataView cache;
                if (strategy.Contains("extcache"))
                {
                    cache = new ExtendedCacheTransform(env, new ExtendedCacheTransform.Arguments(), testLoader);
                }
                else
                {
                    cache = new CacheDataView(env, testLoader, new[] { 0, 1 });
                }
                var testData      = cache.AsEnumerable <SentimentData>(env, false);
                var testDataArray = cache.AsEnumerable <SentimentData>(env, false).ToArray();
                int N             = 1;

                if (engine == "mlnet")
                {
                    var model = trscorer.MakePredictionFunction <SentimentData, SentimentPrediction>(env);
                    _RunTimedCalls(input => model.Predict(input).Score,
                                   strategy, ncall, N, testData, testDataArray, times);
                }
                else if (engine == "scikit")
                {
                    string allSchema = SchemaHelper.ToString(scorer.Schema);
                    Assert.IsTrue(allSchema.Contains("PredictedLabel:Bool:4; Score:R4:5; Probability:R4:6"));
                    var model  = new ValueMapperPredictionEngine <SentimentData>(env, scorer, conc: conc);
                    var output = new ValueMapperPredictionEngine <SentimentData> .PredictionTypeForBinaryClassification();

                    // Passing the captured local 'output' by ref from the lambda is legal:
                    // it is hoisted to a closure field and reused across calls.
                    _RunTimedCalls(input => { model.Predict(input, ref output); return output.Score; },
                                   strategy, ncall, N, testData, testDataArray, times);
                }
                else
                {
                    throw new NotImplementedException($"Unknown engine '{engine}'.");
                }
            }
            return(times);
        }

        /// <summary>
        /// Shared timing loop for <c>_MeasureTime</c>: for each call, predicts over the whole
        /// data set (array or lazy enumerable depending on <paramref name="strategy"/>) and
        /// records the elapsed time and the predicted scores into <paramref name="times"/>.
        /// </summary>
        private static void _RunTimedCalls(Func <SentimentData, float> predict, string strategy, int ncall, int N,
                                           IEnumerable <SentimentData> testData, SentimentData[] testDataArray,
                                           List <Tuple <int, TimeSpan, int, float[]> > times)
        {
            var sw = new Stopwatch();
            for (int call = 1; call <= ncall; ++call)
            {
                var pred = new List <float>();
                sw.Reset();
                sw.Start();
                for (int i = 0; i < N; ++i)
                {
                    // "array" iterates the materialized array; otherwise the lazy enumerable is
                    // re-enumerated on every pass — that difference is part of what is measured.
                    var source = strategy.Contains("array")
                        ? (IEnumerable <SentimentData>)testDataArray
                        : testData;
                    foreach (var input in source)
                    {
                        pred.Add(predict(input));
                    }
                }
                sw.Stop();
                times.Add(new Tuple <int, TimeSpan, int, float[]>(N, sw.Elapsed, call, pred.ToArray()));
            }
        }
        /// <summary>
        /// Infers loader settings, column purposes, and transform recipes for a data file,
        /// optionally guided by a user-provided JSON schema definition file.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="dataFile">Path to the data file to sample and analyze.</param>
        /// <param name="schemaDefinitionFile">Optional path to a JSON file describing the columns.
        /// When null/empty, unreadable, or containing invalid JSON, the schema is inferred from the data.</param>
        /// <param name="predictorType">Inferred predictor category; initialized to clustering as a fallback.</param>
        /// <param name="settingsString">The final loader settings serialized as a command-line string.</param>
        /// <param name="inferenceResult">The transform inference result.</param>
        /// <param name="excludeFeaturesConcatTransforms">Whether to omit feature-concatenation transforms.</param>
        /// <returns>The suggested recipes for the data.</returns>
        public static SuggestedRecipe[] InferRecipesFromData(IHostEnvironment env, string dataFile, string schemaDefinitionFile,
                                                             out Type predictorType, out string settingsString, out TransformInference.InferenceResult inferenceResult,
                                                             bool excludeFeaturesConcatTransforms = false)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("InferRecipesFromData", seed: 0, verbose: false);

            using (var ch = h.Start("InferRecipesFromData"))
            {
                // Validate the schema file has content if provided.
                // Warn the user early if it is provided but being skipped.
                string schemaJson = null;
                if (!string.IsNullOrEmpty(schemaDefinitionFile))
                {
                    try
                    {
                        schemaJson = File.ReadAllText(schemaDefinitionFile);
                    }
                    catch (Exception ex)
                    {
                        ch.Warning($"Unable to read the schema file. Proceeding to infer the schema :{ex.Message}");
                    }
                }

                ch.Info("Loading file sample into memory.");
                var sample = TextFileSample.CreateFromFullFile(h, dataFile);

                ch.Info("Detecting separator and columns");
                var splitResult = TextFileContents.TrySplitColumns(h, sample, TextFileContents.DefaultSeparators);

                // initialize to clustering if we're not successful?
                predictorType  = typeof(SignatureClusteringTrainer);
                settingsString = "";
                if (!splitResult.IsSuccess)
                {
                    throw ch.ExceptDecode("Couldn't detect separator.");
                }

                ch.Info($"Separator detected as '{splitResult.Separator}', there's {splitResult.ColumnCount} columns.");

                ColumnGroupingInference.GroupingColumn[] columns;
                bool hasHeader = false;
                if (string.IsNullOrEmpty(schemaJson))
                {
                    // BUGFIX: only warn about an empty schema file when one was actually provided.
                    // Previously this warning also fired when no schema file was specified at all.
                    if (!string.IsNullOrEmpty(schemaDefinitionFile))
                    {
                        ch.Warning("Empty schema file. Proceeding to infer the schema.");
                    }
                    columns = InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);
                }
                else
                {
                    try
                    {
                        columns = JsonConvert.DeserializeObject <ColumnGroupingInference.GroupingColumn[]>(schemaJson);
                        ch.Info("Using the provided schema file.");
                    }
                    catch
                    {
                        ch.Warning("Invalid json in the schema file. Proceeding to infer the schema.");
                        columns = InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);
                    }
                }

                var finalLoaderArgs = new TextLoader.Arguments
                {
                    Column       = ColumnGroupingInference.GenerateLoaderColumns(columns),
                    HasHeader    = hasHeader,
                    Separator    = splitResult.Separator,
                    AllowSparse  = splitResult.AllowSparse,
                    AllowQuoting = splitResult.AllowQuote
                };

                settingsString = CommandLine.CmdParser.GetSettings(ch, finalLoaderArgs, new TextLoader.Arguments());
                ch.Info($"Loader options: {settingsString}");

                ch.Info("Inferring recipes");
                var finalData = TextLoader.ReadFile(h, finalLoaderArgs, sample);
                var cached    = new CacheDataView(h, finalData,
                                                  Enumerable.Range(0, finalLoaderArgs.Column.Length).ToArray());

                var purposeColumns = columns.Select((x, i) => new PurposeInference.Column(i, x.Purpose, x.ItemKind)).ToArray();

                // Scale down the estimated sample fraction when only a prefix of the file was sampled.
                var fraction = sample.FullFileSize == null ? 1.0 : (double)sample.SampleSize / sample.FullFileSize.Value;
                var transformInferenceResult = TransformInference.InferTransforms(h, cached, purposeColumns,
                                                                                  new TransformInference.Arguments
                {
                    EstimatedSampleFraction         = fraction,
                    ExcludeFeaturesConcatTransforms = excludeFeaturesConcatTransforms
                }
                                                                                  );
                predictorType = InferenceUtils.InferPredictorCategoryType(cached, purposeColumns);
                var recipeInferenceResult = InferRecipes(h, transformInferenceResult, predictorType);

                ch.Done();

                inferenceResult = transformInferenceResult;
                return(recipeInferenceResult.SuggestedRecipes);
            }
        }
        // This method is called if only a datafile is specified, without a loader/term and value columns.
        // It determines the type of the Value column and returns the appropriate TextLoader component factory.
        private static IComponentFactory <IMultiStreamSource, IDataLoader> GetLoaderFactory(string filename, bool keyValues, IHost host)
        {
            Contracts.AssertValue(host);

            // If the user specified non-key values, we define the value column to be numeric.
            if (!keyValues)
            {
                return(ComponentFactoryUtils.CreateFromFunction <IMultiStreamSource, IDataLoader>(
                           (env, files) => TextLoader.Create(
                               env,
                               new TextLoader.Arguments()
                {
                    Column = new[]
                    {
                        new TextLoader.Column("Term", DataKind.TX, 0),
                        new TextLoader.Column("Value", DataKind.Num, 1)
                    }
                },
                               files)));
            }

            // If the user specified key values, we scan the values to determine the range of the key type.
            ulong min = ulong.MaxValue;
            ulong max = ulong.MinValue;

            try
            {
                var  txtArgs = new TextLoader.Arguments();
                bool parsed  = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs);
                host.Assert(parsed);
                var data = TextLoader.ReadFile(host, txtArgs, new MultiFileSource(filename));
                using (var cursor = data.GetRowCursor(c => true))
                {
                    var getTerm = cursor.GetGetter <ReadOnlyMemory <char> >(0);
                    var getVal  = cursor.GetGetter <ReadOnlyMemory <char> >(1);
                    ReadOnlyMemory <char> txt = default;

                    using (var ch = host.Start("Creating Text Lookup Loader"))
                    {
                        long countNonKeys = 0;
                        while (cursor.MoveNext())
                        {
                            getVal(ref txt);
                            ulong res;
                            // Try to parse the text as a key value between 1 and ulong.MaxValue. If this succeeds and res>0,
                            // we update max and min accordingly. If res==0 it means the value is missing, in which case we ignore it for
                            // computing max and min.
                            if (Runtime.Data.Conversion.Conversions.Instance.TryParseKey(in txt, 1, ulong.MaxValue, out res))
                            {
                                if (res < min && res != 0)
                                {
                                    min = res;
                                }
                                if (res > max)
                                {
                                    max = res;
                                }
                            }
                            // If parsing as key did not succeed, the value can still be 0, so we try parsing it as a ulong. If it succeeds,
                            // then the value is 0, and we update min accordingly.
                            else if (Runtime.Data.Conversion.Conversions.Instance.TryParse(in txt, out res))
                            {
                                ch.Assert(res == 0);
                                min = 0;
                            }