Beispiel #1
0
        /// <summary>
        /// Parses a textual slot map into one array of source slot indices per group.
        /// </summary>
        /// <remarks>
        /// The string is split into groups on ';' and each group into index tokens on ','. Every token must
        /// parse as an integer in the range 0 to <paramref name="srcSlotCount"/> - 1, and a group may not
        /// contain the same index twice.
        /// </remarks>
        /// <param name="slotMapString">The slot map text to compile.</param>
        /// <param name="srcSlotCount">Number of source slots; bounds the valid indices.</param>
        /// <returns>A jagged array with one entry per group, holding that group's slot indices in order.</returns>
        private int[][] CompileSlotMap(string slotMapString, int srcSlotCount)
        {
            var groups = ReadOnlyMemoryUtils.Split(slotMapString.AsMemory(), new[] { ';' }).ToArray();
            var result = new int[groups.Length][];

            for (int g = 0; g < result.Length; g++)
            {
                var tokens  = ReadOnlyMemoryUtils.Split(groups[g], new[] { ',' }).ToArray();
                var indices = new int[tokens.Length];
                result[g] = indices;

                for (int k = 0; k < indices.Length; k++)
                {
                    int parsed;
                    bool valid = int.TryParse(tokens[k].ToString(), out parsed) && 0 <= parsed && parsed < srcSlotCount;
                    if (!valid)
                    {
                        throw Host.Except("Unexpected slot index '{1}' in group {0}. Expected 0 to {2}", g, tokens[k], srcSlotCount - 1);
                    }
                    indices[k] = parsed;
                }

                // Duplicates are only checked once every token of the group has been validated, so an
                // invalid token is always reported before a duplicate in the same group.
                int distinctCount = indices.Distinct().Count();
                if (distinctCount < indices.Length)
                {
                    throw Host.Except("Group '{0}' has duplicate slot indices", groups[g]);
                }
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Appends the second label name to the score column names of a binary classification predictor.
        /// </summary>
        /// <remarks>
        /// When the predictor is a binary classifier whose label info contains exactly two names, every
        /// visible column accepted by <c>ShouldAddColumn</c> for the maximum score-column-set id (except the
        /// PredictedLabel column) is copied to "source.label" (label being the second label name) and the
        /// original column is dropped. In every other case the input data is passed through unchanged.
        /// </remarks>
        /// <param name="env">The host environment; must not be null.</param>
        /// <param name="input">The entry-point arguments carrying the data and the predictor model.</param>
        /// <returns>The transform model together with the resulting data view.</returns>
        public static CommonOutputs.TransformOutput RenameBinaryPredictionScoreColumns(IHostEnvironment env,
                                                                                       RenameBinaryPredictionScoreColumnsInput input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("ScoreModel");
            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            bool isBinary = input.PredictorModel.Predictor.PredictionKind == PredictionKind.BinaryClassification;
            if (isBinary)
            {
                var labelNames = input.PredictorModel.GetLabelInfo(host, out DataViewType labelType);
                if (labelNames?.Length == 2)
                {
                    var positiveClass = labelNames[1];
                    var schema        = input.Data.Schema;

                    // Collect (new name, original name) pairs for all the score columns.
                    var maxScoreId = schema.GetMaxAnnotationKind(out int colMax, AnnotationUtils.Kinds.ScoreColumnSetId);
                    var renames    = new List <(string name, string source)>();
                    for (int col = 0; col < schema.Count; col++)
                    {
                        if (schema[col].IsHidden || !ShouldAddColumn(schema, col, null, maxScoreId))
                        {
                            continue;
                        }

                        // The PredictedLabel column keeps its name.
                        ReadOnlyMemory <char> valueKind = default;
                        bool isPredictedLabel =
                            schema.TryGetAnnotation(TextDataViewType.Instance, AnnotationUtils.Kinds.ScoreValueKind, col, ref valueKind) &&
                            ReadOnlyMemoryUtils.EqualsStr(AnnotationUtils.Const.ScoreValueKind.PredictedLabel, valueKind);
                        if (isPredictedLabel)
                        {
                            continue;
                        }

                        var source = schema[col].Name;
                        renames.Add((source + "." + positiveClass, source));
                    }

                    // A rename is a copy to the new name followed by dropping the original column.
                    var copied    = new ColumnCopyingTransformer(env, renames.ToArray()).Transform(input.Data);
                    var originals = renames.Select(pair => pair.source).ToArray();
                    var renamed   = ColumnSelectingTransformer.CreateDrop(env, copied, originals);
                    return new CommonOutputs.TransformOutput
                    {
                        Model      = new TransformModelImpl(env, renamed, input.Data),
                        OutputData = renamed
                    };
                }
            }

            // Nothing to rename: hand the data back through a no-op transform.
            var passThrough = NopTransform.CreateIfNeeded(env, input.Data);
            return new CommonOutputs.TransformOutput
            {
                Model      = new TransformModelImpl(env, passThrough, input.Data),
                OutputData = passThrough
            };
        }
        /// <summary>
        /// Invokes <paramref name="del"/> once per non-empty line of a slice of <paramref name="text"/> and
        /// writes each printed result to <paramref name="wrt"/>.
        /// </summary>
        /// <remarks>
        /// The slice [<paramref name="ichMin"/>, <paramref name="ichLim"/>) is split into lines on carriage
        /// return when running on Windows and on line feed otherwise. Each line is split on ',' into one
        /// value per entry of <paramref name="types"/>; a value of "_" stands for the empty string.
        /// </remarks>
        /// <param name="wrt">Writer receiving parse-failure notes and one result line per input line.</param>
        /// <param name="del">The delegate to invoke dynamically with the parsed arguments.</param>
        /// <param name="typeRes">Result type, used to obtain the printer.</param>
        /// <param name="types">Argument types, one per comma-separated value; must be non-empty.</param>
        /// <param name="text">The text containing the input lines.</param>
        /// <param name="ichMin">Inclusive start of the slice of <paramref name="text"/> to read.</param>
        /// <param name="ichLim">Exclusive end of the slice of <paramref name="text"/> to read.</param>
        private void Evaluate(IndentedTextWriter wrt, Delegate del, DataViewType typeRes, DataViewType[] types,
                              string text, int ichMin, int ichLim)
        {
            Contracts.AssertValue(del);
            Contracts.AssertNonEmpty(types);

            var args    = new object[types.Length];
            var parsers = new Func <ReadOnlyMemory <char>, bool> [types.Length];
            for (int slot = 0; slot < parsers.Length; slot++)
            {
                parsers[slot] = GetGetter(slot, types[slot], args);
            }

            var             output = new StringBuilder();
            Action <object> print  = GetPrinter(typeRes, output);

            // The line separator depends only on the platform, so pick it once.
            char lineBreak = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? '\x0D' : '\x0A';

            ReadOnlyMemory <char> remaining = text.AsMemory().Slice(ichMin, ichLim - ichMin);
            bool more;
            do
            {
                ReadOnlyMemory <char> line;
                more = ReadOnlyMemoryUtils.SplitOne(remaining, lineBreak, out line, out remaining);
                line = ReadOnlyMemoryUtils.TrimWhiteSpace(line);
                if (!line.IsEmpty)
                {
                    // Mapping "_" to empty is a shortcut that avoids full quoting support; it is enough
                    // for these tests.
                    var vals = ReadOnlyMemoryUtils.Split(line, new char[] { ',' })
                               .Select(raw =>
                               {
                                   var trimmed = ReadOnlyMemoryUtils.TrimWhiteSpace(raw);
                                   return ReadOnlyMemoryUtils.EqualsStr("_", trimmed) ? ReadOnlyMemory <char> .Empty : trimmed;
                               })
                               .ToArray();

                    Contracts.Assert(vals.Length == parsers.Length);
                    for (int slot = 0; slot < parsers.Length; slot++)
                    {
                        bool parsed = parsers[slot](vals[slot]);
                        if (!parsed)
                        {
                            wrt.Write("*** Parsing {0} Failed *** ", vals[slot]);
                        }
                    }

                    print(del.DynamicInvoke(args));
                    wrt.WriteLine(output);
                }
            }
            while (more);
        }
        /// <summary>
        /// Returns a lower-cased copy of <paramref name="a"/>, or <paramref name="a"/> itself when it is empty.
        /// </summary>
        /// <param name="a">The text to convert.</param>
        /// <returns>The lower-cased text, backed by a newly created string unless the input was empty.</returns>
        public static TX Lower(TX a)
        {
            if (!a.IsEmpty)
            {
                var lowered = new StringBuilder();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(a.Span, lowered);
                a = lowered.ToString().AsMemory();
            }

            return a;
        }
Beispiel #5
0
        /// <summary>
        /// Builds the getter for output column <paramref name="iinfo"/>: it copies the source text vector
        /// while leaving out empty entries and entries whose lower-cased form is found in the stop word
        /// collection of the language in use.
        /// </summary>
        /// <param name="ch">Channel; may be null.</param>
        /// <param name="input">The row to read the source (and optional language) column from.</param>
        /// <param name="iinfo">Index of the column info to build the getter for.</param>
        /// <param name="disposer">Always set to null; the getter holds nothing that needs disposing.</param>
        /// <returns>A <c>ValueGetter</c> producing the filtered vector of text values.</returns>
        protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            // Note: '&' evaluates both operands, unlike '&&'.
            Host.Assert(Infos[iinfo].TypeSrc.IsVector & Infos[iinfo].TypeSrc.ItemType.IsText);
            disposer = null;

            var      columnInfo      = _exes[iinfo];
            Language defaultLanguage = columnInfo.Lang;
            var      languageText    = default(ReadOnlyMemory <char>);

            // A per-row language column is optional; without it the configured language is used.
            ValueGetter <ReadOnlyMemory <char> > languageGetter = null;
            if (columnInfo.LangsColIndex >= 0)
            {
                languageGetter = input.GetGetter <ReadOnlyMemory <char> >(columnInfo.LangsColIndex);
            }

            var sourceGetter = GetSrcGetter <VBuffer <ReadOnlyMemory <char> > >(input, iinfo);
            var source       = default(VBuffer <ReadOnlyMemory <char> >);
            var lowered      = new StringBuilder();
            var kept         = new List <ReadOnlyMemory <char> >();

            ValueGetter <VBuffer <ReadOnlyMemory <char> > > getter =
                (ref VBuffer <ReadOnlyMemory <char> > dst) =>
            {
                var language = defaultLanguage;
                UpdateLanguage(ref language, languageGetter, ref languageText);

                sourceGetter(ref source);
                kept.Clear();

                foreach (var token in source.GetValues())
                {
                    if (!token.IsEmpty)
                    {
                        lowered.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(token.Span, lowered);

                        // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.)
                        bool isStopWord = StopWords[(int)language].Get(lowered) != null;
                        if (!isStopWord)
                        {
                            kept.Add(token);
                        }
                    }
                }

                VBufferUtils.Copy(kept, ref dst, kept.Count);
            };

            return getter;
        }
        /// <summary>
        /// Runs the cross-validation macro with a ranking trainer whose label, group and name columns use
        /// non-default names ("Label1", "GroupId1", "Workclass"), then checks the NDCG rows of the overall
        /// metrics and the "Instance" column of the per-instance metrics.
        /// </summary>
        public void TestCrossValidationMacroWithNonDefaultNames()
        {
            string dataPath = GetDataPath(@"adult.tiny.with-schema.txt");
            var    env      = new MLContext(42);
            var    subGraph = env.CreateExperiment();

            // Sub-graph step 1: map "Label" to the key column "Label1".
            var textToKey = new Legacy.Transforms.TextToKeyConverter();

            textToKey.Column = new[] { new Legacy.Transforms.ValueToKeyMappingTransformerColumn()
                                       {
                                           Name = "Label1", Source = "Label"
                                       } };
            var textToKeyOutput = subGraph.Add(textToKey);

            // Sub-graph step 2: hash "Workclass" into the group id column "GroupId1".
            var hash = new Legacy.Transforms.HashConverter();

            hash.Column = new[] { new Legacy.Transforms.HashJoiningTransformColumn()
                                  {
                                      Name = "GroupId1", Source = "Workclass"
                                  } };
            hash.Data = textToKeyOutput.OutputData;
            var hashOutput = subGraph.Add(hash);

            // Sub-graph step 3: the ranker, trained on the renamed label and group columns.
            var learnerInput = new Legacy.Trainers.FastTreeRanker
            {
                TrainingData  = hashOutput.OutputData,
                NumThreads    = 1,
                LabelColumn   = "Label1",
                GroupIdColumn = "GroupId1"
            };
            var learnerOutput = subGraph.Add(learnerInput);

            // Sub-graph step 4: combine both transform models and the predictor into one model.
            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <TransformModel>(textToKeyOutput.Model, hashOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            // Outer experiment: load the data and feed it to the cross validator.
            var experiment  = env.CreateExperiment();
            var importInput = new Legacy.Data.TextLoader(dataPath);

            importInput.Arguments.HasHeader = true;
            importInput.Arguments.Column    = new TextLoaderColumn[]
            {
                new TextLoaderColumn {
                    Name = "Label", Source = new[] { new TextLoaderRange(0) }
                },
                new TextLoaderColumn {
                    Name = "Workclass", Source = new[] { new TextLoaderRange(1) }, Type = Legacy.Data.DataKind.Text
                },
                new TextLoaderColumn {
                    Name = "Features", Source = new[] { new TextLoaderRange(9, 14) }
                }
            };
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                TransformModel = null,
                LabelColumn    = "Label1",
                GroupColumn    = "GroupId1",
                NameColumn     = "Workclass",
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureRankerTrainer
            };

            // Wire the sub-graph's data input and predictor model output into the macro.
            crossValidate.Inputs.Data            = textToKey.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false));
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("NDCG", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            // Expected row order: "Average", "Standard Deviation", "Fold 0", "Fold 1", then end of data.
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
            {
                var getter                 = cursor.GetGetter <VBuffer <double> >(metricCol);
                var foldGetter             = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> fold = default;

                // Get the average.
                b = cursor.MoveNext();
                Assert.True(b);
                var avg = default(VBuffer <double>);
                getter(ref avg);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));

                // Get the standard deviation.
                b = cursor.MoveNext();
                Assert.True(b);
                var stdev = default(VBuffer <double>);
                getter(ref stdev);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                var stdevValues = stdev.GetValues();
                // Baseline values, compared to three decimal places.
                Assert.Equal(2.462, stdevValues[0], 3);
                Assert.Equal(2.763, stdevValues[1], 3);
                Assert.Equal(3.273, stdevValues[2], 3);

                // Sum the per-fold metric vectors so the reported average can be verified below.
                var sumBldr = new BufferBuilder <double>(R8Adder.Instance);
                sumBldr.Reset(avg.Length, true);
                var val = default(VBuffer <double>);
                for (int f = 0; f < 2; f++)
                {
                    b = cursor.MoveNext();
                    Assert.True(b);
                    getter(ref val);
                    foldGetter(ref fold);
                    sumBldr.AddFeatures(0, in val);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                }
                var sum = default(VBuffer <double>);
                sumBldr.GetResult(ref sum);

                // The average row must equal the mean of the two fold rows.
                var avgValues = avg.GetValues();
                var sumValues = sum.GetValues();
                for (int i = 0; i < avgValues.Length; i++)
                {
                    Assert.Equal(avgValues[i], sumValues[i] / 2);
                }
                b = cursor.MoveNext();
                Assert.False(b);
            }

            // Per-instance metrics: the "Instance" column should carry values of the name column.
            data = experiment.GetOutput(crossValidateOutput.PerInstanceMetrics);
            Assert.True(data.Schema.TryGetColumnIndex("Instance", out int nameCol));
            using (var cursor = data.GetRowCursor(col => col == nameCol))
            {
                var getter = cursor.GetGetter <ReadOnlyMemory <char> >(nameCol);
                while (cursor.MoveNext())
                {
                    ReadOnlyMemory <char> name = default;
                    getter(ref name);
                    Assert.Subset(new HashSet <string>()
                    {
                        "Private", "?", "Federal-gov"
                    }, new HashSet <string>()
                    {
                        name.ToString()
                    });
                    // Only the first few rows are inspected.
                    if (cursor.Position > 4)
                    {
                        break;
                    }
                }
            }
        }
        /// <summary>
        /// Runs the cross-validation macro over breast-cancer.txt with "Strat" as the stratification column
        /// and checks the AUC rows of the overall metrics: the average, the standard deviation, and one
        /// row per fold whose mean must equal the reported average.
        /// </summary>
        public void TestCrossValidationMacroWithStratification()
        {
            var dataPath = GetDataPath(@"breast-cancer.txt");
            var env      = new MLContext(42);

            // Sub-graph: no-op transform -> SDCA binary classifier -> combined predictor model.
            var subGraph  = env.CreateExperiment();
            var nop       = new Legacy.Transforms.NoOperation();
            var nopOutput = subGraph.Add(nop);

            var trainer = new Legacy.Trainers.StochasticDualCoordinateAscentBinaryClassifier();
            trainer.TrainingData = nopOutput.OutputData;
            trainer.NumThreads   = 1;
            var trainerOutput = subGraph.Add(trainer);

            var combiner = new Legacy.Transforms.ManyHeterogeneousModelCombiner();
            combiner.TransformModels = new ArrayVar <TransformModel>(nopOutput.Model);
            combiner.PredictorModel  = trainerOutput.PredictorModel;
            var combinerOutput = subGraph.Add(combiner);

            // Outer experiment: text loader feeding the cross validator.
            var experiment = env.CreateExperiment();
            var loader     = new Legacy.Data.TextLoader(dataPath);
            loader.Arguments.Column = new Legacy.Data.TextLoaderColumn[]
            {
                new Legacy.Data.TextLoaderColumn
                {
                    Name   = "Label",
                    Source = new[] { new Legacy.Data.TextLoaderRange(0) }
                },
                new Legacy.Data.TextLoaderColumn
                {
                    Name   = "Strat",
                    Source = new[] { new Legacy.Data.TextLoaderRange(1) }
                },
                new Legacy.Data.TextLoaderColumn
                {
                    Name   = "Features",
                    Source = new[] { new Legacy.Data.TextLoaderRange(2, 9) }
                }
            };
            var loaderOutput = experiment.Add(loader);

            var crossValidator = new Legacy.Models.CrossValidator();
            crossValidator.Data                   = loaderOutput.Data;
            crossValidator.Nodes                  = subGraph;
            crossValidator.TransformModel         = null;
            crossValidator.StratificationColumn   = "Strat";
            crossValidator.Inputs.Data            = nop.Data;
            crossValidator.Outputs.PredictorModel = combinerOutput.PredictorModel;
            var crossValidatorOutput = experiment.Add(crossValidator);

            experiment.Compile();
            experiment.SetInput(loader.InputFile, new SimpleFileHandle(env, dataPath, false, false));
            experiment.Run();

            var metrics = experiment.GetOutput(crossValidatorOutput.OverallMetrics);
            var schema  = metrics.Schema;
            Assert.True(schema.TryGetColumnIndex("AUC", out int metricCol));
            Assert.True(schema.TryGetColumnIndex("Fold Index", out int foldCol));

            using (var cursor = metrics.GetRowCursor(col => col == metricCol || col == foldCol))
            {
                var readMetric = cursor.GetGetter <double>(metricCol);
                var readFold   = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> foldName = default;

                // First row: the average over the folds.
                Assert.True(cursor.MoveNext());
                double average = 0;
                readMetric(ref average);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", foldName));

                // Second row: the standard deviation, compared against the baseline value.
                Assert.True(cursor.MoveNext());
                double deviation = 0;
                readMetric(ref deviation);
                readFold(ref foldName);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", foldName));
                Assert.Equal(0.00488, deviation, 5);

                // Remaining rows: one per fold, in fold order.
                double total      = 0;
                double foldMetric = 0;
                for (int foldIndex = 0; foldIndex < 2; foldIndex++)
                {
                    Assert.True(cursor.MoveNext());
                    readMetric(ref foldMetric);
                    readFold(ref foldName);
                    total += foldMetric;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + foldIndex, foldName));
                }

                Assert.Equal(average, total / 2);
                Assert.False(cursor.MoveNext());
            }
        }
        /// <summary>
        /// Runs the cross-validation macro with a multi-class SDCA trainer over Train-Tiny-28x28.txt and
        /// checks the "Accuracy(micro-avg)" rows of the overall metrics, the slot names and per-fold row
        /// grouping of the confusion matrix, and that no warnings were produced.
        /// </summary>
        public void TestCrossValidationMacroWithMultiClass()
        {
            var dataPath = GetDataPath(@"Train-Tiny-28x28.txt");
            var env      = new MLContext(42);
            var subGraph = env.CreateExperiment();

            // Sub-graph: no-op transform -> SDCA multi-class trainer -> combined predictor model.
            var nop       = new Legacy.Transforms.NoOperation();
            var nopOutput = subGraph.Add(nop);

            var learnerInput = new Legacy.Trainers.StochasticDualCoordinateAscentClassifier
            {
                TrainingData = nopOutput.OutputData,
                NumThreads   = 1
            };
            var learnerOutput = subGraph.Add(learnerInput);

            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <TransformModel>(nopOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            // Outer experiment: the text loader is used without explicit column arguments here.
            var experiment   = env.CreateExperiment();
            var importInput  = new Legacy.Data.TextLoader(dataPath);
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer,
                TransformModel = null
            };

            // Wire the sub-graph's data input and predictor model output into the macro.
            crossValidate.Inputs.Data            = nop.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            importInput.SetInput(env, experiment);
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("Accuracy(micro-avg)", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            // Expected row order: "Average", "Standard Deviation", "Fold 0", "Fold 1", then end of data.
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol))
            {
                var getter                 = cursor.GetGetter <double>(metricCol);
                var foldGetter             = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> fold = default;

                // Get the average.
                b = cursor.MoveNext();
                Assert.True(b);
                double avg = 0;
                getter(ref avg);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));

                // Get the standard deviation.
                b = cursor.MoveNext();
                Assert.True(b);
                double stdev = 0;
                getter(ref stdev);
                foldGetter(ref fold);
                Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                // Baseline value, compared to three decimal places.
                Assert.Equal(0.015, stdev, 3);

                // Sum the per-fold metric so the reported average can be verified.
                double sum = 0;
                double val = 0;
                for (int f = 0; f < 2; f++)
                {
                    b = cursor.MoveNext();
                    Assert.True(b);
                    getter(ref val);
                    foldGetter(ref fold);
                    sum += val;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                }
                Assert.Equal(avg, sum / 2);
                b = cursor.MoveNext();
                Assert.False(b);
            }

            // Confusion matrix: validate the schema first, then the rows.
            var confusion = experiment.GetOutput(crossValidateOutput.ConfusionMatrix);

            schema = confusion.Schema;
            b      = schema.TryGetColumnIndex("Count", out int countCol);
            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out foldCol);
            Assert.True(b);
            var type = schema[countCol].Metadata.Schema[MetadataUtils.Kinds.SlotNames].Type;

            // The "Count" column must carry ten text slot names, named "0" through "9".
            Assert.True(type is VectorType vecType && vecType.ItemType is TextType && vecType.Size == 10);
            var slotNames = default(VBuffer <ReadOnlyMemory <char> >);

            schema[countCol].GetSlotNames(ref slotNames);
            var slotNameValues = slotNames.GetValues();

            for (int i = 0; i < slotNameValues.Length; i++)
            {
                Assert.True(ReadOnlyMemoryUtils.EqualsStr(i.ToString(), slotNameValues[i]));
            }
            using (var curs = confusion.GetRowCursor(col => true))
            {
                var countGetter = curs.GetGetter <VBuffer <double> >(countCol);
                var foldGetter  = curs.GetGetter <ReadOnlyMemory <char> >(foldCol);
                var confCount   = default(VBuffer <double>);
                var foldIndex   = default(ReadOnlyMemory <char>);
                int rowCount    = 0;
                var foldCur     = "Fold 0";
                while (curs.MoveNext())
                {
                    countGetter(ref confCount);
                    foldGetter(ref foldIndex);
                    rowCount++;
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr(foldCur, foldIndex));
                    // After every tenth row the counter resets and "Fold 1" is expected from then on.
                    if (rowCount == 10)
                    {
                        rowCount = 0;
                        foldCur  = "Fold 1";
                    }
                }
                // The total number of rows must be a multiple of ten.
                Assert.Equal(0, rowCount);
            }

            var warnings = experiment.GetOutput(crossValidateOutput.Warnings);

            // The warnings output must be empty.
            using (var cursor = warnings.GetRowCursor(col => true))
                Assert.False(cursor.MoveNext());
        }
        /// <summary>
        /// Runs the cross-validation macro with a weighted Poisson regression trainer and checks the
        /// "L1(avg)" rows of the overall metrics. Every kind of row (average, standard deviation, fold)
        /// is expected twice: first unweighted, then weighted, as flagged by the "IsWeighted" column.
        /// </summary>
        [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
        public void TestCrossValidationMacro()
        {
            var dataPath = GetDataPath(TestDatasets.generatedRegressionDatasetmacro.trainFilename);
            var env      = new MLContext(42);
            var subGraph = env.CreateExperiment();

            // Sub-graph: no-op transform -> random "Weight1" column -> Poisson regressor -> combined model.
            var nop       = new Legacy.Transforms.NoOperation();
            var nopOutput = subGraph.Add(nop);

            var generate = new Legacy.Transforms.RandomNumberGenerator();

            generate.Column = new[] { new Legacy.Transforms.GenerateNumberTransformColumn()
                                      {
                                          Name = "Weight1"
                                      } };
            generate.Data = nopOutput.OutputData;
            var generateOutput = subGraph.Add(generate);

            var learnerInput = new Legacy.Trainers.PoissonRegressor
            {
                TrainingData = generateOutput.OutputData,
                NumThreads   = 1,
                WeightColumn = "Weight1"
            };
            var learnerOutput = subGraph.Add(learnerInput);

            var modelCombine = new Legacy.Transforms.ManyHeterogeneousModelCombiner
            {
                TransformModels = new ArrayVar <TransformModel>(nopOutput.Model, generateOutput.Model),
                PredictorModel  = learnerOutput.PredictorModel
            };
            var modelCombineOutput = subGraph.Add(modelCombine);

            // Outer experiment: ';'-separated loader with a header row, feeding the cross validator.
            var experiment  = env.CreateExperiment();
            var importInput = new Legacy.Data.TextLoader(dataPath)
            {
                Arguments = new Legacy.Data.TextLoaderArguments
                {
                    Separator = new[] { ';' },
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoaderColumn()
                        {
                            Name   = "Label",
                            Source = new [] { new TextLoaderRange(11) },
                            Type   = Legacy.Data.DataKind.Num
                        },

                        new TextLoaderColumn()
                        {
                            Name   = "Features",
                            Source = new [] { new TextLoaderRange(0, 10) },
                            Type   = Legacy.Data.DataKind.Num
                        }
                    }
                }
            };
            var importOutput = experiment.Add(importInput);

            var crossValidate = new Legacy.Models.CrossValidator
            {
                Data           = importOutput.Data,
                Nodes          = subGraph,
                Kind           = Legacy.Models.MacroUtilsTrainerKinds.SignatureRegressorTrainer,
                TransformModel = null,
                WeightColumn   = "Weight1"
            };

            // Wire the sub-graph's data input and predictor model output into the macro.
            crossValidate.Inputs.Data            = nop.Data;
            crossValidate.Outputs.PredictorModel = modelCombineOutput.PredictorModel;
            var crossValidateOutput = experiment.Add(crossValidate);

            experiment.Compile();
            importInput.SetInput(env, experiment);
            experiment.Run();
            var data = experiment.GetOutput(crossValidateOutput.OverallMetrics);

            var schema = data.Schema;
            var b      = schema.TryGetColumnIndex("L1(avg)", out int metricCol);

            Assert.True(b);
            b = schema.TryGetColumnIndex("Fold Index", out int foldCol);
            Assert.True(b);
            b = schema.TryGetColumnIndex("IsWeighted", out int isWeightedCol);
            // Fail right here if the column is missing instead of reading through an invalid index below.
            Assert.True(b);
            using (var cursor = data.GetRowCursor(col => col == metricCol || col == foldCol || col == isWeightedCol))
            {
                var getter                 = cursor.GetGetter <double>(metricCol);
                var foldGetter             = cursor.GetGetter <ReadOnlyMemory <char> >(foldCol);
                ReadOnlyMemory <char> fold = default;
                var    isWeightedGetter    = cursor.GetGetter <bool>(isWeightedCol);
                bool   isWeighted          = default;
                double avg                 = 0;
                double weightedAvg         = 0;

                // Summary rows: (average, standard deviation) unweighted (w == 0), then weighted (w == 1).
                for (int w = 0; w < 2; w++)
                {
                    // Get the average.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    if (w == 1)
                    {
                        getter(ref weightedAvg);
                    }
                    else
                    {
                        getter(ref avg);
                    }
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Average", fold));
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));

                    // Get the standard deviation.
                    b = cursor.MoveNext();
                    Assert.True(b);
                    double stdev = 0;
                    getter(ref stdev);
                    foldGetter(ref fold);
                    Assert.True(ReadOnlyMemoryUtils.EqualsStr("Standard Deviation", fold));
                    // Baseline values; the unweighted one is only compared to two decimal places.
                    if (w == 1)
                    {
                        Assert.Equal(1.585, stdev, 3);
                    }
                    else
                    {
                        Assert.Equal(1.39, stdev, 2);
                    }
                    isWeightedGetter(ref isWeighted);
                    Assert.True(isWeighted == (w == 1));
                }

                // Fold rows: for each fold an unweighted row followed by a weighted row.
                double sum         = 0;
                double weightedSum = 0;
                for (int f = 0; f < 2; f++)
                {
                    for (int w = 0; w < 2; w++)
                    {
                        b = cursor.MoveNext();
                        Assert.True(b);
                        double val = 0;
                        getter(ref val);
                        foldGetter(ref fold);
                        if (w == 1)
                        {
                            weightedSum += val;
                        }
                        else
                        {
                            sum += val;
                        }
                        Assert.True(ReadOnlyMemoryUtils.EqualsStr("Fold " + f, fold));
                        isWeightedGetter(ref isWeighted);
                        Assert.True(isWeighted == (w == 1));
                    }
                }

                // Each reported average must equal the mean of the matching fold rows.
                Assert.Equal(weightedAvg, weightedSum / 2);
                Assert.Equal(avg, sum / 2);
                b = cursor.MoveNext();
                Assert.False(b);
            }
        }
Beispiel #10
0
        /// <summary>
        /// Fills a new <c>NormStr.Pool</c> with stopwords taken from the first usable source:
        /// the comma-separated <c>Stopwords</c> string, then the <c>Stopword</c> array, and
        /// otherwise a text column read through <c>LoadStopwords</c>. Every accepted entry is
        /// passed through <c>ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder</c> before being added.
        /// </summary>
        /// <param name="env">Host environment; forwarded to <c>LoadStopwords</c> in the data file case.</param>
        /// <param name="ch">Channel used to emit warnings and to raise user-argument errors.</param>
        /// <param name="loaderArgs">Arguments naming the stopwords source.</param>
        /// <param name="stopWordsMap">Receives the newly created pool of stopwords.</param>
        private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
        {
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(loaderArgs);

            // Warn when an explicit list is supplied together with any data file argument.
            bool explicitListGiven = !string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0;
            if (explicitListGiven &&
                (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
                 !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
            {
                ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
            }

            // Keep a local alias of the pool: an out parameter cannot be captured by the local function below.
            var pool = new NormStr.Pool();
            stopWordsMap = pool;
            var scratch = new StringBuilder();

            // Resets the scratch builder, appends the text via AddLowerCaseToStringBuilder and adds the result to the pool.
            void AddLowerCased(ReadOnlySpan<char> text)
            {
                scratch.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(text, scratch);
                pool.Add(scratch);
            }

            ReadOnlyMemory<char> remaining = ReadOnlyMemoryUtils.TrimSpaces(loaderArgs.Stopwords.AsMemory());
            if (!remaining.IsEmpty)
            {
                // Source 1: the comma-separated string. Split off one token at a time.
                bool emptyWarningPending = true;
                bool hasMore;
                do
                {
                    ReadOnlyMemory<char> token;
                    hasMore = ReadOnlyMemoryUtils.SplitOne(remaining, ',', out token, out remaining);
                    token   = ReadOnlyMemoryUtils.TrimSpaces(token);
                    if (token.IsEmpty)
                    {
                        // Report empty tokens only once.
                        if (emptyWarningPending)
                        {
                            ch.Warning("Empty strings ignored in 'stopwords' specification");
                            emptyWarningPending = false;
                        }
                        continue;
                    }

                    AddLowerCased(token.Span);
                }
                while (hasMore);

                ch.CheckUserArg(pool.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
            }
            else if (Utils.Size(loaderArgs.Stopword) > 0)
            {
                // Source 2: the array of individual stopwords.
                bool emptyWarningPending = true;
                foreach (string word in loaderArgs.Stopword)
                {
                    var trimmed = word.AsSpan().Trim(' ');
                    if (trimmed.IsEmpty)
                    {
                        // Report empty entries only once.
                        if (emptyWarningPending)
                        {
                            ch.Warning("Empty strings ignored in 'stopword' specification");
                            emptyWarningPending = false;
                        }
                        continue;
                    }

                    AddLowerCased(trimmed);
                }
            }
            else
            {
                // Source 3: a column of the data obtained from LoadStopwords.
                string columnName = loaderArgs.StopwordsColumn;
                var    loader     = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref columnName);

                int columnIndex;
                if (!loader.Schema.TryGetColumnIndex(columnName, out columnIndex))
                {
                    throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", columnName);
                }

                var columnType = loader.Schema[columnIndex].Type;
                ch.CheckUserArg(columnType.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

                // Walk the rows, requesting only the stopwords column.
                using (var cursor = loader.GetRowCursor(col => col == columnIndex))
                {
                    var  readRow             = cursor.GetGetter<ReadOnlyMemory<char>>(columnIndex);
                    var  rowText             = default(ReadOnlyMemory<char>);
                    bool emptyWarningPending = true;
                    while (cursor.MoveNext())
                    {
                        readRow(ref rowText);
                        if (rowText.IsEmpty)
                        {
                            // Report empty rows only once.
                            if (emptyWarningPending)
                            {
                                ch.Warning("Empty rows ignored in data file");
                                emptyWarningPending = false;
                            }
                            continue;
                        }

                        AddLowerCased(rowText.Span);
                    }
                }

                ch.CheckUserArg(pool.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
            }
        }