コード例 #1
0
        /// <summary>
        /// Fits three identical count-target-encoding + averaged-perceptron pipelines, the first
        /// with <c>laplaceScale: 1f</c> and the other two without it, and compares the learned
        /// weights: the two pipelines without the scale must agree, and the first must differ.
        /// Afterwards the "Features" column produced by the first and second fitted models is
        /// passed to <c>CheckSameValues</c>.
        /// </summary>
        public void TestCountTargetEncodingWithNoise()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            // Column 0 is the Boolean label, column 1 a scalar string, columns 2-9 a string vector.
            TextLoader.Column[] schema =
            {
                new TextLoader.Column("ScalarString", DataKind.String, 1),
                new TextLoader.Column("VectorString", DataKind.String, 2, 9),
                new TextLoader.Column("Label", DataKind.Boolean, 0)
            };
            var data = ML.Data.LoadFromTextFile(dataPath, schema);

            // Every pipeline encodes the same two columns in place; a fresh array is built per pipeline.
            InputOutputColumnPair[] EncodedColumns() => new[]
            {
                new InputOutputColumnPair("ScalarString"),
                new InputOutputColumnPair("VectorString")
            };

            // Pipeline 1: encoding with laplaceScale set (the "noisy" variant).
            IReadOnlyList<float> noisyWeights = null;
            var noisyPipeline = ML.Transforms
                .CountTargetEncode(EncodedColumns(), "Label", CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6), laplaceScale: 1f)
                .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
                .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(t => noisyWeights = t.Model.Weights));

            TestEstimatorCore(noisyPipeline, data);
            var noisyModel = noisyPipeline.Fit(data);

            // Pipeline 2: same steps, laplaceScale left at its default.
            IReadOnlyList<float> cleanWeights = null;
            var cleanPipeline = ML.Transforms
                .CountTargetEncode(EncodedColumns(), "Label", CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6))
                .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
                .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(t => cleanWeights = t.Model.Weights));

            var cleanModel = cleanPipeline.Fit(data);

            // Pipeline 3: a repeat of pipeline 2, so that a difference between pipelines 1 and 2
            // cannot be attributed to run-to-run randomness.
            IReadOnlyList<float> cleanWeightsRepeat = null;
            var cleanPipelineRepeat = ML.Transforms
                .CountTargetEncode(EncodedColumns(), "Label", CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6))
                .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
                .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(t => cleanWeightsRepeat = t.Model.Weights));

            // Only the on-fit delegate's side effect is needed here; the fitted model itself is not used.
            cleanPipelineRepeat.Fit(data);

            // Pipelines 2 and 3 must learn identical weights; pipeline 1 must not match them.
            Assert.Equal(cleanWeights, cleanWeightsRepeat);
            Assert.NotEqual(noisyWeights, cleanWeights);

            var noisyOutput = noisyModel.Transform(data);
            var cleanOutput = cleanModel.Transform(data);

            // Restrict both outputs to "Features" with one shared selector before comparing them.
            // NOTE(review): presumably the outputs match because the noise only affects fitting - confirm.
            var featuresOnly = ML.Transforms.SelectColumns("Features").Fit(noisyOutput);
            CheckSameValues(featuresOnly.Transform(noisyOutput), featuresOnly.Transform(cleanOutput));

            Done();
        }
コード例 #2
0
        /// <summary>
        /// Fits a count target encoding on the "Text" column, then creates a second estimator from
        /// the resulting transformer, fits it on the same data, and passes both transformers to
        /// <c>CheckDoubleCounts</c>.
        /// </summary>
        public void TestSaveAndLoadExternalCounts()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            // Column 0 is a float label; columns 1-9 are read as one string vector named "Text".
            TextLoader.Column[] schema =
            {
                new TextLoader.Column("Label", DataKind.Single, 0),
                new TextLoader.Column("Text", DataKind.String, 1, 9)
            };
            var data = ML.Data.LoadFromTextFile(dataPath, schema);

            // First pass: counts are collected from scratch.
            var initialEstimator   = ML.Transforms.CountTargetEncode("Text", builder: CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6));
            var initialTransformer = initialEstimator.Fit(data);

            // Second pass: the estimator is created from the already fitted transformer.
            var seededEstimator   = ML.Transforms.CountTargetEncode("Text", initialTransformer);
            var seededTransformer = seededEstimator.Fit(data);

            // NOTE(review): judging by its name, the helper checks that the second transformer's
            // counts are double the first's; the meaning of the trailing 3 is not visible here - confirm.
            CheckDoubleCounts(data, initialTransformer, seededTransformer, 3);
        }
コード例 #3
0
        /// <summary>
        /// Multi-column variant of the external-counts test: fits a count target encoding on
        /// "ScalarString" and "VectorString", creates a second estimator for the same columns from
        /// the fitted transformer, fits it on the same data, and passes both transformers to
        /// <c>CheckDoubleCounts</c>.
        /// </summary>
        public void TestSaveAndLoadExternalCountsMultipleColumns()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            // Column 0 is a float label; column 1 is read both as a scalar and as the start of the 1-9 vector.
            TextLoader.Column[] schema =
            {
                new TextLoader.Column("ScalarString", DataKind.String, 1),
                new TextLoader.Column("VectorString", DataKind.String, 1, 9),
                new TextLoader.Column("Label", DataKind.Single, 0)
            };
            var data = ML.Data.LoadFromTextFile(dataPath, schema);

            // First pass: counts are collected from scratch for both columns.
            var initialEstimator = ML.Transforms.CountTargetEncode(
                new[]
                {
                    new InputOutputColumnPair("ScalarString"),
                    new InputOutputColumnPair("VectorString")
                },
                "Label",
                CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6));
            var initialTransformer = initialEstimator.Fit(data);

            // Second pass: the estimator is created from the already fitted transformer.
            var seededEstimator = ML.Transforms.CountTargetEncode(
                new[]
                {
                    new InputOutputColumnPair("ScalarString"),
                    new InputOutputColumnPair("VectorString")
                },
                initialTransformer);
            var seededTransformer = seededEstimator.Fit(data);

            // NOTE(review): the meaning of the trailing 2 and 5 is defined by CheckDoubleCounts,
            // which is not visible here - confirm against the helper.
            CheckDoubleCounts(data, initialTransformer, seededTransformer, 2, 5);
        }
コード例 #4
0
        /// <summary>
        /// Chains three count target encoding estimators that are configured differently (a
        /// count-table builder from <c>CreateCMCountTableBuilder</c> with explicit coefficients,
        /// a two-column encoder with <c>sharedTable: true</c>, and a builder from
        /// <c>CreateDictionaryCountTableBuilder</c>) and runs the chain through
        /// <c>TestEstimatorCore</c>.
        /// </summary>
        public void TestCountTargetEncodingEstimatorWithBuilders()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            // Column 0 is a float label; column 1 is read both as a scalar and as the start of the 1-9 vector.
            TextLoader.Column[] schema =
            {
                new TextLoader.Column("ScalarString", DataKind.String, 1),
                new TextLoader.Column("VectorString", DataKind.String, 1, 9),
                new TextLoader.Column("Label", DataKind.Single, 0)
            };
            var data = ML.Data.LoadFromTextFile(dataPath, schema);

            // Stage 1: VectorString -> VectorString1 with an explicit builder and non-default coefficients.
            var vectorEncoder = ML.Transforms.CountTargetEncode(
                "VectorString1",
                "VectorString",
                builder: CountTableBuilderBase.CreateCMCountTableBuilder(3, 1 << 10),
                priorCoefficient: 0.5f,
                laplaceScale: 0.1f,
                numberOfBits: 25);

            // Stage 2: both input columns encoded by one estimator with sharedTable enabled.
            var sharedTableEncoder = ML.Transforms.CountTargetEncode(
                new[]
                {
                    new InputOutputColumnPair("ScalarString1", "ScalarString"),
                    new InputOutputColumnPair("VectorString2", "VectorString")
                },
                "Label",
                sharedTable: true);

            // Stage 3: ScalarString -> ScalarString2 with a dictionary count-table builder.
            var dictionaryEncoder = ML.Transforms.CountTargetEncode(
                "ScalarString2",
                "ScalarString",
                builder: CountTableBuilderBase.CreateDictionaryCountTableBuilder(1));

            var pipeline = vectorEncoder.Append(sharedTableEncoder).Append(dictionaryEncoder);

            TestEstimatorCore(pipeline, data);
            Done();
        }