/// <summary>
/// Fits the same count-target-encoding + averaged-perceptron pipeline three times: once with
/// <c>laplaceScale: 1f</c> (the "noise" run) and twice without it. It then asserts that the two
/// noise-free runs learn equal weights, that the noisy run learns different weights, and passes
/// the "Features" column of the noisy and noise-free transformed data to CheckSameValues.
/// </summary>
public void TestCountTargetEncodingWithNoise()
{
    var dataPath = GetDataPath("breast-cancer.txt");
    var data = ML.Data.LoadFromTextFile(dataPath, new[]
    {
        new TextLoader.Column("ScalarString", DataKind.String, 1),
        new TextLoader.Column("VectorString", DataKind.String, 2, 9),
        new TextLoader.Column("Label", DataKind.Boolean, 0)
    });

    // Pipeline 1: count tables built with laplaceScale: 1f. The on-fit delegate captures the trained weights.
    IReadOnlyList<float> weights = null;
    var estimator = ML.Transforms.CountTargetEncode(
            new[] { new InputOutputColumnPair("ScalarString"), new InputOutputColumnPair("VectorString") },
            "Label",
            CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6),
            laplaceScale: 1f)
        .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
        .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(x => weights = x.Model.Weights));

    TestEstimatorCore(estimator, data);
    var model = estimator.Fit(data);

    // Pipeline 2: identical, except that laplaceScale is left at its default.
    IReadOnlyList<float> weightsNoNoise = null;
    estimator = ML.Transforms.CountTargetEncode(
            new[] { new InputOutputColumnPair("ScalarString"), new InputOutputColumnPair("VectorString") },
            "Label",
            CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6))
        .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
        .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(x => weightsNoNoise = x.Model.Weights));
    var modelNoNoise = estimator.Fit(data);

    // Fit another pipeline just to make sure that the difference between the first and the second
    // model doesn't come from randomness.
    IReadOnlyList<float> weightsNoNoise2 = null;
    estimator = ML.Transforms.CountTargetEncode(
            new[] { new InputOutputColumnPair("ScalarString"), new InputOutputColumnPair("VectorString") },
            "Label",
            CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6))
        .Append(ML.Transforms.Concatenate("Features", "ScalarString", "VectorString"))
        .Append(ML.BinaryClassification.Trainers.AveragedPerceptron().WithOnFitDelegate(x => weightsNoNoise2 = x.Model.Weights));

    // The fitted model itself is not needed; Fit is called only so the delegate above populates weightsNoNoise2.
    estimator.Fit(data);

    // The second and third models should be identical, and the first one different, because it
    // was trained with noise.
    Assert.Equal(weightsNoNoise, weightsNoNoise2);
    Assert.NotEqual(weights, weightsNoNoise);

    // Compare only the "Features" column of the data transformed by the noisy and noise-free models.
    var transformed = model.Transform(data);
    var transformedNoNoise = modelNoNoise.Transform(data);

    var select = ML.Transforms.SelectColumns("Features").Fit(transformed);
    transformed = select.Transform(transformed);
    transformedNoNoise = select.Transform(transformedNoNoise);
    CheckSameValues(transformed, transformedNoNoise);

    Done();
}
/// <summary>
/// Fits a count target encoder on the "Text" column, then fits a second encoder that is given the
/// first fitted transformer, and hands both fitted transformers to CheckDoubleCounts.
/// </summary>
public void TestSaveAndLoadExternalCounts()
{
    var schema = new[]
    {
        new TextLoader.Column("Label", DataKind.Single, 0),
        new TextLoader.Column("Text", DataKind.String, 1, 9)
    };
    var data = ML.Data.LoadFromTextFile(GetDataPath("breast-cancer.txt"), schema);

    // First pass: build the counts from scratch with a count table builder.
    var tableBuilder = CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6);
    var initialEstimator = ML.Transforms.CountTargetEncode("Text", builder: tableBuilder);
    var transformer = initialEstimator.Fit(data);

    // Second pass: the estimator receives the already-fitted transformer and is fit on the same data.
    var followUpEstimator = ML.Transforms.CountTargetEncode("Text", transformer);
    var transformer1 = followUpEstimator.Fit(data);

    CheckDoubleCounts(data, transformer, transformer1, 3);
}
/// <summary>
/// Multi-column variant: fits a count target encoder over "ScalarString" and "VectorString",
/// fits a second encoder over the same columns that is given the first fitted transformer,
/// and hands both fitted transformers to CheckDoubleCounts.
/// </summary>
public void TestSaveAndLoadExternalCountsMultipleColumns()
{
    var schema = new[]
    {
        new TextLoader.Column("ScalarString", DataKind.String, 1),
        new TextLoader.Column("VectorString", DataKind.String, 1, 9),
        new TextLoader.Column("Label", DataKind.Single, 0)
    };
    var data = ML.Data.LoadFromTextFile(GetDataPath("breast-cancer.txt"), schema);

    // First pass: build the counts from scratch with a count table builder.
    var initialColumns = new[]
    {
        new InputOutputColumnPair("ScalarString"),
        new InputOutputColumnPair("VectorString")
    };
    var tableBuilder = CountTableBuilderBase.CreateCMCountTableBuilder(2, 1 << 6);
    var initialEstimator = ML.Transforms.CountTargetEncode(initialColumns, "Label", tableBuilder);
    var transformer = initialEstimator.Fit(data);

    // Second pass: same columns (a fresh array instance), given the already-fitted transformer.
    var followUpColumns = new[]
    {
        new InputOutputColumnPair("ScalarString"),
        new InputOutputColumnPair("VectorString")
    };
    var followUpEstimator = ML.Transforms.CountTargetEncode(followUpColumns, transformer);
    var transformer1 = followUpEstimator.Fit(data);

    CheckDoubleCounts(data, transformer, transformer1, 2, 5);
}
/// <summary>
/// Chains three count target encoders (one using a CM count table builder, one using
/// <c>sharedTable: true</c> over two columns, one using a dictionary count table builder)
/// and runs the resulting estimator through TestEstimatorCore.
/// </summary>
public void TestCountTargetEncodingEstimatorWithBuilders()
{
    var schema = new[]
    {
        new TextLoader.Column("ScalarString", DataKind.String, 1),
        new TextLoader.Column("VectorString", DataKind.String, 1, 9),
        new TextLoader.Column("Label", DataKind.Single, 0)
    };
    var data = ML.Data.LoadFromTextFile(GetDataPath("breast-cancer.txt"), schema);

    // Stage 1: "VectorString" -> "VectorString1" with a CM count table builder and explicit options.
    var cmEncoder = ML.Transforms.CountTargetEncode(
        "VectorString1",
        "VectorString",
        builder: CountTableBuilderBase.CreateCMCountTableBuilder(3, 1 << 10),
        priorCoefficient: 0.5f,
        laplaceScale: 0.1f,
        numberOfBits: 25);

    // Stage 2: two columns encoded with sharedTable: true.
    var sharedTableEncoder = ML.Transforms.CountTargetEncode(
        new[]
        {
            new InputOutputColumnPair("ScalarString1", "ScalarString"),
            new InputOutputColumnPair("VectorString2", "VectorString")
        },
        "Label",
        sharedTable: true);
    var firstTwoStages = cmEncoder.Append(sharedTableEncoder);

    // Stage 3: "ScalarString" -> "ScalarString2" with a dictionary count table builder.
    var dictionaryEncoder = ML.Transforms.CountTargetEncode(
        "ScalarString2",
        "ScalarString",
        builder: CountTableBuilderBase.CreateDictionaryCountTableBuilder(1));
    var estimator = firstTwoStages.Append(dictionaryEncoder);

    TestEstimatorCore(estimator, data);
    Done();
}