/// <summary>
/// Trains an SDCA logistic-regression binary model on the
/// "wikipedia-detox-250-line-data.tsv" test file and returns a scorer
/// built from the trained model over the featurized (non-cached) data.
/// </summary>
/// <returns>The scorer produced by <c>ScoreUtils.GetScorer</c>.</returns>
private static IDataScorerTransform _TrainSentiment()
{
    bool normalize = true;

    // Tab-separated file with a header row: column 0 is the label, column 1 the text.
    var loaderOptions = new TextLoader.Options()
    {
        Separators = new[] { '\t' },
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Boolean, 0),
            new TextLoader.Column("SentimentText", DataKind.String, 1)
        }
    };

    // Vector normalization applied to the text features: L2 when enabled, none otherwise.
    TextFeaturizingEstimator.NormFunction normFunction;
    if (normalize)
    {
        normFunction = TextFeaturizingEstimator.NormFunction.L2;
    }
    else
    {
        normFunction = TextFeaturizingEstimator.NormFunction.None;
    }

    var featurizerOptions = new TextFeaturizingEstimator.Options()
    {
        KeepDiacritics = false,
        KeepPunctuations = false,
        CaseMode = TextNormalizingEstimator.CaseMode.Lower,
        OutputTokensColumnName = "tokens",
        Norm = normFunction,
        // Character n-grams of length 3 only; word n-grams of every length up to 2.
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
    };

    var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");

    // The environment is not disposed in this method; it is passed on to the returned scorer.
    var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1);

    // Pipeline: load the file, then featurize the text.
    var rawData = new TextLoader(env, loaderOptions).Load(new MultiFileSource(trainFilename));
    var featurized = TextFeaturizingEstimator.Create(env, featurizerOptions, rawData);

    // Train on a cached copy of the featurized view.
    var sdca = new SdcaLogisticRegressionBinaryTrainer(env, new SdcaLogisticRegressionBinaryTrainer.Options
    {
        LabelColumnName = "Label",
        FeatureColumnName = "Features"
    });
    var cachedData = new Microsoft.ML.Data.CacheDataView(env, featurized, prefetch: null);
    var fitted = sdca.Fit(cachedData);

    // The training role mapping supplies the schema; scoring runs over the non-cached view.
    var trainingRoles = new RoleMappedData(cachedData, label: "Label", feature: "Features");
    var scoringRoles = new RoleMappedData(featurized, label: "Label", feature: "Features");
    return ScoreUtils.GetScorer(fitted.Model, scoringRoles, env, trainingRoles.Schema);
}
/// <summary>
/// Builds the caching pipeline on top of <paramref name="input"/>, driven by the
/// fields _inDataFrame, _async, _numThreads, _cacheFile, _reuse and _saverSettings.
/// In-memory mode wraps the input either in a <c>CacheDataView</c> behind a
/// <c>PassThroughTransform</c> (when _async is set) or in a <c>SortInDataFrameTransform</c>.
/// Otherwise the input is saved to _cacheFile (unless an existing file is reused)
/// and read back through a binary loader.
/// </summary>
/// <param name="env">Host environment used to create every component.</param>
/// <param name="input">Data view to cache.</param>
/// <returns>The transform that exposes the cached data.</returns>
protected IDataTransform CreatePipeline(IHostEnvironment env, IDataView input)
{
    if (_inDataFrame)
    {
        if (_async)
        {
            var cachedView = new CacheDataView(env, input, null);
            return new PassThroughTransform(env, new PassThroughTransform.Arguments(), cachedView);
        }

        var sortArgs = new SortInDataFrameTransform.Arguments() { numThreads = _numThreads, sortColumn = null };
        return new SortInDataFrameTransform(env, sortArgs, input);
    }

    // File-backed cache: optional thread setting forwarded to the binary loader.
    string threadSetting = string.Empty;
    if (_numThreads > 0)
    {
        threadSetting = string.Format("{{t={0}}}", _numThreads);
    }

    using (var ch = Host.Start("Caching data..."))
    {
        // File.Exists is only evaluated when reuse was requested.
        bool reuseExisting = _reuse && File.Exists(_cacheFile);
        if (!reuseExisting)
        {
            ch.Info(MessageSensitivity.UserData, "Building cache '{0}'", _cacheFile);
            var saver = ComponentCreation.CreateSaver(env, _saverSettings);
            using (var cacheOutput = Host.CreateOutputFile(_cacheFile))
            {
                DataSaverUtils.SaveDataView(ch, saver, input, cacheOutput, true);
            }
        }
        else
        {
            ch.Info(MessageSensitivity.UserData, "Reusing cache '{0}'", _cacheFile);
        }
    }

    // NOTE(review): the format below wraps threadSetting in another pair of braces, giving
    // "binary{{t=N}}" when threads are set and "binary{}" otherwise - confirm the loader
    // settings parser accepts both forms.
    string loaderSettings = string.Format("binary{{{0}}}", threadSetting);
    var loader = ComponentCreation.CreateLoader(env, loaderSettings, new MultiFileSource(_cacheFile));

    // The cached file (possibly a reused one) must expose the same schema as the input.
    SchemaHelper.CheckSchema(Host, input.Schema, loader.Schema);

    return ComponentCreation.CreateTransform(env, "skip{s=0}", loader);
}
/// <summary>
/// Trains an SDCA binary model on the "wikipedia-detox-250-line-data.tsv"
/// test file and returns a scorer built from the trained model over the
/// featurized (non-cached) data.
/// </summary>
/// <returns>The scorer produced by <c>ScoreUtils.GetScorer</c>.</returns>
private static IDataScorerTransform _TrainSentiment()
{
    bool normalize = true;

    // Tab-separated file with a header row: column 0 is the label, column 1 the text.
    var loaderArgs = new TextLoader.Arguments()
    {
        Separator = "tab",
        HasHeader = true,
        Column = new[]
        {
            new TextLoader.Column("Label", DataKind.BL, 0),
            new TextLoader.Column("SentimentText", DataKind.Text, 1)
        }
    };

    // Vector normalization applied to the text features: L2 when enabled, none otherwise.
    TextFeaturizingEstimator.TextNormKind normKind;
    if (normalize)
    {
        normKind = TextFeaturizingEstimator.TextNormKind.L2;
    }
    else
    {
        normKind = TextFeaturizingEstimator.TextNormKind.None;
    }

    var featurizerArgs = new TextFeaturizingEstimator.Arguments()
    {
        // "SentimentText" is featurized into the "Features" column.
        Column = new TextFeaturizingEstimator.Column { Name = "Features", Source = new[] { "SentimentText" } },
        KeepDiacritics = false,
        KeepPunctuations = false,
        TextCase = TextNormalizingEstimator.CaseNormalizationMode.Lower,
        OutputTokens = true,
        UsePredefinedStopWordRemover = true,
        VectorNormalizer = normKind,
        // Character n-grams of length 3 only; word n-grams of every length up to 2.
        CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false },
        WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 2, AllLengths = true },
    };

    var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");

    // NOTE(review): env is disposed when this using block exits, yet the scorer returned
    // from inside it was built with env - confirm the scorer stays usable afterwards.
    using (var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1))
    {
        // Pipeline: read the file, then featurize the text.
        var rawData = new TextLoader(env, loaderArgs).Read(new MultiFileSource(trainFilename));
        var featurized = TextFeaturizingEstimator.Create(env, featurizerArgs, rawData);

        // Train single-threaded on a cached copy of the featurized view.
        var sdca = new SdcaBinaryTrainer(env, new SdcaBinaryTrainer.Arguments
        {
            NumThreads = 1,
            LabelColumn = "Label",
            FeatureColumn = "Features"
        });
        var cachedData = new Microsoft.ML.Data.CacheDataView(env, featurized, prefetch: null);
        var fitted = sdca.Fit(cachedData);

        // The training role mapping supplies the schema; scoring runs over the non-cached view.
        var trainingRoles = new RoleMappedData(cachedData, label: "Label", feature: "Features");
        var scoringRoles = new RoleMappedData(featurized, label: "Label", feature: "Features");
        return ScoreUtils.GetScorer(fitted.Model, scoringRoles, env, trainingRoles.Schema);
    }
}