/// <summary>
/// Normalizes a string.
/// </summary>
/// <remarks>
/// The definition of "normalization" here is deliberately arbitrary:
/// - Roman numerals / kanji numerals etc. -> half-width digits
/// - Kana characters -> full-width hiragana
/// - Half-/full-width alphabet -> half-width upper-case alphabet
/// </remarks>
/// <param name="text">Text to normalize; may be null or empty.</param>
/// <param name="option">Flags selecting which normalizations to apply.</param>
/// <returns>The normalized text, or <paramref name="text"/> unchanged when it is null or empty.</returns>
public static string NormalizeText(string text, NormalizeTextOption option)
{
    // Nothing to do for null/empty input.
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    // Use one flag-test style (bitwise AND) throughout; the original mixed
    // "(option & Flag) != 0" with "HasFlag", which read inconsistently and
    // HasFlag boxes the enum on older runtimes.
    if ((option & NormalizeTextOption.Number) != 0)
    {
        var kanjiDigit = (option & NormalizeTextOption.KanjiDigit) != 0;
        text = NormalizeNumber(text, kanjiDigit);
    }

    if ((option & NormalizeTextOption.Alphabet) != 0)
    {
        text = NormalizeAlphabet(text);
    }

    if ((option & NormalizeTextOption.Kana) != 0)
    {
        text = NormalizeKana(text);
    }

    if ((option & NormalizeTextOption.Symbol) != 0)
    {
        text = NormalizeSymbol(text);
    }

    return text;
}
/// <summary>
/// Normalizes a string.
/// </summary>
/// <remarks>
/// "Normalization" is defined arbitrarily here as:
/// - Roman numerals / kanji numerals etc. -> half-width digits
/// - Kana characters -> full-width hiragana
/// - Half-/full-width alphabet -> half-width upper-case alphabet
/// </remarks>
public static string NormalizeText(string text, NormalizeTextOption option)
{
    // No work required for null/empty input.
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    var result = text;

    if (option.HasFlag(NormalizeTextOption.Number))
    {
        result = NormalizeNumber(result, option.HasFlag(NormalizeTextOption.KanjiDigit));
    }

    if (option.HasFlag(NormalizeTextOption.Alphabet))
    {
        result = NormalizeAlphabet(result);
    }

    if (option.HasFlag(NormalizeTextOption.Kana))
    {
        result = NormalizeKana(result);
    }

    if (option.HasFlag(NormalizeTextOption.Symbol))
    {
        result = NormalizeSymbol(result);
    }

    return result;
}
// Entry point: trains a binary sentiment classifier over the wikipedia-detox
// sample dataset using ML.NET AutoML with sweepable text-featurization
// estimators, then evaluates the best model found on a held-out test set.
static async System.Threading.Tasks.Task Main(string[] args)
{
    var context = new MLContext();
    context.Log += Context_Log;

    // Load data.
    var trainDataset = context.Data.LoadFromTextFile<ModelInput>(
        @".\datasets\wikipedia-detox-250-line-data-train.tsv", hasHeader: true);
    var testDataset = context.Data.LoadFromTextFile<ModelInput>(
        @".\datasets\wikipedia-detox-250-line-test.tsv", hasHeader: true);

    var normalizeTextOption = new NormalizeTextOption();
    var applyWordEmbeddingOption = new ApplyWordEmbeddingOption();

    // Create pipeline.
    var pipeline = context.AutoML().CreateSweepableEstimator(
            // Create NormalizeText transformer and sweep over it.
            (context, option) =>
            {
                return context.Transforms.Text.NormalizeText(
                    option.OutputColumnName,
                    option.InputColumnName,
                    option.CaseMode,
                    option.KeepDiacritics,
                    option.KeepPunctuations,
                    option.KeepNumbers);
            },
            normalizeTextOption,
            new string[] { "SentimentText" },
            new string[] { "txt" },
            nameof(TextNormalizingEstimator))
        .Append(context.Transforms.Text.TokenizeIntoWords("txt", "txt"))
        .Append(context.Transforms.Text.RemoveDefaultStopWords("txt", "txt"))
        .Append(context.AutoML().CreateSweepableEstimator(
            // Create ApplyWordEmbedding transformer and sweep over it.
            (context, option) =>
            {
                return context.Transforms.Text.ApplyWordEmbedding(
                    option.outputColumnName,
                    option.inputColumnName,
                    option.ModelKind);
            },
            applyWordEmbeddingOption,
            new string[] { "txt" },
            new string[] { "txt" },
            nameof(WordEmbeddingEstimator)))
        .Append(
            // Use SdcaLogisticRegression and FastForest as candidate trainers.
            context.AutoML().BinaryClassification.SdcaLogisticRegression("Sentiment", "txt"),
            context.AutoML().BinaryClassification.FastForest("Sentiment", "txt"));

    var experimentOption = new Experiment.Option()
    {
        EvaluateFunction = (MLContext context, IDataView data) =>
        {
            return context.BinaryClassification.EvaluateNonCalibrated(data, "Sentiment").Accuracy;
        },
        MaximumTrainingTime = 60 * 60,
        ParameterSweeperIteration = 100,
    };

    var experiment = context.AutoML().CreateExperiment(pipeline, experimentOption);

    // Await the training rather than blocking on Task.Result: blocking risks
    // deadlocks on sync-context frameworks and wraps failures in
    // AggregateException instead of surfacing the original exception.
    var result = await experiment.TrainAsync(trainDataset, 0.1f, new Reporter());

    // Evaluate on the test split.
    var eval = result.BestModel.Transform(testDataset);
    var metric = context.BinaryClassification.EvaluateNonCalibrated(eval, "Sentiment");
    Console.WriteLine($"best model validate score: {result.BestIteration.EvaluateScore}");
    Console.WriteLine($"best model test score: {metric.Accuracy}");
}