示例#1
0
        /// <summary>
        /// Normalizes a string according to the categories selected in <paramref name="option"/>.
        /// </summary>
        /// <remarks>
        /// "Normalization" here is an arbitrary, project-specific definition. As originally documented, it converts:
        /// <list type="bullet">
        /// <item><description>Roman numerals, kanji numerals, etc. → half-width digits</description></item>
        /// <item><description>kana characters → full-width hiragana</description></item>
        /// <item><description>half-width/full-width alphabet → half-width uppercase alphabet</description></item>
        /// </list>
        /// Enabled steps always run in this order: number, alphabet, kana, symbol.
        /// </remarks>
        /// <param name="text">The string to normalize; may be null or empty.</param>
        /// <param name="option">Flags choosing which normalization steps are applied.</param>
        /// <returns>
        /// <paramref name="text"/> itself when it is null or empty; otherwise the result of
        /// running every selected step over it.
        /// </returns>
        public static string NormalizeText(string text, NormalizeTextOption option)
        {
            // Null or empty input is handed back untouched.
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            var normalized = text;

            if ((option & NormalizeTextOption.Number) != 0)
            {
                // KanjiDigit is only consulted when number normalization is enabled.
                normalized = NormalizeNumber(
                    normalized,
                    option.HasFlag(NormalizeTextOption.KanjiDigit));
            }

            if ((option & NormalizeTextOption.Alphabet) != 0)
            {
                normalized = NormalizeAlphabet(normalized);
            }

            if ((option & NormalizeTextOption.Kana) != 0)
            {
                normalized = NormalizeKana(normalized);
            }

            if ((option & NormalizeTextOption.Symbol) != 0)
            {
                normalized = NormalizeSymbol(normalized);
            }

            return normalized;
        }
示例#2
0
        /// <summary>
        /// Applies the normalization steps enabled in <paramref name="option"/> to a string.
        /// </summary>
        /// <remarks>
        /// The notion of "normalization" used here is arbitrary. Per the original documentation:
        /// - Roman numerals / kanji numerals and the like become half-width digits.
        /// - Kana characters become full-width hiragana.
        /// - Half-width / full-width alphabet characters become half-width uppercase letters.
        /// The steps are applied in a fixed order: number, alphabet, kana, then symbol.
        /// </remarks>
        /// <param name="text">Input string; a null or empty value is returned as-is.</param>
        /// <param name="option">Bit flags selecting the steps to run.</param>
        /// <returns>The normalized string, or the unchanged input when it is null or empty.</returns>
        public static string NormalizeText(string text, NormalizeTextOption option)
        {
            // Only non-empty input is processed; everything else falls through unchanged.
            if (!string.IsNullOrEmpty(text))
            {
                var wantsNumber   = (option & NormalizeTextOption.Number) != 0;
                var wantsAlphabet = (option & NormalizeTextOption.Alphabet) != 0;
                var wantsKana     = (option & NormalizeTextOption.Kana) != 0;
                var wantsSymbol   = (option & NormalizeTextOption.Symbol) != 0;

                if (wantsNumber)
                {
                    var includeKanjiDigits = option.HasFlag(NormalizeTextOption.KanjiDigit);
                    text = NormalizeNumber(text, includeKanjiDigits);
                }

                if (wantsAlphabet)
                {
                    text = NormalizeAlphabet(text);
                }

                if (wantsKana)
                {
                    text = NormalizeKana(text);
                }

                if (wantsSymbol)
                {
                    text = NormalizeSymbol(text);
                }
            }

            return text;
        }
示例#3
0
        /// <summary>
        /// Sample entry point: loads the train and test TSV files, builds a sweepable
        /// text-classification pipeline, runs an AutoML experiment over it, and prints the
        /// best model's validation score and its accuracy on the test set.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var mlContext = new MLContext();
            mlContext.Log += Context_Log;

            // Load the training and test data; both files carry a header row.
            var trainingData = mlContext.Data.LoadFromTextFile<ModelInput>(
                @".\datasets\wikipedia-detox-250-line-data-train.tsv",
                hasHeader: true);
            var testData = mlContext.Data.LoadFromTextFile<ModelInput>(
                @".\datasets\wikipedia-detox-250-line-test.tsv",
                hasHeader: true);

            var normalizerOptions = new NormalizeTextOption();
            var embeddingOptions  = new ApplyWordEmbeddingOption();

            // Pipeline: sweepable text normalizer -> tokenizer -> stop-word removal
            //           -> sweepable word embedding -> one of two candidate trainers.
            var pipeline = mlContext.AutoML()
                .CreateSweepableEstimator(
                    // Factory invoked with the swept option values for the NormalizeText transformer.
                    (ctx, opt) => ctx.Transforms.Text.NormalizeText(
                        opt.OutputColumnName,
                        opt.InputColumnName,
                        opt.CaseMode,
                        opt.KeepDiacritics,
                        opt.KeepPunctuations,
                        opt.KeepNumbers),
                    normalizerOptions,
                    new[] { "SentimentText" },
                    new[] { "txt" },
                    nameof(TextNormalizingEstimator))
                .Append(mlContext.Transforms.Text.TokenizeIntoWords("txt", "txt"))
                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("txt", "txt"))
                .Append(mlContext.AutoML().CreateSweepableEstimator(
                    // Factory invoked with the swept option values for the ApplyWordEmbedding transformer.
                    (ctx, opt) => ctx.Transforms.Text.ApplyWordEmbedding(
                        opt.outputColumnName,
                        opt.inputColumnName,
                        opt.ModelKind),
                    embeddingOptions,
                    new[] { "txt" },
                    new[] { "txt" },
                    nameof(WordEmbeddingEstimator)))
                .Append(
                    // Candidate trainers: SdcaLogisticRegression and FastForest.
                    mlContext.AutoML().BinaryClassification.SdcaLogisticRegression("Sentiment", "txt"),
                    mlContext.AutoML().BinaryClassification.FastForest("Sentiment", "txt"));

            var experimentSettings = new Experiment.Option()
            {
                // Each candidate is scored by its non-calibrated binary-classification accuracy.
                EvaluateFunction = (MLContext evalContext, IDataView scoredData) =>
                    evalContext.BinaryClassification.EvaluateNonCalibrated(scoredData, "Sentiment").Accuracy,
                MaximumTrainingTime       = 60 * 60, // NOTE(review): presumably seconds (one hour) - confirm
                ParameterSweeperIteration = 100,
            };

            var experiment = mlContext.AutoML().CreateExperiment(pipeline, experimentSettings);

            // NOTE(review): 0.1f is presumably the validation split fraction - confirm against TrainAsync.
            // Main is synchronous, so the training task is awaited by blocking on its result.
            var trainingTask = experiment.TrainAsync(trainingData, 0.1f, new Reporter());
            var outcome      = trainingTask.Result;

            // Evaluate the best model on the held-out test set.
            var scoredTestData = outcome.BestModel.Transform(testData);
            var testMetrics    = mlContext.BinaryClassification.EvaluateNonCalibrated(scoredTestData, "Sentiment");

            Console.WriteLine($"best model validate score: {outcome.BestIteration.EvaluateScore}");
            Console.WriteLine($"best model test score: {testMetrics.Accuracy}");
        }