Example #1
0
        /// <summary>
        /// Trains a <c>WordFeatureExtractor</c>-based classifier on the labeled corpus found under the
        /// "Gender" folder of the configured <c>MachineLearning:dataDir</c>, classifies the sample
        /// text "Bridget", then retrains on the shuffled corpus minus its first 2000 entries and
        /// measures accuracy on those first 2000 entries.
        /// </summary>
        /// <remarks>
        /// The accuracy value is computed but not asserted, and the result of classifying the
        /// sample text is discarded, so this method only fails if one of the calls throws.
        /// </remarks>
        public void GenderTest()
        {
            const int holdOutSize = 2000;

            var classifyOptions = new ClassifyOptions
            {
                TrainingCorpusDir = Path.Combine(Configuration.GetValue<string>("MachineLearning:dataDir"), "Gender")
            };
            var genderClassifier = new ClassifierFactory<WordFeatureExtractor>(classifyOptions, SupportedLanguage.English);

            var labeledCorpus = GetLabeledCorpus(classifyOptions);

            var wordTokenizer = new TokenizerFactory(
                new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC },
                SupportedLanguage.English);

            // Return value is intentionally ignored, exactly as before; presumably this call
            // selects the tokenizer implementation used by Tokenize below (verify against TokenizerFactory).
            wordTokenizer.GetTokenizer<RegexTokenizer>();

            // Attach tokens to every corpus entry before the first training pass.
            foreach (var entry in labeledCorpus)
            {
                entry.Words = wordTokenizer.Tokenize(entry.Text);
            }

            // First pass: train on the complete corpus and classify a single sample.
            genderClassifier.Train(labeledCorpus);

            const string sampleText = "Bridget";
            var sampleSentence = new Sentence
            {
                Text  = sampleText,
                Words = wordTokenizer.Tokenize(sampleText)
            };
            genderClassifier.Classify(sampleSentence);

            // Second pass: shuffle, train on everything after the hold-out slice...
            labeledCorpus.Shuffle();
            var trainingSet = labeledCorpus.Skip(holdOutSize).ToList();

            genderClassifier.Train(trainingSet);

            // ...and evaluate on the hold-out slice itself.
            var heldOut = labeledCorpus.Take(holdOutSize).ToList();
            int hits = 0;

            foreach (var sample in heldOut)
            {
                var ranked = genderClassifier.Classify(sample);
                if (sample.Label == ranked[0].Item1)
                {
                    hits++;
                }
            }

            float accuracy = (float)hits / heldOut.Count;
        }
Example #2
0
        /// <summary>
        /// Reads <c>cooking.stackexchange.txt</c> from the "Text Classification/cooking.stackexchange"
        /// folder under the configured <c>MachineLearning:dataDir</c>, tokenizes the sentences,
        /// shuffles them, trains a <c>SentenceFeatureExtractor</c>-based classifier on the first
        /// part of a 0.7 split and asserts that accuracy on the second part is above 0.5.
        /// </summary>
        public void CookingTest()
        {
            var dataReader = new FasttextDataReader();
            var sentences  = dataReader.Read(new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue<string>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            });

            var treebankTokenizer = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);

            // Return value is ignored, as in the original call sequence.
            treebankTokenizer.GetTokenizer<TreebankTokenizer>();

            var rawTexts  = sentences.Select(s => s.Text).ToList();
            var tokenized = treebankTokenizer.Tokenize(rawTexts);

            // Copy each label from the source sentence at the same index onto the tokenized sentence.
            for (int index = 0; index < tokenized.Count; index++)
            {
                tokenized[index].Label = sentences[index].Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            var classifyOptions = new ClassifyOptions
            {
                ModelFilePath     = Path.Combine(Configuration.GetValue<string>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange", "nb.model"),
                TrainingCorpusDir = Path.Combine(Configuration.GetValue<string>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                Dimension         = 100
            };
            var sentenceClassifier = new ClassifierFactory<SentenceFeatureExtractor>(classifyOptions, SupportedLanguage.English);

            // Item1 is used for training, Item2 for evaluation.
            var split = sentences.Split(0.7m);

            sentenceClassifier.Train(split.Item1);

            int hits     = 0;
            int examined = 0;

            foreach (var sample in split.Item2)
            {
                var ranked = sentenceClassifier.Classify(sample);
                if (sample.Label == ranked[0].Item1)
                {
                    hits++;
                }
                examined++;
            }

            float accuracy = (float)hits / examined;

            Assert.IsTrue(accuracy > 0.5);
        }
        /// <summary>
        /// Reads <c>train.csv</c> through <c>KaggleTextDataReader</c>, tokenizes the sentences,
        /// shuffles them, takes 2000 of them and splits that sample 0.7, trains the
        /// "NaiveBayesClassifier" on the first part and asserts that accuracy on the second
        /// part is above 0.5.
        /// </summary>
        public void SpookyAuthorIdentification()
        {
            const int sampleSize = 2000;

            var dataReader = new KaggleTextDataReader();
            var sentences  = dataReader.Read(new ReaderOptions { FileName = "train.csv" });

            var treebankTokenizer = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);

            // Return value is ignored, as in the original call sequence.
            treebankTokenizer.GetTokenizer<TreebankTokenizer>();

            var rawTexts  = sentences.Select(s => s.Text).ToList();
            var tokenized = treebankTokenizer.Tokenize(rawTexts);

            // Carry the id and label of each source sentence over to the tokenized sentence at the same index.
            for (int index = 0; index < tokenized.Count; index++)
            {
                var source = sentences[index];
                var target = tokenized[index];

                target.Id    = source.Id;
                target.Label = source.Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            // Item1 is used for training, Item2 for evaluation.
            var split = sentences.Take(sampleSize).ToList().Split(0.7m);

            var classifyOptions = new ClassifyOptions
            {
                ModelDir      = AppContext.BaseDirectory,
                ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
                Dimension     = 300
            };
            var sentenceClassifier = new ClassifierFactory<SentenceFeatureExtractor>(classifyOptions, SupportedLanguage.English);

            sentenceClassifier.GetClassifer("NaiveBayesClassifier");
            sentenceClassifier.Train(split.Item1);

            int hits     = 0;
            int examined = 0;

            foreach (var sample in split.Item2)
            {
                var ranked = sentenceClassifier.Classify(sample);
                if (sample.Label == ranked[0].Item1)
                {
                    hits++;
                }
                examined++;
            }

            float accuracy = (float)hits / examined;

            Assert.IsTrue(accuracy > 0.5);
        }
Example #4
0
        /// <summary>
        /// Initializes this component from <paramref name="meta"/>, converts every sentence of
        /// <paramref name="doc"/> into a labeled <c>Sentence</c> (intent label, text and tokens)
        /// and trains <c>_classifier</c> on the result.
        /// </summary>
        /// <param name="agent">Not used by this method.</param>
        /// <param name="doc">Document whose sentences supply the training examples.</param>
        /// <param name="meta">Pipe metadata forwarded to <c>Init</c>.</param>
        /// <returns>A task whose result is always <c>true</c>.</returns>
        /// <remarks>
        /// The body contains no <c>await</c>, so all work runs synchronously before the task is returned.
        /// </remarks>
        public async Task <bool> Train(AgentBase agent, NlpDoc doc, PipeModel meta)
        {
            Init(meta);

            var trainingSentences = (from sentence in doc.Sentences
                                     select new Sentence
                                     {
                                         Label = sentence.Intent.Label,
                                         Text  = sentence.Text,
                                         Words = sentence.Tokens
                                     }).ToList();

            _classifier.Train(trainingSentences);

            // Only the model directory is reported here; persistence itself is not visible in this method.
            Console.WriteLine($"Saved model to {Settings.ModelDir}");

            return true;
        }
Example #5
0
        /// <summary>
        /// Sets <c>meta.Model</c> to "classification-nb.model", builds a
        /// <c>NaiveBayesClassifier</c> / <c>SentenceFeatureExtractor</c> classifier whose model file
        /// path is that name under <c>Settings.ModelDir</c>, and trains it on the sentences of
        /// <paramref name="doc"/> (intent label, text and tokens).
        /// </summary>
        /// <param name="agent">Not used by this method.</param>
        /// <param name="doc">Document whose sentences supply the training examples.</param>
        /// <param name="meta">Pipe metadata; its <c>Model</c> property is overwritten with the model file name.</param>
        /// <returns>A task whose result is always <c>true</c>.</returns>
        /// <remarks>
        /// The body contains no <c>await</c>, so all work runs synchronously before the task is returned.
        /// </remarks>
        public async Task <bool> Train(Agent agent, NlpDoc doc, PipeModel meta)
        {
            meta.Model = "classification-nb.model";
            string modelPath = Path.Combine(Settings.ModelDir, meta.Model);

            var classifyOptions = new ClassifyOptions { ModelFilePath = modelPath };
            var bayesClassifier = new ClassifierFactory<NaiveBayesClassifier, SentenceFeatureExtractor>(
                classifyOptions,
                SupportedLanguage.English);

            var trainingSentences = (from sentence in doc.Sentences
                                     select new Sentence
                                     {
                                         Label = sentence.Intent.Label,
                                         Text  = sentence.Text,
                                         Words = sentence.Tokens
                                     }).ToList();

            bayesClassifier.Train(trainingSentences);

            Console.WriteLine($"Saved model to {modelPath}");

            return true;
        }