/// <summary>
/// Pipeline predict step: records this component as the document's tokenizer
/// and tokenizes every sentence, mirroring what the train step does.
/// </summary>
/// <param name="agent">Owning agent (not used by this step).</param>
/// <param name="doc">Document whose sentences are tokenized in place.</param>
/// <param name="meta">Pipeline model metadata (not used by this step).</param>
/// <returns>A completed task carrying <c>true</c> (always succeeds).</returns>
public Task<bool> Predict(Agent agent, NlpDoc doc, PipeModel meta)
{
    doc.Tokenizer = this;

    // Same tokenization as in training, applied to each sentence.
    doc.Sentences.ForEach(snt =>
    {
        snt.Tokens = _tokenizer.Tokenize(snt.Text);
    });

    // No awaited work here, so avoid the `async` state machine (CS1998)
    // and return a completed task directly.
    return Task.FromResult(true);
}
// Example #2
        public void TestVectorizer()
        {
            // Tokenize the shared corpus with the Treebank tokenizer.
            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Vectorize every sentence with the count-based feature extractor.
            var extractor = new CountFeatureExtractor();
            extractor.Sentences = tokenizer.Tokenize(Corpus());
            extractor.Vectorize(new List<string>());

            var expected = Vectors();

            // Each feature found in a sentence must carry the expected count.
            for (int row = 0; row < extractor.Sentences.Count; row++)
            {
                var sentence = extractor.Sentences[row];

                for (int col = 0; col < extractor.Features.Count; col++)
                {
                    var feature = extractor.Features[col];
                    var word = sentence.Words.Find(w => w.Lemma == feature);

                    if (word != null)
                    {
                        Assert.IsTrue(word.Vector == expected[row][col]);
                    }
                }
            }
        }
// Example #3
        public void TriGramInCoNLL2000()
        {
            // Tokenize with the word + punctuation regex pattern.
            var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Tag with a tri-gram tagger over the tagged corpus; "NN" is the fallback tag.
            var tagger = new TaggerFactory<NGramTagger>(new TagOptions
            {
                NGram  = 3,
                Tag    = "NN",
                Corpus = GetTaggedCorpus()
            }, SupportedLanguage.English);

            tagger.Tag(new Sentence { Words = tokens });

            // Spot-check the leading POS tags.
            var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedTags.Length; i++)
            {
                Assert.IsTrue(tokens[i].Pos == expectedTags[i]);
            }
        }
        public void ReplaceConventionsIncludeMultipleSymbol()
        {
            // Treebank tokenization of a sentence containing two terminators.
            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            var tokens = tokenizer.Tokenize("I jump. And you?");

            // Expected token text and start offset, in order.
            var texts  = new[] { "I", "jump", ".", "And", "you", "?" };
            var starts = new[] { 0, 2, 6, 8, 12, 15 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
// Example #5
        public void ReplaceConventions()
        {
            // Treebank tokenization expands "cannot" into "can" + "not".
            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            var tokens = tokenizer.Tokenize("I cannot jump.");

            // Expected token text and start offset, in order.
            var texts  = new[] { "I", "can", "not", "jump", "." };
            var starts = new[] { 0, 2, 5, 9, 13 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
        public void OneHotTest()
        {
            // Load the fastText-format cooking corpus from the configured data dir.
            var reader    = new FasttextDataReader();
            var sentences = reader.Read(new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            });

            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Re-tokenize the raw texts and carry each label over by position.
            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int i = 0; i < tokenized.Count; i++)
            {
                tokenized[i].Label = sentences[i].Label;
            }
            sentences = tokenized.ToList();

            // One-hot encode the whole corpus; just exercises EncodeAll.
            var encoder = new OneHotEncoder { Sentences = sentences };
            encoder.EncodeAll();
        }
        public void TriGramInCoNLL2000()
        {
            // Tokenize with the word + punctuation regex pattern.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);
            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Tri-gram tagging over the configured corpus dir; "NN" is the fallback tag.
            var tagger = new TaggerFactory(new TagOptions
            {
                CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
                NGram     = 3,
                Tag       = "NN"
            }, SupportedLanguage.English);
            tagger.GetTagger<NGramTagger>();

            tagger.Tag(new Sentence { Words = tokens });

            // Spot-check the leading POS tags.
            var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedTags.Length; i++)
            {
                Assert.IsTrue(tokens[i].Pos == expectedTags[i]);
            }
        }
// Example #8
        public void TokenizeInWhiteSpace()
        {
            // Whitespace-only splitting: punctuation stays attached to the words.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WHITE_SPACE
            }, SupportedLanguage.English);
            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

            // Expected start offset and text for each token, in order.
            var starts = new[] { 0, 5, 10, 18, 24 };
            var texts  = new[] { "Chop", "into", "pieces,", "isn't", "it?" };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Start == starts[i]);
                Assert.IsTrue(tokens[i].Text == texts[i]);
            }
        }
// Example #9
        public void GenderTest()
        {
            // Classifier setup for the name -> gender task.
            var options = new ClassifyOptions
            {
                TrainingCorpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Gender")
            };
            var classifier = new ClassifierFactory<WordFeatureExtractor>(options, SupportedLanguage.English);

            var corpus = GetLabeledCorpus(options);

            // Word/punctuation tokenization for every labelled sentence.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);
            tokenizer.GetTokenizer<RegexTokenizer>();

            corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text));

            // First pass: train on the full corpus and classify a single name.
            classifier.Train(corpus);
            string text = "Bridget";
            classifier.Classify(new Sentence { Text = text, Words = tokenizer.Tokenize(text) });

            // Second pass: shuffle, hold out the first 2000 samples for evaluation.
            corpus.Shuffle();
            var trainingData = corpus.Skip(2000).ToList();
            classifier.Train(trainingData);

            var testData = corpus.Take(2000).ToList();

            // Count held-out samples whose top prediction matches the label.
            int correct = testData.Count(td => td.Label == classifier.Classify(td)[0].Item1);

            // NOTE(review): accuracy is computed but never asserted in the original.
            var accuracy = (float)correct / testData.Count;
        }
// Example #10
        public void CookingTest()
        {
            // Load the fastText-format cooking corpus.
            var reader    = new FasttextDataReader();
            var sentences = reader.Read(new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            });

            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Re-tokenize the raw texts and carry each label over by position.
            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int i = 0; i < tokenized.Count; i++)
            {
                tokenized[i].Label = sentences[i].Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            var options = new ClassifyOptions
            {
                ModelFilePath     = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange", "nb.model"),
                TrainingCorpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                Dimension         = 100
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            // 70/30 train/test split.
            var dataset = sentences.Split(0.7M);
            classifier.Train(dataset.Item1);

            // Count test samples whose top prediction matches the label.
            var evaluation = dataset.Item2;
            int correct = evaluation.Count(td => td.Label == classifier.Classify(td)[0].Item1);

            var accuracy = (float)correct / evaluation.Count;

            // Better-than-chance sanity bar for a multi-class problem.
            Assert.IsTrue(accuracy > 0.5);
        }
        public void SpookyAuthorIdentification()
        {
            // Load the Kaggle-format training corpus.
            var reader    = new KaggleTextDataReader();
            var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" });

            var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Re-tokenize and carry each sentence's id and label over by position.
            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int i = 0; i < tokenized.Count; i++)
            {
                tokenized[i].Id    = sentences[i].Id;
                tokenized[i].Label = sentences[i].Label;
            }
            sentences = tokenized.ToList();

            // Evaluate on a 2000-sample subset, split 70/30.
            sentences.Shuffle();
            var dataset = sentences.Take(2000).ToList().Split(0.7M);

            var options = new ClassifyOptions
            {
                ModelDir      = AppContext.BaseDirectory,
                ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
                Dimension     = 300
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            classifier.GetClassifer("NaiveBayesClassifier");
            classifier.Train(dataset.Item1);

            // Count test samples whose top prediction matches the label.
            var evaluation = dataset.Item2;
            int correct = evaluation.Count(td => td.Label == classifier.Classify(td)[0].Item1);

            var accuracy = (float)correct / evaluation.Count;

            // Better-than-chance sanity bar for the three-author problem.
            Assert.IsTrue(accuracy > 0.5);
        }
// Example #12
        public void TagInCoNLL2000()
        {
            // Tokenize a short question with the default regex tokenizer options.
            var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("How are you doing?");

            // Tag every token with the default tagger; "NN" is the fallback tag.
            var tagOptions = new TagOptions { Tag = "NN" };
            var tagger     = new TaggerFactory<DefaultTagger>(tagOptions, SupportedLanguage.English);

            tagger.Tag(new Sentence { Words = tokens });
        }
// Example #13
        public void ReplacePunctuation()
        {
            // Treebank tokenization keeps an ellipsis as a single token.
            var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("Hello World...");

            // Expected token text and start offset, in order.
            var texts  = new[] { "Hello", "World", "..." };
            var starts = new[] { 0, 6, 11 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
// Example #14
        public void ReplaceEndingQuoting()
        {
            // Treebank tokenization splits the contraction "Aren't" into "Are" + "n't".
            var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("Aren't you");

            // Expected token text and start offset, in order.
            var texts  = new[] { "Are", "n't", "you" };
            var starts = new[] { 0, 3, 7 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
// Example #15
        public void ReplaceStartingQuoting()
        {
            // Treebank tokenization separates an opening guillemet from the word.
            var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("«Hello!");

            // Expected token text and start offset, in order.
            var texts  = new[] { "«", "Hello", "!" };
            var starts = new[] { 0, 1, 6 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
        public void UniGramInCoNLL2000()
        {
            // Tokenize with the word + punctuation regex pattern.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Uni-gram tagging over the configured corpus dir; "NN" is the fallback tag.
            var tagger = new TaggerFactory(new TagOptions
            {
                CorpusDir = Configuration.GetValue <String>("CherubNLP:dataDir"),
                NGram     = 1,
                Tag       = "NN"
            }, SupportedLanguage.English);

            tagger.GetTagger <NGramTagger>();

            // First call pays the model-loading cost.
            var watch = Stopwatch.StartNew();

            tagger.Tag(new Sentence {
                Words = tokens
            });
            watch.Stop();
            var elapsedMs1 = watch.ElapsedMilliseconds;

            // Spot-check the leading POS tags.
            Assert.IsTrue(tokens[0].Pos == "NNP");
            Assert.IsTrue(tokens[1].Pos == "IN");
            Assert.IsTrue(tokens[2].Pos == "DT");
            Assert.IsTrue(tokens[3].Pos == "NNP");

            // Second call should reuse the cached model rather than reloading it.
            watch = Stopwatch.StartNew();
            tagger.Tag(new Sentence {
                Words = tokens
            });
            watch.Stop();
            var elapsedMs2 = watch.ElapsedMilliseconds;

            // BUG FIX: the original asserted elapsedMs1 > elapsedMs2 * 100, which
            // fails spuriously when both calls finish within 0 ms on a fast machine
            // (0 > 0 is false) and is flaky under timer jitter. Asserting that the
            // cached second call is not slower keeps the intent (model loaded once)
            // without the brittle 100x ratio.
            Assert.IsTrue(elapsedMs1 >= elapsedMs2);
        }
// Example #17
        public void ReplaceBrackets()
        {
            // Treebank tokenization splits angle brackets off the enclosed word.
            var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("<Hello.>");

            // Expected token text and start offset, in order.
            var texts  = new[] { "<", "Hello", ".", ">" };
            var starts = new[] { 0, 1, 6, 7 };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == texts[i]);
                Assert.IsTrue(tokens[i].Start == starts[i]);
            }
        }
        // Splits input on blank lines: each non-empty paragraph becomes one token.
        // NOTE(review): the verbatim string literal's exact whitespace (including
        // any trailing space after "pieces," and the newline style) determines the
        // asserted Start offsets — do not reformat the literal.
        public void TokenizeInBlankLine()
        {
            var tokenizer = new TokenizerFactory <RegexTokenizer>(new TokenizationOptions
            {
                Pattern = RegexTokenizer.BLANK_LINE
            }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize(@"Chop into pieces, 

isn't

it?");

            // Each paragraph keeps its text and character offset in the source.
            Assert.IsTrue(tokens[0].Start == 0);
            Assert.IsTrue(tokens[0].Text == "Chop into pieces,");

            Assert.IsTrue(tokens[1].Start == 18);
            Assert.IsTrue(tokens[1].Text == "isn't");

            Assert.IsTrue(tokens[2].Start == 28);
            Assert.IsTrue(tokens[2].Text == "it?");
        }
// Example #19
        public void TokenizeInWordPunctuation()
        {
            // Word/punctuation splitting, with "n't" registered as a special word
            // so the contraction tokenizes as "is" + "n't" instead of "isn" + "'" + "t".
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern      = RegexTokenizer.WORD_PUNC,
                SpecialWords = new List<string> { "n't" }
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

            // Expected start offset and text for each token, in order.
            var starts = new[] { 0, 5, 10, 16, 18, 20, 24, 26 };
            var texts  = new[] { "Chop", "into", "pieces", ",", "is", "n't", "it", "?" };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Start == starts[i]);
                Assert.IsTrue(tokens[i].Text == texts[i]);
            }
        }
        public void TokenizeInWordPunctuation()
        {
            // Plain word/punctuation splitting: without special words, the
            // contraction "isn't" breaks into "isn" + "'" + "t".
            var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

            // Expected start offset and text for each token, in order.
            var starts = new[] { 0, 5, 10, 16, 18, 21, 22, 24, 26 };
            var texts  = new[] { "Chop", "into", "pieces", ",", "isn", "'", "t", "it", "?" };

            for (int i = 0; i < texts.Length; i++)
            {
                Assert.IsTrue(tokens[i].Start == starts[i]);
                Assert.IsTrue(tokens[i].Text == texts[i]);
            }
        }