Ejemplo n.º 1
0
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "I cannot jump." into the
        /// tokens "I", "can", "not", "jump" and ".", each with the expected
        /// <c>Start</c> value.
        /// </summary>
        public void ReplaceConventions()
        {
            string[] expectedTexts = { "I", "can", "not", "jump", "." };
            int[] expectedStarts = { 0, 2, 5, 9, 13 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("I cannot jump.");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks that <c>RegexTokenizer</c> configured with
        /// <c>RegexTokenizer.WHITE_SPACE</c> yields "Chop", "into", "pieces,",
        /// "isn't" and "it?" with the expected <c>Start</c> values.
        /// </summary>
        public void TokenizeInWhiteSpace()
        {
            int[] expectedStarts = { 0, 5, 10, 18, 24 };
            string[] expectedTexts = { "Chop", "into", "pieces,", "isn't", "it?" };

            var options = new TokenizationOptions { Pattern = RegexTokenizer.WHITE_SPACE };
            var factory = new TokenizerFactory(options, SupportedLanguage.English);
            factory.GetTokenizer<RegexTokenizer>();

            var result = factory.Tokenize("Chop into pieces, isn't it?");

            // For every token the start offset is verified first, then its text.
            for (int i = 0; i < expectedStarts.Length; i++)
            {
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Tokenizes a sample phrase with the word/punctuation pattern, tags it
        /// with <c>NGramTagger</c> (NGram = 3, Tag = "NN") and checks the
        /// <c>Pos</c> of the first four tokens.
        /// </summary>
        public void TriGramInCoNLL2000()
        {
            // Step 1: tokenization.
            var tokenizerFactory = new TokenizerFactory(
                new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC },
                SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<RegexTokenizer>();

            var words = tokenizerFactory.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Step 2: tagging; the corpus directory comes from configuration.
            var tagOptions = new TagOptions
            {
                CorpusDir = Configuration.GetValue<String>("BotSharp.NLP:dataDir"),
                NGram = 3,
                Tag = "NN"
            };
            var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
            taggerFactory.GetTagger<NGramTagger>();

            var sentence = new Sentence { Words = words };
            taggerFactory.Tag(sentence);

            // Step 3: verify the tags of the first four tokens, in order.
            string[] expectedPos = { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedPos.Length; i++)
            {
                Assert.IsTrue(words[i].Pos == expectedPos[i]);
            }
        }
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "I jump. And you?" into
        /// "I", "jump", ".", "And", "you" and "?", each with the expected
        /// <c>Start</c> value.
        /// </summary>
        public void ReplaceConventionsIncludeMultipleSymbol()
        {
            string[] expectedTexts = { "I", "jump", ".", "And", "you", "?" };
            int[] expectedStarts = { 0, 2, 6, 8, 12, 15 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("I jump. And you?");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads the cooking.stackexchange data set, tokenizes it with
        /// <c>TreebankTokenizer</c>, trains a classifier on 70% of the shuffled
        /// sentences and asserts that more than half of the remaining
        /// sentences receive their own label as the top prediction.
        /// </summary>
        public void CookingTest()
        {
            // All files of this test live in the same configured folder.
            string dataDir = Path.Combine(
                Configuration.GetValue<String>("MachineLearning:dataDir"),
                "Text Classification",
                "cooking.stackexchange");

            var readerOptions = new ReaderOptions
            {
                DataDir = dataDir,
                FileName = "cooking.stackexchange.txt"
            };
            var sentences = new FasttextDataReader().Read(readerOptions);

            var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<TreebankTokenizer>();

            var rawTexts = sentences.Select(x => x.Text).ToList();
            var tokenized = tokenizerFactory.Tokenize(rawTexts);

            // Carry each label over to the tokenized sentence at the same index.
            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Label = sentences[idx].Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            var classifyOptions = new ClassifyOptions
            {
                ModelFilePath = Path.Combine(dataDir, "nb.model"),
                TrainingCorpusDir = dataDir,
                Dimension = 100
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(classifyOptions, SupportedLanguage.English);

            // Item1 is used for training, Item2 for evaluation.
            var dataset = sentences.Split(0.7M);
            classifier.Train(dataset.Item1);

            int correct = 0;
            int total = 0;
            foreach (var sample in dataset.Item2)
            {
                var predictions = classifier.Classify(sample);
                if (sample.Label == predictions[0].Item1)
                {
                    correct++;
                }
                total++;
            }

            var accuracy = (float)correct / total;

            Assert.IsTrue(accuracy > 0.5);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads "train.csv" through <c>KaggleTextDataReader</c>, tokenizes the
        /// sentences, trains the "NaiveBayesClassifier" on 70% of a shuffled
        /// 2000-sentence sample and asserts that more than half of the
        /// remaining sentences receive their own label as the top prediction.
        /// </summary>
        public void SpookyAuthorIdentification()
        {
            var readerOptions = new ReaderOptions { FileName = "train.csv" };
            var sentences = new KaggleTextDataReader().Read(readerOptions);

            var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<TreebankTokenizer>();

            var rawTexts = sentences.Select(x => x.Text).ToList();
            var tokenized = tokenizerFactory.Tokenize(rawTexts);

            // Carry id and label over to the tokenized sentence at the same index.
            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Id = sentences[idx].Id;
                tokenized[idx].Label = sentences[idx].Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            // Only the first 2000 shuffled sentences take part in the test.
            var sample = sentences.Take(2000).ToList();
            var dataset = sample.Split(0.7M);

            var classifyOptions = new ClassifyOptions
            {
                ModelDir = AppContext.BaseDirectory,
                ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
                Dimension = 300
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(classifyOptions, SupportedLanguage.English);

            classifier.GetClassifer("NaiveBayesClassifier");
            classifier.Train(dataset.Item1);

            int correct = 0;
            int total = 0;
            foreach (var item in dataset.Item2)
            {
                var predictions = classifier.Classify(item);
                if (item.Label == predictions[0].Item1)
                {
                    correct++;
                }
                total++;
            }

            var accuracy = (float)correct / total;

            Assert.IsTrue(accuracy > 0.5);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates <c>_tokenizer</c> if it has not been created yet. The
        /// pattern is read from the "options:pattern" configuration key and
        /// the tokenizer name from the "tokenizer" key. Does nothing when
        /// <c>_tokenizer</c> is already set.
        /// </summary>
        private void Init()
        {
            if (_tokenizer != null)
            {
                return;
            }

            var options = new TokenizationOptions
            {
                Pattern = Configuration.GetValue<String>("options:pattern")
            };
            _tokenizer = new TokenizerFactory(options, SupportedLanguage.English);

            string name = Configuration.GetValue<String>("tokenizer");
            _tokenizer.GetTokenizer(name);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Console entry point: reads one line of text, tokenizes it with the
        /// "simple" tokenizer, tags the non-blank tokens with the "simple"
        /// tagger against the "brills" corpus and prints each token followed
        /// by its tag in parentheses.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Please enter text to tag");

            // Console.ReadLine returns null once standard input is exhausted
            // (e.g. redirected from an empty file); fall back to an empty
            // string so the tokenizer never receives null.
            var text   = Console.ReadLine() ?? string.Empty;
            var corpus = CorpusFactory.GetCorpus("brills");

            var tokenizer = TokenizerFactory.GetTokenizer("simple");
            var tokens    = tokenizer.Tokenize(text);

            // Blank tokens are dropped before tagging.
            var tagger  = TaggerFactory.GetTagger("simple");
            var results = tagger.Tag(corpus, tokens.Where(x => !string.IsNullOrWhiteSpace(x)).ToList());

            Console.WriteLine(String.Join(" ", results.Select(x => string.Format("{0}({1})", x.Token, x.Tag))));
            Console.WriteLine("Press any key to exit");

            // Keeps the console window open; the value typed is irrelevant.
            Console.ReadLine();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Tokenizes a sample phrase, tags it with <c>NGramTagger</c>
        /// (NGram = 1, Tag = "NN"), checks the <c>Pos</c> of the first four
        /// tokens, then tags the same tokens again and asserts that the first
        /// run took more than 100 times as long as the second.
        /// </summary>
        public void UniGramInCoNLL2000()
        {
            // Step 1: tokenization.
            var tokenizerFactory = new TokenizerFactory(
                new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC },
                SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<RegexTokenizer>();

            var words = tokenizerFactory.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Step 2: first tagging run, timed.
            var tagOptions = new TagOptions
            {
                CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
                NGram = 1,
                Tag = "NN"
            };
            var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
            taggerFactory.GetTagger<NGramTagger>();

            var firstRun = Stopwatch.StartNew();
            taggerFactory.Tag(new Sentence { Words = words });
            firstRun.Stop();
            long firstRunMs = firstRun.ElapsedMilliseconds;

            string[] expectedPos = { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedPos.Length; i++)
            {
                Assert.IsTrue(words[i].Pos == expectedPos[i]);
            }

            // Step 3: second tagging run, to check whether the model is loaded repeatedly.
            var secondRun = Stopwatch.StartNew();
            taggerFactory.Tag(new Sentence { Words = words });
            secondRun.Stop();
            long secondRunMs = secondRun.ElapsedMilliseconds;

            // NOTE(review): this compares wall-clock measurements, so the
            // outcome may be sensitive to machine load.
            Assert.IsTrue(firstRunMs > secondRunMs * 100);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Trains a <c>WordFeatureExtractor</c>-based classifier on the labeled
        /// corpus found under the configured "Gender" directory, then retrains
        /// on everything after the first 2000 shuffled entries and measures
        /// accuracy on those first 2000 entries. The test fails unless more
        /// than half of them receive their own label as the top prediction.
        /// </summary>
        public void GenderTest()
        {
            var options = new ClassifyOptions
            {
                TrainingCorpusDir = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Gender")
            };
            var classifier = new ClassifierFactory <WordFeatureExtractor>(options, SupportedLanguage.English);

            var corpus = GetLabeledCorpus(options);

            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <RegexTokenizer>();

            // Every corpus entry needs its Words populated before training.
            corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text));

            // Smoke check: train on the full corpus and classify a single name.
            classifier.Train(corpus);
            string text = "Bridget";

            classifier.Classify(new Sentence {
                Text = text, Words = tokenizer.Tokenize(text)
            });

            // Hold-out evaluation: the first 2000 shuffled entries are the test
            // set, the remainder is the training set.
            corpus.Shuffle();
            var trainingData = corpus.Skip(2000).ToList();

            classifier.Train(trainingData);

            var testData = corpus.Take(2000).ToList();
            int correct  = 0;

            testData.ForEach(td =>
            {
                var classes = classifier.Classify(td);
                if (td.Label == classes[0].Item1)
                {
                    correct++;
                }
            });

            var accuracy = (float)correct / testData.Count;

            // Previously the accuracy was computed but never checked, so the
            // test could not fail on a classifier regression. The threshold
            // matches the other classifier tests in this file.
            Assert.IsTrue(accuracy > 0.5);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Tags the text of the submitted model. When the model state is
        /// valid, the text is tokenized with the "simple" tokenizer, the
        /// non-blank tokens are tagged with the "simple" tagger against the
        /// "brills" corpus, and the formatted result is stored in
        /// <c>ViewBag.Results</c>.
        /// </summary>
        /// <param name="model">The view model carrying the text to tag.</param>
        /// <returns>The view rendered with the same model.</returns>
        public ActionResult Index(TaggerViewModel model)
        {
            if (ModelState.IsValid)
            {
                var brillsCorpus = CorpusFactory.GetCorpus("brills");

                var simpleTokenizer = TokenizerFactory.GetTokenizer("simple");
                var rawTokens = simpleTokenizer.Tokenize(model.Text);

                var simpleTagger = TaggerFactory.GetTagger("simple");

                // Blank tokens are dropped before tagging.
                var nonBlankTokens = rawTokens.Where(x => !string.IsNullOrWhiteSpace(x)).ToList();
                var tagged = simpleTagger.Tag(brillsCorpus, nonBlankTokens);

                var formatted = tagged.Select(x => string.Format("{0}({1})", x.Token, x.Tag));
                ViewBag.Results = String.Join(" ", formatted);
            }

            return View(model);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "Hello World..." into
        /// "Hello", "World" and "...", each with the expected <c>Start</c>
        /// value.
        /// </summary>
        public void ReplacePunctuation()
        {
            string[] expectedTexts = { "Hello", "World", "..." };
            int[] expectedStarts = { 0, 6, 11 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("Hello World...");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "Aren't you" into
        /// "Are", "n't" and "you", each with the expected <c>Start</c> value.
        /// </summary>
        public void ReplaceEndingQuoting()
        {
            string[] expectedTexts = { "Are", "n't", "you" };
            int[] expectedStarts = { 0, 3, 7 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("Aren't you");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "«Hello!" into "«",
        /// "Hello" and "!", each with the expected <c>Start</c> value.
        /// </summary>
        public void ReplaceStartingQuoting()
        {
            string[] expectedTexts = { "«", "Hello", "!" };
            int[] expectedStarts = { 0, 1, 6 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("«Hello!");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Tokenizes "How are you doing?" with <c>RegexTokenizer</c> and runs
        /// <c>DefaultTagger</c> (Tag = "NN") over the tokens. The method makes
        /// no assertions; it only exercises the calls.
        /// </summary>
        public void TagInCoNLL2000()
        {
            var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<RegexTokenizer>();

            var words = tokenizerFactory.Tokenize("How are you doing?");

            var tagOptions = new TagOptions { Tag = "NN" };
            var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
            taggerFactory.GetTagger<DefaultTagger>();

            var sentence = new Sentence { Words = words };
            taggerFactory.Tag(sentence);
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Checks that <c>TreebankTokenizer</c> breaks "&lt;Hello.&gt;" into
        /// "&lt;", "Hello", "." and "&gt;", each with the expected
        /// <c>Start</c> value.
        /// </summary>
        public void ReplaceBrackets()
        {
            string[] expectedTexts = { "<", "Hello", ".", ">" };
            int[] expectedStarts = { 0, 1, 6, 7 };

            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var result = factory.Tokenize("<Hello.>");

            // For every token the text is verified first, then its start offset.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
            }
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Checks that <c>RegexTokenizer</c> configured with
        /// <c>RegexTokenizer.WORD_PUNC</c> and the special word "n't" yields
        /// "Chop", "into", "pieces", ",", "is", "n't", "it" and "?" with the
        /// expected <c>Start</c> values.
        /// </summary>
        public void TokenizeInWordPunctuation()
        {
            int[] expectedStarts = { 0, 5, 10, 16, 18, 20, 24, 26 };
            string[] expectedTexts = { "Chop", "into", "pieces", ",", "is", "n't", "it", "?" };

            var options = new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC,
                SpecialWords = new List<string> { "n't" }
            };
            var factory = new TokenizerFactory(options, SupportedLanguage.English);
            factory.GetTokenizer<RegexTokenizer>();

            var result = factory.Tokenize("Chop into pieces, isn't it?");

            // For every token the start offset is verified first, then its text.
            for (int i = 0; i < expectedStarts.Length; i++)
            {
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Checks that <c>RegexTokenizer</c> configured with
        /// <c>RegexTokenizer.BLANK_LINE</c> splits a text containing blank
        /// lines into "Chop into pieces,", "isn't" and "it?" with the expected
        /// <c>Start</c> values.
        /// </summary>
        public void TokenizeInBlankLine()
        {
            int[] expectedStarts = { 0, 18, 28 };
            string[] expectedTexts = { "Chop into pieces,", "isn't", "it?" };

            var options = new TokenizationOptions { Pattern = RegexTokenizer.BLANK_LINE };
            var factory = new TokenizerFactory(options, SupportedLanguage.English);
            factory.GetTokenizer<RegexTokenizer>();

            // The verbatim literal is part of the test input: its line breaks
            // and trailing space must stay exactly as written.
            var result = factory.Tokenize(@"Chop into pieces, 

isn't

it?");

            // For every token the start offset is verified first, then its text.
            for (int i = 0; i < expectedStarts.Length; i++)
            {
                Assert.IsTrue(result[i].Start == expectedStarts[i]);
                Assert.IsTrue(result[i].Text == expectedTexts[i]);
            }
        }