Пример #1
0
        /// <summary>
        /// Tokenizes a sample sentence with the <c>RegexTokenizer.WORD_PUNC</c> pattern, tags it
        /// with an <c>NGramTagger</c> configured for tri-grams (<c>NGram = 3</c>), and verifies
        /// the part-of-speech tag assigned to each of the first four tokens.
        /// </summary>
        public void TriGramInCoNLL2000()
        {
            // tokenization
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // test tag: the corpus directory comes from the "CherubNLP:dataDir" configuration entry
            var tagger = new TaggerFactory(new TagOptions
            {
                CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
                NGram     = 3,
                Tag       = "NN"
            }, SupportedLanguage.English);

            tagger.GetTagger<NGramTagger>();

            // The assertions below read the tags back from the same token objects
            // that were handed to the tagger.
            tagger.Tag(new Sentence
            {
                Words = tokens
            });

            // AreEqual (rather than IsTrue(a == b)) reports the expected and the
            // actual tag when a check fails.
            Assert.AreEqual("NNP", tokens[0].Pos);
            Assert.AreEqual("IN", tokens[1].Pos);
            Assert.AreEqual("DT", tokens[2].Pos);
            Assert.AreEqual("NNP", tokens[3].Pos);
        }
Пример #2
0
        /// <summary>
        /// Lazily initializes the <c>_tagger</c> field. Does nothing when the field is already
        /// set; otherwise creates a <c>TaggerFactory</c> whose corpus directory is the "Corpus"
        /// folder under the app-domain "DataPath" value, then selects the tagger whose name is
        /// stored under the "tagger" configuration entry.
        /// </summary>
        private void Init()
        {
            // Already initialized: nothing to do.
            if (_tagger != null)
            {
                return;
            }

            var dataPath = AppDomain.CurrentDomain.GetData("DataPath").ToString();
            var options = new TagOptions
            {
                CorpusDir = Path.Combine(dataPath, "Corpus")
            };

            _tagger = new TaggerFactory(options, SupportedLanguage.English);

            // The configured value names the tagger implementation to use.
            string taggerName = Configuration.GetValue<String>("tagger");

            _tagger.GetTagger(taggerName);
        }
Пример #3
0
        /// <summary>
        /// Console entry point: reads one line of text, tokenizes it with the "simple" tokenizer,
        /// tags the non-blank tokens with the "simple" tagger against the "brills" corpus, and
        /// prints every token followed by its tag in parentheses.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Please enter text to tag");

            // Console.ReadLine returns null at end of input (e.g. closed or redirected
            // stdin); fall back to an empty string so the tokenizer never receives null.
            var text   = Console.ReadLine() ?? string.Empty;
            var corpus = CorpusFactory.GetCorpus("brills");

            var tokenizer = TokenizerFactory.GetTokenizer("simple");
            var tokens    = tokenizer.Tokenize(text);

            // Blank tokens are dropped before tagging.
            var tagger  = TaggerFactory.GetTagger("simple");
            var results = tagger.Tag(corpus, tokens.Where(x => !string.IsNullOrWhiteSpace(x)).ToList());

            Console.WriteLine(String.Join(" ", results.Select(x => string.Format("{0}({1})", x.Token, x.Tag))));
            Console.WriteLine("Press any key to exit");

            // Block until the user presses Enter; the entered text itself is not needed.
            Console.ReadLine();
        }
Пример #4
0
        /// <summary>
        /// Tokenizes a sample sentence with the <c>RegexTokenizer.WORD_PUNC</c> pattern, tags it
        /// with an <c>NGramTagger</c> configured for uni-grams (<c>NGram = 1</c>), verifies the
        /// part-of-speech tag of the first four tokens, and then checks that tagging the same
        /// tokens a second time is more than 100 times faster than the first call.
        /// </summary>
        public void UniGramInCoNLL2000()
        {
            // tokenization
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer<RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // test tag: the corpus directory comes from the "CherubNLP:dataDir" configuration entry
            var tagger = new TaggerFactory(new TagOptions
            {
                CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
                NGram     = 1,
                Tag       = "NN"
            }, SupportedLanguage.English);

            tagger.GetTagger<NGramTagger>();

            // Time the first tagging call.
            var watch = Stopwatch.StartNew();

            tagger.Tag(new Sentence
            {
                Words = tokens
            });
            watch.Stop();
            var elapsedMs1 = watch.ElapsedMilliseconds;

            // AreEqual (rather than IsTrue(a == b)) reports the expected and the
            // actual tag when a check fails.
            Assert.AreEqual("NNP", tokens[0].Pos);
            Assert.AreEqual("IN", tokens[1].Pos);
            Assert.AreEqual("DT", tokens[2].Pos);
            Assert.AreEqual("NNP", tokens[3].Pos);

            // Test whether the model is loaded repeatedly: time a second tagging call.
            watch.Restart();
            tagger.Tag(new Sentence
            {
                Words = tokens
            });
            watch.Stop();
            var elapsedMs2 = watch.ElapsedMilliseconds;

            // NOTE(review): this check is timing-sensitive; when both calls finish in under
            // one millisecond it compares 0 > 0 and fails. Presumably the first call is slow
            // because it loads the model - confirm before relying on this threshold.
            Assert.IsTrue(
                elapsedMs1 > elapsedMs2 * 100,
                "First Tag call took " + elapsedMs1 + " ms, second took " + elapsedMs2
                + " ms; expected the first to be more than 100 times slower.");
        }
Пример #5
0
        /// <summary>
        /// Tags the text carried by <paramref name="model"/> and redisplays the view. When the
        /// model state is valid, the text is tokenized with the "simple" tokenizer, the non-blank
        /// tokens are tagged with the "simple" tagger against the "brills" corpus, and the
        /// space-separated "token(tag)" string is stored in <c>ViewBag.Results</c>.
        /// </summary>
        /// <param name="model">View model whose <c>Text</c> property holds the text to tag.</param>
        /// <returns>The view rendered with the same <paramref name="model"/>.</returns>
        public ActionResult Index(TaggerViewModel model)
        {
            // Only run the tagging pipeline for a valid model; either way the
            // same view is returned below.
            if (ModelState.IsValid)
            {
                var brillsCorpus = CorpusFactory.GetCorpus("brills");

                var simpleTokenizer = TokenizerFactory.GetTokenizer("simple");
                var rawTokens       = simpleTokenizer.Tokenize(model.Text);

                var simpleTagger   = TaggerFactory.GetTagger("simple");
                var nonBlankTokens = rawTokens.Where(t => !string.IsNullOrWhiteSpace(t)).ToList();
                var tagged         = simpleTagger.Tag(brillsCorpus, nonBlankTokens);

                var formatted = tagged.Select(r => string.Format("{0}({1})", r.Token, r.Tag));

                ViewBag.Results = String.Join(" ", formatted);
            }

            return View(model);
        }
Пример #6
0
        /// <summary>
        /// Runs the <c>DefaultTagger</c> (configured with the tag "NN") over the tokens of
        /// "How are you doing?". The method makes no assertions; it only exercises the
        /// tokenize-then-tag pipeline.
        /// </summary>
        public void TagInCoNLL2000()
        {
            // Tokenize with default options.
            var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizerFactory.GetTokenizer<RegexTokenizer>();

            var words = tokenizerFactory.Tokenize("How are you doing?");

            // Tag the resulting tokens.
            var tagOptions    = new TagOptions { Tag = "NN" };
            var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
            taggerFactory.GetTagger<DefaultTagger>();

            var sentence = new Sentence { Words = words };
            taggerFactory.Tag(sentence);
        }