/// <summary>
/// Tokenizes a fixed English sentence with the WORD_PUNC regex pattern, runs an
/// <see cref="NGramTagger"/> configured with NGram = 3 over it, and checks the
/// part-of-speech tags written onto the first four tokens.
/// </summary>
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    // NOTE(review): Tag = "NN" is presumably the fallback tag for unknown words - confirm against TagOptions.
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
        NGram = 3,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();

    // Tag() returns nothing that is used here; the results are read back from the token objects.
    tagger.Tag(new Sentence { Words = tokens });

    // AreEqual (rather than IsTrue on ==) reports both expected and actual tag when a check fails.
    Assert.AreEqual("NNP", tokens[0].Pos);
    Assert.AreEqual("IN", tokens[1].Pos);
    Assert.AreEqual("DT", tokens[2].Pos);
    Assert.AreEqual("NNP", tokens[3].Pos);
}
/// <summary>
/// Creates and configures the <c>_tagger</c> factory on first use; does nothing when it is already set.
/// The corpus directory is the "Corpus" folder under the AppDomain "DataPath" value, and the tagger
/// implementation is chosen by the "tagger" configuration value.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The AppDomain "DataPath" value has not been set.
/// </exception>
private void Init()
{
    if (_tagger != null)
    {
        return;
    }

    // GetData returns null when no value was registered under the name; fail with a
    // clear message instead of a NullReferenceException from ToString().
    object dataPath = AppDomain.CurrentDomain.GetData("DataPath");
    if (dataPath == null)
    {
        throw new InvalidOperationException(
            "AppDomain data \"DataPath\" is not set; cannot locate the tagger corpus directory.");
    }

    var factory = new TaggerFactory(new TagOptions
    {
        CorpusDir = Path.Combine(dataPath.ToString(), "Corpus")
    }, SupportedLanguage.English);

    string taggerName = Configuration.GetValue<String>("tagger");
    factory.GetTagger(taggerName);

    // Publish only after the factory is fully configured: if anything above throws,
    // _tagger stays null and the next call retries instead of reusing a half-initialised factory.
    _tagger = factory;
}
/// <summary>
/// Console entry point: prompts for a line of text, tokenizes it with the "simple" tokenizer,
/// tags the non-blank tokens with the "simple" tagger against the "brills" corpus, and prints
/// each result as <c>token(tag)</c> separated by spaces.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Console.WriteLine("Please enter text to tag");

    // ReadLine returns null at end of input (e.g. redirected or closed stdin);
    // treat that as empty text rather than passing null to the tokenizer.
    var text = Console.ReadLine() ?? string.Empty;

    var corpus = CorpusFactory.GetCorpus("brills");
    var tokenizer = TokenizerFactory.GetTokenizer("simple");
    var tokens = tokenizer.Tokenize(text);
    var tagger = TaggerFactory.GetTagger("simple");

    // Blank tokens are dropped before tagging.
    var wordTokens = tokens.Where(x => !string.IsNullOrWhiteSpace(x)).ToList();
    var results = tagger.Tag(corpus, wordTokens);

    var formatted = results.Select(x => string.Format("{0}({1})", x.Token, x.Tag));
    Console.WriteLine(String.Join(" ", formatted));

    Console.WriteLine("Press any key to exit");
    // Keeps the console window open; note this waits for Enter, not a single key press.
    Console.ReadLine();
}
/// <summary>
/// Tokenizes a fixed English sentence with the WORD_PUNC regex pattern, runs an
/// <see cref="NGramTagger"/> configured with NGram = 1 over it, checks the part-of-speech
/// tags on the first four tokens, then tags a second time and compares the elapsed times
/// of the two runs.
/// </summary>
public void UniGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    // NOTE(review): Tag = "NN" is presumably the fallback tag for unknown words - confirm against TagOptions.
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
        NGram = 1,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();

    var watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var elapsedMs1 = watch.ElapsedMilliseconds;

    // AreEqual (rather than IsTrue on ==) reports both expected and actual tag when a check fails.
    Assert.AreEqual("NNP", tokens[0].Pos);
    Assert.AreEqual("IN", tokens[1].Pos);
    Assert.AreEqual("DT", tokens[2].Pos);
    Assert.AreEqual("NNP", tokens[3].Pos);

    // test if model is loaded repeatedly.
    watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var elapsedMs2 = watch.ElapsedMilliseconds;

    // The first run must be more than 100x slower than the second.
    // NOTE(review): this is a wall-clock comparison and may be flaky on a loaded or very
    // fast machine; the message below records the measurements for diagnosis.
    Assert.IsTrue(
        elapsedMs1 > elapsedMs2 * 100,
        $"First Tag() took {elapsedMs1} ms, second took {elapsedMs2} ms; expected the first to be more than 100x the second.");
}
/// <summary>
/// Handles the tagger form: when the model state is valid, tokenizes <c>model.Text</c> with the
/// "simple" tokenizer, tags the non-blank tokens with the "simple" tagger against the "brills"
/// corpus, and stores the space-separated <c>token(tag)</c> string in <c>ViewBag.Results</c>.
/// </summary>
/// <param name="model">The view model carrying the text to tag.</param>
/// <returns>The view rendered with the same model, whether or not tagging ran.</returns>
public ActionResult Index(TaggerViewModel model)
{
    // Invalid input skips tagging entirely and just redisplays the form.
    if (ModelState.IsValid)
    {
        var brillsCorpus = CorpusFactory.GetCorpus("brills");
        var simpleTokenizer = TokenizerFactory.GetTokenizer("simple");
        var rawTokens = simpleTokenizer.Tokenize(model.Text);
        var simpleTagger = TaggerFactory.GetTagger("simple");

        // Blank tokens are dropped before tagging.
        var wordTokens = rawTokens
            .Where(token => !string.IsNullOrWhiteSpace(token))
            .ToList();
        var tagged = simpleTagger.Tag(brillsCorpus, wordTokens);

        var pairs = tagged.Select(item => string.Format("{0}({1})", item.Token, item.Tag));
        ViewBag.Results = String.Join(" ", pairs);
    }

    return View(model);
}
/// <summary>
/// Tokenizes "How are you doing?" with default tokenization options and runs a
/// <see cref="DefaultTagger"/> configured with Tag = "NN" over the resulting tokens.
/// </summary>
public void TagInCoNLL2000()
{
    var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
    tokenizerFactory.GetTokenizer<RegexTokenizer>();
    var words = tokenizerFactory.Tokenize("How are you doing?");

    var tagOptions = new TagOptions { Tag = "NN" };
    var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
    taggerFactory.GetTagger<DefaultTagger>();

    // NOTE(review): nothing is asserted after tagging, so this only verifies that the
    // pipeline runs without throwing - confirm whether tag checks were intended.
    var sentence = new Sentence { Words = words };
    taggerFactory.Tag(sentence);
}