/// <summary>
/// TreebankTokenizer should expand the convention word "cannot" into the two
/// tokens "can" + "not", keeping the correct character offset for every token.
/// </summary>
public void ReplaceConventions()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("I cannot jump.");

    // Assert.AreEqual reports expected vs. actual on failure,
    // unlike Assert.IsTrue(a == b) which only reports "false".
    Assert.AreEqual("I", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("can", tokens[1].Text);
    Assert.AreEqual(2, tokens[1].Start);
    Assert.AreEqual("not", tokens[2].Text);
    Assert.AreEqual(5, tokens[2].Start);
    Assert.AreEqual("jump", tokens[3].Text);
    Assert.AreEqual(9, tokens[3].Start);
    Assert.AreEqual(".", tokens[4].Text);
    Assert.AreEqual(13, tokens[4].Start);
}
/// <summary>
/// RegexTokenizer with the WHITE_SPACE pattern should split only on whitespace,
/// leaving punctuation attached to its word (e.g. "pieces," and "it?").
/// </summary>
public void TokenizeInWhiteSpace()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WHITE_SPACE
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("Chop", tokens[0].Text);
    Assert.AreEqual(5, tokens[1].Start);
    Assert.AreEqual("into", tokens[1].Text);
    Assert.AreEqual(10, tokens[2].Start);
    Assert.AreEqual("pieces,", tokens[2].Text);
    Assert.AreEqual(18, tokens[3].Start);
    Assert.AreEqual("isn't", tokens[3].Text);
    Assert.AreEqual(24, tokens[4].Start);
    Assert.AreEqual("it?", tokens[4].Text);
}
/// <summary>
/// POS-tags a sentence with a tri-gram NGramTagger trained on the CoNLL-2000
/// corpus and checks the first four tags.
/// </summary>
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    // NOTE(review): this test reads the "BotSharp.NLP:dataDir" key while
    // UniGramInCoNLL2000 reads "CherubNLP:dataDir" — confirm which key is current.
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("BotSharp.NLP:dataDir"),
        NGram = 3,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();

    tagger.Tag(new Sentence { Words = tokens });

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("NNP", tokens[0].Pos);
    Assert.AreEqual("IN", tokens[1].Pos);
    Assert.AreEqual("DT", tokens[2].Pos);
    Assert.AreEqual("NNP", tokens[3].Pos);
}
/// <summary>
/// TreebankTokenizer should detach sentence-final punctuation ('.' and '?')
/// as separate tokens while preserving source offsets.
/// </summary>
public void ReplaceConventionsIncludeMultipleSymbol()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("I jump. And you?");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("I", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("jump", tokens[1].Text);
    Assert.AreEqual(2, tokens[1].Start);
    Assert.AreEqual(".", tokens[2].Text);
    Assert.AreEqual(6, tokens[2].Start);
    Assert.AreEqual("And", tokens[3].Text);
    Assert.AreEqual(8, tokens[3].Start);
    Assert.AreEqual("you", tokens[4].Text);
    Assert.AreEqual(12, tokens[4].Start);
    Assert.AreEqual("?", tokens[5].Text);
    Assert.AreEqual(15, tokens[5].Start);
}
/// <summary>
/// End-to-end text-classification smoke test on the cooking.stackexchange corpus:
/// read, tokenize, shuffle, split 70/30, train, then require better-than-chance accuracy.
/// </summary>
public void CookingTest()
{
    var reader = new FasttextDataReader();

    // The corpus directory was built three times with identical Path.Combine calls;
    // compute it once.
    var corpusDir = Path.Combine(
        Configuration.GetValue<String>("MachineLearning:dataDir"),
        "Text Classification",
        "cooking.stackexchange");

    var sentences = reader.Read(new ReaderOptions
    {
        DataDir = corpusDir,
        FileName = "cooking.stackexchange.txt"
    });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());
    // Tokenization produces fresh sentence objects; copy the labels back by position.
    for (int i = 0; i < newSentences.Count; i++)
    {
        newSentences[i].Label = sentences[i].Label;
    }
    sentences = newSentences.ToList();

    sentences.Shuffle();

    var options = new ClassifyOptions
    {
        ModelFilePath = Path.Combine(corpusDir, "nb.model"),
        TrainingCorpusDir = corpusDir,
        Dimension = 100
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

    var dataset = sentences.Split(0.7M);
    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    dataset.Item2.ForEach(td =>
    {
        var classes = classifier.Classify(td);
        // classes[0] is the top-ranked (label, score) result.
        if (td.Label == classes[0].Item1)
        {
            correct++;
        }
        total++;
    });

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// Trains a Naive Bayes classifier on a 2000-sentence sample of the Kaggle
/// "Spooky Author Identification" training set and requires accuracy above 50%.
/// </summary>
public void SpookyAuthorIdentification()
{
    var reader = new KaggleTextDataReader();
    var rawSentences = reader.Read(new ReaderOptions { FileName = "train.csv" });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokenized = tokenizer.Tokenize(rawSentences.Select(x => x.Text).ToList());
    // Tokenization returns fresh sentence objects; restore ids and labels by position.
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Id = rawSentences[i].Id;
        tokenized[i].Label = rawSentences[i].Label;
    }

    var sentences = tokenized.ToList();
    sentences.Shuffle();
    var dataset = sentences.Take(2000).ToList().Split(0.7M);

    var options = new ClassifyOptions
    {
        ModelDir = AppContext.BaseDirectory,
        ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
        Dimension = 300
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);
    classifier.GetClassifer("NaiveBayesClassifier");

    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    foreach (var td in dataset.Item2)
    {
        var classes = classifier.Classify(td);
        // classes[0] is the top-ranked (label, score) result.
        if (td.Label == classes[0].Item1)
        {
            correct++;
        }
        total++;
    }

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// Lazily creates the shared tokenizer from configuration: the regex pattern
/// comes from the "options:pattern" key and the tokenizer implementation name
/// from the "tokenizer" key.
/// </summary>
private void Init()
{
    if (_tokenizer == null)
    {
        _tokenizer = new TokenizerFactory(new TokenizationOptions
        {
            Pattern = Configuration.GetValue<String>("options:pattern")
        }, SupportedLanguage.English);

        // Plain literal: the original used an interpolated string ($"tokenizer")
        // with nothing to interpolate.
        string tokenizerName = Configuration.GetValue<String>("tokenizer");
        _tokenizer.GetTokenizer(tokenizerName);
    }
}
/// <summary>
/// Console entry point: reads a line of text, tokenizes and POS-tags it with the
/// Brill corpus, then prints each token as "token(tag)".
/// </summary>
static void Main(string[] args)
{
    Console.WriteLine("Please enter text to tag");
    var text = Console.ReadLine();

    var corpus = CorpusFactory.GetCorpus("brills");
    var tokenizer = TokenizerFactory.GetTokenizer("simple");
    var tokens = tokenizer.Tokenize(text);

    var tagger = TaggerFactory.GetTagger("simple");
    // Blank tokens carry no information for the tagger; drop them.
    var results = tagger.Tag(corpus, tokens.Where(x => !string.IsNullOrWhiteSpace(x)).ToList());

    Console.WriteLine(String.Join(" ", results.Select(x => string.Format("{0}({1})", x.Token, x.Tag))));

    Console.WriteLine("Press any key to exit");
    // Block until the user presses Enter; the previously captured return value
    // ("stop") was never used, so it is discarded.
    Console.ReadLine();
}
/// <summary>
/// POS-tags a sentence with a uni-gram NGramTagger trained on CoNLL-2000, then
/// tags the same sentence again to verify the trained model is cached (the
/// second call must be far faster than the first).
/// </summary>
public void UniGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
        NGram = 1,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();

    var watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var elapsedMs1 = watch.ElapsedMilliseconds;

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("NNP", tokens[0].Pos);
    Assert.AreEqual("IN", tokens[1].Pos);
    Assert.AreEqual("DT", tokens[2].Pos);
    Assert.AreEqual("NNP", tokens[3].Pos);

    // test if model is loaded repeatedly.
    watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var elapsedMs2 = watch.ElapsedMilliseconds;

    // The original asserted elapsedMs1 > elapsedMs2 * 100, which fails spuriously
    // when both calls round down to 0 ms on a fast machine (0 > 0 is false).
    // Keep the "cached second run is ~100x faster" intent but tolerate that case.
    Assert.IsTrue(elapsedMs2 <= elapsedMs1 / 100);
}
/// <summary>
/// Gender classification over a labeled name corpus: trains on everything past
/// the first 2000 shuffled samples, then evaluates on those first 2000 and
/// requires better-than-chance accuracy.
/// </summary>
public void GenderTest()
{
    var options = new ClassifyOptions
    {
        TrainingCorpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Gender")
    };
    var classifier = new ClassifierFactory<WordFeatureExtractor>(options, SupportedLanguage.English);

    var corpus = GetLabeledCorpus(options);

    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text));

    classifier.Train(corpus);
    string text = "Bridget";
    classifier.Classify(new Sentence { Text = text, Words = tokenizer.Tokenize(text) });

    corpus.Shuffle();
    var trainingData = corpus.Skip(2000).ToList();
    classifier.Train(trainingData);

    var testData = corpus.Take(2000).ToList();
    int correct = 0;
    testData.ForEach(td =>
    {
        var classes = classifier.Classify(td);
        // classes[0] is the top-ranked (label, score) result.
        if (td.Label == classes[0].Item1)
        {
            correct++;
        }
    });

    var accuracy = (float)correct / testData.Count;
    // The original computed accuracy but asserted nothing, so this test could
    // never fail. Require the same better-than-chance bar as the other
    // classifier tests in this suite.
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// POS-tags the submitted text with the Brill corpus and puts the result in
/// ViewBag.Results as space-separated "token(tag)" pairs.
/// </summary>
public ActionResult Index(TaggerViewModel model)
{
    if (!ModelState.IsValid)
    {
        return View(model);
    }

    var corpus = CorpusFactory.GetCorpus("brills");
    var tokenizer = TokenizerFactory.GetTokenizer("simple");
    var tagger = TaggerFactory.GetTagger("simple");

    // Blank tokens carry no information for the tagger; drop them.
    var words = tokenizer.Tokenize(model.Text)
        .Where(x => !string.IsNullOrWhiteSpace(x))
        .ToList();
    var tagged = tagger.Tag(corpus, words);

    ViewBag.Results = String.Join(" ", tagged.Select(x => string.Format("{0}({1})", x.Token, x.Tag)));

    return View(model);
}
/// <summary>
/// TreebankTokenizer should keep an ellipsis ("...") together as one token,
/// detached from the preceding word.
/// </summary>
public void ReplacePunctuation()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("Hello World...");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("Hello", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("World", tokens[1].Text);
    Assert.AreEqual(6, tokens[1].Start);
    Assert.AreEqual("...", tokens[2].Text);
    Assert.AreEqual(11, tokens[2].Start);
}
/// <summary>
/// TreebankTokenizer should split the contraction "Aren't" into "Are" + "n't"
/// with offsets pointing into the original string.
/// </summary>
public void ReplaceEndingQuoting()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("Aren't you");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("Are", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("n't", tokens[1].Text);
    Assert.AreEqual(3, tokens[1].Start);
    Assert.AreEqual("you", tokens[2].Text);
    Assert.AreEqual(7, tokens[2].Start);
}
/// <summary>
/// TreebankTokenizer should detach a leading guillemet («) and a trailing
/// exclamation mark as separate tokens.
/// </summary>
public void ReplaceStartingQuoting()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("«Hello!");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("«", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("Hello", tokens[1].Text);
    Assert.AreEqual(1, tokens[1].Start);
    Assert.AreEqual("!", tokens[2].Text);
    Assert.AreEqual(6, tokens[2].Start);
}
/// <summary>
/// DefaultTagger smoke test: tokenizes a short question and runs the tagger
/// configured with fallback tag "NN" over it.
/// </summary>
public void TagInCoNLL2000()
{
    // NOTE(review): RegexTokenizer is requested but no Pattern is set here,
    // unlike the other RegexTokenizer tests — confirm the default is intended.
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("How are you doing?");

    var tagger = new TaggerFactory(new TagOptions
    {
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<DefaultTagger>();

    tagger.Tag(new Sentence { Words = tokens });

    // The original asserted nothing, so the test could never fail.
    // At minimum, every token must have received some tag.
    foreach (var token in tokens)
    {
        Assert.IsFalse(string.IsNullOrEmpty(token.Pos));
    }
}
/// <summary>
/// TreebankTokenizer should detach angle brackets and the sentence period as
/// individual tokens: "<Hello.>" → "<", "Hello", ".", ">".
/// </summary>
public void ReplaceBrackets()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("<Hello.>");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual("<", tokens[0].Text);
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("Hello", tokens[1].Text);
    Assert.AreEqual(1, tokens[1].Start);
    Assert.AreEqual(".", tokens[2].Text);
    Assert.AreEqual(6, tokens[2].Start);
    Assert.AreEqual(">", tokens[3].Text);
    Assert.AreEqual(7, tokens[3].Start);
}
/// <summary>
/// RegexTokenizer with the WORD_PUNC pattern plus the special word "n't" should
/// split punctuation into its own tokens and break "isn't" into "is" + "n't".
/// </summary>
public void TokenizeInWordPunctuation()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC,
        SpecialWords = new List<string> { "n't" }
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

    // Assert.AreEqual gives expected/actual diagnostics on failure.
    Assert.AreEqual(0, tokens[0].Start);
    Assert.AreEqual("Chop", tokens[0].Text);
    Assert.AreEqual(5, tokens[1].Start);
    Assert.AreEqual("into", tokens[1].Text);
    Assert.AreEqual(10, tokens[2].Start);
    Assert.AreEqual("pieces", tokens[2].Text);
    Assert.AreEqual(16, tokens[3].Start);
    Assert.AreEqual(",", tokens[3].Text);
    Assert.AreEqual(18, tokens[4].Start);
    Assert.AreEqual("is", tokens[4].Text);
    Assert.AreEqual(20, tokens[5].Start);
    Assert.AreEqual("n't", tokens[5].Text);
    Assert.AreEqual(24, tokens[6].Start);
    Assert.AreEqual("it", tokens[6].Text);
    Assert.AreEqual(26, tokens[7].Start);
    Assert.AreEqual("?", tokens[7].Text);
}
/// <summary>
/// RegexTokenizer with the BLANK_LINE pattern should split the input into chunks
/// separated by blank lines.
/// NOTE(review): the asserted offsets (18, 28) imply the verbatim string literal
/// originally spanned multiple lines with blank-line separators that appear
/// collapsed here — confirm against the original file before editing the literal.
/// </summary>
public void TokenizeInBlankLine() { var tokenizer = new TokenizerFactory(new TokenizationOptions { Pattern = RegexTokenizer.BLANK_LINE }, SupportedLanguage.English); tokenizer.GetTokenizer <RegexTokenizer>(); var tokens = tokenizer.Tokenize(@"Chop into pieces, isn't it?"); Assert.IsTrue(tokens[0].Start == 0); Assert.IsTrue(tokens[0].Text == "Chop into pieces,"); Assert.IsTrue(tokens[1].Start == 18); Assert.IsTrue(tokens[1].Text == "isn't"); Assert.IsTrue(tokens[2].Start == 28); Assert.IsTrue(tokens[2].Text == "it?"); }