/// <summary>
/// Tokenizes every sentence of the document using this component's tokenizer,
/// the same one that was applied during training.
/// </summary>
/// <param name="agent">The agent the prediction runs for (not used in this step).</param>
/// <param name="doc">The NLP document whose sentences get their Tokens filled in place.</param>
/// <param name="meta">Pipeline model metadata (not used in this step).</param>
/// <returns>A completed task that always yields <c>true</c>.</returns>
public Task<bool> Predict(Agent agent, NlpDoc doc, PipeModel meta)
{
    doc.Tokenizer = this; // same as train

    doc.Sentences.ForEach(snt => snt.Tokens = _tokenizer.Tokenize(snt.Text));

    // No awaited work in this step, so skip the async state machine (CS1998)
    // and return a completed task directly.
    return Task.FromResult(true);
}
/// <summary>
/// Verifies that CountFeatureExtractor produces the expected count vector entry
/// for every feature word that appears in each tokenized corpus sentence.
/// </summary>
public void TestVectorizer()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var extractor = new CountFeatureExtractor();
    extractor.Sentences = tokenizer.Tokenize(Corpus());
    extractor.Vectorize(new List<string>());

    var expectedVectors = Vectors();

    for (int sentenceIdx = 0; sentenceIdx < extractor.Sentences.Count; sentenceIdx++)
    {
        var sentence = extractor.Sentences[sentenceIdx];

        for (int featureIdx = 0; featureIdx < extractor.Features.Count; featureIdx++)
        {
            var feature = extractor.Features[featureIdx];
            var match = sentence.Words.Find(w => w.Lemma == feature);

            // Features absent from the sentence are simply skipped.
            // NOTE(review): exact == comparison — fine if Vector holds integer counts,
            // but would be fragile for computed floating-point weights; confirm.
            if (match != null)
            {
                Assert.IsTrue(match.Vector == expectedVectors[sentenceIdx][featureIdx]);
            }
        }
    }
}
/// <summary>
/// Tags a tokenized sentence with a tri-gram NGram tagger trained on the
/// supplied tagged corpus, then checks the first four part-of-speech tags.
/// </summary>
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    var tagger = new TaggerFactory<NGramTagger>(new TagOptions
    {
        NGram = 3,
        Tag = "NN",
        Corpus = GetTaggedCorpus()
    }, SupportedLanguage.English);
    tagger.Tag(new Sentence { Words = tokens });

    var expectedPos = new[] { "NNP", "IN", "DT", "NNP" };
    for (int i = 0; i < expectedPos.Length; i++)
    {
        Assert.IsTrue(tokens[i].Pos == expectedPos[i]);
    }
}
/// <summary>
/// Treebank tokenization of a two-sentence input: punctuation is split off and
/// each token keeps its original start offset.
/// </summary>
public void ReplaceConventionsIncludeMultipleSymbol()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("I jump. And you?");

    var expectedTexts = new[] { "I", "jump", ".", "And", "you", "?" };
    var expectedStarts = new[] { 0, 2, 6, 8, 12, 15 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Treebank tokenization expands the contraction "cannot" into "can" + "not"
/// and splits the final period, preserving start offsets.
/// </summary>
public void ReplaceConventions()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokens = tokenizer.Tokenize("I cannot jump.");

    var expectedTexts = new[] { "I", "can", "not", "jump", "." };
    var expectedStarts = new[] { 0, 2, 5, 9, 13 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Reads the cooking.stackexchange corpus, tokenizes it with the Treebank
/// tokenizer, carries labels over to the tokenized sentences, and one-hot
/// encodes them all. No assertions — exercises the encoding pipeline end to end.
/// </summary>
public void OneHotTest()
{
    var reader = new FasttextDataReader();
    var sentences = reader.Read(new ReaderOptions
    {
        DataDir = Path.Combine(
            Configuration.GetValue<String>("MachineLearning:dataDir"),
            "Text Classification",
            "cooking.stackexchange"),
        FileName = "cooking.stackexchange.txt"
    });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

    // Tokenization drops the labels, so copy them back by position.
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Label = sentences[i].Label;
    }
    sentences = tokenized.ToList();

    var encoder = new OneHotEncoder();
    encoder.Sentences = sentences;
    encoder.EncodeAll();
}
/// <summary>
/// Tags a tokenized sentence with a tri-gram NGram tagger backed by the corpus
/// directory from configuration, then checks the first four part-of-speech tags.
/// </summary>
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
        NGram = 3,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();
    tagger.Tag(new Sentence { Words = tokens });

    var expectedPos = new[] { "NNP", "IN", "DT", "NNP" };
    for (int i = 0; i < expectedPos.Length; i++)
    {
        Assert.IsTrue(tokens[i].Pos == expectedPos[i]);
    }
}
/// <summary>
/// Whitespace-pattern tokenization keeps punctuation attached to words
/// ("pieces,", "it?") and preserves each token's start offset.
/// </summary>
public void TokenizeInWhiteSpace()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WHITE_SPACE
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

    var expectedStarts = new[] { 0, 5, 10, 18, 24 };
    var expectedTexts = new[] { "Chop", "into", "pieces,", "isn't", "it?" };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
    }
}
/// <summary>
/// Trains a gender classifier on a labeled name corpus, classifies a sample
/// name, then retrains on a shuffled split and measures accuracy on a
/// 2000-sample held-out set.
/// NOTE(review): the final accuracy is computed but never asserted — consider
/// adding a threshold check so regressions actually fail this test.
/// </summary>
public void GenderTest()
{
    var options = new ClassifyOptions
    {
        TrainingCorpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Gender")
    };
    var classifier = new ClassifierFactory<WordFeatureExtractor>(options, SupportedLanguage.English);

    var corpus = GetLabeledCorpus(options);

    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text));

    classifier.Train(corpus);

    string text = "Bridget";
    classifier.Classify(new Sentence { Text = text, Words = tokenizer.Tokenize(text) });

    corpus.Shuffle();

    // Hold out the first 2000 samples for testing; retrain on the remainder.
    var trainingData = corpus.Skip(2000).ToList();
    classifier.Train(trainingData);

    var testData = corpus.Take(2000).ToList();
    int correct = 0;
    testData.ForEach(td =>
    {
        var predictions = classifier.Classify(td);
        if (td.Label == predictions[0].Item1)
        {
            correct++;
        }
    });
    var accuracy = (float)correct / testData.Count;
}
/// <summary>
/// End-to-end classification on the cooking.stackexchange corpus: tokenize,
/// restore labels, shuffle, 70/30 split, train, and require > 0.5 accuracy
/// on the held-out portion.
/// </summary>
public void CookingTest()
{
    var reader = new FasttextDataReader();
    var sentences = reader.Read(new ReaderOptions
    {
        DataDir = Path.Combine(
            Configuration.GetValue<String>("MachineLearning:dataDir"),
            "Text Classification",
            "cooking.stackexchange"),
        FileName = "cooking.stackexchange.txt"
    });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

    // Tokenization drops the labels, so copy them back by position.
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Label = sentences[i].Label;
    }
    sentences = tokenized.ToList();

    sentences.Shuffle();

    var options = new ClassifyOptions
    {
        ModelFilePath = Path.Combine(
            Configuration.GetValue<String>("MachineLearning:dataDir"),
            "Text Classification",
            "cooking.stackexchange",
            "nb.model"),
        TrainingCorpusDir = Path.Combine(
            Configuration.GetValue<String>("MachineLearning:dataDir"),
            "Text Classification",
            "cooking.stackexchange"),
        Dimension = 100
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

    var dataset = sentences.Split(0.7M);
    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    dataset.Item2.ForEach(td =>
    {
        var predictions = classifier.Classify(td);
        if (td.Label == predictions[0].Item1)
        {
            correct++;
        }
        total++;
    });

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// Spooky Author Identification: tokenizes the Kaggle training set, restores
/// ids and labels, shuffles, trains Naive Bayes on 70% of a 2000-sample
/// subset, and requires > 0.5 accuracy on the remainder.
/// </summary>
public void SpookyAuthorIdentification()
{
    var reader = new KaggleTextDataReader();
    var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

    // Tokenization drops ids and labels, so copy them back by position.
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Id = sentences[i].Id;
        tokenized[i].Label = sentences[i].Label;
    }
    sentences = tokenized.ToList();

    sentences.Shuffle();
    var dataset = sentences.Take(2000).ToList().Split(0.7M);

    var options = new ClassifyOptions
    {
        ModelDir = AppContext.BaseDirectory,
        ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
        Dimension = 300
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);
    classifier.GetClassifer("NaiveBayesClassifier");

    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    dataset.Item2.ForEach(td =>
    {
        var predictions = classifier.Classify(td);
        if (td.Label == predictions[0].Item1)
        {
            correct++;
        }
        total++;
    });

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// Smoke test: tokenizes a question with the regex tokenizer and runs the
/// default tagger over it with "NN" as the fallback tag.
/// NOTE(review): no assertions — this only verifies the pipeline runs without throwing.
/// </summary>
public void TagInCoNLL2000()
{
    var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);
    var words = tokenizer.Tokenize("How are you doing?");

    var tagger = new TaggerFactory<DefaultTagger>(new TagOptions { Tag = "NN" }, SupportedLanguage.English);
    tagger.Tag(new Sentence { Words = words });
}
/// <summary>
/// Treebank tokenization keeps an ellipsis as a single "..." token and
/// preserves each token's start offset.
/// </summary>
public void ReplacePunctuation()
{
    var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("Hello World...");

    var expectedTexts = new[] { "Hello", "World", "..." };
    var expectedStarts = new[] { 0, 6, 11 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Treebank tokenization splits the contraction "Aren't" into "Are" + "n't",
/// preserving start offsets.
/// </summary>
public void ReplaceEndingQuoting()
{
    var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("Aren't you");

    var expectedTexts = new[] { "Are", "n't", "you" };
    var expectedStarts = new[] { 0, 3, 7 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Treebank tokenization splits a leading guillemet quote («) and a trailing
/// exclamation mark into their own tokens, preserving start offsets.
/// </summary>
public void ReplaceStartingQuoting()
{
    var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("«Hello!");

    var expectedTexts = new[] { "«", "Hello", "!" };
    var expectedStarts = new[] { 0, 1, 6 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Tags with a uni-gram NGram tagger trained on the configured corpus, checks
/// the first four POS tags, then verifies that a second Tag call is much
/// faster — i.e. the model is cached rather than reloaded.
/// NOTE(review): the wall-clock assertion (first run > 100x second run) can be
/// flaky on fast or heavily loaded machines; a cache-hit counter would be sturdier.
/// </summary>
public void UniGramInCoNLL2000()
{
    // tokenization
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag
    var tagger = new TaggerFactory(new TagOptions
    {
        CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
        NGram = 1,
        Tag = "NN"
    }, SupportedLanguage.English);
    tagger.GetTagger<NGramTagger>();

    var watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var firstRunMs = watch.ElapsedMilliseconds;

    var expectedPos = new[] { "NNP", "IN", "DT", "NNP" };
    for (int i = 0; i < expectedPos.Length; i++)
    {
        Assert.IsTrue(tokens[i].Pos == expectedPos[i]);
    }

    // test if model is loaded repeatedly.
    watch = Stopwatch.StartNew();
    tagger.Tag(new Sentence { Words = tokens });
    watch.Stop();
    var secondRunMs = watch.ElapsedMilliseconds;

    Assert.IsTrue(firstRunMs > secondRunMs * 100);
}
/// <summary>
/// Treebank tokenization splits angle brackets and the period into separate
/// tokens, preserving start offsets.
/// </summary>
public void ReplaceBrackets()
{
    var tokenizer = new TokenizerFactory<TreebankTokenizer>(new TokenizationOptions { }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("<Hello.>");

    var expectedTexts = new[] { "<", "Hello", ".", ">" };
    var expectedStarts = new[] { 0, 1, 6, 7 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
    }
}
/// <summary>
/// Splits text into chunks on blank lines using RegexTokenizer.BLANK_LINE and
/// checks each chunk's text and start offset.
/// NOTE(review): the expected starts (18, 28) imply the verbatim string literal
/// originally contained line breaks between the chunks; it appears collapsed
/// onto one line here — verify against the repository source before editing.
/// </summary>
public void TokenizeInBlankLine()
{
    var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions
    {
        Pattern = RegexTokenizer.BLANK_LINE
    }, SupportedLanguage.English);
    var tokens = tokenizer.Tokenize(@"Chop into pieces, isn't it?");
    // Each chunk keeps its offset into the original string.
    Assert.IsTrue(tokens[0].Start == 0);
    Assert.IsTrue(tokens[0].Text == "Chop into pieces,");
    Assert.IsTrue(tokens[1].Start == 18);
    Assert.IsTrue(tokens[1].Text == "isn't");
    Assert.IsTrue(tokens[2].Start == 28);
    Assert.IsTrue(tokens[2].Text == "it?");
}
/// <summary>
/// Word-punctuation tokenization with "n't" declared as a special word, so
/// "isn't" splits into "is" + "n't"; verifies each token's offset and text.
/// </summary>
public void TokenizeInWordPunctuation()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC,
        SpecialWords = new List<string> { "n't" }
    }, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

    var expectedStarts = new[] { 0, 5, 10, 16, 18, 20, 24, 26 };
    var expectedTexts = new[] { "Chop", "into", "pieces", ",", "is", "n't", "it", "?" };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
    }
}
/// <summary>
/// Plain word-punctuation tokenization (no special words): "isn't" splits into
/// "isn" + "'" + "t"; verifies each token's offset and text.
/// </summary>
public void TokenizeInWordPunctuation()
{
    var tokenizer = new TokenizerFactory<RegexTokenizer>(new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC
    }, SupportedLanguage.English);

    var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

    var expectedStarts = new[] { 0, 5, 10, 16, 18, 21, 22, 24, 26 };
    var expectedTexts = new[] { "Chop", "into", "pieces", ",", "isn", "'", "t", "it", "?" };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(tokens[i].Start == expectedStarts[i]);
        Assert.IsTrue(tokens[i].Text == expectedTexts[i]);
    }
}