/**
 * <summary> Runs the dummy disambiguation algorithm over the Penn Treebank corpus and checks that
 * root accuracy and full-parse accuracy stay near their expected baselines (0.86 and 0.70,
 * tolerance 0.01).</summary>
 */
public void TestDisambiguation()
{
    var analyzer = new FsmMorphologicalAnalyzer();
    var corpus = new DisambiguationCorpus("../../../penntreebank.txt");
    var algorithm = new DummyDisambiguation();
    var parseHits = 0;
    var rootHits = 0;
    for (var sentenceIndex = 0; sentenceIndex < corpus.SentenceCount(); sentenceIndex++)
    {
        var sentence = corpus.GetSentence(sentenceIndex);
        var analyses = analyzer.RobustMorphologicalAnalysis(sentence);
        var chosenParses = algorithm.Disambiguate(analyses);
        for (var wordIndex = 0; wordIndex < sentence.WordCount(); wordIndex++)
        {
            var word = (DisambiguatedWord) sentence.GetWord(wordIndex);
            // Full-parse hit: the chosen transition list matches the gold parse string exactly.
            if (chosenParses[wordIndex].TransitionList().Equals(word.GetParse().ToString()))
            {
                parseHits++;
            }
            // Root hit: only the root word of the chosen parse matches the gold root.
            if (chosenParses[wordIndex].GetWord().Equals(word.GetParse().GetWord()))
            {
                rootHits++;
            }
        }
    }
    Assert.AreEqual(0.86, (rootHits + 0.0) / corpus.NumberOfWords(), 0.01);
    Assert.AreEqual(0.70, (parseHits + 0.0) / corpus.NumberOfWords(), 0.01);
}
/**
 * <summary> Trains the word and inflectional-group n-gram models from the given
 * {@link DisambiguationCorpus}. For every adjacent word pair in every sentence, the current word
 * (tagged with its part of speech) feeds the word unigram model and, together with the tagged next
 * word, the word bigram model. The last inflectional group of the current word paired with each
 * inflectional group of the next word feeds the ig bigram model, while each of those next-word
 * groups also feeds the ig unigram model.
 * <p/>
 * At the end, the probabilities of all four models are calculated with {@link LaplaceSmoothing}.</summary>
 *
 * <param name="corpus">{@link DisambiguationCorpus} to train.</param>
 */
public override void Train(DisambiguationCorpus corpus)
{
    var uniGramWords = new Word[1];
    var uniGramIgs = new Word[1];
    var biGramWords = new Word[2];
    var biGramIgs = new Word[2];
    wordUniGramModel = new NGram<Word>(1);
    igUniGramModel = new NGram<Word>(1);
    wordBiGramModel = new NGram<Word>(2);
    igBiGramModel = new NGram<Word>(2);
    for (var sentenceIndex = 0; sentenceIndex < corpus.SentenceCount(); sentenceIndex++)
    {
        var sentence = corpus.GetSentence(sentenceIndex);
        // NOTE(review): only indices < WordCount() - 1 are visited, so the last word of each
        // sentence never reaches the word unigram model — this mirrors the pair-wise bigram
        // collection and is kept as-is.
        for (var wordIndex = 0; wordIndex < sentence.WordCount() - 1; wordIndex++)
        {
            var current = (DisambiguatedWord) sentence.GetWord(wordIndex);
            var next = (DisambiguatedWord) sentence.GetWord(wordIndex + 1);
            biGramWords[0] = current.GetParse().GetWordWithPos();
            uniGramWords[0] = biGramWords[0];
            biGramWords[1] = next.GetParse().GetWordWithPos();
            wordUniGramModel.AddNGram(uniGramWords);
            wordBiGramModel.AddNGram(biGramWords);
            for (var igIndex = 0; igIndex < next.GetParse().Size(); igIndex++)
            {
                biGramIgs[0] = new Word(current.GetParse().GetLastInflectionalGroup().ToString());
                biGramIgs[1] = new Word(next.GetParse().GetInflectionalGroup(igIndex).ToString());
                igBiGramModel.AddNGram(biGramIgs);
                uniGramIgs[0] = biGramIgs[1];
                igUniGramModel.AddNGram(uniGramIgs);
            }
        }
    }
    wordUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    igUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    wordBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    igBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
}
/**
 * <summary> Trains four fresh n-gram models (word/ig × unigram/bigram) from the given corpus.
 * Each word, taken as a {@link DisambiguatedWord}, contributes its part-of-speech-tagged form to
 * the word unigram model and its transition list to the ig unigram model. Whenever a following
 * word exists in the sentence, the (current, next) pair is added to the word bigram model with
 * their part-of-speech tags and to the ig bigram model with their transition lists.
 * <p/>
 * At the end, the probabilities of all four models are calculated with {@link LaplaceSmoothing}.</summary>
 *
 * <param name="corpus">{@link DisambiguationCorpus} to train.</param>
 */
public override void Train(DisambiguationCorpus corpus)
{
    var singleWord = new Word[1];
    var singleIg = new Word[1];
    var wordPair = new Word[2];
    var igPair = new Word[2];
    wordUniGramModel = new NGram<Word>(1);
    wordBiGramModel = new NGram<Word>(2);
    igUniGramModel = new NGram<Word>(1);
    igBiGramModel = new NGram<Word>(2);
    for (var sentenceIndex = 0; sentenceIndex < corpus.SentenceCount(); sentenceIndex++)
    {
        var sentence = corpus.GetSentence(sentenceIndex);
        for (var wordIndex = 0; wordIndex < sentence.WordCount(); wordIndex++)
        {
            var current = (DisambiguatedWord) sentence.GetWord(wordIndex);
            singleWord[0] = current.GetParse().GetWordWithPos();
            wordUniGramModel.AddNGram(singleWord);
            singleIg[0] = new Word(current.GetParse().GetTransitionList());
            igUniGramModel.AddNGram(singleIg);
            // Bigram entries exist only for words that have a successor in the sentence.
            if (wordIndex + 1 < sentence.WordCount())
            {
                var next = (DisambiguatedWord) sentence.GetWord(wordIndex + 1);
                wordPair[0] = singleWord[0];
                wordPair[1] = next.GetParse().GetWordWithPos();
                wordBiGramModel.AddNGram(wordPair);
                igPair[0] = singleIg[0];
                igPair[1] = new Word(next.GetParse().GetTransitionList());
                igBiGramModel.AddNGram(igPair);
            }
        }
    }
    wordUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    igUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    wordBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
    igBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing<Word>());
}