Example #1
0
        /**
         * <summary> Regression test for {@link DummyDisambiguation}: runs robust morphological
         * analysis over every sentence of the Penn Treebank corpus, disambiguates each sentence,
         * and counts how often the chosen parse (transition list) and the chosen root word match
         * the gold-standard parse. Asserts the accuracies stay near 70% (parse) and 86% (root)
         * within a 0.01 tolerance.</summary>
         */
        public void TestDisambiguation()
        {
            var fsm          = new FsmMorphologicalAnalyzer();
            var corpus       = new DisambiguationCorpus("../../../penntreebank.txt");
            var algorithm    = new DummyDisambiguation();
            var correctParse = 0;
            var correctRoot  = 0;

            for (var i = 0; i < corpus.SentenceCount(); i++)
            {
                // Fetch the sentence once; the original re-called corpus.GetSentence(i)
                // on every inner-loop iteration (three lookups per word).
                var sentence         = corpus.GetSentence(i);
                var sentenceAnalyses = fsm.RobustMorphologicalAnalysis(sentence);
                var fsmParses        = algorithm.Disambiguate(sentenceAnalyses);
                for (var j = 0; j < sentence.WordCount(); j++)
                {
                    var word = (DisambiguatedWord)sentence.GetWord(j);
                    // Full-parse match: the disambiguated transition list must equal the
                    // gold parse's string form exactly.
                    if (fsmParses[j].TransitionList().Equals(word.GetParse().ToString()))
                    {
                        correctParse++;
                    }

                    // Root-only match: weaker criterion, hence the higher expected accuracy.
                    if (fsmParses[j].GetWord().Equals(word.GetParse().GetWord()))
                    {
                        correctRoot++;
                    }
                }
            }

            Assert.AreEqual(0.86, (correctRoot + 0.0) / corpus.NumberOfWords(), 0.01);
            Assert.AreEqual(0.70, (correctParse + 0.0) / corpus.NumberOfWords(), 0.01);
        }
Example #2
0
        /**
         * <summary> The train method gets sentences from given {@link DisambiguationCorpus} and both word and the next word of that sentence at each iteration.
         * Then, adds these words together with their part of speech tags to word unigram and bigram models. It also adds the last inflectional group of
         * word to the ig unigram and bigram models.
         * <p/>
         * At the end, it calculates the NGram probabilities of all four models — word and ig, unigram and bigram —
         * by using LaplaceSmoothing.</summary>
         *
         * <param name="corpus">{@link DisambiguationCorpus} to train.</param>
         */
        public override void Train(DisambiguationCorpus corpus)
        {
            // Reusable n-gram buffers: one-element arrays feed the unigram models,
            // two-element arrays feed the bigram models.
            var words1 = new Word[1];
            var igs1   = new Word[1];
            var words2 = new Word[2];
            var igs2   = new Word[2];

            wordUniGramModel = new NGram <Word>(1);
            igUniGramModel   = new NGram <Word>(1);
            wordBiGramModel  = new NGram <Word>(2);
            igBiGramModel    = new NGram <Word>(2);
            for (var i = 0; i < corpus.SentenceCount(); i++)
            {
                var sentence = corpus.GetSentence(i);
                // NOTE(review): the bound is WordCount() - 1, so the last word of each
                // sentence is never added to the unigram models — confirm this is intended.
                for (var j = 0; j < sentence.WordCount() - 1; j++)
                {
                    var word     = (DisambiguatedWord)sentence.GetWord(j);
                    var nextWord = (DisambiguatedWord)sentence.GetWord(j + 1);
                    words2[0] = word.GetParse().GetWordWithPos();
                    words1[0] = words2[0];
                    words2[1] = nextWord.GetParse().GetWordWithPos();
                    wordUniGramModel.AddNGram(words1);
                    wordBiGramModel.AddNGram(words2);
                    // The current word's last inflectional group does not depend on k;
                    // compute it once instead of once per inner-loop iteration.
                    var lastIg = new Word(word.GetParse().GetLastInflectionalGroup().ToString());
                    // Pair the current word's last IG with each IG of the next word.
                    for (var k = 0; k < nextWord.GetParse().Size(); k++)
                    {
                        igs2[0] = lastIg;
                        igs2[1] = new Word(nextWord.GetParse().GetInflectionalGroup(k).ToString());
                        igBiGramModel.AddNGram(igs2);
                        igs1[0] = igs2[1];
                        igUniGramModel.AddNGram(igs1);
                    }
                }
            }

            // All four models use Laplace (add-one) smoothing.
            wordUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            igUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            wordBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            igBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
        }
        /**
         * <summary> The train method initially creates new NGrams; wordUniGramModel, wordBiGramModel, igUniGramModel, and igBiGramModel. It gets the
         * sentences from given corpus and gets each word as a DisambiguatedWord. Then, adds the word together with its part of speech
         * tags to the wordUniGramModel. It also gets the transition list of that word and adds it to the igUniGramModel.
         * <p/>
         * If there exists a next word in the sentence, it adds the current and next {@link DisambiguatedWord} to the wordBiGramModel with
         * their part of speech tags. It also adds them to the igBiGramModel with their transition lists.
         * <p/>
         * At the end, it calculates the NGram probabilities of all four models — word and ig, unigram and bigram —
         * by using LaplaceSmoothing.</summary>
         *
         * <param name="corpus">{@link DisambiguationCorpus} to train.</param>
         */
        public override void Train(DisambiguationCorpus corpus)
        {
            // Reusable n-gram buffers: one-element arrays feed the unigram models,
            // two-element arrays feed the bigram models.
            var words1 = new Word[1];
            var igs1   = new Word[1];
            var words2 = new Word[2];
            var igs2   = new Word[2];

            wordUniGramModel = new NGram <Word>(1);
            wordBiGramModel  = new NGram <Word>(2);
            igUniGramModel   = new NGram <Word>(1);
            igBiGramModel    = new NGram <Word>(2);
            for (var i = 0; i < corpus.SentenceCount(); i++)
            {
                var sentence = corpus.GetSentence(i);
                for (var j = 0; j < sentence.WordCount(); j++)
                {
                    // Every word contributes to both unigram models.
                    var word = (DisambiguatedWord)sentence.GetWord(j);
                    words1[0] = word.GetParse().GetWordWithPos();
                    wordUniGramModel.AddNGram(words1);
                    igs1[0] = new Word(word.GetParse().GetTransitionList());
                    igUniGramModel.AddNGram(igs1);
                    if (j + 1 < sentence.WordCount())
                    {
                        // Fetch, cast, and parse the next word once; the original did the
                        // lookup-and-cast twice for the same word.
                        var nextParse = ((DisambiguatedWord)sentence.GetWord(j + 1)).GetParse();
                        words2[0] = words1[0];
                        words2[1] = nextParse.GetWordWithPos();
                        wordBiGramModel.AddNGram(words2);
                        igs2[0] = igs1[0];
                        igs2[1] = new Word(nextParse.GetTransitionList());
                        igBiGramModel.AddNGram(igs2);
                    }
                }
            }

            // All four models use Laplace (add-one) smoothing.
            wordUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            igUniGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            wordBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
            igBiGramModel.CalculateNGramProbabilities(new LaplaceSmoothing <Word>());
        }