Ejemplo n.º 1
0
        /**
         * <summary>Train method for the naive POS tagger. For every distinct word name in the corpus the method counts
         * the tags attached to it, then stores the tag returned by the counter's <c>Max()</c> for that word in
         * <c>_maxMap</c>.</summary>
         *
         * <param name="corpus">Training data for the tagger.</param>
         */
        public void Train(PosTaggedCorpus corpus)
        {
            var tagCountsByWord = new Dictionary <string, CounterHashMap <string> >();

            for (var i = 0; i < corpus.SentenceCount(); i++)
            {
                var sentence = corpus.GetSentence(i);
                for (var j = 0; j < sentence.WordCount(); j++)
                {
                    // Read the word from the sentence already fetched for this iteration.
                    var word = (PosTaggedWord)sentence.GetWord(j);
                    var name = word.GetName();
                    CounterHashMap <string> tagCounts;

                    // One hash lookup decides between reusing and creating the per-word counter.
                    if (!tagCountsByWord.TryGetValue(name, out tagCounts))
                    {
                        tagCounts             = new CounterHashMap <string>();
                        tagCountsByWord[name] = tagCounts;
                    }

                    tagCounts.Put(word.GetTag());
                }
            }

            _maxMap = new Dictionary <string, string>();
            foreach (var entry in tagCountsByWord)
            {
                _maxMap[entry.Key] = entry.Value.Max();
            }
        }
Ejemplo n.º 2
0
        /**
         * <summary>Estimates the emission probabilities of a single state. Every position whose state equals
         * <paramref name="state"/> contributes the symbol at the same position to a counter; each counted symbol's
         * count is then divided by the sum of all counts.</summary>
         *
         * <param name="state">The state for which emission probabilities will be calculated.</param>
         * <param name="observations">An array of instances, where each instance is a list of states.</param>
         * <param name="emittedSymbols">An array of instances, where each instance is a list of symbols; it is indexed
         * in parallel with <paramref name="observations"/>.</param>
         * <returns>A dictionary holding one probability for each symbol counted for the state.</returns>
         */
        protected Dictionary <TSymbol, double> CalculateEmissionProbabilities(TState state, List <TState>[] observations,
                                                                              List <TSymbol>[] emittedSymbols)
        {
            var symbolCounts = new CounterHashMap <TSymbol>();

            for (var sequence = 0; sequence < observations.Length; sequence++)
            {
                var states = observations[sequence];
                for (var position = 0; position < states.Count; position++)
                {
                    var observedState  = states[position];
                    var observedSymbol = emittedSymbols[sequence][position];
                    if (!observedState.Equals(state))
                    {
                        continue;
                    }

                    symbolCounts.Put(observedSymbol);
                }
            }

            // Held as a double so the division below is a floating-point division.
            double total         = symbolCounts.SumOfCounts();
            var    probabilities = new Dictionary <TSymbol, double>();

            foreach (var symbol in symbolCounts.Keys)
            {
                probabilities[symbol] = symbolCounts[symbol] / total;
            }

            return probabilities;
        }
Ejemplo n.º 3
0
        /**
         * <summary>Counts the string form of every transition of every state in <c>stateList</c> and checks the five
         * entries returned by <c>TopN(5)</c>, key and count, in rank order.</summary>
         */
        public void TestTransitionWith()
        {
            var counts = new CounterHashMap <string>();

            foreach (var state in stateList)
            {
                foreach (var transition in fsm.GetTransitions(state))
                {
                    counts.Put(transition.ToString());
                }
            }

            var top            = counts.TopN(5);
            var expectedKeys   = new[] { "0", "lAr", "DHr", "Hn", "lArH" };
            var expectedCounts = new[] { 111, 37, 28, 24, 23 };

            for (var rank = 0; rank < expectedKeys.Length; rank++)
            {
                Assert.AreEqual(expectedKeys[rank], top[rank].Key);
                Assert.AreEqual(expectedCounts[rank], top[rank].Value);
            }
        }
Ejemplo n.º 4
0
        /**
         * <summary>Counts the non-null <c>With()</c> value of every transition of every state in <c>stateList</c> and
         * checks the four entries returned by <c>TopN(4)</c>, key and count, in rank order.</summary>
         */
        public void TestTransitionWithName()
        {
            var counts = new CounterHashMap <string>();

            foreach (var state in stateList)
            {
                foreach (var transition in fsm.GetTransitions(state))
                {
                    if (transition.With() == null)
                    {
                        continue;
                    }

                    counts.Put(transition.With());
                }
            }

            var top            = counts.TopN(4);
            var expectedKeys   = new[] { "^DB+VERB+CAUS", "^DB+VERB+PASS", "A3PL", "LOC" };
            var expectedCounts = new[] { 33, 31, 28, 24 };

            for (var rank = 0; rank < expectedKeys.Length; rank++)
            {
                Assert.AreEqual(expectedKeys[rank], top[rank].Key);
                Assert.AreEqual(expectedCounts[rank], top[rank].Value);
            }
        }
        /**
         * <summary>Constructor of <see cref="PosTaggedCorpus"/> which reads the corpus from the given file. Each line
         * is split on spaces and tabs; every token containing a '/' is split at its last '/' into a word name and a
         * tag. The tag is shortened to the part before its first '+', or, when it has no '+', before its first '-'.
         * The shortened tag is counted in <c>_tagList</c> and the word is appended to the current sentence. A token
         * whose full tag is "." closes the current sentence. Tokens without a '/' are ignored. Words left over after
         * the last line form a final sentence.</summary>
         *
         * <param name="fileName">Name of the corpus file.</param>
         */
        public PosTaggedCorpus(string fileName)
        {
            var newSentence = new Sentence();

            sentences = new List <Sentence>();
            _tagList  = new CounterHashMap <string>();

            // The using block disposes the reader (and its file handle) even when parsing throws.
            using (var streamReader = new StreamReader(fileName))
            {
                string line;
                while ((line = streamReader.ReadLine()) != null)
                {
                    // RemoveEmptyEntries drops the empty tokens produced by consecutive separators.
                    var words = line.Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var word in words)
                    {
                        var slash = word.LastIndexOf('/');
                        if (slash < 0)
                        {
                            // No name/tag separator in this token: skip it.
                            continue;
                        }

                        var name = word.Substring(0, slash);
                        var tag  = word.Substring(slash + 1);

                        // Char overloads search ordinally, so the cut position never depends on the current culture.
                        var cut = tag.IndexOf('+');
                        if (cut < 0)
                        {
                            cut = tag.IndexOf('-');
                        }

                        var shortTag = cut >= 0 ? tag.Substring(0, cut) : tag;

                        _tagList.Put(shortTag);
                        newSentence.AddWord(new PosTaggedWord(name, shortTag));

                        // The sentence boundary is decided on the full tag, not the shortened one.
                        if (tag == ".")
                        {
                            AddSentence(newSentence);
                            newSentence = new Sentence();
                        }
                    }
                }
            }

            if (newSentence.WordCount() > 0)
            {
                AddSentence(newSentence);
            }
        }
Ejemplo n.º 6
0
        /**
         * <summary>Counts every label of the given list and returns the label reported by the counter's
         * <c>Max()</c>.</summary>
         *
         * <param name="classLabels">A list of class labels.</param>
         * <returns>The class label that occurs most in the list of class labels (mode of the list).</returns>
         */
        public static string GetMaximum(List <string> classLabels)
        {
            var labelCounts = new CounterHashMap <string>();

            for (var index = 0; index < classLabels.Count; index++)
            {
                labelCounts.Put(classLabels[index]);
            }

            return labelCounts.Max();
        }
        /**
         * <summary>Counts the <c>GetPos()</c> value of every parse in the given parse list and returns the value
         * reported by the counter's <c>Max()</c>.</summary>
         *
         * <param name="nextParseList">Parse list whose parses are counted.</param>
         * <returns>The result of <c>Max()</c> over the counted values.</returns>
         */
        private static string NextWordPos(FsmParseList nextParseList)
        {
            var posCounts = new CounterHashMap <string>();
            var index     = 0;

            while (index < nextParseList.Size())
            {
                posCounts.Put(nextParseList.GetFsmParse(index).GetPos());
                index++;
            }

            return posCounts.Max();
        }
Ejemplo n.º 8
0
        /**
         * <summary>Puts one million random integers drawn from [0, 1000000) into the counter and checks that the
         * fraction of distinct keys is close to 0.632 (about 1 - 1/e, the expected fraction when n values are drawn
         * uniformly from n possibilities).</summary>
         */
        public void TestPut3()
        {
            var random         = new Random();
            var counterHashMap = new CounterHashMap <int>();

            for (var i = 0; i < 1000000; i++)
            {
                counterHashMap.Put(random.Next(1000000));
            }

            // The cast reaches the Dictionary's Count property (number of distinct keys).
            var distinctRatio = ((Dictionary <int, int>)counterHashMap).Count / 1000000.0;

            // Expected value goes first so that a failure message reports expected/actual the right way round.
            Assert.AreEqual(0.632, distinctRatio, 0.001);
        }
Ejemplo n.º 9
0
        /**
         * <summary>Puts "item1" three times, "item2" twice and "item3" once, then checks that <c>Max()</c> returns
         * "item1".</summary>
         */
        public void TestMax()
        {
            var counter = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item3", "item1", "item2", "item1" })
            {
                counter.Put(item);
            }

            Assert.AreEqual("item1", counter.Max());
        }
Ejemplo n.º 10
0
        /**
         * <summary>Puts the keys "0" to "999", key i being put 2 * i + 2 times, then checks the last entry of
         * <c>TopN(10)</c> and of <c>TopN(100)</c>.</summary>
         */
        public void TestTopN2()
        {
            var counter = new CounterHashMap <string>();

            for (var value = 0; value < 1000; value++)
            {
                counter.PutNTimes(value.ToString(), 2 * (value + 1));
            }

            var tenth = counter.TopN(10)[9];
            Assert.AreEqual("990", tenth.Key);

            var hundredth = counter.TopN(100)[99];
            Assert.AreEqual("900", hundredth.Key);
        }
Ejemplo n.º 11
0
        /**
         * <summary>Puts random keys from [0, 1000) with multiplicities 1, 2, ..., 1000 and checks that the sum of
         * counts is 500500 (= 1 + 2 + ... + 1000), whichever keys were drawn.</summary>
         */
        public void TestPutNTimes2()
        {
            var rng     = new Random();
            var counter = new CounterHashMap <int>();

            for (var times = 1; times <= 1000; times++)
            {
                counter.PutNTimes(rng.Next(1000), times);
            }

            Assert.AreEqual(500500, counter.SumOfCounts());
        }
Ejemplo n.º 12
0
        /**
         * <summary>With "item1" holding three of six puts, checks that <c>Max(0.4999)</c> returns "item1" and that
         * <c>Max(0.5001)</c> does not.</summary>
         */
        public void TestMaxThreshold1()
        {
            var counter = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item3", "item1", "item2", "item1" })
            {
                counter.Put(item);
            }

            Assert.AreEqual("item1", counter.Max(0.4999));
            Assert.AreNotEqual("item1", counter.Max(0.5001));
        }
Ejemplo n.º 13
0
 /**
  * <summary>Counts words recursively. When <paramref name="height"/> is 0 this node's symbol is put into the
  * counter with this node's count; otherwise the call is forwarded to every child with the height reduced by
  * one.</summary>
  *
  * <param name="wordCounter">Word counter keeping symbols and their counts.</param>
  * <param name="height">Remaining number of levels to descend before counting. If level = 1, N-Gram is treated as
  * UniGram, if level = 2, N-Gram is treated as Bigram, etc.</param>
  */
 public void CountWords(CounterHashMap <TSymbol> wordCounter, int height)
 {
     if (height != 0)
     {
         foreach (var descendant in _children.Values)
         {
             descendant.CountWords(wordCounter, height - 1);
         }

         return;
     }

     wordCounter.PutNTimes(_symbol, _count);
 }
Ejemplo n.º 14
0
        /**
         * <summary>Puts "item1" three times, "item2" twice and "item3" once, then checks each key's count.</summary>
         */
        public void TestPut1()
        {
            var counter = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item3", "item1", "item2", "item1" })
            {
                counter.Put(item);
            }

            Assert.AreEqual(3, counter.Count("item1"));
            Assert.AreEqual(2, counter.Count("item2"));
            Assert.AreEqual(1, counter.Count("item3"));
        }
Ejemplo n.º 15
0
        /**
         * <summary>Puts "item1" 2+2+2 times, "item2" 3+3 times and "item3" 6 times through <c>PutNTimes</c>, then
         * checks that every key has a count of 6.</summary>
         */
        public void TestPutNTimes1()
        {
            var counter = new CounterHashMap <string>();
            var keys    = new[] { "item1", "item2", "item3", "item1", "item2", "item1" };
            var times   = new[] { 2, 3, 6, 2, 3, 2 };

            for (var step = 0; step < keys.Length; step++)
            {
                counter.PutNTimes(keys[step], times[step]);
            }

            foreach (var key in new[] { "item1", "item2", "item3" })
            {
                Assert.AreEqual(6, counter.Count(key));
            }
        }
Ejemplo n.º 16
0
        /**
         * <summary>Puts "item1" three times, "item2" twice and "item3" once, then checks that the last entry of
         * <c>TopN(1)</c>, <c>TopN(2)</c> and <c>TopN(3)</c> is "item1", "item2" and "item3" respectively.</summary>
         */
        public void TestTopN1()
        {
            var counter = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item3", "item1", "item2", "item1" })
            {
                counter.Put(item);
            }

            var expectedByRank = new[] { "item1", "item2", "item3" };

            for (var rank = 0; rank < expectedByRank.Length; rank++)
            {
                Assert.AreEqual(expectedByRank[rank], counter.TopN(rank + 1)[rank].Key);
            }
        }
Ejemplo n.º 17
0
        /**
         * <summary>Puts one million random keys drawn from "0" to "99", computes the relative frequency of the key
         * returned by <c>Max()</c>, and checks that <c>Max(threshold)</c> is non-null just below that frequency and
         * null just above it.</summary>
         */
        public void TestMaxThreshold2()
        {
            var rng     = new Random();
            var counter = new CounterHashMap <string>();

            for (var draw = 0; draw < 1000000; draw++)
            {
                counter.Put(rng.Next(100).ToString());
            }

            var topKey      = counter.Max();
            var probability = counter.Count(topKey) / 1000000.0;

            Assert.NotNull(counter.Max(probability - 0.001));
            Assert.Null(counter.Max(probability + 0.001));
        }
Ejemplo n.º 18
0
        /**
         * <summary>Records one classification result. The counter stored in the matrix under the given actual class
         * is looked up, or created when the matrix has no entry for that class yet; the predicted class is then put
         * into that counter, and a newly created counter is stored in the matrix under the actual class.</summary>
         *
         * <param name="actualClass">   string input actual class.</param>
         * <param name="predictedClass">string input predicted class.</param>
         */
        public void Classify(string actualClass, string predictedClass)
        {
            CounterHashMap <string> counterHashMap;

            // One lookup replaces the ContainsKey check followed by the indexer read.
            var isNewRow = !_matrix.TryGetValue(actualClass, out counterHashMap);

            if (isNewRow)
            {
                counterHashMap = new CounterHashMap <string>();
            }

            counterHashMap.Put(predictedClass);

            // An existing counter is already referenced by the matrix; only a new one needs storing.
            // It is stored after Put so that a failing Put leaves no empty row behind.
            if (isNewRow)
            {
                _matrix[actualClass] = counterHashMap;
            }
        }
Ejemplo n.º 19
0
        /**
         * <summary>Loads "metu-treebank.xml", checks the sentence count, tallies the dependency type of every word
         * that has a relation, and checks the count of each dependency type as well as the total word count.</summary>
         */
        public void TestDependencyCorpus()
        {
            var typeCounts = new CounterHashMap <TurkishDependencyType>();
            var treeBank   = new TurkishDependencyTreeBankCorpus("metu-treebank.xml");

            Assert.AreEqual(5635, treeBank.SentenceCount());
            var totalWords = 0;

            for (var sentenceIndex = 0; sentenceIndex < treeBank.SentenceCount(); sentenceIndex++)
            {
                var current = (TurkishDependencyTreeBankSentence)treeBank.GetSentence(sentenceIndex);
                totalWords += current.WordCount();
                for (var wordIndex = 0; wordIndex < current.WordCount(); wordIndex++)
                {
                    var token = (TurkishDependencyTreeBankWord)current.GetWord(wordIndex);
                    if (token.GetRelation() == null)
                    {
                        // Words without a relation are left out of the tally.
                        continue;
                    }

                    typeCounts.Put(token.GetRelation().GetTurkishDependencyType());
                }
            }

            Assert.AreEqual(11692, typeCounts[TurkishDependencyType.MODIFIER]);
            Assert.AreEqual(903, typeCounts[TurkishDependencyType.INTENSIFIER]);
            Assert.AreEqual(1142, typeCounts[TurkishDependencyType.LOCATIVE_ADJUNCT]);
            Assert.AreEqual(240, typeCounts[TurkishDependencyType.VOCATIVE]);
            Assert.AreEqual(7261, typeCounts[TurkishDependencyType.SENTENCE]);
            Assert.AreEqual(16, typeCounts[TurkishDependencyType.EQU_ADJUNCT]);
            Assert.AreEqual(159, typeCounts[TurkishDependencyType.NEGATIVE_PARTICLE]);
            Assert.AreEqual(4481, typeCounts[TurkishDependencyType.SUBJECT]);
            Assert.AreEqual(2476, typeCounts[TurkishDependencyType.COORDINATION]);
            Assert.AreEqual(2050, typeCounts[TurkishDependencyType.CLASSIFIER]);
            Assert.AreEqual(73, typeCounts[TurkishDependencyType.COLLOCATION]);
            Assert.AreEqual(1516, typeCounts[TurkishDependencyType.POSSESSOR]);
            Assert.AreEqual(523, typeCounts[TurkishDependencyType.ABLATIVE_ADJUNCT]);
            Assert.AreEqual(23, typeCounts[TurkishDependencyType.FOCUS_PARTICLE]);
            Assert.AreEqual(1952, typeCounts[TurkishDependencyType.DETERMINER]);
            Assert.AreEqual(1361, typeCounts[TurkishDependencyType.DATIVE_ADJUNCT]);
            Assert.AreEqual(202, typeCounts[TurkishDependencyType.APPOSITION]);
            Assert.AreEqual(289, typeCounts[TurkishDependencyType.QUESTION_PARTICLE]);
            Assert.AreEqual(597, typeCounts[TurkishDependencyType.S_MODIFIER]);
            Assert.AreEqual(10, typeCounts[TurkishDependencyType.ETOL]);
            Assert.AreEqual(8338, typeCounts[TurkishDependencyType.OBJECT]);
            Assert.AreEqual(271, typeCounts[TurkishDependencyType.INSTRUMENTAL_ADJUNCT]);
            Assert.AreEqual(85, typeCounts[TurkishDependencyType.RELATIVIZER]);
            Assert.AreEqual(53993, totalWords);
        }
Ejemplo n.º 20
0
        /**
         * <summary>Constructs the set of non-rare words for the given N-Gram level: the symbols whose share of the
         * total count, as gathered by <c>rootNode.CountWords</c>, is strictly greater than the probability
         * threshold.</summary>
         *
         * <param name="level">Level for counting words. Counts for different levels of the N-Gram can be set. If
         * level = 1, N-Gram is treated as UniGram, if level = 2, N-Gram is treated as Bigram, etc.</param>
         * <param name="probability">Probability threshold for non-rare words.</param>
         * <returns>A <see cref="HashSet{T}"/> of the non-rare words.</returns>
         */
        public HashSet <TSymbol> ConstructDictionaryWithNonRareWords(int level, double probability)
        {
            var counter = new CounterHashMap <TSymbol>();

            rootNode.CountWords(counter, level);

            // Held as a double so the ratio below is a floating-point division.
            double total   = counter.SumOfCounts();
            var    nonRare = new HashSet <TSymbol>();

            foreach (var candidate in counter.Keys)
            {
                var share = counter[candidate] / total;
                if (share > probability)
                {
                    nonRare.Add(candidate);
                }
            }

            return nonRare;
        }
Ejemplo n.º 21
0
        /**
         * <summary>Adds a counter holding the keys 1500..1999 to a counter holding the keys 0..999 and checks that
         * the result has 1500 distinct keys.</summary>
         */
        public void TestAdd3()
        {
            var target = new CounterHashMap <int>();

            for (var key = 0; key < 1000; key++)
            {
                target.Put(key);
            }

            var addend = new CounterHashMap <int>();

            for (var key = 1500; key < 2000; key++)
            {
                // Key 1000 + i is put i + 1 times, i.e. key - 999 times.
                addend.PutNTimes(key, key - 999);
            }

            target.Add(addend);

            // The cast reaches the Dictionary's Count property (number of distinct keys).
            Assert.AreEqual(1500, ((Dictionary <int, int>)target).Count);
        }
Ejemplo n.º 22
0
        /**
         * <summary>Puts 1000 random keys drawn from [0, 1000) and checks that the counts of the keys 0..999 add up
         * to 1000.</summary>
         */
        public void TestPut2()
        {
            var rng     = new Random();
            var counter = new CounterHashMap <int>();

            for (var draw = 0; draw < 1000; draw++)
            {
                counter.Put(rng.Next(1000));
            }

            var total = 0;

            for (var key = 0; key < 1000; key++)
            {
                total += counter.Count(key);
            }

            Assert.AreEqual(1000, total);
        }
Ejemplo n.º 23
0
        /**
         * <summary>Adds a counter holding "item4" (1), "item5" (4) and "item2" (1) to a counter holding "item1" (3)
         * and "item2" (2), then checks the merged counts.</summary>
         */
        public void TestAdd2()
        {
            var target = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item1", "item2", "item1" })
            {
                target.Put(item);
            }

            var addend = new CounterHashMap <string>();

            addend.Put("item4");
            addend.PutNTimes("item5", 4);
            addend.Put("item2");

            target.Add(addend);

            Assert.AreEqual(3, target.Count("item1"));
            Assert.AreEqual(3, target.Count("item2"));
            Assert.AreEqual(1, target.Count("item4"));
            Assert.AreEqual(4, target.Count("item5"));
        }
        /**
         * <summary>Loads "../../../nerdata.txt", checks the sentence and word totals, tallies the named entity type
         * of every word, and checks the count of four entity types.</summary>
         */
        public void TestNERCorpus()
        {
            var typeCounts = new CounterHashMap <NamedEntityType>();
            var corpus     = new NERCorpus("../../../nerdata.txt");

            Assert.AreEqual(27556, corpus.SentenceCount());
            Assert.AreEqual(492233, corpus.NumberOfWords());

            for (var sentenceIndex = 0; sentenceIndex < corpus.SentenceCount(); sentenceIndex++)
            {
                var sentence = (NamedEntitySentence)corpus.GetSentence(sentenceIndex);
                for (var wordIndex = 0; wordIndex < sentence.WordCount(); wordIndex++)
                {
                    var word = (NamedEntityWord)sentence.GetWord(wordIndex);
                    typeCounts.Put(word.GetNamedEntityType());
                }
            }

            Assert.AreEqual(438976, typeCounts[NamedEntityType.NONE]);
            Assert.AreEqual(23878, typeCounts[NamedEntityType.PERSON]);
            Assert.AreEqual(16931, typeCounts[NamedEntityType.ORGANIZATION]);
            Assert.AreEqual(12448, typeCounts[NamedEntityType.LOCATION]);
        }
Ejemplo n.º 25
0
        /**
         * <summary>Adds a counter in which every key has a count of 6 to a counter holding "item1" (3), "item2" (2)
         * and "item3" (1), then checks the merged counts 9, 8 and 7.</summary>
         */
        public void TestAdd1()
        {
            var target = new CounterHashMap <string>();

            foreach (var item in new[] { "item1", "item2", "item3", "item1", "item2", "item1" })
            {
                target.Put(item);
            }

            var addend = new CounterHashMap <string>();
            var keys   = new[] { "item1", "item2", "item3", "item1", "item2", "item1" };
            var times  = new[] { 2, 3, 6, 2, 3, 2 };

            for (var step = 0; step < keys.Length; step++)
            {
                addend.PutNTimes(keys[step], times[step]);
            }

            target.Add(addend);

            Assert.AreEqual(9, target.Count("item1"));
            Assert.AreEqual(8, target.Count("item2"));
            Assert.AreEqual(7, target.Count("item3"));
        }
Ejemplo n.º 26
0
        /**
         * <summary>Checks the number of end states in <c>stateList</c>, then tallies the non-null <c>GetPos()</c>
         * value of every state and checks the count of each part-of-speech label.</summary>
         */
        public void TestStartEndStates()
        {
            var endStates = 0;

            foreach (var candidate in stateList)
            {
                if (!candidate.IsEndState())
                {
                    continue;
                }

                endStates++;
            }

            Assert.AreEqual(35, endStates);

            var countsByPos = new CounterHashMap <string>();

            foreach (var candidate in stateList)
            {
                if (candidate.GetPos() == null)
                {
                    continue;
                }

                countsByPos.Put(candidate.GetPos());
            }

            Assert.AreEqual(1, countsByPos["HEAD"]);
            Assert.AreEqual(6, countsByPos["PRON"]);
            Assert.AreEqual(1, countsByPos["PROP"]);
            Assert.AreEqual(8, countsByPos["NUM"]);
            Assert.AreEqual(7, countsByPos["ADJ"]);
            Assert.AreEqual(1, countsByPos["INTERJ"]);
            Assert.AreEqual(1, countsByPos["DET"]);
            Assert.AreEqual(1, countsByPos["ADVERB"]);
            Assert.AreEqual(1, countsByPos["QUES"]);
            Assert.AreEqual(1, countsByPos["CONJ"]);
            Assert.AreEqual(26, countsByPos["VERB"]);
            Assert.AreEqual(1, countsByPos["POSTP"]);
            Assert.AreEqual(1, countsByPos["DUP"]);
            Assert.AreEqual(11, countsByPos["NOUN"]);
        }
 /**
  * <summary>Parameterless constructor of <see cref="TurkishDependencyTreeBankCorpus"/>. Starts the corpus with an
  * empty word counter and an empty sentence list.</summary>
  */
 public TurkishDependencyTreeBankCorpus()
 {
     wordList  = new CounterHashMap <Word>();
     sentences = new List <Sentence>();
 }
 /**
  * <summary>Parameterless constructor of <see cref="PosTaggedCorpus"/>. Starts the corpus with an empty tag
  * counter, an empty word counter and an empty sentence list.</summary>
  */
 public PosTaggedCorpus()
 {
     _tagList  = new CounterHashMap <string>();
     wordList  = new CounterHashMap <Word>();
     sentences = new List <Sentence>();
 }
Ejemplo n.º 29
0
 /**
  * <summary>Parameterless constructor of <see cref="DisambiguationCorpus"/>. Creates an empty
  * <see cref="CounterHashMap{K}"/> for the word list and an empty list of sentences.</summary>
  */
 public DisambiguationCorpus()
 {
     wordList  = new CounterHashMap <Word>();
     sentences = new List <Sentence>();
 }
Ejemplo n.º 30
0
 /**
  * <summary>Parameterless constructor of <see cref="Corpus"/>. Creates an empty <see cref="CounterHashMap{K}"/>
  * for the word list, an empty list of paragraphs and an empty list of sentences.</summary>
  */
 public Corpus()
 {
     wordList   = new CounterHashMap <Word>();
     paragraphs = new List <Paragraph>();
     sentences  = new List <Sentence>();
 }