/**
 * <summary>Train method for the naive POS tagger. For each word in the training corpus, the method counts how
 * many times the word occurs with each possible tag, then stores the most frequent tag for every word.</summary>
 *
 * <param name="corpus">Training data for the tagger.</param>
 */
public void Train(PosTaggedCorpus corpus)
{
    var map = new Dictionary<string, CounterHashMap<string>>();
    for (var i = 0; i < corpus.SentenceCount(); i++)
    {
        var s = corpus.GetSentence(i);
        for (var j = 0; j < s.WordCount(); j++)
        {
            var word = (PosTaggedWord) s.GetWord(j);
            if (map.ContainsKey(word.GetName()))
            {
                map[word.GetName()].Put(word.GetTag());
            }
            else
            {
                var counterMap = new CounterHashMap<string>();
                counterMap.Put(word.GetTag());
                map[word.GetName()] = counterMap;
            }
        }
    }
    _maxMap = new Dictionary<string, string>();
    foreach (var word in map.Keys)
    {
        _maxMap[word] = map[word].Max();
    }
}
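// Hedged usage sketch of the counting pattern Train relies on: a CounterHashMap accumulates tag
// counts and Max() picks the most frequent one. The method name and tag values below are invented
// for illustration; only the CounterHashMap calls (Put, Max) appear in the code above.
public static string MostFrequentTagSketch()
{
    var tagCounts = new CounterHashMap<string>();
    tagCounts.Put("NOUN"); // "NOUN" counted once
    tagCounts.Put("VERB"); // "VERB" counted once
    tagCounts.Put("NOUN"); // "NOUN" now counted twice
    return tagCounts.Max(); // returns "NOUN", the tag with the highest count
}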
/**
 * <summary>CalculateEmissionProbabilities calculates the emission probabilities for a single state. The method
 * takes the state, an array of observation sequences (each a list of states), and a parallel array of emission
 * sequences (each a list of emitted symbols).</summary>
 *
 * <param name="state">The state for which emission probabilities will be calculated.</param>
 * <param name="observations">An array of instances, where each instance consists of an array of states.</param>
 * <param name="emittedSymbols">An array of instances, where each instance consists of an array of symbols.</param>
 * <returns>A {@link HashMap} of emission probabilities for a single state, containing a probability for each
 * emitted symbol.</returns>
 */
protected Dictionary<TSymbol, double> CalculateEmissionProbabilities(TState state, List<TState>[] observations,
    List<TSymbol>[] emittedSymbols)
{
    var counts = new CounterHashMap<TSymbol>();
    var emissionProbabilities = new Dictionary<TSymbol, double>();
    for (var i = 0; i < observations.Length; i++)
    {
        for (var j = 0; j < observations[i].Count; j++)
        {
            var currentState = observations[i][j];
            var currentSymbol = emittedSymbols[i][j];
            if (currentState.Equals(state))
            {
                counts.Put(currentSymbol);
            }
        }
    }
    double sum = counts.SumOfCounts();
    foreach (var symbol in counts.Keys)
    {
        emissionProbabilities[symbol] = counts[symbol] / sum;
    }
    return emissionProbabilities;
}
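// Hedged worked example for CalculateEmissionProbabilities; the state and symbol values are
// invented toy data, and the surrounding HMM class is omitted. With one observation sequence
// ["A", "B", "A"] emitting ["x", "y", "z"], state "A" emits "x" once and "z" once, so the
// method would return { "x": 0.5, "z": 0.5 } for state "A":
//
// var observations = new[] { new List<string> { "A", "B", "A" } };
// var emittedSymbols = new[] { new List<string> { "x", "y", "z" } };
// var probabilities = CalculateEmissionProbabilities("A", observations, emittedSymbols);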
public void TestTransitionWith()
{
    var transitionCounts = new CounterHashMap<string>();
    foreach (var state in stateList)
    {
        var transitions = fsm.GetTransitions(state);
        foreach (var transition in transitions)
        {
            transitionCounts.Put(transition.ToString());
        }
    }
    var topList = transitionCounts.TopN(5);
    Assert.AreEqual("0", topList[0].Key);
    Assert.AreEqual(111, topList[0].Value);
    Assert.AreEqual("lAr", topList[1].Key);
    Assert.AreEqual(37, topList[1].Value);
    Assert.AreEqual("DHr", topList[2].Key);
    Assert.AreEqual(28, topList[2].Value);
    Assert.AreEqual("Hn", topList[3].Key);
    Assert.AreEqual(24, topList[3].Value);
    Assert.AreEqual("lArH", topList[4].Key);
    Assert.AreEqual(23, topList[4].Value);
}
public void TestTransitionWithName()
{
    var transitionCounts = new CounterHashMap<string>();
    foreach (var state in stateList)
    {
        var transitions = fsm.GetTransitions(state);
        foreach (var transition in transitions)
        {
            if (transition.With() != null)
            {
                transitionCounts.Put(transition.With());
            }
        }
    }
    var topList = transitionCounts.TopN(4);
    Assert.AreEqual("^DB+VERB+CAUS", topList[0].Key);
    Assert.AreEqual(33, topList[0].Value);
    Assert.AreEqual("^DB+VERB+PASS", topList[1].Key);
    Assert.AreEqual(31, topList[1].Value);
    Assert.AreEqual("A3PL", topList[2].Key);
    Assert.AreEqual(28, topList[2].Value);
    Assert.AreEqual("LOC", topList[3].Key);
    Assert.AreEqual(24, topList[3].Value);
}
/**
 * <summary>Another constructor of {@link PosTaggedCorpus} which takes the file name of the corpus as input and
 * reads the corpus from that file.</summary>
 *
 * <param name="fileName">Name of the corpus file.</param>
 */
public PosTaggedCorpus(string fileName)
{
    var newSentence = new Sentence();
    sentences = new List<Sentence>();
    _tagList = new CounterHashMap<string>();
    var streamReader = new StreamReader(fileName);
    var line = streamReader.ReadLine();
    while (line != null)
    {
        var words = line.Split(new char[] {' ', '\t'}, StringSplitOptions.None);
        foreach (var word in words)
        {
            if (word != "")
            {
                if (word.Contains("/"))
                {
                    var name = word.Substring(0, word.LastIndexOf('/'));
                    var tag = word.Substring(word.LastIndexOf('/') + 1);
                    string shortTag;
                    if (tag.Contains("+"))
                    {
                        shortTag = tag.Substring(0, tag.IndexOf("+", StringComparison.CurrentCulture));
                    }
                    else
                    {
                        if (tag.Contains("-"))
                        {
                            shortTag = tag.Substring(0, tag.IndexOf("-", StringComparison.CurrentCulture));
                        }
                        else
                        {
                            shortTag = tag;
                        }
                    }
                    _tagList.Put(shortTag);
                    newSentence.AddWord(new PosTaggedWord(name, shortTag));
                    if (tag == ".")
                    {
                        AddSentence(newSentence);
                        newSentence = new Sentence();
                    }
                }
            }
        }
        line = streamReader.ReadLine();
    }
    if (newSentence.WordCount() > 0)
    {
        AddSentence(newSentence);
    }
}
/**
 * <summary>Given an array of class labels, returns the most frequently occurring one.</summary>
 *
 * <param name="classLabels">An array of class labels.</param>
 * <returns>The class label that occurs most often in the array of class labels (the mode of the class label
 * list).</returns>
 */
public static string GetMaximum(List<string> classLabels)
{
    var frequencies = new CounterHashMap<string>();
    foreach (var label in classLabels)
    {
        frequencies.Put(label);
    }
    return frequencies.Max();
}
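// Hedged usage sketch of GetMaximum's majority vote; the label values and the enclosing class
// name are invented for illustration.
//
// var labels = new List<string> { "cat", "dog", "cat" };
// var winner = Classifier.GetMaximum(labels); // "cat" occurs most often, so winner == "cat"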
private static string NextWordPos(FsmParseList nextParseList)
{
    var map = new CounterHashMap<string>();
    for (var i = 0; i < nextParseList.Size(); i++)
    {
        map.Put(nextParseList.GetFsmParse(i).GetPos());
    }
    return map.Max();
}
public void TestPut3()
{
    var random = new Random();
    var counterHashMap = new CounterHashMap<int>();
    for (var i = 0; i < 1000000; i++)
    {
        counterHashMap.Put(random.Next(1000000));
    }
    Assert.AreEqual(((Dictionary<int, int>) counterHashMap).Count / 1000000.0, 0.632, 0.001);
}
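// Why 0.632: drawing n = 1,000,000 keys uniformly at random from n possible values, the
// probability that a given value is never drawn is (1 - 1/n)^n ≈ 1/e, so the expected fraction
// of distinct keys is 1 - 1/e ≈ 0.632.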
public void TestMax()
{
    var counterHashMap = new CounterHashMap<string>();
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item3");
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item1");
    Assert.AreEqual("item1", counterHashMap.Max());
}
public void TestTopN2()
{
    var counterHashMap = new CounterHashMap<string>();
    for (var i = 0; i < 1000; i++)
    {
        counterHashMap.PutNTimes(i + "", 2 * i + 2);
    }
    Assert.AreEqual(990 + "", counterHashMap.TopN(10)[9].Key);
    Assert.AreEqual(900 + "", counterHashMap.TopN(100)[99].Key);
}
public void TestPutNTimes2()
{
    var random = new Random();
    var counterHashMap = new CounterHashMap<int>();
    for (var i = 0; i < 1000; i++)
    {
        counterHashMap.PutNTimes(random.Next(1000), i + 1);
    }
    Assert.AreEqual(500500, counterHashMap.SumOfCounts());
}
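// Why 500500: each iteration adds i + 1 to the total count regardless of which random key it
// lands on, so SumOfCounts equals 1 + 2 + ... + 1000 = 1000 * 1001 / 2 = 500500.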
public void TestMaxThreshold1()
{
    var counterHashMap = new CounterHashMap<string>();
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item3");
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item1");
    Assert.AreEqual("item1", counterHashMap.Max(0.4999));
    Assert.AreNotEqual("item1", counterHashMap.Max(0.5001));
}
/**
 * <summary>Counts words recursively given height and wordCounter.</summary>
 *
 * <param name="wordCounter">Word counter keeping symbols and their counts.</param>
 * <param name="height">Height for the N-Gram. If height = 1, the N-Gram is treated as a unigram; if
 * height = 2, as a bigram, etc.</param>
 */
public void CountWords(CounterHashMap<TSymbol> wordCounter, int height)
{
    if (height == 0)
    {
        wordCounter.PutNTimes(_symbol, _count);
    }
    else
    {
        foreach (var child in _children.Values)
        {
            child.CountWords(wordCounter, height - 1);
        }
    }
}
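// Hedged sketch of how CountWords aggregates one level of the N-Gram trie; the root-node variable
// name below is hypothetical. Called on the root with height = 1, the recursion descends one level
// and each node there deposits its symbol count, yielding unigram counts:
//
// var unigramCounts = new CounterHashMap<string>();
// rootNode.CountWords(unigramCounts, 1); // unigramCounts now maps each word to its corpus count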
public void TestPut1()
{
    var counterHashMap = new CounterHashMap<string>();
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item3");
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item1");
    Assert.AreEqual(3, counterHashMap.Count("item1"));
    Assert.AreEqual(2, counterHashMap.Count("item2"));
    Assert.AreEqual(1, counterHashMap.Count("item3"));
}
public void TestPutNTimes1()
{
    var counterHashMap = new CounterHashMap<string>();
    counterHashMap.PutNTimes("item1", 2);
    counterHashMap.PutNTimes("item2", 3);
    counterHashMap.PutNTimes("item3", 6);
    counterHashMap.PutNTimes("item1", 2);
    counterHashMap.PutNTimes("item2", 3);
    counterHashMap.PutNTimes("item1", 2);
    Assert.AreEqual(6, counterHashMap.Count("item1"));
    Assert.AreEqual(6, counterHashMap.Count("item2"));
    Assert.AreEqual(6, counterHashMap.Count("item3"));
}
public void TestTopN1()
{
    var counterHashMap = new CounterHashMap<string>();
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item3");
    counterHashMap.Put("item1");
    counterHashMap.Put("item2");
    counterHashMap.Put("item1");
    Assert.AreEqual("item1", counterHashMap.TopN(1)[0].Key);
    Assert.AreEqual("item2", counterHashMap.TopN(2)[1].Key);
    Assert.AreEqual("item3", counterHashMap.TopN(3)[2].Key);
}
public void TestMaxThreshold2()
{
    var random = new Random();
    var counterHashMap = new CounterHashMap<string>();
    for (var i = 0; i < 1000000; i++)
    {
        counterHashMap.Put(random.Next(100) + "");
    }
    var probability = counterHashMap.Count(counterHashMap.Max()) / 1000000.0;
    Assert.NotNull(counterHashMap.Max(probability - 0.001));
    Assert.Null(counterHashMap.Max(probability + 0.001));
}
/**
 * <summary>The Classify method takes two strings, the actual class and the predicted class. If the matrix
 * {@link HashMap} already contains the actual class as a key, the corresponding {@link CounterHashMap} is
 * retrieved; otherwise a new {@link CounterHashMap} is created. The predicted class is then counted in that
 * map, and the map is stored back in the matrix under the actual class.</summary>
 *
 * <param name="actualClass">String input actual class.</param>
 * <param name="predictedClass">String input predicted class.</param>
 */
public void Classify(string actualClass, string predictedClass)
{
    CounterHashMap<string> counterHashMap;
    if (_matrix.ContainsKey(actualClass))
    {
        counterHashMap = _matrix[actualClass];
    }
    else
    {
        counterHashMap = new CounterHashMap<string>();
    }
    counterHashMap.Put(predictedClass);
    _matrix[actualClass] = counterHashMap;
}
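// Hedged usage sketch for Classify; the variable name and class labels are invented. Each call
// increments the (actual, predicted) cell of the confusion matrix, so after these three calls the
// "spam" row holds the counts { "spam": 2, "ham": 1 }:
//
// confusionMatrix.Classify("spam", "spam");
// confusionMatrix.Classify("spam", "ham");
// confusionMatrix.Classify("spam", "spam");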
public void TestDependencyCorpus()
{
    var relationCounts = new CounterHashMap<TurkishDependencyType>();
    var corpus = new TurkishDependencyTreeBankCorpus("metu-treebank.xml");
    Assert.AreEqual(5635, corpus.SentenceCount());
    var wordCount = 0;
    for (var i = 0; i < corpus.SentenceCount(); i++)
    {
        var sentence = (TurkishDependencyTreeBankSentence) corpus.GetSentence(i);
        wordCount += sentence.WordCount();
        for (var j = 0; j < sentence.WordCount(); j++)
        {
            var word = (TurkishDependencyTreeBankWord) sentence.GetWord(j);
            if (word.GetRelation() != null)
            {
                relationCounts.Put(word.GetRelation().GetTurkishDependencyType());
            }
        }
    }
    Assert.AreEqual(11692, relationCounts[TurkishDependencyType.MODIFIER]);
    Assert.AreEqual(903, relationCounts[TurkishDependencyType.INTENSIFIER]);
    Assert.AreEqual(1142, relationCounts[TurkishDependencyType.LOCATIVE_ADJUNCT]);
    Assert.AreEqual(240, relationCounts[TurkishDependencyType.VOCATIVE]);
    Assert.AreEqual(7261, relationCounts[TurkishDependencyType.SENTENCE]);
    Assert.AreEqual(16, relationCounts[TurkishDependencyType.EQU_ADJUNCT]);
    Assert.AreEqual(159, relationCounts[TurkishDependencyType.NEGATIVE_PARTICLE]);
    Assert.AreEqual(4481, relationCounts[TurkishDependencyType.SUBJECT]);
    Assert.AreEqual(2476, relationCounts[TurkishDependencyType.COORDINATION]);
    Assert.AreEqual(2050, relationCounts[TurkishDependencyType.CLASSIFIER]);
    Assert.AreEqual(73, relationCounts[TurkishDependencyType.COLLOCATION]);
    Assert.AreEqual(1516, relationCounts[TurkishDependencyType.POSSESSOR]);
    Assert.AreEqual(523, relationCounts[TurkishDependencyType.ABLATIVE_ADJUNCT]);
    Assert.AreEqual(23, relationCounts[TurkishDependencyType.FOCUS_PARTICLE]);
    Assert.AreEqual(1952, relationCounts[TurkishDependencyType.DETERMINER]);
    Assert.AreEqual(1361, relationCounts[TurkishDependencyType.DATIVE_ADJUNCT]);
    Assert.AreEqual(202, relationCounts[TurkishDependencyType.APPOSITION]);
    Assert.AreEqual(289, relationCounts[TurkishDependencyType.QUESTION_PARTICLE]);
    Assert.AreEqual(597, relationCounts[TurkishDependencyType.S_MODIFIER]);
    Assert.AreEqual(10, relationCounts[TurkishDependencyType.ETOL]);
    Assert.AreEqual(8338, relationCounts[TurkishDependencyType.OBJECT]);
    Assert.AreEqual(271, relationCounts[TurkishDependencyType.INSTRUMENTAL_ADJUNCT]);
    Assert.AreEqual(85, relationCounts[TurkishDependencyType.RELATIVIZER]);
    Assert.AreEqual(53993, wordCount);
}
/**
 * <summary>Constructs a dictionary of non-rare words with the given N-Gram level and probability
 * threshold.</summary>
 *
 * <param name="level">Level for counting words. Counts for different levels of the N-Gram can be set. If
 * level = 1, the N-Gram is treated as a unigram; if level = 2, as a bigram, etc.</param>
 * <param name="probability">Probability threshold for non-rare words.</param>
 * <returns>{@link HashSet} of non-rare words.</returns>
 */
public HashSet<TSymbol> ConstructDictionaryWithNonRareWords(int level, double probability)
{
    var result = new HashSet<TSymbol>();
    var wordCounter = new CounterHashMap<TSymbol>();
    rootNode.CountWords(wordCounter, level);
    var sum = wordCounter.SumOfCounts();
    foreach (var symbol in wordCounter.Keys)
    {
        if (wordCounter[symbol] / (sum + 0.0) > probability)
        {
            result.Add(symbol);
        }
    }
    return result;
}
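// Hedged worked example for ConstructDictionaryWithNonRareWords; the word counts are invented.
// With unigram counts { "the": 50, "a": 45, "zyzzyva": 5 } and sum 100, a call with level = 1 and
// probability = 0.1 keeps only words whose relative frequency exceeds 0.1, i.e. { "the", "a" }.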
public void TestAdd3()
{
    var counterHashMap1 = new CounterHashMap<int>();
    for (var i = 0; i < 1000; i++)
    {
        counterHashMap1.Put(i);
    }
    var counterHashMap2 = new CounterHashMap<int>();
    for (var i = 500; i < 1000; i++)
    {
        counterHashMap2.PutNTimes(1000 + i, i + 1);
    }
    counterHashMap1.Add(counterHashMap2);
    Assert.AreEqual(1500, ((Dictionary<int, int>) counterHashMap1).Count);
}
public void TestPut2()
{
    var random = new Random();
    var counterHashMap = new CounterHashMap<int>();
    for (var i = 0; i < 1000; i++)
    {
        counterHashMap.Put(random.Next(1000));
    }
    var count = 0;
    for (var i = 0; i < 1000; i++)
    {
        count += counterHashMap.Count(i);
    }
    Assert.AreEqual(1000, count);
}
public void TestAdd2()
{
    var counterHashMap1 = new CounterHashMap<string>();
    counterHashMap1.Put("item1");
    counterHashMap1.Put("item2");
    counterHashMap1.Put("item1");
    counterHashMap1.Put("item2");
    counterHashMap1.Put("item1");
    var counterHashMap2 = new CounterHashMap<string>();
    counterHashMap2.Put("item4");
    counterHashMap2.PutNTimes("item5", 4);
    counterHashMap2.Put("item2");
    counterHashMap1.Add(counterHashMap2);
    Assert.AreEqual(3, counterHashMap1.Count("item1"));
    Assert.AreEqual(3, counterHashMap1.Count("item2"));
    Assert.AreEqual(1, counterHashMap1.Count("item4"));
    Assert.AreEqual(4, counterHashMap1.Count("item5"));
}
public void TestNERCorpus()
{
    var counter = new CounterHashMap<NamedEntityType>();
    var nerCorpus = new NERCorpus("../../../nerdata.txt");
    Assert.AreEqual(27556, nerCorpus.SentenceCount());
    Assert.AreEqual(492233, nerCorpus.NumberOfWords());
    for (var i = 0; i < nerCorpus.SentenceCount(); i++)
    {
        var namedEntitySentence = (NamedEntitySentence) nerCorpus.GetSentence(i);
        for (var j = 0; j < namedEntitySentence.WordCount(); j++)
        {
            var namedEntityWord = (NamedEntityWord) namedEntitySentence.GetWord(j);
            counter.Put(namedEntityWord.GetNamedEntityType());
        }
    }
    Assert.AreEqual(438976, counter[NamedEntityType.NONE]);
    Assert.AreEqual(23878, counter[NamedEntityType.PERSON]);
    Assert.AreEqual(16931, counter[NamedEntityType.ORGANIZATION]);
    Assert.AreEqual(12448, counter[NamedEntityType.LOCATION]);
}
public void TestAdd1()
{
    var counterHashMap1 = new CounterHashMap<string>();
    counterHashMap1.Put("item1");
    counterHashMap1.Put("item2");
    counterHashMap1.Put("item3");
    counterHashMap1.Put("item1");
    counterHashMap1.Put("item2");
    counterHashMap1.Put("item1");
    var counterHashMap2 = new CounterHashMap<string>();
    counterHashMap2.PutNTimes("item1", 2);
    counterHashMap2.PutNTimes("item2", 3);
    counterHashMap2.PutNTimes("item3", 6);
    counterHashMap2.PutNTimes("item1", 2);
    counterHashMap2.PutNTimes("item2", 3);
    counterHashMap2.PutNTimes("item1", 2);
    counterHashMap1.Add(counterHashMap2);
    Assert.AreEqual(9, counterHashMap1.Count("item1"));
    Assert.AreEqual(8, counterHashMap1.Count("item2"));
    Assert.AreEqual(7, counterHashMap1.Count("item3"));
}
public void TestStartEndStates()
{
    var endStateCount = 0;
    foreach (var state in stateList)
    {
        if (state.IsEndState())
        {
            endStateCount++;
        }
    }
    Assert.AreEqual(35, endStateCount);
    var posCounts = new CounterHashMap<string>();
    foreach (var state in stateList)
    {
        if (state.GetPos() != null)
        {
            posCounts.Put(state.GetPos());
        }
    }
    Assert.AreEqual(1, posCounts["HEAD"]);
    Assert.AreEqual(6, posCounts["PRON"]);
    Assert.AreEqual(1, posCounts["PROP"]);
    Assert.AreEqual(8, posCounts["NUM"]);
    Assert.AreEqual(7, posCounts["ADJ"]);
    Assert.AreEqual(1, posCounts["INTERJ"]);
    Assert.AreEqual(1, posCounts["DET"]);
    Assert.AreEqual(1, posCounts["ADVERB"]);
    Assert.AreEqual(1, posCounts["QUES"]);
    Assert.AreEqual(1, posCounts["CONJ"]);
    Assert.AreEqual(26, posCounts["VERB"]);
    Assert.AreEqual(1, posCounts["POSTP"]);
    Assert.AreEqual(1, posCounts["DUP"]);
    Assert.AreEqual(11, posCounts["NOUN"]);
}
/**
 * <summary>Empty constructor for {@link TurkishDependencyTreeBankCorpus}. Initializes the sentences and
 * wordList attributes.</summary>
 */
public TurkishDependencyTreeBankCorpus()
{
    sentences = new List<Sentence>();
    wordList = new CounterHashMap<Word>();
}
/**
 * <summary>A constructor of {@link PosTaggedCorpus} which initializes the sentences of the corpus, the word
 * list of the corpus, and all possible tags.</summary>
 */
public PosTaggedCorpus()
{
    sentences = new List<Sentence>();
    wordList = new CounterHashMap<Word>();
    _tagList = new CounterHashMap<string>();
}
/**
 * <summary>Constructor which creates an {@link ArrayList} of sentences and a {@link CounterHashMap} of
 * wordList.</summary>
 */
public DisambiguationCorpus()
{
    sentences = new List<Sentence>();
    wordList = new CounterHashMap<Word>();
}
/**
 * <summary>A constructor of the {@link Corpus} class which creates a new {@link ArrayList} for sentences and
 * a {@link CounterHashMap} for wordList.</summary>
 */
public Corpus()
{
    sentences = new List<Sentence>();
    paragraphs = new List<Paragraph>();
    wordList = new CounterHashMap<Word>();
}