/// <summary>
/// Verifies that AddSafely inserts the key/value pair when the key is not yet
/// present in the dictionary.
/// </summary>
public void AddSafely_KeyAbsent_AddsKeyAndValue()
{
    // arrange: the expected end state holds the single pair (1, 1); the subject starts empty
    var expected = new Dictionary<int, int>();
    expected.Add(1, 1);
    var subject = new Dictionary<int, int>();

    // act
    subject.AddSafely(1, 1);

    // assert
    subject.Should().BeEquivalentTo(expected);
}
/// <summary>
/// Verifies that AddSafely overwrites the stored value when the key is already
/// present in the dictionary.
/// </summary>
public void AddSafely_KeyPresent_ReplacesValueOfKey()
{
    // arrange: the subject already maps 1 -> 1; after the call it should map 1 -> 2
    var expected = new Dictionary<int, int>();
    expected.Add(1, 2);
    var subject = new Dictionary<int, int>();
    subject.Add(1, 1);

    // act
    subject.AddSafely(1, 2);

    // assert
    subject.Should().BeEquivalentTo(expected);
}
/// <summary>
/// Projects every element of <paramref name="source"/> to a key and a value and
/// collects the pairs into a new dictionary that uses <paramref name="comparer"/>
/// for key equality.
/// </summary>
/// <remarks>
/// Pairs are inserted through AddSafely (defined elsewhere), so the outcome for a
/// repeated key is whatever AddSafely does for an existing key.
/// </remarks>
/// <param name="source">Sequence to convert.</param>
/// <param name="keySelector">Produces the dictionary key for an element.</param>
/// <param name="elementSelector">Produces the dictionary value for an element.</param>
/// <param name="comparer">Equality comparer passed to the dictionary constructor.</param>
/// <returns>The populated dictionary.</returns>
public static Dictionary<TKey, TElement> ToDictionarySafely<TSource, TKey, TElement>(
    this IEnumerable<TSource> source,
    Func<TSource, TKey> keySelector,
    Func<TSource, TElement> elementSelector,
    IEqualityComparer<TKey> comparer)
{
    var result = new Dictionary<TKey, TElement>(comparer);

    foreach (var element in source)
    {
        // Key is computed before the value, matching the argument evaluation order.
        var key = keySelector(element);
        var value = elementSelector(element);
        result.AddSafely(key, value);
    }

    return result;
}
/// <summary>
/// Loads trained word counts and n-gram counts from a text file into the two
/// supplied dictionaries.
/// </summary>
/// <remarks>
/// Every line is split on a tab into a tagged key and a value. The last
/// space-separated token of the tagged key is its tag; the remaining tokens,
/// re-joined with single spaces, form the stored key. Entries tagged
/// <c>WORDTAG</c> go to <paramref name="WordsCount"/>; everything else goes to
/// <paramref name="NGrams"/>. The value is stored as the raw text read from the file.
/// </remarks>
/// <param name="DictionaryFile">Path of the file to read.</param>
/// <param name="WordsCount">Receives the entries tagged <c>WORDTAG</c>.</param>
/// <param name="NGrams">Receives all other entries.</param>
public static void GetDictionaries(string DictionaryFile, Dictionary WordsCount, Dictionary NGrams)
{
    foreach (var record in File.ReadAllLines(DictionaryFile))
    {
        var fields = record.Split('\t');
        var taggedKey = fields[0];
        var storedValue = fields[1];

        var tokens = taggedKey.Split(' ');
        var tag = tokens.Last();
        var key = string.Join(" ", tokens.Take(tokens.Length - 1));

        // The tag decides which of the two dictionaries receives the entry.
        var target = tag == "WORDTAG" ? WordsCount : NGrams;
        target.AddSafely(key, storedValue);
    }
}
/// <summary>
/// Reads "key TAG&lt;tab&gt;count" records from standard input, collects word/tag
/// counts and n-gram counts, and writes them to standard output with rare words
/// pooled under the RARE_WORD pseudo-word.
/// </summary>
/// <remarks>
/// The last space-separated token of the key part is the record type: records
/// typed <c>WORDTAG</c> and <c>NGRAM</c> are collected, any other type is ignored.
/// Blank or whitespace-only lines are skipped. Output is ordered by count
/// descending, then key ascending. Standard output is closed when done.
/// </remarks>
/// <param name="rareWordsLimit">
/// Word/tag entries whose count is below this limit are not written individually;
/// their counts are pooled per tag instead.
/// </param>
public void Reduce(int rareWordsLimit = 3)
{
    var wordsCount = new Dictionary();
    var ngramsCount = new Dictionary();

    string line;
    while ((line = Console.ReadLine()) != null)
    {
        // A blank line carries no record; without this guard kvp[1] below would
        // throw IndexOutOfRangeException.
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        var kvp = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        var val = int.Parse(kvp[1]);

        var keySplits = kvp[0].Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        var typeTag = keySplits.Last();
        var key = string.Join(" ", keySplits.Take(keySplits.Length - 1));

        if (typeTag == "WORDTAG")
        {
            wordsCount.AddSafely(key, val);
        }
        else if (typeTag == "NGRAM")
        {
            ngramsCount.AddSafely(key, val);
        }
    }

    // Pairs for rare words: tag -> quantity.
    // NOTE(review): this only aggregates per tag if AddSafely combines the new
    // value with an existing one - confirm against its implementation.
    var rareWordsCount = new Dictionary();

    // Single sort: highest count first, ties broken by key ascending.
    var wordsCountOrdered = wordsCount.OrderByDescending(p => p.Value).ThenBy(p => p.Key);
    foreach (var wordCount in wordsCountOrdered)
    {
        if (wordCount.Value >= rareWordsLimit)
        {
            Console.WriteLine(string.Format("{1} WORDTAG\t{0}", wordCount.Value, wordCount.Key));
        }
        else
        {
            // The tag is the last token of the "word tag" key.
            var tag = wordCount.Key.Split(' ').Last();
            rareWordsCount.AddSafely(tag, wordCount.Value);
        }
    }

    foreach (var rareWordCount in rareWordsCount)
    {
        Console.WriteLine(string.Format("{1} {2} WORDTAG\t{0}", rareWordCount.Value, RARE_WORD, rareWordCount.Key));
    }

    foreach (var ngramCount in ngramsCount.OrderByDescending(p => p.Value).ThenBy(p => p.Key))
    {
        // The n-gram order written in the tag is the number of space-separated tokens in the key.
        var order = ngramCount.Key.Split(new[] { ' ' }).Length;
        Console.WriteLine(string.Format("{2} {1}-GRAM\t{0}", ngramCount.Value, order, ngramCount.Key));
    }

    Console.Out.Close();
}
/// <summary>
/// Reads sentences from standard input, one per line, and for each sentence
/// writes its word/tag entries followed by its tag n-gram entries to standard output.
/// </summary>
/// <remarks>
/// A line is split on tabs into "word tag" pairs; the tag is the second
/// space-separated token of a pair. For every order from 1 to
/// <paramref name="nGramCount"/> the tag sequence is padded with (order - 1)
/// START_TAG entries and, for orders above 1, one trailing STOP_TAG, and every
/// window of that length is recorded. All per-sentence state is cleared before
/// the next line is read. Standard output is closed when input ends.
/// </remarks>
/// <param name="nGramCount">Highest n-gram order to produce.</param>
public void Map(int nGramCount = 3)
{
    var wordTagEntries = new Dictionary();
    var ngramEntries = new Dictionary();
    var sentenceTags = new List<string>();
    var pairSeparator = new[] { '\t' };

    for (var sentence = Console.ReadLine(); sentence != null; sentence = Console.ReadLine())
    {
        foreach (var pair in sentence.Split(pairSeparator, StringSplitOptions.RemoveEmptyEntries))
        {
            // Record the word/tag pair and remember its tag for the n-gram pass.
            wordTagEntries.AddSafely(pair, 1);
            sentenceTags.Add(pair.Split(' ')[1]);
        }

        foreach (var entry in wordTagEntries)
        {
            var wordLine = string.Format(WordOutputFormatString, entry.Key, entry.Value);
            Console.WriteLine(wordLine);
        }

        for (var order = 1; order <= nGramCount; order++)
        {
            var padded = Enumerable.Repeat(START_TAG, order - 1).Concat(sentenceTags);

            // Unigrams: one window per tag. Higher orders also get a STOP tag and
            // therefore one extra window, e.g. "* * A B STOP" yields 3 trigrams.
            var windowCount = sentenceTags.Count;
            if (order > 1)
            {
                padded = padded.Concat(new[] { STOP_TAG });
                windowCount = windowCount + 1;
            }

            for (var offset = 0; offset < windowCount; offset++)
            {
                var window = padded.Skip(offset).Take(order);
                ngramEntries.AddSafely(string.Join(" ", window), 1);
            }

            // The bare start-of-sentence context is recorded as well; it is needed
            // later when calculating probabilities in the HMM.
            if (order > 1)
            {
                var startContext = Enumerable.Repeat(START_TAG, order - 1);
                ngramEntries.AddSafely(string.Join(" ", startContext), 1);
            }
        }

        foreach (var entry in ngramEntries)
        {
            var ngramLine = string.Format(NGramOutputFormatString, entry.Key, entry.Value);
            Console.WriteLine(ngramLine);
        }

        wordTagEntries.Clear();
        sentenceTags.Clear();
        ngramEntries.Clear();
    }

    Console.Out.Close();
}
/// <summary>
/// Rewrites the counts file "../../App_Data/gene.counts" into
/// "../../App_Data/gene.counts.out", pooling infrequent words into rare-word classes.
/// </summary>
/// <remarks>
/// Input lines are space separated: "count WORDTAG tag word" (for example
/// "1 WORDTAG O mind") or "count N-GRAM tags..." (for example "749 3-GRAM * * I-GENE").
/// Words whose <c>sum</c> is at least 5 are written as "word tag WORDTAG&lt;tab&gt;count".
/// The remaining words are grouped into the classes NUMERIC, ALL_CAPITAL,
/// LAST_CAPITAL and RARE and written as "_RARE_class_ tag WORDTAG&lt;tab&gt;count".
/// N-gram lines are re-emitted as "tags... N-GRAM&lt;tab&gt;count".
/// Lines that contain no tokens are skipped.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Words with a total below this are treated as rare.
    // NOTE(review): WordCount.sum is defined elsewhere; presumably cntO + cntG - confirm.
    const int rareThreshold = 5;

    // The using blocks close both files even when a malformed line throws;
    // disposing the writer also flushes everything written through Console.Out.
    using (var input = new StreamReader("../../App_Data/gene.counts"))
    using (var output = new StreamWriter("../../App_Data/gene.counts.out"))
    {
        Console.SetIn(input);
        Console.SetOut(output);

        // 'counts' keeps first-seen order (it drives the order of the rare-word groups);
        // 'countsByWord' indexes the same objects so each lookup is O(1) instead of a list scan.
        List<WordCount> counts = new List<WordCount>();
        var countsByWord = new Dictionary<string, WordCount>();
        Dictionary ngrams = new Dictionary();

        string line;
        while ((line = Console.ReadLine()) != null)
        {
            var spts = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (spts.Length == 0)
            {
                // Empty or whitespace-only line: nothing to record.
                continue;
            }

            if (spts[1] == "WORDTAG")
            {
                // in:  "1 WORDTAG O mind"
                // out: "_RARE_ O WORDTAG 29683"
                WordCount wc;
                if (!countsByWord.TryGetValue(spts[3], out wc))
                {
                    wc = new WordCount { word = spts[3], cntG = 0, cntO = 0 };
                    counts.Add(wc);
                    countsByWord.Add(spts[3], wc);
                }

                // Tag "O" feeds cntO; any other tag is counted as a gene tag.
                if (spts[2] == "O")
                {
                    wc.cntO += int.Parse(spts[0]);
                }
                else
                {
                    wc.cntG += int.Parse(spts[0]);
                }
            }
            else
            {
                // in:  "749 3-GRAM * * I-GENE"
                // out: "* * O 3-GRAM 13047"
                ngrams.AddSafely(string.Format("{0} {1}", string.Join(" ", spts.Skip(2)), spts[1]), spts[0]);
            }
        }

        var words = counts.Where(p => p.sum >= rareThreshold);
        var rareWords = counts.Where(p => p.sum < rareThreshold);

        // Highest total first, ties broken alphabetically by word.
        foreach (var word in words.OrderByDescending(p => p.sum).ThenBy(p => p.word))
        {
            if (word.cntO != 0)
            {
                Console.WriteLine(string.Format("{0} O WORDTAG\t{1}", word.word, word.cntO));
            }
            if (word.cntG != 0)
            {
                Console.WriteLine(string.Format("{0} I-GENE WORDTAG\t{1}", word.word, word.cntG));
            }
        }

        // Rare-word classes are checked in this order; the first match wins.
        var rares = rareWords.GroupBy(p =>
        {
            if (Regex.IsMatch(p.word, "\\d+"))
            {
                return "NUMERIC";
            }
            if (p.word.ToUpper() == p.word)
            {
                return "ALL_CAPITAL";
            }

            // Any last character that upper-casing leaves unchanged qualifies,
            // which includes punctuation as well as capital letters.
            var lastChar = p.word.Last().ToString();
            if (lastChar.ToUpper() == lastChar)
            {
                return "LAST_CAPITAL";
            }
            return "RARE";
        });

        // Both tag lines are written for every class, even when a sum is zero.
        foreach (var rare in rares)
        {
            Console.WriteLine(string.Format("_RARE_{0}_ O WORDTAG\t{1}", rare.Key, rare.Sum(p => p.cntO)));
            Console.WriteLine(string.Format("_RARE_{0}_ I-GENE WORDTAG\t{1}", rare.Key, rare.Sum(p => p.cntG)));
        }

        // NOTE(review): the count is passed to AddSafely as text (spts[0]); if the
        // dictionary stores it as a string this ordering is lexicographic, not numeric - confirm.
        foreach (var kvp in ngrams.OrderByDescending(p => p.Value).ThenBy(p => p.Key))
        {
            Console.WriteLine(string.Format("{0}\t{1}", kvp.Key, kvp.Value));
        }
    }
}