Example #1
0
        /// <summary>
        /// Reads a tab-separated dictionary file and distributes its entries between
        /// the two supplied dictionaries.
        /// </summary>
        /// <remarks>
        /// Every non-blank line is split on tabs: field 0 is the key text and field 1 the value.
        /// The last space-separated token of the key text is treated as a type tag; it is removed
        /// and the remaining tokens, re-joined with single spaces, form the dictionary key.
        /// Entries whose tag is <c>WORDTAG</c> are added to <paramref name="WordsCount"/>, every
        /// other entry is added to <paramref name="NGrams"/>.
        /// Blank (empty or whitespace-only) lines are skipped. A non-blank line that contains no
        /// tab still fails with <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="DictionaryFile">Path of the file to read.</param>
        /// <param name="WordsCount">Receives the entries tagged <c>WORDTAG</c>.</param>
        /// <param name="NGrams">Receives all other entries.</param>
        public static void GetDictionaries(string DictionaryFile, Dictionary WordsCount, Dictionary NGrams)
        {
            // File.ReadLines streams the file line by line instead of loading it entirely first.
            foreach (var ln in File.ReadLines(DictionaryFile))
            {
                // A blank line carries no entry; indexing its fields below would throw.
                if (string.IsNullOrWhiteSpace(ln))
                {
                    continue;
                }

                var kvp = ln.Split('\t');

                var srcKey = kvp[0];
                var srcVal = kvp[1];

                // Split always yields at least one element, so the index arithmetic below is safe.
                var keySplits = srcKey.Split(' ');
                var lastIndex = keySplits.Length - 1;
                var keyTag = keySplits[lastIndex];

                // Everything before the trailing tag; empty when the key text is a single token.
                var key = string.Join(" ", keySplits, 0, lastIndex);

                if (keyTag == "WORDTAG")
                {
                    WordsCount.AddSafely(key, srcVal);
                }
                else
                {
                    NGrams.AddSafely(key, srcVal);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Reads <c>key TAG&lt;tab&gt;count</c> records from standard input, aggregates them, and
        /// writes the reduced counts to standard output.
        /// </summary>
        /// <remarks>
        /// The last space-separated token of the key field selects the bucket: <c>WORDTAG</c>
        /// records are accumulated as word counts, <c>NGRAM</c> records as n-gram counts, and
        /// records with any other tag are ignored. Blank input lines are skipped.
        /// Words whose count reaches <paramref name="rareWordsLimit"/> are written unchanged;
        /// the remaining words are merged into one <c>RARE_WORD</c> entry per trailing key token.
        /// Word and n-gram output is ordered by count descending, then by key ascending.
        /// Standard output is closed before returning.
        /// </remarks>
        /// <param name="rareWordsLimit">Minimum count for a word to be written individually.</param>
        public void Reduce(int rareWordsLimit = 3)
        {
            var wordsCount = new Dictionary();
            var ngramsCount = new Dictionary();

            string line;
            while ((line = Console.ReadLine()) != null)
            {
                // A blank line has no key/count fields; indexing them below would throw.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var kvp = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                var val = int.Parse(kvp[1]);

                var keySplits = kvp[0].Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                var typeTag = keySplits.Last();
                var key = string.Join(" ", keySplits.Take(keySplits.Length - 1));

                if (typeTag == "WORDTAG")
                {
                    wordsCount.AddSafely(key, val);
                }
                else if (typeTag == "NGRAM")
                {
                    ngramsCount.AddSafely(key, val);
                }
            }

            // Accumulates the counts of rare words, keyed by the last token of the word key
            // (presumably the tag of a "word tag" key - verify against the mapper output).
            var rareWordsCount = new Dictionary();

            // Single sort: count descending, ties broken by key ascending.
            var wordsCountOrdered = wordsCount.OrderByDescending(p => p.Value).ThenBy(p => p.Key);
            foreach (var wordCount in wordsCountOrdered)
            {
                if (wordCount.Value >= rareWordsLimit)
                {
                    Console.WriteLine(string.Format("{1} WORDTAG\t{0}", wordCount.Value, wordCount.Key));
                }
                else
                {
                    var tag = wordCount.Key.Split(' ').Last();
                    rareWordsCount.AddSafely(tag, wordCount.Value);
                }
            }

            foreach (var rareWordCount in rareWordsCount)
            {
                Console.WriteLine(string.Format("{1} {2} WORDTAG\t{0}", rareWordCount.Value, RARE_WORD, rareWordCount.Key));
            }

            foreach (var ngramCount in ngramsCount.OrderByDescending(p => p.Value).ThenBy(p => p.Key))
            {
                // The n-gram order written in the tag is the number of space-separated tokens in the key.
                var order = ngramCount.Key.Split(new[] { ' ' }).Length;
                Console.WriteLine(string.Format("{2} {1}-GRAM\t{0}", ngramCount.Value, order, ngramCount.Key));
            }

            Console.Out.Close();
        }
Example #3
0
        /// <summary>
        /// Reads sentences from standard input and writes per-sentence word/tag counts and
        /// tag n-gram counts to standard output.
        /// </summary>
        /// <remarks>
        /// Each input line is one sentence made of tab-separated pairs; the second
        /// space-separated token of every pair is collected as its tag. For each line the
        /// method writes one record per distinct pair (using <c>WordOutputFormatString</c>)
        /// followed by one record per distinct n-gram (using <c>NGramOutputFormatString</c>)
        /// for every order from 1 to <paramref name="nGramCount"/>. All counters are reset
        /// after each line. Standard output is closed before returning.
        /// A pair without a space fails with <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="nGramCount">Highest n-gram order to emit.</param>
        public void Map(int nGramCount = 3)
        {
            var counters = new Dictionary();
            var ngrams = new Dictionary();
            var tags = new List<string>();

            string line;
            while ((line = Console.ReadLine()) != null)
            {
                var wordTagPairs = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (var wordTagPair in wordTagPairs)
                {
                    // Count the pair itself.
                    counters.AddSafely(wordTagPair, 1);

                    // Remember the tag (second token of the pair) for the n-gram pass.
                    tags.Add(wordTagPair.Split(' ')[1]);
                }

                foreach (var counter in counters)
                {
                    Console.WriteLine(string.Format(WordOutputFormatString, counter.Key, counter.Value));
                }

                // NOTE(review): a line with no pairs still emits the boundary-only n-grams
                // (start tags followed by the stop tag) for every order above 1 - confirm intended.
                for (var i = 1; i <= nGramCount; i++)
                {
                    // Orders above 1 are padded with i - 1 start tags and one stop tag,
                    // e.g. "* * A B STOP" for i = 3; unigrams use the bare tag list.
                    var padded = Enumerable.Repeat(START_TAG, i - 1).Concat(tags);
                    if (i > 1)
                    {
                        padded = padded.Concat(new[] { STOP_TAG });
                    }

                    // Materialise once so each window below is a direct array slice
                    // rather than a fresh walk over the lazy sequence.
                    var sequence = padded.ToArray();

                    // Number of full windows of length i: tags.Count for unigrams,
                    // tags.Count + 1 for higher orders (the last window ends on the stop tag).
                    var windowCount = sequence.Length - i + 1;

                    for (var k = 0; k < windowCount; k++)
                    {
                        ngrams.AddSafely(string.Join(" ", sequence, k, i), 1);
                    }

                    // Also count the run of i - 1 start tags on its own; the original comment
                    // says this is needed for calculating probabilities in the HMM.
                    if (i > 1)
                    {
                        var startSentenceTag = string.Join(" ", Enumerable.Repeat(START_TAG, i - 1));

                        ngrams.AddSafely(startSentenceTag, 1);
                    }
                }

                foreach (var ngram in ngrams)
                {
                    Console.WriteLine(string.Format(NGramOutputFormatString, ngram.Key, ngram.Value));
                }

                // Counts are emitted per sentence, so start the next line from a clean state.
                counters.Clear();
                tags.Clear();
                ngrams.Clear();
            }

            Console.Out.Close();
        }
Example #4
0
        /// <summary>
        /// Rewrites <c>../../App_Data/gene.counts</c> into <c>../../App_Data/gene.counts.out</c>,
        /// replacing infrequent words with rare-word class entries.
        /// </summary>
        /// <remarks>
        /// Input lines are space-separated. <c>count WORDTAG tag word</c> lines are accumulated per
        /// word into an "O" counter (tag <c>O</c>) or a gene counter (any other tag); every other
        /// non-blank line is treated as an n-gram record <c>count TYPE tokens...</c>.
        /// Words whose total count is at least 5 are written individually. The remaining words are
        /// grouped into the classes NUMERIC, ALL_CAPITAL, LAST_CAPITAL and RARE, and one
        /// <c>_RARE_class_</c> line per tag is written for each class. N-gram records are written
        /// last. Console input and output are redirected to the two files for the duration of the run.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Words with a total count below this value are folded into a rare-word class.
            const int rareThreshold = 5;

            using (var input = new StreamReader("../../App_Data/gene.counts"))
            {
                Console.SetIn(input);
                Console.SetOut(new StreamWriter("../../App_Data/gene.counts.out"));

                try
                {
                    // The list keeps first-seen order for the output; the dictionary gives
                    // constant-time lookup by word instead of scanning the list per line.
                    List<WordCount> counts = new List<WordCount>();
                    var countsByWord = new Dictionary<string, WordCount>();
                    Dictionary ngrams = new Dictionary();

                    string line;
                    while ((line = Console.ReadLine()) != null)
                    {
                        var spts = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                        // Empty or spaces-only line: nothing to index.
                        if (spts.Length == 0)
                        {
                            continue;
                        }

                        if (spts[1] == "WORDTAG")
                        {
                            // in:  "1 WORDTAG O mind"
                            // out: "_RARE_ O WORDTAG	29683"
                            WordCount wc;
                            if (!countsByWord.TryGetValue(spts[3], out wc))
                            {
                                wc = new WordCount { word = spts[3], cntG = 0, cntO = 0 };

                                counts.Add(wc);
                                countsByWord.Add(wc.word, wc);
                            }

                            if (spts[2] == "O")
                            {
                                wc.cntO += int.Parse(spts[0]);
                            }
                            else
                            {
                                wc.cntG += int.Parse(spts[0]);
                            }
                        }
                        else
                        {
                            // in:  "749 3-GRAM * * I-GENE"
                            // out: "* * O 3-GRAM	13047"
                            ngrams.AddSafely(string.Format("{0} {1}", string.Join(" ", spts.Skip(2)), spts[1]), spts[0]);
                        }
                    }

                    var words = counts.Where(p => p.sum >= rareThreshold);
                    var rareWords = counts.Where(p => p.sum < rareThreshold);

                    // Total count descending, ties broken by word ascending.
                    foreach (var word in words.OrderByDescending(p => p.sum).ThenBy(p => p.word))
                    {
                        if (word.cntO != 0)
                        {
                            Console.WriteLine(string.Format("{0} O WORDTAG\t{1}", word.word, word.cntO));
                        }

                        if (word.cntG != 0)
                        {
                            Console.WriteLine(string.Format("{0} I-GENE WORDTAG\t{1}", word.word, word.cntG));
                        }
                    }

                    // Classify each rare word; the first matching rule wins.
                    var rares = rareWords.GroupBy(p =>
                    {
                        // Contains at least one digit.
                        if (Regex.IsMatch(p.word, "\\d+"))
                        {
                            return "NUMERIC";
                        }

                        // Unchanged by upper-casing; this also holds for words with no letters at all.
                        if (p.word.ToUpper() == p.word)
                        {
                            return "ALL_CAPITAL";
                        }

                        // Last character is unchanged by upper-casing.
                        var lastChar = p.word.Last().ToString();
                        if (lastChar.ToUpper() == lastChar)
                        {
                            return "LAST_CAPITAL";
                        }

                        return "RARE";
                    });

                    // Both tag lines are written for every class, even when a sum is zero.
                    foreach (var rare in rares)
                    {
                        Console.WriteLine(string.Format("_RARE_{0}_ O WORDTAG\t{1}", rare.Key, rare.Sum(p => p.cntO)));
                        Console.WriteLine(string.Format("_RARE_{0}_ I-GENE WORDTAG\t{1}", rare.Key, rare.Sum(p => p.cntG)));
                    }

                    // NOTE(review): the n-gram counts are passed to AddSafely as strings; if they are
                    // stored as strings, ordering by Value here is lexicographic, not numeric - confirm.
                    foreach (var kvp in ngrams.OrderByDescending(p => p.Value).ThenBy(p => p.Key))
                    {
                        Console.WriteLine(string.Format("{0}\t{1}", kvp.Key, kvp.Value));
                    }
                }
                finally
                {
                    // Flush and close the output file even when processing fails part-way.
                    Console.Out.Close();
                }
            }
        }