Beispiel #1
0
        /// <summary>
        /// Interactively post-processes the word frequency dictionary: asks for a minimum frequency on the
        /// console, loads <c>frequencies.txt</c>, groups the entries by their lower-cased word, folds the
        /// counts of sentence-initial occurrences into matching non-initial occurrences, and writes
        /// <c>post-processed-frequencies.txt</c> (debug dump) and <c>frequency-list.txt</c> (final list).
        /// </summary>
        /// <remarks>
        /// All files are located in the "frequencies" folder below <c>Utilities.PathToDownloadDirectory</c>.
        /// The file <c>fleex - words.txt</c> is read from that folder; its lines are used to flag words in
        /// the final list. If the console input stream ends before a number was entered, the method prints
        /// a message and returns without writing anything.
        /// </remarks>
        private static void PostProcessFrequencyDictionary()
        {
            Console.WriteLine("Which minimum frequency should be used for building dictionaries?");

            // Keep asking until a valid integer is entered; a typo should not crash the tool.
            int minFrequency;
            while (true)
            {
                var input = Console.ReadLine();
                if (input == null)
                {
                    // ReadLine returns null when no more input is available, so re-prompting would loop forever.
                    Console.WriteLine("No minimum frequency was provided, aborting.");
                    return;
                }

                if (int.TryParse(input, out minFrequency))
                {
                    break;
                }

                Console.WriteLine("'" + input + "' is not a valid whole number, please try again.");
            }

            Console.WriteLine("Started writing frequency list");

            var result             = new FrequencyResults();
            var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
            var frequencyFilePath  = frequencyDirectory + "/frequencies.txt";

            result.LoadFrequencyDictionary(frequencyFilePath, minFrequency);

            // Step 1: group every (word, first-in-sentence) entry under its lower-cased form.
            var groupedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();

            foreach (var wordFrequency in result.WordFrequencies)
            {
                var lcToken = wordFrequency.Key.Word.ToLowerInvariant();

                // Single lookup: fetch the existing bucket or create it on first sight of this key.
                List<WordOccurrenceAndFrequency> bucket;
                if (!groupedTokens.TryGetValue(lcToken, out bucket))
                {
                    bucket = new List<WordOccurrenceAndFrequency>();
                    groupedTokens[lcToken] = bucket;
                }

                bucket.Add(new WordOccurrenceAndFrequency()
                {
                    Word = wordFrequency.Key.Word,
                    IsFirstTokenInSentence = wordFrequency.Key.IsFirstTokenInSentence,
                    Frequency = wordFrequency.Value
                });
            }

            // Step 2: inside each group, merge the sentence-initial entries into the other entries.
            var mergedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();

            foreach (var grp in groupedTokens)
            {
                // Start from the entries that were NOT seen at the beginning of a sentence.
                var list = grp.Value.Where(v => !v.IsFirstTokenInSentence).ToList();

                // Merge tokens which are first in sentence with others
                foreach (var token in grp.Value.Where(wf => wf.IsFirstTokenInSentence))
                {
                    // Candidates are non-initial entries spelled exactly like this token, or like this
                    // token with its first letter lower-cased.
                    var otherTokens = list.Where(v => v.Word == token.Word || v.Word == Utilities.LowerCaseFirstLetter(token.Word)).ToList();
                    if (otherTokens.Count > 0)
                    {
                        // The total is taken before any candidate is modified, so every share below is
                        // computed against the same denominator.
                        // NOTE(review): the type of Frequency is not visible here. If it is an integer
                        // type the division truncates, and a total of zero would throw - confirm that
                        // loaded frequencies are always positive.
                        var totalGrpFreq = otherTokens.Sum(t => t.Frequency);
                        foreach (var otherToken in otherTokens)
                        {
                            // Each candidate receives a part of the sentence-initial count proportional
                            // to its own count.
                            otherToken.Frequency += (token.Frequency * otherToken.Frequency) / totalGrpFreq;
                        }
                    }
                    else
                    {
                        // Nothing to merge into: keep the sentence-initial entry as it is.
                        list.Add(token);
                    }
                }

                mergedTokens.Add(grp.Key, list);
            }

            // Post processed frequencies for debug: one line per lower-case group, entries joined with the
            // second-level separator, fields (word, first-in-sentence flag, frequency) with the CSV separator.
            var postProcessedFrequencyFilePath = frequencyDirectory + "/post-processed-frequencies.txt";
            var ppLines = mergedTokens
                          .Select(ent => string.Join(Utilities.Csv2ndLevelSeparator,
                                                     ent.Value.Select(wf => string.Format("{0}{3}{1}{3}{2}", wf.Word, wf.IsFirstTokenInSentence, wf.Frequency, Utilities.CsvSeparator))));

            File.WriteAllLines(postProcessedFrequencyFilePath, ppLines);

            // Load fleex words (one word per line) into a set for constant-time membership tests.
            var fleexWordsFile = frequencyDirectory + "/fleex - words.txt";
            var fleexWords     = new HashSet<string>(File.ReadAllLines(fleexWordsFile));

            // Final frequency list, highest frequency first. Columns: word, frequency,
            // 1 if the first character is upper case, 1 if any character is upper case,
            // 1 if the word is contained in the fleex word list (otherwise 0 each).
            var frequencyListPath = frequencyDirectory + "/frequency-list.txt";
            var flLines           = mergedTokens
                                    .SelectMany(ent => ent.Value)
                                    .OrderByDescending(wf => wf.Frequency)
                                    .Select(wf => string.Format("{0},{1},{2},{3},{4}", wf.Word, wf.Frequency,
                                                                // Length check: an empty word has no first character to inspect.
                                                                wf.Word.Length > 0 && char.IsUpper(wf.Word[0]) ? 1 : 0,
                                                                wf.Word.Any(char.IsUpper) ? 1 : 0,
                                                                fleexWords.Contains(wf.Word) ? 1 : 0));

            File.WriteAllLines(frequencyListPath, flLines);

            Console.WriteLine("Finished writing frequency list");
        }