/// <summary>
/// Asks for a minimum frequency on the console, loads the raw frequency dictionary from
/// "frequencies/frequencies.txt" under the download directory, merges sentence-initial
/// occurrences of a word into its other occurrences, and writes two files next to the input:
/// "post-processed-frequencies.txt" (one line per lower-cased word group, for debugging) and
/// "frequency-list.txt" (all merged entries ordered by descending frequency).
/// </summary>
private static void PostProcessFrequencyDictionary()
{
    int minFrequency;
    if (!TryReadMinimumFrequency(out minFrequency))
    {
        // Console input ended before a number was entered; nothing sensible can be built.
        Console.WriteLine("No minimum frequency was provided; aborting.");
        return;
    }

    Console.WriteLine("Started writing frequency list");

    var result = new FrequencyResults();
    var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
    var frequencyFilePath = frequencyDirectory + "/frequencies.txt";
    result.LoadFrequencyDictionary(frequencyFilePath, minFrequency);

    // Group every loaded entry under the invariant lower-case form of its word, so that all
    // casing variants (and both sentence-position variants) of a word end up in the same list.
    var groupedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();
    foreach (var wordFrequency in result.WordFrequencies)
    {
        var lcToken = wordFrequency.Key.Word.ToLowerInvariant();

        List<WordOccurrenceAndFrequency> variants;
        if (!groupedTokens.TryGetValue(lcToken, out variants))
        {
            variants = new List<WordOccurrenceAndFrequency>();
            groupedTokens[lcToken] = variants;
        }

        variants.Add(new WordOccurrenceAndFrequency
        {
            Word = wordFrequency.Key.Word,
            IsFirstTokenInSentence = wordFrequency.Key.IsFirstTokenInSentence,
            Frequency = wordFrequency.Value
        });
    }

    var mergedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();
    foreach (var grp in groupedTokens)
    {
        // Start from the entries that were NOT the first token of a sentence.
        var list = grp.Value.Where(v => !v.IsFirstTokenInSentence).ToList();

        // Merge tokens which are first in sentence with others
        foreach (var token in grp.Value.Where(wf => wf.IsFirstTokenInSentence))
        {
            // Candidates are entries already in the list whose word equals this token's word,
            // or equals the result of Utilities.LowerCaseFirstLetter on it (presumably the word
            // with its first letter lower-cased - confirm against that helper).
            var otherTokens = list
                .Where(v => v.Word == token.Word || v.Word == Utilities.LowerCaseFirstLetter(token.Word))
                .ToList();

            if (otherTokens.Any())
            {
                // Distribute the sentence-initial frequency over the candidates, each receiving
                // a share proportional to its own frequency. The total is computed before any
                // candidate is modified.
                // NOTE(review): if Frequency is an integral type this division truncates, so a
                // small remainder may be dropped - confirm that this is acceptable.
                var totalGrpFreq = otherTokens.Sum(t => t.Frequency);
                foreach (var otherToken in otherTokens)
                {
                    otherToken.Frequency += (token.Frequency * otherToken.Frequency) / totalGrpFreq;
                }
            }
            else
            {
                // No counterpart to merge into: keep the sentence-initial entry as is.
                list.Add(token);
            }
        }

        mergedTokens.Add(grp.Key, list);
    }

    // The output files are machine-readable, so format values independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Post processed frequencies for debug
    var postProcessedFrequencyFilePath = frequencyDirectory + "/post-processed-frequencies.txt";
    var ppLines = mergedTokens
        .Select(ent => string.Join(
            Utilities.Csv2ndLevelSeparator,
            ent.Value.Select(wf => string.Format(
                invariant,
                "{0}{3}{1}{3}{2}",
                wf.Word,
                wf.IsFirstTokenInSentence,
                wf.Frequency,
                Utilities.CsvSeparator))));
    File.WriteAllLines(postProcessedFrequencyFilePath, ppLines);

    // Load fleex words
    var fleexWordsFile = frequencyDirectory + "/fleex - words.txt";
    var fleexWords = new HashSet<string>(File.ReadAllLines(fleexWordsFile));

    // Final frequency list. Columns: word, frequency, 1 if the first character is upper-case,
    // 1 if any character is upper-case, 1 if the word appears in the fleex words file (else 0).
    var frequencyListPath = frequencyDirectory + "/frequency-list.txt";
    var flLines = mergedTokens
        .SelectMany(ent => ent.Value)
        .OrderByDescending(wf => wf.Frequency)
        .Select(wf => string.Format(
            invariant,
            "{0},{1},{2},{3},{4}",
            wf.Word,
            wf.Frequency,
            // Length guard: indexing [0] on an empty word would throw.
            (wf.Word.Length > 0 && char.IsUpper(wf.Word[0])) ? 1 : 0,
            wf.Word.Any(char.IsUpper) ? 1 : 0,
            fleexWords.Contains(wf.Word) ? 1 : 0));
    File.WriteAllLines(frequencyListPath, flLines);

    Console.WriteLine("Finished writing frequency list");
}

/// <summary>
/// Prompts on the console for the minimum frequency and keeps asking until the entered
/// line parses as an integer.
/// </summary>
/// <param name="minFrequency">Receives the parsed value; set to 0 when the method returns false.</param>
/// <returns>
/// true when an integer was read; false when console input ended (ReadLine returned null)
/// before a valid value was entered.
/// </returns>
private static bool TryReadMinimumFrequency(out int minFrequency)
{
    Console.WriteLine("Which minimum frequency should be used for building dictionaries?");
    while (true)
    {
        var input = Console.ReadLine();
        if (input == null)
        {
            minFrequency = 0;
            return false;
        }

        if (int.TryParse(input, out minFrequency))
        {
            return true;
        }

        Console.WriteLine("Please enter a whole number.");
    }
}