/// <summary>
/// Interactively builds the word-frequency dictionary and saves it in the
/// "frequencies" directory located under <c>Utilities.PathToDownloadDirectory</c>.
/// </summary>
/// <remarks>
/// The user is asked how many sentences to parse and, when a sentence count from a
/// previous run is found on disk, whether parsing should be resumed from it.
/// Three files are written: frequencies.txt, excluded-frequencies.txt and
/// nbOfSentencesParsed.txt. If the number of sentences typed by the user is not a
/// valid integer, a message is printed and nothing is parsed or written.
/// </remarks>
private static void BuildFrequencyDictionary()
{
    var result = new FrequencyResults();

    Console.WriteLine("How many sentences do you want to parse?");
    int nbOfSentencesToParse;
    // TryParse returns false for malformed input and for null (end of input),
    // so bad input aborts cleanly instead of crashing with an exception.
    if (!int.TryParse(Console.ReadLine(), out nbOfSentencesToParse))
    {
        Console.WriteLine("Invalid number of sentences; aborting.");
        return;
    }

    var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
    // CreateDirectory is a no-op when the directory already exists.
    Directory.CreateDirectory(frequencyDirectory);

    var frequencyFilePath = frequencyDirectory + "/frequencies.txt";
    var excludedFrequencyFilePath = frequencyDirectory + "/excluded-frequencies.txt";
    var nbOfSentencesParsedFilePath = frequencyDirectory + "/nbOfSentencesParsed.txt";

    // Offer to resume only when a readable sentence count was left by a previous run
    var nbOfAlreadyParsedSentences = 0;
    var parsingResumed = false;
    int nbOfSentencesParsed;
    if (File.Exists(nbOfSentencesParsedFilePath)
        && int.TryParse(File.ReadAllText(nbOfSentencesParsedFilePath), out nbOfSentencesParsed))
    {
        Console.WriteLine("{0} sentences have already been parsed. Resume parsing? (y/n)", nbOfSentencesParsed);
        if (string.Equals(Console.ReadLine(), "Y", StringComparison.InvariantCultureIgnoreCase))
        {
            nbOfAlreadyParsedSentences = nbOfSentencesParsed;
            parsingResumed = true;
        }
    }

    var sw = Stopwatch.StartNew();
    Console.WriteLine("Building of frequency dictionary started");

    // Tokenize the sentences and compute the frequencies.
    // The sentence counter supplied to the callback is not needed here.
    Func<string[], int, bool> extractTokens = (tokens, sentenceCounter) =>
    {
        for (var i = 0; i < tokens.Length; i++)
        {
            result.AddOccurence(new WordOccurrence()
            {
                IsFirstTokenInSentence = i == 0,
                Word = tokens[i]
            });
        }
        return true;
    };
    Utilities.ExtractTokensFromTxtFiles(extractTokens, nbOfSentencesToParse, nbOfAlreadyParsedSentences);

    // Load previous frequency dictionaries that were already computed.
    // NOTE(review): this happens after the new occurrences were added; it presumably
    // relies on LoadFrequencyDictionary merging into existing counts - confirm.
    if (parsingResumed)
    {
        result.LoadFrequencyDictionary(frequencyFilePath);
        result.LoadFrequencyDictionary(excludedFrequencyFilePath);
    }

    // Save frequency files on disk
    result.SaveFrequencyDictionary(frequencyFilePath);
    result.SaveExcludedFrequencyDictionary(excludedFrequencyFilePath);

    // Save the nb of sentences parsed (for information and being able to relaunch the parsing at this point).
    // NOTE(review): the value written is nbOfSentencesToParse; this is only the right resume
    // point if ExtractTokensFromTxtFiles treats it as an absolute sentence index - confirm.
    File.WriteAllText(nbOfSentencesParsedFilePath, nbOfSentencesToParse.ToString());

    Console.WriteLine("Building of frequency dictionary done");
    Console.WriteLine("=====================================");

    sw.Stop();
    Console.WriteLine("Ellapsed time: {0}", sw.Elapsed.ToString("g"));
}
/// <summary>
/// Interactively post-processes the saved frequency dictionary into a final
/// frequency list.
/// </summary>
/// <remarks>
/// Steps:
/// 1. Ask for a minimum frequency and load frequencies.txt with it.
/// 2. Group the entries by lower-cased word.
/// 3. Within each group, fold every "first token in sentence" entry into the
///    matching non-first entries (same word, or same word with a lower-cased first
///    letter), distributing its frequency proportionally to their frequencies.
///    First-token entries without a match are kept as they are.
/// 4. Write post-processed-frequencies.txt (one line per group, for debugging) and
///    frequency-list.txt (one line per entry, ordered by descending frequency).
/// If the minimum frequency typed by the user is not a valid integer, a message is
/// printed and nothing is loaded or written. The "fleex - words.txt" file must
/// exist in the frequencies directory; File.ReadAllLines throws otherwise.
/// </remarks>
private static void PostProcessFrequencyDictionary()
{
    Console.WriteLine("Which minimum frequency should be used for building dictionaries?");
    int minFrequency;
    // TryParse returns false for malformed input and for null (end of input),
    // so bad input aborts cleanly instead of crashing with an exception.
    if (!int.TryParse(Console.ReadLine(), out minFrequency))
    {
        Console.WriteLine("Invalid minimum frequency; aborting.");
        return;
    }

    Console.WriteLine("Started writing frequency list");

    var result = new FrequencyResults();
    var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
    var frequencyFilePath = frequencyDirectory + "/frequencies.txt";
    result.LoadFrequencyDictionary(frequencyFilePath, minFrequency);

    // Group all casing / position variants of a word under its lower-cased form
    var groupedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();
    foreach (var wordFrequency in result.WordFrequencies)
    {
        var lcToken = wordFrequency.Key.Word.ToLowerInvariant();

        List<WordOccurrenceAndFrequency> variants;
        if (!groupedTokens.TryGetValue(lcToken, out variants))
        {
            variants = new List<WordOccurrenceAndFrequency>();
            groupedTokens[lcToken] = variants;
        }

        variants.Add(new WordOccurrenceAndFrequency()
        {
            Word = wordFrequency.Key.Word,
            IsFirstTokenInSentence = wordFrequency.Key.IsFirstTokenInSentence,
            Frequency = wordFrequency.Value
        });
    }

    var mergedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();
    foreach (var grp in groupedTokens)
    {
        var list = grp.Value.Where(v => !v.IsFirstTokenInSentence).ToList();

        // Merge tokens which are first in sentence with others
        foreach (var token in grp.Value.Where(wf => wf.IsFirstTokenInSentence))
        {
            // Find the entries this first-in-sentence token could stand for: the same
            // spelling, or the spelling with its first letter lower-cased.
            var otherTokens = list
                .Where(v => v.Word == token.Word || v.Word == Utilities.LowerCaseFirstLetter(token.Word))
                .ToList();

            if (otherTokens.Any())
            {
                // The total is computed once, before any entry is updated, so each
                // share is based on the pre-merge frequencies.
                var totalGrpFreq = otherTokens.Sum(t => t.Frequency);
                foreach (var otherToken in otherTokens)
                {
                    // NOTE(review): if Frequency is a 32-bit integer the product may
                    // overflow for very frequent words, and the division truncates - confirm.
                    otherToken.Frequency += (token.Frequency * otherToken.Frequency) / totalGrpFreq;
                }
            }
            else
            {
                // Nothing to merge with: keep the first-in-sentence entry itself
                list.Add(token);
            }
        }

        mergedTokens.Add(grp.Key, list);
    }

    // Post processed frequencies for debug: one line per group, entries joined with the
    // second-level separator, fields (word, first-in-sentence flag, frequency) with the CSV separator
    var postProcessedFrequencyFilePath = frequencyDirectory + "/post-processed-frequencies.txt";
    var ppLines = mergedTokens
        .Select(ent => string.Join(
            Utilities.Csv2ndLevelSeparator,
            ent.Value.Select(wf => string.Format("{0}{3}{1}{3}{2}",
                wf.Word, wf.IsFirstTokenInSentence, wf.Frequency, Utilities.CsvSeparator))));
    File.WriteAllLines(postProcessedFrequencyFilePath, ppLines);

    // Load fleex words
    var fleexWordsFile = frequencyDirectory + "/fleex - words.txt";
    var fleexWords = new HashSet<string>(File.ReadAllLines(fleexWordsFile));

    // Final frequency list: word, frequency, starts-with-upper-case flag,
    // contains-upper-case flag, is-a-fleex-word flag
    var frequencyListPath = frequencyDirectory + "/frequency-list.txt";
    var flLines = mergedTokens
        .SelectMany(ent => ent.Value)
        .OrderByDescending(wf => wf.Frequency)
        .Select(wf => string.Format("{0},{1},{2},{3},{4}",
            wf.Word,
            wf.Frequency,
            // Length check keeps an empty word from throwing on the [0] access
            wf.Word.Length > 0 && char.IsUpper(wf.Word[0]) ? 1 : 0,
            wf.Word.Any(char.IsUpper) ? 1 : 0,
            fleexWords.Contains(wf.Word) ? 1 : 0));
    File.WriteAllLines(frequencyListPath, flLines);

    Console.WriteLine("Finished writing frequency list");
}