Exemplo n.º 1
0
        /// <summary>
        /// Interactively builds the word frequency dictionary: asks how many sentences to parse,
        /// optionally resumes from a previous run, feeds every token returned by
        /// <c>Utilities.ExtractTokensFromTxtFiles</c> into a <c>FrequencyResults</c> instance and
        /// saves the resulting frequency files to disk.
        /// </summary>
        /// <remarks>
        /// All files live in the "frequencies" directory located under
        /// <c>Utilities.PathToDownloadDirectory</c>: "frequencies.txt", "excluded-frequencies.txt"
        /// and "nbOfSentencesParsed.txt" (the counter used to offer resuming on the next run).
        /// </remarks>
        private static void BuildFrequencyDictionary()
        {
            var result = new FrequencyResults();

            Console.WriteLine("How many sentences do you want to parse?");

            // Keep asking until the answer is a valid integer instead of crashing on a typo
            int nbOfSentencesToParse;
            var answer = Console.ReadLine();
            while (!int.TryParse(answer, out nbOfSentencesToParse))
            {
                if (answer == null)
                {
                    // Standard input is closed: prompting again would loop forever, so give up
                    Console.WriteLine("No input available, aborting");
                    return;
                }
                Console.WriteLine("'{0}' is not a valid integer, please try again", answer);
                answer = Console.ReadLine();
            }

            var nbOfAlreadyParsedSentences = 0;
            var frequencyDirectory         = Utilities.PathToDownloadDirectory + "frequencies";

            // CreateDirectory does nothing when the directory already exists, so no Exists() check is needed
            Directory.CreateDirectory(frequencyDirectory);

            var frequencyFilePath           = frequencyDirectory + "/frequencies.txt";
            var excludedFrequencyFilePath   = frequencyDirectory + "/excluded-frequencies.txt";
            var nbOfSentencesParsedFilePath = frequencyDirectory + "/nbOfSentencesParsed.txt";
            var parsingResumed = false;

            // Offer to resume only when the counter file exists and holds a readable integer
            if (File.Exists(nbOfSentencesParsedFilePath))
            {
                int nbOfSentencesParsed;
                if (int.TryParse(File.ReadAllText(nbOfSentencesParsedFilePath), out nbOfSentencesParsed))
                {
                    Console.WriteLine("{0} sentences have already been parsed. Resume parsing? (y/n)", nbOfSentencesParsed);
                    var resumeParsing = string.Equals(Console.ReadLine(), "Y", StringComparison.InvariantCultureIgnoreCase);
                    if (resumeParsing)
                    {
                        nbOfAlreadyParsedSentences = nbOfSentencesParsed;
                        parsingResumed             = true;
                    }
                }
            }

            var sw = Stopwatch.StartNew();

            Console.WriteLine("Building of frequency dictionary started");

            // Tokenize the sentences and compute the frequencies.
            // The sentence counter is part of the callback signature but is not needed here.
            Func<string[], int, bool> extractTokens = (tokens, sentenceCounter) =>
            {
                for (var i = 0; i < tokens.Length; i++)
                {
                    var wordOccurence = new WordOccurrence()
                    {
                        // Position matters: tokens opening a sentence are tracked separately
                        IsFirstTokenInSentence = i == 0,
                        Word = tokens[i]
                    };
                    result.AddOccurence(wordOccurence);
                }
                return true;
            };

            Utilities.ExtractTokensFromTxtFiles(extractTokens, nbOfSentencesToParse, nbOfAlreadyParsedSentences);

            // Load previous frequency dictionaries that were already computed.
            // NOTE(review): presumably LoadFrequencyDictionary adds to the counts gathered above
            // rather than replacing them - confirm against FrequencyResults.
            if (parsingResumed)
            {
                result.LoadFrequencyDictionary(frequencyFilePath);
                result.LoadFrequencyDictionary(excludedFrequencyFilePath);
            }

            // Save frequency files on disk
            result.SaveFrequencyDictionary(frequencyFilePath);
            result.SaveExcludedFrequencyDictionary(excludedFrequencyFilePath);

            // Save the nb of sentences parsed (for information and being able to relaunch the parsing at this point).
            // NOTE(review): the requested count is stored as-is; this assumes ExtractTokensFromTxtFiles treats it
            // as an absolute target that includes the already parsed sentences - confirm.
            File.WriteAllText(nbOfSentencesParsedFilePath, nbOfSentencesToParse.ToString());

            Console.WriteLine("Building of frequency dictionary done");
            Console.WriteLine("=====================================");

            sw.Stop();
            Console.WriteLine("Ellapsed time: {0}", sw.Elapsed.ToString("g"));
        }
Exemplo n.º 2
0
        /// <summary>
        /// Interactively post-processes the saved frequency dictionary: asks for a minimum frequency,
        /// loads "frequencies.txt", groups the entries by lower-cased word, redistributes the counts of
        /// sentence-initial tokens onto their matching non-initial variants, and writes two files:
        /// "post-processed-frequencies.txt" (for debugging) and "frequency-list.txt" (final list,
        /// ordered by descending frequency).
        /// </summary>
        /// <remarks>
        /// Each line of "frequency-list.txt" is
        /// <c>word,frequency,startsWithUpper,containsUpper,isFleexWord</c> where the last three
        /// fields are 0/1 flags. The file "fleex - words.txt" must exist in the frequencies directory.
        /// </remarks>
        private static void PostProcessFrequencyDictionary()
        {
            Console.WriteLine("Which minimum frequency should be used for building dictionaries?");

            // Keep asking until the answer is a valid integer instead of crashing on a typo
            int minFrequency;
            var answer = Console.ReadLine();
            while (!int.TryParse(answer, out minFrequency))
            {
                if (answer == null)
                {
                    // Standard input is closed: prompting again would loop forever, so give up
                    Console.WriteLine("No input available, aborting");
                    return;
                }
                Console.WriteLine("'{0}' is not a valid integer, please try again", answer);
                answer = Console.ReadLine();
            }

            Console.WriteLine("Started writing frequency list");

            var result             = new FrequencyResults();
            var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
            var frequencyFilePath  = frequencyDirectory + "/frequencies.txt";

            result.LoadFrequencyDictionary(frequencyFilePath, minFrequency);

            // Group every (word, position) entry under the lower-cased form of its word
            var groupedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();

            foreach (var wordFrequency in result.WordFrequencies)
            {
                var lcToken = wordFrequency.Key.Word.ToLowerInvariant();

                // Single lookup: fetch the existing bucket or create it on first sight of this key
                List<WordOccurrenceAndFrequency> occurrences;
                if (!groupedTokens.TryGetValue(lcToken, out occurrences))
                {
                    occurrences = new List<WordOccurrenceAndFrequency>();
                    groupedTokens[lcToken] = occurrences;
                }

                occurrences.Add(new WordOccurrenceAndFrequency()
                {
                    Word = wordFrequency.Key.Word,
                    IsFirstTokenInSentence = wordFrequency.Key.IsFirstTokenInSentence,
                    Frequency = wordFrequency.Value
                });
            }

            var mergedTokens = new Dictionary<string, List<WordOccurrenceAndFrequency>>();

            foreach (var grp in groupedTokens)
            {
                // Start from the tokens seen inside sentences; sentence-initial ones are folded in below
                var list = grp.Value.Where(v => !v.IsFirstTokenInSentence).ToList();

                // Merge tokens which are first in sentence with others
                foreach (var token in grp.Value.Where(wf => wf.IsFirstTokenInSentence))
                {
                    // Find other tokens with the same spelling, or the same spelling once the first letter
                    // is lower-cased, and increase their frequency proportionally to their current share
                    var otherTokens = list.Where(v => v.Word == token.Word || v.Word == Utilities.LowerCaseFirstLetter(token.Word)).ToList();
                    if (otherTokens.Count > 0)
                    {
                        // The total is computed before any frequency is updated, so every share uses the old values.
                        // NOTE(review): if Frequency is a 32-bit int the product below can overflow for very
                        // frequent words, and the division truncates - confirm the type and widen if needed.
                        var totalGrpFreq = otherTokens.Sum(t => t.Frequency);
                        foreach (var otherToken in otherTokens)
                        {
                            otherToken.Frequency += (token.Frequency * otherToken.Frequency) / totalGrpFreq;
                        }
                    }
                    else
                    {
                        // No matching variant: keep the sentence-initial token as its own entry
                        list.Add(token);
                    }
                }

                mergedTokens.Add(grp.Key, list);
            }

            // Post processed frequencies for debug: one line per group, entries joined by the 2nd level separator
            var postProcessedFrequencyFilePath = frequencyDirectory + "/post-processed-frequencies.txt";
            var ppLines = mergedTokens
                          .Select(ent => string.Join(Utilities.Csv2ndLevelSeparator,
                                                     ent.Value.Select(wf => string.Format("{0}{3}{1}{3}{2}", wf.Word, wf.IsFirstTokenInSentence, wf.Frequency, Utilities.CsvSeparator))));

            File.WriteAllLines(postProcessedFrequencyFilePath, ppLines);

            // Load fleex words (one word per line) into a set for fast membership checks
            var fleexWordsFile = frequencyDirectory + "/fleex - words.txt";
            var fleexWords     = new HashSet<string>(File.ReadAllLines(fleexWordsFile));

            // Final frequency list
            var frequencyListPath = frequencyDirectory + "/frequency-list.txt";
            var flLines           = mergedTokens
                                    .SelectMany(ent => ent.Value)
                                    .OrderByDescending(wf => wf.Frequency)
                                    .Select(wf => string.Format("{0},{1},{2},{3},{4}", wf.Word, wf.Frequency,
                                                                // Length check avoids an IndexOutOfRangeException on an empty token
                                                                (wf.Word.Length > 0 && char.IsUpper(wf.Word[0])) ? 1 : 0,
                                                                wf.Word.Any(char.IsUpper) ? 1 : 0,
                                                                fleexWords.Contains(wf.Word) ? 1 : 0));

            File.WriteAllLines(frequencyListPath, flLines);

            Console.WriteLine("Finished writing frequency list");
        }