/// <summary>
        /// Post-processes the n-gram frequencies previously saved on disk: loads the
        /// word and n-gram frequency files for the current <c>N</c>, then writes the
        /// collocation PMI file (presumably pointwise mutual information - confirm
        /// against <c>NGramFrequenciesResults.SaveCollocationPMIs</c>) next to them.
        /// Progress and elapsed time are reported on the console.
        /// </summary>
        public void ComputePmis()
        {
            var timer = Stopwatch.StartNew();
            Console.WriteLine("Start post processing ngrams frequencies");

            var frequencies = new NGramFrequenciesResults(N);

            // Every file for a given N lives in "<download dir>/ngrams/<N>-gram".
            var ngramDirectory = Utilities.PathToDownloadDirectory + "/ngrams" + string.Format("/{0}-gram", N);
            if (!Directory.Exists(ngramDirectory))
            {
                Directory.CreateDirectory(ngramDirectory);
            }

            // Read back the frequencies computed by an earlier run.
            frequencies.LoadResults(
                ngramDirectory + "/word-frequencies.txt",
                ngramDirectory + "/ngrams-frequencies.txt");

            // Write the PMI output, passing FrequencyFilter through to the writer.
            frequencies.SaveCollocationPMIs(ngramDirectory + "/ngrams-pmis.txt", FrequencyFilter);

            Console.WriteLine("Done post processing ngrams frequencies");
            timer.Stop();
            Console.WriteLine("Executed in {0}", timer.Elapsed.ToString("g"));
        }
// Example #2
// 0
        // Methods -------------

        public void ComputeNgramsFrequencies()
        {
            var result = new NGramFrequenciesResults(N);

            var nbOfAlreadyParsedSentences = 0;
            var ngramsDirectory            = this.PathToDownloadDirectory + "ngrams";

            if (!Directory.Exists(ngramsDirectory))
            {
                Directory.CreateDirectory(ngramsDirectory);
            }

            var ngramDirectory = ngramsDirectory + string.Format("/{0}-gram", N);

            if (!Directory.Exists(ngramDirectory))
            {
                Directory.CreateDirectory(ngramDirectory);
            }

            var wordFrequencyFilePath       = ngramDirectory + "/word-frequencies.txt";
            var ngramFreqFilePath           = ngramDirectory + "/ngrams-frequencies.txt";
            var nbOfSentencesParsedFilePath = ngramDirectory + "/nbOfSentencesParsed.txt";
            var parsingResumed = false;

            if (File.Exists(nbOfSentencesParsedFilePath))
            {
                int nbOfSentencesParsed;
                if (int.TryParse(File.ReadAllText(nbOfSentencesParsedFilePath), out nbOfSentencesParsed))
                {
                    Console.WriteLine("{0} sentences have already been parsed. Resume parsing? (y/n)", nbOfSentencesParsed);
                    var resumeParsing = string.Equals(Console.ReadLine(), "Y", StringComparison.InvariantCultureIgnoreCase);
                    if (resumeParsing)
                    {
                        nbOfAlreadyParsedSentences = nbOfSentencesParsed;
                        parsingResumed             = true;
                    }
                }
            }

            // Final frequency list
            Console.WriteLine("Load frequency list");
            var frequencyDirectory = PathToDownloadDirectory + "frequencies";
            var frequencyListPath  = frequencyDirectory + "/frequency-list - 150m.txt";
            var freqDic            = new Dictionary <string, long>();

            using (var reader = new StreamReader(File.OpenRead(frequencyListPath)))
            {
                var line = reader.ReadLine();
                while (line != null)
                {
                    var parts = line.Split(Utilities.CsvSeparator);
                    if (parts.Length == 2)
                    {
                        freqDic.Add(string.Intern(parts[0]), long.Parse(parts[1]));
                    }

                    line = reader.ReadLine();
                }
            }

            var sw = Stopwatch.StartNew();

            Console.WriteLine("Start computing {0}-grams frequencies", N);

            // Tokenize the sentences and compute the frequencies
            Func <string[], int, bool> extractTokens = (tokens, sentenceCounter) =>
            {
                if (sentenceCounter % FlushNbOfSentences == 0)
                {
                    var nbOfFlushedNGrams = result.FlushNgramsWithFrequencyBelow(FlushMinFrequency);
                    Console.WriteLine("Flushed {0} ngrams with frequency below {1}", nbOfFlushedNGrams, FlushMinFrequency);
                }

                // Lowercase the first token if necessary
                if (tokens.Length > 0 && !string.IsNullOrEmpty(tokens[0]) && char.IsLetter(tokens[0][0]))
                {
                    long freq;
                    long lcFreq;
                    var  lcToken = string.Intern(Utilities.LowerCaseFirstLetter(tokens[0]));
                    if (freqDic.TryGetValue(tokens[0], out freq) && freqDic.TryGetValue(lcToken, out lcFreq) && lcFreq > freq)
                    {
                        tokens[0] = lcToken;
                    }
                }
                result.AddBigrams(tokens);
                return(true);
            };

            Utilities.ExtractTokensFromTxtFiles(extractTokens, NbOfSentencesToParse, nbOfAlreadyParsedSentences);

            // Final flushing
            Console.WriteLine("Flushed {0} ngrams with frequency below {1}", result.FlushNgramsWithFrequencyBelow(FlushMinFrequency), FlushMinFrequency);

            // Load previous frequency dictionaries that were already computed
            Console.WriteLine("Loading previous results");
            if (parsingResumed)
            {
                result.LoadResults(wordFrequencyFilePath, ngramFreqFilePath);
            }

            // Save results on disk for later
            Console.WriteLine("Saving results on disk");
            result.SaveResults(wordFrequencyFilePath, ngramFreqFilePath);

            // Save the nb of sentences parsed (for information and being able to relaunch the parsing at this point)
            File.WriteAllText(nbOfSentencesParsedFilePath, NbOfSentencesToParse.ToString());

            Console.WriteLine("Finished computing {0}-grams frequencies", N);
            Console.WriteLine("=====================================");

            sw.Stop();
            Console.WriteLine("Ellapsed time: {0}", sw.Elapsed.ToString("g"));
        }