Exemple #1
0
        /// <summary>
        /// Feeds every file returned by <c>bp.EnumerateFilesFromInputFolder()</c> into
        /// <c>BuildTFMatrix_UsePortion</c>, calculates the ordered term probabilities and writes
        /// every term with a non-zero probability to one tab-separated file in <c>bp.OutputDirectory</c>.
        /// Progress messages are printed to the console along the way.
        /// </summary>
        /// <param name="bp">
        /// Build parameters. This method reads <c>InputDirectory</c>, <c>OutputDirectory</c>,
        /// <c>Ngrams</c> and <c>CutPercent</c>, and passes the whole value on to <c>BuildTFMatrix_UsePortion</c>.
        /// </param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity value handed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">Thrown after the file loop when the enumeration yielded no file.</exception>
        /// <remarks>
        /// NOTE(review): the probability cut uses <c>Config.Inst.CUT_PERCENT</c> while the output file name
        /// and header line are built from <c>bp.CutPercent</c> - confirm that both always carry the same value.
        /// </remarks>
        public static void Build_UsePortion(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            // Announce the run and create the term-frequency accumulator.
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var termProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);
            #endregion

            #region [.-1-.]
            // Process the input files one by one; the first file is remembered because
            // its name and extension are reused for the output file.
            FileInfo firstFile  = null;
            var      fileNumber = 0;
            foreach (var inputFile in bp.EnumerateFilesFromInputFolder())
            {
                if (firstFile is null)
                {
                    firstFile = inputFile;
                }

                fileNumber++;
                Console.WriteLine($"{fileNumber}). start process file: '{inputFile.Name}' [{inputFile.DisplaySize()}]...");

                BuildTFMatrix_UsePortion(bp, inputFile, termProcessor);

                Console.WriteLine($"end process file{Environment.NewLine}");
            }

            if (firstFile is null)
            {
                throw new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'");
            }
            #endregion

            #region [.-2-.]
            // Calculate the probabilities, then drop the reference to the processor
            // and request a collection before the result is written.
            Console.Write("start calc probability...");
            var probabilities = termProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
            termProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            if (!Directory.Exists(bp.OutputDirectory))
            {
                Directory.CreateDirectory(bp.OutputDirectory);
            }

            // Probabilities are always written with '.' as the decimal separator.
            var numberFormat = new NumberFormatInfo { NumberDecimalSeparator = "." };

            // Output name: "<first file name without extension>-(<ngrams>-cut_<percent>%)<first file extension>".
            var baseName       = Path.GetFileNameWithoutExtension(firstFile.Name);
            var outputFileName = $"{baseName}-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){firstFile.Extension}";
            var outputFile     = Path.Combine(bp.OutputDirectory, outputFileName);

            using (var writer = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // Header line, then one "<term>\t<probability>" line per entry.
                writer.WriteLine($"#\t'{firstFile.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var entry in probabilities)
                {
                    if (entry.Probability == 0)
                    {
                        continue; // zero-probability entries are not written
                    }

                    writer.Write(entry.Term);
                    writer.Write('\t');
                    writer.WriteLine(entry.Probability.ToString(numberFormat));
                }
            }

            // Remove the "temp" folder under the input directory when it exists
            // and holds no files directly inside it.
            var tempOutputFolder = Path.Combine(bp.InputDirectory, "temp");
            if (Directory.Exists(tempOutputFolder))
            {
                var hasTopLevelFiles = Directory.EnumerateFiles(tempOutputFolder, "*", SearchOption.TopDirectoryOnly).Any();
                if (!hasTopLevelFiles)
                {
                    Directory.Delete(tempOutputFolder, true);
                }
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }
        /// <summary>
        /// Reads every file returned by <c>bp.EnumerateFilesFromInputFolder()</c> line by line, runs each
        /// line through an <c>mld_tokenizer</c> and adds the accepted words to a <c>TFProcessor</c>; then
        /// calculates the ordered term probabilities and writes every term with a non-zero probability
        /// to one tab-separated file in <c>bp.OutputDirectory</c>. Progress is printed to the console.
        /// </summary>
        /// <param name="bp">
        /// Build parameters. This method reads <c>InputDirectory</c>, <c>OutputDirectory</c>,
        /// <c>UrlDetectorModel</c>, <c>ClearCyrillicsChars</c>, <c>ClearDigitsChars</c>,
        /// <c>SingleWordMaxLength</c>, <c>Ngrams</c> and <c>CutPercent</c>.
        /// </param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity value handed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">Thrown after the file loop when the enumeration yielded no file.</exception>
        /// <remarks>
        /// NOTE(review): the probability cut uses <c>Config.Inst.CUT_PERCENT</c> while the output file name
        /// and header line are built from <c>bp.CutPercent</c> - confirm that both always carry the same value.
        /// </remarks>
        public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            // Announce the run, then create the tokenizer and the term-frequency accumulator.
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var wordTokenizer = new mld_tokenizer(bp.UrlDetectorModel);
            var termProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);

#if DEBUG
            var skippedByLength = 0;
#endif
            // The two filter flags are read once; a dedicated callback is then chosen
            // so the flags are not re-tested for every word.
            var dropCyrillics = bp.ClearCyrillicsChars;
            var dropDigits    = bp.ClearDigitsChars;

            Action<string> onWord;
            if (dropCyrillics && dropDigits)
            {
                // Words with Cyrillic or digit characters are dropped before the length check.
                onWord = word =>
                {
                    if (word.HasCyrillicsOrDigitsChars())
                    {
                        return;
                    }
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        termProcessor.AddTerm(word);
                        return;
                    }
#if DEBUG
                    Console.WriteLine($"\r\nskip-by-len #{(++skippedByLength)}: '{word}'");
#endif
                };
            }
            else if (dropCyrillics)
            {
                // Words with Cyrillic characters are dropped before the length check.
                onWord = word =>
                {
                    if (word.HasCyrillicsChars())
                    {
                        return;
                    }
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        termProcessor.AddTerm(word);
                        return;
                    }
#if DEBUG
                    Console.WriteLine($"\r\nskip-by-len #{(++skippedByLength)}: '{word}'");
#endif
                };
            }
            else if (dropDigits)
            {
                // Length is checked first here; words with digit characters are dropped afterwards.
                onWord = word =>
                {
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        if (!word.HasDigitsChars())
                        {
                            termProcessor.AddTerm(word);
                        }
                        return;
                    }
#if DEBUG
                    Console.WriteLine($"\r\nskip-by-len #{(++skippedByLength)}: '{word}'");
#endif
                };
            }
            else
            {
                // No character filtering: only the length limit applies.
                onWord = word =>
                {
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        termProcessor.AddTerm(word);
                        return;
                    }
#if DEBUG
                    Console.WriteLine($"\r\nskip-by-len #{(++skippedByLength)}: '{word}'");
#endif
                };
            }
            #endregion

            #region [.-1-.]
            // Process the input files one by one; the first file is remembered because
            // its name and extension are reused for the output file.
            // The line counter keeps running across all files.
            var      sentenceCount = 0;
            FileInfo firstFile     = null;
            var      fileNumber    = 0;
            foreach (var inputFile in bp.EnumerateFilesFromInputFolder())
            {
                if (firstFile is null)
                {
                    firstFile = inputFile;
                }

                fileNumber++;
                Console.WriteLine($"{fileNumber}). start process file: '{inputFile.Name}' [{inputFile.DisplaySize()}]...");

                using (var reader = new StreamReader(inputFile.FullName, Config.Inst.INPUT_ENCODING))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        wordTokenizer.Run(line, onWord);

                        #region [.print-2-console.]
                        // A dot every 100 000 lines, a summary line every 1 000 000 lines.
                        sentenceCount++;
                        if ((sentenceCount % 100_000) != 0)
                        {
                            continue;
                        }

                        Console.Write('.');
                        if ((sentenceCount % 1_000_000) == 0)
                        {
                            Console.WriteLine($"sentence-count: {sentenceCount}, ngrams_1-count: {termProcessor.DictionarySize}");
                        }
                        #endregion
                    }
                    #region [.print-2-console.]
                    Console.WriteLine($"total-sentence-count: {sentenceCount}");
                    #endregion
                }
                GC.Collect();
                Console.WriteLine("end process file");
            }

            if (firstFile is null)
            {
                throw new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'");
            }
            #endregion

            #region [.-2-.]
            // Calculate the probabilities, then drop the reference to the processor
            // and request a collection before the result is written.
            Console.Write("start calc probability...");
            var probabilities = termProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
            termProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            if (!Directory.Exists(bp.OutputDirectory))
            {
                Directory.CreateDirectory(bp.OutputDirectory);
            }

            // Probabilities are always written with '.' as the decimal separator.
            var numberFormat = new NumberFormatInfo { NumberDecimalSeparator = "." };

            // Output name: "<first file name without extension>-(<ngrams>-cut_<percent>%)<first file extension>".
            var baseName       = Path.GetFileNameWithoutExtension(firstFile.Name);
            var outputFileName = $"{baseName}-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){firstFile.Extension}";
            var outputFile     = Path.Combine(bp.OutputDirectory, outputFileName);

            using (var writer = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // Header line, then one "<term>\t<probability>" line per entry.
                writer.WriteLine($"#\t'{firstFile.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var entry in probabilities)
                {
                    if (entry.Probability == 0)
                    {
                        continue; // zero-probability entries are not written
                    }

                    writer.Write(entry.Term);
                    writer.Write('\t');
                    writer.WriteLine(entry.Probability.ToString(numberFormat));
                }
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }