Esempio n. 1
0
        /// <summary>
        /// Creates a portioner bound to one input file.
        /// Allocates an output-filename list and a counting dictionary for 1-grams
        /// unconditionally, and for each higher n-gram order implied by
        /// <c>bp.Ngrams</c> (selecting order N also enables every order below it).
        /// </summary>
        /// <param name="bp">Build parameters; <c>MaxPortionSize</c> is used as the initial dictionary capacity.</param>
        /// <param name="fi">The input file this portioner works on.</param>
        /// <param name="tfProcessor">The term-frequency processor stored for later use.</param>
        public Portioner(BuildParams_t bp, FileInfo fi, TFProcessor tfProcessor)
        {
            _Bp = bp;
            _Fi = fi;

            // Every n-gram dictionary is created with the same capacity and comparer.
            Dictionary <string, int> CreateCounter() => new Dictionary <string, int>(_Bp.MaxPortionSize, stringComparer.Instance);

            // 1-grams are always collected.
            _OutputFilenames = new Dictionary <NGramsEnum, List <string> >
            {
                { NGramsEnum.ngram_1, new List <string>() }
            };
            _DocumentNgrams_1 = CreateCounter();

            // Cumulative flags: a higher order switches on all lower orders as well.
            var requested = _Bp.Ngrams;
            var need4     = (requested == NGramsEnum.ngram_4);
            var need3     = need4 || (requested == NGramsEnum.ngram_3);
            var need2     = need3 || (requested == NGramsEnum.ngram_2);

            // Registration order (4, then 3, then 2) is kept on purpose.
            if (need4)
            {
                _OutputFilenames.Add(NGramsEnum.ngram_4, new List <string>());
                _DocumentNgrams_4 = CreateCounter();
            }
            if (need3)
            {
                _OutputFilenames.Add(NGramsEnum.ngram_3, new List <string>());
                _DocumentNgrams_3 = CreateCounter();
            }
            if (need2)
            {
                _OutputFilenames.Add(NGramsEnum.ngram_2, new List <string>());
                _DocumentNgrams_2 = CreateCounter();
            }

            _Sb          = new StringBuilder();
            _TFProcessor = tfProcessor;

            // Remember the id of the current process; the Process handle itself is released right away.
            using (var currentProcess = Process.GetCurrentProcess())
            {
                _CurrentProcesId = currentProcess.Id;
            }
        }
        /// <summary>
        /// Processes a single input file through a freshly created <see cref="Portioner"/>
        /// and then requests a garbage collection.
        /// </summary>
        /// <param name="bp">Build parameters handed to the portioner.</param>
        /// <param name="fi">The input file to process.</param>
        /// <param name="tfProcessor">The term-frequency processor handed to the portioner.</param>
        private static void BuildTFMatrix_UsePortion(BuildParams_t bp, FileInfo fi, TFProcessor tfProcessor)
        {
            var filePortioner = new Portioner(bp, fi, tfProcessor);
            filePortioner.BuildTFMatrix_UsePortion();

            // Explicit collection after each file, once the portioner has finished its work.
            GC.Collect();
        }
Esempio n. 3
0
        /// <summary>
        /// Builds a term-probability model from every file returned by
        /// <c>bp.EnumerateFilesFromInputFolder()</c> and writes it to one output file.
        /// Each line of each file is tokenized; accepted words are added to a
        /// <see cref="TFProcessor"/>, probabilities are calculated and every term with a
        /// non-zero probability is written as "term&lt;TAB&gt;probability".
        /// The output file is named after the first input file, suffixed with the
        /// n-gram setting and the cut percent.
        /// </summary>
        /// <param name="bp">Build parameters (input/output folders, word filters, cut percent, ...).</param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity passed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">The input enumeration yielded no files.</exception>
        public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var tokenizer   = new mld_tokenizer(bp.UrlDetectorModel);
            var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);

#if DEBUG
            var skipWordCount = 0;
#endif
            // The word filter is chosen once, up front, so the per-word callback
            // does not have to re-test the two "clear" flags for every token.
            var processWordAction = default(Action <string>);
            if (bp.ClearCyrillicsChars)
            {
                if (bp.ClearDigitsChars)
                {
                    // Drop words containing cyrillic chars or digits; keep the rest if short enough.
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsOrDigitsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
                else
                {
                    // Drop words containing cyrillic chars; keep the rest if short enough.
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
            }
            else
            {
                if (bp.ClearDigitsChars)
                {
                    // Drop words containing digits; the length test comes first here.
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            if (!word.HasDigitsChars())
                            {
                                tfProcessor.AddTerm(word);
                            }
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
                else
                {
                    // No character filter: only the length limit applies.
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            tfProcessor.AddTerm(word);
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
            }
            #endregion

            #region [.-1-.]
            var totalSentenceCount = 0;
            var first_fi           = default(FileInfo); // remembered to name the output file
            var fis    = bp.EnumerateFilesFromInputFolder();
            var fi_num = 0;
            foreach (var fi in fis)
            {
                if (first_fi == null)
                {
                    first_fi = fi;
                }

                Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");

                using (var sr = new StreamReader(fi.FullName, Config.Inst.INPUT_ENCODING))
                {
                    // Every input line is counted as one sentence.
                    for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
                    {
                        tokenizer.Run(line, processWordAction);

                        #region [.print-2-console.]
                        // Progress: a dot every 100 000 lines, a summary every 1 000 000 lines.
                        if ((++totalSentenceCount % 100_000) == 0)
                        {
                            Console.Write('.');
                            if ((totalSentenceCount % 1_000_000) == 0)
                            {
                                Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {tfProcessor.DictionarySize}");
                            }
                        }
                        #endregion
                    }
                    #region [.print-2-console.]
                    Console.WriteLine($"total-sentence-count: {totalSentenceCount}");
                    #endregion
                }
                GC.Collect();
                Console.WriteLine("end process file");
            }

            if (first_fi == null)
            {
                throw (new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'"));
            }
            #endregion

            #region [.-2-.]
            Console.Write("start calc probability...");
            // Use the cut percent of THIS build (bp), not the global config value, so the
            // cut that is applied always matches the one written into the file name and header below.
            var probabilityResult = tfProcessor.CalcProbabilityOrdered(bp.CutPercent);
            // Drop the reference so the explicit collection below can reclaim the processor.
            tfProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            if (!Directory.Exists(bp.OutputDirectory))
            {
                Directory.CreateDirectory(bp.OutputDirectory);
            }

            // Probabilities are always written with '.' as the decimal separator.
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };

            var outputFile = Path.Combine(bp.OutputDirectory, Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");

            using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // Header line: '#', TAB, source file name and build settings.
                sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var tp in probabilityResult)
                {
                    // Terms whose probability is exactly zero are not written.
                    if (tp.Probability != 0)
                    {
                        sw.Write(tp.Term);
                        sw.Write('\t');
                        sw.WriteLine(tp.Probability.ToString(nfi));
                    }
                }
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }
Esempio n. 4
0
        /// <summary>
        /// Portion-based variant of the model build: every file returned by
        /// <c>bp.EnumerateFilesFromInputFolder()</c> is processed through
        /// <c>BuildTFMatrix_UsePortion</c>, probabilities are then calculated and every term
        /// with a non-zero probability is written as "term&lt;TAB&gt;probability" to one output file
        /// named after the first input file. Finally the "temp" folder under the input
        /// directory is removed when it contains no top-level files.
        /// </summary>
        /// <param name="bp">Build parameters (input/output folders, n-gram setting, cut percent, ...).</param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity passed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">The input enumeration yielded no files.</exception>
        public static void Build_UsePortion(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);
            #endregion

            #region [.-1-.]
            var first_fi = default(FileInfo); // remembered to name the output file
            var fis      = bp.EnumerateFilesFromInputFolder();
            var fi_num   = 0;
            foreach (var fi in fis)
            {
                if (first_fi == null)
                {
                    first_fi = fi;
                }

                Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");

                BuildTFMatrix_UsePortion(bp, fi, tfProcessor);

                Console.WriteLine($"end process file{Environment.NewLine}");
            }

            if (first_fi == null)
            {
                throw (new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'"));
            }
            #endregion

            #region [.-2-.]
            Console.Write("start calc probability...");
            // Use the cut percent of THIS build (bp), not the global config value, so the
            // cut that is applied always matches the one written into the file name and header below.
            var probabilityResult = tfProcessor.CalcProbabilityOrdered(bp.CutPercent);
            // Drop the reference so the explicit collection below can reclaim the processor.
            tfProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            if (!Directory.Exists(bp.OutputDirectory))
            {
                Directory.CreateDirectory(bp.OutputDirectory);
            }

            // Probabilities are always written with '.' as the decimal separator.
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };

            var outputFile = Path.Combine(bp.OutputDirectory, Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");

            using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // Header line: '#', TAB, source file name and build settings.
                sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var tp in probabilityResult)
                {
                    // Terms whose probability is exactly zero are not written.
                    if (tp.Probability != 0)
                    {
                        sw.Write(tp.Term);
                        sw.Write('\t');
                        sw.WriteLine(tp.Probability.ToString(nfi));
                    }
                }
            }

            // Clean up the "temp" folder below the input directory once it holds no files.
            // NOTE(review): the emptiness check looks at top-level files only, while the delete
            // is recursive - confirm that sub-folders of "temp" can never hold data worth keeping.
            var tempOutputFolder = Path.Combine(bp.InputDirectory, "temp");
            if (Directory.Exists(tempOutputFolder) && !Directory.EnumerateFiles(tempOutputFolder, "*", SearchOption.TopDirectoryOnly).Any())
            {
                Directory.Delete(tempOutputFolder, true);
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }
Esempio n. 5
0
        /// <summary>
        /// Console entry point. Prints the active configuration, optionally raises the
        /// process priority, loads the url-detector model, runs the model build
        /// (portion-based or plain, depending on <c>Config.Inst.USE_PORTION</c>) and reports
        /// the elapsed time. Any exception is printed in red; in that case the program
        /// waits for ENTER before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            var failed = false;

            try
            {
                PrintConfiguration();

                if (Config.Inst.USE_HIGH_PRIORITY)
                {
                    Extensions.SetCurrentProcessHighPriority();
                }

                var detectorModel = new UrlDetectorModel(Config.Inst.URL_DETECTOR_RESOURCES_XML_FILENAME);

                // Single-model build driven entirely by the configuration.
                // (An earlier, disabled multi-model mode - BUILD_MODE with a loop over
                //  Extensions.GetProcessParams() - used to live here as commented-out code.)
                var buildParams = new BuildParams_t()
                {
                    UrlDetectorModel    = detectorModel,
                    InputDirectory      = Config.Inst.INPUT_DIRECTORY,
                    Ngrams              = Config.Inst.NGARMS,
                    CutPercent          = Config.Inst.CUT_PERCENT,
                    OutputDirectory     = Config.Inst.OUTPUT_DIRECTORY,
                    MaxPortionSize      = Config.Inst.MAX_PORTION_SIZE,
                    ClearCyrillicsChars = Config.Inst.CLEAR_CYRILLICS_CHARS,
                    ClearDigitsChars    = Config.Inst.CLEAR_DIGITS_CHARS,
                    SingleWordMaxLength = Config.Inst.SINGLE_WORD_MAX_LENGTH,
                };

                var timer = Stopwatch.StartNew();
                if (!Config.Inst.USE_PORTION)
                {
                    ModelBuilder.Build(buildParams);
                }
                else
                {
                    ModelBuilder.Build_UsePortion(buildParams);
                }
                timer.Stop();

                Console.WriteLine($"'{Config.Inst.NGARMS}; cut_{Config.Inst.CUT_PERCENT.GetValueOrDefault()}%' - success, elapsed: {timer.Elapsed}{Environment.NewLine}");
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine(Environment.NewLine + ex + Environment.NewLine);
                Console.ResetColor();
                failed = true;
            }

            // Same farewell either way; after a failure the console is kept open until ENTER.
            var farewell = failed
                ? $"{Environment.NewLine}[.....finita fusking comedy (push ENTER 4 exit).....]"
                : $"{Environment.NewLine}[.....finita fusking comedy.....]";
            Console.WriteLine(farewell);
            if (failed)
            {
                Console.ReadLine();
            }

            // Dumps the effective configuration to the console, one setting per line.
            void PrintConfiguration()
            {
                var cfg = Config.Inst;

                Console.WriteLine($"{Environment.NewLine}----------------------------------------------");
                Console.WriteLine($"USE_HIGH_PRIORITY        : '{cfg.USE_HIGH_PRIORITY}'");
                Console.WriteLine($"NGARMS                   : '{cfg.NGARMS}'");
                Console.WriteLine($"CUT_PERCENT              : '{cfg.CUT_PERCENT.GetValueOrDefault()}%'");
                Console.WriteLine($"INPUT_DIRECTORY          : '{cfg.INPUT_DIRECTORY}'");
                Console.WriteLine($"INPUT_ENCODING           : '{cfg.INPUT_ENCODING.WebName}'");
                Console.WriteLine($"CLEAR_CYRILLICS_CHARS    : '{cfg.CLEAR_CYRILLICS_CHARS}'");
                Console.WriteLine($"CLEAR_DIGITS_CHARS       : '{cfg.CLEAR_DIGITS_CHARS}'");
                Console.WriteLine($"SINGLE_WORD_MAX_LENGTH   : '{cfg.SINGLE_WORD_MAX_LENGTH}'");
                Console.WriteLine($"OUTPUT_DIRECTORY         : '{cfg.OUTPUT_DIRECTORY}'");
                Console.WriteLine($"OUTPUT_ENCODING          : '{cfg.OUTPUT_ENCODING.WebName}'");
                Console.WriteLine($"USE_PORTION              : '{cfg.USE_PORTION}'");
                // The portion size is only relevant (and only shown) in portion mode.
                if (cfg.USE_PORTION)
                {
                    Console.WriteLine($"MAX_PORTION_SIZE         : '{cfg.MAX_PORTION_SIZE}'");
                }
                Console.WriteLine($"----------------------------------------------{Environment.NewLine}");
            }
        }