Exemple #1
0
        /// <summary>
        /// Creates a tokenizer that uses the given URL-detector model and pre-sizes its word list.
        /// </summary>
        /// <param name="urlModel">Model handed to the internal <c>UrlDetector</c> through its config.</param>
        /// <param name="wordCapacity">
        /// Requested initial capacity of the word list; the list is created with
        /// <c>Math.Max(DEFAULT_WORDCAPACITY, wordCapacity)</c>, so smaller values are raised to the default.
        /// </param>
        public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
        {
            // URL detector configured with the position extract mode.
            _UrlDetector = new UrlDetector(new UrlDetectorConfig()
            {
                Model          = urlModel,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
            });

            // Working buffers: never allocate the word list below the default capacity.
            var initialWordCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
            _Words                  = new List<string>(initialWordCapacity);
            _NgramsSB               = new StringBuilder();

            // Delegate bound once to AddWordToList and kept in a field.
            _AddWordToListAction = new Action<string>(AddWordToList);

            // Local references to the shared lookup tables exposed by the singletons.
            _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
            _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
            _IAW = UnsafeConst.Inst._INTERPRETE_AS_WHITESPACE;
            _DWC = UnsafeConst.Inst._DIGIT_WORD_CHARS;

            // Allocate the to-upper conversion buffer at its default size.
            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
        }
 /// <summary>
 /// Creates the configuration and initializes its <c>UrlDetectorModel</c> member
 /// with a model constructed from the given file name.
 /// </summary>
 /// <param name="urlDetectorResourcesXmlFilename">
 /// Value passed unchanged to the <c>UrlDetectorModel</c> constructor.
 /// </param>
 public ClassifierConfig(string urlDetectorResourcesXmlFilename)
 {
     var model = new UrlDetectorModel(urlDetectorResourcesXmlFilename);

     UrlDetectorModel = model;
 }
Exemple #3
0
        /// <summary>
        /// Console entry point: prints the effective configuration, optionally raises the process
        /// priority, loads the URL-detector model and runs <c>ModelBuilder</c> with the configured
        /// parameters.
        /// </summary>
        /// <remarks>
        /// Any exception is caught and written to the console in red. When an error occurred the
        /// process exit code is set to 1 and the program waits for ENTER before returning;
        /// otherwise it returns immediately.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used, all settings come from <c>Config.Inst</c>.</param>
        private static void Main(string[] args)
        {
            var wasErrors = false;

            try
            {
                #region [.print to console config.]
                Console.WriteLine($"{Environment.NewLine}----------------------------------------------");
                Console.WriteLine($"USE_HIGH_PRIORITY        : '{Config.Inst.USE_HIGH_PRIORITY}'");
                Console.WriteLine($"NGARMS                   : '{Config.Inst.NGARMS}'");
                Console.WriteLine($"CUT_PERCENT              : '{Config.Inst.CUT_PERCENT.GetValueOrDefault()}%'");
                Console.WriteLine($"INPUT_DIRECTORY          : '{Config.Inst.INPUT_DIRECTORY}'");
                Console.WriteLine($"INPUT_ENCODING           : '{Config.Inst.INPUT_ENCODING.WebName}'");
                Console.WriteLine($"CLEAR_CYRILLICS_CHARS    : '{Config.Inst.CLEAR_CYRILLICS_CHARS}'");
                Console.WriteLine($"CLEAR_DIGITS_CHARS       : '{Config.Inst.CLEAR_DIGITS_CHARS}'");
                Console.WriteLine($"SINGLE_WORD_MAX_LENGTH   : '{Config.Inst.SINGLE_WORD_MAX_LENGTH}'");
                Console.WriteLine($"OUTPUT_DIRECTORY         : '{Config.Inst.OUTPUT_DIRECTORY}'");
                Console.WriteLine($"OUTPUT_ENCODING          : '{Config.Inst.OUTPUT_ENCODING.WebName}'");
                Console.WriteLine($"USE_PORTION              : '{Config.Inst.USE_PORTION}'");
                if (Config.Inst.USE_PORTION)
                {
                    // The portion size is only relevant (and only shown) in portion mode.
                    Console.WriteLine($"MAX_PORTION_SIZE         : '{Config.Inst.MAX_PORTION_SIZE}'");
                }
                Console.WriteLine($"----------------------------------------------{Environment.NewLine}");
                #endregion

                #region [.use high priority.]
                if (Config.Inst.USE_HIGH_PRIORITY)
                {
                    Extensions.SetCurrentProcessHighPriority();
                }
                #endregion

                #region [.url-detector.]
                var urlDetectorModel = new UrlDetectorModel(Config.Inst.URL_DETECTOR_RESOURCES_XML_FILENAME);
                #endregion

                #region [.build model's.]
                var bp = new BuildParams_t()
                {
                    UrlDetectorModel    = urlDetectorModel,
                    InputDirectory      = Config.Inst.INPUT_DIRECTORY,
                    Ngrams              = Config.Inst.NGARMS,
                    CutPercent          = Config.Inst.CUT_PERCENT,
                    OutputDirectory     = Config.Inst.OUTPUT_DIRECTORY,
                    MaxPortionSize      = Config.Inst.MAX_PORTION_SIZE,
                    ClearCyrillicsChars = Config.Inst.CLEAR_CYRILLICS_CHARS,
                    ClearDigitsChars    = Config.Inst.CLEAR_DIGITS_CHARS,
                    SingleWordMaxLength = Config.Inst.SINGLE_WORD_MAX_LENGTH,
                };

                // Time only the build itself, not configuration printing or model loading.
                var sw = Stopwatch.StartNew();
                if (Config.Inst.USE_PORTION)
                {
                    ModelBuilder.Build_UsePortion(bp);
                }
                else
                {
                    ModelBuilder.Build(bp);
                }
                sw.Stop();

                Console.WriteLine($"'{Config.Inst.NGARMS}; cut_{Config.Inst.CUT_PERCENT.GetValueOrDefault()}%' - success, elapsed: {sw.Elapsed}{Environment.NewLine}");
                #endregion
            }
            catch (Exception ex)
            {
                // Top-level handler: report the full exception (including stack trace) in red.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine(Environment.NewLine + ex + Environment.NewLine);
                Console.ResetColor();
                wasErrors = true;
            }

            if (wasErrors)
            {
                // Report the failure to the calling shell/script as a non-zero exit code.
                Environment.ExitCode = 1;

                // Keep the console window open so the error text can be read.
                Console.WriteLine($"{Environment.NewLine}[.....finita fusking comedy (push ENTER 4 exit).....]");
                Console.ReadLine();
            }
            else
            {
                Console.WriteLine($"{Environment.NewLine}[.....finita fusking comedy.....]");
            }
        }
Exemple #4
0
 /// <summary>
 /// Creates a tokenizer for the given URL-detector model, delegating to the
 /// two-argument constructor with <c>DEFAULT_WORDCAPACITY</c> as the word capacity.
 /// </summary>
 /// <param name="urlModel">Model forwarded unchanged to the two-argument constructor.</param>
 public mld_tokenizer(UrlDetectorModel urlModel)
     : this(urlModel, DEFAULT_WORDCAPACITY)
 {
 }
Exemple #5
0
        /// <summary>
        /// Console entry point: prints the effective settings, tunes GC latency and (optionally)
        /// process/thread priority, loads the URL-detector model and builds either a single model
        /// or one model per parameter tuple, depending on <c>BUILD_MODE</c>.
        /// </summary>
        /// <remarks>
        /// In multi-model mode a failure of one build is reported in red and the loop continues
        /// with the next tuple. Any other exception is caught at the top level and printed in red.
        /// When at least one error occurred the process exit code is set to 1 and the program
        /// waits for ENTER before returning; otherwise it returns immediately.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            var wasErrors = false;

            try
            {
                #region [.print to console config.]
                Console.WriteLine(Environment.NewLine + "----------------------------------------------");
                Console.WriteLine("USE_BOOST_PRIORITY: '" + USE_BOOST_PRIORITY + "'");
                Console.WriteLine("BUILD_MODE        : '" + BUILD_MODE + "'");
                // Only print the parameters that are actually used by the selected build mode.
                switch (BUILD_MODE)
                {
                case BuildModeEnum.single_model:
                    Console.WriteLine("METHOD            : '" + METHOD + "'");
                    Console.WriteLine("NGARMS            : '" + NGARMS + "'");
                    Console.WriteLine("D_PARAM           : '" + D_PARAM + "'");
                    break;

                case BuildModeEnum.all_models_by_method:
                    Console.WriteLine("METHOD            : '" + METHOD + "'");
                    break;
                }
                Console.WriteLine("INPUT_FILES       : '" + string.Join("'; '", INPUT_FILES) + "'");
                Console.WriteLine("INPUT_FOLDER      : '" + INPUT_FOLDER + "'");
                Console.WriteLine("INPUT_ENCODING    : '" + INPUT_ENCODING.WebName + "'");
                Console.WriteLine("OUTPUT_FILE       : '" + OUTPUT_FILE_PATTERN + "'");
                Console.WriteLine("OUTPUT_ENCODING   : '" + OUTPUT_ENCODING.WebName + "'");
                Console.WriteLine("----------------------------------------------" + Environment.NewLine);
                #endregion

                #region [.GC.]
                // Request LowLatency first; if reading the setting back shows it did not take
                // effect, fall back to Batch.
                GCSettings.LatencyMode = GCLatencyMode.LowLatency;
                if (GCSettings.LatencyMode != GCLatencyMode.LowLatency)
                {
                    GCSettings.LatencyMode = GCLatencyMode.Batch;
                }
                #endregion

                #region [.use boost priority.]
                if (USE_BOOST_PRIORITY)
                {
                    // Process is IDisposable: release the handle once the priority has been set.
                    using (var pr = Process.GetCurrentProcess())
                    {
                        pr.PriorityClass        = ProcessPriorityClass.RealTime;
                        pr.PriorityBoostEnabled = true;
                    }
                    Thread.CurrentThread.Priority = ThreadPriority.Highest;
                }
                #endregion

                #region [.url-detector.]
                var urlDetectorModel = new UrlDetectorModel(URL_DETECTOR_RESOURCES_XML_FILENAME);
                #endregion

                #region [.build model's.]
                if (BUILD_MODE == BuildModeEnum.single_model)
                {
                    var bp = new build_params_t()
                    {
                        UrlDetectorModel      = urlDetectorModel,
                        InputFolder           = INPUT_FOLDER,
                        InputFilenames        = INPUT_FILES,
                        Method                = METHOD,
                        Ngrams                = NGARMS,
                        D_param               = D_PARAM,
                        OutputFilenamePattern = OUTPUT_FILE_PATTERN,
                    };
                    var sw = Stopwatch.StartNew();
                    Build(bp);
                    sw.Stop();

                    Console.WriteLine("'" + METHOD + "; " + NGARMS + "; " + D_PARAM + "' - success, elapsed: " + sw.Elapsed);
                }
                else
                {
                    // Either every tuple for the configured method, or every tuple overall.
                    var tuples = (BUILD_MODE == BuildModeEnum.all_models_by_method)
                                 ? GetProcessParams(METHOD)
                                 : GetProcessParams();

                    #region [.build model's.]
                    var sw_total = Stopwatch.StartNew();
                    foreach (var t in tuples)
                    {
                        var bp = new build_params_t()
                        {
                            UrlDetectorModel      = urlDetectorModel,
                            InputFolder           = INPUT_FOLDER,
                            InputFilenames        = INPUT_FILES,
                            Method                = t.Item1,
                            Ngrams                = t.Item2,
                            D_param               = t.Item3,
                            OutputFilenamePattern = OUTPUT_FILE_PATTERN,
                        };
                        try
                        {
                            var sw = Stopwatch.StartNew();
                            Build(bp);
                            sw.Stop();

                            Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - success, elapsed: " + sw.Elapsed);
                        }
                        catch (Exception ex)
                        {
                            // A single failed build must not abort the remaining ones:
                            // report it in red, remember the failure and continue.
                            var fc = Console.ForegroundColor; Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - " + ex.GetType() + ": " + ex.Message);
                            Console.ForegroundColor = fc;
                            wasErrors = true;
                        }
                    }
                    sw_total.Stop();

                    Console.WriteLine("total elapsed: " + sw_total.Elapsed);
                    #endregion
                }
                #endregion
            }
            catch (Exception ex)
            {
                // Top-level handler: report the full exception (including stack trace) in red.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine(Environment.NewLine + ex + Environment.NewLine);
                Console.ResetColor();
                wasErrors = true;
            }

            if (wasErrors)
            {
                // Report the failure to the calling shell/script as a non-zero exit code.
                Environment.ExitCode = 1;

                // Only ask for ENTER when the program really waits for it.
                Console.WriteLine(Environment.NewLine + "[.....finita fusking comedy (push ENTER 4 exit).....]");
                Console.ReadLine();
            }
            else
            {
                Console.WriteLine(Environment.NewLine + "[.....finita fusking comedy.....]");
            }
        }
Exemple #6
0
 /// <summary>
 /// Creates a tokenizer for the given URL-detector model, delegating to the
 /// two-argument constructor with <c>DEFAULT_WORDCAPACITY</c> as the word capacity.
 /// </summary>
 /// <param name="urlModel">Model forwarded unchanged to the two-argument constructor.</param>
 public ClassifyTokenizer(UrlDetectorModel urlModel)
     : this(urlModel, DEFAULT_WORDCAPACITY)
 {
 }